From 182fac2689b769a96e7fc9defcd560c5cca92b1e Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Wed, 29 Feb 2012 08:30:58 -0700 Subject: [PATCH 01/65] net/ceph: Only clear SOCK_NOSPACE when there is sufficient space in the socket buffer The Ceph messenger would sometimes queue multiple work items to write data to a socket when the socket buffer was full. Fix this problem by making ceph_write_space() use SOCK_NOSPACE in the same way that net/core/stream.c:sk_stream_write_space() does, i.e., clearing it only when sufficient space is available in the socket buffer. Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- net/ceph/messenger.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ad5b70801f37..d11f91b05452 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -143,16 +143,22 @@ static void ceph_write_space(struct sock *sk) struct ceph_connection *con = (struct ceph_connection *)sk->sk_user_data; - /* only queue to workqueue if there is data we want to write. */ + /* only queue to workqueue if there is data we want to write, + * and there is sufficient space in the socket buffer to accept + * more data. clear SOCK_NOSPACE so that ceph_write_space() + * doesn't get called again until try_write() fills the socket + * buffer. See net/ipv4/tcp_input.c:tcp_check_space() + * and net/core/stream.c:sk_stream_write_space(). + */ if (test_bit(WRITE_PENDING, &con->state)) { - dout("ceph_write_space %p queueing write work\n", con); - queue_con(con); + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + dout("ceph_write_space %p queueing write work\n", con); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_con(con); + } } else { dout("ceph_write_space %p nothing to write\n", con); } - - /* since we have our own write_space, clear the SOCK_NOSPACE flag */ - clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); } /* socket's state has changed */ From 1ce208a6ce030ea6ccd4b13c8cec0a84c0c7a1e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 12 Jan 2012 17:48:11 -0800 Subject: [PATCH 02/65] ceph: don't reset s_cap_ttl to zero Avoid the need to check for a special zero s_cap_ttl value by just using (jiffies - 1) as the value assigned to indicate "sometime in the past." Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 866e8d7ca37d..89971e137aab 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; - s->s_cap_ttl = 0; + s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; @@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, int wake = 0; spin_lock(&session->s_cap_lock); - was_stale = is_renew && (session->s_cap_ttl == 0 || - time_after_eq(jiffies, session->s_cap_ttl)); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; @@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session, session->s_mds); spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; - session->s_cap_ttl = 0; + session->s_cap_ttl = jiffies - 1; spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; From a661fc561190c0ee2d7cfabcfa92204e2b3aa349 Mon Sep 17 00:00:00 2001 From: Amon Ott Date: Mon, 23 Jan 2012 09:25:23 -0800 Subject: [PATCH 03/65] ceph: use 2 instead of 1 as fallback for 32-bit inode number The root directory of the Ceph mount has inode number 1, so falling back to 1 always creates a collision. 2 is unused on my test systems and seems less likely to collide. Signed-off-by: Amon Ott Signed-off-by: Sage Weil --- fs/ceph/super.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1421f3d875a2..18d8a866a07b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) u32 ino = vino & 0xffffffff; ino ^= vino >> 32; if (!ino) - ino = 1; + ino = 2; return ino; } From 810339ec2fae5cbd0164b8acde7fb65652755864 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Fri, 3 Feb 2012 09:55:36 -0500 Subject: [PATCH 04/65] ceph: avoid panic with mismatched symlink sizes in fill_inode() Return -EINVAL rather than panic if iinfo->symlink_len and inode->i_size do not match. Also use kstrndup rather than kmalloc/memcpy. Signed-off-by: Xi Wang Reviewed-by: Alex Elder --- fs/ceph/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c489378b4cd..9fff9f3b17e4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode, case S_IFLNK: inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { - int symlen = iinfo->symlink_len; + u32 symlen = iinfo->symlink_len; char *sym; - BUG_ON(symlen != inode->i_size); spin_unlock(&ci->i_ceph_lock); + err = -EINVAL; + if (WARN_ON(symlen != inode->i_size)) + goto out; + err = -ENOMEM; - sym = kmalloc(symlen+1, GFP_NOFS); + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); if (!sym) goto out; - memcpy(sym, iinfo->symlink, symlen); - sym[symlen] = 0; spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) From 64486697771cbe219fffcb5c8e2ed9ca4fdf086c Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Thu, 16 Feb 2012 11:55:48 -0500 Subject: [PATCH 05/65] libceph: fix overflow check in crush_decode() The existing overflow check (n > ULONG_MAX / b) didn't work, because n = ULONG_MAX / b would both bypass the check and still overflow the allocation size a + n * b. The correct check should be (n > (ULONG_MAX - a) / b). Signed-off-by: Xi Wang Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fd863fe76934..29ad46ec9dcf 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -283,7 +283,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ceph_decode_32_safe(p, end, yes, bad); #if BITS_PER_LONG == 32 err = -EINVAL; - if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) + if (yes > (ULONG_MAX - sizeof(*r)) + / sizeof(struct crush_rule_step)) goto bad; #endif r = c->rules[i] = kmalloc(sizeof(*r) + From 80834312a4da1405a9bc788313c67643de6fcb4c Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Thu, 16 Feb 2012 11:56:29 -0500 Subject: [PATCH 06/65] ceph: fix overflow check in build_snap_context() The overflow check for a + n * b should be (n > (ULONG_MAX - a) / b), rather than (n > ULONG_MAX / b - a). Signed-off-by: Xi Wang Signed-off-by: Sage Weil --- fs/ceph/snap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a559c80f127a..f04c0961f993 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) /* alloc new snap context */ err = -ENOMEM; - if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); if (!snapc) From 5766651971e81298732466c9aa462ff47898ba37 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 07/65] ceph: use a shared zero page rather than one per messenger Each messenger allocates a page to be used when writing zeroes out in the event of error or other abnormal condition. Instead, use the kernel ZERO_PAGE() for that purpose. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 1 - net/ceph/messenger.c | 43 +++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ffbeb2c217b4..6b5af5f976d1 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -54,7 +54,6 @@ struct ceph_connection_operations { struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ struct ceph_entity_addr my_enc_addr; - struct page *zero_page; /* used in certain error cases */ bool nocrc; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d11f91b05452..738356255e0b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -52,6 +52,9 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; +static struct page *zero_page; /* used in certain error cases */ +static void *zero_page_address; /* kernel virtual addr of zero_page */ + const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; @@ -99,18 +102,41 @@ struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { + BUG_ON(zero_page != NULL); + zero_page = ZERO_PAGE(0); + page_cache_get(zero_page); + + BUG_ON(zero_page_address != NULL); + zero_page_address = kmap(zero_page); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (!ceph_msgr_wq) { pr_err("msgr_init failed to create workqueue\n"); + + zero_page_address = NULL; + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; + return -ENOMEM; } + return 0; } EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { + BUG_ON(ceph_msgr_wq == NULL); destroy_workqueue(ceph_msgr_wq); + + BUG_ON(zero_page_address == NULL); + zero_page_address = NULL; + + BUG_ON(zero_page == NULL); + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; } EXPORT_SYMBOL(ceph_msgr_exit); @@ -841,9 +867,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) max_write = bv->bv_len; #endif } else { - page = con->msgr->zero_page; + page = zero_page; if (crc) - kaddr = page_address(con->msgr->zero_page); + kaddr = zero_page_address; } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); @@ -914,7 +940,7 @@ static int write_partial_skip(struct ceph_connection *con) while (con->out_skip > 0) { struct kvec iov = { - .iov_base = page_address(con->msgr->zero_page), + .iov_base = zero_page_address, .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) }; @@ -2222,15 +2248,6 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, spin_lock_init(&msgr->global_seq_lock); - /* the zero page is needed if a request is "canceled" while the message - * is being written over the socket */ - msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); - if (!msgr->zero_page) { - kfree(msgr); - return ERR_PTR(-ENOMEM); - } - kmap(msgr->zero_page); - if (myaddr) msgr->inst.addr = *myaddr; @@ -2247,8 +2264,6 @@ EXPORT_SYMBOL(ceph_messenger_create); void ceph_messenger_destroy(struct ceph_messenger *msgr) { dout("destroy %p\n", msgr); - kunmap(msgr->zero_page); - __free_page(msgr->zero_page); kfree(msgr); dout("destroyed messenger %p\n", msgr); } From a5bc3129a296fd4663c3ef0be5575e82453739dd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 08/65] ceph: make use of "else" where appropriate Rearrange ceph_tcp_connect() a bit, making use of "else" rather than re-testing a value with consecutive "if" statements. Don't record a connection's socket pointer unless the connect operation is successful. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 738356255e0b..b5536e4e39a1 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -251,7 +251,6 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); - con->sock = sock; sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP @@ -268,18 +267,16 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr.in_addr), sock->sk->sk_state); - ret = 0; - } - if (ret < 0) { + } else if (ret < 0) { pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); - con->sock = NULL; con->error_msg = "connect error"; - } - if (ret < 0) return ERR_PTR(ret); + } + con->sock = sock; + return sock; } From f64a93172b97dcfcfa68f595652220653562f605 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 09/65] ceph: kill addr_str_lock spinlock; use atomic instead A spinlock is used to protect a value used for selecting an array index for a string used for formatting a socket address for human consumption. The index is reset to 0 if it ever reaches the maximum index value. Instead, use an ever-increasing atomic variable as a sequence number, and compute the array index by masking off all but the sequence number's lowest bits. Make the number of entries in the array a power of two to allow the use of such a mask (to avoid jumps in the index value when the sequence number wraps). The length of these strings is somewhat arbitrarily set at 60 bytes. The worst-case length of a string produced is 54 bytes, for an IPv6 address that can't be shortened, e.g.: [1234:5678:9abc:def0:1111:2222:123.234.210.100]:32767 Change it so we arbitrarily use 64 bytes instead; if nothing else it will make the array of these line up better in hex dumps. Rename a few things to reinforce the distinction between the number of strings in the array and the length of individual strings. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b5536e4e39a1..e86bb3f14859 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -44,13 +44,16 @@ static void con_work(struct work_struct *); static void ceph_fault(struct ceph_connection *con); /* - * nicely render a sockaddr as a string. + * Nicely render a sockaddr as a string. An array of formatted + * strings is used, to approximate reentrancy. */ -#define MAX_ADDR_STR 20 -#define MAX_ADDR_STR_LEN 60 -static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; -static DEFINE_SPINLOCK(addr_str_lock); -static int last_addr_str; +#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ +#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) +#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) +#define MAX_ADDR_STR_LEN 64 /* 54 is enough */ + +static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; +static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ static void *zero_page_address; /* kernel virtual addr of zero_page */ @@ -62,11 +65,7 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; - spin_lock(&addr_str_lock); - i = last_addr_str++; - if (last_addr_str == MAX_ADDR_STR) - last_addr_str = 0; - spin_unlock(&addr_str_lock); + i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; switch (ss->ss_family) { From bd406145129e8724cc71b65ff2a788dbd4d60c50 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 10/65] ceph: eliminate some needless casts This eliminates type casts in some places where they are not required. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e86bb3f14859..09a412ba4b70 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -70,13 +70,13 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, + ntohs(in4->sin_port)); break; case AF_INET6: - snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, + ntohs(in6->sin6_port)); break; default: @@ -153,8 +153,8 @@ EXPORT_SYMBOL(ceph_msgr_flush); /* data available on socket, or listen socket received a connect */ static void ceph_data_ready(struct sock *sk, int count_unused) { - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; + struct ceph_connection *con = sk->sk_user_data; + if (sk->sk_state != TCP_CLOSE_WAIT) { dout("ceph_data_ready on %p state = %lu, queueing work\n", con, con->state); @@ -189,8 +189,7 @@ static void ceph_write_space(struct sock *sk) /* socket's state has changed */ static void ceph_state_change(struct sock *sk) { - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; + struct ceph_connection *con = sk->sk_user_data; dout("ceph_state_change %p state = %lu sk_state = %u\n", con, con->state, sk->sk_state); @@ -225,7 +224,7 @@ static void set_sock_callbacks(struct socket *sock, struct ceph_connection *con) { struct sock *sk = sock->sk; - sk->sk_user_data = (void *)con; + sk->sk_user_data = con; sk->sk_data_ready = ceph_data_ready; sk->sk_write_space = ceph_write_space; sk->sk_state_change = ceph_state_change; @@ -552,7 +551,7 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in crc (except data pages), footer */ con->out_msg->hdr.crc = - cpu_to_le32(crc32c(0, (void *)&m->hdr, + cpu_to_le32(crc32c(0, &m->hdr, sizeof(m->hdr) - sizeof(m->hdr.crc))); con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; con->out_msg->footer.front_crc = @@ -1647,7 +1646,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; con->in_base_pos += ret; if (con->in_base_pos == sizeof(con->in_hdr)) { - u32 crc = crc32c(0, (void *)&con->in_hdr, + u32 crc = crc32c(0, &con->in_hdr, sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); if (crc != le32_to_cpu(con->in_hdr.crc)) { pr_err("read_partial_message bad hdr " From 99f0f3b2c4be15784bb4ede33b5f2c3f7861dba7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 11/65] ceph: eliminate some abusive casts This fixes some spots where a type cast to (void *) was used as as a universal type hiding mechanism. Instead, properly cast the type to the intended target type. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 09a412ba4b70..3917847ad8e5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -62,8 +62,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; char *s; - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; + struct sockaddr_in *in4 = (struct sockaddr_in *) ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; @@ -1112,8 +1112,8 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, char delim, const char **ipend) { - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; + struct sockaddr_in *in4 = (struct sockaddr_in *) ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; memset(ss, 0, sizeof(*ss)); From b829c1954dbeb42a1277a8cb05943050ee70be94 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 12/65] ceph: don't null-terminate xattr values For some reason, ceph_setxattr() allocates an extra byte in which a '\0' is stored past the end of an extended attribute value. This is not needed, and is potentially misleading, so get rid of it. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a76f697303d9..bfff735091f5 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -730,11 +730,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, goto out; if (val_len) { - newval = kmalloc(val_len + 1, GFP_NOFS); + newval = kmemdup(value, val_len, GFP_NOFS); if (!newval) goto out; - memcpy(newval, value, val_len); - newval[val_len] = '\0'; } xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); From 06476a69d8954f36a15ff5ddbfd47bdfcff22791 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: [PATCH 13/65] ceph: pass inode rather than table to ceph_match_vxattr() All callers of ceph_match_vxattr() determine what to pass as the first argument by calling ceph_inode_vxattrs(inode). Just do that inside ceph_match_vxattr() itself, changing it to take an inode rather than the vxattr pointer as its first argument. Also ensure the function works correctly for an empty table (i.e., containing only a terminating null entry). Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index bfff735091f5..3418615c53ea 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -126,14 +126,19 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static struct ceph_vxattr_cb *ceph_match_vxattr(struct inode *inode, const char *name) { - do { - if (strcmp(vxattr->name, name) == 0) - return vxattr; - vxattr++; - } while (vxattr->name); + struct ceph_vxattr_cb *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + return NULL; } @@ -502,7 +507,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int err; struct ceph_inode_xattr *xattr; struct ceph_vxattr_cb *vxattr = NULL; @@ -511,8 +515,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, return -ENODATA; /* let's see if a virtual xattr was requested */ - if (vxattrs) - vxattr = ceph_match_vxattr(vxattrs, name); + vxattr = ceph_match_vxattr(inode, name); spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, @@ -698,8 +701,8 @@ int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr_cb *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int err; int name_len = strlen(name); int val_len = size; @@ -716,12 +719,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; @@ -814,8 +814,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr_cb *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; int required_blob_size; @@ -827,12 +827,9 @@ int ceph_removexattr(struct dentry *dentry, const char *name) if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; err = -ENOMEM; spin_lock(&ci->i_ceph_lock); From 22891907193e005923a14384d82d702f6af4f0cf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:28 -0600 Subject: [PATCH 14/65] ceph: use a symbolic name for "ceph." extended attribute namespace Use symbolic constants to define the top-level prefix for "ceph." extended attribute names. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 3418615c53ea..05bb56f402a4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -8,9 +8,12 @@ #include #include +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, "ceph.", 5) || + return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || @@ -80,14 +83,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "ceph.dir.files", ceph_vxattrcb_files}, - { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, XATTR_CEPH_PREFIX "dir.entries", ceph_vxattrcb_entries}, + { true, XATTR_CEPH_PREFIX "dir.files", ceph_vxattrcb_files}, + { true, XATTR_CEPH_PREFIX "dir.subdirs", ceph_vxattrcb_subdirs}, + { true, XATTR_CEPH_PREFIX "dir.rentries", ceph_vxattrcb_rentries}, + { true, XATTR_CEPH_PREFIX "dir.rfiles", ceph_vxattrcb_rfiles}, + { true, XATTR_CEPH_PREFIX "dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, XATTR_CEPH_PREFIX "dir.rbytes", ceph_vxattrcb_rbytes}, + { true, XATTR_CEPH_PREFIX "dir.rctime", ceph_vxattrcb_rctime}, { true, NULL, NULL } }; @@ -111,9 +114,9 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "ceph.file.layout", ceph_vxattrcb_layout}, + { true, XATTR_CEPH_PREFIX "file.layout", ceph_vxattrcb_layout}, /* The following extended attribute name is deprecated */ - { true, "ceph.layout", ceph_vxattrcb_layout}, + { true, XATTR_CEPH_PREFIX "layout", ceph_vxattrcb_layout}, { true, NULL, NULL } }; From eb78808446aeed8e25b080c66bf823c1f188236d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:28 -0600 Subject: [PATCH 15/65] ceph: use macros to normalize vxattr table definitions Entries in the ceph virtual extended attribute tables all follow a distinct pattern in their definition. Enforce this pattern through the use of a macro. Also, a null name field signals the end of the table, so make that be the first field in the ceph_vxattr_cb structure. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 05bb56f402a4..38aef476f786 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -25,10 +25,10 @@ static bool ceph_is_valid_xattr(const char *name) * statistics and layout metadata. */ struct ceph_vxattr_cb { - bool readonly; char *name; size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); + bool readonly; }; /* directories */ @@ -82,16 +82,25 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, (long)ci->i_rctime.tv_nsec); } +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name + +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .getxattr_cb = ceph_vxattrcb_ ## _name, \ + .readonly = true, \ + } + static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, XATTR_CEPH_PREFIX "dir.entries", ceph_vxattrcb_entries}, - { true, XATTR_CEPH_PREFIX "dir.files", ceph_vxattrcb_files}, - { true, XATTR_CEPH_PREFIX "dir.subdirs", ceph_vxattrcb_subdirs}, - { true, XATTR_CEPH_PREFIX "dir.rentries", ceph_vxattrcb_rentries}, - { true, XATTR_CEPH_PREFIX "dir.rfiles", ceph_vxattrcb_rfiles}, - { true, XATTR_CEPH_PREFIX "dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, XATTR_CEPH_PREFIX "dir.rbytes", ceph_vxattrcb_rbytes}, - { true, XATTR_CEPH_PREFIX "dir.rctime", ceph_vxattrcb_rctime}, - { true, NULL, NULL } + XATTR_NAME_CEPH(dir, entries), + XATTR_NAME_CEPH(dir, files), + XATTR_NAME_CEPH(dir, subdirs), + XATTR_NAME_CEPH(dir, rentries), + XATTR_NAME_CEPH(dir, rfiles), + XATTR_NAME_CEPH(dir, rsubdirs), + XATTR_NAME_CEPH(dir, rbytes), + XATTR_NAME_CEPH(dir, rctime), + { 0 } /* Required table terminator */ }; /* files */ @@ -114,10 +123,14 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, XATTR_CEPH_PREFIX "file.layout", ceph_vxattrcb_layout}, + XATTR_NAME_CEPH(file, layout), /* The following extended attribute name is deprecated */ - { true, XATTR_CEPH_PREFIX "layout", ceph_vxattrcb_layout}, - { true, NULL, NULL } + { + .name = XATTR_CEPH_PREFIX "layout", + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = true, + }, + { 0 } /* Required table terminator */ }; static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) From 881a5fa20092d221a7c4b365742c959ef4b297ec Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:28 -0600 Subject: [PATCH 16/65] ceph: drop "_cb" from name of struct ceph_vxattr_cb A struct ceph_vxattr_cb does not represent a callback at all, but rather a virtual extended attribute itself. Drop the "_cb" suffix from its name to reflect that. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 38aef476f786..e29c7d3fa405 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -24,7 +24,7 @@ static bool ceph_is_valid_xattr(const char *name) * These define virtual xattrs exposing the recursive directory * statistics and layout metadata. */ -struct ceph_vxattr_cb { +struct ceph_vxattr { char *name; size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); @@ -91,7 +91,7 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, .readonly = true, \ } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { +static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, entries), XATTR_NAME_CEPH(dir, files), XATTR_NAME_CEPH(dir, subdirs), @@ -122,7 +122,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, return ret; } -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { +static struct ceph_vxattr ceph_file_vxattrs[] = { XATTR_NAME_CEPH(file, layout), /* The following extended attribute name is deprecated */ { @@ -133,7 +133,7 @@ static struct ceph_vxattr_cb ceph_file_vxattrs[] = { { 0 } /* Required table terminator */ }; -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { if (S_ISDIR(inode->i_mode)) return ceph_dir_vxattrs; @@ -142,10 +142,10 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct inode *inode, +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { - struct ceph_vxattr_cb *vxattr = ceph_inode_vxattrs(inode); + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); if (vxattr) { while (vxattr->name) { @@ -525,7 +525,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, struct ceph_inode_info *ci = ceph_inode(inode); int err; struct ceph_inode_xattr *xattr; - struct ceph_vxattr_cb *vxattr = NULL; + struct ceph_vxattr *vxattr = NULL; if (!ceph_is_valid_xattr(name)) return -ENODATA; @@ -587,7 +587,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; u32 namelen; int err; @@ -717,7 +717,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct ceph_vxattr_cb *vxattr; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int err; int name_len = strlen(name); @@ -830,7 +830,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; - struct ceph_vxattr_cb *vxattr; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int issued; int err; From aa4066ed7ba60421423c35f66b789bb3dd21d89e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:28 -0600 Subject: [PATCH 17/65] ceph: encode type in vxattr callback routines The names of the callback functions used for virtual extended attributes are based only on the last component of the attribute name. Because of the way these are defined, this precludes allowing a single (lowest) attribute name for different callbacks, dependent on the type of file being operated on. (For example, it might be nice to support both "ceph.dir.layout" and "ceph.file.layout".) Just change the callback names to avoid this problem. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index e29c7d3fa405..46be30d6d127 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -33,49 +33,49 @@ struct ceph_vxattr { /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files); } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_subdirs); } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, @@ -87,7 +87,7 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, #define XATTR_NAME_CEPH(_type, _name) \ { \ .name = CEPH_XATTR_NAME(_type, _name), \ - .getxattr_cb = ceph_vxattrcb_ ## _name, \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ .readonly = true, \ } @@ -105,7 +105,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { /* files */ -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, size_t size) { int ret; @@ -127,7 +127,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { /* The following extended attribute name is deprecated */ { .name = XATTR_CEPH_PREFIX "layout", - .getxattr_cb = ceph_vxattrcb_layout, + .getxattr_cb = ceph_vxattrcb_file_layout, .readonly = true, }, { 0 } /* Required table terminator */ From 3ce6cd1233046eb97d6d2bd5d80c1cd40528ea2f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:28 -0600 Subject: [PATCH 18/65] ceph: avoid repeatedly computing the size of constant vxattr names All names defined in the directory and file virtual extended attribute tables are constant, and the size of each is known at compile time. So there's no need to compute their length every time any file's attribute is listed. Record the length of each string and use it when needed to determine the space need to represent them. In addition, compute the aggregate size of strings in each table just once at initialization time. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/super.c | 3 +++ fs/ceph/super.h | 2 ++ fs/ceph/xattr.c | 56 ++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 00de2c9568cd..c3da3b32bdde 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -927,6 +927,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; @@ -936,6 +937,7 @@ static int __init init_ceph(void) return 0; out_icache: + ceph_xattr_exit(); destroy_caches(); out: return ret; @@ -945,6 +947,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 18d8a866a07b..fc35036d258d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); /* caps.c */ extern const char *ceph_cap_string(int c); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 46be30d6d127..88eaedf78fa9 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -26,6 +26,7 @@ static bool ceph_is_valid_xattr(const char *name) */ struct ceph_vxattr { char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); bool readonly; @@ -87,6 +88,7 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, #define XATTR_NAME_CEPH(_type, _name) \ { \ .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ .readonly = true, \ } @@ -102,6 +104,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rctime), { 0 } /* Required table terminator */ }; +static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files */ @@ -127,11 +130,13 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { /* The following extended attribute name is deprecated */ { .name = XATTR_CEPH_PREFIX "layout", + .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), .getxattr_cb = ceph_vxattrcb_file_layout, .readonly = true, }, { 0 } /* Required table terminator */ }; +static size_t ceph_file_vxattrs_name_size; /* total size of all names */ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { @@ -142,6 +147,46 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) return NULL; } +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + if (vxattrs == ceph_dir_vxattrs) + return ceph_dir_vxattrs_name_size; + if (vxattrs == ceph_file_vxattrs) + return ceph_file_vxattrs_name_size; + BUG(); + + return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. + */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + struct ceph_vxattr *vxattr; + size_t size = 0; + + for (vxattr = vxattrs; vxattr->name; vxattr++) + size += vxattr->name_size; + + return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ + ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); + ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ + ceph_dir_vxattrs_name_size = 0; + ceph_file_vxattrs_name_size = 0; +} + static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { @@ -615,11 +660,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) goto out; list_xattr: - vir_namelen = 0; - /* include virtual dir xattrs */ - if (vxattrs) - for (i = 0; vxattrs[i].name; i++) - vir_namelen += strlen(vxattrs[i].name) + 1; + /* + * Start with virtual dir xattr names (if any) (including + * terminating '\0' characters for each). + */ + vir_namelen = ceph_vxattrs_name_size(vxattrs); + /* adding 1 byte per each variable due to the null termination */ namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; From 18fa8b3feaac772925263b04b1429d80e2dfd779 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:28 -0600 Subject: [PATCH 19/65] ceph: make ceph_setxattr() and ceph_removexattr() more alike This patch just rearranges a few bits of code to make more portions of ceph_setxattr() and ceph_removexattr() identical. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 88eaedf78fa9..8294f461ecd1 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -765,15 +765,15 @@ int ceph_setxattr(struct dentry *dentry, const char *name, struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + int issued; int err; + int dirty; int name_len = strlen(name); int val_len = size; char *newname = NULL; char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; - int issued; int required_blob_size; - int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -804,6 +804,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); @@ -812,7 +813,7 @@ retry: if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob = NULL; + struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); @@ -826,12 +827,13 @@ retry: goto retry; } - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); err = __set_xattr(ci, newname, name_len, newval, val_len, 1, 1, 1, &xattr); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -895,13 +897,13 @@ int ceph_removexattr(struct dentry *dentry, const char *name) err = -ENOMEM; spin_lock(&ci->i_ceph_lock); - __build_xattrs(inode); retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -922,10 +924,10 @@ retry: } err = __remove_xattr_by_name(ceph_inode(inode), name); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); From 2107978668de13da484f7abc3f03516494c7fca9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 24 Jan 2012 10:08:36 -0600 Subject: [PATCH 20/65] rbd: a few small cleanups Some minor cleanups in "drivers/block/rbd.c: - Use the more meaningful "RBD_MAX_OBJ_NAME_LEN" in place if "96" in the definition of RBD_MAX_MD_NAME_LEN. - Use DEFINE_SPINLOCK() to define and initialize node_lock. - Drop a needless (char *) cast in parse_rbd_opts_token(). - Make a few minor formatting changes. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a6278e7e61a0..b9371f0b9532 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -46,7 +46,7 @@ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) +#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX)) #define RBD_MAX_POOL_NAME_LEN 64 #define RBD_MAX_SNAP_NAME_LEN 32 #define RBD_MAX_OPT_LEN 1024 @@ -175,7 +175,7 @@ static struct bus_type rbd_bus_type = { .name = "rbd", }; -static spinlock_t node_lock; /* protects client get/put */ +static DEFINE_SPINLOCK(node_lock); /* protects client get/put */ static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ static LIST_HEAD(rbd_dev_list); /* devices */ @@ -324,7 +324,7 @@ static int parse_rbd_opts_token(char *c, void *private) substring_t argstr[MAX_OPT_ARGS]; int token, intval, ret; - token = match_token((char *)c, rbdopt_tokens, argstr); + token = match_token(c, rbdopt_tokens, argstr); if (token < 0) return -EINVAL; @@ -372,7 +372,8 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; ret = ceph_parse_options(&opt, options, mon_addr, - mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); + mon_addr + strlen(mon_addr), + parse_rbd_opts_token, rbd_opts); if (ret < 0) goto done_err; @@ -460,15 +461,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, u32 snap_count = le32_to_cpu(ondisk->snap_count); int ret = -ENOMEM; - if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) { + if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) return -ENXIO; - } init_rwsem(&header->snap_rwsem); header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); header->snapc = kmalloc(sizeof(struct ceph_snap_context) + - snap_count * - sizeof(struct rbd_image_snap_ondisk), + snap_count * sizeof (*ondisk), gfp_flags); if (!header->snapc) return -ENOMEM; @@ -498,8 +497,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snapc->num_snaps = snap_count; header->total_snaps = snap_count; - if (snap_count && - allocated_snaps == snap_count) { + if (snap_count && allocated_snaps == snap_count) { for (i = 0; i < snap_count; i++) { header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); @@ -2423,7 +2421,7 @@ static int rbd_sysfs_init(void) rbd_bus_type.bus_attrs = rbd_bus_attrs; ret = bus_register(&rbd_bus_type); - if (ret < 0) + if (ret < 0) return ret; ret = device_register(&rbd_root_dev); @@ -2444,7 +2442,6 @@ int __init rbd_init(void) rc = rbd_sysfs_init(); if (rc) return rc; - spin_lock_init(&node_lock); pr_info("loaded " DRV_NAME_LONG "\n"); return 0; } From ee57741c5209154b8ef124bcaa2496da1b69a988 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 24 Jan 2012 10:08:36 -0600 Subject: [PATCH 21/65] rbd: make ceph_parse_options() return a pointer ceph_parse_options() takes the address of a pointer as an argument and uses it to return the address of an allocated structure if successful. With this interface is not evident at call sites that the pointer is always initialized. Change the interface to return the address instead (or a pointer-coded error code) to make the validity of the returned pointer obvious. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 6 ++++-- fs/ceph/super.c | 6 ++++-- include/linux/ceph/libceph.h | 2 +- net/ceph/ceph_common.c | 16 ++++++++-------- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b9371f0b9532..ed6711e35323 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -371,11 +371,13 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; - ret = ceph_parse_options(&opt, options, mon_addr, + opt = ceph_parse_options(options, mon_addr, mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); - if (ret < 0) + if (IS_ERR(opt)) { + ret = PTR_ERR(opt); goto done_err; + } spin_lock(&node_lock); rbdc = __rbd_client_find(opt); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3da3b32bdde..4fab1fdcfa6a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -334,10 +334,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, *path += 2; dout("server path '%s'\n", *path); - err = ceph_parse_options(popt, options, dev_name, dev_name_end, + *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); - if (err) + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); goto out; + } /* success */ *pfsopt = fsopt; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 95bd8502e715..92eef7c3d3c5 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -207,7 +207,7 @@ extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; -extern int ceph_parse_options(struct ceph_options **popt, char *options, +extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, int (*parse_extra_token)(char *c, void *private), void *private); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 761ad9d6cc3b..621c3221b393 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -277,10 +277,11 @@ out: return err; } -int ceph_parse_options(struct ceph_options **popt, char *options, - const char *dev_name, const char *dev_name_end, - int (*parse_extra_token)(char *c, void *private), - void *private) +struct ceph_options * +ceph_parse_options(char *options, const char *dev_name, + const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private) { struct ceph_options *opt; const char *c; @@ -289,7 +290,7 @@ int ceph_parse_options(struct ceph_options **popt, char *options, opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) - return err; + return ERR_PTR(-ENOMEM); opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), GFP_KERNEL); if (!opt->mon_addr) @@ -412,12 +413,11 @@ int ceph_parse_options(struct ceph_options **popt, char *options, } /* success */ - *popt = opt; - return 0; + return opt; out: ceph_destroy_options(opt); - return err; + return ERR_PTR(err); } EXPORT_SYMBOL(ceph_parse_options); From 1dbb439913f0fc0bc30d36411a4a3b3202c0aab1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 24 Jan 2012 10:08:37 -0600 Subject: [PATCH 22/65] rbd: do not duplicate ceph_client pointer in rbd_device The rbd_device structure maintains a duplicate copy of the ceph_client pointer maintained in its rbd_client structure. There appears to be no good reason for this, and its presence presents a risk of them getting out of synch or otherwise misused. So kill it off, and use the rbd_client copy only. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ed6711e35323..dcdfe8dbf4f1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -140,7 +140,6 @@ struct rbd_device { struct gendisk *disk; /* blkdev's gendisk and rq */ struct request_queue *q; - struct ceph_client *client; struct rbd_client *rbd_client; char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ @@ -388,7 +387,6 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, /* using an existing client */ kref_get(&rbdc->kref); rbd_dev->rbd_client = rbdc; - rbd_dev->client = rbdc->client; spin_unlock(&node_lock); return 0; } @@ -401,7 +399,6 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, } rbd_dev->rbd_client = rbdc; - rbd_dev->client = rbdc->client; return 0; done_err: kfree(rbd_opts); @@ -435,7 +432,6 @@ static void rbd_put_client(struct rbd_device *rbd_dev) kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); spin_unlock(&node_lock); rbd_dev->rbd_client = NULL; - rbd_dev->client = NULL; } /* @@ -858,6 +854,7 @@ static int rbd_do_request(struct request *rq, struct rbd_request *req_data; struct ceph_osd_request_head *reqhead; struct rbd_image_header *header = &dev->header; + struct ceph_osd_client *osdc; req_data = kzalloc(sizeof(*req_data), GFP_NOIO); if (!req_data) { @@ -876,11 +873,9 @@ static int rbd_do_request(struct request *rq, down_read(&header->snap_rwsem); - req = ceph_osdc_alloc_request(&dev->client->osdc, flags, - snapc, - ops, - false, - GFP_NOIO, pages, bio); + osdc = &dev->rbd_client->client->osdc; + req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + false, GFP_NOIO, pages, bio); if (!req) { up_read(&header->snap_rwsem); ret = -ENOMEM; @@ -909,8 +904,8 @@ static int rbd_do_request(struct request *rq, layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_pg_preferred = cpu_to_le32(-1); layout->fl_pg_pool = cpu_to_le32(dev->poolid); - ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, - ofs, &len, &bno, req, ops); + ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, + req, ops); ceph_osdc_build_request(req, ofs, &len, ops, @@ -920,16 +915,16 @@ static int rbd_do_request(struct request *rq, up_read(&header->snap_rwsem); if (linger_req) { - ceph_osdc_set_request_linger(&dev->client->osdc, req); + ceph_osdc_set_request_linger(osdc, req); *linger_req = req; } - ret = ceph_osdc_start_request(&dev->client->osdc, req, false); + ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) goto done_err; if (!rbd_cb) { - ret = ceph_osdc_wait_request(&dev->client->osdc, req); + ret = ceph_osdc_wait_request(osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); dout("reassert_ver=%lld\n", @@ -1227,7 +1222,7 @@ static int rbd_req_sync_watch(struct rbd_device *dev, u64 ver) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->client->osdc; + struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); if (ret < 0) @@ -1314,7 +1309,7 @@ static int rbd_req_sync_notify(struct rbd_device *dev, const char *obj) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->client->osdc; + struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; struct ceph_osd_event *event; struct rbd_notify_info info; int payload_len = sizeof(u32) + sizeof(u32); @@ -1623,13 +1618,14 @@ static int rbd_header_add_snap(struct rbd_device *dev, int ret; void *data, *p, *e; u64 ver; + struct ceph_mon_client *monc; /* we should create a snapshot only if we're pointing at the head */ if (dev->cur_snap) return -EINVAL; - ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, - &new_snapid); + monc = &dev->rbd_client->client->monc; + ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid); dout("created snapid=%lld\n", new_snapid); if (ret < 0) return ret; @@ -1809,7 +1805,8 @@ static ssize_t rbd_client_id_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd(dev); - return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client)); + return sprintf(buf, "client%lld\n", + ceph_client_id(rbd_dev->rbd_client->client)); } static ssize_t rbd_pool_show(struct device *dev, @@ -2233,7 +2230,7 @@ static ssize_t rbd_add(struct bus_type *bus, mutex_unlock(&ctl_mutex); /* pick the pool */ - osdc = &rbd_dev->client->osdc; + osdc = &rbd_dev->rbd_client->client->osdc; rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); if (rc < 0) goto err_out_client; @@ -2312,9 +2309,12 @@ static void rbd_dev_release(struct device *dev) struct rbd_device *rbd_dev = container_of(dev, struct rbd_device, dev); - if (rbd_dev->watch_request) - ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, + if (rbd_dev->watch_request) { + struct ceph_client *client = rbd_dev->rbd_client->client; + + ceph_osdc_unregister_linger_request(&client->osdc, rbd_dev->watch_request); + } if (rbd_dev->watch_event) rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); From cc9d734c3d1b39c6a557673469aea26364060226 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 21 Nov 2011 18:19:13 -0800 Subject: [PATCH 23/65] rbd: use a single value of snap_name to mean no snap There's already a constant for this anyway. Since rbd_header_set_snap() is only used to set the rbd device snap_name field, just do that within that function rather than having it take the snap_name as an argument. Signed-off-by: Alex Elder Signed-off-by: Sage Weil v2: Changed interface rbd_header_set_snap() so it explicitly updates the snap_name in the rbd_device. Also added a BUILD_BUG_ON() to verify the size of the snap_name field is sufficient for SNAP_HEAD_NAME. --- drivers/block/rbd.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index dcdfe8dbf4f1..d8d052d42258 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -553,20 +553,18 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, return i; } -static int rbd_header_set_snap(struct rbd_device *dev, - const char *snap_name, - u64 *size) +static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) { struct rbd_image_header *header = &dev->header; struct ceph_snap_context *snapc = header->snapc; int ret = -ENOENT; + BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME)); + down_write(&header->snap_rwsem); - if (!snap_name || - !*snap_name || - strcmp(snap_name, "-") == 0 || - strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { + if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME))) { if (header->total_snaps) snapc->seq = header->snap_seq; else @@ -576,7 +574,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, if (size) *size = header->image_size; } else { - ret = snap_by_name(header, snap_name, &snapc->seq, size); + ret = snap_by_name(header, dev->snap_name, &snapc->seq, size); if (ret < 0) goto done; @@ -1729,7 +1727,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (rc) return rc; - rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size); + rc = rbd_header_set_snap(rbd_dev, &total_size); if (rc) return rc; @@ -2215,7 +2213,8 @@ static ssize_t rbd_add(struct bus_type *bus, } if (rbd_dev->snap_name[0] == 0) - rbd_dev->snap_name[0] = '-'; + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME)); rbd_dev->obj_len = strlen(rbd_dev->obj); snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", From b7f23c361b65a0bdcc81acd8d38471b7810df3ff Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:43 -0600 Subject: [PATCH 24/65] rbd: encapsulate new rbd id selection Move the loop that finds a new unique rbd id to use into its own helper function. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d8d052d42258..aaa19d8c3670 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2149,6 +2149,23 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) return ret; } +/* caller must hold ctl_mutex */ +static int rbd_id_get(void) +{ + struct list_head *tmp; + int new_id = 0; + + list_for_each(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->id >= new_id) + new_id = rbd_dev->id + 1; + } + + return new_id; +} + static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) @@ -2156,8 +2173,7 @@ static ssize_t rbd_add(struct bus_type *bus, struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; ssize_t rc = -ENOMEM; - int irc, new_id = 0; - struct list_head *tmp; + int irc; char *mon_dev_name; char *options; @@ -2187,15 +2203,7 @@ static ssize_t rbd_add(struct bus_type *bus, /* generate unique id: find highest unique id, add one */ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - list_for_each(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; - - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id >= new_id) - new_id = rbd_dev->id + 1; - } - - rbd_dev->id = new_id; + rbd_dev->id = rbd_id_get(); /* add to global list */ list_add_tail(&rbd_dev->node, &rbd_dev_list); From 1ddbe94eda58597cb6dd464b455cb62d3f68be7b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 25/65] rbd: rework calculation of new rbd id's In order to select a new unique identifier for an added rbd device, the list of all existing ones is searched and a value one greater than the highest id is used. The list search can be avoided by using an atomic variable that keeps track of the current highest id. Using a get/put model for id's we can limit the boundless growth of id numbers a bit by arranging to reuse the current highest id once it gets released. Add these calls to "put" the id when an rbd is getting removed. Note that this changes the pattern of device id's used--new values will never be below the highest one seen so far (even if there exists an unused lower one). I assert this is OK because the key property of an rbd id is its uniqueness, not its magnitude. Regardless, a follow-on patch will restore the old way of doing things, I just think this commit just makes the incremental change to atomics a little easier to understand. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aaa19d8c3670..62da8ccc5fcb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2149,21 +2149,29 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) return ret; } -/* caller must hold ctl_mutex */ +static atomic64_t rbd_id_max = ATOMIC64_INIT(0); + +/* + * Get a unique rbd identifier. The minimum rbd id is 1. + */ static int rbd_id_get(void) { - struct list_head *tmp; - int new_id = 0; + return atomic64_inc_return(&rbd_id_max); +} - list_for_each(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; +/* + * Record that an rbd identifier is no longer in use. + */ +static void rbd_id_put(int rbd_id) +{ + BUG_ON(rbd_id < 1); - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id >= new_id) - new_id = rbd_dev->id + 1; - } - - return new_id; + /* + * New id's are always one more than the current maximum. + * If the id being "put" *is* that maximum, decrement the + * maximum so the next one requested just reuses this one. + */ + atomic64_cmpxchg(&rbd_id_max, rbd_id, rbd_id - 1); } static ssize_t rbd_add(struct bus_type *bus, @@ -2200,7 +2208,7 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header.snap_rwsem); - /* generate unique id: find highest unique id, add one */ + /* generate unique id: one more than highest used so far */ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rbd_dev->id = rbd_id_get(); @@ -2270,6 +2278,7 @@ err_out_bus: mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); list_del_init(&rbd_dev->node); mutex_unlock(&ctl_mutex); + rbd_id_put(target_id); /* this will also clean up rest of rbd_dev stuff */ @@ -2286,6 +2295,7 @@ err_out_client: err_out_slot: list_del_init(&rbd_dev->node); mutex_unlock(&ctl_mutex); + rbd_id_put(target_id); kfree(rbd_dev); err_out_opt: @@ -2363,6 +2373,7 @@ static ssize_t rbd_remove(struct bus_type *bus, } list_del_init(&rbd_dev->node); + rbd_id_put(target_id); __rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); From e124a82f3c4efc2cc2bae68a2bf30020fb8c4fc2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 26/65] rbd: protect the rbd_dev_list with a spinlock The rbd_dev_list is just a simple list of all the current rbd_devices. Using the ctl_mutex as a concurrency guard is overkill. Instead, use a spinlock for that specific purpose. This also reduces the window that the ctl_mutex needs to be held in rbd_add(). Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 62da8ccc5fcb..e259feedc7d0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -174,11 +174,13 @@ static struct bus_type rbd_bus_type = { .name = "rbd", }; -static DEFINE_SPINLOCK(node_lock); /* protects client get/put */ - static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ + static LIST_HEAD(rbd_dev_list); /* devices */ +static DEFINE_SPINLOCK(rbd_dev_list_lock); + static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(node_lock); /* protects client get/put */ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); @@ -2209,12 +2211,12 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header.snap_rwsem); /* generate unique id: one more than highest used so far */ - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbd_dev->id = rbd_id_get(); /* add to global list */ + spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); + spin_unlock(&rbd_dev_list_lock); /* parse add command */ if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " @@ -2238,12 +2240,14 @@ static ssize_t rbd_add(struct bus_type *bus, /* initialize rest of new object */ snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rc = rbd_get_client(rbd_dev, mon_dev_name, options); + mutex_unlock(&ctl_mutex); + if (rc < 0) goto err_out_slot; - mutex_unlock(&ctl_mutex); - /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); @@ -2275,9 +2279,9 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_bus: - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); - mutex_unlock(&ctl_mutex); + spin_unlock(&rbd_dev_list_lock); rbd_id_put(target_id); /* this will also clean up rest of rbd_dev stuff */ @@ -2291,10 +2295,10 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_client: rbd_put_client(rbd_dev); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); err_out_slot: + spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); - mutex_unlock(&ctl_mutex); + spin_unlock(&rbd_dev_list_lock); rbd_id_put(target_id); kfree(rbd_dev); @@ -2313,11 +2317,15 @@ static struct rbd_device *__rbd_get_dev(unsigned long id) struct list_head *tmp; struct rbd_device *rbd_dev; + spin_lock(&rbd_dev_list_lock); list_for_each(tmp, &rbd_dev_list) { rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id == id) + if (rbd_dev->id == id) { + spin_unlock(&rbd_dev_list_lock); return rbd_dev; + } } + spin_unlock(&rbd_dev_list_lock); return NULL; } @@ -2372,7 +2380,10 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } + spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); + spin_unlock(&rbd_dev_list_lock); + rbd_id_put(target_id); __rbd_remove_all_snaps(rbd_dev); From 499afd5b8e742792fda6bd7730c738ad83aecf6b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:29 -0600 Subject: [PATCH 27/65] rbd: tie rbd_dev_list changes to rbd_id operations The only time entries are added to or removed from the global rbd_dev_list is exactly when a "put" or "get" operation is being performed on a rbd_dev's id. So just move the list management code into get/put routines. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 47 ++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e259feedc7d0..e7727e8337fc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2154,26 +2154,36 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) static atomic64_t rbd_id_max = ATOMIC64_INIT(0); /* - * Get a unique rbd identifier. The minimum rbd id is 1. + * Get a unique rbd identifier for the given new rbd_dev, and add + * the rbd_dev to the global list. The minimum rbd id is 1. */ -static int rbd_id_get(void) +static void rbd_id_get(struct rbd_device *rbd_dev) { - return atomic64_inc_return(&rbd_id_max); + rbd_dev->id = atomic64_inc_return(&rbd_id_max); + + spin_lock(&rbd_dev_list_lock); + list_add_tail(&rbd_dev->node, &rbd_dev_list); + spin_unlock(&rbd_dev_list_lock); } /* - * Record that an rbd identifier is no longer in use. + * Remove an rbd_dev from the global list, and record that its + * identifier is no longer in use. */ -static void rbd_id_put(int rbd_id) +static void rbd_id_put(struct rbd_device *rbd_dev) { - BUG_ON(rbd_id < 1); + BUG_ON(rbd_dev->id < 1); + + spin_lock(&rbd_dev_list_lock); + list_del_init(&rbd_dev->node); + spin_unlock(&rbd_dev_list_lock); /* * New id's are always one more than the current maximum. * If the id being "put" *is* that maximum, decrement the * maximum so the next one requested just reuses this one. */ - atomic64_cmpxchg(&rbd_id_max, rbd_id, rbd_id - 1); + atomic64_cmpxchg(&rbd_id_max, rbd_dev->id, rbd_dev->id - 1); } static ssize_t rbd_add(struct bus_type *bus, @@ -2211,12 +2221,7 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header.snap_rwsem); /* generate unique id: one more than highest used so far */ - rbd_dev->id = rbd_id_get(); - - /* add to global list */ - spin_lock(&rbd_dev_list_lock); - list_add_tail(&rbd_dev->node, &rbd_dev_list); - spin_unlock(&rbd_dev_list_lock); + rbd_id_get(rbd_dev); /* parse add command */ if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " @@ -2279,10 +2284,7 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_bus: - spin_lock(&rbd_dev_list_lock); - list_del_init(&rbd_dev->node); - spin_unlock(&rbd_dev_list_lock); - rbd_id_put(target_id); + rbd_id_put(rbd_dev); /* this will also clean up rest of rbd_dev stuff */ @@ -2296,10 +2298,7 @@ err_out_blkdev: err_out_client: rbd_put_client(rbd_dev); err_out_slot: - spin_lock(&rbd_dev_list_lock); - list_del_init(&rbd_dev->node); - spin_unlock(&rbd_dev_list_lock); - rbd_id_put(target_id); + rbd_id_put(rbd_dev); kfree(rbd_dev); err_out_opt: @@ -2380,11 +2379,7 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - spin_lock(&rbd_dev_list_lock); - list_del_init(&rbd_dev->node); - spin_unlock(&rbd_dev_list_lock); - - rbd_id_put(target_id); + rbd_id_put(rbd_dev); __rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); From d184f6bfde1428ad4a690d49b28afc9ab4d57b35 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 28/65] rbd: restore previous rbd id sequence behavior It used to be that selecting a new unique identifier for an added rbd device required searching all existing ones to find the highest id is used. A recent change made that unnecessary, but made it so that id's used were monotonically non-decreasing. It's a bit more pleasant to have smaller rbd id's though, and this change makes ids get allocated as they were before--each new id is one more than the maximum currently in use. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e7727e8337fc..9ac1484a95ad 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2172,18 +2172,46 @@ static void rbd_id_get(struct rbd_device *rbd_dev) */ static void rbd_id_put(struct rbd_device *rbd_dev) { - BUG_ON(rbd_dev->id < 1); + struct list_head *tmp; + int rbd_id = rbd_dev->id; + int max_id; + + BUG_ON(rbd_id < 1); spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); + + /* + * If the id being "put" is not the current maximum, there + * is nothing special we need to do. + */ + if (rbd_id != atomic64_read(&rbd_id_max)) { + spin_unlock(&rbd_dev_list_lock); + return; + } + + /* + * We need to update the current maximum id. Search the + * list to find out what it is. We're more likely to find + * the maximum at the end, so search the list backward. + */ + max_id = 0; + list_for_each_prev(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_id > max_id) + max_id = rbd_id; + } spin_unlock(&rbd_dev_list_lock); /* - * New id's are always one more than the current maximum. - * If the id being "put" *is* that maximum, decrement the - * maximum so the next one requested just reuses this one. + * The max id could have been updated by rbd_id_get(), in + * which case it now accurately reflects the new maximum. + * Be careful not to overwrite the maximum value in that + * case. */ - atomic64_cmpxchg(&rbd_id_max, rbd_dev->id, rbd_dev->id - 1); + atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); } static ssize_t rbd_add(struct bus_type *bus, @@ -2220,7 +2248,7 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header.snap_rwsem); - /* generate unique id: one more than highest used so far */ + /* generate unique id: find highest unique id, add one */ rbd_id_get(rbd_dev); /* parse add command */ From e6994d3ddedf1a9f35cb43655bb4b5810e71199a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 29/65] rbd: release client list lock sooner In rbd_get_client(), if a client is reused, a number of things get done while still holding the list lock unnecessarily. This just moves a few things that need no lock protection outside the lock. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9ac1484a95ad..bccd350a0323 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -383,13 +383,15 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, spin_lock(&node_lock); rbdc = __rbd_client_find(opt); if (rbdc) { + /* using an existing client */ + kref_get(&rbdc->kref); + spin_unlock(&node_lock); + + rbd_dev->rbd_client = rbdc; + ceph_destroy_options(opt); kfree(rbd_opts); - /* using an existing client */ - kref_get(&rbdc->kref); - rbd_dev->rbd_client = rbdc; - spin_unlock(&node_lock); return 0; } spin_unlock(&node_lock); From d97081b0c7bdb55371994cc6690217bf393eb63e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 30/65] rbd: move ctl_mutex lock inside rbd_get_client() Since rbd_get_client() is only called in one place, move the acquisition of the mutex around that call inside that function. Furthermore, within rbd_get_client(), it appears the mutex only needs to be held while calling rbd_client_create(). (Moving the lock inside that function will wait for the next patch.) Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bccd350a0323..7f60ff28ac29 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -396,7 +396,10 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, } spin_unlock(&node_lock); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rbdc = rbd_client_create(opt, rbd_opts); + mutex_unlock(&ctl_mutex); + if (IS_ERR(rbdc)) { ret = PTR_ERR(rbdc); goto done_err; @@ -2276,10 +2279,7 @@ static ssize_t rbd_add(struct bus_type *bus, /* initialize rest of new object */ snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rc = rbd_get_client(rbd_dev, mon_dev_name, options); - mutex_unlock(&ctl_mutex); - if (rc < 0) goto err_out_slot; From bc534d86be71aaf8dfac46131420ab1c47387d42 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 31/65] rbd: move ctl_mutex lock inside rbd_client_create() Since rbd_client_create() is only called in one place, move the acquisition of the mutex around that call inside that function. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7f60ff28ac29..38174bffa049 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -257,9 +257,11 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + rbdc->client = ceph_create_client(opt, rbdc, 0, 0); if (IS_ERR(rbdc->client)) - goto out_rbdc; + goto out_mutex; opt = NULL; /* Now rbdc->client is responsible for opt */ ret = ceph_open_session(rbdc->client); @@ -272,12 +274,15 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, list_add_tail(&rbdc->node, &rbd_client_list); spin_unlock(&node_lock); + mutex_unlock(&ctl_mutex); + dout("rbd_client_create created %p\n", rbdc); return rbdc; out_err: ceph_destroy_client(rbdc->client); -out_rbdc: +out_mutex: + mutex_unlock(&ctl_mutex); kfree(rbdc); out_opt: if (opt) @@ -396,9 +401,7 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, } spin_unlock(&node_lock); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rbdc = rbd_client_create(opt, rbd_opts); - mutex_unlock(&ctl_mutex); if (IS_ERR(rbdc)) { ret = PTR_ERR(rbdc); From 432b858749631dc011ac919dace4b0705ba8cecf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 32/65] rbd: rename "node_lock" The spinlock used to protect rbd_client_list is named "node_lock". Rename it to "rbd_client_list_lock" to make it more obvious what it's for. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 38174bffa049..92b8c374d84d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -179,8 +179,8 @@ static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ static LIST_HEAD(rbd_dev_list); /* devices */ static DEFINE_SPINLOCK(rbd_dev_list_lock); -static LIST_HEAD(rbd_client_list); /* clients */ -static DEFINE_SPINLOCK(node_lock); /* protects client get/put */ +static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(rbd_client_list_lock); static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); @@ -270,9 +270,9 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, rbdc->rbd_opts = rbd_opts; - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); list_add_tail(&rbdc->node, &rbd_client_list); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); mutex_unlock(&ctl_mutex); @@ -385,12 +385,12 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, goto done_err; } - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); rbdc = __rbd_client_find(opt); if (rbdc) { /* using an existing client */ kref_get(&rbdc->kref); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); rbd_dev->rbd_client = rbdc; @@ -399,7 +399,7 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, return 0; } - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); rbdc = rbd_client_create(opt, rbd_opts); @@ -418,7 +418,7 @@ done_err: /* * Destroy ceph client * - * Caller must hold node_lock. + * Caller must hold rbd_client_list_lock. */ static void rbd_client_release(struct kref *kref) { @@ -438,9 +438,9 @@ static void rbd_client_release(struct kref *kref) */ static void rbd_put_client(struct rbd_device *rbd_dev) { - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); rbd_dev->rbd_client = NULL; } From f0f8cef5a30504eaeba5588a9115b46c824d91a3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:44 -0600 Subject: [PATCH 33/65] rbd: a few simple changes Here are a few very simple cleanups: - Add a "RBD_" prefix to the two driver name string definitions. - Move the definition of struct rbd_request below struct rbd_req_coll to avoid the need for an empty declaration of the latter. - Move and group the definitions of rbd_root_dev_release() and rbd_root_dev, as well as rbd_bus_type and rbd_bus_attrs[], close to the top of the file. Arrange the latter so rbd_bus_type.bus_attrs can be initialized statically. - Get rid of an unnecessary local variable in rbd_open(). - Rework some hokey logic in rbd_bus_add_dev(), so the value of "ret" at the end is either 0 or -ENOENT to avoid the need for the code duplication that was there. - Rename a goto target in rbd_add(). Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 107 ++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 92b8c374d84d..812fd38cba3d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,8 +41,8 @@ #include "rbd_types.h" -#define DRV_NAME "rbd" -#define DRV_NAME_LONG "rbd (rados block device)" +#define RBD_DRV_NAME "rbd" +#define RBD_DRV_NAME_LONG "rbd (rados block device)" #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ @@ -83,7 +83,7 @@ struct rbd_options { }; /* - * an instance of the client. multiple devices may share a client. + * an instance of the client. multiple devices may share an rbd client. */ struct rbd_client { struct ceph_client *client; @@ -92,20 +92,9 @@ struct rbd_client { struct list_head node; }; -struct rbd_req_coll; - /* - * a single io request + * a request completion status */ -struct rbd_request { - struct request *rq; /* blk layer request */ - struct bio *bio; /* cloned bio */ - struct page **pages; /* list of used pages */ - u64 len; - int coll_index; - struct rbd_req_coll *coll; -}; - struct rbd_req_status { int done; int rc; @@ -122,6 +111,18 @@ struct rbd_req_coll { struct rbd_req_status status[0]; }; +/* + * a single io request + */ +struct rbd_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct page **pages; /* list of used pages */ + u64 len; + int coll_index; + struct rbd_req_coll *coll; +}; + struct rbd_snap { struct device dev; const char *name; @@ -170,10 +171,6 @@ struct rbd_device { struct device dev; }; -static struct bus_type rbd_bus_type = { - .name = "rbd", -}; - static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ static LIST_HEAD(rbd_dev_list); /* devices */ @@ -191,6 +188,31 @@ static ssize_t rbd_snap_add(struct device *dev, static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, struct rbd_snap *snap); +static ssize_t rbd_add(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove(struct bus_type *bus, const char *buf, + size_t count); + +static struct bus_attribute rbd_bus_attrs[] = { + __ATTR(add, S_IWUSR, NULL, rbd_add), + __ATTR(remove, S_IWUSR, NULL, rbd_remove), + __ATTR_NULL +}; + +static struct bus_type rbd_bus_type = { + .name = "rbd", + .bus_attrs = rbd_bus_attrs, +}; + +static void rbd_root_dev_release(struct device *dev) +{ +} + +static struct device rbd_root_dev = { + .init_name = "rbd", + .release = rbd_root_dev_release, +}; + static struct rbd_device *dev_to_rbd(struct device *dev) { @@ -211,8 +233,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev); static int rbd_open(struct block_device *bdev, fmode_t mode) { - struct gendisk *disk = bdev->bd_disk; - struct rbd_device *rbd_dev = disk->private_data; + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; rbd_get_dev(rbd_dev); @@ -1216,8 +1237,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) rc = __rbd_update_snaps(dev); mutex_unlock(&ctl_mutex); if (rc) - pr_warning(DRV_NAME "%d got notification but failed to update" - " snaps: %d\n", dev->major, rc); + pr_warning(RBD_DRV_NAME "%d got notification but failed to " + " update snaps: %d\n", dev->major, rc); rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); } @@ -1747,7 +1768,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!disk) goto out; - snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d", + snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->id); disk->major = rbd_dev->major; disk->first_minor = 0; @@ -2093,19 +2114,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) return 0; } - -static void rbd_root_dev_release(struct device *dev) -{ -} - -static struct device rbd_root_dev = { - .init_name = "rbd", - .release = rbd_root_dev_release, -}; - static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { - int ret = -ENOMEM; + int ret; struct device *dev; struct rbd_snap *snap; @@ -2119,7 +2130,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev_set_name(dev, "%d", rbd_dev->id); ret = device_register(dev); if (ret < 0) - goto done_free; + goto out; list_for_each_entry(snap, &rbd_dev->snaps, node) { ret = rbd_register_snap_dev(rbd_dev, snap, @@ -2127,10 +2138,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) if (ret < 0) break; } - - mutex_unlock(&ctl_mutex); - return 0; -done_free: +out: mutex_unlock(&ctl_mutex); return ret; } @@ -2268,7 +2276,7 @@ static ssize_t rbd_add(struct bus_type *bus, mon_dev_name, options, rbd_dev->pool_name, rbd_dev->obj, rbd_dev->snap_name) < 4) { rc = -EINVAL; - goto err_out_slot; + goto err_put_id; } if (rbd_dev->snap_name[0] == 0) @@ -2280,11 +2288,11 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev->obj, RBD_SUFFIX); /* initialize rest of new object */ - snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); + snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id); rc = rbd_get_client(rbd_dev, mon_dev_name, options); if (rc < 0) - goto err_out_slot; + goto err_put_id; /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; @@ -2330,9 +2338,8 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_client: rbd_put_client(rbd_dev); -err_out_slot: +err_put_id: rbd_id_put(rbd_dev); - kfree(rbd_dev); err_out_opt: kfree(options); @@ -2463,12 +2470,6 @@ err_unlock: return ret; } -static struct bus_attribute rbd_bus_attrs[] = { - __ATTR(add, S_IWUSR, NULL, rbd_add), - __ATTR(remove, S_IWUSR, NULL, rbd_remove), - __ATTR_NULL -}; - /* * create control files in sysfs * /sys/bus/rbd/... @@ -2477,8 +2478,6 @@ static int rbd_sysfs_init(void) { int ret; - rbd_bus_type.bus_attrs = rbd_bus_attrs; - ret = bus_register(&rbd_bus_type); if (ret < 0) return ret; @@ -2501,7 +2500,7 @@ int __init rbd_init(void) rc = rbd_sysfs_init(); if (rc) return rc; - pr_info("loaded " DRV_NAME_LONG "\n"); + pr_info("loaded " RBD_DRV_NAME_LONG "\n"); return 0; } From d720bcb0a8f246eb441ba9d4f341bc16746556c6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 34/65] rbd: have rbd_get_client() return a rbd_client Since rbd_get_client() currently returns an error code. It assigns the rbd_client field of the rbd_device structure it is passed if successful. Instead, have it return the created rbd_client structure and return a pointer-coded error if there is an error. This makes the assignment of the client pointer more obvious at the call site. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 812fd38cba3d..3e6f300ba9f1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -384,17 +384,15 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, - char *options) +static struct rbd_client *rbd_get_client(const char *mon_addr, char *options) { struct rbd_client *rbdc; struct ceph_options *opt; - int ret; struct rbd_options *rbd_opts; rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); if (!rbd_opts) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; @@ -402,8 +400,8 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); if (IS_ERR(opt)) { - ret = PTR_ERR(opt); - goto done_err; + kfree(rbd_opts); + return ERR_CAST(opt); } spin_lock(&rbd_client_list_lock); @@ -413,27 +411,19 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, kref_get(&rbdc->kref); spin_unlock(&rbd_client_list_lock); - rbd_dev->rbd_client = rbdc; - ceph_destroy_options(opt); kfree(rbd_opts); - return 0; + return rbdc; } spin_unlock(&rbd_client_list_lock); rbdc = rbd_client_create(opt, rbd_opts); - if (IS_ERR(rbdc)) { - ret = PTR_ERR(rbdc); - goto done_err; - } + if (IS_ERR(rbdc)) + kfree(rbd_opts); - rbd_dev->rbd_client = rbdc; - return 0; -done_err: - kfree(rbd_opts); - return ret; + return rbdc; } /* @@ -2290,9 +2280,11 @@ static ssize_t rbd_add(struct bus_type *bus, /* initialize rest of new object */ snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id); - rc = rbd_get_client(rbd_dev, mon_dev_name, options); - if (rc < 0) + rbd_dev->rbd_client = rbd_get_client(mon_dev_name, options); + if (IS_ERR(rbd_dev->rbd_client)) { + rc = PTR_ERR(rbd_dev->rbd_client); goto err_put_id; + } /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; From 60571c7d556b10db7e555bd4b6765647af5c41e8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 35/65] rbd: reduce memory used for rbd_dev fields The length of the string containing the monitor address specification(s) will never exceed the length of the string passed in to rbd_add(). The same holds true for the ceph + rbd options string. So reduce the amount of memory allocated for these to that length rather than the maximum (1024 bytes). Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3e6f300ba9f1..606d59aece2b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2234,11 +2234,11 @@ static ssize_t rbd_add(struct bus_type *bus, if (!try_module_get(THIS_MODULE)) return -ENODEV; - mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + mon_dev_name = kmalloc(count, GFP_KERNEL); if (!mon_dev_name) goto err_out_mod; - options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + options = kmalloc(count, GFP_KERNEL); if (!options) goto err_mon_dev; From 27cc25943fb359241546e7bb7a3ab1c2f35796a2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 36/65] rbd: simplify error handling in rbd_add() If a couple pointers are initialized to NULL then a single "out_nomem" label can be used for all of the memory allocation failure cases in rbd_add(). Also, get rid of the "irc" local variable there. There is no real need for "rc" to be type ssize_t, and it can be used in the spot "irc" was. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 606d59aece2b..8ac26ab09aa0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2224,28 +2224,24 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { - struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; - ssize_t rc = -ENOMEM; - int irc; - char *mon_dev_name; - char *options; + char *mon_dev_name = NULL; + char *options = NULL; + struct ceph_osd_client *osdc; + int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) return -ENODEV; - mon_dev_name = kmalloc(count, GFP_KERNEL); - if (!mon_dev_name) - goto err_out_mod; - - options = kmalloc(count, GFP_KERNEL); - if (!options) - goto err_mon_dev; - - /* new rbd_device object */ rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) - goto err_out_opt; + goto err_nomem; + mon_dev_name = kmalloc(count, GFP_KERNEL); + if (!mon_dev_name) + goto err_nomem; + options = kmalloc(count, GFP_KERNEL); + if (!options) + goto err_nomem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -2294,12 +2290,10 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev->poolid = rc; /* register our block device */ - irc = register_blkdev(0, rbd_dev->name); - if (irc < 0) { - rc = irc; + rc = register_blkdev(0, rbd_dev->name); + if (rc < 0) goto err_out_client; - } - rbd_dev->major = irc; + rbd_dev->major = rc; rc = rbd_bus_add_dev(rbd_dev); if (rc) @@ -2332,15 +2326,15 @@ err_out_client: rbd_put_client(rbd_dev); err_put_id: rbd_id_put(rbd_dev); - kfree(rbd_dev); -err_out_opt: +err_nomem: kfree(options); -err_mon_dev: kfree(mon_dev_name); -err_out_mod: + kfree(rbd_dev); + dout("Error adding device %s\n", buf); module_put(THIS_MODULE); - return rc; + + return (ssize_t) rc; } static struct rbd_device *__rbd_get_dev(unsigned long id) From a725f65e52de73defb3c7033c471c48c56ca6cdd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 37/65] rbd: encapsulate argument parsing for rbd_add() Move the code that parses the arguments provided to rbd_add() (which are supplied via /sys/bus/rbd/add) into a separate function. Also rename the "mon_dev_name" variable in rbd_add() to be "mon_addrs". The variable represents a list of one or more comma-separated monitor IP addresses, each with an optional port number. I think "mon_addrs" captures that notion a little better. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 70 ++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8ac26ab09aa0..caafe1d87a4b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2220,12 +2220,43 @@ static void rbd_id_put(struct rbd_device *rbd_dev) atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); } +/* + * This fills in the pool_name, obj, obj_len, snap_name, obj_len, + * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based + * on the list of monitor addresses and other options provided via + * /sys/bus/rbd/add. + */ +static int rbd_add_parse_args(struct rbd_device *rbd_dev, + const char *buf, + char *mon_addrs, + char *options) +{ + if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " + "%" __stringify(RBD_MAX_OPT_LEN) "s " + "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " + "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + mon_addrs, options, rbd_dev->pool_name, + rbd_dev->obj, rbd_dev->snap_name) < 4) + return -EINVAL; + + if (rbd_dev->snap_name[0] == 0) + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME)); + + rbd_dev->obj_len = strlen(rbd_dev->obj); + snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", + rbd_dev->obj, RBD_SUFFIX); + + return 0; +} + static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { struct rbd_device *rbd_dev; - char *mon_dev_name = NULL; + char *mon_addrs = NULL; char *options = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -2236,8 +2267,8 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) goto err_nomem; - mon_dev_name = kmalloc(count, GFP_KERNEL); - if (!mon_dev_name) + mon_addrs = kmalloc(count, GFP_KERNEL); + if (!mon_addrs) goto err_nomem; options = kmalloc(count, GFP_KERNEL); if (!options) @@ -2253,30 +2284,15 @@ static ssize_t rbd_add(struct bus_type *bus, /* generate unique id: find highest unique id, add one */ rbd_id_get(rbd_dev); - /* parse add command */ - if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " - "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" - "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", - mon_dev_name, options, rbd_dev->pool_name, - rbd_dev->obj, rbd_dev->snap_name) < 4) { - rc = -EINVAL; - goto err_put_id; - } - - if (rbd_dev->snap_name[0] == 0) - memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, - sizeof (RBD_SNAP_HEAD_NAME)); - - rbd_dev->obj_len = strlen(rbd_dev->obj); - snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", - rbd_dev->obj, RBD_SUFFIX); - - /* initialize rest of new object */ + /* Fill in the device name, now that we have its id. */ snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id); - rbd_dev->rbd_client = rbd_get_client(mon_dev_name, options); + /* parse add command */ + rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, options); + if (rc) + goto err_put_id; + + rbd_dev->rbd_client = rbd_get_client(mon_addrs, options); if (IS_ERR(rbd_dev->rbd_client)) { rc = PTR_ERR(rbd_dev->rbd_client); goto err_put_id; @@ -2317,7 +2333,7 @@ err_out_bus: rbd_bus_del_dev(rbd_dev); kfree(options); - kfree(mon_dev_name); + kfree(mon_addrs); return rc; err_out_blkdev: @@ -2328,7 +2344,7 @@ err_put_id: rbd_id_put(rbd_dev); err_nomem: kfree(options); - kfree(mon_dev_name); + kfree(mon_addrs); kfree(rbd_dev); dout("Error adding device %s\n", buf); From e28fff268e7d40ea7a936478c97ce41b6c22815f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 38/65] rbd: don't use sscanf() in rbd_add_parse_args() Make use of a few simple helper routines to parse the arguments rather than sscanf(). This will treat both missing and too-long arguments as invalid input (rather than silently truncating the input in the too-long case). In time this can also be used by rbd_add() to use the passed-in buffer in place, rather than copying its contents into new buffers. It appears to me that the sscanf() previously used would not correctly handle a supplied snapshot--the two final "%s" conversion specifications were not separated by a space, and I'm not sure how sscanf() handles that situation. It may not be well-defined. So that may be a bug this change fixes (but I didn't verify that). The sizes of the mon_addrs and options buffers are now passed to rbd_add_parse_args(), so they can be supplied to copy_token(). Signed-off-by: Alex Elder --- drivers/block/rbd.c | 97 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index caafe1d87a4b..085df6765d21 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2220,6 +2220,53 @@ static void rbd_id_put(struct rbd_device *rbd_dev) atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); } +/* + * Skips over white space at *buf, and updates *buf to point to the + * first found non-space character (if any). Returns the length of + * the token (string of non-white space characters) found. + */ +static inline size_t next_token(const char **buf) +{ + /* + * These are the characters that produce nonzero for + * isspace() in the "C" and "POSIX" locales. + */ + const char *spaces = " \f\n\r\t\v"; + + *buf += strspn(*buf, spaces); /* Find start of token */ + + return strcspn(*buf, spaces); /* Return token length */ +} + +/* + * Finds the next token in *buf, and if the provided token buffer is + * big enough, copies the found token into it. The result, if + * copied, is guaranteed to be terminated with '\0'. + * + * Returns the length of the token found (not including the '\0'). + * Return value will be 0 if no token is found, and it will be >= + * token_size if the token would not fit. + * + * The *buf pointer will be updated point beyond the end of the + * found token. Note that this occurs even if the token buffer is + * too small to hold it. + */ +static inline size_t copy_token(const char **buf, + char *token, + size_t token_size) +{ + size_t len; + + len = next_token(buf); + if (len < token_size) { + memcpy(token, *buf, len); + *(token + len) = '\0'; + } + *buf += len; + + return len; +} + /* * This fills in the pool_name, obj, obj_len, snap_name, obj_len, * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based @@ -2229,25 +2276,48 @@ static void rbd_id_put(struct rbd_device *rbd_dev) static int rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, char *mon_addrs, - char *options) + size_t mon_addrs_size, + char *options, + size_t options_size) { - if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " - "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" - "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", - mon_addrs, options, rbd_dev->pool_name, - rbd_dev->obj, rbd_dev->snap_name) < 4) + size_t len; + + /* The first four tokens are required */ + + len = copy_token(&buf, mon_addrs, mon_addrs_size); + if (!len || len >= mon_addrs_size) return -EINVAL; - if (rbd_dev->snap_name[0] == 0) - memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, - sizeof (RBD_SNAP_HEAD_NAME)); + len = copy_token(&buf, options, options_size); + if (!len || len >= options_size) + return -EINVAL; + + len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name)); + if (!len || len >= sizeof (rbd_dev->pool_name)) + return -EINVAL; + + len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj)); + if (!len || len >= sizeof (rbd_dev->obj)) + return -EINVAL; + + /* We have the object length in hand, save it. */ + + rbd_dev->obj_len = len; - rbd_dev->obj_len = strlen(rbd_dev->obj); snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", rbd_dev->obj, RBD_SUFFIX); + /* + * The snapshot name is optional, but it's an error if it's + * too long. If no snapshot is supplied, fill in the default. + */ + len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name)); + if (!len) + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME)); + else if (len >= sizeof (rbd_dev->snap_name)) + return -EINVAL; + return 0; } @@ -2288,7 +2358,8 @@ static ssize_t rbd_add(struct bus_type *bus, snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id); /* parse add command */ - rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, options); + rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, count, + options, count); if (rc) goto err_put_id; From 81a897937827a81107861d50c77b4d04ff8b43a2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 39/65] rbd: do a few checks at build time This is a bit gratuitous, but there are a few things that can be verified at build time rather than run time, so do that. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 085df6765d21..14d0a3c9f96a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -53,7 +53,14 @@ #define RBD_SNAP_HEAD_NAME "-" +/* + * An RBD device name will be "rbd#", where the "rbd" comes from + * RBD_DRV_NAME above, and # is a unique integer identifier. + * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big + * enough to hold all possible device names. + */ #define DEV_NAME_LEN 32 +#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 @@ -2304,8 +2311,9 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, rbd_dev->obj_len = len; - snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", - rbd_dev->obj, RBD_SUFFIX); + BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN + < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX)); + sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX); /* * The snapshot name is optional, but it's an error if it's @@ -2355,7 +2363,9 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_id_get(rbd_dev); /* Fill in the device name, now that we have its id. */ - snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id); + BUILD_BUG_ON(DEV_NAME_LEN + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); /* parse add command */ rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, count, From 5214ecc45cf9b9f1365b189bcb63e441e3865334 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 40/65] rbd: have rbd_parse_args() report found mon_addrs size The argument parsing routine already computes the size of the mon_addrs buffer it extracts from the "command." Pass it to the caller so it can use it to provide the length to rbd_get_client(). Signed-off-by: Alex Elder --- drivers/block/rbd.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 14d0a3c9f96a..2f2d194ca3d4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -391,7 +391,9 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static struct rbd_client *rbd_get_client(const char *mon_addr, char *options) +static struct rbd_client *rbd_get_client(const char *mon_addr, + size_t mon_addr_len, + char *options) { struct rbd_client *rbdc; struct ceph_options *opt; @@ -404,7 +406,7 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, char *options) rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; opt = ceph_parse_options(options, mon_addr, - mon_addr + strlen(mon_addr), + mon_addr + mon_addr_len, parse_rbd_opts_token, rbd_opts); if (IS_ERR(opt)) { kfree(rbd_opts); @@ -2283,7 +2285,7 @@ static inline size_t copy_token(const char **buf, static int rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, char *mon_addrs, - size_t mon_addrs_size, + size_t *mon_addrs_size, char *options, size_t options_size) { @@ -2291,9 +2293,10 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, /* The first four tokens are required */ - len = copy_token(&buf, mon_addrs, mon_addrs_size); - if (!len || len >= mon_addrs_size) + len = copy_token(&buf, mon_addrs, *mon_addrs_size); + if (!len || len >= *mon_addrs_size) return -EINVAL; + *mon_addrs_size = len + 1; len = copy_token(&buf, options, options_size); if (!len || len >= options_size) @@ -2335,6 +2338,7 @@ static ssize_t rbd_add(struct bus_type *bus, { struct rbd_device *rbd_dev; char *mon_addrs = NULL; + size_t mon_addrs_size; char *options = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -2368,12 +2372,14 @@ static ssize_t rbd_add(struct bus_type *bus, sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); /* parse add command */ - rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, count, + mon_addrs_size = count; + rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, &mon_addrs_size, options, count); if (rc) goto err_put_id; - rbd_dev->rbd_client = rbd_get_client(mon_addrs, options); + rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1, + options); if (IS_ERR(rbd_dev->rbd_client)) { rc = PTR_ERR(rbd_dev->rbd_client); goto err_put_id; From 7ef3214af220515b8fe223ec92ec017d2e5607a7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Feb 2012 08:13:30 -0600 Subject: [PATCH 41/65] rbd: don't allocate mon_addrs buffer in rbd_add() The mon_addrs buffer in rbd_add is used to hold a copy of the monitor IP addresses supplied via /sys/bus/rbd/add. That is passed to rbd_get_client(), which never modifies it (nor do any of the functions it gets passed to thereafter)--the mon_addr parameter to rbd_get_client() is a pointer to constant data, so it can't be modifed. Furthermore, rbd_get_client() has the length of the mon_addrs buffer and that is used to ensure nothing goes beyond its end. Based on all this, there is no reason that a buffer needs to be used to hold a copy of the mon_addrs provided via /sys/bus/rbd/add. Instead, the location within that passed-in buffer can be provided, along with the length of the "token" therein which represents the monitor IP's. A small change to rbd_add_parse_args() allows the address within the buffer to be passed back, and the length is already returned. This now means that, at least from the perspective of this interface, there is no such thing as a list of monitor addresses that is too long. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2f2d194ca3d4..641f09898e19 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2284,7 +2284,7 @@ static inline size_t copy_token(const char **buf, */ static int rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, - char *mon_addrs, + const char **mon_addrs, size_t *mon_addrs_size, char *options, size_t options_size) @@ -2293,10 +2293,13 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, /* The first four tokens are required */ - len = copy_token(&buf, mon_addrs, *mon_addrs_size); - if (!len || len >= *mon_addrs_size) + len = next_token(&buf); + if (!len) return -EINVAL; *mon_addrs_size = len + 1; + *mon_addrs = buf; + + buf += len; len = copy_token(&buf, options, options_size); if (!len || len >= options_size) @@ -2337,8 +2340,8 @@ static ssize_t rbd_add(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev; - char *mon_addrs = NULL; - size_t mon_addrs_size; + const char *mon_addrs = NULL; + size_t mon_addrs_size = 0; char *options = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -2349,9 +2352,6 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) goto err_nomem; - mon_addrs = kmalloc(count, GFP_KERNEL); - if (!mon_addrs) - goto err_nomem; options = kmalloc(count, GFP_KERNEL); if (!options) goto err_nomem; @@ -2372,8 +2372,7 @@ static ssize_t rbd_add(struct bus_type *bus, sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); /* parse add command */ - mon_addrs_size = count; - rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, &mon_addrs_size, + rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, options, count); if (rc) goto err_put_id; @@ -2420,7 +2419,6 @@ err_out_bus: rbd_bus_del_dev(rbd_dev); kfree(options); - kfree(mon_addrs); return rc; err_out_blkdev: @@ -2431,7 +2429,6 @@ err_put_id: rbd_id_put(rbd_dev); err_nomem: kfree(options); - kfree(mon_addrs); kfree(rbd_dev); dout("Error adding device %s\n", buf); From fed4c143ba8f08c8bddfdc7c69738e691a06d565 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 7 Feb 2012 12:03:36 -0600 Subject: [PATCH 42/65] rbd: fix module sysfs setup/teardown code Once rbd_bus_type is registered, it allows an "add" operation via the /sys/bus/rbd/add bus attribute, and adding a new rbd device that way establishes a connection between the device and rbd_root_dev. But rbd_root_dev is not registered until after the rbd_bus_type registration is complete. This could (in principle anyway) result in an invalid state. Since rbd_root_dev has no tie to rbd_bus_type we can reorder these two initializations and never be faced with this scenario. In addition, unregister the device in the event the bus registration fails at module init time. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 641f09898e19..b0f6812f8f99 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2564,19 +2564,21 @@ static int rbd_sysfs_init(void) { int ret; - ret = bus_register(&rbd_bus_type); + ret = device_register(&rbd_root_dev); if (ret < 0) return ret; - ret = device_register(&rbd_root_dev); + ret = bus_register(&rbd_bus_type); + if (ret < 0) + device_unregister(&rbd_root_dev); return ret; } static void rbd_sysfs_cleanup(void) { - device_unregister(&rbd_root_dev); bus_unregister(&rbd_bus_type); + device_unregister(&rbd_root_dev); } int __init rbd_init(void) From 00f1f36ffa29a6b8e0529770bce2a44585ab3af0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 7 Feb 2012 12:03:36 -0600 Subject: [PATCH 43/65] rbd: do some refactoring A few blocks of code are rearranged a bit here: - In rbd_header_from_disk(): - Don't bother computing snap_count until we're sure the on-disk header starts with a good signature. - Move a few independent lines of code so they are *after* a check for a failed memory allocation. - Get rid of unnecessary local variable "ret". - Make a few other changes in rbd_read_header(), similar to the above--just moving things around a bit while preserving the functionality. - In rbd_rq_fn(), just assign rq in the while loop's controlling expression rather than duplicating it before and at the end of the loop body. This allows the use of "continue" rather than "goto next" in a number of spots. - Rearrange the logic in snap_by_name(). End result is the same. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 82 ++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b0f6812f8f99..a42b28e7f3fa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -486,19 +486,20 @@ static int rbd_header_from_disk(struct rbd_image_header *header, gfp_t gfp_flags) { int i; - u32 snap_count = le32_to_cpu(ondisk->snap_count); - int ret = -ENOMEM; + u32 snap_count; if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) return -ENXIO; - init_rwsem(&header->snap_rwsem); - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + snap_count = le32_to_cpu(ondisk->snap_count); header->snapc = kmalloc(sizeof(struct ceph_snap_context) + snap_count * sizeof (*ondisk), gfp_flags); if (!header->snapc) return -ENOMEM; + + init_rwsem(&header->snap_rwsem); + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); if (snap_count) { header->snap_names = kmalloc(header->snap_names_len, GFP_KERNEL); @@ -544,7 +545,7 @@ err_names: kfree(header->snap_names); err_snapc: kfree(header->snapc); - return ret; + return -ENOMEM; } static int snap_index(struct rbd_image_header *header, int snap_num) @@ -568,19 +569,20 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, int i; char *p = header->snap_names; - for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { - if (strcmp(snap_name, p) == 0) - break; + for (i = 0; i < header->total_snaps; i++) { + if (!strcmp(snap_name, p)) { + + /* Found it. Pass back its id and/or size */ + + if (seq) + *seq = header->snapc->snaps[i]; + if (size) + *size = header->snap_sizes[i]; + return i; + } + p += strlen(p) + 1; /* Skip ahead to the next name */ } - if (i == header->total_snaps) - return -ENOENT; - if (seq) - *seq = header->snapc->snaps[i]; - - if (size) - *size = header->snap_sizes[i]; - - return i; + return -ENOENT; } static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) @@ -1444,9 +1446,7 @@ static void rbd_rq_fn(struct request_queue *q) struct request *rq; struct bio_pair *bp = NULL; - rq = blk_fetch_request(q); - - while (1) { + while ((rq = blk_fetch_request(q))) { struct bio *bio; struct bio *rq_bio, *next_bio = NULL; bool do_write; @@ -1464,7 +1464,7 @@ static void rbd_rq_fn(struct request_queue *q) /* filter out block requests we don't understand */ if ((rq->cmd_type != REQ_TYPE_FS)) { __blk_end_request_all(rq, 0); - goto next; + continue; } /* deduce our operation (read, write) */ @@ -1475,7 +1475,7 @@ static void rbd_rq_fn(struct request_queue *q) rq_bio = rq->bio; if (do_write && rbd_dev->read_only) { __blk_end_request_all(rq, -EROFS); - goto next; + continue; } spin_unlock_irq(q->queue_lock); @@ -1489,7 +1489,7 @@ static void rbd_rq_fn(struct request_queue *q) if (!coll) { spin_lock_irq(q->queue_lock); __blk_end_request_all(rq, -ENOMEM); - goto next; + continue; } do { @@ -1535,8 +1535,6 @@ next_seg: if (bp) bio_pair_release(bp); spin_lock_irq(q->queue_lock); -next: - rq = blk_fetch_request(q); } } @@ -1588,15 +1586,16 @@ static int rbd_read_header(struct rbd_device *rbd_dev, ssize_t rc; struct rbd_image_header_ondisk *dh; int snap_count = 0; - u64 snap_names_len = 0; u64 ver; + size_t len; + /* + * First reads the fixed-size header to determine the number + * of snapshots, then re-reads it, along with all snapshot + * records as well as their stored names. + */ + len = sizeof (*dh); while (1) { - int len = sizeof(*dh) + - snap_count * sizeof(struct rbd_image_snap_ondisk) + - snap_names_len; - - rc = -ENOMEM; dh = kmalloc(len, GFP_KERNEL); if (!dh) return -ENOMEM; @@ -1611,21 +1610,22 @@ static int rbd_read_header(struct rbd_device *rbd_dev, rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); if (rc < 0) { - if (rc == -ENXIO) { + if (rc == -ENXIO) pr_warning("unrecognized header format" " for image %s", rbd_dev->obj); - } goto out_dh; } - if (snap_count != header->total_snaps) { - snap_count = header->total_snaps; - snap_names_len = header->snap_names_len; - rbd_header_free(header); - kfree(dh); - continue; - } - break; + if (snap_count == header->total_snaps) + break; + + snap_count = header->total_snaps; + len = sizeof (*dh) + + snap_count * sizeof(struct rbd_image_snap_ondisk) + + header->snap_names_len; + + rbd_header_free(header); + kfree(dh); } header->obj_version = ver; From 593a9e7b34fa62d703b473ae923a9681556cdf74 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 7 Feb 2012 12:03:37 -0600 Subject: [PATCH 44/65] rbd: small changes Here is another set of small code tidy-ups: - Define SECTOR_SHIFT and SECTOR_SIZE, and use these symbolic names throughout. Tell the blk_queue system our physical block size, in the (unlikely) event we want to use something other than the default. - Delete the definition of struct rbd_info, which is never used. - Move the definition of dev_to_rbd() down in its source file, just above where it gets first used, and change its name to dev_to_rbd_dev(). - Replace an open-coded operation in rbd_dev_release() to use dev_to_rbd_dev() instead. - Calculate the segment size for a given rbd_device just once in rbd_init_disk(). - Use the '%zd' conversion specifier in rbd_snap_size_show(), since the value formatted is a size_t. - Switch to the '%llu' conversion specifier in rbd_snap_id_show(). since the value formatted is unsigned. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 85 ++++++++++++++++++++++++--------------- drivers/block/rbd_types.h | 4 -- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a42b28e7f3fa..568fa5b1206b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,6 +41,15 @@ #include "rbd_types.h" +/* + * The basic unit of block I/O is a sector. It is interpreted in a + * number of contexts in Linux (blk, bio, genhd), but the default is + * universally 512 bytes. These symbols are just slightly more + * meaningful than the bare numbers they represent. + */ +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) + #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -221,11 +230,6 @@ static struct device rbd_root_dev = { }; -static struct rbd_device *dev_to_rbd(struct device *dev) -{ - return container_of(dev, struct rbd_device, dev); -} - static struct device *rbd_get_dev(struct rbd_device *rbd_dev) { return get_device(&rbd_dev->dev); @@ -743,7 +747,7 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, /* split the bio. We'll release it either in the next call, or it will have to be released outside */ - bp = bio_split(old_chain, (len - total) / 512ULL); + bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); if (!bp) goto err_out; @@ -1471,7 +1475,7 @@ static void rbd_rq_fn(struct request_queue *q) do_write = (rq_data_dir(rq) == WRITE); size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * 512ULL; + ofs = blk_rq_pos(rq) * SECTOR_SIZE; rq_bio = rq->bio; if (do_write && rbd_dev->read_only) { __blk_end_request_all(rq, -EROFS); @@ -1482,7 +1486,7 @@ static void rbd_rq_fn(struct request_queue *q) dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", - size, blk_rq_pos(rq) * 512ULL); + size, blk_rq_pos(rq) * SECTOR_SIZE); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); coll = rbd_alloc_coll(num_segs); @@ -1547,13 +1551,17 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct rbd_device *rbd_dev = q->queuedata; - unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9); - sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); - unsigned int bio_sectors = bmd->bi_size >> 9; + unsigned int chunk_sectors; + sector_t sector; + unsigned int bio_sectors; int max; + chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); + sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); + bio_sectors = bmd->bi_size >> SECTOR_SHIFT; + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << 9; + + bio_sectors)) << SECTOR_SHIFT; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ if (max <= bvec->bv_len && bio_sectors == 0) @@ -1708,7 +1716,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) return ret; /* resized? */ - set_capacity(rbd_dev->disk, h.image_size / 512ULL); + set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE); down_write(&rbd_dev->header.snap_rwsem); @@ -1745,6 +1753,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct gendisk *disk; struct request_queue *q; int rc; + u64 segment_size; u64 total_size = 0; /* contact OSD, request size info about the object being mapped */ @@ -1780,11 +1789,15 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!q) goto out_disk; + /* We use the default size, but let's be explicit about it. */ + blk_queue_physical_block_size(q, SECTOR_SIZE); + /* set io sizes to object size */ - blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL); - blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header)); - blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header)); - blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header)); + segment_size = rbd_obj_bytes(&rbd_dev->header); + blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); + blk_queue_max_segment_size(q, segment_size); + blk_queue_io_min(q, segment_size); + blk_queue_io_opt(q, segment_size); blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; @@ -1795,7 +1808,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->q = q; /* finally, announce the disk to the world */ - set_capacity(disk, total_size / 512ULL); + set_capacity(disk, total_size / SECTOR_SIZE); add_disk(disk); pr_info("%s: added with size 0x%llx\n", @@ -1812,10 +1825,15 @@ out: sysfs */ +static struct rbd_device *dev_to_rbd_dev(struct device *dev) +{ + return container_of(dev, struct rbd_device, dev); +} + static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); } @@ -1823,7 +1841,7 @@ static ssize_t rbd_size_show(struct device *dev, static ssize_t rbd_major_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%d\n", rbd_dev->major); } @@ -1831,7 +1849,7 @@ static ssize_t rbd_major_show(struct device *dev, static ssize_t rbd_client_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->rbd_client->client)); @@ -1840,7 +1858,7 @@ static ssize_t rbd_client_id_show(struct device *dev, static ssize_t rbd_pool_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->pool_name); } @@ -1848,7 +1866,7 @@ static ssize_t rbd_pool_show(struct device *dev, static ssize_t rbd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->obj); } @@ -1857,7 +1875,7 @@ static ssize_t rbd_snap_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->snap_name); } @@ -1867,7 +1885,7 @@ static ssize_t rbd_image_refresh(struct device *dev, const char *buf, size_t size) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int rc; int ret = size; @@ -1932,7 +1950,7 @@ static ssize_t rbd_snap_size_show(struct device *dev, { struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - return sprintf(buf, "%lld\n", (long long)snap->size); + return sprintf(buf, "%zd\n", snap->size); } static ssize_t rbd_snap_id_show(struct device *dev, @@ -1941,7 +1959,7 @@ static ssize_t rbd_snap_id_show(struct device *dev, { struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - return sprintf(buf, "%lld\n", (long long)snap->id); + return sprintf(buf, "%llu\n", (unsigned long long) snap->id); } static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); @@ -2232,7 +2250,8 @@ static void rbd_id_put(struct rbd_device *rbd_dev) /* * Skips over white space at *buf, and updates *buf to point to the * first found non-space character (if any). Returns the length of - * the token (string of non-white space characters) found. + * the token (string of non-white space characters) found. Note + * that *buf must be terminated with '\0'. */ static inline size_t next_token(const char **buf) { @@ -2250,13 +2269,14 @@ static inline size_t next_token(const char **buf) /* * Finds the next token in *buf, and if the provided token buffer is * big enough, copies the found token into it. The result, if - * copied, is guaranteed to be terminated with '\0'. + * copied, is guaranteed to be terminated with '\0'. Note that *buf + * must be terminated with '\0' on entry. * * Returns the length of the token found (not including the '\0'). * Return value will be 0 if no token is found, and it will be >= * token_size if the token would not fit. * - * The *buf pointer will be updated point beyond the end of the + * The *buf pointer will be updated to point beyond the end of the * found token. Note that this occurs even if the token buffer is * too small to hold it. */ @@ -2456,8 +2476,7 @@ static struct rbd_device *__rbd_get_dev(unsigned long id) static void rbd_dev_release(struct device *dev) { - struct rbd_device *rbd_dev = - container_of(dev, struct rbd_device, dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); if (rbd_dev->watch_request) { struct ceph_client *client = rbd_dev->rbd_client->client; @@ -2520,7 +2539,7 @@ static ssize_t rbd_snap_add(struct device *dev, const char *buf, size_t count) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int ret; char *name = kmalloc(count + 1, GFP_KERNEL); if (!name) diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index fc6c678aa2cb..950708688f17 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -41,10 +41,6 @@ #define RBD_HEADER_SIGNATURE "RBD" #define RBD_HEADER_VERSION "001.005" -struct rbd_info { - __le64 max_id; -} __attribute__ ((packed)); - struct rbd_image_snap_ondisk { __le64 id; __le64 image_size; From 32eec68d2f233e8a6ae1cd326022f6862e2b9ce3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 Feb 2012 16:11:14 -0600 Subject: [PATCH 45/65] rbd: don't drop the rbd_id too early Currently an rbd device's id is released when it is removed, but it is done before the code is run to clean up sysfs-related files (such as /sys/bus/rbd/devices/1). It's possible that an rbd is still in use after the rbd_remove() call has been made. It's essentially the same as an active inode that stays around after it has been removed--until its final close operation. This means that the id shows up as free for reuse at a time it should not be. The effect of this was seen by Jens Rehpoehler, who: - had a filesystem mounted on an rbd device - unmapped that filesystem (without unmounting) - found that the mount still worked properly - but hit a panic when he attempted to re-map a new rbd device This re-map attempt found the previously-unmapped id available. The subsequent attempt to reuse it was met with a panic while attempting to (re-)install the sysfs entry for the new mapped device. Fix this by holding off "putting" the rbd id, until the rbd_device release function is called--when the last reference is finally dropped. Note: This fixes: http://tracker.newdream.net/issues/1907 Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 568fa5b1206b..6bbd5af1f029 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2421,7 +2421,12 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_blkdev; - /* set up and announce blkdev mapping */ + /* + * At this point cleanup in the event of an error is the job + * of the sysfs code (initiated by rbd_bus_del_dev()). + * + * Set up and announce blkdev mapping. + */ rc = rbd_init_disk(rbd_dev); if (rc) goto err_out_bus; @@ -2433,8 +2438,6 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_bus: - rbd_id_put(rbd_dev); - /* this will also clean up rest of rbd_dev stuff */ rbd_bus_del_dev(rbd_dev); @@ -2492,6 +2495,9 @@ static void rbd_dev_release(struct device *dev) /* clean up and free blkdev */ rbd_free_disk(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); + + /* done with the id, and with the rbd_dev */ + rbd_id_put(rbd_dev); kfree(rbd_dev); /* release module ref */ @@ -2524,8 +2530,6 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - rbd_id_put(rbd_dev); - __rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); From 963be4d7709f84d865f76d12d5b0ec7edad1c498 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: [PATCH 46/65] libceph: move prepare_write_banner() One of the arguments to prepare_write_connect() indicates whether it is being called immediately after a call to prepare_write_banner(). Move the prepare_write_banner() call inside prepare_write_connect(), and reinterpret (and rename) the "after_banner" argument so it indicates that prepare_write_connect() should *make* the call rather than should know it has already been made. This was split out from the next patch to highlight this change in logic. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3917847ad8e5..0665945b6468 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -676,7 +676,7 @@ static void prepare_write_banner(struct ceph_messenger *msgr, static int prepare_write_connect(struct ceph_messenger *msgr, struct ceph_connection *con, - int after_banner) + int include_banner) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -705,7 +705,9 @@ static int prepare_write_connect(struct ceph_messenger *msgr, con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - if (!after_banner) { + if (include_banner) + prepare_write_banner(msgr, con); + else { con->out_kvec_left = 0; con->out_kvec_bytes = 0; } @@ -1846,7 +1848,6 @@ more: /* open the socket first? */ if (con->sock == NULL) { - prepare_write_banner(msgr, con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); set_bit(CONNECTING, &con->state); From 859eb7994876f26fd9f52d9589fbcab8e2fe8069 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: [PATCH 47/65] libceph: encapsulate connection kvec operations Encapsulate the operation of adding a new chunk of data to the next open slot in a ceph_connection's out_kvec array. Also add a "reset" operation to make subsequent add operations start at the beginning of the array again. Use these routines throughout, avoiding duplicate code and ensuring all calls are handled consistently. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 117 +++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0665945b6468..04d2b975ab0c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -469,14 +469,35 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } +static void ceph_con_out_kvec_reset(struct ceph_connection *con) +{ + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + con->out_kvec_cur = &con->out_kvec[0]; +} + +static void ceph_con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index; + + index = con->out_kvec_left; + BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + + con->out_kvec[index].iov_len = size; + con->out_kvec[index].iov_base = data; + con->out_kvec_left++; + con->out_kvec_bytes += size; +} /* * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. */ -static void prepare_write_message_footer(struct ceph_connection *con, int v) +static void prepare_write_message_footer(struct ceph_connection *con) { struct ceph_msg *m = con->out_msg; + int v = con->out_kvec_left; dout("prepare_write_message_footer %p\n", con); con->out_kvec_is_msg = true; @@ -494,9 +515,8 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v) static void prepare_write_message(struct ceph_connection *con) { struct ceph_msg *m; - int v = 0; - con->out_kvec_bytes = 0; + ceph_con_out_kvec_reset(con); con->out_kvec_is_msg = true; con->out_msg_done = false; @@ -504,16 +524,13 @@ static void prepare_write_message(struct ceph_connection *con) * TCP packet that's a good thing. */ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; - con->out_kvec[v].iov_base = &tag_ack; - con->out_kvec[v++].iov_len = 1; + ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[v].iov_base = &con->out_temp_ack; - con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); } - m = list_first_entry(&con->out_queue, - struct ceph_msg, list_head); + m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); con->out_msg = m; /* put message on sent list */ @@ -537,17 +554,13 @@ static void prepare_write_message(struct ceph_connection *con) BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ - con->out_kvec[v].iov_base = &tag_msg; - con->out_kvec[v++].iov_len = 1; - con->out_kvec[v].iov_base = &m->hdr; - con->out_kvec[v++].iov_len = sizeof(m->hdr); - con->out_kvec[v++] = m->front; + ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + if (m->middle) - con->out_kvec[v++] = m->middle->vec; - con->out_kvec_left = v; - con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + - (m->middle ? m->middle->vec.iov_len : 0); - con->out_kvec_cur = con->out_kvec; + ceph_con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ con->out_msg->hdr.crc = @@ -580,7 +593,7 @@ static void prepare_write_message(struct ceph_connection *con) con->out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ - prepare_write_message_footer(con, v); + prepare_write_message_footer(con); } set_bit(WRITE_PENDING, &con->state); @@ -595,14 +608,14 @@ static void prepare_write_ack(struct ceph_connection *con) con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; - con->out_kvec[0].iov_base = &tag_ack; - con->out_kvec[0].iov_len = 1; + ceph_con_out_kvec_reset(con); + + ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[1].iov_base = &con->out_temp_ack; - con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_left = 2; - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); - con->out_kvec_cur = con->out_kvec; + ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + con->out_more = 1; /* more will follow.. eventually.. */ set_bit(WRITE_PENDING, &con->state); } @@ -613,11 +626,8 @@ static void prepare_write_ack(struct ceph_connection *con) static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); - con->out_kvec[0].iov_base = &tag_keepalive; - con->out_kvec[0].iov_len = 1; - con->out_kvec_left = 1; - con->out_kvec_bytes = 1; - con->out_kvec_cur = con->out_kvec; + ceph_con_out_kvec_reset(con); + ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); set_bit(WRITE_PENDING, &con->state); } @@ -646,12 +656,9 @@ static int prepare_connect_authorizer(struct ceph_connection *con) con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); - if (auth_len) { - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; - } + if (auth_len) + ceph_con_out_kvec_add(con, auth_len, auth_buf); + return 0; } @@ -661,15 +668,11 @@ static int prepare_connect_authorizer(struct ceph_connection *con) static void prepare_write_banner(struct ceph_messenger *msgr, struct ceph_connection *con) { - int len = strlen(CEPH_BANNER); + ceph_con_out_kvec_reset(con); + ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr), + &msgr->my_enc_addr); - con->out_kvec[0].iov_base = CEPH_BANNER; - con->out_kvec[0].iov_len = len; - con->out_kvec[1].iov_base = &msgr->my_enc_addr; - con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); - con->out_kvec_left = 2; - con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); - con->out_kvec_cur = con->out_kvec; con->out_more = 0; set_bit(WRITE_PENDING, &con->state); } @@ -707,22 +710,16 @@ static int prepare_write_connect(struct ceph_messenger *msgr, if (include_banner) prepare_write_banner(msgr, con); - else { - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - } - con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; - con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); - con->out_kvec_left++; - con->out_kvec_bytes += sizeof(con->out_connect); - con->out_kvec_cur = con->out_kvec; + else + ceph_con_out_kvec_reset(con); + ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); + con->out_more = 0; set_bit(WRITE_PENDING, &con->state); return prepare_connect_authorizer(con); } - /* * write as much of pending kvecs to the socket as we can. * 1 -> done @@ -919,10 +916,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) /* prepare and queue up footer, too */ if (!crc) con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con->out_kvec_bytes = 0; - con->out_kvec_left = 0; - con->out_kvec_cur = con->out_kvec; - prepare_write_message_footer(con, 0); + ceph_con_out_kvec_reset(con); + prepare_write_message_footer(con); ret = 1; out: return ret; From e0f43c9419c1900e5b50de4261e9686a45a0a2b8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: [PATCH 48/65] libceph: make ceph_msgr_wq private The messenger workqueue has no need to be public. So give it static scope. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 2 -- net/ceph/messenger.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6b5af5f976d1..5ca0f8244203 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -14,8 +14,6 @@ struct ceph_msg; struct ceph_connection; -extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ - /* * Ceph defines these callbacks for handling connection events. */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 04d2b975ab0c..31f59ac03d8a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -97,7 +97,7 @@ static void encode_my_addr(struct ceph_messenger *msgr) /* * work queue for all reading and writing to/from the socket. */ -struct workqueue_struct *ceph_msgr_wq; +static struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { From 6173d1f02fb19c0fba02857ae4e1109b5ec95034 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: [PATCH 49/65] libceph: encapsulate some messenger cleanup code Define a helper function to perform various cleanup operations. Use it both in the exit routine and in the init routine in the event of an error. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 31f59ac03d8a..c3023a600ad2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -99,6 +99,20 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ static struct workqueue_struct *ceph_msgr_wq; +void _ceph_msgr_exit(void) +{ + if (ceph_msgr_wq) + destroy_workqueue(ceph_msgr_wq); + + BUG_ON(zero_page_address == NULL); + zero_page_address = NULL; + + BUG_ON(zero_page == NULL); + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; +} + int ceph_msgr_init(void) { BUG_ON(zero_page != NULL); @@ -109,33 +123,21 @@ int ceph_msgr_init(void) zero_page_address = kmap(zero_page); ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); - if (!ceph_msgr_wq) { - pr_err("msgr_init failed to create workqueue\n"); + if (ceph_msgr_wq) + return 0; - zero_page_address = NULL; - kunmap(zero_page); - page_cache_release(zero_page); - zero_page = NULL; + pr_err("msgr_init failed to create workqueue\n"); + _ceph_msgr_exit(); - return -ENOMEM; - } - - return 0; + return -ENOMEM; } EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { BUG_ON(ceph_msgr_wq == NULL); - destroy_workqueue(ceph_msgr_wq); - BUG_ON(zero_page_address == NULL); - zero_page_address = NULL; - - BUG_ON(zero_page == NULL); - kunmap(zero_page); - page_cache_release(zero_page); - zero_page = NULL; + _ceph_msgr_exit(); } EXPORT_SYMBOL(ceph_msgr_exit); From 41617d0c9c9832e030667277ddf6b4ffb4ecdc90 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: [PATCH 50/65] libceph: make ceph_tcp_connect() return int There is no real need for ceph_tcp_connect() to return the socket pointer it creates, since it already assigns it to con->sock, which is visible to the caller. Instead, have it return an error code, which tidies things up a bit. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c3023a600ad2..e1e53bb2d0cf 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -240,7 +240,7 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static struct socket *ceph_tcp_connect(struct ceph_connection *con) +static int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; @@ -250,7 +250,7 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) - return ERR_PTR(ret); + return ret; sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP @@ -273,11 +273,11 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) sock_release(sock); con->error_msg = "connect error"; - return ERR_PTR(ret); + return ret; } con->sock = sock; - return sock; + return 0; } static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) @@ -1854,11 +1854,9 @@ more: con->in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %lu\n", con, con->state); - con->sock = ceph_tcp_connect(con); - if (IS_ERR(con->sock)) { - con->sock = NULL; + ret = ceph_tcp_connect(con); + if (ret < 0) { con->error_msg = "connect error"; - ret = -1; goto out; } } From d3002b974cefbb7c1e325cc296966f768ff76b06 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: [PATCH 51/65] libceph: a few small changes This gathers a number of very minor changes: - use %hu when formatting the a socket address's address family - null out the ceph_msgr_wq pointer after the queue has been destroyed - drop a needless cast in ceph_write_space() - add a WARN() call in ceph_state_change() in the event an unrecognized socket state is encountered - rearrange the logic in ceph_con_get() and ceph_con_put() so that: - the reference counts are only atomically read once - the values displayed via dout() calls are known to be meaningful at the time they are formatted Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e1e53bb2d0cf..44d8c77cabdd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -80,8 +80,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) break; default: - snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", - (int)ss->ss_family); + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", + ss->ss_family); } return s; @@ -101,8 +101,10 @@ static struct workqueue_struct *ceph_msgr_wq; void _ceph_msgr_exit(void) { - if (ceph_msgr_wq) + if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); + ceph_msgr_wq = NULL; + } BUG_ON(zero_page_address == NULL); zero_page_address = NULL; @@ -167,8 +169,7 @@ static void ceph_data_ready(struct sock *sk, int count_unused) /* socket has buffer space for writing */ static void ceph_write_space(struct sock *sk) { - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; + struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept @@ -216,6 +217,8 @@ static void ceph_state_change(struct sock *sk) dout("ceph_state_change TCP_ESTABLISHED\n"); queue_con(con); break; + default: /* Everything else is uninteresting */ + break; } } @@ -420,22 +423,23 @@ bool ceph_con_opened(struct ceph_connection *con) */ struct ceph_connection *ceph_con_get(struct ceph_connection *con) { - dout("con_get %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) + 1); - if (atomic_inc_not_zero(&con->nref)) - return con; - return NULL; + int nref = __atomic_add_unless(&con->nref, 1, 0); + + dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1); + + return nref ? con : NULL; } void ceph_con_put(struct ceph_connection *con) { - dout("con_put %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) - 1); - BUG_ON(atomic_read(&con->nref) == 0); - if (atomic_dec_and_test(&con->nref)) { + int nref = atomic_dec_return(&con->nref); + + BUG_ON(nref < 0); + if (nref == 0) { BUG_ON(con->sock); kfree(con); } + dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref); } /* From cffaba15cd95d4a16eb5a6aa5c22a79f67d555ab Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: [PATCH 52/65] ceph: ensure Boolean options support both senses Many ceph-related Boolean options offer the ability to both enable and disable a feature. For all those that don't offer this, add a new option so that they do. Note that ceph_show_options()--which reports mount options currently in effect--only reports the option if it is different from the default value. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/super.c | 10 ++++++++++ net/ceph/ceph_common.c | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4fab1fdcfa6a..3663cf0cb073 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -130,10 +130,12 @@ enum { Opt_nodirstat, Opt_rbytes, Opt_norbytes, + Opt_asyncreaddir, Opt_noasyncreaddir, Opt_dcache, Opt_nodcache, Opt_ino32, + Opt_noino32, }; static match_table_t fsopt_tokens = { @@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = { {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, + {Opt_asyncreaddir, "asyncreaddir"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {Opt_dcache, "dcache"}, {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, {-1, NULL} }; @@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_norbytes: fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; @@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; default: BUG_ON(token); } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 621c3221b393..cc913193d992 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -201,7 +201,9 @@ enum { Opt_ip, Opt_last_string, /* string args above */ + Opt_share, Opt_noshare, + Opt_crc, Opt_nocrc, }; @@ -217,7 +219,9 @@ static match_table_t opt_tokens = { {Opt_key, "key=%s"}, {Opt_ip, "ip=%s"}, /* string args above */ + {Opt_share, "share"}, {Opt_noshare, "noshare"}, + {Opt_crc, "crc"}, {Opt_nocrc, "nocrc"}, {-1, NULL} }; @@ -399,10 +403,16 @@ ceph_parse_options(char *options, const char *dev_name, opt->mount_timeout = intval; break; + case Opt_share: + opt->flags &= ~CEPH_OPT_NOSHARE; + break; case Opt_noshare: opt->flags |= CEPH_OPT_NOSHARE; break; + case Opt_crc: + opt->flags &= ~CEPH_OPT_NOCRC; + break; case Opt_nocrc: opt->flags |= CEPH_OPT_NOCRC; break; From bca064d236a2e3162a07c758855221bcbe3c475b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: [PATCH 53/65] libceph: use "do" in CRC-related Boolean variables Change the name (and type) of a few CRC-related Boolean local variables so they contain the word "do", to distingish their purpose from variables used for holding an actual CRC value. Note that in the process of doing this I identified a fairly serious logic error in write_partial_msg_pages(): the value of "do_crc" assigned appears to be the opposite of what it should be. No attempt to fix this is made here; this change preserves the erroneous behavior. The problem I found is documented here: http://tracker.newdream.net/issues/2064 Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 40 +++++++++++++++++----------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 5ca0f8244203..3bff047f6b0f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -98,7 +98,7 @@ struct ceph_msg { struct ceph_msg_pos { int page, page_pos; /* which page; offset in page */ int data_pos; /* offset in data payload */ - int did_page_crc; /* true if we've calculated crc for current page */ + bool did_page_crc; /* true if we've calculated crc for current page */ }; /* ceph connection fault delay defaults, for exponential backoff */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 44d8c77cabdd..204e229e6628 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -595,7 +595,7 @@ static void prepare_write_message(struct ceph_connection *con) else con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = 0; + con->out_msg_pos.did_page_crc = false; con->out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ @@ -805,7 +805,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct ceph_msg *msg = con->out_msg; unsigned data_len = le32_to_cpu(msg->hdr.data_len); size_t len; - int crc = con->msgr->nocrc; + bool do_crc = con->msgr->nocrc; int ret; int total_max_write; int in_trail = 0; @@ -843,17 +843,17 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = list_first_entry(&msg->trail->head, struct page, lru); - if (crc) + if (do_crc) kaddr = kmap(page); max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; - if (crc) + if (do_crc) kaddr = kmap(page); } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); - if (crc) + if (do_crc) kaddr = kmap(page); #ifdef CONFIG_BLOCK } else if (msg->bio) { @@ -862,26 +862,26 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; page_shift = bv->bv_offset; - if (crc) + if (do_crc) kaddr = kmap(page) + page_shift; max_write = bv->bv_len; #endif } else { page = zero_page; - if (crc) + if (do_crc) kaddr = zero_page_address; } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); - if (crc && !con->out_msg_pos.did_page_crc) { + if (do_crc && !con->out_msg_pos.did_page_crc) { void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); BUG_ON(kaddr == NULL); con->out_msg->footer.data_crc = cpu_to_le32(crc32c(tmpcrc, base, len)); - con->out_msg_pos.did_page_crc = 1; + con->out_msg_pos.did_page_crc = true; } ret = kernel_sendpage(con->sock, page, con->out_msg_pos.page_pos + page_shift, @@ -889,7 +889,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (crc && + if (do_crc && (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); @@ -903,7 +903,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (ret == len) { con->out_msg_pos.page_pos = 0; con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = 0; + con->out_msg_pos.did_page_crc = false; if (in_trail) list_move_tail(&page->lru, &msg->trail->head); @@ -920,7 +920,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ - if (!crc) + if (!do_crc) con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; ceph_con_out_kvec_reset(con); prepare_write_message_footer(con); @@ -1557,7 +1557,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, - unsigned data_len, int datacrc) + unsigned data_len, bool do_datacrc) { void *p; int ret; @@ -1570,7 +1570,7 @@ static int read_partial_message_pages(struct ceph_connection *con, p = kmap(pages[con->in_msg_pos.page]); ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); - if (ret > 0 && datacrc) + if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, p + con->in_msg_pos.page_pos, ret); @@ -1590,7 +1590,7 @@ static int read_partial_message_pages(struct ceph_connection *con, #ifdef CONFIG_BLOCK static int read_partial_message_bio(struct ceph_connection *con, struct bio **bio_iter, int *bio_seg, - unsigned data_len, int datacrc) + unsigned data_len, bool do_datacrc) { struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); void *p; @@ -1606,7 +1606,7 @@ static int read_partial_message_bio(struct ceph_connection *con, ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); - if (ret > 0 && datacrc) + if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, p + con->in_msg_pos.page_pos, ret); @@ -1633,7 +1633,7 @@ static int read_partial_message(struct ceph_connection *con) int ret; int to, left; unsigned front_len, middle_len, data_len; - int datacrc = con->msgr->nocrc; + bool do_datacrc = con->msgr->nocrc; int skip; u64 seq; @@ -1744,7 +1744,7 @@ static int read_partial_message(struct ceph_connection *con) while (con->in_msg_pos.data_pos < data_len) { if (m->pages) { ret = read_partial_message_pages(con, m->pages, - data_len, datacrc); + data_len, do_datacrc); if (ret <= 0) return ret; #ifdef CONFIG_BLOCK @@ -1752,7 +1752,7 @@ static int read_partial_message(struct ceph_connection *con) ret = read_partial_message_bio(con, &m->bio_iter, &m->bio_seg, - data_len, datacrc); + data_len, do_datacrc); if (ret <= 0) return ret; #endif @@ -1787,7 +1787,7 @@ static int read_partial_message(struct ceph_connection *con) m, con->in_middle_crc, m->footer.middle_crc); return -EBADMSG; } - if (datacrc && + if (do_datacrc && (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { pr_err("read_partial_message %p data crc %u != exp. %u\n", m, From a9a0c51af4e7c825c014b40694571456a75ebbc4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: [PATCH 54/65] libceph: separate CRC calculation from byte swapping Calculate CRC in a separate step from rearranging the byte order of the result, to improve clarity and readability. Use offsetof() to determine the number of bytes to include in the CRC calculation. In read_partial_message(), switch which value gets byte-swapped, since the just-computed CRC is already likely to be in a register. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 204e229e6628..7ec6a228b667 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -521,6 +521,7 @@ static void prepare_write_message_footer(struct ceph_connection *con) static void prepare_write_message(struct ceph_connection *con) { struct ceph_msg *m; + u32 crc; ceph_con_out_kvec_reset(con); con->out_kvec_is_msg = true; @@ -569,17 +570,17 @@ static void prepare_write_message(struct ceph_connection *con) m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ - con->out_msg->hdr.crc = - cpu_to_le32(crc32c(0, &m->hdr, - sizeof(m->hdr) - sizeof(m->hdr.crc))); + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; - con->out_msg->footer.front_crc = - cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); - if (m->middle) - con->out_msg->footer.middle_crc = - cpu_to_le32(crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len)); - else + + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else con->out_msg->footer.middle_crc = 0; con->out_msg->footer.data_crc = 0; dout("prepare_write_message front_crc %u data_crc %u\n", @@ -875,12 +876,13 @@ static int write_partial_msg_pages(struct ceph_connection *con) total_max_write); if (do_crc && !con->out_msg_pos.did_page_crc) { + u32 crc; void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); BUG_ON(kaddr == NULL); - con->out_msg->footer.data_crc = - cpu_to_le32(crc32c(tmpcrc, base, len)); + crc = crc32c(tmpcrc, base, len); + con->out_msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = kernel_sendpage(con->sock, page, @@ -1650,8 +1652,9 @@ static int read_partial_message(struct ceph_connection *con) con->in_base_pos += ret; if (con->in_base_pos == sizeof(con->in_hdr)) { u32 crc = crc32c(0, &con->in_hdr, - sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); - if (crc != le32_to_cpu(con->in_hdr.crc)) { + offsetof(struct ceph_msg_header, crc)); + + if (cpu_to_le32(crc) != con->in_hdr.crc) { pr_err("read_partial_message bad hdr " " crc %u != expected %u\n", crc, con->in_hdr.crc); From fe3ad593e2c34457ffa6233014ab19f4d36f85f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: [PATCH 55/65] libceph: do crc calculations outside loop Move blocks of code out of loops in read_partial_message_section() and read_partial_message(). They were only was getting called at the end of the last iteration of the loop anyway. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 7ec6a228b667..575511a29eb7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1544,10 +1544,9 @@ static int read_partial_message_section(struct ceph_connection *con, if (ret <= 0) return ret; section->iov_len += ret; - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, - section->iov_len); } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); return 1; } @@ -1638,6 +1637,7 @@ static int read_partial_message(struct ceph_connection *con) bool do_datacrc = con->msgr->nocrc; int skip; u64 seq; + u32 crc; dout("read_partial_message con %p msg %p\n", con, m); @@ -1650,18 +1650,16 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; con->in_base_pos += ret; - if (con->in_base_pos == sizeof(con->in_hdr)) { - u32 crc = crc32c(0, &con->in_hdr, - offsetof(struct ceph_msg_header, crc)); - - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr " - " crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - } } + + crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->in_hdr.crc) { + pr_err("read_partial_message bad hdr " + " crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + front_len = le32_to_cpu(con->in_hdr.front_len); if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; From f42299e6c3883c69c14079b8c88fe33815b2dcc3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: [PATCH 56/65] libceph: small refactor in write_partial_kvec() Make a small change in the code that counts down kvecs consumed by a ceph_tcp_sendmsg() call. Same functionality, just blocked out a little differently. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 575511a29eb7..e8f236e87f38 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -747,17 +747,18 @@ static int write_partial_kvec(struct ceph_connection *con) con->out_kvec_bytes -= ret; if (con->out_kvec_bytes == 0) break; /* done */ - while (ret > 0) { - if (ret >= con->out_kvec_cur->iov_len) { - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } else { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - ret = 0; - break; - } + + /* account for full iov entries consumed */ + while (ret >= con->out_kvec_cur->iov_len) { + BUG_ON(!con->out_kvec_left); + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; } } con->out_kvec_left = 0; From 84495f496170a73ed79667b7fbf91947b7f47c87 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:55 -0600 Subject: [PATCH 57/65] libceph: some simple changes Nothing too big here. - define the size of the buffer used for consuming ignored incoming data using a symbolic constant - simplify the condition determining whether to unmap the page in write_partial_msg_pages(): do it for crc but not if the page is the zero page Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e8f236e87f38..1a22975945da 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -38,6 +38,11 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; static struct lock_class_key socket_class; #endif +/* + * When skipping (ignoring) a block of input we read it into a "skip + * buffer," which is this many bytes in size. + */ +#define SKIP_BUF_SIZE 1024 static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); @@ -892,8 +897,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (do_crc && - (msg->pages || msg->pagelist || msg->bio || in_trail)) + if (do_crc && kaddr != zero_page_address) kunmap(page); if (ret == -EAGAIN) @@ -1982,8 +1986,9 @@ more: * * FIXME: there must be a better way to do this! */ - static char buf[1024]; - int skip = min(1024, -con->in_base_pos); + static char buf[SKIP_BUF_SIZE]; + int skip = min((int) sizeof (buf), -con->in_base_pos); + dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); ret = ceph_tcp_recvmsg(con->sock, buf, skip); if (ret <= 0) From 37675b0f42a8f7699c3602350d1c3b2a1698a3d3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: [PATCH 58/65] libceph: fix inverted crc option logic CRC's are computed for all messages between ceph entities. The CRC computation for the data portion of message can optionally be disabled using the "nocrc" (common) ceph option. The default is for CRC computation for the data portion to be enabled. Unfortunately, the code that implements this feature interprets the feature flag wrong, meaning that by default the CRC's have *not* been computed (or checked) for the data portion of messages unless the "nocrc" option was supplied. Fix this, in write_partial_msg_pages() and read_partial_message(). Also change the flag variable in write_partial_msg_pages() to be "no_datacrc" to match the usage elsewhere in the file. This fixes http://tracker.newdream.net/issues/2064 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 1a22975945da..589b7689d31b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -812,7 +812,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct ceph_msg *msg = con->out_msg; unsigned data_len = le32_to_cpu(msg->hdr.data_len); size_t len; - bool do_crc = con->msgr->nocrc; + bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; int in_trail = 0; @@ -850,17 +850,17 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = list_first_entry(&msg->trail->head, struct page, lru); - if (do_crc) + if (do_datacrc) kaddr = kmap(page); max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; - if (do_crc) + if (do_datacrc) kaddr = kmap(page); } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); - if (do_crc) + if (do_datacrc) kaddr = kmap(page); #ifdef CONFIG_BLOCK } else if (msg->bio) { @@ -869,19 +869,19 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; page_shift = bv->bv_offset; - if (do_crc) + if (do_datacrc) kaddr = kmap(page) + page_shift; max_write = bv->bv_len; #endif } else { page = zero_page; - if (do_crc) + if (do_datacrc) kaddr = zero_page_address; } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); - if (do_crc && !con->out_msg_pos.did_page_crc) { + if (do_datacrc && !con->out_msg_pos.did_page_crc) { u32 crc; void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); @@ -897,7 +897,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (do_crc && kaddr != zero_page_address) + if (do_datacrc && kaddr != zero_page_address) kunmap(page); if (ret == -EAGAIN) @@ -927,7 +927,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ - if (!do_crc) + if (!do_datacrc) con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; ceph_con_out_kvec_reset(con); prepare_write_message_footer(con); @@ -1639,7 +1639,7 @@ static int read_partial_message(struct ceph_connection *con) int ret; int to, left; unsigned front_len, middle_len, data_len; - bool do_datacrc = con->msgr->nocrc; + bool do_datacrc = !con->msgr->nocrc; int skip; u64 seq; u32 crc; From 31739139f3ed7be802dd9019ec8d8cc910e3d241 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: [PATCH 59/65] libceph: use kernel_sendpage() for sending zeroes If a message queued for send gets revoked, zeroes are sent over the wire instead of any unsent data. This is done by constructing a message and passing it to kernel_sendmsg() via ceph_tcp_sendmsg(). Since we are already working with a page in this case we can use the sendpage interface instead. Create a new ceph_tcp_sendpage() helper that sets up flags to match the way ceph_tcp_sendmsg() does now. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 589b7689d31b..9207a8c0b214 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -321,6 +321,19 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, return r; } +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); + int ret; + + ret = kernel_sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + /* * Shutdown/close the socket for the given connection. @@ -944,12 +957,9 @@ static int write_partial_skip(struct ceph_connection *con) int ret; while (con->out_skip > 0) { - struct kvec iov = { - .iov_base = zero_page_address, - .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) - }; + size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); - ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); if (ret <= 0) goto out; con->out_skip -= ret; From e36b13cceb46136d849aeee06b4907ad3570ba78 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: [PATCH 60/65] libceph: only call kernel_sendpage() via helper Make ceph_tcp_sendpage() be the only place kernel_sendpage() is used, by using this helper in write_partial_msg_pages(). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9207a8c0b214..adca1e6537ab 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -904,17 +904,13 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } - ret = kernel_sendpage(con->sock, page, + ret = ceph_tcp_sendpage(con->sock, page, con->out_msg_pos.page_pos + page_shift, - len, - MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_MORE); + len, 1); if (do_datacrc && kaddr != zero_page_address) kunmap(page); - if (ret == -EAGAIN) - ret = 0; if (ret <= 0) goto out; From 0cdf9e60189a87356a865a96dbafc2240af5c91d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: [PATCH 61/65] libceph: get rid of zero_page_address There's not a lot of benefit to zero_page_address, which basically holds a mapping of the zero page through the life of the messenger module. Even with our own mapping, the sendpage interface where it's used may need to kmap() it again. It's almost certain to be in low memory anyway. So stop treating the zero page specially in write_partial_msg_pages() and just get rid of zero_page_address entirely. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index adca1e6537ab..4f1714c4c93b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -61,7 +61,6 @@ static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ -static void *zero_page_address; /* kernel virtual addr of zero_page */ const char *ceph_pr_addr(const struct sockaddr_storage *ss) { @@ -111,9 +110,6 @@ void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page_address == NULL); - zero_page_address = NULL; - BUG_ON(zero_page == NULL); kunmap(zero_page); page_cache_release(zero_page); @@ -126,9 +122,6 @@ int ceph_msgr_init(void) zero_page = ZERO_PAGE(0); page_cache_get(zero_page); - BUG_ON(zero_page_address != NULL); - zero_page_address = kmap(zero_page); - ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (ceph_msgr_wq) return 0; @@ -889,7 +882,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) } else { page = zero_page; if (do_datacrc) - kaddr = zero_page_address; + kaddr = kmap(page); } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); @@ -908,7 +901,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg_pos.page_pos + page_shift, len, 1); - if (do_datacrc && kaddr != zero_page_address) + if (do_datacrc) kunmap(page); if (ret <= 0) From 9bd1966344bf975b5ce65e80fd6bacc41b4325a8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: [PATCH 62/65] libceph: rename "page_shift" variable to something sensible In write_partial_msg_pages() there is a local variable used to track the starting offset within a bio segment to use. Its name, "page_shift" defies the Linux convention of using that name for log-base-2(page size). Since it's only used in the bio case rename it "bio_offset". Use it along with the page_pos field to compute the memory offset when computing CRC's in that function. This makes the bio case match the others more closely. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4f1714c4c93b..2bf9ab4429e6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -837,7 +837,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct page *page = NULL; void *kaddr = NULL; int max_write = PAGE_SIZE; - int page_shift = 0; + int bio_offset = 0; total_max_write = data_len - trail_len - con->out_msg_pos.data_pos; @@ -874,9 +874,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; - page_shift = bv->bv_offset; + bio_offset = bv->bv_offset; if (do_datacrc) - kaddr = kmap(page) + page_shift; + kaddr = kmap(page); max_write = bv->bv_len; #endif } else { @@ -888,17 +888,18 @@ static int write_partial_msg_pages(struct ceph_connection *con) total_max_write); if (do_datacrc && !con->out_msg_pos.did_page_crc) { + void *base; u32 crc; - void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); BUG_ON(kaddr == NULL); + base = kaddr + con->out_msg_pos.page_pos + bio_offset; crc = crc32c(tmpcrc, base, len); con->out_msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, - con->out_msg_pos.page_pos + page_shift, + con->out_msg_pos.page_pos + bio_offset, len, 1); if (do_datacrc) From 8d63e318c4eb1bea6f7e3cb4b77849eaa167bfec Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: [PATCH 63/65] libceph: isolate kmap() call in write_partial_msg_pages() In write_partial_msg_pages(), every case now does an identical call to kmap(page). Instead, just call it once inside the CRC-computing block where it's needed. Move the definition of kaddr inside that block, and make it a (char *) to ensure portable pointer arithmetic. We still don't kunmap() it until after the sendpage() call, in case that also ends up needing to use the mapping. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2bf9ab4429e6..f0993af2ae4d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -835,7 +835,6 @@ static int write_partial_msg_pages(struct ceph_connection *con) while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; - void *kaddr = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; @@ -856,18 +855,12 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = list_first_entry(&msg->trail->head, struct page, lru); - if (do_datacrc) - kaddr = kmap(page); max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; - if (do_datacrc) - kaddr = kmap(page); } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); - if (do_datacrc) - kaddr = kmap(page); #ifdef CONFIG_BLOCK } else if (msg->bio) { struct bio_vec *bv; @@ -875,14 +868,10 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; bio_offset = bv->bv_offset; - if (do_datacrc) - kaddr = kmap(page); max_write = bv->bv_len; #endif } else { page = zero_page; - if (do_datacrc) - kaddr = kmap(page); } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); @@ -891,7 +880,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) void *base; u32 crc; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + char *kaddr; + kaddr = kmap(page); BUG_ON(kaddr == NULL); base = kaddr + con->out_msg_pos.page_pos + bio_offset; crc = crc32c(tmpcrc, base, len); From 3489b42a72a41d477665ab37f196ae9257180abb Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Mar 2012 16:50:09 -0600 Subject: [PATCH 64/65] ceph: fix three bugs, two in ceph_vxattrcb_file_layout() In ceph_vxattrcb_file_layout(), there is a check to determine whether a preferred PG should be formatted into the output buffer. That check assumes that a preferred PG number of 0 indicates "no preference," but that is wrong. No preference is indicated by a negative (specifically, -1) PG number. In addition, if that condition yields true, the preferred value is formatted into a sized buffer, but the size consumed by the earlier snprintf() call is not accounted for, opening up the possibilty of a buffer overrun. Finally, in ceph_vxattrcb_dir_rctime() where the nanoseconds part of the time displayed did not include leading 0's, which led to erroneous (sub-second portion of) time values being shown. This fixes these three issues: http://tracker.newdream.net/issues/2155 http://tracker.newdream.net/issues/2156 http://tracker.newdream.net/issues/2157 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/xattr.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 8294f461ecd1..35b86331d8a5 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -79,7 +79,7 @@ static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, (long)ci->i_rctime.tv_nsec); } @@ -118,10 +118,15 @@ static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (ceph_file_layout_pg_preferred(ci->i_layout)) - ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + + if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) { + val += ret; + size -= ret; + ret += snprintf(val, size, "preferred_osd=%lld\n", (unsigned long long)ceph_file_layout_pg_preferred( ci->i_layout)); + } + return ret; } From c666601a935b94cc0f3310339411b6940de751ba Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 21 Nov 2011 17:11:12 -0800 Subject: [PATCH 65/65] rbd: move snap_rwsem to the device, rename to header_rwsem A new temporary header is allocated each time the header changes, but only the changed properties are copied over. We don't need a new semaphore for each header update. This addresses http://tracker.newdream.net/issues/2174 Signed-off-by: Josh Durgin Reviewed-by: Alex Elder --- drivers/block/rbd.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6bbd5af1f029..013c7a549fb6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -82,7 +82,6 @@ struct rbd_image_header { __u8 obj_order; __u8 crypt_type; __u8 comp_type; - struct rw_semaphore snap_rwsem; struct ceph_snap_context *snapc; size_t snap_names_len; u64 snap_seq; @@ -173,6 +172,8 @@ struct rbd_device { struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; + /* protects updating the header */ + struct rw_semaphore header_rwsem; char snap_name[RBD_MAX_SNAP_NAME_LEN]; u32 cur_snap; /* index+1 of current snapshot within snap context 0 - for the head */ @@ -502,7 +503,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, if (!header->snapc) return -ENOMEM; - init_rwsem(&header->snap_rwsem); header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); if (snap_count) { header->snap_names = kmalloc(header->snap_names_len, @@ -597,7 +597,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME)); - down_write(&header->snap_rwsem); + down_write(&dev->header_rwsem); if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { @@ -620,7 +620,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) ret = 0; done: - up_write(&header->snap_rwsem); + up_write(&dev->header_rwsem); return ret; } @@ -887,7 +887,6 @@ static int rbd_do_request(struct request *rq, struct timespec mtime = CURRENT_TIME; struct rbd_request *req_data; struct ceph_osd_request_head *reqhead; - struct rbd_image_header *header = &dev->header; struct ceph_osd_client *osdc; req_data = kzalloc(sizeof(*req_data), GFP_NOIO); @@ -905,13 +904,13 @@ static int rbd_do_request(struct request *rq, dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); - down_read(&header->snap_rwsem); + down_read(&dev->header_rwsem); osdc = &dev->rbd_client->client->osdc; req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, false, GFP_NOIO, pages, bio); if (!req) { - up_read(&header->snap_rwsem); + up_read(&dev->header_rwsem); ret = -ENOMEM; goto done_pages; } @@ -946,7 +945,7 @@ static int rbd_do_request(struct request *rq, snapc, &mtime, req->r_oid, req->r_oid_len); - up_read(&header->snap_rwsem); + up_read(&dev->header_rwsem); if (linger_req) { ceph_osdc_set_request_linger(osdc, req); @@ -1718,7 +1717,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) /* resized? */ set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE); - down_write(&rbd_dev->header.snap_rwsem); + down_write(&rbd_dev->header_rwsem); snap_seq = rbd_dev->header.snapc->seq; if (rbd_dev->header.total_snaps && @@ -1743,7 +1742,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) ret = __rbd_init_snaps_header(rbd_dev); - up_write(&rbd_dev->header.snap_rwsem); + up_write(&rbd_dev->header_rwsem); return ret; } @@ -2380,8 +2379,9 @@ static ssize_t rbd_add(struct bus_type *bus, spin_lock_init(&rbd_dev->lock); INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); + init_rwsem(&rbd_dev->header_rwsem); - init_rwsem(&rbd_dev->header.snap_rwsem); + init_rwsem(&rbd_dev->header_rwsem); /* generate unique id: find highest unique id, add one */ rbd_id_get(rbd_dev);