From 22d2cfdffa5bff3566e16cb7320e13ceb814674b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 4 Jun 2020 11:12:34 +0200 Subject: [PATCH 1/3] libceph: move away from global osd_req_flags osd_req_flags is overly general and doesn't suit its only user (read_from_replica option) well: - applying osd_req_flags in account_request() affects all OSD requests, including linger (i.e. watch and notify). However, linger requests should always go to the primary even though some of them are reads (e.g. notify has side effects but it is a read because it doesn't result in mutation on the OSDs). - calls to class methods that are reads are allowed to go to the replica, but most such calls issued for "rbd map" and/or exclusive lock transitions are requested to be resent to the primary via EAGAIN, doubling the latency. Get rid of global osd_req_flags and set read_from_replica flag only on specific OSD requests instead. Fixes: 8ad44d5e0d1e ("libceph: read_from_replica option") Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- drivers/block/rbd.c | 4 +++- include/linux/ceph/libceph.h | 4 ++-- net/ceph/ceph_common.c | 14 ++++++-------- net/ceph/osd_client.c | 7 ++----- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7420648a1de6..4f61e9209461 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1451,8 +1451,10 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) static void rbd_osd_format_read(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_request = osd_req->r_priv; + struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; + struct ceph_options *opt = rbd_dev->rbd_client->client->options; - osd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica; osd_req->r_snapid = obj_request->img_request->snap_id; } diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2247e71beb83..e5ed1c541e7f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -52,8 +52,7 @@ struct ceph_options { unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */ - - u32 osd_req_flags; /* CEPH_OSD_FLAG_*, applied to each OSD request */ + u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */ /* * any type that can't be simply compared or doesn't need @@ -76,6 +75,7 @@ struct ceph_options { #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */ +#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */ #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index afe0e8184c23..4e7edd707a14 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -332,6 +332,7 @@ struct ceph_options *ceph_alloc_options(void) opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; + opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -490,16 +491,13 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_read_from_replica: switch (result.uint_32) { case Opt_read_from_replica_no: - opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | - CEPH_OSD_FLAG_LOCALIZE_READS); + opt->read_from_replica = 0; break; case Opt_read_from_replica_balance: - opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS; - opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS; + opt->read_from_replica = CEPH_OSD_FLAG_BALANCE_READS; break; case Opt_read_from_replica_localize: - opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; - opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS; + opt->read_from_replica = CEPH_OSD_FLAG_LOCALIZE_READS; break; default: BUG(); @@ -613,9 +611,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } seq_putc(m, ','); } - if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) { + if (opt->read_from_replica == CEPH_OSD_FLAG_BALANCE_READS) { seq_puts(m, "read_from_replica=balance,"); - } else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) { + } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4fea3c33af2a..a37d159019a0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1117,10 +1117,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } - req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); + req->r_flags = flags | osdc->client->options->read_from_replica; req->r_snapid = vino.snap; if (flags & CEPH_OSD_FLAG_WRITE) @@ -2431,14 +2431,11 @@ promote: static void account_request(struct ceph_osd_request *req) { - struct ceph_osd_client *osdc = req->r_osdc; - WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); req->r_flags |= CEPH_OSD_FLAG_ONDISK; - req->r_flags |= osdc->client->options->osd_req_flags; - atomic_inc(&osdc->num_requests); + atomic_inc(&req->r_osdc->num_requests); req->r_start_stamp = jiffies; req->r_start_latency = ktime_get(); From 2f3fead62144002557f322c2a7c15e1255df0653 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 9 Jun 2020 11:57:56 +0200 Subject: [PATCH 2/3] libceph: don't omit recovery_deletes in target_copy() Currently target_copy() is used only for sending linger pings, so this doesn't come up, but generally omitting recovery_deletes can result in unneeded resends (force_resend in calc_target()). Fixes: ae78dd8139ce ("libceph: make RECOVERY_DELETES feature create a new interval") Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/osd_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a37d159019a0..8f7fbe861dff 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -445,6 +445,7 @@ static void target_copy(struct ceph_osd_request_target *dest, dest->size = src->size; dest->min_size = src->min_size; dest->sort_bitwise = src->sort_bitwise; + dest->recovery_deletes = src->recovery_deletes; dest->flags = src->flags; dest->paused = src->paused; From 7ed286f3e061ee394782bd9fb4ed96bff0b5a021 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 9 Jun 2020 11:59:08 +0200 Subject: [PATCH 3/3] libceph: don't omit used_replica in target_copy() Currently target_copy() is used only for sending linger pings, so this doesn't come up, but generally omitting used_replica can hang the client as we wouldn't notice the acting set change (legacy_change in calc_target()) or trigger a warning in handle_reply(). Fixes: 117d96a04f00 ("libceph: support for balanced and localized reads") Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/osd_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8f7fbe861dff..2db8b44e70c2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -448,6 +448,7 @@ static void target_copy(struct ceph_osd_request_target *dest, dest->recovery_deletes = src->recovery_deletes; dest->flags = src->flags; + dest->used_replica = src->used_replica; dest->paused = src->paused; dest->epoch = src->epoch;