From 663ae2cc04773608e1e741f693e41200fd4faf14 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 May 2016 13:18:57 +0200 Subject: [PATCH 01/71] rbd: get/put img_request in rbd_img_request_submit() By the time we get to checking for_each_obj_request_safe(img_request) terminating condition, all obj_requests may be complete and img_request ref, that rbd_img_request_submit() takes away from its caller, may be put. Moving the next_obj_request cursor is then a use-after-free on img_request. It's totally benign, as the value that's read is never used, but I think it's still worth fixing. Cc: Alex Elder Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0ede6d7e25686c..c3089f32a39294 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2973,17 +2973,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; struct rbd_obj_request *next_obj_request; + int ret = 0; dout("%s: img %p\n", __func__, img_request); - for_each_obj_request_safe(img_request, obj_request, next_obj_request) { - int ret; + rbd_img_request_get(img_request); + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { ret = rbd_img_obj_request_submit(obj_request); if (ret) - return ret; + goto out_put_ireq; } - return 0; +out_put_ireq: + rbd_img_request_put(img_request); + return ret; } static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) From 3ed97d6345a36a0a61e6af62ad8a66ca40f1aa2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 26 Apr 2016 15:05:29 +0200 Subject: [PATCH 02/71] libceph: make ceph_osdc_put_request() accept NULL Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 9 +++------ net/ceph/osd_client.c | 8 +++++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4801571f51cb4e..3e61fc8bb37102 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1099,8 +1099,7 @@ static int ceph_writepages_start(struct address_space *mapping, mapping->writeback_index = index; out: - if (req) - ceph_osdc_put_request(req); + ceph_osdc_put_request(req); ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; @@ -1824,10 +1823,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out_unlock: up_write(&mdsc->pool_perm_rwsem); - if (rd_req) - ceph_osdc_put_request(rd_req); - if (wr_req) - ceph_osdc_put_request(wr_req); + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); out: if (!err) err = have; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 40a53a70efdffe..cacce9e35f08fa 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -354,9 +354,11 @@ EXPORT_SYMBOL(ceph_osdc_get_request); void ceph_osdc_put_request(struct ceph_osd_request *req) { - dout("%s %p (was %d)\n", __func__, req, - atomic_read(&req->r_kref.refcount)); - kref_put(&req->r_kref, ceph_osdc_release_request); + if (req) { + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_put(&req->r_kref, ceph_osdc_release_request); + } } EXPORT_SYMBOL(ceph_osdc_put_request); From 841272825b2263174120ab02b4abac9005ee1420 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 26 Apr 2016 15:39:47 +0200 Subject: [PATCH 03/71] libceph: grab snapc in ceph_osdc_alloc_request() ceph_osdc_build_request() is going away. Grab snapc and initialize ->r_snapid in ceph_osdc_alloc_request(). Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cacce9e35f08fa..ccb9539dc78096 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -391,6 +391,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; + req->r_snapid = CEPH_NOSNAP; + req->r_snapc = ceph_get_snap_context(snapc); kref_init(&req->r_kref); init_completion(&req->r_completion); @@ -2457,7 +2459,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, unsigned int i; req->r_snapid = snap_id; - req->r_snapc = ceph_get_snap_context(snapc); + WARN_ON(snapc != req->r_snapc); /* encode request */ msg->hdr.version = cpu_to_le16(4); @@ -2508,7 +2510,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); if (req->r_snapc) { - for (i = 0; i < snapc->num_snaps; i++) { + for (i = 0; i < req->r_snapc->num_snaps; i++) { ceph_encode_64(&p, req->r_snapc->snaps[i]); } } From 13d1ad16d05eebb4db977eb955716b9da2c19fbd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Apr 2016 14:15:51 +0200 Subject: [PATCH 04/71] libceph: move message allocation out of ceph_osdc_alloc_request() The size of ->r_request and ->r_reply messages depends on the size of the object name (ceph_object_id), while the size of ceph_osd_request is fixed. Move message allocation into a separate function that would have to be called after ceph_object_id and ceph_object_locator (which is also going to become variable in size with RADOS namespaces) have been filled in: req = ceph_osdc_alloc_request(...); r_base_oid> r_base_oloc> ceph_osdc_alloc_messages(req); Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 18 ++++++- fs/ceph/addr.c | 8 +++ fs/ceph/file.c | 7 +++ include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 88 +++++++++++++++++++-------------- 5 files changed, 82 insertions(+), 40 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c3089f32a39294..bda4deade82e1e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1954,7 +1954,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; @@ -1967,7 +1967,14 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; + return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } /* @@ -2003,7 +2010,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_callback = rbd_osd_req_callback; @@ -2012,7 +2019,14 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; + return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e61fc8bb37102..6fee7e0b8931c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1762,6 +1762,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) "%llx.00000000", ci->i_vino.ino); rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!wr_req) { @@ -1775,6 +1779,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_base_oloc.pool = pool; wr_req->r_base_oid = rd_req->r_base_oid; + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; + /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); if (IS_ERR(pages)) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a79f9269831e38..5d46d106bbb77a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -717,6 +717,13 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_base_oloc = orig_req->r_base_oloc; req->r_base_oid = orig_req->r_base_oid; + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } + req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cbf460927c424b..66a1fcd5bff788 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -322,6 +322,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, struct ceph_snap_context *snapc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccb9539dc78096..d66dacc9d0d404 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -369,8 +369,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, gfp_t gfp_flags) { struct ceph_osd_request *req; - struct ceph_msg *msg; - size_t msg_size; if (use_mempool) { BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); @@ -407,53 +405,59 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_base_oloc.pool = -1; req->r_target_oloc.pool = -1; - msg_size = OSD_OPREPLY_FRONT_LEN; - if (num_ops > CEPH_OSD_SLAB_OPS) { - /* ceph_osd_op and rval */ - msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * - (sizeof(struct ceph_osd_op) + 4); - } + dout("%s req %p\n", __func__, req); + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); - /* create reply message */ - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, - gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - req->r_reply = msg; +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_msg *msg; + int msg_size; + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ - msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); + msg_size += 4 + req->r_base_oid.name_len; /* oid */ + msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ - msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); msg_size += 4; /* retry_attempt */ - /* create request message; allow space for oid */ - if (use_mempool) + if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); + if (!msg) + return -ENOMEM; memset(msg->front.iov_base, 0, msg->front.iov_len); - req->r_request = msg; - return req; + /* create reply message */ + msg_size = OSD_OPREPLY_FRONT_LEN; + if (req->r_num_ops > CEPH_OSD_SLAB_OPS) { + /* ceph_osd_op and rval */ + msg_size += (req->r_num_ops - CEPH_OSD_SLAB_OPS) * + (sizeof(struct ceph_osd_op) + 4); + } + + if (req->r_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); + if (!msg) + return -ENOMEM; + + req->r_reply = msg; + + return 0; } -EXPORT_SYMBOL(ceph_osdc_alloc_request); +EXPORT_SYMBOL(ceph_osdc_alloc_messages); static bool osd_req_opcode_valid(u16 opcode) { @@ -828,17 +832,17 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); - if (!req) - return ERR_PTR(-ENOMEM); + if (!req) { + r = -ENOMEM; + goto fail; + } req->r_flags = flags; /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); - if (r < 0) { - ceph_osdc_put_request(req); - return ERR_PTR(r); - } + if (r) + goto fail; if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); @@ -864,7 +868,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, "%llx.%08llx", vino.ino, objnum); req->r_base_oid.name_len = strlen(req->r_base_oid.name); + r = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (r) + goto fail; + return req; + +fail: + ceph_osdc_put_request(req); + return ERR_PTR(r); } EXPORT_SYMBOL(ceph_osdc_new_request); From 711da55d36a6f1eddcd340969be7223110d2f6b0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Apr 2016 18:32:56 +0200 Subject: [PATCH 05/71] libceph: change how osd_op_reply message size is calculated For a message pool message, preallocate a page, just like we do for osd_op. For a normal message, take ceph_object_id into account and don't bother subtracting CEPH_OSD_SLAB_OPS ceph_osd_ops. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d66dacc9d0d404..75e27bd3d3729b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -19,7 +19,6 @@ #include #include -#define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 static struct kmem_cache *ceph_osd_request_cache; @@ -440,11 +439,8 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) /* create reply message */ msg_size = OSD_OPREPLY_FRONT_LEN; - if (req->r_num_ops > CEPH_OSD_SLAB_OPS) { - /* ceph_osd_op and rval */ - msg_size += (req->r_num_ops - CEPH_OSD_SLAB_OPS) * - (sizeof(struct ceph_osd_op) + 4); - } + msg_size += req->r_base_oid.name_len; + msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -2702,13 +2698,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) goto out; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, - OSD_OP_FRONT_LEN, 10, true, - "osd_op"); + PAGE_SIZE, 10, true, "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, 10, true, - "osd_op_reply"); + PAGE_SIZE, 10, true, "osd_op_reply"); if (err < 0) goto out_msgpool; From d30291b985d1854565d7f2c82a4457869d5265e8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 Apr 2016 19:54:20 +0200 Subject: [PATCH 06/71] libceph: variable-sized ceph_object_id Currently ceph_object_id can hold object names of up to 100 (CEPH_MAX_OID_NAME_LEN) characters. This is enough for all use cases, expect one - long rbd image names: - a format 1 header is named ".rbd" - an object that points to a format 2 header is named "rbd_id." We operate on these potentially long-named objects during rbd map, and, for format 1 images, during header refresh. (A format 2 header name is a small system-generated string.) Lift this 100 character limit by making ceph_object_id be able to point to an externally-allocated string. Apart from being able to work with almost arbitrarily-long named objects, this allows us to reduce the size of ceph_object_id from >100 bytes to 64 bytes. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 +++- fs/ceph/addr.c | 6 +-- fs/ceph/file.c | 2 +- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 62 +++++++++++++++---------- net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 16 ++++--- net/ceph/osdmap.c | 93 ++++++++++++++++++++++++++++++++++++- 8 files changed, 150 insertions(+), 41 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bda4deade82e1e..3bf93a2a20f023 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1965,7 +1965,9 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) goto fail; @@ -2017,7 +2019,9 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) goto fail; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6fee7e0b8931c1..6f28dd9bacb219 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1758,9 +1758,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); if (err) @@ -1777,7 +1775,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); if (err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5d46d106bbb77a..9d470397e249e8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -715,7 +715,7 @@ static void ceph_aio_retry_work(struct work_struct *work) CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); if (ret) { diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158ea..db296709784ae3 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -213,7 +213,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e55c08bc3a960d..777a29412706ff 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -64,11 +64,47 @@ struct ceph_object_locator { */ #define CEPH_MAX_OID_NAME_LEN 100 +/* + * 51-char inline_name is long enough for all cephfs and all but one + * rbd requests: in ".rbd"/"rbd_id." can be + * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all + * other rbd requests fit into inline_name. + * + * Makes ceph_object_id 64 bytes on 64-bit. + */ +#define CEPH_OID_INLINE_LEN 52 + +/* + * Both inline and external buffers have space for a NUL-terminator, + * which is carried around. It's not required though - RADOS object + * names don't have to be NUL-terminated and may contain NULs. + */ struct ceph_object_id { - char name[CEPH_MAX_OID_NAME_LEN]; + char *name; + char inline_name[CEPH_OID_INLINE_LEN]; int name_len; }; +static inline void ceph_oid_init(struct ceph_object_id *oid) +{ + oid->name = oid->inline_name; + oid->name_len = 0; +} + +static inline bool ceph_oid_empty(const struct ceph_object_id *oid) +{ + return oid->name == oid->inline_name && !oid->name_len; +} + +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src); +__printf(2, 3) +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...); +__printf(3, 4) +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...); +void ceph_oid_destroy(struct ceph_object_id *oid); + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -113,30 +149,6 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline void ceph_oid_set_name(struct ceph_object_id *oid, - const char *name) -{ - int len; - - len = strlen(name); - if (len > sizeof(oid->name)) { - WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", - name, len, sizeof(oid->name)); - len = sizeof(oid->name); - } - - memcpy(oid->name, name, len); - oid->name_len = len; -} - -static inline void ceph_oid_copy(struct ceph_object_id *dest, - struct ceph_object_id *src) -{ - BUG_ON(src->name_len > sizeof(dest->name)); - memcpy(dest->name, src->name, src->name_len); - dest->name_len = src->name_len; -} - static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index b902fbc7863ef8..6f8413293d151c 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_base_oid.name_len, + seq_printf(s, "%*pE", req->r_base_oid.name_len, req->r_base_oid.name); if (req->r_reassert_version.epoch) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 75e27bd3d3729b..95910aed8e2e60 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -334,7 +334,10 @@ static void ceph_osdc_release_request(struct kref *kref) for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); + ceph_oid_destroy(&req->r_base_oid); + ceph_oid_destroy(&req->r_target_oid); ceph_put_snap_context(req->r_snapc); + if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) @@ -401,7 +404,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + ceph_oid_init(&req->r_base_oid); req->r_base_oloc.pool = -1; + ceph_oid_init(&req->r_target_oid); req->r_target_oloc.pool = -1; dout("%s req %p\n", __func__, req); @@ -415,6 +420,8 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) struct ceph_msg *msg; int msg_size; + WARN_ON(ceph_oid_empty(&req->r_base_oid)); + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ @@ -859,10 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - - snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), - "%llx.%08llx", vino.ino, objnum); - req->r_base_oid.name_len = strlen(req->r_base_oid.name); + ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) @@ -1410,7 +1414,7 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, req->r_target_oloc = req->r_base_oloc; /* struct */ need_check_tiering = true; } - if (req->r_target_oid.name_len == 0) { + if (ceph_oid_empty(&req->r_target_oid)) { ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); need_check_tiering = true; } @@ -2501,7 +2505,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, /* oid */ ceph_encode_32(&p, req->r_base_oid.name_len); memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + dout("oid %*pE len %d\n", req->r_base_oid.name_len, req->r_base_oid.name, req->r_base_oid.name_len); p += req->r_base_oid.name_len; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 243574c8cf3380..4668b871ca47d8 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1381,8 +1381,99 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, return ERR_PTR(err); } +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src) +{ + WARN_ON(!ceph_oid_empty(dest)); + + if (src->name != src->inline_name) { + /* very rare, see ceph_object_id definition */ + dest->name = kmalloc(src->name_len + 1, + GFP_NOIO | __GFP_NOFAIL); + } + memcpy(dest->name, src->name, src->name_len + 1); + dest->name_len = src->name_len; +} +EXPORT_SYMBOL(ceph_oid_copy); +static __printf(2, 0) +int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) +{ + int len; + + WARN_ON(!ceph_oid_empty(oid)); + + len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); + if (len >= sizeof(oid->inline_name)) + return len; + + oid->name_len = len; + return 0; +} + +/* + * If oid doesn't fit into inline buffer, BUG. + */ +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + BUG_ON(oid_printf_vargs(oid, fmt, ap)); + va_end(ap); +} +EXPORT_SYMBOL(ceph_oid_printf); + +static __printf(3, 0) +int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, va_list ap) +{ + va_list aq; + int len; + + va_copy(aq, ap); + len = oid_printf_vargs(oid, fmt, aq); + va_end(aq); + + if (len) { + char *external_name; + + external_name = kmalloc(len + 1, gfp); + if (!external_name) + return -ENOMEM; + + oid->name = external_name; + WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); + oid->name_len = len; + } + + return 0; +} + +/* + * If oid doesn't fit into inline buffer, allocate. + */ +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = oid_aprintf_vargs(oid, gfp, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL(ceph_oid_aprintf); + +void ceph_oid_destroy(struct ceph_object_id *oid) +{ + if (oid->name != oid->inline_name) + kfree(oid->name); +} +EXPORT_SYMBOL(ceph_oid_destroy); /* * calculate file layout from given offset, length. @@ -1474,7 +1565,7 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); - dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, pg_out->pool, pg_out->seed); return 0; } From c41d13a31fefed303f734c0c5106f6dcd262168e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 Apr 2016 20:01:25 +0200 Subject: [PATCH 07/71] rbd: use header_oid instead of header_name Switch to ceph_object_id and use ceph_oid_aprintf() instead of a bare const char *. This reduces noise in rbd_dev_header_name(). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 57 +++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3bf93a2a20f023..f3ea927f93deb3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -350,7 +350,7 @@ struct rbd_device { struct rbd_spec *spec; struct rbd_options *opts; - char *header_name; + struct ceph_object_id header_oid; struct ceph_file_layout layout; @@ -3117,7 +3117,7 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0, OBJ_REQUEST_NODATA); if (!obj_request) return -ENOMEM; @@ -3148,7 +3148,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) int ret; dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_name, (unsigned long long)notify_id, + rbd_dev->header_oid.name, (unsigned long long)notify_id, (unsigned int)opcode); /* @@ -3179,7 +3179,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( struct rbd_obj_request *obj_request; int ret; - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0, OBJ_REQUEST_NODATA); if (!obj_request) return ERR_PTR(-ENOMEM); @@ -3612,7 +3612,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) if (!ondisk) return -ENOMEM; - ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, 0, size, ondisk); if (ret < 0) goto out; @@ -4054,6 +4054,8 @@ static void rbd_dev_release(struct device *dev) struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); bool need_put = !!rbd_dev->opts; + ceph_oid_destroy(&rbd_dev->header_oid); + rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); kfree(rbd_dev->opts); @@ -4084,6 +4086,8 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); + ceph_oid_init(&rbd_dev->header_oid); + rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.type = &rbd_device_type; rbd_dev->dev.parent = &rbd_root_dev; @@ -4132,7 +4136,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_size", &snapid, sizeof (snapid), &size_buf, sizeof (size_buf)); @@ -4172,7 +4176,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_object_prefix", NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4207,7 +4211,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 unsup; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_features", &snapid, sizeof (snapid), &features_buf, sizeof (features_buf)); @@ -4269,7 +4273,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(rbd_dev->spec->snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_parent", &snapid, sizeof (snapid), reply_buf, size); @@ -4372,7 +4376,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) u64 stripe_count; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_stripe_unit_count", NULL, 0, (char *)&striping_info_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4620,7 +4624,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_snapcontext", NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4685,7 +4689,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, return ERR_PTR(-ENOMEM); snapid = cpu_to_le64(snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_snapshot_name", &snapid, sizeof (snapid), reply_buf, size); @@ -5281,35 +5285,25 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) static int rbd_dev_header_name(struct rbd_device *rbd_dev) { struct rbd_spec *spec = rbd_dev->spec; - size_t size; + int ret; /* Record the header object name for this rbd image. */ rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); if (rbd_dev->image_format == 1) - size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); + ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", + spec->image_name, RBD_SUFFIX); else - size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); - - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) - return -ENOMEM; + ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", + RBD_HEADER_PREFIX, spec->image_id); - if (rbd_dev->image_format == 1) - sprintf(rbd_dev->header_name, "%s%s", - spec->image_name, RBD_SUFFIX); - else - sprintf(rbd_dev->header_name, "%s%s", - RBD_HEADER_PREFIX, spec->image_id); - return 0; + return ret; } static void rbd_dev_image_release(struct rbd_device *rbd_dev) { rbd_dev_unprobe(rbd_dev); - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -5348,7 +5342,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) pr_info("image %s/%s does not exist\n", rbd_dev->spec->pool_name, rbd_dev->spec->image_name); - goto out_header_name; + goto err_out_format; } } @@ -5394,7 +5388,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) goto err_out_probe; dout("discovered format %u image, header name is %s\n", - rbd_dev->image_format, rbd_dev->header_name); + rbd_dev->image_format, rbd_dev->header_oid.name); return 0; err_out_probe: @@ -5402,9 +5396,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) err_out_watch: if (!depth) rbd_dev_header_unwatch_sync(rbd_dev); -out_header_name: - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; err_out_format: rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); From 0c0a8de13f9612a663b050afa955e6668858d1eb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:21 +0200 Subject: [PATCH 08/71] libceph: nuke unused fields and functions Either unused or useless: osdmap->mkfs_epoch osd->o_marked_for_keepalive monc->num_generic_requests osdc->map_waiters osdc->last_requested_map osdc->timeout_tid osd_req_op_cls_response_data() osdmap_apply_incremental() @msgr arg Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 8 -------- include/linux/ceph/osdmap.h | 6 ++---- net/ceph/mon_client.c | 3 --- net/ceph/osd_client.c | 13 +------------ net/ceph/osdmap.c | 3 +-- 6 files changed, 4 insertions(+), 30 deletions(-) diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e230e7ed60d369..330d045e409295 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -77,7 +77,6 @@ struct ceph_mon_client { /* pending generic requests */ struct rb_root generic_request_tree; - int num_generic_requests; u64 last_tid; /* subs, indexed with CEPH_SUB_* */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 66a1fcd5bff788..63854a8df18301 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -37,11 +37,9 @@ struct ceph_osd { struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; - int o_marked_for_keepalive; struct list_head o_keepalive_item; }; - #define CEPH_OSD_SLAB_OPS 2 #define CEPH_OSD_MAX_OPS 16 @@ -206,13 +204,10 @@ struct ceph_osd_client { struct ceph_osdmap *osdmap; /* current map */ struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head req_lru; /* in-flight lru */ @@ -271,9 +266,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, unsigned int which); -extern struct ceph_osd_data *osd_req_op_cls_response_data( - struct ceph_osd_request *osd_req, - unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, unsigned int which, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 777a29412706ff..ce7a41a182d425 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -123,7 +123,6 @@ struct ceph_pg_mapping { struct ceph_osdmap { struct ceph_fsid fsid; u32 epoch; - u32 mkfs_epoch; struct ceph_timespec created, modified; u32 flags; /* CEPH_OSDMAP_* */ @@ -205,9 +204,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) } extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index cf638c009cfabe..3dfafdad92aab0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -579,7 +579,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, req->tid = tid != 0 ? tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); - monc->num_generic_requests++; ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); @@ -587,7 +586,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, mutex_lock(&monc->mutex); rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; if (!err) err = req->result; @@ -914,7 +912,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; - monc->num_generic_requests = 0; monc->last_tid = 0; return 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 95910aed8e2e60..32ba09be6ee66b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -143,14 +143,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -struct ceph_osd_data * -osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which) -{ - return osd_req_op_data(osd_req, which, cls, response_data); -} -EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ - void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, @@ -2166,8 +2158,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("applying incremental map %u len %d\n", epoch, maplen); newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap, - &osdc->client->msgr); + osdc->osdmap); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -2674,8 +2665,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->client = client; osdc->osdmap = NULL; init_rwsem(&osdc->map_sem); - init_completion(&osdc->map_waiters); - osdc->last_requested_map = 0; mutex_init(&osdc->request_mutex); osdc->last_tid = 0; osdc->osds = RB_ROOT; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4668b871ca47d8..9a0cc072a909f5 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1204,8 +1204,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * decode and apply an incremental map update. */ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr) + struct ceph_osdmap *map) { struct crush_map *newcrush = NULL; struct ceph_fsid fsid; From 42a2c09f2b0b95fa147bcdb56cdc02b980b9ac5e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: [PATCH 09/71] libceph: open-code remove_{all,old}_osds() They are called only once, from ceph_osdc_stop() and handle_osds_timeout() respectively. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 51 ++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 32ba09be6ee66b..c423e11d685700 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1126,18 +1126,6 @@ static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) } } -static void remove_all_osds(struct ceph_osd_client *osdc) -{ - dout("%s %p\n", __func__, osdc); - mutex_lock(&osdc->request_mutex); - while (!RB_EMPTY_ROOT(&osdc->osds)) { - struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), - struct ceph_osd, o_node); - remove_osd(osdc, osd); - } - mutex_unlock(&osdc->request_mutex); -} - static void __move_osd_to_lru(struct ceph_osd_client *osdc, struct ceph_osd *osd) { @@ -1165,20 +1153,6 @@ static void __remove_osd_from_lru(struct ceph_osd *osd) list_del_init(&osd->o_osd_lru); } -static void remove_old_osds(struct ceph_osd_client *osdc) -{ - struct ceph_osd *osd, *nosd; - - dout("__remove_old_osds %p\n", osdc); - mutex_lock(&osdc->request_mutex); - list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { - if (time_before(jiffies, osd->lru_ttl)) - break; - remove_osd(osdc, osd); - } - mutex_unlock(&osdc->request_mutex); -} - /* * reset osd connect */ @@ -1671,12 +1645,21 @@ static void handle_osds_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, osds_timeout_work.work); unsigned long delay = osdc->client->options->osd_idle_ttl / 4; + struct ceph_osd *osd, *nosd; - dout("osds timeout\n"); + dout("%s osdc %p\n", __func__, osdc); down_read(&osdc->map_sem); - remove_old_osds(osdc); - up_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + + list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { + if (time_before(jiffies, osd->lru_ttl)) + break; + + remove_osd(osdc, osd); + } + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } @@ -2722,11 +2705,19 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); + + mutex_lock(&osdc->request_mutex); + while (!RB_EMPTY_ROOT(&osdc->osds)) { + struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), + struct ceph_osd, o_node); + remove_osd(osdc, osd); + } + mutex_unlock(&osdc->request_mutex); + if (osdc->osdmap) { ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = NULL; } - remove_all_osds(osdc); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); From fcd00b68bbe2bf5606cb45c2cd4a250a390bcc1f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: [PATCH 10/71] libceph: DEFINE_RB_FUNCS macro Given struct foo { u64 id; struct rb_node bar_node; }; generate insert_bar(), erase_bar() and lookup_bar() functions with DEFINE_RB_FUNCS(bar, struct foo, id, bar_node) The key is assumed to be an integer (u64, int, etc), compared with < and >. nodefld has to be initialized with RB_CLEAR_NODE(). Start using it for MDS, MON and OSD requests and OSD sessions. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 54 +++++--------------- include/linux/ceph/libceph.h | 57 +++++++++++++++++++++ net/ceph/mon_client.c | 52 +++---------------- net/ceph/osd_client.c | 97 ++++-------------------------------- 4 files changed, 88 insertions(+), 172 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a09b..cff85af425d4ea 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -567,51 +567,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -630,7 +602,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -663,8 +635,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -1722,6 +1693,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2414,7 +2386,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2604,7 +2576,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index db92a8d4926eed..690985daad1c6e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_SHIFT); } +/* + * These are not meant to be generic - an integer key is assumed. + */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +static void insert_##name(struct rb_root *root, type *t) \ +{ \ + struct rb_node **n = &root->rb_node; \ + struct rb_node *parent = NULL; \ + \ + BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ + \ + while (*n) { \ + type *cur = rb_entry(*n, type, nodefld); \ + \ + parent = *n; \ + if (t->keyfld < cur->keyfld) \ + n = &(*n)->rb_left; \ + else if (t->keyfld > cur->keyfld) \ + n = &(*n)->rb_right; \ + else \ + BUG(); \ + } \ + \ + rb_link_node(&t->nodefld, parent, n); \ + rb_insert_color(&t->nodefld, root); \ +} \ +static void erase_##name(struct rb_root *root, type *t) \ +{ \ + BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ + rb_erase(&t->nodefld, root); \ + RB_CLEAR_NODE(&t->nodefld); \ +} + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +static type *lookup_##name(struct rb_root *root, \ + typeof(((type *)0)->keyfld) key) \ +{ \ + struct rb_node *n = root->rb_node; \ + \ + while (n) { \ + type *cur = rb_entry(n, type, nodefld); \ + \ + if (key < cur->keyfld) \ + n = n->rb_left; \ + else if (key > cur->keyfld) \ + n = n->rb_right; \ + else \ + return cur; \ + } \ + \ + return NULL; \ +} + +#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) + extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 3dfafdad92aab0..a426a4b03e757d 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -478,45 +478,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, /* * generic requests (currently statfs, mon_get_version) */ -static struct ceph_mon_generic_request *__lookup_generic_req( - struct ceph_mon_client *monc, u64 tid) -{ - struct ceph_mon_generic_request *req; - struct rb_node *n = monc->generic_request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mon_generic_request, node); - if (tid < req->tid) - n = n->rb_left; - else if (tid > req->tid) - n = n->rb_right; - else - return req; - } - return NULL; -} - -static void __insert_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *new) -{ - struct rb_node **p = &monc->generic_request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mon_generic_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mon_generic_request, node); - if (new->tid < req->tid) - p = &(*p)->rb_left; - else if (new->tid > req->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->generic_request_tree); -} +DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node) static void release_generic_request(struct kref *kref) { @@ -551,7 +513,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg *m; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (!req) { dout("get_generic_reply %lld dne\n", tid); *skip = 1; @@ -578,14 +540,14 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, /* register request */ req->tid = tid != 0 ? tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); + insert_generic_request(&monc->generic_request_tree, req); ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); + erase_generic_request(&monc->generic_request_tree, req); if (!err) err = req->result; @@ -619,7 +581,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (req) { *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; @@ -651,6 +613,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) return -ENOMEM; kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); req->buf = buf; init_completion(&req->completion); @@ -696,7 +659,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, goto bad; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, handle); + req = lookup_generic_request(&monc->generic_request_tree, handle); if (req) { *(u64 *)req->buf = ceph_decode_64(&p); req->result = 0; @@ -732,6 +695,7 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, return -ENOMEM; kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); req->buf = newest; init_completion(&req->completion); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index c423e11d685700..8256051ed88f10 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -875,45 +875,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid. */ -static void __insert_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *new) -{ - struct rb_node **p = &osdc->requests.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_osd_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &osdc->requests); -} - -static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else - return req; - } - return NULL; -} +DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) static struct ceph_osd_request * __lookup_request_ge(struct ceph_osd_client *osdc, @@ -1101,6 +1063,8 @@ static void put_osd(struct ceph_osd *osd) } } +DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) + /* * remove an osd from our map */ @@ -1111,8 +1075,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) WARN_ON(!list_empty(&osd->o_linger_requests)); list_del_init(&osd->o_osd_lru); - rb_erase(&osd->o_node, &osdc->osds); - RB_CLEAR_NODE(&osd->o_node); + erase_osd(&osdc->osds, osd); } static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) @@ -1188,45 +1151,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) -{ - struct rb_node **p = &osdc->osds.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd *osd = NULL; - - dout("__insert_osd %p osd%d\n", new, new->o_osd); - while (*p) { - parent = *p; - osd = rb_entry(parent, struct ceph_osd, o_node); - if (new->o_osd < osd->o_osd) - p = &(*p)->rb_left; - else if (new->o_osd > osd->o_osd) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->o_node, parent, p); - rb_insert_color(&new->o_node, &osdc->osds); -} - -static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) -{ - struct ceph_osd *osd; - struct rb_node *n = osdc->osds.rb_node; - - while (n) { - osd = rb_entry(n, struct ceph_osd, o_node); - if (o < osd->o_osd) - n = n->rb_left; - else if (o > osd->o_osd) - n = n->rb_right; - else - return osd; - } - return NULL; -} - static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, @@ -1248,7 +1172,7 @@ static void __register_request(struct ceph_osd_client *osdc, req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); dout("__register_request %p tid %lld\n", req, req->r_tid); - __insert_request(osdc, req); + insert_request(&osdc->requests, req); ceph_osdc_get_request(req); osdc->num_requests++; if (osdc->num_requests == 1) { @@ -1270,8 +1194,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, } dout("__unregister_request %p tid %lld\n", req, req->r_tid); - rb_erase(&req->r_node, &osdc->requests); - RB_CLEAR_NODE(&req->r_node); + erase_request(&osdc->requests, req); osdc->num_requests--; if (req->r_osd) { @@ -1482,7 +1405,7 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = __lookup_osd(osdc, o); + req->r_osd = lookup_osd(&osdc->osds, o); if (!req->r_osd && o >= 0) { err = -ENOMEM; req->r_osd = create_osd(osdc, o); @@ -1492,7 +1415,7 @@ static int __map_request(struct ceph_osd_client *osdc, } dout("map_request osd %p is osd%d\n", req->r_osd, o); - __insert_osd(osdc, req->r_osd); + insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, CEPH_ENTITY_TYPE_OSD, o, @@ -1822,7 +1745,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) /* lookup */ down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + req = lookup_request(&osdc->requests, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); goto bad_mutex; @@ -2880,7 +2803,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + req = lookup_request(&osdc->requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); From 985c1673885b77b2e0167c6478a833817d1e2fe5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: [PATCH 11/71] libceph: fix ceph_eversion encoding eversion_t is version+epoch in userspace and is encoded in that order. ceph_eversion is defined as epoch+version in rados.h, yet we memcpy it in __send_request(). Reoder ceph_eversion fields. Signed-off-by: Ilya Dryomov --- include/linux/ceph/rados.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2f822dca10463b..913c87c26d335f 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -114,8 +114,8 @@ struct ceph_object_layout { * compound epoch+version, used by storage layer to serialize mutations */ struct ceph_eversion { - __le32 epoch; __le64 version; + __le32 epoch; } __attribute__ ((packed)); /* From d9591f5e28686277d9312d3c7422faf1368b305e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: [PATCH 12/71] libceph: rename ceph_oloc_oid_to_pg() Rename ceph_oloc_oid_to_pg() to ceph_object_locator_to_pg(). Emphasise that returned is raw PG and return -ENOENT instead of -EIO if the pool doesn't exist. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 9 ++++----- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 31 ++++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index db296709784ae3..cca7fff227253d 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -215,7 +215,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ce7a41a182d425..b70440c05b491b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -213,11 +213,10 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); -/* calculate mapping of object to a placement group */ -extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8256051ed88f10..cb9f1953f5fbab 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1324,8 +1324,8 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, - &req->r_target_oid, pg_out); + return ceph_object_locator_to_pg(osdmap, &req->r_target_oid, + &req->r_target_oloc, pg_out); } static void __enqueue_request(struct ceph_osd_request *req) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9a0cc072a909f5..6267839cb24654 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1545,30 +1545,31 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * Calculate mapping of a (oloc, oid) pair to a PG. Should only be - * called with target's (oloc, oid), since tiering isn't taken into - * account. + * Map an object into a PG. + * + * Should only be called with target_oid and target_oloc (as opposed to + * base_oid and base_oloc), since tiering isn't taken into account. */ -int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out) +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + pi = ceph_pg_pool_by_id(osdmap, oloc->pool); if (!pi) - return -EIO; + return -ENOENT; - pg_out->pool = oloc->pool; - pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); - dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, - pg_out->pool, pg_out->seed); + dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len, + oid->name, raw_pgid->pool, raw_pgid->seed); return 0; } -EXPORT_SYMBOL(ceph_oloc_oid_to_pg); +EXPORT_SYMBOL(ceph_object_locator_to_pg); static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, From 6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: [PATCH 13/71] libceph: ceph_osds, ceph_pg_to_up_acting_osds() Knowning just acting set isn't enough, we need to be able to record up set as well to detect interval changes. This means returning (up[], up_len, up_primary, acting[], acting_len, acting_primary) and passing it around. Introduce and switch to ceph_osds to help with that. Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return both up and acting sets from it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 21 ++- net/ceph/osd_client.c | 36 ++--- net/ceph/osdmap.c | 304 +++++++++++++++++++++--------------- 3 files changed, 215 insertions(+), 146 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index b70440c05b491b..942189d311e09e 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -208,6 +208,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); +struct ceph_osds { + int osds[CEPH_PG_MAX_SIZE]; + int size; + int primary; /* id, NOT index */ +}; + +static inline void ceph_osds_init(struct ceph_osds *set) +{ + set->size = 0; + set->primary = -1; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, @@ -218,9 +232,10 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid); -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, - struct ceph_pg pgid, - int *osds, int *primary); +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cb9f1953f5fbab..0ff400a56cd6f6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1358,8 +1358,7 @@ static int __map_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, int force_resend) { struct ceph_pg pgid; - int acting[CEPH_PG_MAX_SIZE]; - int num, o; + struct ceph_osds up, acting; int err; bool was_paused; @@ -1372,9 +1371,7 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); - if (num < 0) - num = 0; + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); @@ -1382,21 +1379,23 @@ static int __map_request(struct ceph_osd_client *osdc, force_resend = 1; if ((!force_resend && - req->r_osd && req->r_osd->o_osd == o && + req->r_osd && req->r_osd->o_osd == acting.primary && req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == num && - memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1) || + req->r_num_pg_osds == acting.size && + memcmp(req->r_pg_osds, acting.osds, + acting.size * sizeof(acting.osds[0])) == 0) || + (req->r_osd == NULL && acting.primary == -1) || req->r_paused) return 0; /* no change */ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, pgid.pool, pgid.seed, o, + req->r_tid, pgid.pool, pgid.seed, acting.primary, req->r_osd ? req->r_osd->o_osd : -1); /* record full pg acting set */ - memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); - req->r_num_pg_osds = num; + memcpy(req->r_pg_osds, acting.osds, + acting.size * sizeof(acting.osds[0])); + req->r_num_pg_osds = acting.size; if (req->r_osd) { __cancel_request(req); @@ -1405,21 +1404,22 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = lookup_osd(&osdc->osds, o); - if (!req->r_osd && o >= 0) { + req->r_osd = lookup_osd(&osdc->osds, acting.primary); + if (!req->r_osd && acting.primary >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc, o); + req->r_osd = create_osd(osdc, acting.primary); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } - dout("map_request osd %p is osd%d\n", req->r_osd, o); + dout("map_request osd %p is osd%d\n", req->r_osd, + acting.primary); insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, o, - &osdc->osdmap->osd_addr[o]); + CEPH_ENTITY_TYPE_OSD, acting.primary, + &osdc->osdmap->osd_addr[acting.primary]); } __enqueue_request(req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6267839cb24654..f5fc8fc6387986 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1474,6 +1474,38 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +static bool osds_valid(const struct ceph_osds *set) +{ + /* non-empty set */ + if (set->size > 0 && set->primary >= 0) + return true; + + /* empty can_shift_osds set */ + if (!set->size && set->primary == -1) + return true; + + /* empty !can_shift_osds set - all NONE */ + if (set->size > 0 && set->primary == -1) { + int i; + + for (i = 0; i < set->size; i++) { + if (set->osds[i] != CRUSH_ITEM_NONE) + break; + } + if (i == set->size) + return true; + } + + return false; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) +{ + memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); + dest->size = src->size; + dest->primary = src->primary; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent @@ -1571,6 +1603,46 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, } EXPORT_SYMBOL(ceph_object_locator_to_pg); +/* + * Map a raw PG (full precision ps) into an actual PG. + */ +static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_pg *pgid) +{ + pgid->pool = raw_pgid->pool; + pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, + pi->pg_num_mask); +} + +/* + * Map a raw PG (full precision ps) into a placement ps (placement + * seed). Include pool id in that value so that different pools don't + * use the same seeds. + */ +static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid) +{ + if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + return crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(raw_pgid->seed, + pi->pgp_num, + pi->pgp_num_mask), + raw_pgid->pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, + pi->pgp_num_mask) + + (unsigned)raw_pgid->pool; + } +} + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max) @@ -1588,84 +1660,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, } /* - * Calculate raw (crush) set for given pgid. + * Calculate raw set (CRUSH output) for given PG. The result may + * contain nonexistent OSDs. ->primary is undefined for a raw set. * - * Return raw set length, or error. + * Placement seed (CRUSH input) is returned through @ppps. */ -static int pg_to_raw_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - struct ceph_pg pgid, u32 pps, int *osds) +static void pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *raw, + u32 *ppps) { + u32 pps = raw_pg_to_pps(pi, raw_pgid); int ruleno; int len; - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); + ceph_osds_init(raw); + if (ppps) + *ppps = pps; + + ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, + pi->size); if (ruleno < 0) { pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return -ENOENT; + pi->id, pi->crush_ruleset, pi->type, pi->size); + return; } - len = do_crush(osdmap, ruleno, pps, osds, - min_t(int, pool->size, CEPH_PG_MAX_SIZE), + len = do_crush(osdmap, ruleno, pps, raw->osds, + min_t(int, pi->size, ARRAY_SIZE(raw->osds)), osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", - len, ruleno, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return len; + len, ruleno, pi->id, pi->crush_ruleset, pi->type, + pi->size); + return; } - return len; + raw->size = len; } /* - * Given raw set, calculate up set and up primary. + * Given raw set, calculate up set and up primary. By definition of an + * up set, the result won't contain nonexistent or down OSDs. * - * Return up set length. *primary is set to up primary osd id, or -1 - * if up set is empty. + * This is done in-place - on return @set is the up set. If it's + * empty, ->primary will remain undefined. */ -static int raw_to_up_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + struct ceph_osds *set) { - int up_primary = -1; int i; - if (ceph_can_shift_osds(pool)) { + /* ->primary is undefined for a raw set */ + BUG_ON(set->primary != -1); + + if (ceph_can_shift_osds(pi)) { int removed = 0; - for (i = 0; i < len; i++) { - if (ceph_osd_is_down(osdmap, osds[i])) { + /* shift left */ + for (i = 0; i < set->size; i++) { + if (ceph_osd_is_down(osdmap, set->osds[i])) { removed++; continue; } if (removed) - osds[i - removed] = osds[i]; + set->osds[i - removed] = set->osds[i]; } - - len -= removed; - if (len > 0) - up_primary = osds[0]; + set->size -= removed; + if (set->size > 0) + set->primary = set->osds[0]; } else { - for (i = len - 1; i >= 0; i--) { - if (ceph_osd_is_down(osdmap, osds[i])) - osds[i] = CRUSH_ITEM_NONE; + /* set down/dne devices to NONE */ + for (i = set->size - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, set->osds[i])) + set->osds[i] = CRUSH_ITEM_NONE; else - up_primary = osds[i]; + set->primary = set->osds[i]; } } - - *primary = up_primary; - return len; } -static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void apply_primary_affinity(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + u32 pps, + struct ceph_osds *up) { int i; int pos = -1; @@ -1677,8 +1757,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (!osdmap->osd_primary_affinity) return; - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; if (osd != CRUSH_ITEM_NONE && osdmap->osd_primary_affinity[osd] != @@ -1686,7 +1766,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, break; } } - if (i == len) + if (i == up->size) return; /* @@ -1694,8 +1774,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, * osd into the hash/rng so that a proportional fraction of an * osd's pgs get rejected as primary. */ - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; u32 aff; if (osd == CRUSH_ITEM_NONE) @@ -1720,123 +1800,99 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (pos < 0) return; - *primary = osds[pos]; + up->primary = up->osds[pos]; - if (ceph_can_shift_osds(pool) && pos > 0) { + if (ceph_can_shift_osds(pi) && pos > 0) { /* move the new primary to the front */ for (i = pos; i > 0; i--) - osds[i] = osds[i - 1]; - osds[0] = *primary; + up->osds[i] = up->osds[i - 1]; + up->osds[0] = up->primary; } } /* - * Given up set, apply pg_temp and primary_temp mappings. + * Get pg_temp and primary_temp mappings for given PG. * - * Return acting set length. *primary is set to acting primary osd id, - * or -1 if acting set is empty. + * Note that a PG may have none, only pg_temp, only primary_temp or + * both pg_temp and primary_temp mappings. This means @temp isn't + * always a valid OSD set on return: in the "only primary_temp" case, + * @temp will have its ->primary >= 0 but ->size == 0. */ -static int apply_temps(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, struct ceph_pg pgid, - int *osds, int len, int *primary) +static void get_temp_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *temp) { + struct ceph_pg pgid; struct ceph_pg_mapping *pg; - int temp_len; - int temp_primary; int i; - /* raw_pg -> pg */ - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, - pool->pg_num_mask); + raw_pg_to_pg(pi, raw_pgid, &pgid); + ceph_osds_init(temp); /* pg_temp? */ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - temp_len = 0; - temp_primary = -1; - for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { - if (ceph_can_shift_osds(pool)) + if (ceph_can_shift_osds(pi)) continue; - else - osds[temp_len++] = CRUSH_ITEM_NONE; + + temp->osds[temp->size++] = CRUSH_ITEM_NONE; } else { - osds[temp_len++] = pg->pg_temp.osds[i]; + temp->osds[temp->size++] = pg->pg_temp.osds[i]; } } /* apply pg_temp's primary */ - for (i = 0; i < temp_len; i++) { - if (osds[i] != CRUSH_ITEM_NONE) { - temp_primary = osds[i]; + for (i = 0; i < temp->size; i++) { + if (temp->osds[i] != CRUSH_ITEM_NONE) { + temp->primary = temp->osds[i]; break; } } - } else { - temp_len = len; - temp_primary = *primary; } /* primary_temp? */ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) - temp_primary = pg->primary_temp.osd; - - *primary = temp_primary; - return temp_len; + temp->primary = pg->primary_temp.osd; } /* - * Calculate acting set for given pgid. + * Map a PG to its acting set as well as its up set. * - * Return acting set length, or error. *primary is set to acting - * primary osd id, or -1 if acting set is empty or on error. + * Acting set is used for data mapping purposes, while up set can be + * recorded for detecting interval changes and deciding whether to + * resend a request. */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *primary) +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting) { - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pi; u32 pps; - int len; - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) { - *primary = -1; - return -ENOENT; + pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); + if (!pi) { + ceph_osds_init(up); + ceph_osds_init(acting); + goto out; } - if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed so that pool PGs do not overlap */ - pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, - ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask), - pgid.pool); - } else { - /* - * legacy behavior: add ps and pool together. this is - * not a great approach because the PGs from each pool - * will overlap on top of each other: 0.5 == 1.4 == - * 2.3 == ... - */ - pps = ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask) + - (unsigned)pgid.pool; - } - - len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) { - *primary = -1; - return len; + pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); + raw_to_up_osds(osdmap, pi, up); + apply_primary_affinity(osdmap, pi, pps, up); + get_temp_osds(osdmap, pi, raw_pgid, acting); + if (!acting->size) { + memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); + acting->size = up->size; + if (acting->primary == -1) + acting->primary = up->primary; } - - len = raw_to_up_osds(osdmap, pool, osds, len, primary); - - apply_primary_affinity(osdmap, pps, pool, osds, len, primary); - - len = apply_temps(osdmap, pool, pgid, osds, len, primary); - - return len; +out: + WARN_ON(!osds_valid(up) || !osds_valid(acting)); } /* @@ -1844,11 +1900,9 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int osds[CEPH_PG_MAX_SIZE]; - int primary; - - ceph_calc_pg_acting(osdmap, pgid, osds, &primary); + struct ceph_osds up, acting; - return primary; + ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + return acting.primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); From f81f16339a05775df600b2ff75a79be1864975c1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: [PATCH 14/71] libceph: rename ceph_calc_pg_primary() Rename ceph_calc_pg_primary() to ceph_pg_to_acting_primary() to emphasise that it returns acting primary. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 4 ++-- net/ceph/osdmap.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index cca7fff227253d..1831ad6cf06670 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -221,7 +221,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 942189d311e09e..3fd978a1639b16 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -236,8 +236,8 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f5fc8fc6387986..656384a8fd1e2d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1896,13 +1896,14 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, } /* - * Return primary osd for given pgid, or -1 if none. + * Return acting primary for given PG, or -1 if none. */ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid) { struct ceph_osds up, acting; - ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); return acting.primary; } -EXPORT_SYMBOL(ceph_calc_pg_primary); +EXPORT_SYMBOL(ceph_pg_to_acting_primary); From f984cb76cc5fb9fc76d6abb6c4694a5412e3f49b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: [PATCH 15/71] libceph: make pgid_cmp() global calc_target() code is going to need to know how to compare PGs. Take lhs and rhs pgid by const * while at it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 2 ++ net/ceph/osdmap.c | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 3fd978a1639b16..7783237ab06c1d 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -24,6 +24,8 @@ struct ceph_pg { uint32_t seed; }; +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); + #define CEPH_POOL_FLAG_HASHPSPOOL 1 struct ceph_pg_pool_info { diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 656384a8fd1e2d..3c7dc5e581ab81 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -380,23 +380,24 @@ static struct crush_map *crush_decode(void *pbyval, void *end) return ERR_PTR(err); } -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) and primary_temp (explicit primary setting) - */ -static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) { - if (l.pool < r.pool) + if (lhs->pool < rhs->pool) return -1; - if (l.pool > r.pool) + if (lhs->pool > rhs->pool) return 1; - if (l.seed < r.seed) + if (lhs->seed < rhs->seed) return -1; - if (l.seed > r.seed) + if (lhs->seed > rhs->seed) return 1; + return 0; } +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ static int __insert_pg_mapping(struct ceph_pg_mapping *new, struct rb_root *root) { @@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, while (*p) { parent = *p; pg = rb_entry(parent, struct ceph_pg_mapping, node); - c = pgid_cmp(new->pgid, pg->pgid); + c = ceph_pg_compare(&new->pgid, &pg->pgid); if (c < 0) p = &(*p)->rb_left; else if (c > 0) @@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, while (n) { pg = rb_entry(n, struct ceph_pg_mapping, node); - c = pgid_cmp(pgid, pg->pgid); + c = ceph_pg_compare(&pgid, &pg->pgid); if (c < 0) { n = n->rb_left; } else if (c > 0) { From 04812acf572ef41fd51c11e0bf3385f34c0e1b5b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: [PATCH 16/71] libceph: pi->min_size, pi->last_force_request_resend Add and decode pi->min_size and pi->last_force_request_resend. These are going to be used by calc_target(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 9 ++++--- net/ceph/debugfs.c | 10 ++++---- net/ceph/osdmap.c | 48 ++++++++++++++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7783237ab06c1d..989294d0b8d226 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -26,20 +26,23 @@ struct ceph_pg { int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); -#define CEPH_POOL_FLAG_HASHPSPOOL 1 +#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id + together */ struct ceph_pg_pool_info { struct rb_node node; s64 id; - u8 type; + u8 type; /* CEPH_POOL_TYPE_* */ u8 size; + u8 min_size; u8 crush_ruleset; u8 object_hash; + u32 last_force_request_resend; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; s64 read_tier; s64 write_tier; /* wins for read+write ops */ - u64 flags; + u64 flags; /* CEPH_POOL_FLAG_* */ char *name; }; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6f8413293d151c..7f1cc22c3e8ba2 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -66,12 +66,14 @@ static int osdmap_show(struct seq_file *s, void *p) (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = + struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", - pool->id, pool->pg_num, pool->pg_num_mask, - pool->read_tier, pool->write_tier); + seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n", + pi->id, pi->name, pi->type, pi->size, pi->min_size, + pi->pg_num, pi->pg_num_mask, pi->flags, + pi->last_force_request_resend, pi->read_tier, + pi->write_tier); } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3c7dc5e581ab81..66c3ebead92fa9 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -597,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += 4; /* skip crash_replay_interval */ if (ev >= 7) - *p += 1; /* skip min_size */ + pi->min_size = ceph_decode_8(p); + else + pi->min_size = pi->size - pi->size / 2; if (ev >= 8) *p += 8 + 8; /* skip quota_max_* */ @@ -617,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pi->write_tier = -1; } + if (ev >= 10) { + /* skip properties */ + num = ceph_decode_32(p); + while (num--) { + len = ceph_decode_32(p); + *p += len; /* key */ + len = ceph_decode_32(p); + *p += len; /* val */ + } + } + + if (ev >= 11) { + /* skip hit_set_params */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + + *p += 4; /* skip hit_set_period */ + *p += 4; /* skip hit_set_count */ + } + + if (ev >= 12) + *p += 4; /* skip stripe_width */ + + if (ev >= 13) { + *p += 8; /* skip target_max_bytes */ + *p += 8; /* skip target_max_objects */ + *p += 4; /* skip cache_target_dirty_ratio_micro */ + *p += 4; /* skip cache_target_full_ratio_micro */ + *p += 4; /* skip cache_min_flush_age */ + *p += 4; /* skip cache_min_evict_age */ + } + + if (ev >= 14) { + /* skip erasure_code_profile */ + len = ceph_decode_32(p); + *p += len; + } + + if (ev >= 15) + pi->last_force_request_resend = ceph_decode_32(p); + else + pi->last_force_request_resend = 0; + /* ignore the rest */ *p = pool_end; From 63244fa123a755e4bbaee03022b68613c71d1332 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: [PATCH 17/71] libceph: introduce ceph_osd_request_target, calc_target() Introduce ceph_osd_request_target, containing all mapping-related fields of ceph_osd_request and calc_target() for calculating mappings and populating it. Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/file.c | 2 +- include/linux/ceph/osd_client.h | 23 +++++ include/linux/ceph/osdmap.h | 34 +++++++ include/linux/ceph/rados.h | 5 + net/ceph/osd_client.c | 157 +++++++++++++++++++++++++++++++- net/ceph/osdmap.c | 121 ++++++++++++++++++++++++ 7 files changed, 340 insertions(+), 4 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f28dd9bacb219..c5d75486823b3d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9d470397e249e8..36b4a41dfa67a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63854a8df18301..48806ee4488d01 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -118,6 +120,27 @@ struct ceph_osd_req_op { }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 989294d0b8d226..420bb7968b2587 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; @@ -62,6 +63,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 913c87c26d335f..f28ed864e68294 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0ff400a56cd6f6..cff3a7e29233db 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -298,6 +298,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, } } +/* + * Assumes @t is zero-initialized. + */ +static void target_init(struct ceph_osd_request_target *t) +{ + ceph_oid_init(&t->base_oid); + ceph_oloc_init(&t->base_oloc); + ceph_oid_init(&t->target_oid); + ceph_oloc_init(&t->target_oloc); + + ceph_osds_init(&t->acting); + ceph_osds_init(&t->up); + t->size = -1; + t->min_size = -1; + + t->osd = CEPH_HOMELESS_OSD; +} + +static void target_destroy(struct ceph_osd_request_target *t) +{ + ceph_oid_destroy(&t->base_oid); + ceph_oid_destroy(&t->target_oid); +} + /* * requests */ @@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_set_request_linger); +static bool __pool_full(struct ceph_pg_pool_info *pi) +{ + return pi->flags & CEPH_POOL_FLAG_FULL; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. @@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +static bool target_should_be_paused(struct ceph_osd_client *osdc, + const struct ceph_osd_request_target *t, + struct ceph_pg_pool_info *pi) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + __pool_full(pi); + + WARN_ON(pi->id != t->base_oloc.pool); + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + /* * Calculate mapping of a request to a PG. Takes tiering into account. */ @@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, &req->r_target_oloc, pg_out); } +enum calc_target_result { + CALC_TARGET_NO_ACTION = 0, + CALC_TARGET_NEED_RESEND, + CALC_TARGET_POOL_DNE, +}; + +static enum calc_target_result calc_target(struct ceph_osd_client *osdc, + struct ceph_osd_request_target *t, + u32 *last_force_resend, + bool any_change) +{ + struct ceph_pg_pool_info *pi; + struct ceph_pg pgid, last_pgid; + struct ceph_osds up, acting; + bool force_resend = false; + bool need_check_tiering = false; + bool need_resend = false; + bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_SORTBITWISE); + enum calc_target_result ct_res; + int ret; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + + if (osdc->osdmap->epoch == pi->last_force_request_resend) { + if (last_force_resend && + *last_force_resend < pi->last_force_request_resend) { + *last_force_resend = pi->last_force_request_resend; + force_resend = true; + } else if (!last_force_resend) { + force_resend = true; + } + } + if (ceph_oid_empty(&t->target_oid) || force_resend) { + ceph_oid_copy(&t->target_oid, &t->base_oid); + need_check_tiering = true; + } + if (ceph_oloc_empty(&t->target_oloc) || force_resend) { + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + need_check_tiering = true; + } + + if (need_check_tiering && + (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + t->target_oloc.pool = pi->read_tier; + if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + t->target_oloc.pool = pi->write_tier; + } + + ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, + &t->target_oloc, &pgid); + if (ret) { + WARN_ON(ret != -ENOENT); + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + last_pgid.pool = pgid.pool; + last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); + + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + if (any_change && + ceph_is_new_interval(&t->acting, + &acting, + &t->up, + &up, + t->size, + pi->size, + t->min_size, + pi->min_size, + t->pg_num, + pi->pg_num, + t->sort_bitwise, + sort_bitwise, + &last_pgid)) + force_resend = true; + + if (t->paused && !target_should_be_paused(osdc, t, pi)) { + t->paused = false; + need_resend = true; + } + + if (ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change) || + force_resend) { + t->pgid = pgid; /* struct */ + ceph_osds_copy(&t->acting, &acting); + ceph_osds_copy(&t->up, &up); + t->size = pi->size; + t->min_size = pi->min_size; + t->pg_num = pi->pg_num; + t->pg_num_mask = pi->pg_num_mask; + t->sort_bitwise = sort_bitwise; + + t->osd = acting.primary; + need_resend = true; + } + + ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; +out: + dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + return ct_res; +} + static void __enqueue_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) redir.oloc.pool = -1; } - if (redir.oloc.pool != -1) { + if (!ceph_oloc_empty(&redir.oloc)) { dout("redirect pool %lld\n", redir.oloc.pool); __unregister_request(osdc, req); - req->r_target_oloc = redir.oloc; /* struct */ + ceph_oloc_copy(&req->r_target_oloc, &redir.oloc); /* * Start redirect requests with nofail=true. If diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 66c3ebead92fa9..7d4a5b43085e1c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +/* + * osds only + */ +static bool __osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (lhs->size == rhs->size && + !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) + return true; + + return false; +} + +/* + * osds + primary + */ +static bool osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (__osds_equal(lhs, rhs) && + lhs->primary == rhs->primary) + return true; + + return false; +} + static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ @@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } +static bool is_split(const struct ceph_pg *pgid, + u32 old_pg_num, + u32 new_pg_num) +{ + int old_bits = calc_bits_of(old_pg_num); + int old_mask = (1 << old_bits) - 1; + int n; + + WARN_ON(pgid->seed >= old_pg_num); + if (new_pg_num <= old_pg_num) + return false; + + for (n = 1; ; n++) { + int next_bit = n << (old_bits - 1); + u32 s = next_bit | pgid->seed; + + if (s < old_pg_num || s == pgid->seed) + continue; + if (s >= new_pg_num) + break; + + s = ceph_stable_mod(s, old_pg_num, old_mask); + if (s == pgid->seed) + return true; + } + + return false; +} + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid) +{ + return !osds_equal(old_acting, new_acting) || + !osds_equal(old_up, new_up) || + old_size != new_size || + old_min_size != new_min_size || + is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise; +} + +static int calc_pg_rank(int osd, const struct ceph_osds *acting) +{ + int i; + + for (i = 0; i < acting->size; i++) { + if (acting->osds[i] == osd) + return i; + } + + return -1; +} + +static bool primary_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting) +{ + if (!old_acting->size && !new_acting->size) + return false; /* both still empty */ + + if (!old_acting->size ^ !new_acting->size) + return true; /* was empty, now not, or vice versa */ + + if (old_acting->primary != new_acting->primary) + return true; /* primary changed */ + + if (calc_pg_rank(old_acting->primary, old_acting) != + calc_pg_rank(new_acting->primary, new_acting)) + return true; + + return false; /* same primary (tho replicas may have changed) */ +} + +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change) +{ + if (primary_changed(old_acting, new_acting)) + return true; + + if (any_change && !__osds_equal(old_acting, new_acting)) + return true; + + return false; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent From a66dd38309f5d9c66ec9bc7911ff8da8cc37bb9f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: [PATCH 18/71] libceph: switch to calc_target(), part 1 Replace __calc_request_pg() and most of __map_request() with calc_target() and start using req->r_t. ceph_osdc_build_request() however still encodes base_oid, because it's called before calc_target() is and target_oid is empty at that point in time; a printf in osdc_show() also shows base_oid. This is fixed in "libceph: switch to calc_target(), part 2". Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 15 ++-- net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 119 ++++++-------------------------- 3 files changed, 29 insertions(+), 107 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 48806ee4488d01..03bf9d9e1517bc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -150,12 +150,13 @@ struct ceph_osd_request { struct list_head r_linger_item; struct list_head r_linger_osd_item; struct ceph_osd *r_osd; - struct ceph_pg r_pgid; - int r_pg_osds[CEPH_PG_MAX_SIZE]; - int r_num_pg_osds; + + struct ceph_osd_request_target r_t; +#define r_base_oid r_t.base_oid +#define r_base_oloc r_t.base_oloc +#define r_flags r_t.flags struct ceph_msg *r_request, *r_reply; - int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ /* request osd ops array */ @@ -167,7 +168,6 @@ struct ceph_osd_request { __le64 *r_request_pool; void *r_request_pgid; __le32 *r_request_attempts; - bool r_paused; struct ceph_eversion *r_request_reassert_version; int r_result; @@ -186,11 +186,6 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_base_oloc; - struct ceph_object_id r_base_oid; - struct ceph_object_locator r_target_oloc; - struct ceph_object_id r_target_oid; - u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 7f1cc22c3e8ba2..0c11ab5f8c30b2 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp) seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, req->r_osd ? req->r_osd->o_osd : -1, - req->r_pgid.pool, req->r_pgid.seed); + req->r_t.pgid.pool, req->r_t.pgid.seed); seq_printf(s, "%*pE", req->r_base_oid.name_len, req->r_base_oid.name); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cff3a7e29233db..013101598c4109 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -350,8 +350,7 @@ static void ceph_osdc_release_request(struct kref *kref) for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); - ceph_oid_destroy(&req->r_base_oid); - ceph_oid_destroy(&req->r_target_oid); + target_destroy(&req->r_t); ceph_put_snap_context(req->r_snapc); if (req->r_mempool) @@ -420,10 +419,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - ceph_oid_init(&req->r_base_oid); - req->r_base_oloc.pool = -1; - ceph_oid_init(&req->r_target_oid); - req->r_target_oloc.pool = -1; + target_init(&req->r_t); dout("%s req %p\n", __func__, req); return req; @@ -1308,16 +1304,6 @@ static bool __pool_full(struct ceph_pg_pool_info *pi) * * Caller should hold map_sem for read. */ -static bool __req_should_be_paused(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); - return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || - (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); -} - static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, struct ceph_pg_pool_info *pi) @@ -1332,45 +1318,6 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); } -/* - * Calculate mapping of a request to a PG. Takes tiering into account. - */ -static int __calc_request_pg(struct ceph_osdmap *osdmap, - struct ceph_osd_request *req, - struct ceph_pg *pg_out) -{ - bool need_check_tiering; - - need_check_tiering = false; - if (req->r_target_oloc.pool == -1) { - req->r_target_oloc = req->r_base_oloc; /* struct */ - need_check_tiering = true; - } - if (ceph_oid_empty(&req->r_target_oid)) { - ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); - need_check_tiering = true; - } - - if (need_check_tiering && - (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - struct ceph_pg_pool_info *pi; - - pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); - if (pi) { - if ((req->r_flags & CEPH_OSD_FLAG_READ) && - pi->read_tier >= 0) - req->r_target_oloc.pool = pi->read_tier; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - pi->write_tier >= 0) - req->r_target_oloc.pool = pi->write_tier; - } - /* !pi is caught in ceph_oloc_oid_to_pg() */ - } - - return ceph_object_locator_to_pg(osdmap, &req->r_target_oid, - &req->r_target_oloc, pg_out); -} - enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, @@ -1510,46 +1457,26 @@ static void __enqueue_request(struct ceph_osd_request *req) static int __map_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, int force_resend) { - struct ceph_pg pgid; - struct ceph_osds up, acting; + enum calc_target_result ct_res; int err; - bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = __calc_request_pg(osdc->osdmap, req, &pgid); - if (err) { + ct_res = calc_target(osdc, &req->r_t, NULL, force_resend); + switch (ct_res) { + case CALC_TARGET_POOL_DNE: list_move(&req->r_req_lru_item, &osdc->req_notarget); - return err; - } - req->r_pgid = pgid; - - ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); - - was_paused = req->r_paused; - req->r_paused = __req_should_be_paused(osdc, req); - if (was_paused && !req->r_paused) - force_resend = 1; - - if ((!force_resend && - req->r_osd && req->r_osd->o_osd == acting.primary && - req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == acting.size && - memcmp(req->r_pg_osds, acting.osds, - acting.size * sizeof(acting.osds[0])) == 0) || - (req->r_osd == NULL && acting.primary == -1) || - req->r_paused) + return -EIO; + case CALC_TARGET_NO_ACTION: return 0; /* no change */ + default: + BUG_ON(ct_res != CALC_TARGET_NEED_RESEND); + } dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, pgid.pool, pgid.seed, acting.primary, + req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd, req->r_osd ? req->r_osd->o_osd : -1); - /* record full pg acting set */ - memcpy(req->r_pg_osds, acting.osds, - acting.size * sizeof(acting.osds[0])); - req->r_num_pg_osds = acting.size; - if (req->r_osd) { __cancel_request(req); list_del_init(&req->r_osd_item); @@ -1557,22 +1484,22 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = lookup_osd(&osdc->osds, acting.primary); - if (!req->r_osd && acting.primary >= 0) { + req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd); + if (!req->r_osd && req->r_t.osd >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc, acting.primary); + req->r_osd = create_osd(osdc, req->r_t.osd); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } dout("map_request osd %p is osd%d\n", req->r_osd, - acting.primary); + req->r_osd->o_osd); insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, acting.primary, - &osdc->osdmap->osd_addr[acting.primary]); + CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd, + &osdc->osdmap->osd_addr[req->r_osd->o_osd]); } __enqueue_request(req); @@ -1592,15 +1519,15 @@ static void __send_request(struct ceph_osd_client *osdc, dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", req, req->r_tid, req->r_osd->o_osd, req->r_flags, - (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); + req->r_t.pgid.pool, req->r_t.pgid.seed); /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); + put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool); p = req->r_request_pgid; - ceph_encode_64(&p, req->r_pgid.pool); - ceph_encode_32(&p, req->r_pgid.seed); + ceph_encode_64(&p, req->r_t.pgid.pool); + ceph_encode_32(&p, req->r_t.pgid.seed); put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ memcpy(req->r_request_reassert_version, &req->r_reassert_version, sizeof(req->r_reassert_version)); @@ -1963,7 +1890,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) __unregister_request(osdc, req); - ceph_oloc_copy(&req->r_target_oloc, &redir.oloc); + ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc); /* * Start redirect requests with nofail=true. If From bb873b539154ab51893430b4ad6ba4051775276a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 00:29:52 +0200 Subject: [PATCH 19/71] libceph: switch to calc_target(), part 2 The crux of this is getting rid of ceph_osdc_build_request(), so that MOSDOp can be encoded not before but after calc_target() calculates the actual target. Encoding now happens within ceph_osdc_start_request(). Also nuked is the accompanying bunch of pointers into the encoded buffer that was used to update fields on each send - instead, the entire front is re-encoded. If we want to support target->name_len != base->name_len in the future, there is no other way, because oid is surrounded by other fields in the encoded buffer. Encoding OSD ops and adding data items to the request message were mixed together in osd_req_encode_op(). While we want to re-encode OSD ops, we don't want to add duplicate data items to the message when resending, so all call to ceph_osdc_msg_data_add() are factored out into a new setup_request_data(). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 18 +- fs/ceph/addr.c | 16 +- fs/ceph/file.c | 16 +- include/linux/ceph/osd_client.h | 29 +-- include/linux/ceph/rados.h | 7 + net/ceph/debugfs.c | 61 +++--- net/ceph/osd_client.c | 355 ++++++++++++++++---------------- 7 files changed, 247 insertions(+), 255 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f3ea927f93deb3..0e598916e04820 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1896,27 +1896,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - u64 snap_id; - - rbd_assert(osd_req != NULL); - snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; - ceph_osdc_build_request(osd_req, obj_request->offset, - NULL, snap_id, NULL); + if (img_request) + osd_req->r_snapid = img_request->snap_id; } static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - struct ceph_snap_context *snapc; - struct timespec mtime = CURRENT_TIME; - - rbd_assert(osd_req != NULL); - snapc = img_request ? img_request->snapc : NULL; - ceph_osdc_build_request(osd_req, obj_request->offset, - snapc, CEPH_NOSNAP, &mtime); + osd_req->r_mtime = CURRENT_TIME; + osd_req->r_data_offset = obj_request->offset; } /* diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c5d75486823b3d..59b3c3fbd3bd79 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -1063,10 +1061,7 @@ static int ceph_writepages_start(struct address_space *mapping, pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1614,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1657,7 +1652,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1790,12 +1785,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 36b4a41dfa67a1..52e4b72dd5debe 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -727,8 +727,8 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -882,14 +882,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -1074,9 +1072,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1532,9 +1528,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03bf9d9e1517bc..67a37d98e0ca7f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -104,7 +104,7 @@ struct ceph_osd_req_op { struct ceph_osd_data response_data; __u8 class_len; __u8 method_len; - __u8 argc; + u32 indata_len; } cls; struct { u64 cookie; @@ -162,14 +162,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - /* these are updated on each send */ - __le32 *r_request_osdmap_epoch; - __le32 *r_request_flags; - __le64 *r_request_pool; - void *r_request_pgid; - __le32 *r_request_attempts; - struct ceph_eversion *r_request_reassert_version; - int r_result; int r_got_reply; int r_linger; @@ -180,16 +172,22 @@ struct ceph_osd_request { struct completion r_completion, r_safe_completion; ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; - struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - u64 r_snapid; - unsigned long r_stamp; /* send OR check time */ + /* set by submitter */ + u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ + struct ceph_snap_context *r_snapc; /* for writes */ + struct timespec r_mtime; /* ditto */ + u64 r_data_offset; /* ditto */ - struct ceph_snap_context *r_snapc; /* snap context for writes */ + /* internal */ + unsigned long r_stamp; /* jiffies, send or check time */ + int r_attempts; + struct ceph_eversion r_replay_version; /* aka reassert_version */ + u32 r_last_force_resend; struct ceph_osd_req_op r_ops[]; }; @@ -334,11 +332,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, - u64 snap_id, - struct timespec *mtime); - extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f28ed864e68294..28740a58f32cd1 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -394,6 +394,13 @@ enum { CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ + CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ }; enum { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 0c11ab5f8c30b2..6d3ff713edebbf 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -145,6 +145,43 @@ static int monc_show(struct seq_file *s, void *p) return 0; } +static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) +{ + int i; + + seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); + for (i = 0; i < t->up.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); + seq_printf(s, "]/%d\t[", t->up.primary); + for (i = 0; i < t->acting.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); + seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, + t->target_oid.name_len, t->target_oid.name, t->flags); + if (t->paused) + seq_puts(s, "\tP"); +} + +static void dump_request(struct seq_file *s, struct ceph_osd_request *req) +{ + int i; + + seq_printf(s, "%llu\t", req->r_tid); + dump_target(s, &req->r_t); + + seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, + le32_to_cpu(req->r_replay_version.epoch), + le64_to_cpu(req->r_replay_version.version)); + + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), + ceph_osd_op_name(op->op)); + } + + seq_putc(s, '\n'); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; @@ -154,32 +191,10 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; - unsigned int i; - int opcode; req = rb_entry(p, struct ceph_osd_request, r_node); - seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - req->r_t.pgid.pool, req->r_t.pgid.seed); - - seq_printf(s, "%*pE", req->r_base_oid.name_len, - req->r_base_oid.name); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < req->r_num_ops; i++) { - opcode = req->r_ops[i].op; - seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), - ceph_osd_op_name(opcode)); - } - - seq_printf(s, "\n"); + dump_request(s, req); } mutex_unlock(&osdc->request_mutex); return 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 013101598c4109..8a008f083283d6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -34,8 +34,6 @@ static void __unregister_request(struct ceph_osd_client *osdc, static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); static void __enqueue_request(struct ceph_osd_request *req); -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); /* * Implement client access to distributed object storage cluster. @@ -209,6 +207,8 @@ void osd_req_op_cls_request_data_pagelist( osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pagelist_init(osd_data, pagelist); + osd_req->r_ops[which].cls.indata_len += pagelist->length; + osd_req->r_ops[which].indata_len += pagelist->length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); @@ -221,6 +221,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); + osd_req->r_ops[which].cls.indata_len += length; + osd_req->r_ops[which].indata_len += length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); @@ -610,8 +612,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->cls.argc = 0; /* currently unused */ - op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -709,16 +709,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, } } -static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, unsigned int which) +static u32 osd_req_encode_op(struct ceph_osd_op *dst, + const struct ceph_osd_req_op *src) { - struct ceph_osd_req_op *src; - struct ceph_osd_data *osd_data; - u64 request_data_len = 0; - u64 data_length; - - BUG_ON(which >= req->r_num_ops); - src = &req->r_ops[which]; if (WARN_ON(!osd_req_opcode_valid(src->op))) { pr_err("unrecognized osd opcode %d\n", src->op); @@ -727,49 +720,23 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, switch (src->op) { case CEPH_OSD_OP_STAT: - osd_data = &src->raw_data_in; - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - ceph_osdc_msg_data_add(req->r_request, osd_data); - else - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - osd_data = &src->cls.request_info; - ceph_osdc_msg_data_add(req->r_request, osd_data); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = osd_data->pagelist->length; - - osd_data = &src->cls.request_data; - data_length = ceph_osd_data_length(osd_data); - if (data_length) { - BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); - dst->cls.indata_len = cpu_to_le32(data_length); - ceph_osdc_msg_data_add(req->r_request, osd_data); - src->indata_len += data_length; - request_data_len += data_length; - } - osd_data = &src->cls.response_data; - ceph_osdc_msg_data_add(req->r_reply, osd_data); + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; case CEPH_OSD_OP_STARTSYNC: break; @@ -791,9 +758,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); dst->xattr.cmp_op = src->xattr.cmp_op; dst->xattr.cmp_mode = src->xattr.cmp_mode; - osd_data = &src->xattr.osd_data; - ceph_osdc_msg_data_add(req->r_request, osd_data); - request_data_len = osd_data->pagelist->length; break; case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: @@ -810,7 +774,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->indata_len); - return request_data_len; + return src->indata_len; } /* @@ -852,8 +816,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, goto fail; } - req->r_flags = flags; - /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); if (r) @@ -877,9 +839,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_flags = flags; req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); + req->r_snapid = vino.snap; + if (flags & CEPH_OSD_FLAG_WRITE) + req->r_data_offset = off; + r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) goto fail; @@ -1509,37 +1476,173 @@ static int __map_request(struct ceph_osd_client *osdc, return err; } -/* - * caller should hold map_sem (for read) and request_mutex - */ -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void setup_request_data(struct ceph_osd_request *req, + struct ceph_msg *msg) { - void *p; + u32 data_len = 0; + int i; + + if (!list_empty(&msg->data)) + return; - dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags, - req->r_t.pgid.pool, req->r_t.pgid.seed); + WARN_ON(msg->data_length); + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + switch (op->op) { + /* request */ + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + WARN_ON(op->indata_len != op->extent.length); + ceph_osdc_msg_data_add(msg, &op->extent.osd_data); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + WARN_ON(op->indata_len != op->xattr.name_len + + op->xattr.value_len); + ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); + break; + + /* reply */ + case CEPH_OSD_OP_STAT: + ceph_osdc_msg_data_add(req->r_reply, + &op->raw_data_in); + break; + case CEPH_OSD_OP_READ: + ceph_osdc_msg_data_add(req->r_reply, + &op->extent.osd_data); + break; + + /* both */ + case CEPH_OSD_OP_CALL: + WARN_ON(op->indata_len != op->cls.class_len + + op->cls.method_len + + op->cls.indata_len); + ceph_osdc_msg_data_add(msg, &op->cls.request_info); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(msg, &op->cls.request_data); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(req->r_reply, + &op->cls.response_data); + break; + } + + data_len += op->indata_len; + } - /* fill in message content that changes each time we send it */ - put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); - put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool); - p = req->r_request_pgid; + WARN_ON(data_len != msg->data_length); +} + +static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + u32 data_len = 0; + int i; + + if (req->r_flags & CEPH_OSD_FLAG_WRITE) { + /* snapshots aren't writeable */ + WARN_ON(req->r_snapid != CEPH_NOSNAP); + } else { + WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec || + req->r_data_offset || req->r_snapc); + } + + setup_request_data(req, msg); + + ceph_encode_32(&p, 1); /* client_inc, always 1 */ + ceph_encode_32(&p, req->r_osdc->osdmap->epoch); + ceph_encode_32(&p, req->r_flags); + ceph_encode_timespec(p, &req->r_mtime); + p += sizeof(struct ceph_timespec); + /* aka reassert_version */ + memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); + p += sizeof(req->r_replay_version); + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + ceph_encode_64(&p, req->r_t.target_oloc.pool); + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + /* pgid */ + ceph_encode_8(&p, 1); ceph_encode_64(&p, req->r_t.pgid.pool); ceph_encode_32(&p, req->r_t.pgid.seed); - put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ - memcpy(req->r_request_reassert_version, &req->r_reassert_version, - sizeof(req->r_reassert_version)); + ceph_encode_32(&p, -1); /* preferred */ - req->r_stamp = jiffies; - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + /* oid */ + ceph_encode_32(&p, req->r_t.target_oid.name_len); + memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); + p += req->r_t.target_oid.name_len; - ceph_msg_get(req->r_request); /* send consumes a ref */ + /* ops, can imply data */ + ceph_encode_16(&p, req->r_num_ops); + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(p, &req->r_ops[i]); + p += sizeof(struct ceph_osd_op); + } - req->r_sent = req->r_osd->o_incarnation; + ceph_encode_64(&p, req->r_snapid); /* snapid */ + if (req->r_snapc) { + ceph_encode_64(&p, req->r_snapc->seq); + ceph_encode_32(&p, req->r_snapc->num_snaps); + for (i = 0; i < req->r_snapc->num_snaps; i++) + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } else { + ceph_encode_64(&p, 0); /* snap_seq */ + ceph_encode_32(&p, 0); /* snaps len */ + } + + ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + msg->hdr.data_len = cpu_to_le32(data_len); + /* + * The header "data_off" is a hint to the receiver allowing it + * to align received data into its buffers such that there's no + * need to re-copy it before writing it to disk (direct I/O). + */ + msg->hdr.data_off = cpu_to_le16(req->r_data_offset); - ceph_con_send(&req->r_osd->o_con, req->r_request); + dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__, + req, req->r_t.target_oid.name_len, req->r_t.target_oid.name, + req->r_t.target_oid.name_len, msg->front.iov_len, data_len); +} + +/* + * @req has to be assigned a tid and registered. + */ +static void send_request(struct ceph_osd_request *req) +{ + struct ceph_osd *osd = req->r_osd; + + WARN_ON(osd->o_osd != req->r_t.osd); + + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; + if (req->r_attempts) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + else + WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); + + encode_request(req, req->r_request); + + dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", + __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, + req->r_t.osd, req->r_flags, req->r_attempts); + + req->r_t.paused = false; + req->r_stamp = jiffies; + req->r_attempts++; + + req->r_sent = osd->o_incarnation; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } /* @@ -1550,8 +1653,10 @@ static void __send_queued(struct ceph_osd_client *osdc) struct ceph_osd_request *req, *tmp; dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) - __send_request(osdc, req); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + send_request(req); + } } /* @@ -1915,8 +2020,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_result = bytes; /* in case this is a write and we need to replay, */ - req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); - req->r_reassert_version.version = cpu_to_le64(reassert_version); + req->r_replay_version.epoch = cpu_to_le32(reassert_epoch); + req->r_replay_version.version = cpu_to_le64(reassert_version); req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { @@ -2432,105 +2537,6 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, pr_err("osdc handle_watch_notify corrupt msg\n"); } -/* - * build new request AND message - * - */ -void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) -{ - struct ceph_msg *msg = req->r_request; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - unsigned int i; - - req->r_snapid = snap_id; - WARN_ON(snapc != req->r_snapc); - - /* encode request */ - msg->hdr.version = cpu_to_le16(4); - - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ - - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ - - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ - - /* oid */ - ceph_encode_32(&p, req->r_base_oid.name_len); - memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid %*pE len %d\n", req->r_base_oid.name_len, - req->r_base_oid.name, req->r_base_oid.name_len); - p += req->r_base_oid.name_len; - - /* ops--can imply data */ - ceph_encode_16(&p, (u16)req->r_num_ops); - data_len = 0; - for (i = 0; i < req->r_num_ops; i++) { - data_len += osd_req_encode_op(req, p, i); - p += sizeof(struct ceph_osd_op); - } - - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < req->r_snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } - } - - req->r_request_attempts = p; - p += 4; - - /* data */ - if (flags & CEPH_OSD_FLAG_WRITE) { - u16 data_off; - - /* - * The header "data_off" is a hint to the receiver - * allowing it to align received data into its - * buffers such that there's no need to re-copy - * it before writing it to disk (direct I/O). - */ - data_off = (u16) (off & 0xffff); - req->r_request->hdr.data_off = cpu_to_le16(data_off); - } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); - - dout("build_request msg_size was %d\n", (int)msg_size); -} -EXPORT_SYMBOL(ceph_osdc_build_request); - /* * Register request, send initial attempt. */ @@ -2749,15 +2755,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - rc = ceph_osdc_start_request(osdc, req, false); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -2783,7 +2786,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, @@ -2797,8 +2799,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); - + req->r_mtime = *mtime; rc = ceph_osdc_start_request(osdc, req, true); if (!rc) rc = ceph_osdc_wait_request(osdc, req); From 85e084feb47349d62989efe1713a8723af95f4ea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: [PATCH 20/71] libceph: drop msg argument from ceph_osdc_callback_t finish_read(), its only user, uses it to get to hdr.data_len, which is what ->r_result is set to on success. This gains us the ability to safely call callbacks from contexts other than reply, e.g. map check. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 5 ++--- fs/ceph/addr.c | 9 ++++----- fs/ceph/file.c | 7 +++---- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 4 ++-- 5 files changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0e598916e04820..82b03aa509e66e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1828,13 +1828,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } -static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, - struct ceph_msg *msg) +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_request = osd_req->r_priv; u16 opcode; - dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + dout("%s: osd_req %p\n", __func__, osd_req); rbd_assert(osd_req == obj_request->osd_req); if (obj_request_img_data_test(obj_request)) { rbd_assert(obj_request->img_request); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 59b3c3fbd3bd79..a11756a3947100 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) /* * Finish an async read(ahead) op. */ -static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_osd_data *osd_data; - int rc = req->r_result; - int bytes = le32_to_cpu(msg->hdr.data_len); + int rc = req->r_result <= 0 ? req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; int num_pages; int i; @@ -598,8 +598,7 @@ static void ceph_release_pages(struct page **pages, int num) * If we get an error, set the mapping error bit, but not the individual * page error bits. */ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 52e4b72dd5debe..e75fd0b028e995 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -616,8 +616,7 @@ static void ceph_aio_complete(struct inode *inode, kfree(aio_req); } -static void ceph_aio_complete_req(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_aio_complete_req(struct ceph_osd_request *req) { int rc = req->r_result; struct inode *inode = req->r_inode; @@ -740,7 +739,7 @@ static void ceph_aio_retry_work(struct work_struct *work) out: if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } ceph_put_snap_context(snapc); @@ -961,7 +960,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req, false); if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } } return -EIOCBQUEUED; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 67a37d98e0ca7f..3bebd60e7f9fce 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -20,8 +20,7 @@ struct ceph_osd_client; /* * completion callback for async writepages */ -typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, - struct ceph_msg *); +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); #define CEPH_HOMELESS_OSD -1 diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8a008f083283d6..2a30c0bb30459a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2048,7 +2048,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) req->r_unsafe_callback(req, true); if (req->r_callback) - req->r_callback(req, msg); + req->r_callback(req); else complete_all(&req->r_completion); } @@ -2072,7 +2072,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_result = -EIO; __unregister_request(osdc, req); if (req->r_callback) - req->r_callback(req, msg); + req->r_callback(req); else complete_all(&req->r_completion); complete_request(req); From fe5da05e979830b43b115d8a18ead521d507c783 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: [PATCH 21/71] libceph: redo callbacks and factor out MOSDOpReply decoding If you specify ACK | ONDISK and set ->r_unsafe_callback, both ->r_callback and ->r_unsafe_callback(true) are called on ack. This is very confusing. Redo this so that only one of them is called: ->r_unsafe_callback(true), on ack ->r_unsafe_callback(false), on commit or ->r_callback, on ack|commit Decode everything in decode_MOSDOpReply() to reduce clutter. Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 +- fs/ceph/file.c | 2 + include/linux/ceph/osd_client.h | 5 +- net/ceph/osd_client.c | 362 ++++++++++++++++++-------------- 4 files changed, 215 insertions(+), 157 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a11756a3947100..f4741847762947 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1765,8 +1765,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e75fd0b028e995..30fd49eb25b4fe 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -770,6 +770,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); + + complete_all(&req->r_completion); } else { spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_item); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3bebd60e7f9fce..2415dc0cb00859 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -162,13 +162,14 @@ struct ceph_osd_request { unsigned int r_num_ops; int r_result; - int r_got_reply; + bool r_got_reply; int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion, r_safe_completion; + struct completion r_completion; + struct completion r_safe_completion; /* fsync waiter */ ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; struct list_head r_unsafe_item; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2a30c0bb30459a..baf2844b00d61c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1693,6 +1693,14 @@ static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, return 0; } +static void __complete_request(struct ceph_osd_request *req) +{ + if (req->r_callback) + req->r_callback(req); + else + complete_all(&req->r_completion); +} + /* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. When this @@ -1875,107 +1883,76 @@ static int ceph_redirect_decode(void **p, void *end, goto out; } -static void complete_request(struct ceph_osd_request *req) -{ - complete_all(&req->r_safe_completion); /* fsync waiter */ -} +struct MOSDOpReply { + struct ceph_pg pgid; + u64 flags; + int result; + u32 epoch; + int num_ops; + u32 outdata_len[CEPH_OSD_MAX_OPS]; + s32 rval[CEPH_OSD_MAX_OPS]; + int retry_attempt; + struct ceph_eversion replay_version; + u64 user_version; + struct ceph_request_redirect redirect; +}; -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) { - void *p, *end; - struct ceph_osd_request *req; - struct ceph_request_redirect redir; - u64 tid; - int object_len; - unsigned int numops; - int payload_len, flags; - s32 result; - s32 retry_attempt; - struct ceph_pg pg; - int err; - u32 reassert_epoch; - u64 reassert_version; - u32 osdmap_epoch; - int already_completed; - u32 bytes; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + u16 version = le16_to_cpu(msg->hdr.version); + struct ceph_eversion bad_replay_version; u8 decode_redir; - unsigned int i; - - tid = le64_to_cpu(msg->hdr.tid); - dout("handle_reply %p tid %llu\n", msg, tid); - - p = msg->front.iov_base; - end = p + msg->front.iov_len; + u32 len; + int ret; + int i; - ceph_decode_need(&p, end, 4, bad); - object_len = ceph_decode_32(&p); - ceph_decode_need(&p, end, object_len, bad); - p += object_len; + ceph_decode_32_safe(&p, end, len, e_inval); + ceph_decode_need(&p, end, len, e_inval); + p += len; /* skip oid */ - err = ceph_decode_pgid(&p, end, &pg); - if (err) - goto bad; + ret = ceph_decode_pgid(&p, end, &m->pgid); + if (ret) + return ret; - ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); - flags = ceph_decode_64(&p); - result = ceph_decode_32(&p); - reassert_epoch = ceph_decode_32(&p); - reassert_version = ceph_decode_64(&p); - osdmap_epoch = ceph_decode_32(&p); + ceph_decode_64_safe(&p, end, m->flags, e_inval); + ceph_decode_32_safe(&p, end, m->result, e_inval); + ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval); + memcpy(&bad_replay_version, p, sizeof(bad_replay_version)); + p += sizeof(bad_replay_version); + ceph_decode_32_safe(&p, end, m->epoch, e_inval); - /* lookup */ - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); - if (req == NULL) { - dout("handle_reply tid %llu dne\n", tid); - goto bad_mutex; - } - ceph_osdc_get_request(req); + ceph_decode_32_safe(&p, end, m->num_ops, e_inval); + if (m->num_ops > ARRAY_SIZE(m->outdata_len)) + goto e_inval; - dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, - req, result); - - ceph_decode_need(&p, end, 4, bad_put); - numops = ceph_decode_32(&p); - if (numops > CEPH_OSD_MAX_OPS) - goto bad_put; - if (numops != req->r_num_ops) - goto bad_put; - payload_len = 0; - ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put); - for (i = 0; i < numops; i++) { + ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op), + e_inval); + for (i = 0; i < m->num_ops; i++) { struct ceph_osd_op *op = p; - int len; - len = le32_to_cpu(op->payload_len); - req->r_ops[i].outdata_len = len; - dout(" op %d has %d bytes\n", i, len); - payload_len += len; + m->outdata_len[i] = le32_to_cpu(op->payload_len); p += sizeof(*op); } - bytes = le32_to_cpu(msg->hdr.data_len); - if (payload_len != bytes) { - pr_warn("sum of op payload lens %d != data_len %d\n", - payload_len, bytes); - goto bad_put; - } - ceph_decode_need(&p, end, 4 + numops * 4, bad_put); - retry_attempt = ceph_decode_32(&p); - for (i = 0; i < numops; i++) - req->r_ops[i].rval = ceph_decode_32(&p); + ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval); + for (i = 0; i < m->num_ops; i++) + ceph_decode_32_safe(&p, end, m->rval[i], e_inval); - if (le16_to_cpu(msg->hdr.version) >= 6) { - p += 8 + 4; /* skip replay_version */ - p += 8; /* skip user_version */ + if (version >= 5) { + ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval); + memcpy(&m->replay_version, p, sizeof(m->replay_version)); + p += sizeof(m->replay_version); + ceph_decode_64_safe(&p, end, m->user_version, e_inval); + } else { + m->replay_version = bad_replay_version; /* struct */ + m->user_version = le64_to_cpu(m->replay_version.version); + } - if (le16_to_cpu(msg->hdr.version) >= 7) - ceph_decode_8_safe(&p, end, decode_redir, bad_put); + if (version >= 6) { + if (version >= 7) + ceph_decode_8_safe(&p, end, decode_redir, e_inval); else decode_redir = 1; } else { @@ -1983,19 +1960,96 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) } if (decode_redir) { - err = ceph_redirect_decode(&p, end, &redir); - if (err) - goto bad_put; + ret = ceph_redirect_decode(&p, end, &m->redirect); + if (ret) + return ret; } else { - redir.oloc.pool = -1; + ceph_oloc_init(&m->redirect.oloc); } - if (!ceph_oloc_empty(&redir.oloc)) { - dout("redirect pool %lld\n", redir.oloc.pool); + return 0; + +e_inval: + return -EINVAL; +} + +/* + * We are done with @req if + * - @m is a safe reply, or + * - @m is an unsafe reply and we didn't want a safe one + */ +static bool done_request(const struct ceph_osd_request *req, + const struct MOSDOpReply *m) +{ + return (m->result < 0 || + (m->flags & CEPH_OSD_FLAG_ONDISK) || + !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); +} +/* + * handle osd op reply. either call the callback if it is specified, + * or do the completion to wake up the waiting thread. + * + * ->r_unsafe_callback is set? yes no + * + * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, + * any or needed/got safe) r_safe_completion r_safe_completion + * + * first reply is unsafe r_unsafe_cb(true) (nothing) + * + * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, + * r_safe_completion r_safe_completion + */ +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + struct ceph_osd_request *req; + struct MOSDOpReply m; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 data_len = 0; + bool already_acked; + int ret; + int i; + + dout("%s msg %p tid %llu\n", __func__, msg, tid); + + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + req = lookup_request(&osdc->requests, tid); + if (!req) { + dout("%s no tid %llu\n", __func__, tid); + goto out_unlock; + } + ceph_osdc_get_request(req); + + ret = decode_MOSDOpReply(msg, &m); + if (ret) { + pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", + req->r_tid, ret); + ceph_msg_dump(msg); + goto fail_request; + } + dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n", + __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed, + m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch), + le64_to_cpu(m.replay_version.version), m.user_version); + + if (m.retry_attempt >= 0) { + if (m.retry_attempt != req->r_attempts - 1) { + dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", + req, req->r_tid, m.retry_attempt, + req->r_attempts - 1); + goto out_put; + } + } else { + WARN_ON(1); /* MOSDOpReply v4 is assumed */ + } + + if (!ceph_oloc_empty(&m.redirect.oloc)) { + dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, + m.redirect.oloc.pool); __unregister_request(osdc, req); - ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc); + ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); /* * Start redirect requests with nofail=true. If @@ -2005,85 +2059,85 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) * successfully. In the future we might want to follow * original request's nofail setting here. */ - err = __ceph_osdc_start_request(osdc, req, true); - BUG_ON(err); + ret = __ceph_osdc_start_request(osdc, req, true); + BUG_ON(ret); - goto out_unlock; + goto out_put; } - already_completed = req->r_got_reply; - if (!req->r_got_reply) { - req->r_result = result; - dout("handle_reply result %d bytes %d\n", req->r_result, - bytes); - if (req->r_result == 0) - req->r_result = bytes; - - /* in case this is a write and we need to replay, */ - req->r_replay_version.epoch = cpu_to_le32(reassert_epoch); - req->r_replay_version.version = cpu_to_le64(reassert_version); - - req->r_got_reply = 1; - } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { - dout("handle_reply tid %llu dup ack\n", tid); - goto out_unlock; + if (m.num_ops != req->r_num_ops) { + pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, + req->r_num_ops, req->r_tid); + goto fail_request; } - - dout("handle_reply tid %llu flags %d\n", tid, flags); - - if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) - __register_linger_request(osdc, req); - - /* either this is a read, or we got the safe response */ - if (result < 0 || - (flags & CEPH_OSD_FLAG_ONDISK) || - ((flags & CEPH_OSD_FLAG_WRITE) == 0)) + for (i = 0; i < req->r_num_ops; i++) { + dout(" req %p tid %llu op %d rval %d len %u\n", req, + req->r_tid, i, m.rval[i], m.outdata_len[i]); + req->r_ops[i].rval = m.rval[i]; + req->r_ops[i].outdata_len = m.outdata_len[i]; + data_len += m.outdata_len[i]; + } + if (data_len != le32_to_cpu(msg->hdr.data_len)) { + pr_err("sum of lens %u != %u for tid %llu\n", data_len, + le32_to_cpu(msg->hdr.data_len), req->r_tid); + goto fail_request; + } + dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, + req, req->r_tid, req->r_got_reply, m.result, data_len); + + already_acked = req->r_got_reply; + if (!already_acked) { + req->r_result = m.result ?: data_len; + req->r_replay_version = m.replay_version; /* struct */ + req->r_got_reply = true; + } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { + dout("req %p tid %llu dup ack\n", req, req->r_tid); + goto out_put; + } + + if (done_request(req, &m)) { __unregister_request(osdc, req); + if (req->r_linger) { + WARN_ON(req->r_unsafe_callback); + __register_linger_request(osdc, req); + } + } mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); - if (!already_completed) { - if (req->r_unsafe_callback && - result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) - req->r_unsafe_callback(req, true); - if (req->r_callback) - req->r_callback(req); - else - complete_all(&req->r_completion); - } - - if (flags & CEPH_OSD_FLAG_ONDISK) { - if (req->r_unsafe_callback && already_completed) + if (done_request(req, &m)) { + if (already_acked && req->r_unsafe_callback) { + dout("req %p tid %llu safe-cb\n", req, req->r_tid); req->r_unsafe_callback(req, false); - complete_request(req); + } else { + dout("req %p tid %llu cb\n", req, req->r_tid); + __complete_request(req); + } + } else { + if (req->r_unsafe_callback) { + dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); + req->r_unsafe_callback(req, true); + } else { + WARN_ON(1); + } } + if (m.flags & CEPH_OSD_FLAG_ONDISK) + complete_all(&req->r_safe_completion); -out: - dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; -out_unlock: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - goto out; -bad_put: +fail_request: req->r_result = -EIO; __unregister_request(osdc, req); - if (req->r_callback) - req->r_callback(req); - else - complete_all(&req->r_completion); - complete_request(req); + __complete_request(req); + complete_all(&req->r_safe_completion); +out_put: ceph_osdc_put_request(req); -bad_mutex: +out_unlock: mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); -bad: - pr_err("corrupt osd_op_reply got %d %d\n", - (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); - ceph_msg_dump(msg); } static void reset_changed_osds(struct ceph_osd_client *osdc) @@ -2591,7 +2645,9 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, if (rc < 0) { dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); ceph_osdc_cancel_request(req); - complete_request(req); + + /* kludge - need to to wake ceph_osdc_sync() */ + complete_all(&req->r_safe_completion); return rc; } From b37ee1b9b840a6da63e78865f97a9a78b7534477 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: [PATCH 22/71] libceph: move schedule_delayed_work() in ceph_osdc_init() ceph_osdc_stop() isn't called if ceph_osdc_init() fails, so we end up with handle_osds_timeout() running on invalid memory if any one of the allocations fails. Call schedule_delayed_work() after everything is setup, just before returning. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index baf2844b00d61c..4c7231e55cb7d2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2729,9 +2729,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->event_tree = RB_ROOT; osdc->event_count = 0; - schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->options->osd_idle_ttl)); - err = -ENOMEM; osdc->req_mempool = mempool_create_slab_pool(10, ceph_osd_request_cache); @@ -2752,6 +2749,9 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->notify_wq) goto out_msgpool_reply; + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(osdc->client->options->osd_idle_ttl)); + return 0; out_msgpool_reply: From fbca963532eba779bffa14c70d1dc619e6bfa16d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: [PATCH 23/71] libceph: schedule tick from ceph_osdc_init() Both homeless OSD sessions and watch/notify v2, introduced in later commits, require periodic ticks which don't depend on ->num_requests. Schedule the initial tick from ceph_osdc_init() and reschedule from handle_timeout() unconditionally. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4c7231e55cb7d2..41dabce9c9c37c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1138,17 +1138,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -static void __schedule_osd_timeout(struct ceph_osd_client *osdc) -{ - schedule_delayed_work(&osdc->timeout_work, - osdc->client->options->osd_keepalive_timeout); -} - -static void __cancel_osd_timeout(struct ceph_osd_client *osdc) -{ - cancel_delayed_work(&osdc->timeout_work); -} - /* * Register request, assign tid. If this is the first request, set up * the timeout event. @@ -1162,10 +1151,6 @@ static void __register_request(struct ceph_osd_client *osdc, insert_request(&osdc->requests, req); ceph_osdc_get_request(req); osdc->num_requests++; - if (osdc->num_requests == 1) { - dout(" first request, scheduling timeout\n"); - __schedule_osd_timeout(osdc); - } } /* @@ -1196,11 +1181,6 @@ static void __unregister_request(struct ceph_osd_client *osdc, list_del_init(&req->r_req_lru_item); ceph_osdc_put_request(req); - - if (osdc->num_requests == 0) { - dout(" no requests, canceling timeout\n"); - __cancel_osd_timeout(osdc); - } } /* @@ -1702,13 +1682,10 @@ static void __complete_request(struct ceph_osd_request *req) } /* - * Timeout callback, called every N seconds when 1 or more osd - * requests has been active for more than N seconds. When this - * happens, we ping all OSDs with requests who have timed out to - * ensure any communications channel reset is detected. Reset the - * request timeouts another N seconds in the future as we go. - * Reschedule the timeout event another N seconds in future (unless - * there are no open requests). + * Timeout callback, called every N seconds. When 1 or more OSD + * requests has been active for more than N seconds, we send a keepalive + * (tag + timestamp) to its OSD to ensure any communications channel + * reset is detected. */ static void handle_timeout(struct work_struct *work) { @@ -1749,10 +1726,12 @@ static void handle_timeout(struct work_struct *work) ceph_con_keepalive(&osd->o_con); } - __schedule_osd_timeout(osdc); __send_queued(osdc); mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); + + schedule_delayed_work(&osdc->timeout_work, + osdc->client->options->osd_keepalive_timeout); } static void handle_osds_timeout(struct work_struct *work) @@ -2749,6 +2728,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->notify_wq) goto out_msgpool_reply; + schedule_delayed_work(&osdc->timeout_work, + osdc->client->options->osd_keepalive_timeout); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(osdc->client->options->osd_idle_ttl)); From e5253a7bde13788d9dc75f42eb47ea119af5609f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: [PATCH 24/71] libceph: allocate dummy osdmap in ceph_osdc_init() This leads to a simpler osdmap handling code, particularly when dealing with pi->was_full, which is introduced in a later commit. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 1 + net/ceph/osd_client.c | 22 +++++++++++----------- net/ceph/osdmap.c | 23 ++++++++++++++++++----- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 420bb7968b2587..8468c734d712d7 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -225,6 +225,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } +struct ceph_osdmap *ceph_osdmap_alloc(void); extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 41dabce9c9c37c..9c35fd84a4108f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2255,7 +2255,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) struct ceph_fsid fsid; bool was_full; - dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); + dout("handle_map have %u\n", osdc->osdmap->epoch); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -2278,7 +2278,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); next = p + maplen; - if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + if (osdc->osdmap->epoch+1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); newmap = osdmap_apply_incremental(&p, next, @@ -2317,7 +2317,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (nr_maps > 1) { dout("skipping non-latest full map %u len %d\n", epoch, maplen); - } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + } else if (osdc->osdmap->epoch >= epoch) { dout("skipping full map %u len %d, " "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); @@ -2347,8 +2347,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) nr_maps--; } - if (!osdc->osdmap) - goto bad; done: downgrade_write(&osdc->map_sem); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, @@ -2690,7 +2688,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - osdc->osdmap = NULL; init_rwsem(&osdc->map_sem); mutex_init(&osdc->request_mutex); osdc->last_tid = 0; @@ -2709,10 +2706,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->event_count = 0; err = -ENOMEM; + osdc->osdmap = ceph_osdmap_alloc(); + if (!osdc->osdmap) + goto out; + osdc->req_mempool = mempool_create_slab_pool(10, ceph_osd_request_cache); if (!osdc->req_mempool) - goto out; + goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, PAGE_SIZE, 10, true, "osd_op"); @@ -2741,6 +2742,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: mempool_destroy(osdc->req_mempool); +out_map: + ceph_osdmap_destroy(osdc->osdmap); out: return err; } @@ -2760,10 +2763,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) } mutex_unlock(&osdc->request_mutex); - if (osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = NULL; - } + ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d4a5b43085e1c..cde52e94732f36 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -707,6 +707,23 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) /* * osd map */ +struct ceph_osdmap *ceph_osdmap_alloc(void) +{ + struct ceph_osdmap *map; + + map = kzalloc(sizeof(*map), GFP_NOIO); + if (!map) + return NULL; + + map->pg_pools = RB_ROOT; + map->pool_max = -1; + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + return map; +} + void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); @@ -1230,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) struct ceph_osdmap *map; int ret; - map = kzalloc(sizeof(*map), GFP_NOFS); + map = ceph_osdmap_alloc(); if (!map) return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; - map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); - ret = osdmap_decode(p, end, map); if (ret) { ceph_osdmap_destroy(map); From 42c1b1240326cbea86f15f5d4ce565d8b54be31f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: [PATCH 25/71] libceph: handle_one_map() Separate osdmap handling from decoding and iterating over a bag of maps in a fresh MOSDMap message. This sets up the scene for the updated OSD client. Of particular importance here is the addition of pi->was_full, which can be used to answer "did this pool go full -> not-full in this map?". This is the key bit for supporting pool quotas. We won't be able to downgrade map_sem for much longer, so drop downgrade_write(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 + include/linux/ceph/osdmap.h | 2 + net/ceph/mon_client.c | 8 ++ net/ceph/osd_client.c | 186 ++++++++++++++++++++++---------- 4 files changed, 141 insertions(+), 56 deletions(-) diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 330d045e409295..c14e9d861cda31 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -115,6 +115,7 @@ extern const char *ceph_sub_str[]; bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, bool continuous); void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); +void ceph_monc_renew_subs(struct ceph_mon_client *monc); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8468c734d712d7..821e16fff39af7 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -45,6 +45,8 @@ struct ceph_pg_pool_info { s64 write_tier; /* wins for read+write ops */ u64 flags; /* CEPH_POOL_FLAG_* */ char *name; + + bool was_full; /* for handle_one_map() */ }; static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a426a4b03e757d..98bfbe1f680784 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -376,6 +376,14 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) } EXPORT_SYMBOL(ceph_monc_got_map); +void ceph_monc_renew_subs(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + __send_subscribe(monc); + mutex_unlock(&monc->mutex); +} +EXPORT_SYMBOL(ceph_monc_renew_subs); + /* * Register interest in the next osdmap */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9c35fd84a4108f..4227c55226c35c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1245,6 +1245,21 @@ static bool __pool_full(struct ceph_pg_pool_info *pi) return pi->flags & CEPH_POOL_FLAG_FULL; } +static bool have_pool_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + + if (__pool_full(pi)) + return true; + } + + return false; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. @@ -1639,6 +1654,26 @@ static void __send_queued(struct ceph_osd_client *osdc) } } +static void maybe_request_map(struct ceph_osd_client *osdc) +{ + bool continuous = false; + + WARN_ON(!osdc->osdmap->epoch); + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + dout("%s osdc %p continuous\n", __func__, osdc); + continuous = true; + } else { + dout("%s osdc %p onetime\n", __func__, osdc); + } + + if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch + 1, continuous)) + ceph_monc_renew_subs(&osdc->client->monc); +} + /* * Caller should hold map_sem for read and request_mutex. */ @@ -2119,6 +2154,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) up_read(&osdc->map_sem); } +static void set_pool_was_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + + pi->was_full = __pool_full(pi); + } +} + static void reset_changed_osds(struct ceph_osd_client *osdc) { struct rb_node *p, *n; @@ -2237,6 +2284,57 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, } } +static int handle_one_map(struct ceph_osd_client *osdc, + void *p, void *end, bool incremental) +{ + struct ceph_osdmap *newmap; + struct rb_node *n; + bool skipped_map = false; + bool was_full; + + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + set_pool_was_full(osdc); + + if (incremental) + newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + else + newmap = ceph_osdmap_decode(&p, end); + if (IS_ERR(newmap)) + return PTR_ERR(newmap); + + if (newmap != osdc->osdmap) { + /* + * Preserve ->was_full before destroying the old map. + * For pools that weren't in the old map, ->was_full + * should be false. + */ + for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + struct ceph_pg_pool_info *old_pi; + + old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id); + if (old_pi) + pi->was_full = old_pi->was_full; + else + WARN_ON(pi->was_full); + } + + if (osdc->osdmap->epoch && + osdc->osdmap->epoch + 1 < newmap->epoch) { + WARN_ON(incremental); + skipped_map = true; + } + + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = newmap; + } + + was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + kick_requests(osdc, skipped_map, was_full); + + return 0; +} /* * Process updated osd map. @@ -2247,27 +2345,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, */ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) { - void *p, *end, *next; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; u32 nr_maps, maplen; u32 epoch; - struct ceph_osdmap *newmap = NULL, *oldmap; - int err; struct ceph_fsid fsid; - bool was_full; + bool handled_incremental = false; + bool was_pauserd, was_pausewr; + bool pauserd, pausewr; + int err; - dout("handle_map have %u\n", osdc->osdmap->epoch); - p = msg->front.iov_base; - end = p + msg->front.iov_len; + dout("%s have %u\n", __func__, osdc->osdmap->epoch); + down_write(&osdc->map_sem); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); if (ceph_check_fsid(osdc->client, &fsid) < 0) - return; - - down_write(&osdc->map_sem); + goto bad; - was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + have_pool_full(osdc); /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); @@ -2277,33 +2377,22 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); - next = p + maplen; - if (osdc->osdmap->epoch+1 == epoch) { + if (osdc->osdmap->epoch && + osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); - newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); + err = handle_one_map(osdc, p, p + maplen, true); + if (err) goto bad; - } - BUG_ON(!newmap); - if (newmap != osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = newmap; - } - was_full = was_full || - ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_FULL); - kick_requests(osdc, 0, was_full); + handled_incremental = true; } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); } - p = next; + p += maplen; nr_maps--; } - if (newmap) + if (handled_incremental) goto done; /* full maps */ @@ -2322,50 +2411,35 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); } else { - int skipped_map = 0; - dout("taking full map %u len %d\n", epoch, maplen); - newmap = ceph_osdmap_decode(&p, p+maplen); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); + err = handle_one_map(osdc, p, p + maplen, false); + if (err) goto bad; - } - BUG_ON(!newmap); - oldmap = osdc->osdmap; - osdc->osdmap = newmap; - if (oldmap) { - if (oldmap->epoch + 1 < newmap->epoch) - skipped_map = 1; - ceph_osdmap_destroy(oldmap); - } - was_full = was_full || - ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_FULL); - kick_requests(osdc, skipped_map, was_full); } p += maplen; nr_maps--; } done: - downgrade_write(&osdc->map_sem); - ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, - osdc->osdmap->epoch); - /* * subscribe to subsequent osdmap updates if full to ensure * we find out when we are no longer full and stop returning * ENOSPC. */ - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) - ceph_monc_request_next_osdmap(&osdc->client->monc); + pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + have_pool_full(osdc); + if (was_pauserd || was_pausewr || pauserd || pausewr) + maybe_request_map(osdc); mutex_lock(&osdc->request_mutex); __send_queued(osdc); mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch); + up_write(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); return; From 0247a0cf3e777932a0cae37ab1d8055a3881458c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: [PATCH 26/71] libceph: osd_init() and osd_cleanup() These are going to be used by homeless OSD sessions code. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 46 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4227c55226c35c..77f37b63622d9e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -864,6 +864,11 @@ EXPORT_SYMBOL(ceph_osdc_new_request); */ DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) +static bool osd_homeless(struct ceph_osd *osd) +{ + return osd->o_osd == CEPH_HOMELESS_OSD; +} + static struct ceph_osd_request * __lookup_request_ge(struct ceph_osd_client *osdc, u64 tid) @@ -1001,6 +1006,34 @@ static void osd_reset(struct ceph_connection *con) up_read(&osdc->map_sem); } +/* + * Assumes @osd is zero-initialized. + */ +static void osd_init(struct ceph_osd *osd) +{ + atomic_set(&osd->o_ref, 1); + RB_CLEAR_NODE(&osd->o_node); + INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); + INIT_LIST_HEAD(&osd->o_osd_lru); + INIT_LIST_HEAD(&osd->o_keepalive_item); + osd->o_incarnation = 1; +} + +static void osd_cleanup(struct ceph_osd *osd) +{ + WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); + WARN_ON(!list_empty(&osd->o_requests)); + WARN_ON(!list_empty(&osd->o_linger_requests)); + WARN_ON(!list_empty(&osd->o_osd_lru)); + WARN_ON(!list_empty(&osd->o_keepalive_item)); + + if (osd->o_auth.authorizer) { + WARN_ON(osd_homeless(osd)); + ceph_auth_destroy_authorizer(osd->o_auth.authorizer); + } +} + /* * Track open sessions with osds. */ @@ -1008,22 +1041,18 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; + WARN_ON(onum == CEPH_HOMELESS_OSD); + osd = kzalloc(sizeof(*osd), GFP_NOFS); if (!osd) return NULL; - atomic_set(&osd->o_ref, 1); + osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; - RB_CLEAR_NODE(&osd->o_node); - INIT_LIST_HEAD(&osd->o_requests); - INIT_LIST_HEAD(&osd->o_linger_requests); - INIT_LIST_HEAD(&osd->o_osd_lru); - osd->o_incarnation = 1; ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); - INIT_LIST_HEAD(&osd->o_keepalive_item); return osd; } @@ -1044,8 +1073,7 @@ static void put_osd(struct ceph_osd *osd) dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); if (atomic_dec_and_test(&osd->o_ref)) { - if (osd->o_auth.authorizer) - ceph_auth_destroy_authorizer(osd->o_auth.authorizer); + osd_cleanup(osd); kfree(osd); } } From 7a28f59bf9fb220cdf56ac6ab539fc4a0ae59414 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: [PATCH 27/71] libceph: allocate ceph_osd with GFP_NOFAIL create_osd() is called way too deep in the stack to be able to error out in a sane way; a failing create_osd() just messes everything up. The current req_notarget list solution is broken - the list is never traversed as it's not entirely clear when to do it, I guess. If we were to start traversing it at regular intervals and retrying each request, we wouldn't be far off from what __GFP_NOFAIL is doing, so allocate OSD sessions with __GFP_NOFAIL, at least until we come up with a better fix. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 77f37b63622d9e..b6950c2c6cc4d6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1043,10 +1043,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) WARN_ON(onum == CEPH_HOMELESS_OSD); - osd = kzalloc(sizeof(*osd), GFP_NOFS); - if (!osd) - return NULL; - + osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL); osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; From 9dd2845ccb40452d4ac943231ea34aade4a02c68 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: [PATCH 28/71] libceph: protect osdc->osd_lru list with a spinlock OSD client is getting moved from the big per-client lock to a set of per-session locks. The big rwlock would only be held for read most of the time, so a global osdc->osd_lru needs additional protection. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 29 ++++++++++++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2415dc0cb00859..486d681694c4c3 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,6 +224,7 @@ struct ceph_osd_client { struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ + spinlock_t osd_lru_lock; u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head req_lru; /* in-flight lru */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b6950c2c6cc4d6..d1c8e06f126178 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1101,31 +1101,37 @@ static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) } } -static void __move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) +static void __move_osd_to_lru(struct ceph_osd *osd) { - dout("%s %p\n", __func__, osd); + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); BUG_ON(!list_empty(&osd->o_osd_lru)); + spin_lock(&osdc->osd_lru_lock); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); + spin_unlock(&osdc->osd_lru_lock); + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } -static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) +static void maybe_move_osd_to_lru(struct ceph_osd *osd) { - dout("%s %p\n", __func__, osd); - if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) - __move_osd_to_lru(osdc, osd); + __move_osd_to_lru(osd); } static void __remove_osd_from_lru(struct ceph_osd *osd) { - dout("__remove_osd_from_lru %p\n", osd); + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + spin_lock(&osdc->osd_lru_lock); if (!list_empty(&osd->o_osd_lru)) list_del_init(&osd->o_osd_lru); + spin_unlock(&osdc->osd_lru_lock); } /* @@ -1199,7 +1205,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); - maybe_move_osd_to_lru(osdc, req->r_osd); + maybe_move_osd_to_lru(req->r_osd); if (list_empty(&req->r_linger_osd_item)) req->r_osd = NULL; } @@ -1248,7 +1254,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (req->r_osd) { list_del_init(&req->r_linger_osd_item); - maybe_move_osd_to_lru(osdc, req->r_osd); + maybe_move_osd_to_lru(req->r_osd); if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } @@ -2792,6 +2798,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->last_tid = 0; osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); + spin_lock_init(&osdc->osd_lru_lock); osdc->requests = RB_ROOT; INIT_LIST_HEAD(&osdc->req_lru); INIT_LIST_HEAD(&osdc->req_unsent); From 5aea3dcd50215fa9563270251ad7323e2f2490ee Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: [PATCH 29/71] libceph: a major OSD client update This is a major sync up, up to ~Jewel. The highlights are: - per-session request trees (vs a global per-client tree) - per-session locking (vs a global per-client rwlock) - homeless OSD session - no ad-hoc global per-client lists - support for pool quotas - foundation for watch/notify v2 support - foundation for map check (pool deletion detection) support The switchover is incomplete: lingering requests can be setup and teared down but aren't ever reestablished. This functionality is restored with the introduction of the new lingering infrastructure (ceph_osd_linger_request, linger_work, etc) in a later commit. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 8 +- fs/ceph/xattr.c | 8 +- include/linux/ceph/osd_client.h | 18 +- net/ceph/debugfs.c | 34 +- net/ceph/osd_client.c | 1164 +++++++++++++++---------------- 5 files changed, 602 insertions(+), 630 deletions(-) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 1831ad6cf06670..be6b1657b1af2a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); + down_read(&osdc->lock); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return -EIO; } dl.file_offset -= dl.object_offset; @@ -217,7 +217,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return r; } @@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9410abdef3cec5..5afabc4bf4c713 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -75,7 +75,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { size_t len = strlen(pool_name); @@ -107,7 +107,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ret = -ERANGE; } } - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -141,13 +141,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) ret = snprintf(val, size, "%s", pool_name); else ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 486d681694c4c3..342f22f1f0400f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -33,12 +33,13 @@ struct ceph_osd { int o_incarnation; struct rb_node o_node; struct ceph_connection o_con; - struct list_head o_requests; + struct rb_root o_requests; struct list_head o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; struct list_head o_keepalive_item; + struct mutex lock; }; #define CEPH_OSD_SLAB_OPS 2 @@ -144,8 +145,6 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_req_lru_item; - struct list_head r_osd_item; struct list_head r_linger_item; struct list_head r_linger_osd_item; struct ceph_osd *r_osd; @@ -219,19 +218,16 @@ struct ceph_osd_client { struct ceph_client *client; struct ceph_osdmap *osdmap; /* current map */ - struct rw_semaphore map_sem; + struct rw_semaphore lock; - struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; - u64 last_tid; /* tid of last request */ - struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* in-flight lru */ - struct list_head req_unsent; /* unsent/need-resend queue */ - struct list_head req_notarget; /* map to no osd */ struct list_head req_linger; /* lingering requests */ - int num_requests; + struct ceph_osd homeless_osd; + atomic64_t last_tid; /* tid of last request */ + atomic_t num_requests; + atomic_t num_homeless; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; #ifdef CONFIG_DEBUG_FS diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6d3ff713edebbf..61dbd9de4650b9 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -182,21 +182,39 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_putc(s, '\n'); } +static void dump_requests(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + dump_request(s, req); + } + + mutex_unlock(&osd->lock); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; + struct rb_node *n; - req = rb_entry(p, struct ceph_osd_request, r_node); + down_read(&osdc->lock); + seq_printf(s, "REQUESTS %d homeless %d\n", + atomic_read(&osdc->num_requests), + atomic_read(&osdc->num_homeless)); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - dump_request(s, req); + dump_requests(s, osd); } - mutex_unlock(&osdc->request_mutex); + dump_requests(s, &osdc->homeless_osd); + + up_read(&osdc->lock); return 0; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d1c8e06f126178..4c856c87b1a961 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -25,16 +25,6 @@ static struct kmem_cache *ceph_osd_request_cache; static const struct ceph_connection_operations osd_con_ops; -static void __send_queued(struct ceph_osd_client *osdc); -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); -static void __register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __enqueue_request(struct ceph_osd_request *req); - /* * Implement client access to distributed object storage cluster. * @@ -53,6 +43,43 @@ static void __enqueue_request(struct ceph_osd_request *req); * channel with an OSD is reset. */ +static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); +static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); + +#if 1 +static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) +{ + bool wrlocked = true; + + if (unlikely(down_read_trylock(sem))) { + wrlocked = false; + up_read(sem); + } + + return wrlocked; +} +static inline void verify_osdc_locked(struct ceph_osd_client *osdc) +{ + WARN_ON(!rwsem_is_locked(&osdc->lock)); +} +static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) +{ + WARN_ON(!rwsem_is_wrlocked(&osdc->lock)); +} +static inline void verify_osd_locked(struct ceph_osd *osd) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + + WARN_ON(!(mutex_is_locked(&osd->lock) && + rwsem_is_locked(&osdc->lock)) && + !rwsem_is_wrlocked(&osdc->lock)); +} +#else +static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } +static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } +static inline void verify_osd_locked(struct ceph_osd *osd) { } +#endif + /* * calculate the mapping of a file extent onto an object, and fill out the * request accordingly. shorten extent as necessary if it crosses an @@ -336,18 +363,14 @@ static void ceph_osdc_release_request(struct kref *kref) dout("%s %p (r_request %p r_reply %p)\n", __func__, req, req->r_request, req->r_reply); WARN_ON(!RB_EMPTY_NODE(&req->r_node)); - WARN_ON(!list_empty(&req->r_req_lru_item)); - WARN_ON(!list_empty(&req->r_osd_item)); WARN_ON(!list_empty(&req->r_linger_item)); WARN_ON(!list_empty(&req->r_linger_osd_item)); WARN_ON(req->r_osd); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { - ceph_msg_revoke_incoming(req->r_reply); + if (req->r_reply) ceph_msg_put(req->r_reply); - } for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); @@ -418,8 +441,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd_item); - INIT_LIST_HEAD(&req->r_req_lru_item); - INIT_LIST_HEAD(&req->r_osd_item); target_init(&req->r_t); @@ -869,141 +890,11 @@ static bool osd_homeless(struct ceph_osd *osd) return osd->o_osd == CEPH_HOMELESS_OSD; } -static struct ceph_osd_request * -__lookup_request_ge(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) { - if (!n->rb_left) - return req; - n = n->rb_left; - } else if (tid > req->r_tid) { - n = n->rb_right; - } else { - return req; - } - } - return NULL; -} - -static void __kick_linger_request(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - struct ceph_osd *osd = req->r_osd; - - /* - * Linger requests need to be resent with a new tid to avoid - * the dup op detection logic on the OSDs. Achieve this with - * a re-register dance instead of open-coding. - */ - ceph_osdc_get_request(req); - if (!list_empty(&req->r_linger_item)) - __unregister_linger_request(osdc, req); - else - __unregister_request(osdc, req); - __register_request(osdc, req); - ceph_osdc_put_request(req); - - /* - * Unless request has been registered as both normal and - * lingering, __unregister{,_linger}_request clears r_osd. - * However, here we need to preserve r_osd to make sure we - * requeue on the same OSD. - */ - WARN_ON(req->r_osd || !osd); - req->r_osd = osd; - - dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); - __enqueue_request(req); -} - -/* - * Resubmit requests pending on the given osd. - */ -static void __kick_osd_requests(struct ceph_osd_client *osdc, - struct ceph_osd *osd) -{ - struct ceph_osd_request *req, *nreq; - LIST_HEAD(resend); - LIST_HEAD(resend_linger); - int err; - - dout("%s osd%d\n", __func__, osd->o_osd); - err = __reset_osd(osdc, osd); - if (err) - return; - - /* - * Build up a list of requests to resend by traversing the - * osd's list of requests. Requests for a given object are - * sent in tid order, and that is also the order they're - * kept on this list. Therefore all requests that are in - * flight will be found first, followed by all requests that - * have not yet been sent. And to resend requests while - * preserving this order we will want to put any sent - * requests back on the front of the osd client's unsent - * list. - * - * So we build a separate ordered list of already-sent - * requests for the affected osd and splice it onto the - * front of the osd client's unsent list. Once we've seen a - * request that has not yet been sent we're done. Those - * requests are already sitting right where they belong. - */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) { - if (!req->r_sent) - break; - - if (!req->r_linger) { - dout("%s requeueing %p tid %llu\n", __func__, req, - req->r_tid); - list_move_tail(&req->r_req_lru_item, &resend); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } else { - list_move_tail(&req->r_req_lru_item, &resend_linger); - } - } - list_splice(&resend, &osdc->req_unsent); - - /* - * Both registered and not yet registered linger requests are - * enqueued with a new tid on the same OSD. We add/move them - * to req_unsent/o_requests at the end to keep things in tid - * order. - */ - list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, - r_linger_osd_item) { - WARN_ON(!list_empty(&req->r_req_lru_item)); - __kick_linger_request(req); - } - - list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item) - __kick_linger_request(req); -} - -/* - * If the osd connection drops, we need to resubmit all requests. - */ -static void osd_reset(struct ceph_connection *con) +static bool osd_registered(struct ceph_osd *osd) { - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; + verify_osdc_locked(osd->o_osdc); - if (!osd) - return; - dout("osd_reset osd%d\n", osd->o_osd); - osdc = osd->o_osdc; - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - __kick_osd_requests(osdc, osd); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + return !RB_EMPTY_NODE(&osd->o_node); } /* @@ -1013,17 +904,18 @@ static void osd_init(struct ceph_osd *osd) { atomic_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); - INIT_LIST_HEAD(&osd->o_requests); + osd->o_requests = RB_ROOT; INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; + mutex_init(&osd->lock); } static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); - WARN_ON(!list_empty(&osd->o_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!list_empty(&osd->o_linger_requests)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); @@ -1077,30 +969,6 @@ static void put_osd(struct ceph_osd *osd) DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) -/* - * remove an osd from our map - */ -static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("%s %p osd%d\n", __func__, osd, osd->o_osd); - WARN_ON(!list_empty(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); - - list_del_init(&osd->o_osd_lru); - erase_osd(&osdc->osds, osd); -} - -static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("%s %p osd%d\n", __func__, osd, osd->o_osd); - - if (!RB_EMPTY_NODE(&osd->o_node)) { - ceph_con_close(&osd->o_con); - __remove_osd(osdc, osd); - put_osd(osd); - } -} - static void __move_osd_to_lru(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; @@ -1117,7 +985,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd) static void maybe_move_osd_to_lru(struct ceph_osd *osd) { - if (list_empty(&osd->o_requests) && + if (RB_EMPTY_ROOT(&osd->o_requests) && list_empty(&osd->o_linger_requests)) __move_osd_to_lru(osd); } @@ -1134,30 +1002,64 @@ static void __remove_osd_from_lru(struct ceph_osd *osd) spin_unlock(&osdc->osd_lru_lock); } +/* + * Close the connection and assign any leftover requests to the + * homeless session. + */ +static void close_osd(struct ceph_osd *osd) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; + + verify_osdc_wrlocked(osdc); + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + ceph_con_close(&osd->o_con); + + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + n = rb_next(n); /* unlink_request() */ + + dout(" reassigning req %p tid %llu\n", req, req->r_tid); + unlink_request(osd, req); + link_request(&osdc->homeless_osd, req); + } + + __remove_osd_from_lru(osd); + erase_osd(&osdc->osds, osd); + put_osd(osd); +} + /* * reset osd connect */ -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +static int reopen_osd(struct ceph_osd *osd) { struct ceph_entity_addr *peer_addr; - dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests) && + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + if (RB_EMPTY_ROOT(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { - remove_osd(osdc, osd); + close_osd(osd); return -ENODEV; } - peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; + peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd]; if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && !ceph_con_opened(&osd->o_con)) { - struct ceph_osd_request *req; + struct rb_node *n; dout("osd addr hasn't changed and connection never opened, " "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); req->r_stamp = jiffies; + } return -EAGAIN; } @@ -1169,73 +1071,84 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -/* - * Register request, assign tid. If this is the first request, set up - * the timeout event. - */ -static void __register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - req->r_tid = ++osdc->last_tid; - req->r_request->hdr.tid = cpu_to_le64(req->r_tid); - dout("__register_request %p tid %lld\n", req, req->r_tid); - insert_request(&osdc->requests, req); - ceph_osdc_get_request(req); - osdc->num_requests++; -} - -/* - * called under osdc->request_mutex - */ -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o, + bool wrlocked) { - if (RB_EMPTY_NODE(&req->r_node)) { - dout("__unregister_request %p tid %lld not registered\n", - req, req->r_tid); - return; - } + struct ceph_osd *osd; - dout("__unregister_request %p tid %lld\n", req, req->r_tid); - erase_request(&osdc->requests, req); - osdc->num_requests--; + if (wrlocked) + verify_osdc_wrlocked(osdc); + else + verify_osdc_locked(osdc); - if (req->r_osd) { - /* make sure the original request isn't in flight. */ - ceph_msg_revoke(req->r_request); + if (o != CEPH_HOMELESS_OSD) + osd = lookup_osd(&osdc->osds, o); + else + osd = &osdc->homeless_osd; + if (!osd) { + if (!wrlocked) + return ERR_PTR(-EAGAIN); - list_del_init(&req->r_osd_item); - maybe_move_osd_to_lru(req->r_osd); - if (list_empty(&req->r_linger_osd_item)) - req->r_osd = NULL; + osd = create_osd(osdc, o); + insert_osd(&osdc->osds, osd); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, + &osdc->osdmap->osd_addr[osd->o_osd]); } - list_del_init(&req->r_req_lru_item); - ceph_osdc_put_request(req); + dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd); + return osd; } /* - * Cancel a previously queued request message + * Create request <-> OSD session relation. + * + * @req has to be assigned a tid, @osd may be homeless. */ -static void __cancel_request(struct ceph_osd_request *req) +static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) { - if (req->r_sent && req->r_osd) { - ceph_msg_revoke(req->r_request); - req->r_sent = 0; - } + verify_osd_locked(osd); + WARN_ON(!req->r_tid || req->r_osd); + dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, + req, req->r_tid); + + if (!osd_homeless(osd)) + __remove_osd_from_lru(osd); + else + atomic_inc(&osd->o_osdc->num_homeless); + + get_osd(osd); + insert_request(&osd->o_requests, req); + req->r_osd = osd; } -static void __register_linger_request(struct ceph_osd_client *osdc, +static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) +{ + verify_osd_locked(osd); + WARN_ON(req->r_osd != osd); + dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, + req, req->r_tid); + + req->r_osd = NULL; + erase_request(&osd->o_requests, req); + put_osd(osd); + + if (!osd_homeless(osd)) + maybe_move_osd_to_lru(osd); + else + atomic_dec(&osd->o_osdc->num_homeless); +} + +static void __register_linger_request(struct ceph_osd *osd, struct ceph_osd_request *req) { dout("%s %p tid %llu\n", __func__, req, req->r_tid); WARN_ON(!req->r_linger); ceph_osdc_get_request(req); - list_add_tail(&req->r_linger_item, &osdc->req_linger); - if (req->r_osd) - list_add_tail(&req->r_linger_osd_item, - &req->r_osd->o_linger_requests); + list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger); + list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests); + __remove_osd_from_lru(osd); + req->r_osd = osd; } static void __unregister_linger_request(struct ceph_osd_client *osdc, @@ -1255,7 +1168,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (req->r_osd) { list_del_init(&req->r_linger_osd_item); maybe_move_osd_to_lru(req->r_osd); - if (list_empty(&req->r_osd_item)) + if (RB_EMPTY_ROOT(&req->r_osd->o_requests)) req->r_osd = NULL; } ceph_osdc_put_request(req); @@ -1291,11 +1204,20 @@ static bool have_pool_full(struct ceph_osd_client *osdc) return false; } +static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id) +{ + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; + + return __pool_full(pi); +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. - * - * Caller should hold map_sem for read. */ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, @@ -1421,87 +1343,6 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, return ct_res; } -static void __enqueue_request(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - - dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - - if (req->r_osd) { - __remove_osd_from_lru(req->r_osd); - list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); - list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); - } else { - list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); - } -} - -/* - * Pick an osd (the first 'up' osd in the pg), allocate the osd struct - * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. Move the request to the appropriate list - * (unsent, homeless) or leave on in-flight lru. - * - * Return 0 if unchanged, 1 if changed, or negative on error. - * - * Caller should hold map_sem for read and request_mutex. - */ -static int __map_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, int force_resend) -{ - enum calc_target_result ct_res; - int err; - - dout("map_request %p tid %lld\n", req, req->r_tid); - - ct_res = calc_target(osdc, &req->r_t, NULL, force_resend); - switch (ct_res) { - case CALC_TARGET_POOL_DNE: - list_move(&req->r_req_lru_item, &osdc->req_notarget); - return -EIO; - case CALC_TARGET_NO_ACTION: - return 0; /* no change */ - default: - BUG_ON(ct_res != CALC_TARGET_NEED_RESEND); - } - - dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd, - req->r_osd ? req->r_osd->o_osd : -1); - - if (req->r_osd) { - __cancel_request(req); - list_del_init(&req->r_osd_item); - list_del_init(&req->r_linger_osd_item); - req->r_osd = NULL; - } - - req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd); - if (!req->r_osd && req->r_t.osd >= 0) { - err = -ENOMEM; - req->r_osd = create_osd(osdc, req->r_t.osd); - if (!req->r_osd) { - list_move(&req->r_req_lru_item, &osdc->req_notarget); - goto out; - } - - dout("map_request osd %p is osd%d\n", req->r_osd, - req->r_osd->o_osd); - insert_osd(&osdc->osds, req->r_osd); - - ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd, - &osdc->osdmap->osd_addr[req->r_osd->o_osd]); - } - - __enqueue_request(req); - err = 1; /* osd or pg changed */ - -out: - return err; -} - static void setup_request_data(struct ceph_osd_request *req, struct ceph_msg *msg) { @@ -1648,8 +1489,16 @@ static void send_request(struct ceph_osd_request *req) { struct ceph_osd *osd = req->r_osd; + verify_osd_locked(osd); WARN_ON(osd->o_osd != req->r_t.osd); + /* + * We may have a previously queued request message hanging + * around. Cancel it to avoid corrupting the msgr. + */ + if (req->r_sent) + ceph_msg_revoke(req->r_request); + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; if (req->r_attempts) req->r_flags |= CEPH_OSD_FLAG_RETRY; @@ -1671,24 +1520,11 @@ static void send_request(struct ceph_osd_request *req) ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } -/* - * Send any requests in the queue (req_unsent). - */ -static void __send_queued(struct ceph_osd_client *osdc) -{ - struct ceph_osd_request *req, *tmp; - - dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); - send_request(req); - } -} - static void maybe_request_map(struct ceph_osd_client *osdc) { bool continuous = false; + verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || @@ -1705,38 +1541,121 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } -/* - * Caller should hold map_sem for read and request_mutex. - */ -static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { - int rc; + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd; + bool need_send = false; + bool promoted = false; - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - return rc; + WARN_ON(req->r_tid || req->r_got_reply); + dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); + +again: + calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); + if (IS_ERR(osd)) { + WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); + goto promote; } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + dout("req %p pausewr\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + dout("req %p pauserd\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | + CEPH_OSD_FLAG_FULL_FORCE)) && + (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.base_oloc.pool))) { + dout("req %p full/pool_full\n", req); + pr_warn_ratelimited("FULL or reached pool quota\n"); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if (!osd_homeless(osd)) { + need_send = true; } else { - __send_queued(osdc); + maybe_request_map(osdc); } - return 0; + mutex_lock(&osd->lock); + /* + * Assign the tid atomically with send_request() to protect + * multiple writes to the same object from racing with each + * other, resulting in out of order ops on the OSDs. + */ + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(osd, req); + if (need_send) + send_request(req); + mutex_unlock(&osd->lock); + + if (promoted) + downgrade_write(&osdc->lock); + return; + +promote: + up_read(&osdc->lock); + down_write(&osdc->lock); + wrlocked = true; + promoted = true; + goto again; +} + +static void account_request(struct ceph_osd_request *req) +{ + unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + WARN_ON(req->r_flags & mask); + req->r_flags |= CEPH_OSD_FLAG_ACK; + } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) + WARN_ON(!(req->r_flags & mask)); + else + WARN_ON(1); + + WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + atomic_inc(&req->r_osdc->num_requests); +} + +static void submit_request(struct ceph_osd_request *req, bool wrlocked) +{ + ceph_osdc_get_request(req); + account_request(req); + __submit_request(req, wrlocked); +} + +static void __finish_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd = req->r_osd; + + verify_osd_locked(osd); + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + unlink_request(osd, req); + atomic_dec(&osdc->num_requests); + + /* + * If an OSD has failed or returned and a request has been sent + * twice, it's possible to get a reply and end up here while the + * request message is queued for delivery. We will ignore the + * reply, so not a big deal, but better to try and catch it. + */ + ceph_msg_revoke(req->r_request); + ceph_msg_revoke_incoming(req->r_reply); +} + +static void finish_request(struct ceph_osd_request *req) +{ + __finish_request(req); + ceph_osdc_put_request(req); } static void __complete_request(struct ceph_osd_request *req) @@ -1747,6 +1666,13 @@ static void __complete_request(struct ceph_osd_request *req) complete_all(&req->r_completion); } +static void cancel_request(struct ceph_osd_request *req) +{ + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + finish_request(req); +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -1758,44 +1684,49 @@ static void handle_timeout(struct work_struct *work) struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_options *opts = osdc->client->options; - struct ceph_osd_request *req; - struct ceph_osd *osd; - struct list_head slow_osds; - dout("timeout\n"); - down_read(&osdc->map_sem); - - ceph_monc_request_next_osdmap(&osdc->client->monc); + unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; + LIST_HEAD(slow_osds); + struct rb_node *n, *p; - mutex_lock(&osdc->request_mutex); + dout("%s osdc %p\n", __func__, osdc); + down_write(&osdc->lock); /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen * a connection with that osd (from the fault callback). */ - INIT_LIST_HEAD(&slow_osds); - list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, - req->r_stamp + opts->osd_keepalive_timeout)) - break; + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + bool found = false; + + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (time_before(req->r_stamp, cutoff)) { + dout(" req %p tid %llu on osd%d is laggy\n", + req, req->r_tid, osd->o_osd); + found = true; + } + } - osd = req->r_osd; - BUG_ON(!osd); - dout(" tid %llu is slow, will send keepalive on osd%d\n", - req->r_tid, osd->o_osd); - list_move_tail(&osd->o_keepalive_item, &slow_osds); + if (found) + list_move_tail(&osd->o_keepalive_item, &slow_osds); } + + if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) + maybe_request_map(osdc); + while (!list_empty(&slow_osds)) { - osd = list_entry(slow_osds.next, struct ceph_osd, - o_keepalive_item); + struct ceph_osd *osd = list_first_entry(&slow_osds, + struct ceph_osd, + o_keepalive_item); list_del_init(&osd->o_keepalive_item); ceph_con_keepalive(&osd->o_con); } - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - + up_write(&osdc->lock); schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); } @@ -1809,18 +1740,17 @@ static void handle_osds_timeout(struct work_struct *work) struct ceph_osd *osd, *nosd; dout("%s osdc %p\n", __func__, osdc); - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - + down_write(&osdc->lock); list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; - remove_osd(osdc, osd); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); + WARN_ON(!list_empty(&osd->o_linger_requests)); + close_osd(osd); } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + up_write(&osdc->lock); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } @@ -2045,8 +1975,9 @@ static bool done_request(const struct ceph_osd_request *req, * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, * r_safe_completion r_safe_completion */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { + struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_osd_request *req; struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); @@ -2057,14 +1988,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("%s msg %p tid %llu\n", __func__, msg, tid); - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { - dout("%s no tid %llu\n", __func__, tid); - goto out_unlock; + dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid); + goto out_unlock_session; } - ceph_osdc_get_request(req); ret = decode_MOSDOpReply(msg, &m); if (ret) { @@ -2083,7 +2019,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", req, req->r_tid, m.retry_attempt, req->r_attempts - 1); - goto out_put; + goto out_unlock_session; } } else { WARN_ON(1); /* MOSDOpReply v4 is assumed */ @@ -2092,22 +2028,14 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (!ceph_oloc_empty(&m.redirect.oloc)) { dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, m.redirect.oloc.pool); - __unregister_request(osdc, req); + unlink_request(osd, req); + mutex_unlock(&osd->lock); ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); - - /* - * Start redirect requests with nofail=true. If - * mapping fails, request will end up on the notarget - * list, waiting for the new osdmap (which can take - * a while), even though the original request mapped - * successfully. In the future we might want to follow - * original request's nofail setting here. - */ - ret = __ceph_osdc_start_request(osdc, req, true); - BUG_ON(ret); - - goto out_put; + req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; } if (m.num_ops != req->r_num_ops) { @@ -2137,19 +2065,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_got_reply = true; } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_put; + goto out_unlock_session; } if (done_request(req, &m)) { - __unregister_request(osdc, req); + __finish_request(req); if (req->r_linger) { WARN_ON(req->r_unsafe_callback); - __register_linger_request(osdc, req); + __register_linger_request(osd, req); } } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); if (done_request(req, &m)) { if (already_acked && req->r_unsafe_callback) { @@ -2175,14 +2103,13 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) fail_request: req->r_result = -EIO; - __unregister_request(osdc, req); + __finish_request(req); __complete_request(req); complete_all(&req->r_safe_completion); -out_put: - ceph_osdc_put_request(req); -out_unlock: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); } static void set_pool_was_full(struct ceph_osd_client *osdc) @@ -2197,126 +2124,66 @@ static void set_pool_was_full(struct ceph_osd_client *osdc) } } -static void reset_changed_osds(struct ceph_osd_client *osdc) +static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) { - struct rb_node *p, *n; + struct ceph_pg_pool_info *pi; - dout("%s %p\n", __func__, osdc); - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } + return pi->was_full && !__pool_full(pi); } /* - * Requeue requests whose mapping to an OSD has changed. If requests map to - * no osd, request a new map. - * - * Caller should hold map_sem for read. + * Requeue requests whose mapping to an OSD has changed. */ -static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, - bool force_resend_writes) +static void scan_requests(struct ceph_osd *osd, + bool force_resend, + bool cleared_full, + bool check_pool_cleared_full, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { - struct ceph_osd_request *req, *nreq; - struct rb_node *p; - int needmap = 0; - int err; - bool force_resend_req; - - dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", - force_resend_writes ? " (force resend writes)" : ""); - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; ) { - req = rb_entry(p, struct ceph_osd_request, r_node); - p = rb_next(p); - - /* - * For linger requests that have not yet been - * registered, move them to the linger list; they'll - * be sent to the osd in the loop below. Unregister - * the request before re-registering it as a linger - * request to ensure the __map_request() below - * will decide it needs to be sent. - */ - if (req->r_linger && list_empty(&req->r_linger_item)) { - dout("%p tid %llu restart on osd%d\n", - req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - ceph_osdc_get_request(req); - __unregister_request(osdc, req); - __register_linger_request(osdc, req); - ceph_osdc_put_request(req); - continue; - } - - force_resend_req = force_resend || - (force_resend_writes && - req->r_flags & CEPH_OSD_FLAG_WRITE); - err = __map_request(osdc, req, force_resend_req); - if (err < 0) - continue; /* error */ - if (req->r_osd == NULL) { - dout("%p tid %llu maps to no osd\n", req, req->r_tid); - needmap++; /* request a newer map */ - } else if (err > 0) { - if (!req->r_linger) { - dout("%p tid %llu requeued on osd%d\n", req, - req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } - } - } - - list_for_each_entry_safe(req, nreq, &osdc->req_linger, - r_linger_item) { - dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - - err = __map_request(osdc, req, - force_resend || force_resend_writes); - dout("__map_request returned %d\n", err); - if (err < 0) - continue; /* hrm! */ - if (req->r_osd == NULL || err > 0) { - if (req->r_osd == NULL) { - dout("lingering %p tid %llu maps to no osd\n", - req, req->r_tid); - /* - * A homeless lingering request makes - * no sense, as it's job is to keep - * a particular OSD connection open. - * Request a newer map and kick the - * request, knowing that it won't be - * resent until we actually get a map - * that can tell us where to send it. - */ - needmap++; - } - - dout("kicking lingering %p tid %llu osd%d\n", req, - req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; + bool force_resend_writes; + + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + enum calc_target_result ct_res; + + n = rb_next(n); /* unlink_request() */ + + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + ct_res = calc_target(osdc, &req->r_t, + &req->r_last_force_resend, false); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, req->r_t.base_oloc.pool)); + if (!force_resend && + (!(req->r_flags & CEPH_OSD_FLAG_WRITE) || + !force_resend_writes)) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + unlink_request(osd, req); + insert_request(need_resend, req); + break; + case CALC_TARGET_POOL_DNE: + break; } } - reset_changed_osds(osdc); - mutex_unlock(&osdc->request_mutex); - - if (needmap) { - dout("%d requests for down osds, need new map\n", needmap); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } } static int handle_one_map(struct ceph_osd_client *osdc, - void *p, void *end, bool incremental) + void *p, void *end, bool incremental, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { struct ceph_osdmap *newmap; struct rb_node *n; @@ -2362,11 +2229,51 @@ static int handle_one_map(struct ceph_osd_client *osdc, } was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); - kick_requests(osdc, skipped_map, was_full); + scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, + need_resend, need_resend_linger); + + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); /* close_osd() */ + + scan_requests(osd, skipped_map, was_full, true, need_resend, + need_resend_linger); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, osd->o_osd), + sizeof(struct ceph_entity_addr))) + close_osd(osd); + } return 0; } +static void kick_requests(struct ceph_osd_client *osdc, + struct rb_root *need_resend, + struct list_head *need_resend_linger) +{ + struct rb_node *n; + + for (n = rb_first(need_resend); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + struct ceph_osd *osd; + + n = rb_next(n); + erase_request(need_resend, req); /* before link_request() */ + + WARN_ON(req->r_osd); + calc_target(osdc, &req->r_t, NULL, false); + osd = lookup_create_osd(osdc, req->r_t.osd, true); + link_request(osd, req); + if (!req->r_linger) { + if (!osd_homeless(osd) && !req->r_t.paused) + send_request(req); + } + } +} + /* * Process updated osd map. * @@ -2381,13 +2288,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 nr_maps, maplen; u32 epoch; struct ceph_fsid fsid; + struct rb_root need_resend = RB_ROOT; + LIST_HEAD(need_resend_linger); bool handled_incremental = false; bool was_pauserd, was_pausewr; bool pauserd, pausewr; int err; dout("%s have %u\n", __func__, osdc->osdmap->epoch); - down_write(&osdc->map_sem); + down_write(&osdc->lock); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); @@ -2412,7 +2321,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); - err = handle_one_map(osdc, p, p + maplen, true); + err = handle_one_map(osdc, p, p + maplen, true, + &need_resend, &need_resend_linger); if (err) goto bad; handled_incremental = true; @@ -2443,7 +2353,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap->epoch); } else { dout("taking full map %u len %d\n", epoch, maplen); - err = handle_one_map(osdc, p, p + maplen, false); + err = handle_one_map(osdc, p, p + maplen, false, + &need_resend, &need_resend_linger); if (err) goto bad; } @@ -2464,20 +2375,60 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (was_pauserd || was_pausewr || pauserd || pausewr) maybe_request_map(osdc); - mutex_lock(&osdc->request_mutex); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); + kick_requests(osdc, &need_resend, &need_resend_linger); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch); - up_write(&osdc->map_sem); + up_write(&osdc->lock); wake_up_all(&osdc->client->auth_wq); return; bad: pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); - up_write(&osdc->map_sem); + up_write(&osdc->lock); +} + +/* + * Resubmit requests pending on the given osd. + */ +static void kick_osd_requests(struct ceph_osd *osd) +{ + struct rb_node *n; + + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + if (!req->r_linger) { + if (!req->r_t.paused) + send_request(req); + } + } +} + +/* + * If the osd connection drops, we need to resubmit all requests. + */ +static void osd_fault(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + down_write(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock; + } + + if (!reopen_osd(osd)) + kick_osd_requests(osd); + maybe_request_map(osdc); + +out_unlock: + up_write(&osdc->lock); } /* @@ -2680,17 +2631,11 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail) { - int rc; - - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - - rc = __ceph_osdc_start_request(osdc, req, nofail); + down_read(&osdc->lock); + submit_request(req, false); + up_read(&osdc->lock); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - - return rc; + return 0; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -2703,13 +2648,12 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; - mutex_lock(&osdc->request_mutex); + down_write(&osdc->lock); if (req->r_linger) __unregister_linger_request(osdc, req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); - - dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); + if (req->r_osd) + cancel_request(req); + up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_cancel_request); @@ -2744,32 +2688,40 @@ EXPORT_SYMBOL(ceph_osdc_wait_request); */ void ceph_osdc_sync(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; - u64 last_tid, next_tid = 0; + struct rb_node *n, *p; + u64 last_tid = atomic64_read(&osdc->last_tid); - mutex_lock(&osdc->request_mutex); - last_tid = osdc->last_tid; - while (1) { - req = __lookup_request_ge(osdc, next_tid); - if (!req) - break; - if (req->r_tid > last_tid) - break; +again: + down_read(&osdc->lock); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + mutex_lock(&osd->lock); + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (req->r_tid > last_tid) + break; + + if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) + continue; - next_tid = req->r_tid + 1; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) - continue; + ceph_osdc_get_request(req); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); + dout("%s waiting on req %p tid %llu last_tid %llu\n", + __func__, req, req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + ceph_osdc_put_request(req); + goto again; + } - ceph_osdc_get_request(req); - mutex_unlock(&osdc->request_mutex); - dout("sync waiting on tid %llu (last is %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - mutex_lock(&osdc->request_mutex); - ceph_osdc_put_request(req); + mutex_unlock(&osd->lock); } - mutex_unlock(&osdc->request_mutex); - dout("sync done (thru tid %llu)\n", last_tid); + + up_read(&osdc->lock); + dout("%s done last_tid %llu\n", __func__, last_tid); } EXPORT_SYMBOL(ceph_osdc_sync); @@ -2793,18 +2745,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - init_rwsem(&osdc->map_sem); - mutex_init(&osdc->request_mutex); - osdc->last_tid = 0; + init_rwsem(&osdc->lock); osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); spin_lock_init(&osdc->osd_lru_lock); - osdc->requests = RB_ROOT; - INIT_LIST_HEAD(&osdc->req_lru); - INIT_LIST_HEAD(&osdc->req_unsent); - INIT_LIST_HEAD(&osdc->req_notarget); INIT_LIST_HEAD(&osdc->req_linger); - osdc->num_requests = 0; + osd_init(&osdc->homeless_osd); + osdc->homeless_osd.o_osdc = osdc; + osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); spin_lock_init(&osdc->event_lock); @@ -2861,13 +2809,19 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); - mutex_lock(&osdc->request_mutex); + down_write(&osdc->lock); while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd, o_node); - remove_osd(osdc, osd); + close_osd(osd); } - mutex_unlock(&osdc->request_mutex); + up_write(&osdc->lock); + WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + osd_cleanup(&osdc->homeless_osd); + + WARN_ON(!list_empty(&osdc->osd_lru)); + WARN_ON(atomic_read(&osdc->num_requests)); + WARN_ON(atomic_read(&osdc->num_homeless)); ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); @@ -2982,19 +2936,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup); static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; + struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(msg->hdr.type); - if (!osd) - goto out; - osdc = osd->o_osdc; - switch (type) { case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg); + handle_reply(osd, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -3004,7 +2954,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } -out: + ceph_msg_put(msg); } @@ -3019,21 +2969,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; - struct ceph_msg *m; + struct ceph_msg *m = NULL; struct ceph_osd_request *req; int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); - u64 tid; + u64 tid = le64_to_cpu(hdr->tid); - tid = le64_to_cpu(hdr->tid); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd); + *skip = 1; + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); - m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } ceph_msg_revoke_incoming(req->r_reply); @@ -3045,7 +3001,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) - goto out; + goto out_unlock_session; ceph_msg_put(req->r_reply); req->r_reply = m; } @@ -3056,14 +3012,16 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply->data_length); m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); -out: - mutex_unlock(&osdc->request_mutex); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); return m; } @@ -3083,8 +3041,8 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: - pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, - osd->o_osd); + pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__, + osd->o_osd, type); *skip = 1; return NULL; } @@ -3188,5 +3146,5 @@ static const struct ceph_connection_operations osd_con_ops = { .alloc_msg = alloc_msg, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_reset, + .fault = osd_fault, }; From 3540bfdb30fcb423a92cc959708e71bc39eb18c3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: [PATCH 30/71] libceph: request_init() and request_release_checks() These are going to be used by request_reinit() code. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 44 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4c856c87b1a961..46e6fb5a129969 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -354,6 +354,15 @@ static void target_destroy(struct ceph_osd_request_target *t) /* * requests */ +static void request_release_checks(struct ceph_osd_request *req) +{ + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!list_empty(&req->r_linger_item)); + WARN_ON(!list_empty(&req->r_linger_osd_item)); + WARN_ON(!list_empty(&req->r_unsafe_item)); + WARN_ON(req->r_osd); +} + static void ceph_osdc_release_request(struct kref *kref) { struct ceph_osd_request *req = container_of(kref, @@ -362,10 +371,7 @@ static void ceph_osdc_release_request(struct kref *kref) dout("%s %p (r_request %p r_reply %p)\n", __func__, req, req->r_request, req->r_reply); - WARN_ON(!RB_EMPTY_NODE(&req->r_node)); - WARN_ON(!list_empty(&req->r_linger_item)); - WARN_ON(!list_empty(&req->r_linger_osd_item)); - WARN_ON(req->r_osd); + request_release_checks(req); if (req->r_request) ceph_msg_put(req->r_request); @@ -404,6 +410,22 @@ void ceph_osdc_put_request(struct ceph_osd_request *req) } EXPORT_SYMBOL(ceph_osdc_put_request); +static void request_init(struct ceph_osd_request *req) +{ + /* req only, each op is zeroed in _osd_req_op_init() */ + memset(req, 0, sizeof(*req)); + + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + RB_CLEAR_NODE(&req->r_node); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd_item); + INIT_LIST_HEAD(&req->r_unsafe_item); + + target_init(&req->r_t); +} + struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -425,25 +447,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, if (unlikely(!req)) return NULL; - /* req only, each op is zeroed in _osd_req_op_init() */ - memset(req, 0, sizeof(*req)); - + request_init(req); req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; req->r_snapid = CEPH_NOSNAP; req->r_snapc = ceph_get_snap_context(snapc); - kref_init(&req->r_kref); - init_completion(&req->r_completion); - init_completion(&req->r_safe_completion); - RB_CLEAR_NODE(&req->r_node); - INIT_LIST_HEAD(&req->r_unsafe_item); - INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd_item); - - target_init(&req->r_t); - dout("%s req %p\n", __func__, req); return req; } From 42b0696527c49a109e9558162b8d109ae257d402 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: [PATCH 31/71] libceph: wait_request_timeout() The unwatch timeout is currently implemented in rbd. With watch/unwatch code moving into libceph, we are going to need a ceph_osdc_wait_request() variant with a timeout. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 46e6fb5a129969..ef1bcbe9af2d40 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2668,28 +2668,36 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req) EXPORT_SYMBOL(ceph_osdc_cancel_request); /* - * wait for a request to complete + * @timeout: in jiffies, 0 means "wait forever" */ -int ceph_osdc_wait_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static int wait_request_timeout(struct ceph_osd_request *req, + unsigned long timeout) { - int rc; - - dout("%s %p tid %llu\n", __func__, req, req->r_tid); + long left; - rc = wait_for_completion_interruptible(&req->r_completion); - if (rc < 0) { - dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + left = wait_for_completion_interruptible_timeout(&req->r_completion, + ceph_timeout_jiffies(timeout)); + if (left <= 0) { + left = left ?: -ETIMEDOUT; ceph_osdc_cancel_request(req); /* kludge - need to to wake ceph_osdc_sync() */ complete_all(&req->r_safe_completion); - return rc; + } else { + left = req->r_result; /* completed */ } - dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, - req->r_result); - return req->r_result; + return left; +} + +/* + * wait for a request to complete + */ +int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + return wait_request_timeout(req, 0); } EXPORT_SYMBOL(ceph_osdc_wait_request); From c525f03601f52c83ded046624138f2a45e0ba56c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: [PATCH 32/71] rbd: rbd_dev_header_unwatch_sync() variant Introduce __rbd_dev_header_unwatch_sync(), which doesn't flush notify callbacks. This is for the new rados_watcherrcb_t, which would be called from a notify callback. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 82b03aa509e66e..fce23dc908e33e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3246,10 +3246,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) return 0; } -/* - * Tear down a watch request, synchronously. - */ -static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { struct rbd_obj_request *obj_request; @@ -3269,6 +3266,14 @@ static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; +} + +/* + * Tear down a watch request, synchronously. + */ +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + __rbd_dev_header_unwatch_sync(rbd_dev); dout("%s flushing notifies\n", __func__); ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); From 922dab6134178cae317ae00de86376cba59f3147 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 01:15:02 +0200 Subject: [PATCH 33/71] libceph, rbd: ceph_osd_linger_request, watch/notify v2 This adds support and switches rbd to a new, more reliable version of watch/notify protocol. As with the OSD client update, this is mostly about getting the right structures linked into the right places so that reconnects are properly sent when needed. watch/notify v2 also requires sending regular pings to the OSDs - send_linger_ping(). A major change from the old watch/notify implementation is the introduction of ceph_osd_linger_request - linger requests no longer piggy back on ceph_osd_request. ceph_osd_event has been merged into ceph_osd_linger_request. All the details are now hidden within libceph, the interface consists of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack(). ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep the lifetime management simple. ceph_osdc_notify_ack() accepts an optional data payload, which is relayed back to the notifier. Portions of this patch are loosely based on work by Douglas Fuller and Mike Christie . Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 179 ++--- include/linux/ceph/ceph_fs.h | 5 +- include/linux/ceph/osd_client.h | 97 +-- include/linux/ceph/rados.h | 17 +- net/ceph/ceph_strings.c | 16 + net/ceph/debugfs.c | 36 + net/ceph/osd_client.c | 1148 ++++++++++++++++++++++++------- 7 files changed, 1067 insertions(+), 431 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fce23dc908e33e..d0834c477f9684 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -351,11 +351,11 @@ struct rbd_device { struct rbd_options *opts; struct ceph_object_id header_oid; + struct ceph_object_locator header_oloc; struct ceph_file_layout layout; - struct ceph_osd_event *watch_event; - struct rbd_obj_request *watch_request; + struct ceph_osd_linger_request *watch_handle; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return __rbd_obj_request_wait(obj_request, 0); } -static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, - unsigned long timeout) -{ - return __rbd_obj_request_wait(obj_request, timeout); -} - static void rbd_img_request_complete(struct rbd_img_request *img_request) { @@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); -} - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = NULL; @@ -1877,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) case CEPH_OSD_OP_CALL: rbd_osd_call_callback(obj_request); break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - rbd_osd_trivial_callback(obj_request); - break; default: rbd_warn(NULL, "%s: unsupported op %hu", obj_request->object_name, (unsigned short) opcode); @@ -3100,45 +3084,18 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } -static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) -{ - struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - int ret; - - obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return -ENOMEM; +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev); +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev); - ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, - notify_id, 0, 0); - rbd_osd_req_format_read(obj_request); - - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; - ret = rbd_obj_request_wait(obj_request); -out: - rbd_obj_request_put(obj_request); - - return ret; -} - -static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len) { - struct rbd_device *rbd_dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = arg; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_oid.name, (unsigned long long)notify_id, - (unsigned int)opcode); + dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev, + cookie, notify_id); /* * Until adequate refresh error handling is in place, there is @@ -3150,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (ret) rbd_warn(rbd_dev, "refresh failed: %d", ret); - ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); + ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, notify_id, cookie, + NULL, 0); if (ret) rbd_warn(rbd_dev, "notify_ack ret %d", ret); } -/* - * Send a (un)watch request and wait for the ack. Return a request - * with a ref held on success or error. - */ -static struct rbd_obj_request *rbd_obj_watch_request_helper( - struct rbd_device *rbd_dev, - bool watch) +static void rbd_watch_errcb(void *arg, u64 cookie, int err) { - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct ceph_options *opts = osdc->client->options; - struct rbd_obj_request *obj_request; + struct rbd_device *rbd_dev = arg; int ret; - obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return ERR_PTR(-ENOMEM); - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, - obj_request); - if (!obj_request->osd_req) { - ret = -ENOMEM; - goto out; - } - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, watch); - rbd_osd_req_format_write(obj_request); - - if (watch) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; + rbd_warn(rbd_dev, "encountered watch error: %d", err); - ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); - if (ret) - goto out; + __rbd_dev_header_unwatch_sync(rbd_dev); - ret = obj_request->result; + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) { - if (watch) - rbd_obj_request_end(obj_request); - goto out; + rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); + return; } - return obj_request; - -out: - rbd_obj_request_put(obj_request); - return ERR_PTR(ret); + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); } /* @@ -3215,57 +3140,33 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct rbd_obj_request *obj_request; - int ret; + struct ceph_osd_linger_request *handle; - rbd_assert(!rbd_dev->watch_event); - rbd_assert(!rbd_dev->watch_request); - - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; - - obj_request = rbd_obj_watch_request_helper(rbd_dev, true); - if (IS_ERR(obj_request)) { - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; - return PTR_ERR(obj_request); - } + rbd_assert(!rbd_dev->watch_handle); - /* - * A watch request is set to linger, so the underlying osd - * request won't go away until we unregister it. We retain - * a pointer to the object request during that time (in - * rbd_dev->watch_request), so we'll keep a reference to it. - * We'll drop that reference after we've unregistered it in - * rbd_dev_header_unwatch_sync(). - */ - rbd_dev->watch_request = obj_request; + handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, rbd_watch_cb, + rbd_watch_errcb, rbd_dev); + if (IS_ERR(handle)) + return PTR_ERR(handle); + rbd_dev->watch_handle = handle; return 0; } static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { - struct rbd_obj_request *obj_request; - - rbd_assert(rbd_dev->watch_event); - rbd_assert(rbd_dev->watch_request); + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - rbd_obj_request_end(rbd_dev->watch_request); - rbd_obj_request_put(rbd_dev->watch_request); - rbd_dev->watch_request = NULL; + if (!rbd_dev->watch_handle) + return; - obj_request = rbd_obj_watch_request_helper(rbd_dev, false); - if (!IS_ERR(obj_request)) - rbd_obj_request_put(obj_request); - else - rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", - PTR_ERR(obj_request)); + ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); + if (ret) + rbd_warn(rbd_dev, "failed to unwatch: %d", ret); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; + rbd_dev->watch_handle = NULL; } /* @@ -4081,6 +3982,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, init_rwsem(&rbd_dev->header_rwsem); ceph_oid_init(&rbd_dev->header_oid); + ceph_oloc_init(&rbd_dev->header_oloc); rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.type = &rbd_device_type; @@ -5285,6 +5187,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); if (rbd_dev->image_format == 1) ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", spec->image_name, RBD_SUFFIX); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 37f28bf55ce426..3b911ff889ddb4 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -153,8 +153,9 @@ struct ceph_dir_layout { /* watch-notify operations */ enum { - WATCH_NOTIFY = 1, /* notifying watcher */ - WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ }; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 342f22f1f0400f..cd2dcb8939de8e 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -34,7 +34,7 @@ struct ceph_osd { struct rb_node o_node; struct ceph_connection o_con; struct rb_root o_requests; - struct list_head o_linger_requests; + struct rb_root o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; @@ -108,11 +108,12 @@ struct ceph_osd_req_op { } cls; struct { u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; + __u8 op; /* CEPH_OSD_WATCH_OP_ */ + u32 gen; } watch; + struct { + struct ceph_osd_data request_data; + } notify_ack; struct { u64 expected_object_size; u64 expected_write_size; @@ -145,8 +146,6 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_linger_item; - struct list_head r_linger_osd_item; struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -162,7 +161,6 @@ struct ceph_osd_request { int r_result; bool r_got_reply; - int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; @@ -181,6 +179,7 @@ struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* for writes */ struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ + bool r_linger; /* don't resend on failure */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ @@ -195,23 +194,40 @@ struct ceph_request_redirect { struct ceph_object_locator oloc; }; -struct ceph_osd_event { - u64 cookie; - int one_shot; +typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len); +typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); + +struct ceph_osd_linger_request { struct ceph_osd_client *osdc; - void (*cb)(u64, u64, u8, void *); - void *data; - struct rb_node node; - struct list_head osd_node; + u64 linger_id; + bool committed; + + struct ceph_osd *osd; + struct ceph_osd_request *reg_req; + struct ceph_osd_request *ping_req; + unsigned long ping_sent; + + struct ceph_osd_request_target t; + u32 last_force_resend; + + struct timespec mtime; + struct kref kref; -}; + struct mutex lock; + struct rb_node node; /* osd */ + struct rb_node osdc_node; /* osdc */ + struct list_head scan_item; + + struct completion reg_commit_wait; + int reg_commit_error; + int last_error; + + u32 register_gen; -struct ceph_osd_event_work { - struct work_struct work; - struct ceph_osd_event *event; - u64 ver; - u64 notify_id; - u8 opcode; + rados_watchcb2_t wcb; + rados_watcherrcb_t errcb; + void *data; }; struct ceph_osd_client { @@ -223,9 +239,10 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; - struct list_head req_linger; /* lingering requests */ struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ + u64 last_linger_id; + struct rb_root linger_requests; /* lingering requests */ atomic_t num_requests; atomic_t num_homeless; struct delayed_work timeout_work; @@ -239,10 +256,6 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op_reply; - spinlock_t event_lock; - struct rb_root event_tree; - u64 event_count; - struct workqueue_struct *notify_wq; }; @@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode); -extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag); extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, @@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, u32 truncate_seq, u64 truncate_size, bool use_mempool); -extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); - extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); @@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct timespec *mtime, struct page **pages, int nr_pages); -/* watch/notify events */ -extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent); -extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); -extern void ceph_osdc_put_event(struct ceph_osd_event *event); +/* watch/notify */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data); +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 28740a58f32cd1..204c8c94470353 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -427,7 +427,17 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; -#define RADOS_NOTIFY_VER 1 +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +const char *ceph_osd_watch_op_name(int o); /* * an individual object operation. each may be accompanied by some data @@ -462,8 +472,9 @@ struct ceph_osd_op { } __attribute__ ((packed)) snap; struct { __le64 cookie; - __le64 ver; - __u8 flag; /* 0 = unwatch, 1 = watch */ + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; struct { __le64 offset, length; diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 139a9cb19b0c6c..3773a4fa11e35b 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) } } +const char *ceph_osd_watch_op_name(int o) +{ + switch (o) { + case CEPH_OSD_WATCH_OP_UNWATCH: + return "unwatch"; + case CEPH_OSD_WATCH_OP_WATCH: + return "watch"; + case CEPH_OSD_WATCH_OP_RECONNECT: + return "reconnect"; + case CEPH_OSD_WATCH_OP_PING: + return "ping"; + default: + return "???"; + } +} + const char *ceph_osd_state_name(int s) { switch (s) { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 61dbd9de4650b9..e64cb858353367 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -177,6 +177,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), ceph_osd_op_name(op->op)); + if (op->op == CEPH_OSD_OP_WATCH) + seq_printf(s, "-%s", + ceph_osd_watch_op_name(op->watch.op)); } seq_putc(s, '\n'); @@ -197,6 +200,31 @@ static void dump_requests(struct seq_file *s, struct ceph_osd *osd) mutex_unlock(&osd->lock); } +static void dump_linger_request(struct seq_file *s, + struct ceph_osd_linger_request *lreq) +{ + seq_printf(s, "%llu\t", lreq->linger_id); + dump_target(s, &lreq->t); + + seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen, + lreq->committed ? "C" : "", lreq->last_error); +} + +static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + dump_linger_request(s, lreq); + } + + mutex_unlock(&osd->lock); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; @@ -214,6 +242,14 @@ static int osdc_show(struct seq_file *s, void *pp) } dump_requests(s, &osdc->homeless_osd); + seq_puts(s, "LINGER REQUESTS\n"); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + dump_linger_requests(s, osd); + } + dump_linger_requests(s, &osdc->homeless_osd); + up_read(&osdc->lock); return 0; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ef1bcbe9af2d40..ca0a7b58ba4f8b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -45,6 +45,10 @@ static const struct ceph_connection_operations osd_con_ops; static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); +static void link_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq); +static void unlink_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq); #if 1 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) @@ -74,10 +78,15 @@ static inline void verify_osd_locked(struct ceph_osd *osd) rwsem_is_locked(&osdc->lock)) && !rwsem_is_wrlocked(&osdc->lock)); } +static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) +{ + WARN_ON(!mutex_is_locked(&lreq->lock)); +} #else static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } static inline void verify_osd_locked(struct ceph_osd *osd) { } +static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { } #endif /* @@ -322,6 +331,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_STAT: ceph_osd_data_release(&op->raw_data_in); break; + case CEPH_OSD_OP_NOTIFY_ACK: + ceph_osd_data_release(&op->notify_ack.request_data); + break; default: break; } @@ -345,6 +357,29 @@ static void target_init(struct ceph_osd_request_target *t) t->osd = CEPH_HOMELESS_OSD; } +static void target_copy(struct ceph_osd_request_target *dest, + const struct ceph_osd_request_target *src) +{ + ceph_oid_copy(&dest->base_oid, &src->base_oid); + ceph_oloc_copy(&dest->base_oloc, &src->base_oloc); + ceph_oid_copy(&dest->target_oid, &src->target_oid); + ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); + + dest->pgid = src->pgid; /* struct */ + dest->pg_num = src->pg_num; + dest->pg_num_mask = src->pg_num_mask; + ceph_osds_copy(&dest->acting, &src->acting); + ceph_osds_copy(&dest->up, &src->up); + dest->size = src->size; + dest->min_size = src->min_size; + dest->sort_bitwise = src->sort_bitwise; + + dest->flags = src->flags; + dest->paused = src->paused; + + dest->osd = src->osd; +} + static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); @@ -357,8 +392,6 @@ static void target_destroy(struct ceph_osd_request_target *t) static void request_release_checks(struct ceph_osd_request *req) { WARN_ON(!RB_EMPTY_NODE(&req->r_node)); - WARN_ON(!list_empty(&req->r_linger_item)); - WARN_ON(!list_empty(&req->r_linger_osd_item)); WARN_ON(!list_empty(&req->r_unsafe_item)); WARN_ON(req->r_osd); } @@ -419,13 +452,48 @@ static void request_init(struct ceph_osd_request *req) init_completion(&req->r_completion); init_completion(&req->r_safe_completion); RB_CLEAR_NODE(&req->r_node); - INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd_item); INIT_LIST_HEAD(&req->r_unsafe_item); target_init(&req->r_t); } +/* + * This is ugly, but it allows us to reuse linger registration and ping + * requests, keeping the structure of the code around send_linger{_ping}() + * reasonable. Setting up a min_nr=2 mempool for each linger request + * and dealing with copying ops (this blasts req only, watch op remains + * intact) isn't any better. + */ +static void request_reinit(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + bool mempool = req->r_mempool; + unsigned int num_ops = req->r_num_ops; + u64 snapid = req->r_snapid; + struct ceph_snap_context *snapc = req->r_snapc; + bool linger = req->r_linger; + struct ceph_msg *request_msg = req->r_request; + struct ceph_msg *reply_msg = req->r_reply; + + dout("%s req %p\n", __func__, req); + WARN_ON(atomic_read(&req->r_kref.refcount) != 1); + request_release_checks(req); + + WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); + WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); + target_destroy(&req->r_t); + + request_init(req); + req->r_osdc = osdc; + req->r_mempool = mempool; + req->r_num_ops = num_ops; + req->r_snapid = snapid; + req->r_snapc = snapc; + req->r_linger = linger; + req->r_request = request_msg; + req->r_reply = reply_msg; +} + struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -681,21 +749,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, } EXPORT_SYMBOL(osd_req_op_xattr_init); -void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag) +/* + * @watch_opcode: CEPH_OSD_WATCH_OP_* + */ +static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, + u64 cookie, u8 watch_opcode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); - - BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + struct ceph_osd_req_op *op; + op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; - op->watch.ver = version; - if (opcode == CEPH_OSD_OP_WATCH && flag) - op->watch.flag = (u8)1; + op->watch.op = watch_opcode; + op->watch.gen = 0; } -EXPORT_SYMBOL(osd_req_op_watch_init); void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, @@ -771,11 +837,13 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, break; case CEPH_OSD_OP_STARTSYNC: break; - case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); - dst->watch.ver = cpu_to_le64(src->watch.ver); - dst->watch.flag = src->watch.flag; + dst->watch.ver = cpu_to_le64(0); + dst->watch.op = src->watch.op; + dst->watch.gen = cpu_to_le32(src->watch.gen); + break; + case CEPH_OSD_OP_NOTIFY_ACK: break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = @@ -915,7 +983,7 @@ static void osd_init(struct ceph_osd *osd) atomic_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; - INIT_LIST_HEAD(&osd->o_linger_requests); + osd->o_linger_requests = RB_ROOT; INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; @@ -926,7 +994,7 @@ static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); @@ -996,7 +1064,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd) static void maybe_move_osd_to_lru(struct ceph_osd *osd) { if (RB_EMPTY_ROOT(&osd->o_requests) && - list_empty(&osd->o_linger_requests)) + RB_EMPTY_ROOT(&osd->o_linger_requests)) __move_osd_to_lru(osd); } @@ -1036,6 +1104,17 @@ static void close_osd(struct ceph_osd *osd) unlink_request(osd, req); link_request(&osdc->homeless_osd, req); } + for (n = rb_first(&osd->o_linger_requests); n; ) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + n = rb_next(n); /* unlink_linger() */ + + dout(" reassigning lreq %p linger_id %llu\n", lreq, + lreq->linger_id); + unlink_linger(osd, lreq); + link_linger(&osdc->homeless_osd, lreq); + } __remove_osd_from_lru(osd); erase_osd(&osdc->osds, osd); @@ -1052,7 +1131,7 @@ static int reopen_osd(struct ceph_osd *osd) dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); if (RB_EMPTY_ROOT(&osd->o_requests) && - list_empty(&osd->o_linger_requests)) { + RB_EMPTY_ROOT(&osd->o_linger_requests)) { close_osd(osd); return -ENODEV; } @@ -1148,52 +1227,6 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) atomic_dec(&osd->o_osdc->num_homeless); } -static void __register_linger_request(struct ceph_osd *osd, - struct ceph_osd_request *req) -{ - dout("%s %p tid %llu\n", __func__, req, req->r_tid); - WARN_ON(!req->r_linger); - - ceph_osdc_get_request(req); - list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger); - list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests); - __remove_osd_from_lru(osd); - req->r_osd = osd; -} - -static void __unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - WARN_ON(!req->r_linger); - - if (list_empty(&req->r_linger_item)) { - dout("%s %p tid %llu not registered\n", __func__, req, - req->r_tid); - return; - } - - dout("%s %p tid %llu\n", __func__, req, req->r_tid); - list_del_init(&req->r_linger_item); - - if (req->r_osd) { - list_del_init(&req->r_linger_osd_item); - maybe_move_osd_to_lru(req->r_osd); - if (RB_EMPTY_ROOT(&req->r_osd->o_requests)) - req->r_osd = NULL; - } - ceph_osdc_put_request(req); -} - -void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - if (!req->r_linger) { - dout("set_request_linger %p\n", req); - req->r_linger = 1; - } -} -EXPORT_SYMBOL(ceph_osdc_set_request_linger); - static bool __pool_full(struct ceph_pg_pool_info *pi) { return pi->flags & CEPH_POOL_FLAG_FULL; @@ -1379,6 +1412,10 @@ static void setup_request_data(struct ceph_osd_request *req, op->xattr.value_len); ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); break; + case CEPH_OSD_OP_NOTIFY_ACK: + ceph_osdc_msg_data_add(msg, + &op->notify_ack.request_data); + break; /* reply */ case CEPH_OSD_OP_STAT: @@ -1683,6 +1720,460 @@ static void cancel_request(struct ceph_osd_request *req) finish_request(req); } +/* + * lingering requests, watch/notify v2 infrastructure + */ +static void linger_release(struct kref *kref) +{ + struct ceph_osd_linger_request *lreq = + container_of(kref, struct ceph_osd_linger_request, kref); + + dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq, + lreq->reg_req, lreq->ping_req); + WARN_ON(!RB_EMPTY_NODE(&lreq->node)); + WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); + WARN_ON(!list_empty(&lreq->scan_item)); + WARN_ON(lreq->osd); + + if (lreq->reg_req) + ceph_osdc_put_request(lreq->reg_req); + if (lreq->ping_req) + ceph_osdc_put_request(lreq->ping_req); + target_destroy(&lreq->t); + kfree(lreq); +} + +static void linger_put(struct ceph_osd_linger_request *lreq) +{ + if (lreq) + kref_put(&lreq->kref, linger_release); +} + +static struct ceph_osd_linger_request * +linger_get(struct ceph_osd_linger_request *lreq) +{ + kref_get(&lreq->kref); + return lreq; +} + +static struct ceph_osd_linger_request * +linger_alloc(struct ceph_osd_client *osdc) +{ + struct ceph_osd_linger_request *lreq; + + lreq = kzalloc(sizeof(*lreq), GFP_NOIO); + if (!lreq) + return NULL; + + kref_init(&lreq->kref); + mutex_init(&lreq->lock); + RB_CLEAR_NODE(&lreq->node); + RB_CLEAR_NODE(&lreq->osdc_node); + INIT_LIST_HEAD(&lreq->scan_item); + init_completion(&lreq->reg_commit_wait); + + lreq->osdc = osdc; + target_init(&lreq->t); + + dout("%s lreq %p\n", __func__, lreq); + return lreq; +} + +DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) +DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) + +/* + * Create linger request <-> OSD session relation. + * + * @lreq has to be registered, @osd may be homeless. + */ +static void link_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq) +{ + verify_osd_locked(osd); + WARN_ON(!lreq->linger_id || lreq->osd); + dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, + osd->o_osd, lreq, lreq->linger_id); + + if (!osd_homeless(osd)) + __remove_osd_from_lru(osd); + else + atomic_inc(&osd->o_osdc->num_homeless); + + get_osd(osd); + insert_linger(&osd->o_linger_requests, lreq); + lreq->osd = osd; +} + +static void unlink_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq) +{ + verify_osd_locked(osd); + WARN_ON(lreq->osd != osd); + dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, + osd->o_osd, lreq, lreq->linger_id); + + lreq->osd = NULL; + erase_linger(&osd->o_linger_requests, lreq); + put_osd(osd); + + if (!osd_homeless(osd)) + maybe_move_osd_to_lru(osd); + else + atomic_dec(&osd->o_osdc->num_homeless); +} + +static bool __linger_registered(struct ceph_osd_linger_request *lreq) +{ + verify_osdc_locked(lreq->osdc); + + return !RB_EMPTY_NODE(&lreq->osdc_node); +} + +static bool linger_registered(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + bool registered; + + down_read(&osdc->lock); + registered = __linger_registered(lreq); + up_read(&osdc->lock); + + return registered; +} + +static void linger_register(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + verify_osdc_wrlocked(osdc); + WARN_ON(lreq->linger_id); + + linger_get(lreq); + lreq->linger_id = ++osdc->last_linger_id; + insert_linger_osdc(&osdc->linger_requests, lreq); +} + +static void linger_unregister(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + verify_osdc_wrlocked(osdc); + + erase_linger_osdc(&osdc->linger_requests, lreq); + linger_put(lreq); +} + +static void cancel_linger_request(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + WARN_ON(!req->r_linger); + cancel_request(req); + linger_put(lreq); +} + +struct linger_work { + struct work_struct work; + struct ceph_osd_linger_request *lreq; + + union { + struct { + u64 notify_id; + u64 notifier_id; + void *payload; /* points into @msg front */ + size_t payload_len; + + struct ceph_msg *msg; /* for ceph_msg_put() */ + } notify; + struct { + int err; + } error; + }; +}; + +static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, + work_func_t workfn) +{ + struct linger_work *lwork; + + lwork = kzalloc(sizeof(*lwork), GFP_NOIO); + if (!lwork) + return NULL; + + INIT_WORK(&lwork->work, workfn); + lwork->lreq = linger_get(lreq); + + return lwork; +} + +static void lwork_free(struct linger_work *lwork) +{ + struct ceph_osd_linger_request *lreq = lwork->lreq; + + linger_put(lreq); + kfree(lwork); +} + +static void lwork_queue(struct linger_work *lwork) +{ + struct ceph_osd_linger_request *lreq = lwork->lreq; + struct ceph_osd_client *osdc = lreq->osdc; + + verify_lreq_locked(lreq); + queue_work(osdc->notify_wq, &lwork->work); +} + +static void do_watch_notify(struct work_struct *w) +{ + struct linger_work *lwork = container_of(w, struct linger_work, work); + struct ceph_osd_linger_request *lreq = lwork->lreq; + + if (!linger_registered(lreq)) { + dout("%s lreq %p not registered\n", __func__, lreq); + goto out; + } + + dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", + __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, + lwork->notify.payload_len); + lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id, + lwork->notify.notifier_id, lwork->notify.payload, + lwork->notify.payload_len); + +out: + ceph_msg_put(lwork->notify.msg); + lwork_free(lwork); +} + +static void do_watch_error(struct work_struct *w) +{ + struct linger_work *lwork = container_of(w, struct linger_work, work); + struct ceph_osd_linger_request *lreq = lwork->lreq; + + if (!linger_registered(lreq)) { + dout("%s lreq %p not registered\n", __func__, lreq); + goto out; + } + + dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err); + lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err); + +out: + lwork_free(lwork); +} + +static void queue_watch_error(struct ceph_osd_linger_request *lreq) +{ + struct linger_work *lwork; + + lwork = lwork_alloc(lreq, do_watch_error); + if (!lwork) { + pr_err("failed to allocate error-lwork\n"); + return; + } + + lwork->error.err = lreq->last_error; + lwork_queue(lwork); +} + +static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq, + int result) +{ + if (!completion_done(&lreq->reg_commit_wait)) { + lreq->reg_commit_error = (result <= 0 ? result : 0); + complete_all(&lreq->reg_commit_wait); + } +} + +static void linger_commit_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, + lreq->linger_id, req->r_result); + WARN_ON(!__linger_registered(lreq)); + linger_reg_commit_complete(lreq, req->r_result); + lreq->committed = true; + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static int normalize_watch_error(int err) +{ + /* + * Translate ENOENT -> ENOTCONN so that a delete->disconnection + * notification and a failure to reconnect because we raced with + * the delete appear the same to the user. + */ + if (err == -ENOENT) + err = -ENOTCONN; + + return err; +} + +static void linger_reconnect_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, + lreq, lreq->linger_id, req->r_result, lreq->last_error); + if (req->r_result < 0) { + if (!lreq->last_error) { + lreq->last_error = normalize_watch_error(req->r_result); + queue_watch_error(lreq); + } + } + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static void send_linger(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_request *req = lreq->reg_req; + struct ceph_osd_req_op *op = &req->r_ops[0]; + + verify_osdc_wrlocked(req->r_osdc); + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + + if (req->r_osd) + cancel_linger_request(req); + + request_reinit(req); + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + req->r_flags = lreq->t.flags; + req->r_mtime = lreq->mtime; + + mutex_lock(&lreq->lock); + if (lreq->committed) { + WARN_ON(op->op != CEPH_OSD_OP_WATCH || + op->watch.cookie != lreq->linger_id); + op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; + op->watch.gen = ++lreq->register_gen; + dout("lreq %p reconnect register_gen %u\n", lreq, + op->watch.gen); + req->r_callback = linger_reconnect_cb; + } else { + WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + dout("lreq %p register\n", lreq); + req->r_callback = linger_commit_cb; + } + mutex_unlock(&lreq->lock); + + req->r_priv = linger_get(lreq); + req->r_linger = true; + + submit_request(req, true); +} + +static void linger_ping_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", + __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, + lreq->last_error); + if (lreq->register_gen == req->r_ops[0].watch.gen) { + if (req->r_result && !lreq->last_error) { + lreq->last_error = normalize_watch_error(req->r_result); + queue_watch_error(lreq); + } + } else { + dout("lreq %p register_gen %u ignoring old pong %u\n", lreq, + lreq->register_gen, req->r_ops[0].watch.gen); + } + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static void send_linger_ping(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_request *req = lreq->ping_req; + struct ceph_osd_req_op *op = &req->r_ops[0]; + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + dout("%s PAUSERD\n", __func__); + return; + } + + lreq->ping_sent = jiffies; + dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n", + __func__, lreq, lreq->linger_id, lreq->ping_sent, + lreq->register_gen); + + if (req->r_osd) + cancel_linger_request(req); + + request_reinit(req); + target_copy(&req->r_t, &lreq->t); + + WARN_ON(op->op != CEPH_OSD_OP_WATCH || + op->watch.cookie != lreq->linger_id || + op->watch.op != CEPH_OSD_WATCH_OP_PING); + op->watch.gen = lreq->register_gen; + req->r_callback = linger_ping_cb; + req->r_priv = linger_get(lreq); + req->r_linger = true; + + ceph_osdc_get_request(req); + account_request(req); + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(lreq->osd, req); + send_request(req); +} + +static void linger_submit(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd *osd; + + calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); + osd = lookup_create_osd(osdc, lreq->t.osd, true); + link_linger(osd, lreq); + + send_linger(lreq); +} + +/* + * @lreq has to be both registered and linked. + */ +static void __linger_cancel(struct ceph_osd_linger_request *lreq) +{ + if (lreq->ping_req->r_osd) + cancel_linger_request(lreq->ping_req); + if (lreq->reg_req->r_osd) + cancel_linger_request(lreq->reg_req); + unlink_linger(lreq->osd, lreq); + linger_unregister(lreq); +} + +static void linger_cancel(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + down_write(&osdc->lock); + if (__linger_registered(lreq)) + __linger_cancel(lreq); + up_write(&osdc->lock); +} + +static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) +{ + int ret; + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); + return ret ?: lreq->reg_commit_error; +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -1720,6 +2211,19 @@ static void handle_timeout(struct work_struct *work) found = true; } } + for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { + struct ceph_osd_linger_request *lreq = + rb_entry(p, struct ceph_osd_linger_request, node); + + dout(" lreq %p linger_id %llu is served by osd%d\n", + lreq, lreq->linger_id, osd->o_osd); + found = true; + + mutex_lock(&lreq->lock); + if (lreq->committed && !lreq->last_error) + send_linger_ping(lreq); + mutex_unlock(&lreq->lock); + } if (found) list_move_tail(&osd->o_keepalive_item, &slow_osds); @@ -1756,7 +2260,7 @@ static void handle_osds_timeout(struct work_struct *work) break; WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); close_osd(osd); } @@ -2082,7 +2586,8 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) __finish_request(req); if (req->r_linger) { WARN_ON(req->r_unsafe_callback); - __register_linger_request(osd, req); + dout("req %p tid %llu cb (locked)\n", req, req->r_tid); + __complete_request(req); } } @@ -2093,7 +2598,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) if (already_acked && req->r_unsafe_callback) { dout("req %p tid %llu safe-cb\n", req, req->r_tid); req->r_unsafe_callback(req, false); - } else { + } else if (!req->r_linger) { dout("req %p tid %llu cb\n", req, req->r_tid); __complete_request(req); } @@ -2145,6 +2650,26 @@ static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) return pi->was_full && !__pool_full(pi); } +static enum calc_target_result +recalc_linger_target(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + enum calc_target_result ct_res; + + ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); + if (ct_res == CALC_TARGET_NEED_RESEND) { + struct ceph_osd *osd; + + osd = lookup_create_osd(osdc, lreq->t.osd, true); + if (osd != lreq->osd) { + unlink_linger(lreq->osd, lreq); + link_linger(osd, lreq); + } + } + + return ct_res; +} + /* * Requeue requests whose mapping to an OSD has changed. */ @@ -2159,6 +2684,39 @@ static void scan_requests(struct ceph_osd *osd, struct rb_node *n; bool force_resend_writes; + for (n = rb_first(&osd->o_linger_requests); n; ) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + enum calc_target_result ct_res; + + n = rb_next(n); /* recalc_linger_target() */ + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, + lreq->linger_id); + ct_res = recalc_linger_target(lreq); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, lreq->t.base_oloc.pool)); + if (!force_resend && !force_resend_writes) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + /* + * scan_requests() for the previous epoch(s) + * may have already added it to the list, since + * it's not unlinked here. + */ + if (list_empty(&lreq->scan_item)) + list_add_tail(&lreq->scan_item, need_resend_linger); + break; + case CALC_TARGET_POOL_DNE: + break; + } + } + for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); @@ -2263,6 +2821,7 @@ static void kick_requests(struct ceph_osd_client *osdc, struct rb_root *need_resend, struct list_head *need_resend_linger) { + struct ceph_osd_linger_request *lreq, *nlreq; struct rb_node *n; for (n = rb_first(need_resend); n; ) { @@ -2280,8 +2839,17 @@ static void kick_requests(struct ceph_osd_client *osdc, if (!req->r_linger) { if (!osd_homeless(osd) && !req->r_t.paused) send_request(req); + } else { + cancel_linger_request(req); } } + + list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) { + if (!osd_homeless(lreq->osd)) + send_linger(lreq); + + list_del_init(&lreq->scan_item); + } } /* @@ -2406,15 +2974,25 @@ static void kick_osd_requests(struct ceph_osd *osd) { struct rb_node *n; - for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); + n = rb_next(n); /* cancel_linger_request() */ + if (!req->r_linger) { if (!req->r_t.paused) send_request(req); + } else { + cancel_linger_request(req); } } + for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + send_linger(lreq); + } } /* @@ -2441,193 +3019,77 @@ static void osd_fault(struct ceph_connection *con) up_write(&osdc->lock); } -/* - * watch/notify callback event infrastructure - * - * These callbacks are used both for watch and notify operations. - */ -static void __release_event(struct kref *kref) -{ - struct ceph_osd_event *event = - container_of(kref, struct ceph_osd_event, kref); - - dout("__release_event %p\n", event); - kfree(event); -} - -static void get_event(struct ceph_osd_event *event) -{ - kref_get(&event->kref); -} - -void ceph_osdc_put_event(struct ceph_osd_event *event) -{ - kref_put(&event->kref, __release_event); -} -EXPORT_SYMBOL(ceph_osdc_put_event); - -static void __insert_event(struct ceph_osd_client *osdc, - struct ceph_osd_event *new) -{ - struct rb_node **p = &osdc->event_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_event *event = NULL; - - while (*p) { - parent = *p; - event = rb_entry(parent, struct ceph_osd_event, node); - if (new->cookie < event->cookie) - p = &(*p)->rb_left; - else if (new->cookie > event->cookie) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &osdc->event_tree); -} - -static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, - u64 cookie) -{ - struct rb_node **p = &osdc->event_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_event *event = NULL; - - while (*p) { - parent = *p; - event = rb_entry(parent, struct ceph_osd_event, node); - if (cookie < event->cookie) - p = &(*p)->rb_left; - else if (cookie > event->cookie) - p = &(*p)->rb_right; - else - return event; - } - return NULL; -} - -static void __remove_event(struct ceph_osd_event *event) -{ - struct ceph_osd_client *osdc = event->osdc; - - if (!RB_EMPTY_NODE(&event->node)) { - dout("__remove_event removed %p\n", event); - rb_erase(&event->node, &osdc->event_tree); - ceph_osdc_put_event(event); - } else { - dout("__remove_event didn't remove %p\n", event); - } -} - -int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent) -{ - struct ceph_osd_event *event; - - event = kmalloc(sizeof(*event), GFP_NOIO); - if (!event) - return -ENOMEM; - - dout("create_event %p\n", event); - event->cb = event_cb; - event->one_shot = 0; - event->data = data; - event->osdc = osdc; - INIT_LIST_HEAD(&event->osd_node); - RB_CLEAR_NODE(&event->node); - kref_init(&event->kref); /* one ref for us */ - kref_get(&event->kref); /* one ref for the caller */ - - spin_lock(&osdc->event_lock); - event->cookie = ++osdc->event_count; - __insert_event(osdc, event); - spin_unlock(&osdc->event_lock); - - *pevent = event; - return 0; -} -EXPORT_SYMBOL(ceph_osdc_create_event); - -void ceph_osdc_cancel_event(struct ceph_osd_event *event) -{ - struct ceph_osd_client *osdc = event->osdc; - - dout("cancel_event %p\n", event); - spin_lock(&osdc->event_lock); - __remove_event(event); - spin_unlock(&osdc->event_lock); - ceph_osdc_put_event(event); /* caller's */ -} -EXPORT_SYMBOL(ceph_osdc_cancel_event); - - -static void do_event_work(struct work_struct *work) -{ - struct ceph_osd_event_work *event_work = - container_of(work, struct ceph_osd_event_work, work); - struct ceph_osd_event *event = event_work->event; - u64 ver = event_work->ver; - u64 notify_id = event_work->notify_id; - u8 opcode = event_work->opcode; - - dout("do_event_work completing %p\n", event); - event->cb(ver, notify_id, opcode, event->data); - dout("do_event_work completed %p\n", event); - ceph_osdc_put_event(event); - kfree(event_work); -} - - /* * Process osd watch notifications */ static void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) { - void *p, *end; - u8 proto_ver; - u64 cookie, ver, notify_id; - u8 opcode; - struct ceph_osd_event *event; - struct ceph_osd_event_work *event_work; - - p = msg->front.iov_base; - end = p + msg->front.iov_len; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + struct ceph_osd_linger_request *lreq; + struct linger_work *lwork; + u8 proto_ver, opcode; + u64 cookie, notify_id; + u64 notifier_id = 0; + void *payload = NULL; + u32 payload_len = 0; ceph_decode_8_safe(&p, end, proto_ver, bad); ceph_decode_8_safe(&p, end, opcode, bad); ceph_decode_64_safe(&p, end, cookie, bad); - ceph_decode_64_safe(&p, end, ver, bad); + p += 8; /* skip ver */ ceph_decode_64_safe(&p, end, notify_id, bad); - spin_lock(&osdc->event_lock); - event = __find_event(osdc, cookie); - if (event) { - BUG_ON(event->one_shot); - get_event(event); - } - spin_unlock(&osdc->event_lock); - dout("handle_watch_notify cookie %lld ver %lld event %p\n", - cookie, ver, event); - if (event) { - event_work = kmalloc(sizeof(*event_work), GFP_NOIO); - if (!event_work) { - pr_err("couldn't allocate event_work\n"); - ceph_osdc_put_event(event); - return; + if (proto_ver >= 1) { + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + payload = p; + p += payload_len; + } + + if (le16_to_cpu(msg->hdr.version) >= 2) + p += 4; /* skip return_code */ + + if (le16_to_cpu(msg->hdr.version) >= 3) + ceph_decode_64_safe(&p, end, notifier_id, bad); + + down_read(&osdc->lock); + lreq = lookup_linger_osdc(&osdc->linger_requests, cookie); + if (!lreq) { + dout("%s opcode %d cookie %llu dne\n", __func__, opcode, + cookie); + goto out_unlock_osdc; + } + + mutex_lock(&lreq->lock); + dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie, + lreq); + if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { + if (!lreq->last_error) { + lreq->last_error = -ENOTCONN; + queue_watch_error(lreq); + } + } else { + /* CEPH_WATCH_EVENT_NOTIFY */ + lwork = lwork_alloc(lreq, do_watch_notify); + if (!lwork) { + pr_err("failed to allocate notify-lwork\n"); + goto out_unlock_lreq; } - INIT_WORK(&event_work->work, do_event_work); - event_work->event = event; - event_work->ver = ver; - event_work->notify_id = notify_id; - event_work->opcode = opcode; - queue_work(osdc->notify_wq, &event_work->work); + lwork->notify.notify_id = notify_id; + lwork->notify.notifier_id = notifier_id; + lwork->notify.payload = payload; + lwork->notify.payload_len = payload_len; + lwork->notify.msg = ceph_msg_get(msg); + lwork_queue(lwork); } +out_unlock_lreq: + mutex_unlock(&lreq->lock); +out_unlock_osdc: + up_read(&osdc->lock); return; bad: @@ -2659,8 +3121,6 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req) struct ceph_osd_client *osdc = req->r_osdc; down_write(&osdc->lock); - if (req->r_linger) - __unregister_linger_request(osdc, req); if (req->r_osd) cancel_request(req); up_write(&osdc->lock); @@ -2743,6 +3203,198 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc) } EXPORT_SYMBOL(ceph_osdc_sync); +static struct ceph_osd_request * +alloc_linger_request(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_request *req; + + req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return NULL; + + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + + if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { + ceph_osdc_put_request(req); + return NULL; + } + + return req; +} + +/* + * Returns a handle, caller owns a ref. + */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data) +{ + struct ceph_osd_linger_request *lreq; + int ret; + + lreq = linger_alloc(osdc); + if (!lreq) + return ERR_PTR(-ENOMEM); + + lreq->wcb = wcb; + lreq->errcb = errcb; + lreq->data = data; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + lreq->mtime = CURRENT_TIME; + + lreq->reg_req = alloc_linger_request(lreq); + if (!lreq->reg_req) { + ret = -ENOMEM; + goto err_put_lreq; + } + + lreq->ping_req = alloc_linger_request(lreq); + if (!lreq->ping_req) { + ret = -ENOMEM; + goto err_put_lreq; + } + + down_write(&osdc->lock); + linger_register(lreq); /* before osd_req_op_* */ + osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_WATCH); + osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_PING); + linger_submit(lreq); + up_write(&osdc->lock); + + ret = linger_reg_commit_wait(lreq); + if (ret) { + linger_cancel(lreq); + goto err_put_lreq; + } + + return lreq; + +err_put_lreq: + linger_put(lreq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ceph_osdc_watch); + +/* + * Releases a ref. + * + * Times out after mount_timeout to preserve rbd unmap behaviour + * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap + * with mount_timeout"). + */ +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq) +{ + struct ceph_options *opts = osdc->client->options; + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + req->r_mtime = CURRENT_TIME; + osd_req_op_watch_init(req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_UNWATCH); + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + ceph_osdc_start_request(osdc, req, false); + linger_cancel(lreq); + linger_put(lreq); + ret = wait_request_timeout(req, opts->mount_timeout); + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_unwatch); + +static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, + u64 notify_id, u64 cookie, void *payload, + size_t payload_len) +{ + struct ceph_osd_req_op *op; + struct ceph_pagelist *pl; + int ret; + + op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); + + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) + return -ENOMEM; + + ceph_pagelist_init(pl); + ret = ceph_pagelist_encode_64(pl, notify_id); + ret |= ceph_pagelist_encode_64(pl, cookie); + if (payload) { + ret |= ceph_pagelist_encode_32(pl, payload_len); + ret |= ceph_pagelist_append(pl, payload, payload_len); + } else { + ret |= ceph_pagelist_encode_32(pl, 0); + } + if (ret) { + ceph_pagelist_release(pl); + return -ENOMEM; + } + + ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl); + op->indata_len = pl->length; + return 0; +} + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len) +{ + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = CEPH_OSD_FLAG_READ; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, + payload_len); + if (ret) + goto out_put_req; + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_notify_ack); + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked @@ -2767,15 +3419,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); spin_lock_init(&osdc->osd_lru_lock); - INIT_LIST_HEAD(&osdc->req_linger); osd_init(&osdc->homeless_osd); osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; + osdc->linger_requests = RB_ROOT; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); - spin_lock_init(&osdc->event_lock); - osdc->event_tree = RB_ROOT; - osdc->event_count = 0; err = -ENOMEM; osdc->osdmap = ceph_osdmap_alloc(); @@ -2838,6 +3487,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); WARN_ON(atomic_read(&osdc->num_requests)); WARN_ON(atomic_read(&osdc->num_homeless)); From 1907920324f1f3ebb6618344417c03a2863bba01 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: [PATCH 34/71] libceph: support for sending notifies Implement ceph_osdc_notify() for sending notifies. Due to the fact that the current messenger can't do read-in into pagelists (it can only do write-out from them), I had to go with a page vector for a NOTIFY_COMPLETE payload, for now. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 20 +++ include/linux/ceph/rados.h | 3 + net/ceph/debugfs.c | 5 +- net/ceph/osd_client.c | 232 ++++++++++++++++++++++++++++++-- 4 files changed, 249 insertions(+), 11 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cd2dcb8939de8e..63054fae4f15a5 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -114,6 +114,11 @@ struct ceph_osd_req_op { struct { struct ceph_osd_data request_data; } notify_ack; + struct { + u64 cookie; + struct ceph_osd_data request_data; + struct ceph_osd_data response_data; + } notify; struct { u64 expected_object_size; u64 expected_write_size; @@ -202,6 +207,7 @@ struct ceph_osd_linger_request { struct ceph_osd_client *osdc; u64 linger_id; bool committed; + bool is_watch; /* watch or notify */ struct ceph_osd *osd; struct ceph_osd_request *reg_req; @@ -220,14 +226,20 @@ struct ceph_osd_linger_request { struct list_head scan_item; struct completion reg_commit_wait; + struct completion notify_finish_wait; int reg_commit_error; + int notify_finish_error; int last_error; u32 register_gen; + u64 notify_id; rados_watchcb2_t wcb; rados_watcherrcb_t errcb; void *data; + + struct page ***preply_pages; + size_t *preply_len; }; struct ceph_osd_client { @@ -397,5 +409,13 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, u64 cookie, void *payload, size_t payload_len); +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 204c8c94470353..5c0da61cb76312 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -476,6 +476,9 @@ struct ceph_osd_op { __u8 op; /* CEPH_OSD_WATCH_OP_* */ __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; + struct { + __le64 cookie; + } __attribute__ ((packed)) notify; struct { __le64 offset, length; __le64 src_offset; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index e64cb858353367..39f91c7250f6e8 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -206,8 +206,9 @@ static void dump_linger_request(struct seq_file *s, seq_printf(s, "%llu\t", lreq->linger_id); dump_target(s, &lreq->t); - seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen, - lreq->committed ? "C" : "", lreq->last_error); + seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen, + lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "", + lreq->last_error); } static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ca0a7b58ba4f8b..e6e3ab4223db47 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -334,6 +334,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_NOTIFY_ACK: ceph_osd_data_release(&op->notify_ack.request_data); break; + case CEPH_OSD_OP_NOTIFY: + ceph_osd_data_release(&op->notify.request_data); + ceph_osd_data_release(&op->notify.response_data); + break; default: break; } @@ -845,6 +849,9 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, break; case CEPH_OSD_OP_NOTIFY_ACK: break; + case CEPH_OSD_OP_NOTIFY: + dst->notify.cookie = cpu_to_le64(src->notify.cookie); + break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = cpu_to_le64(src->alloc_hint.expected_object_size); @@ -1439,6 +1446,12 @@ static void setup_request_data(struct ceph_osd_request *req, ceph_osdc_msg_data_add(req->r_reply, &op->cls.response_data); break; + case CEPH_OSD_OP_NOTIFY: + ceph_osdc_msg_data_add(msg, + &op->notify.request_data); + ceph_osdc_msg_data_add(req->r_reply, + &op->notify.response_data); + break; } data_len += op->indata_len; @@ -1771,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc) RB_CLEAR_NODE(&lreq->osdc_node); INIT_LIST_HEAD(&lreq->scan_item); init_completion(&lreq->reg_commit_wait); + init_completion(&lreq->notify_finish_wait); lreq->osdc = osdc; target_init(&lreq->t); @@ -1934,6 +1948,7 @@ static void do_watch_notify(struct work_struct *w) goto out; } + WARN_ON(!lreq->is_watch); dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, lwork->notify.payload_len); @@ -1997,6 +2012,24 @@ static void linger_commit_cb(struct ceph_osd_request *req) linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; + if (!lreq->is_watch) { + struct ceph_osd_data *osd_data = + osd_req_op_data(req, 0, notify, response_data); + void *p = page_address(osd_data->pages[0]); + + WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY || + osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + + /* make note of the notify_id */ + if (req->r_ops[0].outdata_len >= sizeof(u64)) { + lreq->notify_id = ceph_decode_64(&p); + dout("lreq %p notify_id %llu\n", lreq, + lreq->notify_id); + } else { + dout("lreq %p no notify_id\n", lreq); + } + } + mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -2050,7 +2083,7 @@ static void send_linger(struct ceph_osd_linger_request *lreq) req->r_mtime = lreq->mtime; mutex_lock(&lreq->lock); - if (lreq->committed) { + if (lreq->is_watch && lreq->committed) { WARN_ON(op->op != CEPH_OSD_OP_WATCH || op->watch.cookie != lreq->linger_id); op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; @@ -2059,7 +2092,10 @@ static void send_linger(struct ceph_osd_linger_request *lreq) op->watch.gen); req->r_callback = linger_reconnect_cb; } else { - WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + if (!lreq->is_watch) + lreq->notify_id = 0; + else + WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); dout("lreq %p register\n", lreq); req->r_callback = linger_commit_cb; } @@ -2147,7 +2183,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { - if (lreq->ping_req->r_osd) + if (lreq->is_watch && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); if (lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); @@ -2174,6 +2210,15 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) return ret ?: lreq->reg_commit_error; } +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +{ + int ret; + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); + return ret ?: lreq->notify_finish_error; +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -2220,7 +2265,7 @@ static void handle_timeout(struct work_struct *work) found = true; mutex_lock(&lreq->lock); - if (lreq->committed && !lreq->last_error) + if (lreq->is_watch && lreq->committed && !lreq->last_error) send_linger_ping(lreq); mutex_unlock(&lreq->lock); } @@ -3032,6 +3077,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, u8 proto_ver, opcode; u64 cookie, notify_id; u64 notifier_id = 0; + s32 return_code = 0; void *payload = NULL; u32 payload_len = 0; @@ -3049,7 +3095,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, } if (le16_to_cpu(msg->hdr.version) >= 2) - p += 4; /* skip return_code */ + ceph_decode_32_safe(&p, end, return_code, bad); if (le16_to_cpu(msg->hdr.version) >= 3) ceph_decode_64_safe(&p, end, notifier_id, bad); @@ -3063,13 +3109,38 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, } mutex_lock(&lreq->lock); - dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie, - lreq); + dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__, + opcode, cookie, lreq, lreq->is_watch); if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { if (!lreq->last_error) { lreq->last_error = -ENOTCONN; queue_watch_error(lreq); } + } else if (!lreq->is_watch) { + /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */ + if (lreq->notify_id && lreq->notify_id != notify_id) { + dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq, + lreq->notify_id, notify_id); + } else if (!completion_done(&lreq->notify_finish_wait)) { + struct ceph_msg_data *data = + list_first_entry_or_null(&msg->data, + struct ceph_msg_data, + links); + + if (data) { + if (lreq->preply_pages) { + WARN_ON(data->type != + CEPH_MSG_DATA_PAGES); + *lreq->preply_pages = data->pages; + *lreq->preply_len = data->length; + } else { + ceph_release_page_vector(data->pages, + calc_pages_for(0, data->length)); + } + } + lreq->notify_finish_error = return_code; + complete_all(&lreq->notify_finish_wait); + } } else { /* CEPH_WATCH_EVENT_NOTIFY */ lwork = lwork_alloc(lreq, do_watch_notify); @@ -3241,6 +3312,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, if (!lreq) return ERR_PTR(-ENOMEM); + lreq->is_watch = true; lreq->wcb = wcb; lreq->errcb = errcb; lreq->data = data; @@ -3395,6 +3467,116 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_notify_ack); +static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, + u64 cookie, u32 prot_ver, u32 timeout, + void *payload, size_t payload_len) +{ + struct ceph_osd_req_op *op; + struct ceph_pagelist *pl; + int ret; + + op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op->notify.cookie = cookie; + + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) + return -ENOMEM; + + ceph_pagelist_init(pl); + ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ + ret |= ceph_pagelist_encode_32(pl, timeout); + ret |= ceph_pagelist_encode_32(pl, payload_len); + ret |= ceph_pagelist_append(pl, payload, payload_len); + if (ret) { + ceph_pagelist_release(pl); + return -ENOMEM; + } + + ceph_osd_data_pagelist_init(&op->notify.request_data, pl); + op->indata_len = pl->length; + return 0; +} + +/* + * @timeout: in seconds + * + * @preply_{pages,len} are initialized both on success and error. + * The caller is responsible for: + * + * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)) + */ +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len) +{ + struct ceph_osd_linger_request *lreq; + struct page **pages; + int ret; + + WARN_ON(!timeout); + if (preply_pages) { + *preply_pages = NULL; + *preply_len = 0; + } + + lreq = linger_alloc(osdc); + if (!lreq) + return -ENOMEM; + + lreq->preply_pages = preply_pages; + lreq->preply_len = preply_len; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_READ; + + lreq->reg_req = alloc_linger_request(lreq); + if (!lreq->reg_req) { + ret = -ENOMEM; + goto out_put_lreq; + } + + /* for notify_id */ + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out_put_lreq; + } + + down_write(&osdc->lock); + linger_register(lreq); /* before osd_req_op_* */ + ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, + timeout, payload, payload_len); + if (ret) { + linger_unregister(lreq); + up_write(&osdc->lock); + ceph_release_page_vector(pages, 1); + goto out_put_lreq; + } + ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, + response_data), + pages, PAGE_SIZE, 0, false, true); + linger_submit(lreq); + up_write(&osdc->lock); + + ret = linger_reg_commit_wait(lreq); + if (!ret) + ret = linger_notify_finish_wait(lreq); + else + dout("lreq %p failed to initiate notify %d\n", lreq, ret); + + linger_cancel(lreq); +out_put_lreq: + linger_put(lreq); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_notify); + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked @@ -3693,19 +3875,51 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, return m; } +/* + * TODO: switch to a msg-owned pagelist + */ +static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) +{ + struct ceph_msg *m; + int type = le16_to_cpu(hdr->type); + u32 front_len = le32_to_cpu(hdr->front_len); + u32 data_len = le32_to_cpu(hdr->data_len); + + m = ceph_msg_new(type, front_len, GFP_NOIO, false); + if (!m) + return NULL; + + if (data_len) { + struct page **pages; + struct ceph_osd_data osd_data; + + pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), + GFP_NOIO); + if (!pages) { + ceph_msg_put(m); + return NULL; + } + + ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false, + false); + ceph_osdc_msg_data_add(m, &osd_data); + } + + return m; +} + static struct ceph_msg *alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); - int front = le32_to_cpu(hdr->front_len); *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: - return ceph_msg_new(type, front, GFP_NOFS, false); + return alloc_msg_with_page_vector(hdr); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: From b07d3c4bd7270c74e2b6803af8ac8a00cb3e5ed2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: [PATCH 35/71] libceph: support for checking on status of watch Implement ceph_osdc_watch_check() to be able to check on status of watch. Note that the time it takes for a watch/notify event to get delivered through the notify_wq is taken into account. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 4 +++ net/ceph/osd_client.c | 52 ++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63054fae4f15a5..2ae7cfd82ec9a5 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,6 +213,8 @@ struct ceph_osd_linger_request { struct ceph_osd_request *reg_req; struct ceph_osd_request *ping_req; unsigned long ping_sent; + unsigned long watch_valid_thru; + struct list_head pending_lworks; struct ceph_osd_request_target t; u32 last_force_resend; @@ -417,5 +419,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, u32 timeout, struct page ***preply_pages, size_t *preply_len); +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); #endif diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e6e3ab4223db47..5ac6dce74f079c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1746,6 +1746,7 @@ static void linger_release(struct kref *kref) WARN_ON(!RB_EMPTY_NODE(&lreq->node)); WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); WARN_ON(!list_empty(&lreq->scan_item)); + WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); if (lreq->reg_req) @@ -1783,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc) RB_CLEAR_NODE(&lreq->node); RB_CLEAR_NODE(&lreq->osdc_node); INIT_LIST_HEAD(&lreq->scan_item); + INIT_LIST_HEAD(&lreq->pending_lworks); init_completion(&lreq->reg_commit_wait); init_completion(&lreq->notify_finish_wait); @@ -1890,6 +1892,8 @@ static void cancel_linger_request(struct ceph_osd_request *req) struct linger_work { struct work_struct work; struct ceph_osd_linger_request *lreq; + struct list_head pending_item; + unsigned long queued_stamp; union { struct { @@ -1916,6 +1920,7 @@ static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, return NULL; INIT_WORK(&lwork->work, workfn); + INIT_LIST_HEAD(&lwork->pending_item); lwork->lreq = linger_get(lreq); return lwork; @@ -1925,6 +1930,10 @@ static void lwork_free(struct linger_work *lwork) { struct ceph_osd_linger_request *lreq = lwork->lreq; + mutex_lock(&lreq->lock); + list_del(&lwork->pending_item); + mutex_unlock(&lreq->lock); + linger_put(lreq); kfree(lwork); } @@ -1935,6 +1944,10 @@ static void lwork_queue(struct linger_work *lwork) struct ceph_osd_client *osdc = lreq->osdc; verify_lreq_locked(lreq); + WARN_ON(!list_empty(&lwork->pending_item)); + + lwork->queued_stamp = jiffies; + list_add_tail(&lwork->pending_item, &lreq->pending_lworks); queue_work(osdc->notify_wq, &lwork->work); } @@ -2116,7 +2129,9 @@ static void linger_ping_cb(struct ceph_osd_request *req) __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, lreq->last_error); if (lreq->register_gen == req->r_ops[0].watch.gen) { - if (req->r_result && !lreq->last_error) { + if (!req->r_result) { + lreq->watch_valid_thru = lreq->ping_sent; + } else if (!lreq->last_error) { lreq->last_error = normalize_watch_error(req->r_result); queue_watch_error(lreq); } @@ -3316,6 +3331,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->wcb = wcb; lreq->errcb = errcb; lreq->data = data; + lreq->watch_valid_thru = jiffies; ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); @@ -3577,6 +3593,40 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_notify); +/* + * Return the number of milliseconds since the watch was last + * confirmed, or an error. If there is an error, the watch is no + * longer valid, and should be destroyed with ceph_osdc_unwatch(). + */ +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq) +{ + unsigned long stamp, age; + int ret; + + down_read(&osdc->lock); + mutex_lock(&lreq->lock); + stamp = lreq->watch_valid_thru; + if (!list_empty(&lreq->pending_lworks)) { + struct linger_work *lwork = + list_first_entry(&lreq->pending_lworks, + struct linger_work, + pending_item); + + if (time_before(lwork->queued_stamp, stamp)) + stamp = lwork->queued_stamp; + } + age = jiffies - stamp; + dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__, + lreq, lreq->linger_id, age, lreq->last_error); + /* we are truncating to msecs, so return a safe upper bound */ + ret = lreq->last_error ?: 1 + jiffies_to_msecs(age); + + mutex_unlock(&lreq->lock); + up_read(&osdc->lock); + return ret; +} + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked From d0b19705e99939f5ae5aa9b57bfe41dd4777d951 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: [PATCH 36/71] libceph: async MON client generic requests For map check, we are going to need to send CEPH_MSG_MON_GET_VERSION messages asynchronously and get a callback on completion. Refactor MON client to allow firing off generic requests asynchronously and add an async variant of ceph_monc_get_version(). ceph_monc_do_statfs() is switched over and remains sync. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 4 +- include/linux/ceph/mon_client.h | 19 +- net/ceph/mon_client.c | 316 +++++++++++++++++++++----------- 3 files changed, 228 insertions(+), 111 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d0834c477f9684..8eae6f56194dd0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4896,8 +4896,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) again: ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); if (ret == -ENOENT && tries++ < 1) { - ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", - &newest_epoch); + ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); if (ret < 0) return ret; diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index c14e9d861cda31..19800d9b45f3e5 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -39,20 +39,31 @@ struct ceph_mon_request { ceph_monc_request_func_t do_request; }; +typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *); + /* * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ struct ceph_mon_generic_request { + struct ceph_mon_client *monc; struct kref kref; u64 tid; struct rb_node node; int result; - void *buf; + struct completion completion; + ceph_monc_callback_t complete_cb; + u64 private_data; /* r_tid/linger_id */ + struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ + + union { + struct ceph_statfs *st; + u64 newest; + } u; }; struct ceph_mon_client { @@ -124,8 +135,10 @@ extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); -extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, - const char *what, u64 *newest); +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest); +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data); extern int ceph_monc_open_session(struct ceph_mon_client *monc); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 98bfbe1f680784..4e49b229692043 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -493,6 +493,10 @@ static void release_generic_request(struct kref *kref) struct ceph_mon_generic_request *req = container_of(kref, struct ceph_mon_generic_request, kref); + dout("%s greq %p request %p reply %p\n", __func__, req, req->request, + req->reply); + WARN_ON(!RB_EMPTY_NODE(&req->node)); + if (req->reply) ceph_msg_put(req->reply); if (req->request) @@ -503,7 +507,8 @@ static void release_generic_request(struct kref *kref) static void put_generic_request(struct ceph_mon_generic_request *req) { - kref_put(&req->kref, release_generic_request); + if (req) + kref_put(&req->kref, release_generic_request); } static void get_generic_request(struct ceph_mon_generic_request *req) @@ -511,6 +516,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req) kref_get(&req->kref); } +static struct ceph_mon_generic_request * +alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp) +{ + struct ceph_mon_generic_request *req; + + req = kzalloc(sizeof(*req), gfp); + if (!req) + return NULL; + + req->monc = monc; + kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); + init_completion(&req->completion); + + dout("%s greq %p\n", __func__, req); + return req; +} + +static void register_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + + WARN_ON(req->tid); + + get_generic_request(req); + req->tid = ++monc->last_tid; + insert_generic_request(&monc->generic_request_tree, req); +} + +static void send_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + WARN_ON(!req->tid); + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + req->request->hdr.tid = cpu_to_le64(req->tid); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); +} + +static void __finish_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + erase_generic_request(&monc->generic_request_tree, req); + + ceph_msg_revoke(req->request); + ceph_msg_revoke_incoming(req->reply); +} + +static void finish_generic_request(struct ceph_mon_generic_request *req) +{ + __finish_generic_request(req); + put_generic_request(req); +} + +static void complete_generic_request(struct ceph_mon_generic_request *req) +{ + if (req->complete_cb) + req->complete_cb(req); + else + complete_all(&req->completion); + put_generic_request(req); +} + +void cancel_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + struct ceph_mon_generic_request *lookup_req; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + + mutex_lock(&monc->mutex); + lookup_req = lookup_generic_request(&monc->generic_request_tree, + req->tid); + if (lookup_req) { + WARN_ON(lookup_req != req); + finish_generic_request(req); + } + + mutex_unlock(&monc->mutex); +} + +static int wait_generic_request(struct ceph_mon_generic_request *req) +{ + int ret; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + ret = wait_for_completion_interruptible(&req->completion); + if (ret) + cancel_generic_request(req); + else + ret = req->result; /* completed */ + + return ret; +} + static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) @@ -540,40 +642,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } -static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, - struct ceph_mon_generic_request *req) -{ - int err; - - /* register request */ - req->tid = tid != 0 ? tid : ++monc->last_tid; - req->request->hdr.tid = cpu_to_le64(req->tid); - insert_generic_request(&monc->generic_request_tree, req); - ceph_con_send(&monc->con, ceph_msg_get(req->request)); - mutex_unlock(&monc->mutex); - - err = wait_for_completion_interruptible(&req->completion); - - mutex_lock(&monc->mutex); - erase_generic_request(&monc->generic_request_tree, req); - - if (!err) - err = req->result; - return err; -} - -static int do_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *req) -{ - int err; - - mutex_lock(&monc->mutex); - err = __do_generic_request(monc, 0, req); - mutex_unlock(&monc->mutex); - - return err; -} - /* * statfs */ @@ -584,22 +652,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_mon_statfs_reply *reply = msg->front.iov_base; u64 tid = le64_to_cpu(msg->hdr.tid); + dout("%s msg %p tid %llu\n", __func__, msg, tid); + if (msg->front.iov_len != sizeof(*reply)) goto bad; - dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); req = lookup_generic_request(&monc->generic_request_tree, tid); - if (req) { - *(struct ceph_statfs *)req->buf = reply->st; - req->result = 0; - get_generic_request(req); + if (!req) { + mutex_unlock(&monc->mutex); + return; } + + req->result = 0; + *req->u.st = reply->st; /* struct */ + __finish_generic_request(req); mutex_unlock(&monc->mutex); - if (req) { - complete_all(&req->completion); - put_generic_request(req); - } + + complete_generic_request(req); return; bad: @@ -614,39 +684,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; - int err; + int ret = -ENOMEM; - req = kzalloc(sizeof(*req), GFP_NOFS); + req = alloc_generic_request(monc, GFP_NOFS); if (!req) - return -ENOMEM; - - kref_init(&req->kref); - RB_CLEAR_NODE(&req->node); - req->buf = buf; - init_completion(&req->completion); + goto out; - err = -ENOMEM; req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, - true); + + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true); if (!req->reply) goto out; + req->u.st = buf; + + mutex_lock(&monc->mutex); + register_generic_request(req); /* fill out request */ h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); - err = do_generic_request(monc, req); - + ret = wait_generic_request(req); out: put_generic_request(req); - return err; + return ret; } EXPORT_SYMBOL(ceph_monc_do_statfs); @@ -659,7 +728,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, void *end = p + msg->front_alloc_len; u64 handle; - dout("%s %p tid %llu\n", __func__, msg, tid); + dout("%s msg %p tid %llu\n", __func__, msg, tid); ceph_decode_need(&p, end, 2*sizeof(u64), bad); handle = ceph_decode_64(&p); @@ -668,77 +737,110 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); req = lookup_generic_request(&monc->generic_request_tree, handle); - if (req) { - *(u64 *)req->buf = ceph_decode_64(&p); - req->result = 0; - get_generic_request(req); + if (!req) { + mutex_unlock(&monc->mutex); + return; } + + req->result = 0; + req->u.newest = ceph_decode_64(&p); + __finish_generic_request(req); mutex_unlock(&monc->mutex); - if (req) { - complete_all(&req->completion); - put_generic_request(req); - } + complete_generic_request(req); return; + bad: pr_err("corrupt mon_get_version reply, tid %llu\n", tid); ceph_msg_dump(msg); } -/* - * Send MMonGetVersion and wait for the reply. - * - * @what: one of "mdsmap", "osdmap" or "monmap" - */ -int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, - u64 *newest) +static struct ceph_mon_generic_request * +__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data) { struct ceph_mon_generic_request *req; - void *p, *end; - u64 tid; - int err; - req = kzalloc(sizeof(*req), GFP_NOFS); + req = alloc_generic_request(monc, GFP_NOIO); if (!req) - return -ENOMEM; - - kref_init(&req->kref); - RB_CLEAR_NODE(&req->node); - req->buf = newest; - init_completion(&req->completion); + goto err_put_req; req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, sizeof(u64) + sizeof(u32) + strlen(what), - GFP_NOFS, true); - if (!req->request) { - err = -ENOMEM; - goto out; - } + GFP_NOIO, true); + if (!req->request) + goto err_put_req; - req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, - GFP_NOFS, true); - if (!req->reply) { - err = -ENOMEM; - goto out; - } + req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO, + true); + if (!req->reply) + goto err_put_req; - p = req->request->front.iov_base; - end = p + req->request->front_alloc_len; + req->complete_cb = cb; + req->private_data = private_data; - /* fill out request */ mutex_lock(&monc->mutex); - tid = ++monc->last_tid; - ceph_encode_64(&p, tid); /* handle */ - ceph_encode_string(&p, end, what, strlen(what)); + register_generic_request(req); + { + void *p = req->request->front.iov_base; + void *const end = p + req->request->front_alloc_len; + + ceph_encode_64(&p, req->tid); /* handle */ + ceph_encode_string(&p, end, what, strlen(what)); + WARN_ON(p != end); + } + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); - err = __do_generic_request(monc, tid, req); + return req; - mutex_unlock(&monc->mutex); -out: +err_put_req: put_generic_request(req); - return err; + return ERR_PTR(-ENOMEM); +} + +/* + * Send MMonGetVersion and wait for the reply. + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest) +{ + struct ceph_mon_generic_request *req; + int ret; + + req = __ceph_monc_get_version(monc, what, NULL, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = wait_generic_request(req); + if (!ret) + *newest = req->u.newest; + + put_generic_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_monc_get_version); + +/* + * Send MMonGetVersion, + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data) +{ + struct ceph_mon_generic_request *req; + + req = __ceph_monc_get_version(monc, what, cb, private_data); + if (IS_ERR(req)) + return PTR_ERR(req); + + put_generic_request(req); + return 0; } -EXPORT_SYMBOL(ceph_monc_do_get_version); +EXPORT_SYMBOL(ceph_monc_get_version_async); /* * Resend pending generic requests. @@ -923,6 +1025,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); + WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree)); + ceph_msg_put(monc->m_auth); ceph_msg_put(monc->m_auth_reply); ceph_msg_put(monc->m_subscribe); From 4609245e2670e3698b083bcd9cc69a65b2b6f9a6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: [PATCH 37/71] libceph: pool deletion detection This adds the "map check" infrastructure for sending osdmap version checks on CALC_TARGET_POOL_DNE and completing in-flight requests with -ENOENT if the target pool doesn't exist or has just been deleted. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 6 + net/ceph/osd_client.c | 248 +++++++++++++++++++++++++++++++- 2 files changed, 248 insertions(+), 6 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2ae7cfd82ec9a5..3e7bf72e407815 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -151,6 +151,7 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; + struct rb_node r_mc_node; /* map check */ struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -191,6 +192,7 @@ struct ceph_osd_request { int r_attempts; struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; + u32 r_map_dne_bound; struct ceph_osd_req_op r_ops[]; }; @@ -218,6 +220,7 @@ struct ceph_osd_linger_request { struct ceph_osd_request_target t; u32 last_force_resend; + u32 map_dne_bound; struct timespec mtime; @@ -225,6 +228,7 @@ struct ceph_osd_linger_request { struct mutex lock; struct rb_node node; /* osd */ struct rb_node osdc_node; /* osdc */ + struct rb_node mc_node; /* map check */ struct list_head scan_item; struct completion reg_commit_wait; @@ -257,6 +261,8 @@ struct ceph_osd_client { atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; struct rb_root linger_requests; /* lingering requests */ + struct rb_root map_checks; + struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; struct delayed_work timeout_work; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5ac6dce74f079c..ece2d10a12081f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -396,6 +396,7 @@ static void target_destroy(struct ceph_osd_request_target *t) static void request_release_checks(struct ceph_osd_request *req) { WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node)); WARN_ON(!list_empty(&req->r_unsafe_item)); WARN_ON(req->r_osd); } @@ -456,6 +457,7 @@ static void request_init(struct ceph_osd_request *req) init_completion(&req->r_completion); init_completion(&req->r_safe_completion); RB_CLEAR_NODE(&req->r_node); + RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_unsafe_item); target_init(&req->r_t); @@ -969,6 +971,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request); * We keep osd requests in an rbtree, sorted by ->r_tid. */ DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) +DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) static bool osd_homeless(struct ceph_osd *osd) { @@ -1601,10 +1604,13 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } +static void send_map_check(struct ceph_osd_request *req); + static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd *osd; + enum calc_target_result ct_res; bool need_send = false; bool promoted = false; @@ -1612,7 +1618,10 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) + goto promote; + osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); if (IS_ERR(osd)) { WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); @@ -1656,6 +1665,9 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) send_request(req); mutex_unlock(&osd->lock); + if (ct_res == CALC_TARGET_POOL_DNE) + send_map_check(req); + if (promoted) downgrade_write(&osdc->lock); return; @@ -1699,6 +1711,7 @@ static void __finish_request(struct ceph_osd_request *req) verify_osd_locked(osd); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); unlink_request(osd, req); atomic_dec(&osdc->num_requests); @@ -1726,13 +1739,127 @@ static void __complete_request(struct ceph_osd_request *req) complete_all(&req->r_completion); } +/* + * Note that this is open-coded in handle_reply(), which has to deal + * with ack vs commit, dup acks, etc. + */ +static void complete_request(struct ceph_osd_request *req, int err) +{ + dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); + + req->r_result = err; + __finish_request(req); + __complete_request(req); + complete_all(&req->r_safe_completion); + ceph_osdc_put_request(req); +} + +static void cancel_map_check(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_request *lookup_req; + + verify_osdc_wrlocked(osdc); + + lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); + if (!lookup_req) + return; + + WARN_ON(lookup_req != req); + erase_request_mc(&osdc->map_checks, req); + ceph_osdc_put_request(req); +} + static void cancel_request(struct ceph_osd_request *req) { dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + cancel_map_check(req); finish_request(req); } +static void check_pool_dne(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osdmap *map = osdc->osdmap; + + verify_osdc_wrlocked(osdc); + WARN_ON(!map->epoch); + + if (req->r_attempts) { + /* + * We sent a request earlier, which means that + * previously the pool existed, and now it does not + * (i.e., it was deleted). + */ + req->r_map_dne_bound = map->epoch; + dout("%s req %p tid %llu pool disappeared\n", __func__, req, + req->r_tid); + } else { + dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__, + req, req->r_tid, req->r_map_dne_bound, map->epoch); + } + + if (req->r_map_dne_bound) { + if (map->epoch >= req->r_map_dne_bound) { + /* we had a new enough map */ + pr_info_ratelimited("tid %llu pool does not exist\n", + req->r_tid); + complete_request(req, -ENOENT); + } + } else { + send_map_check(req); + } +} + +static void map_check_cb(struct ceph_mon_generic_request *greq) +{ + struct ceph_osd_client *osdc = &greq->monc->client->osdc; + struct ceph_osd_request *req; + u64 tid = greq->private_data; + + WARN_ON(greq->result || !greq->u.newest); + + down_write(&osdc->lock); + req = lookup_request_mc(&osdc->map_checks, tid); + if (!req) { + dout("%s tid %llu dne\n", __func__, tid); + goto out_unlock; + } + + dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__, + req, req->r_tid, req->r_map_dne_bound, greq->u.newest); + if (!req->r_map_dne_bound) + req->r_map_dne_bound = greq->u.newest; + erase_request_mc(&osdc->map_checks, req); + check_pool_dne(req); + + ceph_osdc_put_request(req); +out_unlock: + up_write(&osdc->lock); +} + +static void send_map_check(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_request *lookup_req; + int ret; + + verify_osdc_wrlocked(osdc); + + lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); + if (lookup_req) { + WARN_ON(lookup_req != req); + return; + } + + ceph_osdc_get_request(req); + insert_request_mc(&osdc->map_checks, req); + ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", + map_check_cb, req->r_tid); + WARN_ON(ret); +} + /* * lingering requests, watch/notify v2 infrastructure */ @@ -1745,6 +1872,7 @@ static void linger_release(struct kref *kref) lreq->reg_req, lreq->ping_req); WARN_ON(!RB_EMPTY_NODE(&lreq->node)); WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); + WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node)); WARN_ON(!list_empty(&lreq->scan_item)); WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); @@ -1783,6 +1911,7 @@ linger_alloc(struct ceph_osd_client *osdc) mutex_init(&lreq->lock); RB_CLEAR_NODE(&lreq->node); RB_CLEAR_NODE(&lreq->osdc_node); + RB_CLEAR_NODE(&lreq->mc_node); INIT_LIST_HEAD(&lreq->scan_item); INIT_LIST_HEAD(&lreq->pending_lworks); init_completion(&lreq->reg_commit_wait); @@ -1797,6 +1926,7 @@ linger_alloc(struct ceph_osd_client *osdc) DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) +DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node) /* * Create linger request <-> OSD session relation. @@ -2193,6 +2323,23 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) send_linger(lreq); } +static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_linger_request *lookup_lreq; + + verify_osdc_wrlocked(osdc); + + lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, + lreq->linger_id); + if (!lookup_lreq) + return; + + WARN_ON(lookup_lreq != lreq); + erase_linger_mc(&osdc->linger_map_checks, lreq); + linger_put(lreq); +} + /* * @lreq has to be both registered and linked. */ @@ -2202,6 +2349,7 @@ static void __linger_cancel(struct ceph_osd_linger_request *lreq) cancel_linger_request(lreq->ping_req); if (lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); + cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); linger_unregister(lreq); } @@ -2216,6 +2364,89 @@ static void linger_cancel(struct ceph_osd_linger_request *lreq) up_write(&osdc->lock); } +static void send_linger_map_check(struct ceph_osd_linger_request *lreq); + +static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osdmap *map = osdc->osdmap; + + verify_osdc_wrlocked(osdc); + WARN_ON(!map->epoch); + + if (lreq->register_gen) { + lreq->map_dne_bound = map->epoch; + dout("%s lreq %p linger_id %llu pool disappeared\n", __func__, + lreq, lreq->linger_id); + } else { + dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n", + __func__, lreq, lreq->linger_id, lreq->map_dne_bound, + map->epoch); + } + + if (lreq->map_dne_bound) { + if (map->epoch >= lreq->map_dne_bound) { + /* we had a new enough map */ + pr_info("linger_id %llu pool does not exist\n", + lreq->linger_id); + linger_reg_commit_complete(lreq, -ENOENT); + __linger_cancel(lreq); + } + } else { + send_linger_map_check(lreq); + } +} + +static void linger_map_check_cb(struct ceph_mon_generic_request *greq) +{ + struct ceph_osd_client *osdc = &greq->monc->client->osdc; + struct ceph_osd_linger_request *lreq; + u64 linger_id = greq->private_data; + + WARN_ON(greq->result || !greq->u.newest); + + down_write(&osdc->lock); + lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id); + if (!lreq) { + dout("%s linger_id %llu dne\n", __func__, linger_id); + goto out_unlock; + } + + dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n", + __func__, lreq, lreq->linger_id, lreq->map_dne_bound, + greq->u.newest); + if (!lreq->map_dne_bound) + lreq->map_dne_bound = greq->u.newest; + erase_linger_mc(&osdc->linger_map_checks, lreq); + check_linger_pool_dne(lreq); + + linger_put(lreq); +out_unlock: + up_write(&osdc->lock); +} + +static void send_linger_map_check(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_linger_request *lookup_lreq; + int ret; + + verify_osdc_wrlocked(osdc); + + lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, + lreq->linger_id); + if (lookup_lreq) { + WARN_ON(lookup_lreq != lreq); + return; + } + + linger_get(lreq); + insert_linger_mc(&osdc->linger_map_checks, lreq); + ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", + linger_map_check_cb, lreq->linger_id); + WARN_ON(ret); +} + static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) { int ret; @@ -2677,10 +2908,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) return; fail_request: - req->r_result = -EIO; - __finish_request(req); - __complete_request(req); - complete_all(&req->r_safe_completion); + complete_request(req, -EIO); out_unlock_session: mutex_unlock(&osd->lock); out_unlock_osdc: @@ -2764,6 +2992,7 @@ static void scan_requests(struct ceph_osd *osd, /* fall through */ case CALC_TARGET_NEED_RESEND: + cancel_linger_map_check(lreq); /* * scan_requests() for the previous epoch(s) * may have already added it to the list, since @@ -2773,6 +3002,7 @@ static void scan_requests(struct ceph_osd *osd, list_add_tail(&lreq->scan_item, need_resend_linger); break; case CALC_TARGET_POOL_DNE: + check_linger_pool_dne(lreq); break; } } @@ -2782,7 +3012,7 @@ static void scan_requests(struct ceph_osd *osd, rb_entry(n, struct ceph_osd_request, r_node); enum calc_target_result ct_res; - n = rb_next(n); /* unlink_request() */ + n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); ct_res = calc_target(osdc, &req->r_t, @@ -2799,10 +3029,12 @@ static void scan_requests(struct ceph_osd *osd, /* fall through */ case CALC_TARGET_NEED_RESEND: + cancel_map_check(req); unlink_request(osd, req); insert_request(need_resend, req); break; case CALC_TARGET_POOL_DNE: + check_pool_dne(req); break; } } @@ -3655,6 +3887,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; osdc->linger_requests = RB_ROOT; + osdc->map_checks = RB_ROOT; + osdc->linger_map_checks = RB_ROOT; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); @@ -3720,6 +3954,8 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) WARN_ON(!list_empty(&osdc->osd_lru)); WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks)); WARN_ON(atomic_read(&osdc->num_requests)); WARN_ON(atomic_read(&osdc->num_homeless)); From b4f34795697de9d1ee84a10c8439b4b9970ebd96 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: [PATCH 38/71] libceph: take osdc->lock in osdmap_show() and dump flags in hex There is now about a dozen CEPH_OSDMAP_* flags. This is a debugging interface, so just dump in hex instead of spelling each flag out. Signed-off-by: Ilya Dryomov --- net/ceph/debugfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 39f91c7250f6e8..6e434c75cd08c1 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -54,16 +54,15 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; - struct ceph_osdmap *map = client->osdc.osdmap; + struct ceph_osd_client *osdc = &client->osdc; + struct ceph_osdmap *map = osdc->osdmap; struct rb_node *n; if (map == NULL) return 0; - seq_printf(s, "epoch %d\n", map->epoch); - seq_printf(s, "flags%s%s\n", - (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", - (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); + down_read(&osdc->lock); + seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = @@ -105,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p) pg->pgid.seed, pg->primary_temp.osd); } + up_read(&osdc->lock); return 0; } From 7cca78c9dcd1afa243e46edc31896730df85d2b5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:28 +0200 Subject: [PATCH 39/71] libceph: replace ceph_monc_request_next_osdmap() ... with a wrapper around maybe_request_map() - no need for two osdmap-specific functions. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 1 + net/ceph/mon_client.c | 14 -------------- net/ceph/osd_client.c | 7 +++++++ 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8eae6f56194dd0..81666a56415e2b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4902,7 +4902,7 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) return ret; if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { - ceph_monc_request_next_osdmap(&rbdc->client->monc); + ceph_osdc_maybe_request_map(&rbdc->client->osdc); (void) ceph_monc_wait_osdmap(&rbdc->client->monc, newest_epoch, opts->mount_timeout); diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 19800d9b45f3e5..1d730993c3f8f7 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -128,7 +128,6 @@ bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); void ceph_monc_renew_subs(struct ceph_mon_client *monc); -extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3e7bf72e407815..19b14862d3e0d9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, extern void ceph_osdc_sync(struct ceph_osd_client *osdc); extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 4e49b229692043..72a910bf7819d7 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -384,20 +384,6 @@ void ceph_monc_renew_subs(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_renew_subs); -/* - * Register interest in the next osdmap - */ -void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) -{ - dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); - mutex_lock(&monc->mutex); - if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, - monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) - __send_subscribe(monc); - mutex_unlock(&monc->mutex); -} -EXPORT_SYMBOL(ceph_monc_request_next_osdmap); - /* * Wait for an osdmap with a given epoch. * diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ece2d10a12081f..55cafd3a2ff084 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3869,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) } EXPORT_SYMBOL(ceph_osdc_flush_notifies); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc) +{ + down_read(&osdc->lock); + maybe_request_map(osdc); + up_read(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_maybe_request_map); /* * init, shutdown From 737cc81ead34bcef0b1f6ea8322228e4378cf21a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 00:05:01 +0200 Subject: [PATCH 40/71] libceph: support for subscribing to "mdsmap." maps Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 ++ include/linux/ceph/mon_client.h | 1 + net/ceph/debugfs.c | 1 + net/ceph/mon_client.c | 18 +++++++++++++----- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 3b911ff889ddb4..bae833d0d05550 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -208,6 +208,8 @@ struct ceph_mon_subscribe_ack { struct ceph_fsid fsid; } __attribute__ ((packed)); +#define CEPH_FS_CLUSTER_ID_NONE -1 + /* * mdsmap flags */ diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 1d730993c3f8f7..e2a92df08b47d2 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -96,6 +96,7 @@ struct ceph_mon_client { bool want; u32 have; /* epoch */ } subs[3]; + int fs_cluster_id; /* "mdsmap." sub */ #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6e434c75cd08c1..e77b04ca7802c0 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -128,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p) CEPH_SUBSCRIBE_ONETIME ? "" : "+")); seq_putc(s, '\n'); } + seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id); for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { __u16 op; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 72a910bf7819d7..37c38a7fb5c586 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc) BUG_ON(num < 1); /* monmap sub is always there */ ceph_encode_32(&p, num); for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { - const char *s = ceph_sub_str[i]; + char buf[32]; + int len; if (!monc->subs[i].want) continue; - dout("%s %s start %llu flags 0x%x\n", __func__, s, + len = sprintf(buf, "%s", ceph_sub_str[i]); + if (i == CEPH_SUB_MDSMAP && + monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE) + len += sprintf(buf + len, ".%d", monc->fs_cluster_id); + + dout("%s %s start %llu flags 0x%x\n", __func__, buf, le64_to_cpu(monc->subs[i].item.start), monc->subs[i].item.flags); - ceph_encode_string(&p, end, s, strlen(s)); + ceph_encode_string(&p, end, buf, len); memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); p += sizeof(monc->subs[i].item); } - BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ceph_msg_revoke(msg); @@ -948,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) if (!monc->m_subscribe_ack) goto out_auth; - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS, true); if (!monc->m_subscribe) goto out_subscribe_ack; @@ -974,6 +980,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->generic_request_tree = RB_ROOT; monc->last_tid = 0; + monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE; + return 0; out_auth_reply: From 235a09821c2bc71d9d07f12217ce2ac00db99eba Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 30 Mar 2016 17:18:34 +0800 Subject: [PATCH 41/71] ceph: multiple filesystem support To access non-default filesystem, we just need to subscribe to mdsmap. and add a new mount option for mds namespace id. Signed-off-by: Yan, Zheng [idryomov@gmail.com: switch to a new libceph API] Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 9 +++++++++ fs/ceph/super.h | 1 + 2 files changed, 10 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f12d5e2955c223..452770dc9fc16f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { + Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -143,6 +144,7 @@ enum { }; static match_table_t fsopt_tokens = { + {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ + case Opt_mds_namespace: + fsopt->mds_namespace = intval; + break; case Opt_wsize: fsopt->wsize = intval; break; @@ -367,6 +372,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); + fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". @@ -457,6 +463,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) + seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -530,6 +538,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e705c4d612d763..db2200f5ba8097 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below From d463a43d69f4af85887671d76182437775fd1631 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 31 Mar 2016 15:53:01 +0800 Subject: [PATCH 42/71] ceph: CEPH_FEATURE_MDSENC support Signed-off-by: Yan, Zheng --- fs/ceph/mdsmap.c | 43 ++++++++++++++++++++++++++++++++++--------- fs/ceph/super.c | 5 ++--- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d0f..8c3591a7fbaeef 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err = -EINVAL; - u16 version; + u8 mdsmap_v, mdsmap_cv; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_16_safe(p, end, version, bad); - if (version > 3) { - pr_warn("got mdsmap version %d > 3, failing", version); - goto bad; + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + mdsmap_cv = ceph_decode_8(p); + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); @@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 namelen; s32 mds, inc, state; u64 state_seq; - u8 infoversion; + u8 info_v; + void *info_end = NULL; struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; - ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); - infoversion = ceph_decode_8(p); + info_v= ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + u8 info_cv; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + info_cv = ceph_decode_8(p); + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); *p += sizeof(u64); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; @@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; - if (infoversion >= 2) { + if (info_v >= 2) { ceph_decode_32_safe(p, end, num_export_targets, bad); pexport_targets = *p; *p += num_export_targets * sizeof(u32); @@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) num_export_targets = 0; } + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), @@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ + *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 452770dc9fc16f..d714ab20ad24e2 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -519,9 +519,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, { struct ceph_fs_client *fsc; const u64 supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDS_INLINE_DATA; + CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; From 77310320c299b0dc050037ff8fc29fd1861fb005 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 8 Apr 2016 15:27:16 +0800 Subject: [PATCH 43/71] ceph: renew caps for read/write if mds session got killed. When mds session gets killed, read/write operation may hang. Client waits for Frw caps, but mds does not know what caps client wants. To recover this, client sends an open request to mds. The request will tell mds what caps client wants. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 43 ++++++++++++++++++++++++++--------- fs/ceph/file.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 6 ++++- fs/ceph/super.h | 2 ++ 4 files changed, 93 insertions(+), 11 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbcabc..fab93c66d879f5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2317,7 +2317,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2412,12 +2412,26 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -3226,6 +3247,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 30fd49eb25b4fe..996e9ec81e4eb0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) return ret; } +/* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + /* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cff85af425d4ea..1e5965d17cb1b1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1133,6 +1133,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); if (!n) @@ -1181,7 +1183,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (drop) iput(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index db2200f5ba8097..5fef3a6397db65 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -470,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -932,6 +933,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, From 6c93df5db628e710697c43bc1bd78a786549a548 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Apr 2016 13:56:12 +0800 Subject: [PATCH 44/71] ceph: don't call truncate_pagecache in ceph_writepages_start truncate_pagecache() may decrease inode's reference. This can cause deadlock if inode's last reference is dropped and iput_final() wants to evict the inode. (evict() calls inode_wait_for_writeback(), which waits for ceph_writepages_start() to return). The fix is use work thead to truncate dirty pages. Also add 'forced umount' check to ceph_update_writeable_page(), which prevents new pages getting dirty. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 14 ++++++++++++-- fs/ceph/inode.c | 11 +++++++++++ fs/ceph/mds_client.c | 22 +++++++++++++++------- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f4741847762947..d52e3bcfda7c0c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -715,8 +715,11 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - pr_warn("writepage_start %p on forced umount\n", inode); - truncate_pagecache(inode, 0); + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } @@ -1127,6 +1130,7 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); loff_t page_off = pos & PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK; @@ -1135,6 +1139,12 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + unlock_page(page); + return -EIO; + } + retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index edfade03773888..b906e02cddada9 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1623,10 +1623,21 @@ static void ceph_invalidate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_pg_inv_work); struct inode *inode = &ci->vfs_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); + + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1e5965d17cb1b1..cbe6c0afdadc73 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1120,9 +1120,11 @@ static int iterate_session_caps(struct ceph_mds_session *session, static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - int drop = 0; + bool drop = false; + bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1130,11 +1132,14 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + if (ci->i_wrbuffer_ref > 0 && + ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + invalidate = true; + while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); if (!n) @@ -1156,7 +1161,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = 1; + drop = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1166,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = 1; + drop = true; } spin_unlock(&mdsc->cap_dirty_lock); @@ -1185,6 +1190,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); if (drop) iput(inode); return 0; @@ -1195,12 +1202,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct ceph_mds_session *session) { + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + iterate_session_caps(session, remove_session_caps_cb, fsc); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { - struct super_block *sb = session->s_mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; From aeda081c5eb3d7668b64e2dc9bfc0bf30ef53632 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 18 Apr 2016 16:51:37 +0800 Subject: [PATCH 45/71] ceph: don't show symlink target in debugfs/mdsc symlink target is useless for debug and can be very long. It's annoying to show it in debugfs/mdsc. Signed-off-by: Yan, Zheng --- fs/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed287..39ff678e567fcb 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); - } else if (req->r_path2) { + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); From 3f38495409b613071021fca86629df7ae81820ad Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 21 Apr 2016 11:09:55 +0800 Subject: [PATCH 46/71] ceph: report mount root in session metadata Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 4 +++- fs/ceph/super.c | 33 +++++++++++++++++++-------------- fs/ceph/super.h | 1 + 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cbe6c0afdadc73..047f723bdbe62a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -839,12 +839,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d714ab20ad24e2..91e02481ce0684 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -302,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -333,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -386,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -401,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -793,8 +799,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -823,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -943,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -952,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -995,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5fef3a6397db65..0ea86406f463cd 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -70,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { From 04303d8ad06985bed4ea4f46018fc8f55a0962a8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 21 Apr 2016 12:11:54 +0800 Subject: [PATCH 47/71] ceph: use CEPH_MDS_OP_RMXATTR request to remove xattr Setxattr with NULL value and XATTR_REPLACE flag should be equivalent to removexattr. But current MDS does not support deleting vxattrs through MDS_OP_SETXATTR request. The workaround is sending MDS_OP_RMXATTR request if setxattr actually removs xattr. Signed-off-by: Yan, Zheng --- fs/ceph/xattr.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5afabc4bf4c713..426c83cc683a32 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -886,6 +886,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; if (size > 0) { @@ -899,20 +900,21 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, if (err) goto out; } else if (!value) { - flags |= CEPH_XATTR_REMOVE; + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { ceph_mdsc_put_request(req); @@ -920,8 +922,11 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, goto out; } - req->r_pagelist = pagelist; - pagelist = NULL; + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } req->r_inode = inode; ihold(inode); From c530cd24c23818af8da35fe556f9a7754211e50a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 17:43:35 +0800 Subject: [PATCH 48/71] ceph: search cache postion for dcache readdir use binary search to find cache index that corresponds to readdir postion. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 129 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 46 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4fb2bbc2a2722a..cdea450c59026b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -110,6 +110,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -129,62 +173,57 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; - unsigned nsize = PAGE_SIZE / sizeof(struct dentry *); - int err = 0; - loff_t ptr_pos = 0; struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); - /* we can calculate cache index for the first dirfrag */ - if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { - cache_ctl.index = fpos_off(ctx->pos) - 2; - BUG_ON(cache_ctl.index < 0); - ptr_pos = cache_ctl.index * sizeof(struct dentry *); + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linar search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } + + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); } - while (true) { - pgoff_t pgoff; - bool emit_dentry; - if (ptr_pos >= i_size_read(dir)) { + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { fi->flags |= CEPH_F_ATEND; err = 0; break; } - - err = -EAGAIN; - pgoff = ptr_pos >> PAGE_SHIFT; - if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { - ceph_readdir_cache_release(&cache_ctl); - cache_ctl.page = find_lock_page(&dir->i_data, pgoff); - if (!cache_ctl.page) { - dout(" page %lu not found\n", pgoff); - break; - } - /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(cache_ctl.page); - cache_ctl.dentries = kmap(cache_ctl.page); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; } - rcu_read_lock(); - spin_lock(&parent->d_lock); - /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ - if (ceph_dir_is_complete_ordered(dir) && - ptr_pos < i_size_read(dir)) - dentry = cache_ctl.dentries[cache_ctl.index % nsize]; - else - dentry = NULL; - spin_unlock(&parent->d_lock); - if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) - dentry = NULL; - rcu_read_unlock(); - if (!dentry) - break; - - emit_dentry = false; di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && @@ -217,10 +256,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, } else { dput(dentry); } - - cache_ctl.index++; - ptr_pos += sizeof(struct dentry *); } +out: ceph_readdir_cache_release(&cache_ctl); if (last) { int ret; From 1cd42a429174689c0df1c37b642654a1ab4d1506 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 15:58:32 +0800 Subject: [PATCH 49/71] ceph: remove unnecessary checks in __dcache_readdir we never add snapdir and the hidden .ceph dir into readdir cache Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index cdea450c59026b..734508be1526bb 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -228,8 +228,6 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && - ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } From a78600e7c4fb47fb5ef34265456b731fde27a9c3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Apr 2016 17:32:34 +0800 Subject: [PATCH 50/71] ceph: simplify 'offset in frag' don't distinguish leftmost frag from other frags. always use 2 as first entry's offset. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 12 +++--------- fs/ceph/inode.c | 5 +---- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 734508be1526bb..31a2cdf3921034 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -414,10 +414,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; + fi->next_offset = 2; } else { err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], @@ -468,7 +465,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* more frags? */ if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); - off = 0; + off = 2; ctx->pos = ceph_make_fpos(frag, off); dout("readdir next frag is %x\n", frag); goto more; @@ -511,10 +508,7 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->last_name = NULL; fi->dir_release_count = 0; fi->readdir_cache_idx = -1; - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; /* compensate for . and .. */ - else - fi->next_offset = 0; + fi->next_offset = 2; /* compensate for . and .. */ fi->flags &= ~CEPH_F_ATEND; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b906e02cddada9..51ce5ce59f0400 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1405,10 +1405,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) - req->r_readdir_offset = 2; - else - req->r_readdir_offset = 0; + req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { From 2a5beea3f1b6544d6c72ea220e860a2eda2f9104 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 09:37:39 +0800 Subject: [PATCH 51/71] ceph: define struct for dir entry in readdir reply This avoids defining multiple arrays for entries in readdir reply Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 27 ++++++++++++--------------- fs/ceph/inode.c | 21 +++++++++++---------- fs/ceph/mds_client.c | 42 +++++++++++++++++++----------------------- fs/ceph/mds_client.h | 12 ++++++++---- 4 files changed, 50 insertions(+), 52 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 31a2cdf3921034..68530acea2c868 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -416,9 +416,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) fi->last_name = NULL; fi->next_offset = 2; } else { - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1], + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + err = note_last_dentry(fi, rde->name, rde->name_len, fi->next_offset + rinfo->dir_nr); if (err) return err; @@ -431,24 +431,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos = ceph_make_fpos(frag, off); while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (off - fi->offset); struct ceph_vino vino; ino_t ino; dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); + rde->name_len, rde->name, &rde->inode.in); + BUG_ON(!rde->inode.in); + ftype = le32_to_cpu(rde->inode.in->mode) >> 12; + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 51ce5ce59f0400..40d081d7028ed7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1308,12 +1308,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); in = ceph_get_inode(req->r_dentry->d_sb, vino); if (IS_ERR(in)) { @@ -1321,7 +1322,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { @@ -1433,14 +1434,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; + dname.name = rde->name; + dname.len = rde->name_len; dname.hash = full_name_hash(dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); @@ -1486,7 +1488,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } } - ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + ret = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (ret < 0) { @@ -1522,8 +1524,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, di = dn->d_fsdata; di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, + update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 047f723bdbe62a..6220d3caf7aba0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -186,12 +186,9 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - BUG_ON(!info->dir_in); - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); - if ((unsigned long)(info->dir_dlease + num) > - (unsigned long)info->dir_in + info->dir_buf_size) { + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err("dir contents are larger than expected\n"); WARN_ON(1); goto bad; @@ -199,19 +196,19 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_nr = num; while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = *p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; + rde->name_len = ceph_decode_32(p); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + rde->lease = *p; *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; i++; @@ -345,9 +342,9 @@ static int parse_reply_info(struct ceph_msg *msg, static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - if (!info->dir_in) + if (!info->dir_entries) return; - free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -1656,8 +1653,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; - size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + - sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + size_t size = sizeof(struct ceph_mds_reply_dir_entry); int order, num_entries; spin_lock(&ci->i_ceph_lock); @@ -1668,14 +1664,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, - order); - if (rinfo->dir_in) + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + order); + if (rinfo->dir_entries) break; order--; } - if (!rinfo->dir_in) + if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee69a537dba53b..0b84f9c0afa30a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -47,6 +47,13 @@ struct ceph_mds_reply_info_in { u32 pool_ns_len; }; +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -73,11 +80,8 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; u8 dir_complete, dir_end; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ From 956d39d631dbcf7b57854873a24e309047f2a7f5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Apr 2016 17:48:30 +0800 Subject: [PATCH 52/71] ceph: define 'end/complete' in readdir reply as bit flags Set a flag in readdir request, which indicates that client interprets 'end/complete' as bit flags. So that mds can reply additional flags in readdir reply. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 2 ++ fs/ceph/mds_client.c | 7 +++++-- fs/ceph/mds_client.h | 2 +- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 68530acea2c868..ebcbd1c946b4c3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -365,6 +365,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6220d3caf7aba0..1c2befcd24fb37 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,8 +181,11 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0b84f9c0afa30a..2a865812a41b49 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -80,7 +80,7 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - u8 dir_complete, dir_end; + bool dir_complete, dir_end; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bae833d0d05550..a811c5e98bfa37 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -347,6 +347,17 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_XATTR_REPLACE (1 << 1) #define CEPH_XATTR_REMOVE (1 << 31) +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ @@ -364,6 +375,7 @@ union ceph_mds_request_args { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; + __le16 flags; } __attribute__ ((packed)) readdir; struct { __le32 mode; From 8974eebd38737c9534d81c4131c5fdb1fe24d3e9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 15:17:40 +0800 Subject: [PATCH 53/71] ceph: record 'offset' for each entry of readdir result This is preparation for using hash value as dentry 'offset' Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 83 +++++++++++++++++++++++++++++--------------- fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 1 - 5 files changed, 59 insertions(+), 29 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ebcbd1c946b4c3..6ae635605be5ae 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -277,12 +277,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -294,7 +294,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -304,7 +303,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -320,7 +318,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if (err != -EAGAIN) return err; frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -386,12 +383,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, fi->next_offset); } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -399,7 +396,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -421,37 +419,54 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); err = note_last_dentry(fi, rde->name, rde->name_len, - fi->next_offset + rinfo->dir_nr); + fpos_off(rde->offset) + 1); if (err) return err; } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_dir_entry *rde = - rinfo->dir_entries + (off - fi->offset); + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, rde->name_len, rde->name, &rde->inode.in); + BUG_ON(!rde->inode.in); ftype = le32_to_cpu(rde->inode.in->mode) >> 12; vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); + if (!dir_emit(ctx, rde->name, rde->name_len, ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } @@ -464,8 +479,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* more frags? */ if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); - off = 2; - ctx->pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, 2); dout("readdir next frag is %x\n", frag); goto more; } @@ -497,7 +511,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -511,6 +525,23 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + if (new_pos == 0) + return true; + if (fpos_frag(new_pos) != fi->frag) + return true; + rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + return new_pos < rinfo->dir_entries[0].offset;; +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; @@ -539,13 +570,9 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ + if (need_reset_readdir(fi, offset)) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); + reset_readdir(fi); } else if (fpos_cmp(offset, old_offset) > 0) { /* reset dir_release_count if we did a forward seek */ fi->dir_release_count = 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 40d081d7028ed7..b53c95903aebb8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1523,6 +1523,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, di = dn->d_fsdata; di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + rde->offset = di->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1c2befcd24fb37..48def22fc7b940 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -214,6 +214,8 @@ static int parse_reply_info_dir(void **p, void *end, err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2a865812a41b49..4ce19d852657ca 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -52,6 +52,7 @@ struct ceph_mds_reply_dir_entry { u32 name_len; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; + loff_t offset; }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ea86406f463cd..0628099ba1f2de 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -635,7 +635,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; From 076c40f18d10489e29c515bf5936952830df5e16 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 22:56:44 +0800 Subject: [PATCH 54/71] ceph: don't forbid marking directory complete after forward seek Forward seek within same frag does not update fi->last_name, it will not affect contents of later readdir reply. So there is no need to forbid marking directory complete Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6ae635605be5ae..e954ea2fb71056 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -546,7 +546,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; inode_lock(inode); @@ -573,10 +572,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (need_reset_readdir(fi, offset)) { dout("dir_llseek dropping %p content\n", file); reset_readdir(fi); - } else if (fpos_cmp(offset, old_offset) > 0) { - /* reset dir_release_count if we did a forward seek */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; } } out: From f3c4ebe65ea149ec892f94474233cfebe9cbe299 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 11:27:30 +0800 Subject: [PATCH 55/71] ceph: using hash value to compose dentry offset If MDS sorts dentries in dirfrag in hash order, we use hash value to compose dentry offset. dentry offset is: (0xff << 52) | ((24 bits hash) << 28) | (the nth entry hash hash collision) This offset is stable across directory fragmentation. This alos means there is no need to reset readdir offset if directory get fragmented in the middle of readdir. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 140 ++++++++++++++++++++++++++--------- fs/ceph/inode.c | 31 ++++++-- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 4 +- fs/ceph/super.h | 6 +- include/linux/ceph/ceph_fs.h | 1 + 6 files changed, 136 insertions(+), 47 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e954ea2fb71056..4850c3624a873c 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -69,16 +69,42 @@ int ceph_init_dentry(struct dentry *dentry) } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -177,7 +203,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, u64 idx = 0; int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); /* search start position */ if (ctx->pos > 2) { @@ -234,7 +260,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -269,6 +295,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -276,7 +312,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); int i; int err; u32 ftype; @@ -317,7 +352,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -325,8 +359,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -336,6 +371,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -373,19 +415,23 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - fi->next_offset = req->r_readdir_offset; - /* adjust ctx->pos to beginning of frag */ - ctx->pos = ceph_make_fpos(frag, fi->next_offset); + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; @@ -411,23 +457,25 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - fi->next_offset = 2; - } else { + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(fi, rde->name, rde->name_len, - fpos_off(rde->offset) + 1); + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d pos %llx chunk first %llx\n", - frag, rinfo->dir_nr, ctx->pos, + fi->frag, rinfo->dir_nr, ctx->pos, rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; @@ -470,16 +518,26 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - ctx->pos = ceph_make_fpos(frag, 2); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -532,14 +590,21 @@ static void reset_readdir(struct ceph_file_info *fi) static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) { struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; if (new_pos == 0) return true; - if (fpos_frag(new_pos) != fi->frag) + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sotred in hash order */ + } else if (fi->frag |= fpos_frag(new_pos)) { return true; + } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; if (!rinfo || !rinfo->dir_nr) return true; - return new_pos < rinfo->dir_entries[0].offset;; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) @@ -562,17 +627,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (need_reset_readdir(fi, offset)) { - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); - } } out: inode_unlock(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b53c95903aebb8..f51b6fd5f570e8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1387,6 +1387,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,19 +1395,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - req->r_readdir_offset = 2; + if (!rinfo->hash_order) + req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1424,13 +1433,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { @@ -1444,6 +1453,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } + retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1521,9 +1542,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - rde->offset = di->offset; + ceph_dentry(dn)->offset = rde->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 48def22fc7b940..7ad31283d510bf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -185,6 +185,7 @@ static int parse_reply_info_dir(void **p, void *end, u16 flags = ceph_decode_16(p); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4ce19d852657ca..e7d38aac71093f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -81,7 +81,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0628099ba1f2de..c9b671dfff810b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -540,11 +540,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -949,6 +944,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index a811c5e98bfa37..dfce616002ad97 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -357,6 +357,7 @@ extern const char *ceph_mds_op_name(int op); */ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) union ceph_mds_request_args { struct { From 209ae762a64397557be2bc48490fd968912e9a71 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 23:40:23 +0800 Subject: [PATCH 56/71] ceph: fix inode reference leak Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f51b6fd5f570e8..4889aaa72accb2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1328,8 +1328,8 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; - continue; } + iput(in); } return err; From a407846ef7c6e7905ab0d1ebc0c86fbebb065b87 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 3 May 2016 20:55:50 +0800 Subject: [PATCH 57/71] ceph: don't assume frag tree splits in mds reply are sorted The algorithm that updates i_fragtree relies on that the frag tree splits in mds reply are of the same order of i_fragtree. This is not true because current MDS encodes frag tree splits in ascending order of (unsigned)frag_t. But nodes in i_fragtree are sorted according to ceph_frag_compare(). The fix is sort the frag tree splits first, then updates i_fragtree. Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4889aaa72accb2..d91eb6b492593d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -299,6 +300,13 @@ static int ceph_fill_dirfrag(struct inode *inode, return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(ls->frag, rs->frag); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) @@ -331,6 +339,11 @@ static int ceph_fill_fragtree(struct inode *inode, if (!update) goto out_unlock; + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { From 421721195a68f46a8218c664154076b5b06f5f51 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 3 May 2016 22:33:20 +0800 Subject: [PATCH 58/71] ceph: fix dir_auth check in ceph_fill_dirfrag() -1 is CDIR_AUTH_PARENT, it means dir's auth mds is the same as inode's auth mds Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d91eb6b492593d..562f57d15f46da 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -254,6 +254,9 @@ static int ceph_fill_dirfrag(struct inode *inode, diri_auth = ci->i_auth_cap->mds; spin_unlock(&ci->i_ceph_lock); + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ From a4b7431f39438f415cef1fa28502ec58016686f7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 May 2016 11:05:10 +0800 Subject: [PATCH 59/71] ceph: keep leaf frag when updating fragtree Nodes in i_fragtree are sorted according to ceph_compare_frag(). It means frag node in i_fragtree always follow its direct parent node. To check if a leaf node is valid, we just need to check if it's child of previous split node. Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 562f57d15f46da..09713f32ea88e1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -310,12 +310,21 @@ static int frag_tree_split_cmp(const void *l, const void *r) return ceph_frag_compare(ls->frag, rs->frag); } +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_inode_frag *frag; + struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; int i; u32 id, nsplits; @@ -362,8 +371,12 @@ static int ceph_fill_fragtree(struct inode *inode, break; } rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } frag = NULL; } if (!frag) { @@ -373,12 +386,17 @@ static int ceph_fill_fragtree(struct inode *inode, } frag->split_by = le32_to_cpu(fragtree->splits[i].by); dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; } while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } } out_unlock: mutex_unlock(&ci->i_fragtree_mutex); From 1b1bc16d66a7c7af3b4f30d1cf5a363168b217f4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 May 2016 11:40:30 +0800 Subject: [PATCH 60/71] ceph: improve fragtree change detection check if number of splits in i_fragtree is equal to number of splits in mds reply Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 24 ++++++++++++++++++++---- fs/ceph/super.h | 1 + 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 09713f32ea88e1..7ba8e1c2557bbb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -326,13 +326,15 @@ static int ceph_fill_fragtree(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -360,6 +362,13 @@ static int ceph_fill_fragtree(struct inode *inode, rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -375,6 +384,8 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by > 0 || !is_frag_child(frag->frag, prev_frag)) { rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; kfree(frag); } frag = NULL; @@ -384,7 +395,9 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); prev_frag = frag; } @@ -395,6 +408,8 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by > 0 || !is_frag_child(frag->frag, prev_frag)) { rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; kfree(frag); } } @@ -546,6 +561,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c9b671dfff810b..a268f18d21190c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -297,6 +297,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; From 224a7542b8fdde3cc7c600f8b0870c5541a9f678 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 5 May 2016 16:40:17 +0800 Subject: [PATCH 61/71] ceph: tolerate bad i_size for symlink inode A mds bug can cause symlink's size to be truncated to zero. Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7ba8e1c2557bbb..89d08155986dfc 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -582,6 +582,11 @@ int ceph_drop_inode(struct inode *inode) return 1; } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -604,7 +609,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, size = 0; } i_size_write(inode, size); - inode->i_blocks = (size + (1<<9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -863,9 +868,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != i_size_read(inode))) - goto out; + if (symlen != i_size_read(inode)) { + pr_err("fill_inode %llx.%llx BAD symlink " + "size %lld\n", ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } err = -ENOMEM; sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); @@ -1629,7 +1638,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); i_size_write(inode, size); - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -2002,8 +2011,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(attr->ia_size); inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; From 3b33f692c84c28cc8178aaeeb9264d82b48787f1 Mon Sep 17 00:00:00 2001 From: Zhang Zhuoyu Date: Fri, 25 Mar 2016 05:18:39 -0400 Subject: [PATCH 62/71] ceph: make logical calculation functions return bool This patch makes serverl logical caculation functions return bool to improve readability due to these particular functions only using 0/1 as their return value. No functional change. Signed-off-by: Zhang Zhuoyu --- fs/ceph/cache.c | 2 +- fs/ceph/dir.c | 2 +- include/linux/ceph/ceph_frag.h | 4 ++-- include/linux/ceph/decode.h | 2 +- include/linux/ceph/osdmap.h | 6 +++--- net/ceph/ceph_common.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc95..c052b5bf219b54 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && (ci->i_fscache_gen == ci->i_rdcache_gen)); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4850c3624a873c..f6279a1bd6ec0f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -710,7 +710,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index b827e066e55a19..146507df8650c0 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) return ceph_frag_make(newbits, ceph_frag_value(f) | (i << (24 - newbits))); } -static inline int ceph_frag_is_leftmost(__u32 f) +static inline bool ceph_frag_is_leftmost(__u32 f) { return ceph_frag_value(f) == 0; } -static inline int ceph_frag_is_rightmost(__u32 f) +static inline bool ceph_frag_is_rightmost(__u32 f) { return ceph_frag_value(f) == ceph_frag_mask(f); } diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index a6ef9cc267ec2c..19e9932f3e7719 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) /* * bounds check input. */ -static inline int ceph_has_room(void **p, void *end, size_t n) +static inline bool ceph_has_room(void **p, void *end, size_t n) { return end >= *p && n <= end - *p; } diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 821e16fff39af7..ddc426b22d8159 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -172,19 +172,19 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && (map->osd_state[osd] & CEPH_OSD_EXISTS); } -static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return ceph_osd_exists(map, osd) && (map->osd_state[osd] & CEPH_OSD_UP); } -static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) { return !ceph_osd_is_up(map, osd); } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index dcc18c6f7cf9b9..55d2bfee16d798 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client); /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_and_osd_map(struct ceph_client *client) +static bool have_mon_and_osd_map(struct ceph_client *client) { return client->monc.monmap && client->monc.monmap->epoch && client->osdc.osdmap && client->osdc.osdmap->epoch; From 4f7e89f6ace0f6cd2f20110efd2d405e26bcbf31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 May 2016 18:40:28 +0800 Subject: [PATCH 63/71] ceph: block non-fatal signals for fault/page_mkwrite Fault and page_mkwrite are supposed to be uninterruptable. But they call ceph functions that are interruptible. So they should block signals before calling functions that are interruptible Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 66 +++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d52e3bcfda7c0c..3e204b9ff9f7b1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1315,6 +1315,17 @@ const struct address_space_operations ceph_aops = { .direct_IO = ceph_direct_io, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops @@ -1327,6 +1338,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_SHIFT; int want, got, ret; + sigset_t oldset; + + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); @@ -1334,16 +1348,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, - -1, &got, &pinned_page); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - return VM_FAULT_SIGBUS; - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_restore; } dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1361,7 +1371,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ceph_put_cap_refs(ci, got); if (ret != -EAGAIN) - return ret; + goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { @@ -1375,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; - goto out; + goto out_inline; } ret1 = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); @@ -1383,7 +1393,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unlock_page(page); put_page(page); ret = VM_FAULT_SIGBUS; - goto out; + goto out_inline; } if (ret1 < PAGE_SIZE) zero_user_segment(page, ret1, PAGE_SIZE); @@ -1392,10 +1402,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_SIZE, ret); } -out: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", - inode, off, (size_t)PAGE_SIZE, ret); +out_restore: + ceph_restore_sigs(&oldset); return ret; } @@ -1413,11 +1425,14 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size = i_size_read(inode); size_t len; int want, got, ret; + sigset_t oldset; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return VM_FAULT_SIGBUS; + ceph_block_sigs(&oldset); + if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; if (off == 0) { @@ -1444,17 +1459,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - ret = VM_FAULT_SIGBUS; - goto out_free; - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_free; } dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); @@ -1499,6 +1510,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: + ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); return ret; From 6ce026e411c4f36c9e51189d28a5dd9d08095b9d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 May 2016 18:59:13 +0800 Subject: [PATCH 64/71] ceph: make fault/page_mkwrite return VM_FAULT_OOM for -ENOMEM Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e204b9ff9f7b1..d128bb65746d7c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1351,10 +1351,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) got = 0; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_restore; - } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1392,7 +1391,10 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret1 < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - ret = VM_FAULT_SIGBUS; + if (ret1 < 0) + ret = ret1; + else + ret = VM_FAULT_SIGBUS; goto out_inline; } if (ret1 < PAGE_SIZE) @@ -1408,6 +1410,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } out_restore: ceph_restore_sigs(&oldset); + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return ret; } @@ -1429,7 +1434,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; ceph_block_sigs(&oldset); @@ -1442,10 +1447,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } } if (off + PAGE_SIZE <= size) @@ -1463,10 +1466,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) got = 0; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); @@ -1475,10 +1477,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) { + if ((off > size) || (page->mapping != inode->i_mapping)) { unlock_page(page); + ret = VM_FAULT_NOPAGE; goto out; } @@ -1487,11 +1488,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; - } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; } out: if (ret == VM_FAULT_LOCKED || @@ -1512,7 +1508,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_free: ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; return ret; } From f0b33df57a5f03c637f75ead7cb4d978c59cc63d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 May 2016 19:09:06 +0800 Subject: [PATCH 65/71] ceph: handle -EAGAIN returned by ceph_update_writeable_page() when ceph_update_writeable_page() return -EAGAIN, caller should lock the page and call ceph_update_writeable_page() again. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d128bb65746d7c..97ee5d1fbb61fc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1475,21 +1475,23 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update time before taking page lock */ file_update_time(vma->vm_file); - lock_page(page); + do { + lock_page(page); - if ((off > size) || (page->mapping != inode->i_mapping)) { - unlock_page(page); - ret = VM_FAULT_NOPAGE; - goto out; - } + if ((off > size) || (page->mapping != inode->i_mapping)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret >= 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + } + } while (ret == -EAGAIN); - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); - ret = VM_FAULT_LOCKED; - } -out: if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; From 0e76abf21e769245b6eebb27b439ad014ac49292 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 11:04:33 +0800 Subject: [PATCH 66/71] libceph: make ceph_osdc_wait_request() uninterruptible Ceph_osdc_wait_request() is used when cephfs issues sync IO. In most cases, the sync IO should be uninterruptible. The fix is use killale wait function in ceph_osdc_wait_request(). Signed-off-by: Yan, Zheng --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 55cafd3a2ff084..0160d7d09a1e97 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3454,7 +3454,7 @@ static int wait_request_timeout(struct ceph_osd_request *req, long left; dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - left = wait_for_completion_interruptible_timeout(&req->r_completion, + left = wait_for_completion_killable_timeout(&req->r_completion, ceph_timeout_jiffies(timeout)); if (left <= 0) { left = left ?: -ETIMEDOUT; From a78bbd4b29c29784f0addb5e3b35790c7ed178ae Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 11:30:24 +0800 Subject: [PATCH 67/71] ceph: make ceph_update_writeable_page() uninterruptible ceph_update_writeable_page() is used by ceph_write_begin(). It beaks atomicity of write operation if it's interruptible. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 97ee5d1fbb61fc..4aa8e375e648bc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1168,7 +1168,7 @@ static int ceph_update_writeable_page(struct file *file, snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, + r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); if (r == -ERESTARTSYS) From ad15ec06e51b6eb73981428109e32c75cbad7d3d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 17:29:51 +0800 Subject: [PATCH 68/71] ceph: handle interrupted ceph_writepage() writepage() can be interrupted when it's called by direct memory reclaimer (the direct memory relaimer is killed). To avoid lossing data, we redirty the page. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4aa8e375e648bc..080a9cab3ee17f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -544,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { - dout("writepage setting page/mapping error %d %p\n", err, page); + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + goto out; + } + dout("writepage setting page/mapping error %d %p\n", + err, page); SetPageError(page); mapping_set_error(&inode->i_data, err); - if (wbc) - wbc->pages_skipped++; + wbc->pages_skipped++; } else { dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ @@ -569,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } unlock_page(page); iput(inode); return err; } - /* * lame release_pages helper. release_pages() isn't exported to * modules. From b109eec6f4332bd517e2f41e207037c4b9065094 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 17:54:17 +0800 Subject: [PATCH 69/71] ceph: SetPageError() for writeback pages if writepages fails Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 080a9cab3ee17f..8fa1f91cbf16e4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -626,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); if (rc < 0) mapping_set_error(mapping, rc); @@ -661,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + if (rc < 0) + SetPageError(page); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); From 9abd4db713704aac146395e079224ddd716e9b95 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 May 2016 20:58:26 +0800 Subject: [PATCH 70/71] ceph: don't use truncate_pagecache() to invalidate read cache truncate_pagecache() drops dirty pages, it's dangerous to use it to invalidate read cache. Besides, we shouldn't start invalidating read cache while there are buffer writers. Because buffer writers may add dirty pages later. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 8 ++++---- fs/ceph/inode.c | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index fab93c66d879f5..c17b5d76d75ee9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1656,7 +1656,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, */ if ((!is_delayed || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ @@ -1698,8 +1698,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, revoking = cap->implemented & ~cap->issued; dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap_used), + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -2828,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 89d08155986dfc..07495ba61fe3dd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1728,7 +1728,9 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_pagecache(inode, 0); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && From e536030934aebf049fe6aaebc58dd37aeee21840 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 19 May 2016 19:15:19 +0800 Subject: [PATCH 71/71] ceph: fix wake_up_session_cb() We should reset i_requested_max_size before waking the waiters. (zero i_requested_max_size make waiter re-request the max size) Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7ad31283d510bf..2103b823bec078 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1258,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } + wake_up_all(&ci->i_cap_wq); return 0; }