From d3b6bfd8973f27ee805b920570cb08c25f0eb809 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Mon, 12 Oct 2020 01:23:13 +0200
Subject: [PATCH] rbd: add rbd_img_fill_cmp_and_write_from_bvecs
References: bsc#1177090
Patch-mainline: Not yet, SES clustered LIO/RBD
For compound compare-and-write requests, we need to fill OSD requests
with two separate data buffers - one for the cmpext op and one for the
write op (which is conditional on cmpext success).
These dual_bvec helpers provide this functionality.
Signed-off-by: David Disseldorp <ddiss@suse.de>
Reviewed-by: Roman Penyaev <rpenyaev@suse.com>
Reviewed-by: Luis Henriques <lhenriques@suse.com>
---
drivers/block/rbd.c | 205 +++++++++++++++++++++++++++++++++++-
include/linux/ceph/librbd.h | 3 +
2 files changed, 205 insertions(+), 3 deletions(-)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 87cf752dce85..e447ab95aed9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -215,6 +215,11 @@ struct rbd_obj_request {
struct ceph_bvec_iter bvec_pos;
u32 bvec_count;
u32 bvec_idx;
+ /*
+ * iter for cmp portion of compare and write, which
+ * refers to bvecs in @bvec_pos.
+ */
+ struct bvec_iter cmp_bvec_iter;
};
};
@@ -2296,6 +2301,8 @@ union rbd_img_fill_iter {
struct ceph_bvec_iter bvec_iter;
};
+typedef void (*reset_fill_ctx_iter_fn_t)(union rbd_img_fill_iter *iter);
+
struct rbd_img_fill_ctx {
enum obj_request_type pos_type;
union rbd_img_fill_iter *pos;
@@ -2303,6 +2310,7 @@ struct rbd_img_fill_ctx {
ceph_object_extent_fn_t set_pos_fn;
ceph_object_extent_fn_t count_fn;
ceph_object_extent_fn_t copy_fn;
+ reset_fill_ctx_iter_fn_t reset_iter_fn;
};
static struct ceph_object_extent *alloc_object_extent(void *arg)
@@ -2344,7 +2352,10 @@ static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
* Create object requests and set each object request's starting
* position in the provided bio (list) or bio_vec array.
*/
- fctx->iter = *fctx->pos;
+ if (fctx->reset_iter_fn)
+ fctx->reset_iter_fn(&fctx->iter);
+ else
+ fctx->iter = *fctx->pos;
for (i = 0; i < num_img_extents; i++) {
ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
img_extents[i].fe_off,
@@ -2396,7 +2407,10 @@ static int rbd_img_fill_request(struct rbd_img_request *img_req,
* or bio_vec array because when mapped, those bio_vecs can straddle
* stripe unit boundaries.
*/
- fctx->iter = *fctx->pos;
+ if (fctx->reset_iter_fn)
+ fctx->reset_iter_fn(&fctx->iter);
+ else
+ fctx->iter = *fctx->pos;
for (i = 0; i < num_img_extents; i++) {
ret = ceph_file_to_extents(&rbd_dev->layout,
img_extents[i].fe_off,
@@ -2420,7 +2434,10 @@ static int rbd_img_fill_request(struct rbd_img_request *img_req,
* Fill in each object request's private bio_vec array, splitting and
* rearranging the provided bio_vecs in stripe unit chunks as needed.
*/
- fctx->iter = *fctx->pos;
+ if (fctx->reset_iter_fn)
+ fctx->reset_iter_fn(&fctx->iter);
+ else
+ fctx->iter = *fctx->pos;
for (i = 0; i < num_img_extents; i++) {
ret = ceph_iterate_extents(&rbd_dev->layout,
img_extents[i].fe_off,
@@ -2578,6 +2595,188 @@ int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
}
EXPORT_SYMBOL(rbd_img_fill_from_bvecs);
+struct ceph_bvec_dual_iter {
+ struct ceph_bvec_iter cmp_iter;
+ /* cur_cmp_iter carried as fctx->iter */
+ struct ceph_bvec_iter write_iter;
+ struct ceph_bvec_iter cur_write_iter;
+ struct rbd_img_fill_ctx fctx;
+};
+
+static void set_dual_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+ struct rbd_obj_request *obj_req =
+ container_of(ex, struct rbd_obj_request, ex);
+ struct ceph_bvec_iter *cur_cmp_iter = arg;
+ struct rbd_img_fill_ctx *fctx =
+ container_of(arg, struct rbd_img_fill_ctx, iter);
+ struct ceph_bvec_dual_iter *dual =
+ container_of(fctx, struct ceph_bvec_dual_iter, fctx);
+
+ BUG_ON(obj_req->img_request->op_type != OBJ_OP_CMP_AND_WRITE);
+
+ pr_debug("set dual bvec pos: old bi_size: %d, bi_idx: %u, bi_bvec_done: %u, "
+ " new bi_size: %d, bi_idx: %u, bi_bvec_done: %u\n",
+ obj_req->bvec_pos.iter.bi_size,
+ /* current index into bvl_vec */
+ obj_req->bvec_pos.iter.bi_idx,
+ /* number of bytes completed in current bvec */
+ obj_req->bvec_pos.iter.bi_bvec_done,
+ cur_cmp_iter->iter.bi_size, cur_cmp_iter->iter.bi_idx,
+ cur_cmp_iter->iter.bi_bvec_done);
+
+ obj_req->cmp_bvec_iter = cur_cmp_iter->iter;
+ obj_req->bvec_pos = dual->cur_write_iter;
+
+ pr_debug("obj_req %p: shortening dual bvec pos from %u to %u\n",
+ obj_req, cur_cmp_iter->iter.bi_size, bytes);
+
+ BUG_ON(bytes > obj_req->cmp_bvec_iter.bi_size);
+ obj_req->cmp_bvec_iter.bi_size = bytes;
+ ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
+ ceph_bvec_iter_advance(cur_cmp_iter, bytes);
+ ceph_bvec_iter_advance(&dual->cur_write_iter, bytes);
+}
+
+static void count_dual_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+ struct rbd_obj_request *obj_req =
+ container_of(ex, struct rbd_obj_request, ex);
+ struct ceph_bvec_iter *cur_cmp_iter = arg;
+ struct rbd_img_fill_ctx *fctx =
+ container_of(arg, struct rbd_img_fill_ctx, iter);
+ struct ceph_bvec_dual_iter *dual =
+ container_of(fctx, struct ceph_bvec_dual_iter, fctx);
+
+ pr_debug("counting %u bytes from dual bvec for obj_req %p\n",
+ bytes, obj_req);
+
+ ceph_bvec_iter_advance_step(cur_cmp_iter, bytes, ({
+ obj_req->bvec_count++;
+ }));
+ ceph_bvec_iter_advance_step(&dual->cur_write_iter, bytes, ({
+ obj_req->bvec_count++;
+ }));
+}
+
+static void copy_dual_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+ struct rbd_obj_request *obj_req =
+ container_of(ex, struct rbd_obj_request, ex);
+ struct ceph_bvec_iter *cur_cmp_iter = arg;
+ struct rbd_img_fill_ctx *fctx =
+ container_of(arg, struct rbd_img_fill_ctx, iter);
+ struct ceph_bvec_dual_iter *dual =
+ container_of(fctx, struct ceph_bvec_dual_iter, fctx);
+
+ pr_debug("obj_req %p: copying %u bytes from cmp bvec %u [%u] (%u done)\n",
+ obj_req, bytes,
+ cur_cmp_iter->iter.bi_size,
+ cur_cmp_iter->iter.bi_idx,
+ cur_cmp_iter->iter.bi_bvec_done);
+
+ /* we might be placing the same bv for cmp and write */
+ ceph_bvec_iter_advance_step(cur_cmp_iter, bytes, ({
+ obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
+ obj_req->cmp_bvec_iter.bi_size += bv.bv_len;
+ }));
+
+ pr_debug("obj_req %p: copied cmp bvec to %u [%u] (%u done)\n",
+ obj_req,
+ obj_req->cmp_bvec_iter.bi_size,
+ obj_req->cmp_bvec_iter.bi_idx,
+ obj_req->cmp_bvec_iter.bi_bvec_done);
+
+ pr_debug("obj_req %p: copying %u bytes from write bvec %u [%u] (%u done)\n",
+ obj_req, bytes,
+ dual->cur_write_iter.iter.bi_size,
+ dual->cur_write_iter.iter.bi_idx,
+ dual->cur_write_iter.iter.bi_bvec_done);
+
+ /* init write-data iter to point to start of separate write section */
+ obj_req->bvec_pos.iter.bi_idx = obj_req->bvec_idx;
+ ceph_bvec_iter_advance_step(&dual->cur_write_iter, bytes, ({
+ obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
+ obj_req->bvec_pos.iter.bi_size += bv.bv_len;
+ }));
+
+ pr_debug("obj_req %p: copied write bvec to %u [%u] (%u done)\n",
+ obj_req,
+ obj_req->bvec_pos.iter.bi_size,
+ obj_req->bvec_pos.iter.bi_idx,
+ obj_req->bvec_pos.iter.bi_bvec_done);
+}
+
+static void reset_fill_ctx_dual_iter(union rbd_img_fill_iter *cur_cmp_iter)
+{
+ struct rbd_img_fill_ctx *fctx =
+ container_of(cur_cmp_iter, struct rbd_img_fill_ctx, iter);
+ struct ceph_bvec_dual_iter *dual =
+ container_of(fctx, struct ceph_bvec_dual_iter, fctx);
+
+ pr_debug("resetting cmp bvec from %u [%u] (%u done) to "
+ "%u [%u] (%u done)\n",
+ cur_cmp_iter->bvec_iter.iter.bi_size,
+ cur_cmp_iter->bvec_iter.iter.bi_idx,
+ cur_cmp_iter->bvec_iter.iter.bi_bvec_done,
+ dual->cmp_iter.iter.bi_size,
+ dual->cmp_iter.iter.bi_idx,
+ dual->cmp_iter.iter.bi_bvec_done);
+
+ pr_debug("resetting write bvec from %u [%u] (%u done) to "
+ "%u [%u] (%u done)\n",
+ dual->cur_write_iter.iter.bi_size,
+ dual->cur_write_iter.iter.bi_idx,
+ dual->cur_write_iter.iter.bi_bvec_done,
+ dual->write_iter.iter.bi_size,
+ dual->write_iter.iter.bi_idx,
+ dual->write_iter.iter.bi_bvec_done);
+
+ cur_cmp_iter->bvec_iter = dual->cmp_iter;
+ dual->cur_write_iter = dual->write_iter;
+}
+
+/*
+ * compare and write consumes 2 * extent_bytes. Once for the compare op and
+ * extent_bytes again for the write op. @bvecs must provide compare data
+ * at range [0, img_extent->fe_len) and write data at range
+ * [img_extent->fe_len, img_extent->fe_len * 2)
+ */
+int rbd_img_fill_cmp_and_write_from_bvecs(struct rbd_img_request *img_req,
+ struct ceph_file_extent *img_extent,
+ struct bio_vec *bvecs)
+{
+ struct ceph_bvec_dual_iter dual = {
+ .cmp_iter = {
+ .bvecs = bvecs,
+ .iter = { .bi_size = img_extent->fe_len },
+ },
+ .write_iter = {
+ .bvecs = bvecs,
+ .iter = { .bi_size = img_extent->fe_len * 2 },
+ },
+ .fctx = {
+ .pos_type = OBJ_REQUEST_BVECS,
+ /*
+ * pos is only used to reset fctx.iter, not needed with
+ * reset cb.
+ */
+ .pos = NULL,
+ .set_pos_fn = set_dual_bvec_pos,
+ .count_fn = count_dual_bvecs,
+ .copy_fn = copy_dual_bvecs,
+ .reset_iter_fn = reset_fill_ctx_dual_iter,
+ },
+ };
+ BUG_ON(img_req->op_type != OBJ_OP_CMP_AND_WRITE);
+
+ /* advance to write portion of bvecs */
+ ceph_bvec_iter_advance(&dual.write_iter, img_extent->fe_len);
+
+ return rbd_img_fill_request(img_req, img_extent, 1, &dual.fctx);
+}
+EXPORT_SYMBOL(rbd_img_fill_cmp_and_write_from_bvecs);
+
static void rbd_img_handle_request_work(struct work_struct *work)
{
struct rbd_img_request *img_req =
diff --git a/include/linux/ceph/librbd.h b/include/linux/ceph/librbd.h
index e3afaa0f608e..68d3af9a5d74 100644
--- a/include/linux/ceph/librbd.h
+++ b/include/linux/ceph/librbd.h
@@ -251,6 +251,9 @@ extern int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
struct ceph_file_extent *img_extents,
u32 num_img_extents,
struct bio_vec *bvecs);
+extern int rbd_img_fill_cmp_and_write_from_bvecs(struct rbd_img_request *img_req,
+ struct ceph_file_extent *img_extent,
+ struct bio_vec *bvecs);
extern void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
extern void rbd_img_request_put(struct rbd_img_request *img_request);
--
2.26.2