Blob Blame History Raw
From 2eb6e6ebeea46ccd2595a076ca093b637d075e04 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Wed, 29 Jul 2015 04:23:45 -0500
Subject: [PATCH] rbd: add support for COMPARE_AND_WRITE/CMPEXT
References: fate#318836, bsc#1177090
Patch-mainline: Not yet, SES clustered LIO/RBD

This patch adds support to rbd for SCSI COMPARE_AND_WRITE commands. Higher
levels like LIO will work with OBJ_OP_CMP_AND_WRITE image requests, but
rbd breaks each one up into CMPEXT and WRITE (or WRITEFULL) Ceph OSD ops.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Acked-by: David Disseldorp <ddiss@suse.de>
[ddiss@suse.de: rbd_osd_req_create()/__rbd_osd_req_create flags rebase
 for SLE15. Squash in rbd-fix-and-simplify-rbd_osd_req_format_rw.patch]
Signed-off-by: Luis Henriques <lhenriques@suse.com>
[luis: rebased on top of a1fbb5e7bbb5 ("rbd: start enums at 1 instead of 0")]
[luis: due to commit 26f887e0a3c4 ("libceph, rbd, ceph: move
 ceph_osdc_alloc_messages() calls"), call ceph_osdc_alloc_messages in
 rbd_img_cmp_and_write_request_fill ]
Acked-by: Luis Henriques <lhenriques@suse.com>
[ddiss: rebase on bcbab1db6c95 ("rbd: introduce obj_req->osd_reqs list")
        94e857718810 ("libceph: rename r_unsafe_item to r_private_item")
        and 0192ce2ee68b ("rbd: introduce image request state machine")]
Signed-off-by: David Disseldorp <ddiss@suse.de>
[ddiss: fill op data from separate cmp/write bvec iters and silence
        miscompare error messages]
Reviewed-by: Luis Henriques <lhenriques@suse.com>
---
 drivers/block/rbd.c         | 100 ++++++++++++++++++++++++++++++++++--
 include/linux/ceph/librbd.h |   1 +
 2 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e447ab95aed9..f82fcf36c27f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -680,6 +680,8 @@ static char* obj_op_name(enum obj_operation_type op_type)
 		return "discard";
 	case OBJ_OP_ZEROOUT:
 		return "zeroout";
+	case OBJ_OP_CMP_AND_WRITE:
+		return "compare-and-write";
 	default:
 		return "???";
 	}
@@ -1207,6 +1209,7 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req)
 	case OBJ_OP_WRITE:
 	case OBJ_OP_DISCARD:
 	case OBJ_OP_ZEROOUT:
+	case OBJ_OP_CMP_AND_WRITE:
 		return true;
 	default:
 		BUG();
@@ -2213,6 +2216,69 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
 	return 0;
 }
 
+static void __rbd_osd_setup_cmp_and_write_ops(struct ceph_osd_request *osd_req,
+				      int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	/* compare and write data are distinct ranges of one bvec array */
+	struct ceph_bvec_iter cmp_bvec_pos = {
+		.bvecs = obj_req->bvec_pos.bvecs,
+		.iter  = obj_req->cmp_bvec_iter,
+	};
+	struct ceph_bvec_iter *write_bvec_pos = &obj_req->bvec_pos;
+	u16 opcode; /* WRITE or WRITEFULL, chosen below */
+
+	BUG_ON(obj_req->img_request->data_type != OBJ_REQUEST_BVECS
+		&& obj_req->img_request->data_type != OBJ_REQUEST_OWN_BVECS);
+
+	if (!use_object_map(rbd_dev) ||
+	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
+		osd_req_op_alloc_hint_init(osd_req, which++,
+					   rbd_dev->layout.object_size,
+					   rbd_dev->layout.object_size,
+					   rbd_dev->opts->alloc_hint_flags);
+	}
+
+	osd_req_op_extent_init(osd_req, which, CEPH_OSD_OP_CMPEXT,
+			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
+	/*
+	 * rbd_osd_setup_data() is not used here: the compare op and the write
+	 * op each take their data from a separate bvec iter.
+	 */
+	osd_req_op_extent_osd_data_bvec_pos(osd_req, which, &cmp_bvec_pos);
+	which++; /* write op goes in the next slot */
+
+	if (rbd_obj_is_entire(obj_req))
+		opcode = CEPH_OSD_OP_WRITEFULL;
+	else
+		opcode = CEPH_OSD_OP_WRITE;
+
+	osd_req_op_extent_init(osd_req, which, opcode,
+			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
+	osd_req_op_extent_osd_data_bvec_pos(osd_req, which, write_bvec_pos);
+}
+
+static int rbd_obj_init_cmp_and_write(struct rbd_obj_request *obj_req)
+{
+	int ret;
+
+	/* reverse map the entire object onto the parent */
+	ret = rbd_obj_calc_img_extents(obj_req, true);
+	if (ret)
+		return ret; /* flags and write_state are left untouched */
+
+	if (rbd_obj_copyup_enabled(obj_req))
+		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
+
+	/*
+	 * FIXME: copyup should *always* occur for clones, regardless of
+	 * the I/O size; nothing here ensures that yet.
+	 */
+	obj_req->write_state = RBD_OBJ_WRITE_START;
+	return 0;
+}
+
 static int count_write_ops(struct rbd_obj_request *obj_req)
 {
 	struct rbd_img_request *img_req = obj_req->img_request;
@@ -2232,6 +2297,12 @@ static int count_write_ops(struct rbd_obj_request *obj_req)
 			return 2; /* create + truncate */
 
 		return 1; /* delete/truncate/zero */
+	case OBJ_OP_CMP_AND_WRITE:
+		if (!use_object_map(img_req->rbd_dev) ||
+		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
+			return 3; /* setallochint + cmpext + write/writefull */
+
+		return 2; /* cmpext + write/writefull */
 	default:
 		BUG();
 	}
@@ -2252,6 +2323,9 @@ static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
 	case OBJ_OP_ZEROOUT:
 		__rbd_osd_setup_zeroout_ops(osd_req, which);
 		break;
+	case OBJ_OP_CMP_AND_WRITE:
+		__rbd_osd_setup_cmp_and_write_ops(osd_req, which);
+		break;
 	default:
 		BUG();
 	}
@@ -2281,6 +2355,9 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
 		case OBJ_OP_ZEROOUT:
 			ret = rbd_obj_init_zeroout(obj_req);
 			break;
+		case OBJ_OP_CMP_AND_WRITE:
+			ret = rbd_obj_init_cmp_and_write(obj_req);
+			break;
 		default:
 			BUG();
 		}
@@ -3434,9 +3511,24 @@ static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
 
 	if (done && *result) {
 		rbd_assert(*result < 0);
-		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
-			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
-			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
+		if (img_req->op_type == OBJ_OP_CMP_AND_WRITE &&
+		    *result <= -MAX_ERRNO) {
+			/*
+			 * don't warn on miscompare. cmpext returns:
+			 * (-MAX_ERRNO - offset_of_miscompare)
+			 */
+			pr_debug("%s at objno %llu %llu~%llu: miscompare at %d",
+				 obj_op_name(img_req->op_type),
+				 obj_req->ex.oe_objno, obj_req->ex.oe_off,
+				 obj_req->ex.oe_len,
+				 (*result + MAX_ERRNO) * -1);
+		} else {
+			rbd_warn(rbd_dev,
+				 "%s at objno %llu %llu~%llu result %d",
+				 obj_op_name(img_req->op_type),
+				 obj_req->ex.oe_objno, obj_req->ex.oe_off,
+				 obj_req->ex.oe_len, *result);
+		}
 	}
 	return done;
 }
@@ -3614,7 +3706,7 @@ static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
 		mutex_unlock(&img_req->state_mutex);
 	}
 
-	if (done && *result) {
+	if (done && *result && img_req->op_type != OBJ_OP_CMP_AND_WRITE) {
 		rbd_assert(*result < 0);
 		rbd_warn(rbd_dev, "%s%s result %d",
 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
diff --git a/include/linux/ceph/librbd.h b/include/linux/ceph/librbd.h
index 68d3af9a5d74..7d65cc064c50 100644
--- a/include/linux/ceph/librbd.h
+++ b/include/linux/ceph/librbd.h
@@ -91,6 +91,7 @@ enum obj_operation_type {
 	OBJ_OP_WRITE,
 	OBJ_OP_DISCARD,
 	OBJ_OP_ZEROOUT,
+	OBJ_OP_CMP_AND_WRITE,
 };
 
 enum rbd_img_state {
-- 
2.26.2