Blob Blame History Raw
From 150c4ca485aa91c524f922ba22fa0596788a620a Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Wed, 29 Jul 2015 04:23:45 -0500
Subject: [PATCH] rbd: add support for COMPARE_AND_WRITE/CMPEXT
References: fate#318836
Patch-mainline: Not yet, SES2 clustered LIO/RBD

This patch adds support to rbd for SCSI COMPARE_AND_WRITE commands. Higher
levels like LIO will work with IMG_REQ_CMP_AND_WRITE requests, and rbd turns
each one into a single OSD request: an allocation hint, then CMPEXT and WRITE ops.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Acked-by: David Disseldorp <ddiss@suse.de>
[ddiss@suse.de: rbd_osd_req_create()/__rbd_osd_req_create flags rebase
 for SLE15. Squash in rbd-fix-and-simplify-rbd_osd_req_format_rw.patch]
Signed-off-by: Luis Henriques <lhenriques@suse.com>
[luis: rebased on top of a1fbb5e7bbb5 ("rbd: start enums at 1 instead of 0")]
[luis: due to commit 26f887e0a3c4 ("libceph, rbd, ceph: move
 ceph_osdc_alloc_messages() calls"), call ceph_osdc_alloc_messages() in
 rbd_img_cmp_and_write_request_fill()]
Acked-by: Luis Henriques <lhenriques@suse.com>
---
 drivers/block/rbd.c |  162 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 141 insertions(+), 21 deletions(-)

--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -219,6 +219,7 @@ enum obj_operation_type {
 	OBJ_OP_READ = 1,
 	OBJ_OP_WRITE,
 	OBJ_OP_DISCARD,
+	OBJ_OP_CMP_AND_WRITE,
 };
 
 enum obj_req_flags {
@@ -291,6 +292,7 @@ enum img_req_flags {
 	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
 	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
+	IMG_REQ_CMP_AND_WRITE,	/* normal = 0, compare and write request = 1 */
 };
 
 struct rbd_img_request {
@@ -298,10 +300,9 @@ struct rbd_img_request {
 	u64			offset;	/* starting image byte offset */
 	u64			length;	/* byte count from offset */
 	unsigned long		flags;
-	union {
-		u64			snap_id;	/* for reads */
-		struct ceph_snap_context *snapc;	/* for writes */
-	};
+
+	u64			snap_id;	/* for reads */
+	struct ceph_snap_context *snapc;	/* for writes */
 
 	struct request		*rq;		/* block request */
 	struct rbd_obj_request	*obj_request;	/* obj req initiator */
@@ -858,6 +859,8 @@ static int obj_num_ops(enum obj_operatio
 	switch (op_type) {
 	case OBJ_OP_WRITE:
 		return 2;
+	case OBJ_OP_CMP_AND_WRITE:
+		return 3;
 	default:
 		return 1;
 	}
@@ -872,6 +875,8 @@ static char* obj_op_name(enum obj_operat
 		return "write";
 	case OBJ_OP_DISCARD:
 		return "discard";
+	case OBJ_OP_CMP_AND_WRITE:
+		return "compare-and-write";
 	default:
 		return "???";
 	}
@@ -1599,10 +1604,23 @@ static bool img_request_layered_test(str
 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
 }
 
+static void img_request_cmp_and_write_set(struct rbd_img_request *img_request)
+{
+	set_bit(IMG_REQ_CMP_AND_WRITE, &img_request->flags);
+	smp_mb();
+}
+
+static bool img_request_cmp_and_write_test(struct rbd_img_request *img_request)
+{
+	smp_mb();
+	return test_bit(IMG_REQ_CMP_AND_WRITE, &img_request->flags) != 0;
+}
+
 static bool img_request_is_write_type_test(struct rbd_img_request *img_request)
 {
 	return img_request_write_test(img_request) ||
-	       img_request_discard_test(img_request);
+	       img_request_discard_test(img_request) ||
+	       img_request_cmp_and_write_test(img_request);
 }
 
 static enum obj_operation_type
@@ -1612,6 +1630,8 @@ rbd_img_request_op_type(struct rbd_img_r
 		return OBJ_OP_WRITE;
 	else if (img_request_discard_test(img_request))
 		return OBJ_OP_DISCARD;
+	else if (img_request_cmp_and_write_test(img_request))
+		return OBJ_OP_CMP_AND_WRITE;
 	else
 		return OBJ_OP_READ;
 }
@@ -1715,6 +1735,17 @@ static void rbd_osd_write_callback(struc
 	obj_request_done_set(obj_request);
 }
 
+static void rbd_osd_cmpext_callback(struct rbd_obj_request *obj_request,
+				    struct ceph_osd_request *osd_req)
+{
+	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+		obj_request->result, obj_request->length);
+
+	/* on mismatch, result is -MAX_ERRNO - offset_of_mismatch */
+	obj_request->xferred = obj_request->length;
+	obj_request_done_set(obj_request);
+}
+
 static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
@@ -1781,13 +1812,21 @@ static void rbd_osd_req_callback(struct
 		rbd_osd_read_callback(obj_request);
 		break;
 	case CEPH_OSD_OP_SETALLOCHINT:
-		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
-			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
-		/* fall through */
+		if (osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
+		    osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL)
+			rbd_osd_write_callback(obj_request);
+		else if (osd_req->r_ops[1].op == CEPH_OSD_OP_CMPEXT)
+			rbd_osd_cmpext_callback(obj_request, osd_req);
+		else
+			rbd_assert(0);
+		break;
 	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
 		rbd_osd_write_callback(obj_request);
 		break;
+	case CEPH_OSD_OP_CMPEXT:
+		rbd_osd_cmpext_callback(obj_request, osd_req);
+		break;
 	case CEPH_OSD_OP_STAT:
 		rbd_osd_stat_callback(obj_request);
 		break;
@@ -1825,6 +1864,12 @@ static void rbd_osd_req_format_write(str
 	osd_req->r_data_offset = obj_request->offset;
 }
 
+static void rbd_osd_req_format_rw(struct rbd_obj_request *obj_request)
+{
+	rbd_osd_req_format_read(obj_request);
+	rbd_osd_req_format_write(obj_request);
+}
+
 static struct ceph_osd_request *
 __rbd_osd_req_create(struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
@@ -1861,6 +1906,7 @@ err_req:
  * A write request has either one (watch) or two (hint+write) osd ops.
  * (All rbd data writes are prefixed with an allocation hint op, but
  * technically osd watch is a write request, hence this distinction.)
+ * A compare-and-write request has three (hint+cmpext+write).
  */
 static struct ceph_osd_request *rbd_osd_req_create(
 					struct rbd_device *rbd_dev,
@@ -1871,12 +1917,15 @@ static struct ceph_osd_request *rbd_osd_
 	struct ceph_snap_context *snapc = NULL;
 
 	if (obj_request_img_data_test(obj_request) &&
-		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
+		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE ||
+		 op_type == OBJ_OP_CMP_AND_WRITE)) {
 		struct rbd_img_request *img_request = obj_request->img_request;
 		if (op_type == OBJ_OP_WRITE) {
 			rbd_assert(img_request_write_test(img_request));
-		} else {
+		} else if (op_type == OBJ_OP_DISCARD) {
 			rbd_assert(img_request_discard_test(img_request));
+		} else if (op_type == OBJ_OP_CMP_AND_WRITE) {
+			rbd_assert(img_request_cmp_and_write_test(img_request));
 		}
 		snapc = img_request->snapc;
 	}
@@ -1884,7 +1933,8 @@ static struct ceph_osd_request *rbd_osd_
 	rbd_assert(num_ops == 1 || obj_num_ops(op_type) == num_ops);
 
 	return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
-	    (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
+	    (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD ||
+	     op_type == OBJ_OP_CMP_AND_WRITE) ?
 	    CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
 }
 
@@ -2063,6 +2113,10 @@ static struct rbd_img_request *rbd_img_r
 	} else if (op_type == OBJ_OP_WRITE) {
 		img_request_write_set(img_request);
 		img_request->snapc = snapc;
+	} else if (op_type == OBJ_OP_CMP_AND_WRITE) {
+		img_request_cmp_and_write_set(img_request);
+		img_request->snapc = snapc;
+		img_request->snap_id = rbd_dev->spec->snap_id;
 	} else {
 		img_request->snap_id = rbd_dev->spec->snap_id;
 	}
@@ -2156,18 +2210,11 @@ static bool rbd_img_obj_end_request(stru
 	result = obj_request->result;
 	if (result) {
 		struct rbd_device *rbd_dev = img_request->rbd_dev;
-		enum obj_operation_type op_type;
-
-		if (img_request_discard_test(img_request))
-			op_type = OBJ_OP_DISCARD;
-		else if (img_request_write_test(img_request))
-			op_type = OBJ_OP_WRITE;
-		else
-			op_type = OBJ_OP_READ;
 
 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
-			obj_op_name(op_type), obj_request->length,
-			obj_request->img_offset, obj_request->offset);
+			obj_op_name(rbd_img_request_op_type(img_request)),
+			obj_request->length, obj_request->img_offset,
+			obj_request->offset);
 		rbd_warn(rbd_dev, "  result %d xferred %x",
 			result, xferred);
 		if (!img_request->result)
@@ -2426,6 +2473,79 @@ out_unwind:
 	return -ENOMEM;
 }
 
+int rbd_img_cmp_and_write_request_fill(struct rbd_img_request *img_request,
+				       struct scatterlist *cmp_sgl,
+				       u64 cmp_length,
+				       struct scatterlist *write_sgl,
+				       u64 write_length)
+{
+	struct rbd_device *rbd_dev = img_request->rbd_dev;
+	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_request *osd_req;
+	u64 object_no;
+	int num_ops = 0;
+	u64 img_offset;
+	u64 offset;
+
+	img_offset = img_request->offset;
+	offset = rbd_segment_offset(rbd_dev, img_offset);
+
+	/*
+	 * LIO currently only supports 1 sector reqs and we assume the req
+	 * will not span segments.
+	 */
+	if (rbd_segment_length(rbd_dev, offset, cmp_length) != cmp_length)
+		return -EOPNOTSUPP;
+
+	object_no = img_offset >> rbd_dev->header.obj_order;
+
+	obj_request = rbd_obj_request_create(OBJ_REQUEST_SG);
+	if (!obj_request)
+		return -ENOMEM;
+
+	obj_request->object_no = object_no;
+	obj_request->offset = offset;
+	obj_request->length = cmp_length;
+
+	rbd_img_obj_request_add(img_request, obj_request);
+
+	osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_CMP_AND_WRITE, 3,
+				     obj_request);
+	if (!osd_req)
+		goto del_obj_req;
+
+	obj_request->osd_req = osd_req;
+	obj_request->callback = rbd_img_obj_callback;
+	obj_request->img_offset = img_offset;
+
+	osd_req_op_alloc_hint_init(osd_req, num_ops, object_size, object_size);
+
+	num_ops++;
+	osd_req_op_extent_init(osd_req, num_ops, CEPH_OSD_OP_CMPEXT, offset,
+			       cmp_length, 0, 0);
+	osd_req_op_extent_osd_data_sg(osd_req, num_ops, cmp_sgl, 0, cmp_length);
+
+	num_ops++;
+	osd_req_op_extent_init(osd_req, num_ops, CEPH_OSD_OP_WRITE, offset,
+			       write_length, 0, 0);
+	osd_req_op_extent_osd_data_sg(osd_req, num_ops, write_sgl, 0,
+				      write_length);
+
+	rbd_osd_req_format_rw(obj_request);
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto del_obj_req;
+
+	rbd_img_request_get(img_request);
+	return 0;
+
+del_obj_req:
+	rbd_img_obj_request_del(img_request, obj_request);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(rbd_img_cmp_and_write_request_fill);
+
 static void
 rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
 {