Blob Blame History Raw
From c679aad24f0599160f8f1c885c3ef1955d4a76a4 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Wed, 31 Aug 2016 14:42:49 +0200
Subject: [PATCH] rbd: truncate objects on cmpext short reads
References: bsc#988715
Patch-mainline: Not yet, SES clustered LIO/RBD

OSDs may respond to cmpext requests with -EINVAL if a portion of the
range is unwritten. Zero-filling is normally handled on the RBD client
side, but this is not possible for atomic compare-and-write requests.
Instead, fall back to truncating the object before reissuing the original
request.

Signed-off-by: David Disseldorp <ddiss@suse.de>
Reviewed-by: Jan Fajerski <jfajerski@suse.com>
[ddiss@suse.de: rebase for SLE15, use __rbd_osd_req_create() helper and
squash rbd-ensure-r_request-is-allocated-for-creatrunc.patch]
Acked-by: Luis Henriques <lhenriques@suse.com>
[luis: due to commit 26f887e0a3c4 ("libceph, rbd, ceph: move
 ceph_osdc_alloc_messages() calls"), call ceph_osdc_alloc_messages in
 rbd_img_obj_creatrunc_submit]
---
 drivers/block/rbd.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 150 insertions(+), 2 deletions(-)

--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -211,7 +211,7 @@ struct rbd_obj_request {
 	 * 0..(img_request->obj_request_count-1).
 	 */
 	union {
-		struct rbd_obj_request	*obj_request;	/* STAT op */
+		struct rbd_obj_request	*obj_request;	/* STAT or CREATE op */
 		struct {
 			struct rbd_img_request	*img_request;
 			u64			img_offset;
@@ -1590,13 +1590,39 @@ static void rbd_osd_write_callback(struc
 	obj_request_done_set(obj_request);
 }
 
+static int rbd_img_obj_creatrunc_submit(struct rbd_obj_request *obj_request);
+
 static void rbd_osd_cmpext_callback(struct rbd_obj_request *obj_request,
 				    struct ceph_osd_request *osd_req)
 {
 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
 		obj_request->result, obj_request->length);
 
-	/* on mismatch, result is -MAX_ERRNO - offset_of_mismatch */
+	rbd_assert(osd_req->r_ops[0].op == CEPH_OSD_OP_SETALLOCHINT);
+	rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_CMPEXT);
+	rbd_assert(osd_req->r_ops[2].op == CEPH_OSD_OP_WRITE);
+
+	if (!obj_request->result || (obj_request->result <= -MAX_ERRNO)) {
+		/* success, or mismatch (-MAX_ERRNO - offset_of_mismatch) */
+		obj_request->xferred = obj_request->length;
+	} else if ((obj_request->result == -EINVAL) &&
+		   (osd_req->r_ops[1].rval == -EINVAL))	{
+		int ret;
+		/*
+		 * cmpext failed to read the full compare range.  Truncate and
+		 * retry.  The truncate is issued as a separate request, so it
+		 * could theoretically race with a discard.
+		 */
+		ret = rbd_img_obj_creatrunc_submit(obj_request);
+		if (!ret) {
+			/* the truncate callback reissues this request */
+			return;
+		}
+	} else {
+		rbd_warn(NULL, "unexpected cmpext result: %d",
+			 obj_request->result);
+	}
+
 	obj_request->xferred = obj_request->length;
 	obj_request_done_set(obj_request);
 }
@@ -1636,6 +1662,15 @@ static void rbd_osd_call_callback(struct
 		obj_request_done_set(obj_request);
 }
 
+/*
+ * Just mark it done; rbd_img_obj_creatrunc_callback() handles the result.
+ */
+static void rbd_osd_creatrunc_callback(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p\n", __func__, obj_request);
+	obj_request_done_set(obj_request);
+}
+
 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
@@ -1700,6 +1735,11 @@ static void rbd_osd_req_callback(struct
 	case CEPH_OSD_OP_GETXATTR:
 		obj_request_done_set(obj_request);
 		break;
+	case CEPH_OSD_OP_CREATE:
+		rbd_assert(osd_req->r_num_ops == 2);
+		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_TRUNCATE);
+		rbd_osd_creatrunc_callback(obj_request);
+		break;
 	default:
 		rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
 			 obj_request->object_no, opcode);
@@ -2752,6 +2792,114 @@ fail_stat_request:
 	return ret;
 }
 
+/*
+ * Callback for the CREATE+TRUNCATE request set up by
+ * rbd_img_obj_creatrunc_submit().  On success the original request is
+ * resubmitted; if the truncate or the resubmission fails, the original
+ * request is completed with that error.
+ */
+static void
+rbd_img_obj_creatrunc_callback(struct rbd_obj_request *obj_request)
+{
+	struct rbd_obj_request *orig_request;
+	int result;
+
+	rbd_assert(!obj_request_img_data_test(obj_request));
+
+	/* Only the original request and the TRUNCATE result are needed. */
+	orig_request = obj_request->obj_request;
+	obj_request->obj_request = NULL;
+	rbd_assert(orig_request);	/* validate before first use */
+	rbd_assert(orig_request->img_request);
+	rbd_obj_request_put(orig_request);
+
+	result = obj_request->result;
+	obj_request->result = 0;
+
+	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
+		obj_request, orig_request, result,
+		obj_request->xferred, obj_request->length);
+	rbd_obj_request_put(obj_request);
+
+	if (result) {
+		orig_request->result = result;
+		goto out;
+	}
+
+	/* The object is truncated now; resubmit the original request. */
+	orig_request->result = rbd_img_obj_request_submit(orig_request);
+out:
+	if (orig_request->result)
+		rbd_obj_request_complete(orig_request);
+}
+
+/*
+ * Compare and write failed with -EINVAL, indicating an unwritten range
+ * for cmpext - truncate the object to its full size and then reissue
+ * the same request.  Returns 0 once the truncate is submitted, else an error.
+ * Like rbd_img_obj_exists_submit(), this function tracks the original request
+ * through to the callback via creatrunc_req->obj_request, which means that
+ * creatrunc_req->img_request users must be avoided.
+ */
+static int
+rbd_img_obj_creatrunc_submit(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	struct rbd_obj_request *creatrunc_req;
+	struct rbd_device *rbd_dev;
+	u64 object_size;
+	int ret;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	img_request = obj_request->img_request;
+	rbd_assert(img_request);
+	rbd_assert(obj_request->result == (s32)-EINVAL);
+	rbd_assert(obj_request_type_valid(obj_request->type));
+	rbd_assert(img_request_cmp_and_write_test(img_request));
+
+	creatrunc_req = rbd_obj_request_create(OBJ_REQUEST_NODATA);
+	if (!creatrunc_req)
+		return -ENOMEM;
+
+	/* Target the same object as the failed compare-and-write. */
+	creatrunc_req->object_no = obj_request->object_no;
+
+	rbd_dev = img_request->rbd_dev;
+	object_size = rbd_obj_bytes(&rbd_dev->header);
+
+	creatrunc_req->osd_req = __rbd_osd_req_create(rbd_dev,
+				    img_request->snapc, 2,
+				    CEPH_OSD_FLAG_WRITE, creatrunc_req);
+	if (!creatrunc_req->osd_req) {
+		ret = -ENOMEM;
+		goto fail_creatrunc_request;
+	}
+
+	creatrunc_req->callback = rbd_img_obj_creatrunc_callback;
+
+	osd_req_op_init(creatrunc_req->osd_req, 0, CEPH_OSD_OP_CREATE, 0);
+	osd_req_op_extent_init(creatrunc_req->osd_req, 1, CEPH_OSD_OP_TRUNCATE,
+				object_size, 0, 0, 0);
+
+	rbd_osd_req_format_write(creatrunc_req);
+
+	ret = ceph_osdc_alloc_messages(creatrunc_req->osd_req, GFP_NOIO);
+	if (ret)
+		goto fail_creatrunc_request;
+
+	/* Cannot fail from here on: take the reference the callback drops. */
+	rbd_obj_request_get(obj_request);
+	creatrunc_req->obj_request = obj_request;
+	rbd_obj_request_submit(creatrunc_req);
+
+	return 0;
+
+fail_creatrunc_request:
+	creatrunc_req->img_request = NULL;
+	rbd_obj_request_put(creatrunc_req);
+	return ret;
+}
+
 static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = obj_request->img_request;