From c679aad24f0599160f8f1c885c3ef1955d4a76a4 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Wed, 31 Aug 2016 14:42:49 +0200
Subject: [PATCH] rbd: truncate objects on cmpext short reads
References: bsc#988715
Patch-mainline: Not yet, SES clustered LIO/RBD
OSDs may respond to cmpext requests with -EINVAL if a portion of the
range is unwritten. Zero-filling is normally handled on the RBD client
side, but this is not possible for atomic compare-and-write requests.
Instead fallback to truncating the object before reissuing the original
request.
Signed-off-by: David Disseldorp <ddiss@suse.de>
Reviewed-by: Jan Fajerski <jfajerski@suse.com>
[ddiss@suse.de: rebase for SLE15, use __rbd_osd_req_create() helper and
squash rbd-ensure-r_request-is-allocated-for-creatrunc.patch]
Acked-by: Luis Henriques <lhenriques@suse.com>
[luis: due to commit 26f887e0a3c4 ("libceph, rbd, ceph: move
ceph_osdc_alloc_messages() calls"), call ceph_osdc_alloc_messages in
rbd_img_obj_creatrunc_submit]
---
drivers/block/rbd.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 150 insertions(+), 2 deletions(-)
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -211,7 +211,7 @@ struct rbd_obj_request {
* 0..(img_request->obj_request_count-1).
*/
union {
- struct rbd_obj_request *obj_request; /* STAT op */
+ struct rbd_obj_request *obj_request; /* STAT or CREATE op */
struct {
struct rbd_img_request *img_request;
u64 img_offset;
@@ -1590,13 +1590,39 @@ static void rbd_osd_write_callback(struc
obj_request_done_set(obj_request);
}
+static int rbd_img_obj_creatrunc_submit(struct rbd_obj_request *obj_request);
+
static void rbd_osd_cmpext_callback(struct rbd_obj_request *obj_request,
struct ceph_osd_request *osd_req)
{
dout("%s: obj %p result %d %llu\n", __func__, obj_request,
obj_request->result, obj_request->length);
- /* on mismatch, result is -MAX_ERRNO - offset_of_mismatch */
+ rbd_assert(osd_req->r_ops[0].op == CEPH_OSD_OP_SETALLOCHINT);
+ rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_CMPEXT);
+ rbd_assert(osd_req->r_ops[2].op == CEPH_OSD_OP_WRITE);
+
+ if (!obj_request->result || (obj_request->result <= -MAX_ERRNO)) {
+ /* on mismatch, result is -MAX_ERRNO - offset_of_mismatch */
+ obj_request->xferred = obj_request->length;
+ } else if ((obj_request->result == -EINVAL) &&
+ (osd_req->r_ops[1].rval == -EINVAL)) {
+ int ret;
+ /*
+ * cmpext failed to read the full compare range. truncate and
+ * retry. Truncate is issued as a separate request, so it could
+ * theoretically race with a discard.
+ */
+ ret = rbd_img_obj_creatrunc_submit(obj_request);
+ if (!ret) {
+ /* orig request will be reissued following truncate */
+ return;
+ }
+ } else {
+ rbd_warn(NULL, "unexpected cmpext result: %d",
+ obj_request->result);
+ }
+
obj_request->xferred = obj_request->length;
obj_request_done_set(obj_request);
}
@@ -1636,6 +1662,15 @@ static void rbd_osd_call_callback(struct
obj_request_done_set(obj_request);
}
+/*
+ * Nothing to do here, rbd_img_obj_creatrunc_callback() handles everything
+ */
+static void rbd_osd_creatrunc_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p\n", __func__, obj_request);
+ obj_request_done_set(obj_request);
+}
+
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
struct rbd_obj_request *obj_request = osd_req->r_priv;
@@ -1700,6 +1735,11 @@ static void rbd_osd_req_callback(struct
case CEPH_OSD_OP_GETXATTR:
obj_request_done_set(obj_request);
break;
+ case CEPH_OSD_OP_CREATE:
+ rbd_assert(osd_req->r_num_ops == 2);
+ rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_TRUNCATE);
+ rbd_osd_creatrunc_callback(obj_request);
+ break;
default:
rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
obj_request->object_no, opcode);
@@ -2752,6 +2792,114 @@ fail_stat_request:
return ret;
}
+static void
+rbd_img_obj_creatrunc_callback(struct rbd_obj_request *obj_request)
+{
+ struct rbd_obj_request *orig_request;
+ int result;
+
+ rbd_assert(!obj_request_img_data_test(obj_request));
+
+ /*
+ * All we need from the object request is the original
+ * request and the result of the TRUNCATE op.
+ */
+ orig_request = obj_request->obj_request;
+ obj_request->obj_request = NULL;
+ rbd_obj_request_put(orig_request);
+ rbd_assert(orig_request);
+ rbd_assert(orig_request->img_request);
+
+ result = obj_request->result;
+ obj_request->result = 0;
+
+ dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
+ obj_request, orig_request, result,
+ obj_request->xferred, obj_request->length);
+ rbd_obj_request_put(obj_request);
+
+ if (result) {
+ orig_request->result = result;
+ goto out;
+ }
+
+ /*
+ * Resubmit the original request now that we have truncated
+ * the target object.
+ */
+ orig_request->result = rbd_img_obj_request_submit(orig_request);
+out:
+ if (orig_request->result)
+ rbd_obj_request_complete(orig_request);
+}
+
+/*
+ * compare and write failed with -EINVAL, indicating an unwritten range
+ * for cmpext - truncate the object to its full size and then reissue
+ * the same request.
+ * Like rbd_img_obj_exists_submit(), this function tracks the original request
+ * through to the callback via creatrunc_req->osd_req, which means that
+ * creatrunc_req->img_request users must be avoided.
+ */
+static int
+rbd_img_obj_creatrunc_submit(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ struct rbd_obj_request *creatrunc_req;
+ struct ceph_osd_request *osd_req;
+ struct rbd_device *rbd_dev;
+ u64 object_size;
+ int ret;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+ img_request = obj_request->img_request;
+ rbd_assert(img_request);
+ rbd_assert(obj_request->result == (s32)-EINVAL);
+ rbd_assert(obj_request_type_valid(obj_request->type));
+ rbd_assert(img_request_cmp_and_write_test(img_request));
+
+ creatrunc_req = rbd_obj_request_create(OBJ_REQUEST_NODATA);
+ if (!creatrunc_req)
+ return -ENOMEM;
+
+ obj_request->object_no = obj_request->object_no;
+
+ rbd_dev = img_request->rbd_dev;
+ object_size = rbd_obj_bytes(&rbd_dev->header);
+
+ osd_req = __rbd_osd_req_create(img_request->rbd_dev,
+ img_request->snapc, 2,
+ CEPH_OSD_FLAG_WRITE, creatrunc_req);
+ if (!osd_req) {
+ ret = -ENOMEM;
+ goto fail_creatrunc_request;
+ }
+
+ creatrunc_req->osd_req = osd_req;
+ rbd_obj_request_get(obj_request);
+ creatrunc_req->obj_request = obj_request;
+ creatrunc_req->callback = rbd_img_obj_creatrunc_callback;
+
+ osd_req_op_init(creatrunc_req->osd_req, 0, CEPH_OSD_OP_CREATE, 0);
+ osd_req_op_extent_init(creatrunc_req->osd_req, 1, CEPH_OSD_OP_TRUNCATE,
+ object_size, 0, 0, 0);
+
+ rbd_osd_req_format_write(creatrunc_req);
+
+ ret = ceph_osdc_alloc_messages(creatrunc_req->osd_req, GFP_NOIO);
+ if (ret)
+ goto fail_creatrunc_request;
+
+ rbd_obj_request_submit(creatrunc_req);
+
+ return 0;
+
+fail_creatrunc_request:
+ creatrunc_req->img_request = NULL;
+ rbd_obj_request_put(creatrunc_req);
+ return ret;
+}
+
static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
{
struct rbd_img_request *img_request = obj_request->img_request;