From 150c4ca485aa91c524f922ba22fa0596788a620a Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Wed, 29 Jul 2015 04:23:45 -0500
Subject: [PATCH] rbd: add support for COMPARE_AND_WRITE/CMPEXT
References: fate#318836
Patch-mainline: Not yet, SES2 clustered LIO/RBD
This patch adds support to rbd for SCSI COMPARE_AND_WRITE commands. Higher
levels like LIO will work with IMG_REQ_CMP_AND_WRITE requests, and rbd
translates each one into a single Ceph OSD request carrying CMPEXT and
WRITE ops.
Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Acked-by: David Disseldorp <ddiss@suse.de>
[ddiss@suse.de: rbd_osd_req_create()/__rbd_osd_req_create() flags rebase
for SLE15. Squash in rbd-fix-and-simplify-rbd_osd_req_format_rw.patch]
Signed-off-by: Luis Henriques <lhenriques@suse.com>
[luis: rebased on top of a1fbb5e7bbb5 ("rbd: start enums at 1 instead of 0")]
[luis: due to commit 26f887e0a3c4 ("libceph, rbd, ceph: move
ceph_osdc_alloc_messages() calls"), call ceph_osdc_alloc_messages() in
rbd_img_cmp_and_write_request_fill()]
Acked-by: Luis Henriques <lhenriques@suse.com>
---
drivers/block/rbd.c | 162 +++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 141 insertions(+), 21 deletions(-)
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -219,6 +219,7 @@ enum obj_operation_type {
OBJ_OP_READ = 1,
OBJ_OP_WRITE,
OBJ_OP_DISCARD,
+ OBJ_OP_CMP_AND_WRITE,
};
enum obj_req_flags {
@@ -291,6 +292,7 @@ enum img_req_flags {
IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */
+ IMG_REQ_CMP_AND_WRITE, /* normal = 0, compare and write request = 1 */
};
struct rbd_img_request {
@@ -298,10 +300,9 @@ struct rbd_img_request {
u64 offset; /* starting image byte offset */
u64 length; /* byte count from offset */
unsigned long flags;
- union {
- u64 snap_id; /* for reads */
- struct ceph_snap_context *snapc; /* for writes */
- };
+
+ u64 snap_id; /* for reads */
+ struct ceph_snap_context *snapc; /* for writes */
struct request *rq; /* block request */
struct rbd_obj_request *obj_request; /* obj req initiator */
@@ -858,6 +859,8 @@ static int obj_num_ops(enum obj_operatio
switch (op_type) {
case OBJ_OP_WRITE:
return 2;
+ case OBJ_OP_CMP_AND_WRITE:
+ return 3;
default:
return 1;
}
@@ -872,6 +875,8 @@ static char* obj_op_name(enum obj_operat
return "write";
case OBJ_OP_DISCARD:
return "discard";
+ case OBJ_OP_CMP_AND_WRITE:
+ return "compare-and-write";
default:
return "???";
}
@@ -1599,10 +1604,23 @@ static bool img_request_layered_test(str
return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
+static void img_request_cmp_and_write_set(struct rbd_img_request *img_request)
+{
+ set_bit(IMG_REQ_CMP_AND_WRITE, &img_request->flags);
+ smp_mb();
+}
+
+static bool img_request_cmp_and_write_test(struct rbd_img_request *img_request)
+{
+ smp_mb();
+ return test_bit(IMG_REQ_CMP_AND_WRITE, &img_request->flags) != 0;
+}
+
static bool img_request_is_write_type_test(struct rbd_img_request *img_request)
{
return img_request_write_test(img_request) ||
- img_request_discard_test(img_request);
+ img_request_discard_test(img_request) ||
+ img_request_cmp_and_write_test(img_request);
}
static enum obj_operation_type
@@ -1612,6 +1630,8 @@ rbd_img_request_op_type(struct rbd_img_r
return OBJ_OP_WRITE;
else if (img_request_discard_test(img_request))
return OBJ_OP_DISCARD;
+ else if (img_request_cmp_and_write_test(img_request))
+ return OBJ_OP_CMP_AND_WRITE;
else
return OBJ_OP_READ;
}
@@ -1715,6 +1735,17 @@ static void rbd_osd_write_callback(struc
obj_request_done_set(obj_request);
}
+static void rbd_osd_cmpext_callback(struct rbd_obj_request *obj_request,
+ struct ceph_osd_request *osd_req)
+{
+ dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+ obj_request->result, obj_request->length);
+
+ /* on mismatch, result is -MAX_ERRNO - offset_of_mismatch */
+ obj_request->xferred = obj_request->length;
+ obj_request_done_set(obj_request);
+}
+
static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p result %d %llu\n", __func__, obj_request,
@@ -1781,13 +1812,21 @@ static void rbd_osd_req_callback(struct
rbd_osd_read_callback(obj_request);
break;
case CEPH_OSD_OP_SETALLOCHINT:
- rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
- osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
- /* fall through */
+ if (osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
+ osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL)
+ rbd_osd_write_callback(obj_request);
+ else if (osd_req->r_ops[1].op == CEPH_OSD_OP_CMPEXT)
+ rbd_osd_cmpext_callback(obj_request, osd_req);
+ else
+ rbd_assert(0);
+ break;
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
rbd_osd_write_callback(obj_request);
break;
+ case CEPH_OSD_OP_CMPEXT:
+ rbd_osd_cmpext_callback(obj_request, osd_req);
+ break;
case CEPH_OSD_OP_STAT:
rbd_osd_stat_callback(obj_request);
break;
@@ -1825,6 +1864,12 @@ static void rbd_osd_req_format_write(str
osd_req->r_data_offset = obj_request->offset;
}
+static void rbd_osd_req_format_rw(struct rbd_obj_request *obj_request)
+{
+ rbd_osd_req_format_read(obj_request);
+ rbd_osd_req_format_write(obj_request);
+}
+
static struct ceph_osd_request *
__rbd_osd_req_create(struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc,
@@ -1861,6 +1906,7 @@ err_req:
* A write request has either one (watch) or two (hint+write) osd ops.
* (All rbd data writes are prefixed with an allocation hint op, but
* technically osd watch is a write request, hence this distinction.)
+ * An extent compare-and-write has three (hint+cmp+write).
*/
static struct ceph_osd_request *rbd_osd_req_create(
struct rbd_device *rbd_dev,
@@ -1871,12 +1917,15 @@ static struct ceph_osd_request *rbd_osd_
struct ceph_snap_context *snapc = NULL;
if (obj_request_img_data_test(obj_request) &&
- (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
+ (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE ||
+ op_type == OBJ_OP_CMP_AND_WRITE)) {
struct rbd_img_request *img_request = obj_request->img_request;
if (op_type == OBJ_OP_WRITE) {
rbd_assert(img_request_write_test(img_request));
- } else {
+ } else if (op_type == OBJ_OP_DISCARD) {
rbd_assert(img_request_discard_test(img_request));
+ } else if (op_type == OBJ_OP_CMP_AND_WRITE) {
+ rbd_assert(img_request_cmp_and_write_test(img_request));
}
snapc = img_request->snapc;
}
@@ -1884,7 +1933,8 @@ static struct ceph_osd_request *rbd_osd_
rbd_assert(num_ops == 1 || obj_num_ops(op_type) == num_ops);
return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
- (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
+ (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD ||
+ op_type == OBJ_OP_CMP_AND_WRITE) ?
CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
}
@@ -2063,6 +2113,10 @@ static struct rbd_img_request *rbd_img_r
} else if (op_type == OBJ_OP_WRITE) {
img_request_write_set(img_request);
img_request->snapc = snapc;
+ } else if (op_type == OBJ_OP_CMP_AND_WRITE) {
+ img_request_cmp_and_write_set(img_request);
+ img_request->snapc = snapc;
+ img_request->snap_id = rbd_dev->spec->snap_id;
} else {
img_request->snap_id = rbd_dev->spec->snap_id;
}
@@ -2156,18 +2210,11 @@ static bool rbd_img_obj_end_request(stru
result = obj_request->result;
if (result) {
struct rbd_device *rbd_dev = img_request->rbd_dev;
- enum obj_operation_type op_type;
-
- if (img_request_discard_test(img_request))
- op_type = OBJ_OP_DISCARD;
- else if (img_request_write_test(img_request))
- op_type = OBJ_OP_WRITE;
- else
- op_type = OBJ_OP_READ;
rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
- obj_op_name(op_type), obj_request->length,
- obj_request->img_offset, obj_request->offset);
+ obj_op_name(rbd_img_request_op_type(img_request)),
+ obj_request->length, obj_request->img_offset,
+ obj_request->offset);
rbd_warn(rbd_dev, " result %d xferred %x",
result, xferred);
if (!img_request->result)
@@ -2426,6 +2473,79 @@ out_unwind:
return -ENOMEM;
}
+int rbd_img_cmp_and_write_request_fill(struct rbd_img_request *img_request,
+ struct scatterlist *cmp_sgl,
+ u64 cmp_length,
+ struct scatterlist *write_sgl,
+ u64 write_length)
+{
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ u64 object_size = rbd_obj_bytes(&rbd_dev->header);
+ struct rbd_obj_request *obj_request;
+ struct ceph_osd_request *osd_req;
+ u64 object_no;
+ int num_ops = 0;
+ u64 img_offset;
+ u64 offset;
+
+ img_offset = img_request->offset;
+ offset = rbd_segment_offset(rbd_dev, img_offset);
+
+ /*
+ * LIO currently only supports 1 sector reqs and we assume the req
+ * will not span segments.
+ */
+ if (rbd_segment_length(rbd_dev, offset, cmp_length) != cmp_length)
+ return -EOPNOTSUPP;
+
+ object_no = img_offset >> rbd_dev->header.obj_order;
+
+ obj_request = rbd_obj_request_create(OBJ_REQUEST_SG);
+ if (!obj_request)
+ return -ENOMEM;
+
+ obj_request->object_no = object_no;
+ obj_request->offset = offset;
+ obj_request->length = cmp_length;
+
+ rbd_img_obj_request_add(img_request, obj_request);
+
+ osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_CMP_AND_WRITE, 3,
+ obj_request);
+ if (!osd_req)
+ goto del_obj_req;
+
+ obj_request->osd_req = osd_req;
+ obj_request->callback = rbd_img_obj_callback;
+ obj_request->img_offset = img_offset;
+
+ osd_req_op_alloc_hint_init(osd_req, num_ops, object_size, object_size);
+
+ num_ops++;
+ osd_req_op_extent_init(osd_req, num_ops, CEPH_OSD_OP_CMPEXT, offset,
+ cmp_length, 0, 0);
+ osd_req_op_extent_osd_data_sg(osd_req, num_ops, cmp_sgl, 0, cmp_length);
+
+ num_ops++;
+ osd_req_op_extent_init(osd_req, num_ops, CEPH_OSD_OP_WRITE, offset,
+ write_length, 0, 0);
+ osd_req_op_extent_osd_data_sg(osd_req, num_ops, write_sgl, 0,
+ write_length);
+
+ rbd_osd_req_format_rw(obj_request);
+
+ if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+ goto del_obj_req;
+
+ rbd_img_request_get(img_request);
+ return 0;
+
+del_obj_req:
+ rbd_img_obj_request_del(img_request, obj_request);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(rbd_img_cmp_and_write_request_fill);
+
static void
rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
{