Blob Blame History Raw
From aa489bba1604c95dab3caccca46fb07b71316151 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Wed, 29 Jul 2015 04:23:39 -0500
Subject: [PATCH] rbd: add support for scatterlist obj_request_type
References: fate#318836
Patch-mainline: Not yet, SES2 clustered LIO/RBD

This adds support for a scatterlist rbd obj_request_type, so LIO
can pass down its sg to rbd.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Acked-by: David Disseldorp <ddiss@suse.de>
Signed-off-by: Luis Henriques <lhenriques@suse.com>
[luis: rebased on top of a1fbb5e7bbb5 ("rbd: start enums at 1 instead of 0")]
---
 drivers/block/rbd.c |  104 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 10 deletions(-)

--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/workqueue.h>
+#include <linux/scatterlist.h>
 
 #include "rbd_types.h"
 
@@ -211,6 +212,7 @@ enum obj_request_type {
 	OBJ_REQUEST_NODATA = 1,
 	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
 	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
+	OBJ_REQUEST_SG,
 };
 
 enum obj_operation_type {
@@ -266,6 +268,10 @@ struct rbd_obj_request {
 			struct ceph_bvec_iter	bvec_pos;
 			u32			bvec_count;
 		};
+		struct {
+			struct scatterlist	*sg;
+			unsigned int		init_sg_offset;
+		};
 	};
 	struct bio_vec		*copyup_bvecs;
 	u32			copyup_bvec_count;
@@ -296,14 +302,20 @@ struct rbd_img_request {
 		u64			snap_id;	/* for reads */
 		struct ceph_snap_context *snapc;	/* for writes */
 	};
-	union {
-		struct request		*rq;		/* block request */
-		struct rbd_obj_request	*obj_request;	/* obj req initiator */
-	};
+
+	struct request		*rq;		/* block request */
+	struct rbd_obj_request	*obj_request;	/* obj req initiator */
+
 	spinlock_t		completion_lock;/* protects next_completion */
 	u32			next_completion;
 	rbd_img_callback_t	callback;
+	/*
+	 * xferred is the bytes that have successfully been transferred.
+	 * completed is the bytes that have been accounted for and includes
+	 * both failed and successfully transferred bytes.
+	 */
 	u64			xferred;/* aggregate bytes transferred */
+	u64			completed;
 	int			result;	/* first nonzero obj_request result */
 
 	u32			obj_request_count;
@@ -1273,6 +1285,34 @@ static void zero_bvecs(struct ceph_bvec_
 	}));
 }
 
+/*
+ * Zero @length bytes of the data described by @sgl, beginning @start bytes
+ * into the list.  Each entry is mapped with a single kmap_atomic() call.
+ */
+static void zero_sg(struct scatterlist *sgl, u64 start, u64 length)
+{
+	struct scatterlist *sg;
+	u64 end = start + length;
+	u64 pos = 0;
+
+	for (sg = sgl; sg && pos < end; pos += sg->length, sg = sg_next(sg)) {
+		u64 skip = start > pos ? start - pos : 0;
+		unsigned long flags;
+		unsigned int len;
+		void *kaddr;
+		if (skip >= sg->length)
+			continue;	/* entry ends before @start */
+		/* bytes left to zero are counted from pos + skip, not pos */
+		len = min_t(u64, sg->length - skip, end - pos - skip);
+		local_irq_save(flags);
+		kaddr = kmap_atomic(sg_page(sg));
+		memset(kaddr + sg->offset + skip, 0, len);
+		flush_dcache_page(sg_page(sg));
+		kunmap_atomic(kaddr);
+		local_irq_restore(flags);
+	}
+}
+
 /*
  * The default/initial value for all object request flags is 0.  For
  * each flag, once its value is set to 1 it is never reset to 0
@@ -1431,6 +1471,7 @@ static bool obj_request_type_valid(enum
 	case OBJ_REQUEST_NODATA:
 	case OBJ_REQUEST_BIO:
 	case OBJ_REQUEST_BVECS:
+	case OBJ_REQUEST_SG:
 		return true;
 	default:
 		return false;
@@ -1579,16 +1620,20 @@ rbd_img_obj_request_read_callback(struct
 	if (obj_request->result == -ENOENT) {
 		if (obj_request->type == OBJ_REQUEST_BIO)
 			zero_bios(&obj_request->bio_pos, 0, length);
-		else
+		else if (obj_request->type == OBJ_REQUEST_BVECS)
 			zero_bvecs(&obj_request->bvec_pos, 0, length);
+		else if (obj_request->type == OBJ_REQUEST_SG)
+			zero_sg(obj_request->sg, 0, length);
 		obj_request->result = 0;
 	} else if (xferred < length && !obj_request->result) {
 		if (obj_request->type == OBJ_REQUEST_BIO)
 			zero_bios(&obj_request->bio_pos, xferred,
 				  length - xferred);
-		else
+		else if (obj_request->type == OBJ_REQUEST_BVECS)
 			zero_bvecs(&obj_request->bvec_pos, xferred,
 				   length - xferred);
+		else if (obj_request->type == OBJ_REQUEST_SG)
+			zero_sg(obj_request->sg, xferred, length - xferred);
 	}
 	obj_request->xferred = length;
 	obj_request_done_set(obj_request);
@@ -1899,6 +1944,7 @@ static void rbd_obj_request_destroy(stru
 	case OBJ_REQUEST_NODATA:
 	case OBJ_REQUEST_BIO:
 	case OBJ_REQUEST_BVECS:
+	case OBJ_REQUEST_SG:
 		break;		/* Nothing to do */
 	default:
 		rbd_assert(0);
@@ -1997,6 +2043,7 @@ static struct rbd_img_request *rbd_img_r
 	img_request->rbd_dev = rbd_dev;
 	img_request->offset = offset;
 	img_request->length = length;
+	img_request->completed = 0;
 	if (op_type == OBJ_OP_DISCARD) {
 		img_request_discard_set(img_request);
 		img_request->snapc = snapc;
@@ -2119,18 +2166,22 @@ static bool rbd_img_obj_end_request(stru
 		 */
 		xferred = obj_request->length;
 	}
+	img_request->completed += xferred;
 
 	if (img_request_child_test(img_request)) {
 		rbd_assert(img_request->obj_request != NULL);
 		more = obj_request->which < img_request->obj_request_count - 1;
-	} else {
+	} else if (img_request->rq) {
 		blk_status_t status = errno_to_blk_status(result);
 
-		rbd_assert(img_request->rq != NULL);
-
 		more = blk_update_request(img_request->rq, status, xferred);
 		if (!more)
 			__blk_mq_end_request(img_request->rq, status);
+	} else {
+		if (img_request->completed < img_request->length)
+			more = true;
+		else
+			more = false;
 	}
 
 	return more;
@@ -2234,6 +2285,10 @@ static void rbd_img_obj_request_fill(str
 	else if (obj_request->type == OBJ_REQUEST_BVECS)
 		osd_req_op_extent_osd_data_bvec_pos(osd_request, num_ops,
 					&obj_request->bvec_pos);
+	else if (obj_request->type == OBJ_REQUEST_SG)
+		osd_req_op_extent_osd_data_sg(osd_request, num_ops,
+					obj_request->sg,
+					obj_request->init_sg_offset, length);
 
 	/* Discards are also writes */
 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
@@ -2259,7 +2314,9 @@ static int rbd_img_request_fill(struct r
 	struct rbd_obj_request *next_obj_request;
 	struct ceph_bio_iter bio_it;
 	struct ceph_bvec_iter bvec_it;
+	struct scatterlist *sgl = NULL;
 	enum obj_operation_type op_type;
+	unsigned int sg_offset = 0;
 	u64 img_offset;
 	u64 resid;
 
@@ -2277,6 +2334,8 @@ static int rbd_img_request_fill(struct r
 			   bio_it.iter.bi_sector << SECTOR_SHIFT);
 	} else if (type == OBJ_REQUEST_BVECS) {
 		bvec_it = *(struct ceph_bvec_iter *)data_desc;
+	} else if (type == OBJ_REQUEST_SG) {
+		sgl = data_desc;
 	}
 
 	while (resid) {
@@ -2306,6 +2365,27 @@ static int rbd_img_request_fill(struct r
 			obj_request->bvec_pos = bvec_it;
 			ceph_bvec_iter_shorten(&obj_request->bvec_pos, length);
 			ceph_bvec_iter_advance(&bvec_it, length);
+		} else if (type == OBJ_REQUEST_SG) {
+			u64 sg_length = 0;
+
+			obj_request->init_sg_offset = sg_offset;
+			obj_request->sg = sgl;
+			do {
+				sg_length += (sgl->length - sg_offset);
+				sg_offset = 0;
+				if (sg_length > length) {
+					sg_offset = sgl->length -
+						(sg_length - length);
+					break;
+				}
+				/*
+				 * For WRITE_SAME we have a single sg that
+				 * is written possibly multiple times over
+				 * img_request->length bytes.
+				 */
+				if (sg_next(sgl))
+					sgl = sg_next(sgl);
+			} while (true);
 		}
 
 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
@@ -2821,9 +2901,13 @@ static void rbd_img_parent_read(struct r
 	if (obj_request->type == OBJ_REQUEST_BIO)
 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 						&obj_request->bio_pos);
-	else
+	else if (obj_request->type == OBJ_REQUEST_BVECS)
 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BVECS,
 						&obj_request->bvec_pos);
+	else
+		result = rbd_img_request_fill(img_request, OBJ_REQUEST_SG,
+						obj_request->sg);
+
 	if (result)
 		goto out_err;