Blob Blame History Raw
From https://git.kernel.org/?p=linux/kernel/git/jeremy/xen.git (commit ada3f6a1ba43e163aab95c7808f11b88fc7c79e6)
Subject: pv-ops blktap2
From: xen-devel@lists.xen.org
Patch-mainline: n/a
Acked-by: jbeulich@suse.com

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/Makefile	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+
+blktap-objs := control.o ring.o device.o request.o sysfs.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/blktap.h	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,209 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+
+extern int blktap_debug_level;
+extern int blktap_ring_major;
+extern int blktap_device_major;
+
+#define BTPRINTK(level, tag, force, _f, _a...)				\
+	do {								\
+		if (blktap_debug_level > level &&			\
+		    (force || printk_ratelimit()))			\
+			printk(tag "%s: " _f, __func__, ##_a);		\
+	} while (0)
+
+#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define MAX_BLKTAP_DEVICE            1024
+
+#define BLKTAP_DEVICE                4
+#define BLKTAP_DEVICE_CLOSED         5
+#define BLKTAP_SHUTDOWN_REQUESTED    8
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE        1
+#define BLKTAP2_IOCTL_ALLOC_TAP      200
+#define BLKTAP2_IOCTL_FREE_TAP       201
+#define BLKTAP2_IOCTL_CREATE_DEVICE  202
+#define BLKTAP2_IOCTL_REMOVE_DEVICE  207
+
+#define BLKTAP2_MAX_MESSAGE_LEN      256
+
+#define BLKTAP2_RING_MESSAGE_CLOSE   3
+
+#define BLKTAP_REQUEST_FREE          0
+#define BLKTAP_REQUEST_PENDING       1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE		__RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM		BLK_RING_SIZE
+#define MAX_PENDING_REQS	BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req, _seg)					\
+        (_start +                                                       \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+
+struct grant_handle_pair {
+	grant_handle_t                 kernel;
+	grant_handle_t                 user;
+};
+#define INVALID_GRANT_HANDLE           0xFFFF
+
+struct blktap_handle {
+	unsigned int                   ring;
+	unsigned int                   device;
+	unsigned int                   minor;
+};
+
+struct blktap_params {
+	char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+	unsigned long long             capacity;
+	unsigned long                  sector_size;
+};
+
+struct blktap_device {
+	spinlock_t                     lock;
+	struct gendisk                *gd;
+};
+
+struct blktap_ring {
+	struct task_struct            *task;
+
+	struct vm_area_struct         *vma;
+	struct blkif_front_ring        ring;
+	unsigned long                  ring_vstart;
+	unsigned long                  user_vstart;
+
+	int                            n_pending;
+	struct blktap_request         *pending[MAX_PENDING_REQS];
+
+	wait_queue_head_t              poll_wait;
+
+	dev_t                          devno;
+	struct device                 *dev;
+};
+
+struct blktap_statistics {
+	unsigned long                  st_print;
+	int                            st_rd_req;
+	int                            st_wr_req;
+	int                            st_oo_req;
+	int                            st_rd_sect;
+	int                            st_wr_sect;
+	s64                            st_rd_cnt;
+	s64                            st_rd_sum_usecs;
+	s64                            st_rd_max_usecs;
+	s64                            st_wr_cnt;
+	s64                            st_wr_sum_usecs;
+	s64                            st_wr_max_usecs;	
+};
+
+struct blktap_request {
+	struct blktap                 *tap;
+	struct request                *rq;
+	int                            usr_idx;
+
+	int                            operation;
+	struct timeval                 time;
+
+	struct scatterlist             sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page                   *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	int                            nr_pages;
+};
+
+#define blktap_for_each_sg(_sg, _req, _i)	\
+	for (_sg = (_req)->sg_table, _i = 0;	\
+	     _i < (_req)->nr_pages;		\
+	     (_sg)++, (_i)++)
+
+struct blktap {
+	int                            minor;
+	unsigned long                  dev_inuse;
+
+	struct blktap_ring             ring;
+	struct blktap_device           device;
+	struct blktap_page_pool       *pool;
+
+	wait_queue_head_t              remove_wait;
+	struct work_struct             remove_work;
+	char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+
+	struct blktap_statistics       stats;
+};
+
+struct blktap_page_pool {
+	struct mempool_s              *bufs;
+	spinlock_t                     lock;
+	struct kobject                 kobj;
+	wait_queue_head_t              wait;
+};
+
+extern struct mutex blktap_lock;
+extern struct blktap **blktaps;
+extern int blktap_max_minor;
+
+int blktap_control_destroy_tap(struct blktap *);
+size_t blktap_control_debug(struct blktap *, char *, size_t);
+
+int blktap_ring_init(void);
+void blktap_ring_exit(void);
+size_t blktap_ring_debug(struct blktap *, char *, size_t);
+int blktap_ring_create(struct blktap *);
+int blktap_ring_destroy(struct blktap *);
+struct blktap_request *blktap_ring_make_request(struct blktap *);
+void blktap_ring_free_request(struct blktap *,struct blktap_request *);
+void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
+int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int);
+int blktap_ring_map_request(struct blktap *, struct blktap_request *);
+void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
+void blktap_ring_set_message(struct blktap *, int);
+void blktap_ring_kick_user(struct blktap *);
+
+int blktap_sysfs_init(void);
+void blktap_sysfs_exit(void);
+int blktap_sysfs_create(struct blktap *);
+void blktap_sysfs_destroy(struct blktap *);
+
+int blktap_device_init(void);
+void blktap_device_exit(void);
+size_t blktap_device_debug(struct blktap *, char *, size_t);
+int blktap_device_create(struct blktap *, struct blktap_params *);
+int blktap_device_destroy(struct blktap *);
+void blktap_device_destroy_sync(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
+void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
+
+int blktap_page_pool_init(struct kobject *);
+void blktap_page_pool_exit(void);
+struct blktap_page_pool *blktap_page_pool_get(const char *);
+
+size_t blktap_request_debug(struct blktap *, char *, size_t);
+struct blktap_request *blktap_request_alloc(struct blktap *);
+int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
+
+
+#endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/control.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,315 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+DEFINE_MUTEX(blktap_lock);
+
+struct blktap **blktaps;
+int blktap_max_minor;
+static struct blktap_page_pool *default_pool;
+
+static struct blktap *
+blktap_control_get_minor(void)
+{
+	int minor;
+	struct blktap *tap;
+
+	tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+	if (unlikely(!tap))
+		return NULL;
+
+	mutex_lock(&blktap_lock);
+
+	for (minor = 0; minor < blktap_max_minor; minor++)
+		if (!blktaps[minor])
+			break;
+
+	if (minor == MAX_BLKTAP_DEVICE)
+		goto fail;
+
+	if (minor == blktap_max_minor) {
+		void *p;
+		int n;
+
+		n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE);
+		p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
+		if (!p)
+			goto fail;
+
+		blktaps          = p;
+		minor            = blktap_max_minor;
+		blktap_max_minor = n;
+
+		memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
+	}
+
+	tap->minor = minor;
+	blktaps[minor] = tap;
+
+	__module_get(THIS_MODULE);
+out:
+	mutex_unlock(&blktap_lock);
+	return tap;
+
+fail:
+	mutex_unlock(&blktap_lock);
+	kfree(tap);
+	tap = NULL;
+	goto out;
+}
+
+static void
+blktap_control_put_minor(struct blktap* tap)
+{
+	blktaps[tap->minor] = NULL;
+	kfree(tap);
+
+	module_put(THIS_MODULE);
+}
+
+static struct blktap*
+blktap_control_create_tap(void)
+{
+	struct blktap *tap;
+	int err;
+
+	tap = blktap_control_get_minor();
+	if (!tap)
+		return NULL;
+
+	kobject_get(&default_pool->kobj);
+	tap->pool = default_pool;
+
+	err = blktap_ring_create(tap);
+	if (err)
+		goto fail_tap;
+
+	err = blktap_sysfs_create(tap);
+	if (err)
+		goto fail_ring;
+
+	return tap;
+
+fail_ring:
+	blktap_ring_destroy(tap);
+fail_tap:
+	blktap_control_put_minor(tap);
+
+	return NULL;
+}
+
+int
+blktap_control_destroy_tap(struct blktap *tap)
+{
+	int err;
+
+	err = blktap_ring_destroy(tap);
+	if (err)
+		return err;
+
+	kobject_put(&tap->pool->kobj);
+
+	blktap_sysfs_destroy(tap);
+
+	blktap_control_put_minor(tap);
+
+	return 0;
+}
+
+static int
+blktap_control_ioctl(struct inode *inode, struct file *filp,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct blktap *tap;
+
+	switch (cmd) {
+	case BLKTAP2_IOCTL_ALLOC_TAP: {
+		struct blktap_handle h;
+		void __user *ptr = (void __user*)arg;
+
+		tap = blktap_control_create_tap();
+		if (!tap)
+			return -ENOMEM;
+
+		h.ring   = blktap_ring_major;
+		h.device = blktap_device_major;
+		h.minor  = tap->minor;
+
+		if (copy_to_user(ptr, &h, sizeof(h))) {
+			blktap_control_destroy_tap(tap);
+			return -EFAULT;
+		}
+
+		return 0;
+	}
+
+	case BLKTAP2_IOCTL_FREE_TAP: {
+		int minor = arg;
+
+		if (minor > MAX_BLKTAP_DEVICE)
+			return -EINVAL;
+
+		tap = blktaps[minor];
+		if (!tap)
+			return -ENODEV;
+
+		return blktap_control_destroy_tap(tap);
+	}
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static struct file_operations blktap_control_file_operations = {
+	.owner    = THIS_MODULE,
+	.ioctl    = blktap_control_ioctl,
+};
+
+static struct miscdevice blktap_control = {
+	.minor    = MISC_DYNAMIC_MINOR,
+	.name     = "blktap-control",
+	.fops     = &blktap_control_file_operations,
+};
+
+static struct device *control_device;
+
+static ssize_t
+blktap_control_show_default_pool(struct device *device,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	return sprintf(buf, "%s", kobject_name(&default_pool->kobj));
+}
+
+static ssize_t
+blktap_control_store_default_pool(struct device *device,
+				  struct device_attribute *attr,
+				  const char *buf, size_t size)
+{
+	struct blktap_page_pool *pool, *tmp = default_pool;
+
+	pool = blktap_page_pool_get(buf);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+
+	default_pool = pool;
+	kobject_put(&tmp->kobj);
+
+	return size;
+}
+
+static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+		   blktap_control_show_default_pool,
+		   blktap_control_store_default_pool);
+
+size_t
+blktap_control_debug(struct blktap *tap, char *buf, size_t size)
+{
+	char *s = buf, *end = buf + size;
+
+	s += snprintf(s, end - s,
+		      "tap %u:%u name:'%s' flags:%#08lx\n",
+		      MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
+		      tap->name, tap->dev_inuse);
+
+	return s - buf;
+}
+
+static int __init
+blktap_control_init(void)
+{
+	int err;
+
+	err = misc_register(&blktap_control);
+	if (err)
+		return err;
+
+	control_device = blktap_control.this_device;
+
+	blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
+	blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
+	if (!blktaps) {
+		BTERR("failed to allocate blktap minor map");
+		return -ENOMEM;
+	}
+
+	err = blktap_page_pool_init(&control_device->kobj);
+	if (err)
+		return err;
+
+	default_pool = blktap_page_pool_get("default");
+	if (!default_pool)
+		return -ENOMEM;
+
+	err = device_create_file(control_device, &dev_attr_default_pool);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void
+blktap_control_exit(void)
+{
+	if (default_pool) {
+		kobject_put(&default_pool->kobj);
+		default_pool = NULL;
+	}
+
+	blktap_page_pool_exit();
+
+	if (blktaps) {
+		kfree(blktaps);
+		blktaps = NULL;
+	}
+
+	if (control_device) {
+		misc_deregister(&blktap_control);
+		control_device = NULL;
+	}
+}
+
+static void
+blktap_exit(void)
+{
+	blktap_control_exit();
+	blktap_ring_exit();
+	blktap_sysfs_exit();
+	blktap_device_exit();
+}
+
+static int __init
+blktap_init(void)
+{
+	int err;
+
+	err = blktap_device_init();
+	if (err)
+		goto fail;
+
+	err = blktap_ring_init();
+	if (err)
+		goto fail;
+
+	err = blktap_sysfs_init();
+	if (err)
+		goto fail;
+
+	err = blktap_control_init();
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	blktap_exit();
+	return err;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/device.c	2011-08-09 10:31:33.000000000 +0200
@@ -0,0 +1,566 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include "blktap.h"
+
+int blktap_device_major;
+
+#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
+
+static int
+blktap_device_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct blktap_device *tapdev = disk->private_data;
+
+	if (!tapdev)
+		return -ENXIO;
+
+	/* NB. we might have bounced a bd trylock by tapdisk. when
+	 * failing for reasons not !tapdev, make sure to kick tapdisk
+	 * out of destroy wait state again. */
+
+	return 0;
+}
+
+static int
+blktap_device_release(struct gendisk *disk, fmode_t mode)
+{
+	struct blktap_device *tapdev = disk->private_data;
+	struct block_device *bdev = bdget_disk(disk, 0);
+	struct blktap *tap = dev_to_blktap(tapdev);
+
+	bdput(bdev);
+
+	if (!bdev->bd_openers) {
+		set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
+		blktap_ring_kick_user(tap);
+	}
+
+	return 0;
+}
+
+static int
+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+	/* We don't have real geometry info, but let's at least return
+	   values consistent with the size of the device */
+	sector_t nsect = get_capacity(bd->bd_disk);
+	sector_t cylinders = nsect;
+
+	hg->heads = 0xff;
+	hg->sectors = 0x3f;
+	sector_div(cylinders, hg->heads * hg->sectors);
+	hg->cylinders = cylinders;
+	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+		hg->cylinders = 0xffff;
+	return 0;
+}
+
+static int
+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
+		    unsigned command, unsigned long argument)
+{
+	int i;
+
+	switch (command) {
+	case CDROMMULTISESSION:
+		BTDBG("FIXME: support multisession CDs later\n");
+		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+			if (put_user(0, (char __user *)(argument + i)))
+				return -EFAULT;
+		return 0;
+
+	case SCSI_IOCTL_GET_IDLUN:
+		if (!access_ok(VERIFY_WRITE, argument, 
+			sizeof(struct scsi_idlun)))
+			return -EFAULT;
+
+		/* return 0 for now. */
+		__put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
+		__put_user(0, 
+			&((struct scsi_idlun __user *)argument)->host_unique_id);
+		return 0;
+
+	default:
+		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+		  command);*/
+		return -EINVAL; /* same return as native Linux */
+	}
+
+	return 0;
+}
+
+static struct block_device_operations blktap_device_file_operations = {
+	.owner     = THIS_MODULE,
+	.open      = blktap_device_open,
+	.release   = blktap_device_release,
+	.ioctl     = blktap_device_ioctl,
+	.getgeo    = blktap_device_getgeo
+};
+
+/* NB. __blktap holding the queue lock; blktap where unlocked */
+
+static inline struct request*
+__blktap_next_queued_rq(struct request_queue *q)
+{
+	return blk_peek_request(q);
+}
+
+static inline void
+__blktap_dequeue_rq(struct request *rq)
+{
+	blk_start_request(rq);
+}
+
+/* NB. err == 0 indicates success, failures < 0 */
+
+static inline void
+__blktap_end_queued_rq(struct request *rq, int err)
+{
+	blk_start_request(rq);
+	__blk_end_request(rq, err, blk_rq_bytes(rq));
+}
+
+static inline void
+__blktap_end_rq(struct request *rq, int err)
+{
+	__blk_end_request(rq, err, blk_rq_bytes(rq));
+}
+
+static inline void
+blktap_end_rq(struct request *rq, int err)
+{
+	struct request_queue *q = rq->q;
+
+	spin_lock_irq(q->queue_lock);
+	__blktap_end_rq(rq, err);
+	spin_unlock_irq(q->queue_lock);
+}
+
+void
+blktap_device_end_request(struct blktap *tap,
+			  struct blktap_request *request,
+			  int error)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct request *rq = request->rq;
+
+	blktap_ring_unmap_request(tap, request);
+
+	blktap_ring_free_request(tap, request);
+
+	dev_dbg(disk_to_dev(tapdev->gd),
+		"end_request: op=%d error=%d bytes=%d\n",
+		rq_data_dir(rq), error, blk_rq_bytes(rq));
+
+	blktap_end_rq(rq, error);
+}
+
+int
+blktap_device_make_request(struct blktap *tap, struct request *rq)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct blktap_request *request;
+	int write, nsegs;
+	int err;
+
+	request = blktap_ring_make_request(tap);
+	if (IS_ERR(request)) {
+		err = PTR_ERR(request);
+		request = NULL;
+
+		if (err == -ENOSPC || err == -ENOMEM)
+			goto stop;
+
+		goto fail;
+	}
+
+	write = rq_data_dir(rq) == WRITE;
+	nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);
+
+	dev_dbg(disk_to_dev(tapdev->gd),
+		"make_request: op=%c bytes=%d nsegs=%d\n",
+		write ? 'w' : 'r', blk_rq_bytes(rq), nsegs);
+
+	request->rq = rq;
+	request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
+
+	err = blktap_request_get_pages(tap, request, nsegs);
+	if (err)
+		goto stop;
+
+	err = blktap_ring_map_request(tap, request);
+	if (err)
+		goto fail;
+
+	blktap_ring_submit_request(tap, request);
+
+	return 0;
+
+stop:
+	tap->stats.st_oo_req++;
+	err = -EBUSY;
+
+_out:
+	if (request)
+		blktap_ring_free_request(tap, request);
+
+	return err;
+fail:
+	if (printk_ratelimit())
+		dev_warn(disk_to_dev(tapdev->gd),
+			 "make request: %d, failing\n", err);
+	goto _out;
+}
+
+/*
+ * called from tapdisk context
+ */
+void
+blktap_device_run_queue(struct blktap *tap)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct request_queue *q;
+	struct request *rq;
+	int err;
+
+	if (!tapdev->gd)
+		return;
+
+	q = tapdev->gd->queue;
+
+	spin_lock_irq(&tapdev->lock);
+	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+	do {
+		rq = __blktap_next_queued_rq(q);
+		if (!rq)
+			break;
+
+		if (!blk_fs_request(rq)) {
+			__blktap_end_queued_rq(rq, -EOPNOTSUPP);
+			continue;
+		}
+
+		spin_unlock_irq(&tapdev->lock);
+
+		err = blktap_device_make_request(tap, rq);
+
+		spin_lock_irq(&tapdev->lock);
+
+		if (err == -EBUSY) {
+			blk_stop_queue(q);
+			break;
+		}
+
+		__blktap_dequeue_rq(rq);
+
+		if (unlikely(err))
+			__blktap_end_rq(rq, err);
+	} while (1);
+
+	spin_unlock_irq(&tapdev->lock);
+}
+
+static void
+blktap_device_do_request(struct request_queue *rq)
+{
+	struct blktap_device *tapdev = rq->queuedata;
+	struct blktap *tap = dev_to_blktap(tapdev);
+
+	blktap_ring_kick_user(tap);
+}
+
+static void
+blktap_device_configure(struct blktap *tap,
+			struct blktap_params *params)
+{
+	struct request_queue *rq;
+	struct blktap_device *dev = &tap->device;
+
+	dev = &tap->device;
+	rq  = dev->gd->queue;
+
+	spin_lock_irq(&dev->lock);
+
+	set_capacity(dev->gd, params->capacity);
+
+	/* Hard sector size and max sectors impersonate the equiv. hardware. */
+	blk_queue_logical_block_size(rq, params->sector_size);
+	blk_queue_max_sectors(rq, 512);
+
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+	/* Ensure a merged request will fit in a single I/O ring slot. */
+	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+	/* Make sure buffer addresses are sector-aligned. */
+	blk_queue_dma_alignment(rq, 511);
+
+	/* We are reordering, but cacheless. */
+	blk_queue_ordered(rq, QUEUE_ORDERED_DRAIN, NULL);
+
+	spin_unlock_irq(&dev->lock);
+}
+
+static int
+blktap_device_validate_params(struct blktap *tap,
+			      struct blktap_params *params)
+{
+	struct device *dev = tap->ring.dev;
+	int sector_order, name_sz;
+
+	sector_order = ffs(params->sector_size) - 1;
+
+	if (sector_order <  9 ||
+	    sector_order > 12 ||
+	    params->sector_size != 1U<<sector_order)
+		goto fail;
+
+	if (!params->capacity ||
+	    (params->capacity > ULLONG_MAX >> sector_order))
+		goto fail;
+
+	name_sz = min(sizeof(params->name), sizeof(tap->name));
+	if (strnlen(params->name, name_sz) >= name_sz)
+		goto fail;
+
+	return 0;
+
+fail:
+	params->name[name_sz-1] = 0;
+	dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
+		params->capacity, params->sector_size, params->name);
+	return -EINVAL;
+}
+
+int
+blktap_device_destroy(struct blktap *tap)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct block_device *bdev;
+	struct gendisk *gd;
+	int err;
+
+	gd = tapdev->gd;
+	if (!gd)
+		return 0;
+
+	bdev = bdget_disk(gd, 0);
+
+	err = !mutex_trylock(&bdev->bd_mutex);
+	if (err) {
+		/* NB. avoid a deadlock. the last opener syncs the
+		 * bdev holding bd_mutex. */
+		err = -EBUSY;
+		goto out_nolock;
+	}
+
+	if (bdev->bd_openers) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	del_gendisk(gd);
+	gd->private_data = NULL;
+
+	blk_cleanup_queue(gd->queue);
+
+	put_disk(gd);
+	tapdev->gd = NULL;
+
+	clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+	err = 0;
+out:
+	mutex_unlock(&bdev->bd_mutex);
+out_nolock:
+	bdput(bdev);
+
+	return err;
+}
+
+static void
+blktap_device_fail_queue(struct blktap *tap)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct request_queue *q = tapdev->gd->queue;
+
+	spin_lock_irq(&tapdev->lock);
+	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+	do {
+		struct request *rq = __blktap_next_queued_rq(q);
+		if (!rq)
+			break;
+
+		__blktap_end_queued_rq(rq, -EIO);
+	} while (1);
+
+	spin_unlock_irq(&tapdev->lock);
+}
+
+static int
+blktap_device_try_destroy(struct blktap *tap)
+{
+	int err;
+
+	err = blktap_device_destroy(tap);
+	if (err)
+		blktap_device_fail_queue(tap);
+
+	return err;
+}
+
+void
+blktap_device_destroy_sync(struct blktap *tap)
+{
+	wait_event(tap->ring.poll_wait,
+		   !blktap_device_try_destroy(tap));
+}
+
+int
+blktap_device_create(struct blktap *tap, struct blktap_params *params)
+{
+	int minor, err;
+	struct gendisk *gd;
+	struct request_queue *rq;
+	struct blktap_device *tapdev;
+
+	gd     = NULL;
+	rq     = NULL;
+	tapdev = &tap->device;
+	minor  = tap->minor;
+
+	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+		return -EEXIST;
+
+	if (blktap_device_validate_params(tap, params))
+		return -EINVAL;
+
+	gd = alloc_disk(1);
+	if (!gd) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (minor < 26) {
+		sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
+	} else if (minor < (26 + 1) * 26) {
+		sprintf(gd->disk_name, "td%c%c",
+			'a' + minor / 26 - 1,'a' + minor % 26);
+	} else {
+		const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
+		const unsigned int m2 = (minor / 26 - 1) % 26;
+		const unsigned int m3 =  minor % 26;
+		sprintf(gd->disk_name, "td%c%c%c",
+			'a' + m1, 'a' + m2, 'a' + m3);
+	}
+
+	gd->major = blktap_device_major;
+	gd->first_minor = minor;
+	gd->fops = &blktap_device_file_operations;
+	gd->private_data = tapdev;
+
+	spin_lock_init(&tapdev->lock);
+	rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
+	if (!rq) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	elevator_init(rq, "noop");
+
+	gd->queue     = rq;
+	rq->queuedata = tapdev;
+	tapdev->gd    = gd;
+
+	blktap_device_configure(tap, params);
+	add_disk(gd);
+
+	if (params->name[0])
+		strncpy(tap->name, params->name, sizeof(tap->name)-1);
+
+	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+
+	dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
+		 queue_logical_block_size(rq),
+		 (unsigned long long)get_capacity(gd));
+
+	return 0;
+
+fail:
+	if (gd)
+		del_gendisk(gd);
+	if (rq)
+		blk_cleanup_queue(rq);
+
+	return err;
+}
+
+size_t
+blktap_device_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct gendisk *disk = tap->device.gd;
+	struct request_queue *q;
+	struct block_device *bdev;
+	char *s = buf, *end = buf + size;
+
+	if (!disk)
+		return 0;
+
+	q = disk->queue;
+
+	s += snprintf(s, end - s,
+		      "disk capacity:%llu sector size:%u\n",
+		      (unsigned long long)get_capacity(disk),
+		      queue_logical_block_size(q));
+
+	s += snprintf(s, end - s,
+		      "queue flags:%#lx plugged:%d stopped:%d empty:%d\n",
+		      q->queue_flags,
+		      blk_queue_plugged(q), blk_queue_stopped(q),
+		      elv_queue_empty(q));
+
+	bdev = bdget_disk(disk, 0);
+	if (bdev) {
+		s += snprintf(s, end - s,
+			      "bdev openers:%d closed:%d\n",
+			      bdev->bd_openers,
+			      test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
+		bdput(bdev);
+	}
+
+	return s - buf;
+}
+
+int __init
+blktap_device_init()
+{
+	int major;
+
+	/* Dynamically allocate a major for this device */
+	major = register_blkdev(0, "tapdev");
+	if (major < 0) {
+		BTERR("Couldn't register blktap device\n");
+		return -ENOMEM;
+	}
+
+	blktap_device_major = major;
+	BTINFO("blktap device major %d\n", major);
+
+	return 0;
+}
+
+void
+blktap_device_exit(void)
+{
+	if (blktap_device_major)
+		unregister_blkdev(blktap_device_major, "tapdev");
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/request.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,418 @@
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+
+#include "blktap.h"
+
+/* max pages per shared pool. just to prevent accidental dos. */
+#define POOL_MAX_PAGES           (256*BLKIF_MAX_SEGMENTS_PER_REQUEST)
+
+/* default page pool size. when considering to shrink a shared pool,
+ * note that paused tapdisks may grab a whole lot of pages for a long
+ * time. */
+#define POOL_DEFAULT_PAGES       (2 * MMAP_PAGES)
+
+/* max number of pages allocatable per request. */
+#define POOL_MAX_REQUEST_PAGES   BLKIF_MAX_SEGMENTS_PER_REQUEST
+
+/* min request structs per pool. These grow dynamically. */
+#define POOL_MIN_REQS            BLK_RING_SIZE
+
+static struct kset *pool_set;
+
+#define kobj_to_pool(_kobj) \
+	container_of(_kobj, struct blktap_page_pool, kobj)
+
+static struct kmem_cache *request_cache;
+static mempool_t *request_pool;
+
+static void
+__page_pool_wake(struct blktap_page_pool *pool)
+{
+	mempool_t *mem = pool->bufs;
+
+	/*
+	  NB. slightly wasteful to always wait for a full segment
+	  set. but this ensures the next disk makes
+	  progress. presently, the repeated request struct
+	  alloc/release cycles would otherwise keep everyone spinning.
+	*/
+
+	if (mem->curr_nr >= POOL_MAX_REQUEST_PAGES)
+		wake_up(&pool->wait);
+}
+
+int
+blktap_request_get_pages(struct blktap *tap,
+			 struct blktap_request *request, int nr_pages)
+{
+	struct blktap_page_pool *pool = tap->pool;
+	mempool_t *mem = pool->bufs;
+	struct page *page;
+
+	BUG_ON(request->nr_pages != 0);
+	BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES);
+
+	if (mem->curr_nr < nr_pages)
+		return -ENOMEM;
+
+	/* NB. avoid thundering herds of tapdisks colliding. */
+	spin_lock(&pool->lock);
+
+	if (mem->curr_nr < nr_pages) {
+		spin_unlock(&pool->lock);
+		return -ENOMEM;
+	}
+
+	while (request->nr_pages < nr_pages) {
+		page = mempool_alloc(mem, GFP_NOWAIT);
+		BUG_ON(!page);
+		request->pages[request->nr_pages++] = page;
+	}
+
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+static void
+blktap_request_put_pages(struct blktap *tap,
+			 struct blktap_request *request)
+{
+	struct blktap_page_pool *pool = tap->pool;
+	struct page *page;
+
+	while (request->nr_pages) {
+		page = request->pages[--request->nr_pages];
+		mempool_free(page, pool->bufs);
+	}
+}
+
+size_t
+blktap_request_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct blktap_page_pool *pool = tap->pool;
+	mempool_t *mem = pool->bufs;
+	char *s = buf, *end = buf + size;
+
+	s += snprintf(buf, end - s,
+		      "pool:%s pages:%d free:%d\n",
+		      kobject_name(&pool->kobj),
+		      mem->min_nr, mem->curr_nr);
+
+	return s - buf;
+}
+
+struct blktap_request*
+blktap_request_alloc(struct blktap *tap)
+{
+	struct blktap_request *request;
+
+	request = mempool_alloc(request_pool, GFP_NOWAIT);
+	if (request)
+		request->tap = tap;
+
+	return request;
+}
+
+void
+blktap_request_free(struct blktap *tap,
+		    struct blktap_request *request)
+{
+	blktap_request_put_pages(tap, request);
+
+	mempool_free(request, request_pool);
+
+	__page_pool_wake(tap->pool);
+}
+
+void
+blktap_request_bounce(struct blktap *tap,
+		      struct blktap_request *request,
+		      int seg, int write)
+{
+	struct scatterlist *sg = &request->sg_table[seg];
+	void *s, *p;
+
+	BUG_ON(seg >= request->nr_pages);
+
+	s = sg_virt(sg);
+	p = page_address(request->pages[seg]) + sg->offset;
+
+	if (write)
+		memcpy(p, s, sg->length);
+	else
+		memcpy(s, p, sg->length);
+}
+
+static void
+blktap_request_ctor(void *obj)
+{
+	struct blktap_request *request = obj;
+
+	memset(request, 0, sizeof(*request));
+	sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table));
+}
+
+static int
+blktap_page_pool_resize(struct blktap_page_pool *pool, int target)
+{
+	mempool_t *bufs = pool->bufs;
+	int err;
+
+	/* NB. mempool asserts min_nr >= 1 */
+	target = max(1, target);
+
+	err = mempool_resize(bufs, target, GFP_KERNEL);
+	if (err)
+		return err;
+
+	__page_pool_wake(pool);
+
+	return 0;
+}
+
+struct pool_attribute {
+	struct attribute attr;
+
+	ssize_t (*show)(struct blktap_page_pool *pool,
+			char *buf);
+
+	ssize_t (*store)(struct blktap_page_pool *pool,
+			 const char *buf, size_t count);
+};
+
+#define kattr_to_pool_attr(_kattr) \
+	container_of(_kattr, struct pool_attribute, attr)
+
+static ssize_t
+blktap_page_pool_show_size(struct blktap_page_pool *pool,
+			   char *buf)
+{
+	mempool_t *mem = pool->bufs;
+	return sprintf(buf, "%d", mem->min_nr);
+}
+
+static ssize_t
+blktap_page_pool_store_size(struct blktap_page_pool *pool,
+			    const char *buf, size_t size)
+{
+	int target;
+
+	/*
+	 * NB. target fixup to avoid undesired results. less than a
+	 * full segment set can wedge the disk. much more than a
+	 * couple times the physical queue depth is rarely useful.
+	 */
+
+	target = simple_strtoul(buf, NULL, 0);
+	target = max(POOL_MAX_REQUEST_PAGES, target);
+	target = min(target, POOL_MAX_PAGES);
+
+	return blktap_page_pool_resize(pool, target) ? : size;
+}
+
+static struct pool_attribute blktap_page_pool_attr_size =
+	__ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+	       blktap_page_pool_show_size,
+	       blktap_page_pool_store_size);
+
+static ssize_t
+blktap_page_pool_show_free(struct blktap_page_pool *pool,
+			   char *buf)
+{
+	mempool_t *mem = pool->bufs;
+	return sprintf(buf, "%d", mem->curr_nr);
+}
+
+static struct pool_attribute blktap_page_pool_attr_free =
+	__ATTR(free, S_IRUSR|S_IRGRP|S_IROTH,
+	       blktap_page_pool_show_free,
+	       NULL);
+
+static struct attribute *blktap_page_pool_attrs[] = {
+	&blktap_page_pool_attr_size.attr,
+	&blktap_page_pool_attr_free.attr,
+	NULL,
+};
+
+static inline struct kobject*
+__blktap_kset_find_obj(struct kset *kset, const char *name)
+{
+	struct kobject *k;
+	struct kobject *ret = NULL;
+
+	spin_lock(&kset->list_lock);
+	list_for_each_entry(k, &kset->list, entry) {
+		if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
+			ret = kobject_get(k);
+			break;
+		}
+	}
+	spin_unlock(&kset->list_lock);
+	return ret;
+}
+
+static ssize_t
+blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
+			   char *buf)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+	if (attr->show)
+		return attr->show(pool, buf);
+
+	return -EIO;
+}
+
+static ssize_t
+blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
+			    const char *buf, size_t size)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+	if (attr->show)
+		return attr->store(pool, buf, size);
+
+	return -EIO;
+}
+
+static struct sysfs_ops blktap_page_pool_sysfs_ops = {
+	.show		= blktap_page_pool_show_attr,
+	.store		= blktap_page_pool_store_attr,
+};
+
+static void
+blktap_page_pool_release(struct kobject *kobj)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	mempool_destroy(pool->bufs);
+	kfree(pool);
+}
+
+struct kobj_type blktap_page_pool_ktype = {
+	.release       = blktap_page_pool_release,
+	.sysfs_ops     = &blktap_page_pool_sysfs_ops,
+	.default_attrs = blktap_page_pool_attrs,
+};
+
+static void*
+__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	struct page *page;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return NULL;
+
+	page = alloc_page(gfp_mask);
+	if (page)
+		SetPageReserved(page);
+
+	return page;
+}
+
+static void
+__mempool_page_free(void *element, void *pool_data)
+{
+	struct page *page = element;
+
+	ClearPageReserved(page);
+	put_page(page);
+}
+
+static struct kobject*
+blktap_page_pool_create(const char *name, int nr_pages)
+{
+	struct blktap_page_pool *pool;
+	int err;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		goto fail;
+
+	spin_lock_init(&pool->lock);
+	init_waitqueue_head(&pool->wait);
+
+	pool->bufs = mempool_create(nr_pages,
+				    __mempool_page_alloc, __mempool_page_free,
+				    pool);
+	if (!pool->bufs)
+		goto fail_pool;
+
+	kobject_init(&pool->kobj, &blktap_page_pool_ktype);
+	pool->kobj.kset = pool_set;
+	err = kobject_add(&pool->kobj, &pool_set->kobj, "%s", name);
+	if (err)
+		goto fail_bufs;
+
+	return &pool->kobj;
+
+	kobject_del(&pool->kobj);
+fail_bufs:
+	mempool_destroy(pool->bufs);
+fail_pool:
+	kfree(pool);
+fail:
+	return NULL;
+}
+
+struct blktap_page_pool*
+blktap_page_pool_get(const char *name)
+{
+	struct kobject *kobj;
+
+	kobj = __blktap_kset_find_obj(pool_set, name);
+	if (!kobj)
+		kobj = blktap_page_pool_create(name,
+					       POOL_DEFAULT_PAGES);
+	if (!kobj)
+		return ERR_PTR(-ENOMEM);
+
+	return kobj_to_pool(kobj);
+}
+
+int __init
+blktap_page_pool_init(struct kobject *parent)
+{
+	request_cache =
+		kmem_cache_create("blktap-request",
+				  sizeof(struct blktap_request), 0,
+				  0, blktap_request_ctor);
+	if (!request_cache)
+		return -ENOMEM;
+
+	request_pool =
+		mempool_create_slab_pool(POOL_MIN_REQS, request_cache);
+	if (!request_pool)
+		return -ENOMEM;
+
+	pool_set = kset_create_and_add("pools", NULL, parent);
+	if (!pool_set)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void
+blktap_page_pool_exit(void)
+{
+	if (pool_set) {
+		BUG_ON(!list_empty(&pool_set->list));
+		kset_unregister(pool_set);
+		pool_set = NULL;
+	}
+
+	if (request_pool) {
+		mempool_destroy(request_pool);
+		request_pool = NULL;
+	}
+
+	if (request_cache) {
+		kmem_cache_destroy(request_cache);
+		request_cache = NULL;
+	}
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/ring.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,550 @@
+
+#include <linux/device.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/poll.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_ring_major;
+static struct cdev blktap_ring_cdev;
+
+ /* 
+  * BLKTAP - immediately before the mmap area,
+  * we have a bunch of pages reserved for shared memory rings.
+  */
+#define RING_PAGES 1
+
+static void
+blktap_ring_read_response(struct blktap *tap,
+		     const struct blkif_response *rsp)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blktap_request *request;
+	int usr_idx, err;
+
+	request = NULL;
+
+	usr_idx = rsp->id;
+	if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
+		err = -ERANGE;
+		goto invalid;
+	}
+
+	request = ring->pending[usr_idx];
+
+	if (!request) {
+		err = -ESRCH;
+		goto invalid;
+	}
+
+	if (rsp->operation != request->operation) {
+		err = -EINVAL;
+		goto invalid;
+	}
+
+	dev_dbg(ring->dev,
+		"request %d [%p] response: %d\n",
+		request->usr_idx, request, rsp->status);
+
+	err = rsp->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+end_request:
+	blktap_device_end_request(tap, request, err);
+	return;
+
+invalid:
+	dev_warn(ring->dev,
+		 "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
+		 usr_idx, rsp->status,
+		 rsp->operation, request->operation,
+		 err);
+	if (request)
+		goto end_request;
+}
+
+static void
+blktap_read_ring(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blkif_response rsp;
+	RING_IDX rc, rp;
+
+	down_read(&current->mm->mmap_sem);
+	if (!ring->vma) {
+		up_read(&current->mm->mmap_sem);
+		return;
+	}
+
+	/* for each outstanding message on the ring  */
+	rp = ring->ring.sring->rsp_prod;
+	rmb();
+
+	for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
+		memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
+		blktap_ring_read_response(tap, &rsp);
+	}
+
+	ring->ring.rsp_cons = rc;
+
+	up_read(&current->mm->mmap_sem);
+}
+
+static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+static void
+blktap_ring_fail_pending(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blktap_request *request;
+	int usr_idx;
+
+	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+		request = ring->pending[usr_idx];
+		if (!request)
+			continue;
+
+		blktap_device_end_request(tap, request, -EIO);
+	}
+}
+
+static void
+blktap_ring_vm_close(struct vm_area_struct *vma)
+{
+	struct blktap *tap = vma->vm_private_data;
+	struct blktap_ring *ring = &tap->ring;
+	struct page *page = virt_to_page(ring->ring.sring);
+
+	blktap_ring_fail_pending(tap);
+
+	zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+	ClearPageReserved(page);
+	__free_page(page);
+
+	ring->vma = NULL;
+
+	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+		blktap_control_destroy_tap(tap);
+}
+
+static struct vm_operations_struct blktap_ring_vm_operations = {
+	.close    = blktap_ring_vm_close,
+	.fault    = blktap_ring_fault,
+};
+
+int
+blktap_ring_map_segment(struct blktap *tap,
+			struct blktap_request *request,
+			int seg)
+{
+	struct blktap_ring *ring = &tap->ring;
+	unsigned long uaddr;
+
+	uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
+	return vm_insert_page(ring->vma, uaddr, request->pages[seg]);
+}
+
+int
+blktap_ring_map_request(struct blktap *tap,
+			struct blktap_request *request)
+{
+	int seg, err = 0;
+	int write;
+
+	write = request->operation == BLKIF_OP_WRITE;
+
+	for (seg = 0; seg < request->nr_pages; seg++) {
+		if (write)
+			blktap_request_bounce(tap, request, seg, write);
+
+		err = blktap_ring_map_segment(tap, request, seg);
+		if (err)
+			break;
+	}
+
+	if (err)
+		blktap_ring_unmap_request(tap, request);
+
+	return err;
+}
+
+void
+blktap_ring_unmap_request(struct blktap *tap,
+			  struct blktap_request *request)
+{
+	struct blktap_ring *ring = &tap->ring;
+	unsigned long uaddr;
+	unsigned size;
+	int seg, read;
+
+	uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
+	size  = request->nr_pages << PAGE_SHIFT;
+	read  = request->operation == BLKIF_OP_READ;
+
+	if (read)
+		for (seg = 0; seg < request->nr_pages; seg++)
+			blktap_request_bounce(tap, request, seg, !read);
+
+	zap_page_range(ring->vma, uaddr, size, NULL);
+}
+
+void
+blktap_ring_free_request(struct blktap *tap,
+			 struct blktap_request *request)
+{
+	struct blktap_ring *ring = &tap->ring;
+
+	ring->pending[request->usr_idx] = NULL;
+	ring->n_pending--;
+
+	blktap_request_free(tap, request);
+}
+
+struct blktap_request*
+blktap_ring_make_request(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blktap_request *request;
+	int usr_idx;
+
+	if (RING_FULL(&ring->ring))
+		return ERR_PTR(-ENOSPC);
+
+	request = blktap_request_alloc(tap);
+	if (!request)
+		return ERR_PTR(-ENOMEM);
+
+	for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++)
+		if (!ring->pending[usr_idx])
+			break;
+
+	BUG_ON(usr_idx >= BLK_RING_SIZE);
+
+	request->tap     = tap;
+	request->usr_idx = usr_idx;
+
+	ring->pending[usr_idx] = request;
+	ring->n_pending++;
+
+	return request;
+}
+
+void
+blktap_ring_submit_request(struct blktap *tap,
+			   struct blktap_request *request)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blkif_request *breq;
+	struct scatterlist *sg;
+	int i, nsecs = 0;
+
+	dev_dbg(ring->dev,
+		"request %d [%p] submit\n", request->usr_idx, request);
+
+	breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
+
+	breq->id            = request->usr_idx;
+	breq->sector_number = blk_rq_pos(request->rq);
+	breq->handle        = 0;
+	breq->operation     = request->operation;
+	breq->nr_segments   = request->nr_pages;
+
+	blktap_for_each_sg(sg, request, i) {
+		struct blkif_request_segment *seg = &breq->seg[i];
+		int first, count;
+
+		count = sg->length >> 9;
+		first = sg->offset >> 9;
+
+		seg->first_sect = first;
+		seg->last_sect  = first + count - 1;
+
+		nsecs += count;
+	}
+
+	ring->ring.req_prod_pvt++;
+
+	do_gettimeofday(&request->time);
+
+
+	if (request->operation == BLKIF_OP_WRITE) {
+		tap->stats.st_wr_sect += nsecs;
+		tap->stats.st_wr_req++;
+	}
+
+	if (request->operation == BLKIF_OP_READ) {
+		tap->stats.st_rd_sect += nsecs;
+		tap->stats.st_rd_req++;
+	}
+}
+
+static int
+blktap_ring_open(struct inode *inode, struct file *filp)
+{
+	struct blktap *tap = NULL;
+	int minor;
+
+	minor = iminor(inode);
+
+	if (minor < blktap_max_minor)
+		tap = blktaps[minor];
+
+	if (!tap)
+		return -ENXIO;
+
+	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+		return -ENXIO;
+
+	if (tap->ring.task)
+		return -EBUSY;
+
+	filp->private_data = tap;
+	tap->ring.task = current;
+
+	return 0;
+}
+
+static int
+blktap_ring_release(struct inode *inode, struct file *filp)
+{
+	struct blktap *tap = filp->private_data;
+
+	blktap_device_destroy_sync(tap);
+
+	tap->ring.task = NULL;
+
+	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+		blktap_control_destroy_tap(tap);
+
+	return 0;
+}
+
+static int
+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct blktap *tap = filp->private_data;
+	struct blktap_ring *ring = &tap->ring;
+	struct blkif_sring *sring;
+	struct page *page = NULL;
+	int err;
+
+	if (ring->vma)
+		return -EBUSY;
+
+	page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	SetPageReserved(page);
+
+	err = vm_insert_page(vma, vma->vm_start, page);
+	if (err)
+		goto fail;
+
+	sring = page_address(page);
+	SHARED_RING_INIT(sring);
+	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
+	ring->ring_vstart = vma->vm_start;
+	ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
+
+	vma->vm_private_data = tap;
+
+	vma->vm_flags |= VM_DONTCOPY;
+	vma->vm_flags |= VM_RESERVED;
+
+	vma->vm_ops = &blktap_ring_vm_operations;
+
+	ring->vma = vma;
+	return 0;
+
+fail:
+	if (page) {
+		zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+		ClearPageReserved(page);
+		__free_page(page);
+	}
+
+	return err;
+}
+
+static int
+blktap_ring_ioctl(struct inode *inode, struct file *filp,
+		  unsigned int cmd, unsigned long arg)
+{
+	struct blktap *tap = filp->private_data;
+	struct blktap_ring *ring = &tap->ring;
+
+	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
+
+	if (!ring->vma || ring->vma->vm_mm != current->mm)
+		return -EACCES;
+
+	switch(cmd) {
+	case BLKTAP2_IOCTL_KICK_FE:
+
+		blktap_read_ring(tap);
+		return 0;
+
+	case BLKTAP2_IOCTL_CREATE_DEVICE: {
+		struct blktap_params params;
+		void __user *ptr = (void *)arg;
+
+		if (!arg)
+			return -EINVAL;
+
+		if (copy_from_user(&params, ptr, sizeof(params)))
+			return -EFAULT;
+
+		return blktap_device_create(tap, &params);
+	}
+
+	case BLKTAP2_IOCTL_REMOVE_DEVICE:
+
+		return blktap_device_destroy(tap);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
+{
+	struct blktap *tap = filp->private_data;
+	struct blktap_ring *ring = &tap->ring;
+	int work;
+
+	poll_wait(filp, &tap->pool->wait, wait);
+	poll_wait(filp, &ring->poll_wait, wait);
+
+	down_read(&current->mm->mmap_sem);
+	if (ring->vma && tap->device.gd)
+		blktap_device_run_queue(tap);
+	up_read(&current->mm->mmap_sem);
+
+	work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
+	RING_PUSH_REQUESTS(&ring->ring);
+
+	if (work ||
+	    ring->ring.sring->private.tapif_user.msg ||
+	    test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static struct file_operations blktap_ring_file_operations = {
+	.owner    = THIS_MODULE,
+	.open     = blktap_ring_open,
+	.release  = blktap_ring_release,
+	.ioctl    = blktap_ring_ioctl,
+	.mmap     = blktap_ring_mmap,
+	.poll     = blktap_ring_poll,
+};
+
+void
+blktap_ring_kick_user(struct blktap *tap)
+{
+	wake_up(&tap->ring.poll_wait);
+}
+
+int
+blktap_ring_destroy(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+
+	if (ring->task || ring->vma)
+		return -EBUSY;
+
+	return 0;
+}
+
+int
+blktap_ring_create(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+
+	init_waitqueue_head(&ring->poll_wait);
+	ring->devno = MKDEV(blktap_ring_major, tap->minor);
+
+	return 0;
+}
+
+size_t
+blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct blktap_ring *ring = &tap->ring;
+	char *s = buf, *end = buf + size;
+	int usr_idx;
+
+	s += snprintf(s, end - s,
+		      "begin pending:%d\n", ring->n_pending);
+
+	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+		struct blktap_request *request;
+		struct timeval *time;
+		int write;
+
+		request = ring->pending[usr_idx];
+		if (!request)
+			continue;
+
+		write = request->operation == BLKIF_OP_WRITE;
+		time  = &request->time;
+
+		s += snprintf(s, end - s,
+			      "%02d: usr_idx:%02d "
+			      "op:%c nr_pages:%02d time:%lu.%09lu\n",
+			      usr_idx, request->usr_idx,
+			      write ? 'W' : 'R', request->nr_pages,
+			      time->tv_sec, time->tv_usec);
+	}
+
+	s += snprintf(s, end - s, "end pending\n");
+
+	return s - buf;
+}
+
+
+int __init
+blktap_ring_init(void)
+{
+	dev_t dev = 0;
+	int err;
+
+	cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations);
+	blktap_ring_cdev.owner = THIS_MODULE;
+
+	err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2");
+	if (err < 0) {
+		BTERR("error registering ring devices: %d\n", err);
+		return err;
+	}
+
+	err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE);
+	if (err) {
+		BTERR("error adding ring device: %d\n", err);
+		unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE);
+		return err;
+	}
+
+	blktap_ring_major = MAJOR(dev);
+	BTINFO("blktap ring major: %d\n", blktap_ring_major);
+
+	return 0;
+}
+
+void
+blktap_ring_exit(void)
+{
+	if (!blktap_ring_major)
+		return;
+
+	cdev_del(&blktap_ring_cdev);
+	unregister_chrdev_region(MKDEV(blktap_ring_major, 0),
+				 MAX_BLKTAP_DEVICE);
+
+	blktap_ring_major = 0;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-08-09/drivers/xen/blktap2-new/sysfs.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,288 @@
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_debug_level = 1;
+
+static struct class *class;
+
+static ssize_t
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct blktap *tap;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	if (size >= BLKTAP2_MAX_MESSAGE_LEN)
+		return -ENAMETOOLONG;
+
+	if (strnlen(buf, size) != size)
+		return -EINVAL;
+
+	strcpy(tap->name, buf);
+
+	return size;
+}
+
+static ssize_t
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct blktap *tap;
+	ssize_t size;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	if (tap->name[0])
+		size = sprintf(buf, "%s\n", tap->name);
+	else
+		size = sprintf(buf, "%d\n", tap->minor);
+
+	return size;
+}
+static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
+		   blktap_sysfs_get_name, blktap_sysfs_set_name);
+
+static void
+blktap_sysfs_remove_work(struct work_struct *work)
+{
+	struct blktap *tap
+		= container_of(work, struct blktap, remove_work);
+	blktap_control_destroy_tap(tap);
+}
+
+static ssize_t
+blktap_sysfs_remove_device(struct device *dev,
+			   struct device_attribute *attr,
+			   const char *buf, size_t size)
+{
+	struct blktap *tap;
+	int err;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return size;
+
+	if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+		goto wait;
+
+	if (tap->ring.vma) {
+		struct blkif_sring *sring = tap->ring.ring.sring;
+		sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
+		blktap_ring_kick_user(tap);
+	} else {
+		INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
+		schedule_work(&tap->remove_work);
+	}
+wait:
+	err = wait_event_interruptible(tap->remove_wait,
+				       !dev_get_drvdata(dev));
+	if (err)
+		return err;
+
+	return size;
+}
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
+static ssize_t
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct blktap *tap;
+	char *s = buf, *end = buf + PAGE_SIZE;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	s += blktap_control_debug(tap, s, end - s);
+
+	s += blktap_request_debug(tap, s, end - s);
+
+	s += blktap_device_debug(tap, s, end - s);
+
+	s += blktap_ring_debug(tap, s, end - s);
+
+	return s - buf;
+}
+static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
+
+static ssize_t
+blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct blktap *tap;
+	ssize_t rv = 0;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	if (tap->ring.task)
+		rv = sprintf(buf, "%d\n", tap->ring.task->pid);
+
+	return rv;
+}
+static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL);
+
+static ssize_t
+blktap_sysfs_show_pool(struct device *dev,
+		       struct device_attribute *attr,
+		       char *buf)
+{
+	struct blktap *tap = dev_get_drvdata(dev);
+	return sprintf(buf, "%s", kobject_name(&tap->pool->kobj));
+}
+
+static ssize_t
+blktap_sysfs_store_pool(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t size)
+{
+	struct blktap *tap = dev_get_drvdata(dev);
+	struct blktap_page_pool *pool, *tmp = tap->pool;
+
+	if (tap->device.gd)
+		return -EBUSY;
+
+	pool = blktap_page_pool_get(buf);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+
+	tap->pool = pool;
+	kobject_put(&tmp->kobj);
+
+	return size;
+}
+DEVICE_ATTR(pool, S_IRUSR|S_IWUSR,
+	    blktap_sysfs_show_pool, blktap_sysfs_store_pool);
+
+int
+blktap_sysfs_create(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct device *dev;
+	int err = 0;
+
+	init_waitqueue_head(&tap->remove_wait);
+
+	dev = device_create(class, NULL, ring->devno,
+			    tap, "blktap%d", tap->minor);
+	if (IS_ERR(dev))
+		err = PTR_ERR(dev);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_name);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_remove);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_debug);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_task);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_pool);
+	if (!err)
+		ring->dev = dev;
+	else
+		device_unregister(dev);
+
+	return err;
+}
+
+void
+blktap_sysfs_destroy(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct device *dev;
+
+	dev = ring->dev;
+
+	if (!dev)
+		return;
+
+	dev_set_drvdata(dev, NULL);
+	wake_up(&tap->remove_wait);
+
+	device_unregister(dev);
+	ring->dev = NULL;
+}
+
+static ssize_t
+blktap_sysfs_show_verbosity(struct class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", blktap_debug_level);
+}
+
+static ssize_t
+blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
+{
+	int level;
+
+	if (sscanf(buf, "%d", &level) == 1) {
+		blktap_debug_level = level;
+		return size;
+	}
+
+	return -EINVAL;
+}
+static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR,
+		  blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
+static ssize_t
+blktap_sysfs_show_devices(struct class *class, char *buf)
+{
+	int i, ret;
+	struct blktap *tap;
+
+	mutex_lock(&blktap_lock);
+
+	ret = 0;
+	for (i = 0; i < blktap_max_minor; i++) {
+		tap = blktaps[i];
+		if (!tap)
+			continue;
+
+		if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+			continue;
+
+		ret += sprintf(buf + ret, "%d %s\n", tap->minor, tap->name);
+	}
+
+	mutex_unlock(&blktap_lock);
+
+	return ret;
+}
+static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
+
+void
+blktap_sysfs_exit(void)
+{
+	if (class)
+		class_destroy(class);
+}
+
+int __init
+blktap_sysfs_init(void)
+{
+	struct class *cls;
+	int err = 0;
+
+	cls = class_create(THIS_MODULE, "blktap2");
+	if (IS_ERR(cls))
+		err = PTR_ERR(cls);
+	if (!err)
+		err = class_create_file(cls, &class_attr_verbosity);
+	if (!err)
+		err = class_create_file(cls, &class_attr_devices);
+	if (!err)
+		class = cls;
+	else
+		class_destroy(cls);
+
+	return err;
+}