From: Jason Wang <jasowang@redhat.com>
Date: Thu, 4 Jan 2018 11:14:28 +0800
Subject: tuntap: XDP transmission
Patch-mainline: v4.16-rc1
Git-commit: fc72d1d54dd9ffe2552c76b17e9129803ca7b255
References: bsc#1109837

This patch implements XDP transmission for TAP. Since we can't create
new queues for TAP during XDP set, the existing ptr_ring is reused for
queuing XDP buffers. To distinguish an xdp_buff from an sk_buff,
TUN_XDP_FLAG (0x1UL) is encoded into the lowest bit of the xdp_buff
pointer during ptr_ring_produce and decoded during consumption. XDP
metadata is stored in the headroom of the packet, which should work in
most cases since drivers usually reserve enough headroom. Only very
minor changes are needed in vhost_net: it just needs to peek the
length depending on the type of pointer.
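
As a minimal standalone sketch of the same low-bit tagging scheme
(user-space C with hypothetical names is_xdp/tag_xdp/untag_xdp; not
part of this patch):

  #include <assert.h>
  #include <stdint.h>

  #define TUN_XDP_FLAG 0x1UL

  /* xdp_buff pointers are at least word-aligned, so bit 0 is free
   * to carry a type tag; sk_buff pointers stay untagged. */
  static int is_xdp(void *ptr)
  {
          return (uintptr_t)ptr & TUN_XDP_FLAG;
  }

  static void *tag_xdp(void *ptr)
  {
          return (void *)((uintptr_t)ptr | TUN_XDP_FLAG);
  }

  static void *untag_xdp(void *ptr)
  {
          return (void *)((uintptr_t)ptr & ~TUN_XDP_FLAG);
  }

  int main(void)
  {
          long buf;                      /* stands in for an xdp_buff */
          void *tagged = tag_xdp(&buf);

          assert(is_xdp(tagged));        /* tagged pointer is XDP */
          assert(!is_xdp((void *)&buf)); /* raw pointer is not */
          assert(untag_xdp(tagged) == (void *)&buf);
          return 0;
  }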

Tests were done on two Intel E5-2630 2.40GHz machines connected back
to back through two 82599ES NICs. Traffic was generated/received
through MoonGen/testpmd (rxonly). The test shows a ~20% improvement
when xdp_redirect_map redirects from ixgbe to TAP (from 2.50 Mpps to
3.05 Mpps).

Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/tun.c      |  210 +++++++++++++++++++++++++++++++++++++++++--------
 drivers/vhost/net.c    |   13 ++-
 include/linux/if_tun.h |   17 +++
 3 files changed, 208 insertions(+), 32 deletions(-)

--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -241,6 +241,24 @@ struct tun_struct {
 	struct tun_steering_prog __rcu *steering_prog;
 };
 
+bool tun_is_xdp_buff(void *ptr)
+{
+	return (unsigned long)ptr & TUN_XDP_FLAG;
+}
+EXPORT_SYMBOL(tun_is_xdp_buff);
+
+void *tun_xdp_to_ptr(void *ptr)
+{
+	return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
+}
+EXPORT_SYMBOL(tun_xdp_to_ptr);
+
+void *tun_ptr_to_xdp(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
+}
+EXPORT_SYMBOL(tun_ptr_to_xdp);
+
 static int tun_napi_receive(struct napi_struct *napi, int budget)
 {
 	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
@@ -631,12 +649,25 @@ static struct tun_struct *tun_enable_que
 	return tun;
 }
 
+static void tun_ptr_free(void *ptr)
+{
+	if (!ptr)
+		return;
+	if (tun_is_xdp_buff(ptr)) {
+		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+		put_page(virt_to_head_page(xdp->data));
+	} else {
+		__skb_array_destroy_skb(ptr);
+	}
+}
+
 static void tun_queue_purge(struct tun_file *tfile)
 {
-	struct sk_buff *skb;
+	void *ptr;
 
-	while ((skb = ptr_ring_consume(&tfile->tx_ring)) != NULL)
-		kfree_skb(skb);
+	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
+		tun_ptr_free(ptr);
 
 	skb_queue_purge(&tfile->sk.sk_write_queue);
 	skb_queue_purge(&tfile->sk.sk_error_queue);
@@ -645,7 +676,7 @@ static void tun_queue_purge(struct tun_f
 static void tun_cleanup_tx_array(struct tun_file *tfile)
 {
 	if (tfile->tx_ring.queue) {
-		ptr_ring_cleanup(&tfile->tx_ring, __skb_array_destroy_skb);
+		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
 	}
@@ -1229,6 +1260,67 @@ static const struct net_device_ops tun_n
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
+static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct xdp_buff *buff = xdp->data_hard_start;
+	int headroom = xdp->data - xdp->data_hard_start;
+	struct tun_file *tfile;
+	u32 numqueues;
+	int ret = 0;
+
+	/* Assure headroom is available and buff is properly aligned */
+	if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
+		return -ENOSPC;
+
+	*buff = *xdp;
+
+	rcu_read_lock();
+
+	numqueues = READ_ONCE(tun->numqueues);
+	if (!numqueues) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
+					    numqueues]);
+	/* Encode the XDP flag into lowest bit for consumer to differ
+	 * XDP buffer from sk_buff.
+	 */
+	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
+		this_cpu_inc(tun->pcpu_stats->tx_dropped);
+		ret = -ENOSPC;
+	}
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static void tun_xdp_flush(struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+	u32 numqueues;
+
+	rcu_read_lock();
+
+	numqueues = READ_ONCE(tun->numqueues);
+	if (!numqueues)
+		goto out;
+
+	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
+					    numqueues]);
+	/* Notify and wake up reader process */
+	if (tfile->flags & TUN_FASYNC)
+		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+
+out:
+	rcu_read_unlock();
+}
+
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
@@ -1246,6 +1338,8 @@ static const struct net_device_ops tap_n
 	.ndo_set_rx_headroom	= tun_set_headroom,
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_bpf		= tun_xdp,
+	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xdp_flush		= tun_xdp_flush,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1868,6 +1962,40 @@ static ssize_t tun_chr_write_iter(struct
 	return result;
 }
 
+static ssize_t tun_put_user_xdp(struct tun_struct *tun,
+				struct tun_file *tfile,
+				struct xdp_buff *xdp,
+				struct iov_iter *iter)
+{
+	int vnet_hdr_sz = 0;
+	size_t size = xdp->data_end - xdp->data;
+	struct tun_pcpu_stats *stats;
+	size_t ret;
+
+	if (tun->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr gso = { 0 };
+
+		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
+		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
+			return -EINVAL;
+		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
+			     sizeof(gso)))
+			return -EFAULT;
+		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
+	}
+
+	ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
+
+	stats = get_cpu_ptr(tun->pcpu_stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->tx_packets++;
+	stats->tx_bytes += ret;
+	u64_stats_update_end(&stats->syncp);
+	put_cpu_ptr(tun->pcpu_stats);
+
+	return ret;
+}
+
 /* Put packet to the user space buffer */
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
@@ -1965,15 +2093,14 @@ done:
 	return total;
 }
 
-static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
-				     int *err)
+static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	struct sk_buff *skb = NULL;
+	void *ptr = NULL;
 	int error = 0;
 
-	skb = ptr_ring_consume(&tfile->tx_ring);
-	if (skb)
+	ptr = ptr_ring_consume(&tfile->tx_ring);
+	if (ptr)
 		goto out;
 	if (noblock) {
 		error = -EAGAIN;
@@ -1984,8 +2111,8 @@ static struct sk_buff *tun_ring_recv(str
 	current->state = TASK_INTERRUPTIBLE;
 
 	while (1) {
-		skb = ptr_ring_consume(&tfile->tx_ring);
-		if (skb)
+		ptr = ptr_ring_consume(&tfile->tx_ring);
+		if (ptr)
 			break;
 		if (signal_pending(current)) {
 			error = -ERESTARTSYS;
@@ -2004,12 +2131,12 @@ static struct sk_buff *tun_ring_recv(str
 
 out:
 	*err = error;
-	return skb;
+	return ptr;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			   struct iov_iter *to,
-			   int noblock, struct sk_buff *skb)
+			   int noblock, void *ptr)
 {
 	ssize_t ret;
 	int err;
@@ -2017,23 +2144,31 @@ static ssize_t tun_do_read(struct tun_st
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
 	if (!iov_iter_count(to)) {
-		if (skb)
-			kfree_skb(skb);
+		tun_ptr_free(ptr);
 		return 0;
 	}
 
-	if (!skb) {
+	if (!ptr) {
 		/* Read frames from ring */
-		skb = tun_ring_recv(tfile, noblock, &err);
-		if (!skb)
+		ptr = tun_ring_recv(tfile, noblock, &err);
+		if (!ptr)
 			return err;
 	}
 
-	ret = tun_put_user(tun, tfile, skb, to);
-	if (unlikely(ret < 0))
-		kfree_skb(skb);
-	else
-		consume_skb(skb);
+	if (tun_is_xdp_buff(ptr)) {
+		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+		ret = tun_put_user_xdp(tun, tfile, xdp, to);
+		put_page(virt_to_head_page(xdp->data));
+	} else {
+		struct sk_buff *skb = ptr;
+
+		ret = tun_put_user(tun, tfile, skb, to);
+		if (unlikely(ret < 0))
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
+	}
 
 	return ret;
 }
@@ -2170,12 +2305,12 @@ static int tun_recvmsg(struct socket *so
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
-	struct sk_buff *skb = m->msg_control;
+	void *ptr = m->msg_control;
 	int ret;
 
 	if (!tun) {
 		ret = -EBADFD;
-		goto out_free_skb;
+		goto out_free;
 	}
 
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
@@ -2187,7 +2322,7 @@ static int tun_recvmsg(struct socket *so
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
+	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
 	if (ret > (ssize_t)total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
@@ -2198,12 +2333,25 @@ out:
 
 out_put_tun:
 	tun_put(tun);
-out_free_skb:
-	if (skb)
-		kfree_skb(skb);
+out_free:
+	tun_ptr_free(ptr);
 	return ret;
 }
 
+static int tun_ptr_peek_len(void *ptr)
+{
+	if (likely(ptr)) {
+		if (tun_is_xdp_buff(ptr)) {
+			struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+			return xdp->data_end - xdp->data;
+		}
+		return __skb_array_len_with_tag(ptr);
+	} else {
+		return 0;
+	}
+}
+
 static int tun_peek_len(struct socket *sock)
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
@@ -2214,7 +2362,7 @@ static int tun_peek_len(struct socket *s
 	if (!tun)
 		return 0;
 
-	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, __skb_array_len_with_tag);
+	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
 	tun_put(tun);
 
 	return ret;
@@ -3139,7 +3287,7 @@ static int tun_queue_resize(struct tun_s
 
 	ret = ptr_ring_resize_multiple(rings, n,
 				       dev->tx_queue_len, GFP_KERNEL,
-				       __skb_array_destroy_skb);
+				       tun_ptr_free);
 
 	kfree(rings);
 	return ret;
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -175,6 +175,17 @@ static void vhost_net_buf_unproduce(stru
 	}
 }
 
+static int vhost_net_buf_peek_len(void *ptr)
+{
+	if (tun_is_xdp_buff(ptr)) {
+		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+		return xdp->data_end - xdp->data;
+	}
+
+	return __skb_array_len_with_tag(ptr);
+}
+
 static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
@@ -186,7 +197,7 @@ static int vhost_net_buf_peek(struct vho
 		return 0;
 
 out:
-	return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
 }
 
 static void vhost_net_buf_init(struct vhost_net_buf *rxq)
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -17,9 +17,14 @@
 
 #include <uapi/linux/if_tun.h>
 
+#define TUN_XDP_FLAG 0x1UL
+
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
+bool tun_is_xdp_buff(void *ptr);
+void *tun_xdp_to_ptr(void *ptr);
+void *tun_ptr_to_xdp(void *ptr);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -33,5 +38,17 @@ static inline struct ptr_ring *tun_get_t
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline bool tun_is_xdp_buff(void *ptr)
+{
+	return false;
+}
+static inline void *tun_xdp_to_ptr(void *ptr)
+{
+	return NULL;
+}
+static inline void *tun_ptr_to_xdp(void *ptr)
+{
+	return NULL;
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */