Michal Kubecek a20a6c
From: Eric Dumazet <edumazet@google.com>
Michal Kubecek a20a6c
Date: Tue, 19 Sep 2017 16:27:06 -0700
Michal Kubecek a20a6c
Subject: ipv6: addrlabel: per netns list
Michal Kubecek a20a6c
Patch-mainline: v4.15-rc1
Michal Kubecek a20a6c
Git-commit: a90c9347e90ed1e9323d71402ed18023bc910cd8
Michal Kubecek a20a6c
References: bsc#1122982
Michal Kubecek a20a6c
Michal Kubecek a20a6c
Having a global list of labels does not scale to thousands of
Michal Kubecek a20a6c
netns in the cloud era. This causes quadratic behavior on
Michal Kubecek a20a6c
netns creation and deletion.
Michal Kubecek a20a6c
Michal Kubecek a20a6c
It is time to have a per-netns list of ~10 labels.
Michal Kubecek a20a6c
Michal Kubecek a20a6c
Tested:
Michal Kubecek a20a6c
Michal Kubecek a20a6c
$ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done)
Michal Kubecek a20a6c
[ perf record: Woken up 1 times to write data ]
Michal Kubecek a20a6c
[ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ]
Michal Kubecek a20a6c
Michal Kubecek a20a6c
real    0m20.837s # instead of 0m24.227s
Michal Kubecek a20a6c
user    0m0.328s
Michal Kubecek a20a6c
sys     0m20.338s # instead of 0m23.753s
Michal Kubecek a20a6c
Michal Kubecek a20a6c
    16.17%       ip  [kernel.kallsyms]  [k] netlink_broadcast_filtered
Michal Kubecek a20a6c
    12.30%       ip  [kernel.kallsyms]  [k] netlink_has_listeners
Michal Kubecek a20a6c
     6.76%       ip  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
Michal Kubecek a20a6c
     5.78%       ip  [kernel.kallsyms]  [k] memset_erms
Michal Kubecek a20a6c
     5.77%       ip  [kernel.kallsyms]  [k] kobject_uevent_env
Michal Kubecek a20a6c
     5.18%       ip  [kernel.kallsyms]  [k] refcount_sub_and_test
Michal Kubecek a20a6c
     4.96%       ip  [kernel.kallsyms]  [k] _raw_read_lock
Michal Kubecek a20a6c
     3.82%       ip  [kernel.kallsyms]  [k] refcount_inc_not_zero
Michal Kubecek a20a6c
     3.33%       ip  [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
Michal Kubecek a20a6c
     2.11%       ip  [kernel.kallsyms]  [k] unmap_page_range
Michal Kubecek a20a6c
     1.77%       ip  [kernel.kallsyms]  [k] __wake_up
Michal Kubecek a20a6c
     1.69%       ip  [kernel.kallsyms]  [k] strlen
Michal Kubecek a20a6c
     1.17%       ip  [kernel.kallsyms]  [k] __wake_up_common
Michal Kubecek a20a6c
     1.09%       ip  [kernel.kallsyms]  [k] insert_header
Michal Kubecek a20a6c
     1.04%       ip  [kernel.kallsyms]  [k] page_remove_rmap
Michal Kubecek a20a6c
     1.01%       ip  [kernel.kallsyms]  [k] consume_skb
Michal Kubecek a20a6c
     0.98%       ip  [kernel.kallsyms]  [k] netlink_trim
Michal Kubecek a20a6c
     0.51%       ip  [kernel.kallsyms]  [k] kernfs_link_sibling
Michal Kubecek a20a6c
     0.51%       ip  [kernel.kallsyms]  [k] filemap_map_pages
Michal Kubecek a20a6c
     0.46%       ip  [kernel.kallsyms]  [k] memcpy_erms
Michal Kubecek a20a6c
Michal Kubecek a20a6c
Signed-off-by: Eric Dumazet <edumazet@google.com>
Michal Kubecek a20a6c
Signed-off-by: David S. Miller <davem@davemloft.net>
Michal Kubecek a20a6c
Acked-by: Michal Kubecek <mkubecek@suse.cz>
Michal Kubecek a20a6c
Michal Kubecek a20a6c
---
Michal Kubecek a20a6c
 include/net/netns/ipv6.h |  5 +++
Michal Kubecek a20a6c
 net/ipv6/addrlabel.c     | 81 +++++++++++++++-------------------------
Michal Kubecek a20a6c
 2 files changed, 35 insertions(+), 51 deletions(-)
Michal Kubecek a20a6c
Michal Kubecek a20a6c
--- a/include/net/netns/ipv6.h
Michal Kubecek a20a6c
+++ b/include/net/netns/ipv6.h
Michal Kubecek a20a6c
@@ -86,6 +86,11 @@ struct netns_ipv6 {
Michal Kubecek a20a6c
 	atomic_t		dev_addr_genid;
Michal Kubecek a20a6c
 	atomic_t		fib6_sernum;
Michal Kubecek a20a6c
 	struct seg6_pernet_data *seg6_data;
Michal Kubecek a20a6c
+	struct {
Michal Kubecek a20a6c
+		struct hlist_head head;
Michal Kubecek a20a6c
+		spinlock_t	lock;
Michal Kubecek a20a6c
+		u32		seq;
Michal Kubecek a20a6c
+	} ip6addrlbl_table;
Michal Kubecek a20a6c
 };
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
Michal Kubecek a20a6c
--- a/net/ipv6/addrlabel.c
Michal Kubecek a20a6c
+++ b/net/ipv6/addrlabel.c
Michal Kubecek a20a6c
@@ -29,7 +29,6 @@
Michal Kubecek a20a6c
  * Policy Table
Michal Kubecek a20a6c
  */
Michal Kubecek a20a6c
 struct ip6addrlbl_entry {
Michal Kubecek a20a6c
-	possible_net_t lbl_net;
Michal Kubecek a20a6c
 	struct in6_addr prefix;
Michal Kubecek a20a6c
 	int prefixlen;
Michal Kubecek a20a6c
 	int ifindex;
Michal Kubecek a20a6c
@@ -40,19 +39,6 @@ struct ip6addrlbl_entry {
Michal Kubecek a20a6c
 	struct rcu_head rcu;
Michal Kubecek a20a6c
 };
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
-static struct ip6addrlbl_table
Michal Kubecek a20a6c
-{
Michal Kubecek a20a6c
-	struct hlist_head head;
Michal Kubecek a20a6c
-	spinlock_t lock;
Michal Kubecek a20a6c
-	u32 seq;
Michal Kubecek a20a6c
-} ip6addrlbl_table;
Michal Kubecek a20a6c
-
Michal Kubecek a20a6c
-static inline
Michal Kubecek a20a6c
-struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
Michal Kubecek a20a6c
-{
Michal Kubecek a20a6c
-	return read_pnet(&lbl->lbl_net);
Michal Kubecek a20a6c
-}
Michal Kubecek a20a6c
-
Michal Kubecek a20a6c
 /*
Michal Kubecek a20a6c
  * Default policy table (RFC6724 + extensions)
Michal Kubecek a20a6c
  *
Michal Kubecek a20a6c
@@ -147,13 +133,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p)
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 /* Find label */
Michal Kubecek a20a6c
-static bool __ip6addrlbl_match(struct net *net,
Michal Kubecek a20a6c
-			       const struct ip6addrlbl_entry *p,
Michal Kubecek a20a6c
+static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p,
Michal Kubecek a20a6c
 			       const struct in6_addr *addr,
Michal Kubecek a20a6c
 			       int addrtype, int ifindex)
Michal Kubecek a20a6c
 {
Michal Kubecek a20a6c
-	if (!net_eq(ip6addrlbl_net(p), net))
Michal Kubecek a20a6c
-		return false;
Michal Kubecek a20a6c
 	if (p->ifindex && p->ifindex != ifindex)
Michal Kubecek a20a6c
 		return false;
Michal Kubecek a20a6c
 	if (p->addrtype && p->addrtype != addrtype)
Michal Kubecek a20a6c
@@ -168,8 +151,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net,
Michal Kubecek a20a6c
 						  int type, int ifindex)
Michal Kubecek a20a6c
 {
Michal Kubecek a20a6c
 	struct ip6addrlbl_entry *p;
Michal Kubecek a20a6c
-	hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
-		if (__ip6addrlbl_match(net, p, addr, type, ifindex))
Michal Kubecek a20a6c
+
Michal Kubecek a20a6c
+	hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
+		if (__ip6addrlbl_match(p, addr, type, ifindex))
Michal Kubecek a20a6c
 			return p;
Michal Kubecek a20a6c
 	}
Michal Kubecek a20a6c
 	return NULL;
Michal Kubecek a20a6c
@@ -195,8 +179,7 @@ u32 ipv6_addr_label(struct net *net,
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 /* allocate one entry */
Michal Kubecek a20a6c
-static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
Michal Kubecek a20a6c
-						 const struct in6_addr *prefix,
Michal Kubecek a20a6c
+static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
Michal Kubecek a20a6c
 						 int prefixlen, int ifindex,
Michal Kubecek a20a6c
 						 u32 label)
Michal Kubecek a20a6c
 {
Michal Kubecek a20a6c
@@ -235,24 +218,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
Michal Kubecek a20a6c
 	newp->addrtype = addrtype;
Michal Kubecek a20a6c
 	newp->label = label;
Michal Kubecek a20a6c
 	INIT_HLIST_NODE(&newp->list);
Michal Kubecek a20a6c
-	write_pnet(&newp->lbl_net, net);
Michal Kubecek a20a6c
 	atomic_set(&newp->refcnt, 1);
Michal Kubecek a20a6c
 	return newp;
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 /* add a label */
Michal Kubecek a20a6c
-static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
Michal Kubecek a20a6c
+static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
Michal Kubecek a20a6c
+			    int replace)
Michal Kubecek a20a6c
 {
Michal Kubecek a20a6c
-	struct hlist_node *n;
Michal Kubecek a20a6c
 	struct ip6addrlbl_entry *last = NULL, *p = NULL;
Michal Kubecek a20a6c
+	struct hlist_node *n;
Michal Kubecek a20a6c
 	int ret = 0;
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
Michal Kubecek a20a6c
 		  replace);
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
-	hlist_for_each_entry_safe(p, n,	&ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
+	hlist_for_each_entry_safe(p, n,	&net->ipv6.ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
 		if (p->prefixlen == newp->prefixlen &&
Michal Kubecek a20a6c
-		    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
Michal Kubecek a20a6c
 		    p->ifindex == newp->ifindex &&
Michal Kubecek a20a6c
 		    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
Michal Kubecek a20a6c
 			if (!replace) {
Michal Kubecek a20a6c
@@ -272,10 +254,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
Michal Kubecek a20a6c
 	if (last)
Michal Kubecek a20a6c
 		hlist_add_behind_rcu(&newp->list, &last->list);
Michal Kubecek a20a6c
 	else
Michal Kubecek a20a6c
-		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
Michal Kubecek a20a6c
+		hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head);
Michal Kubecek a20a6c
 out:
Michal Kubecek a20a6c
 	if (!ret)
Michal Kubecek a20a6c
-		ip6addrlbl_table.seq++;
Michal Kubecek a20a6c
+		net->ipv6.ip6addrlbl_table.seq++;
Michal Kubecek a20a6c
 	return ret;
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
@@ -291,12 +273,12 @@ static int ip6addrlbl_add(struct net *net,
Michal Kubecek a20a6c
 		  __func__, prefix, prefixlen, ifindex, (unsigned int)label,
Michal Kubecek a20a6c
 		  replace);
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
-	newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label);
Michal Kubecek a20a6c
+	newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label);
Michal Kubecek a20a6c
 	if (IS_ERR(newp))
Michal Kubecek a20a6c
 		return PTR_ERR(newp);
Michal Kubecek a20a6c
-	spin_lock(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
-	ret = __ip6addrlbl_add(newp, replace);
Michal Kubecek a20a6c
-	spin_unlock(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	spin_lock(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	ret = __ip6addrlbl_add(net, newp, replace);
Michal Kubecek a20a6c
+	spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
 	if (ret)
Michal Kubecek a20a6c
 		ip6addrlbl_free(newp);
Michal Kubecek a20a6c
 	return ret;
Michal Kubecek a20a6c
@@ -314,9 +296,8 @@ static int __ip6addrlbl_del(struct net *net,
Michal Kubecek a20a6c
 	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
Michal Kubecek a20a6c
 		  __func__, prefix, prefixlen, ifindex);
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
-	hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
+	hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
 		if (p->prefixlen == prefixlen &&
Michal Kubecek a20a6c
-		    net_eq(ip6addrlbl_net(p), net) &&
Michal Kubecek a20a6c
 		    p->ifindex == ifindex &&
Michal Kubecek a20a6c
 		    ipv6_addr_equal(&p->prefix, prefix)) {
Michal Kubecek a20a6c
 			hlist_del_rcu(&p->list);
Michal Kubecek a20a6c
@@ -339,9 +320,9 @@ static int ip6addrlbl_del(struct net *net,
Michal Kubecek a20a6c
 		  __func__, prefix, prefixlen, ifindex);
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 	ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
Michal Kubecek a20a6c
-	spin_lock(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	spin_lock(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
 	ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex);
Michal Kubecek a20a6c
-	spin_unlock(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
 	return ret;
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
@@ -353,6 +334,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net)
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 	ADDRLABEL(KERN_DEBUG "%s\n", __func__);
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
+	spin_lock_init(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);
Michal Kubecek a20a6c
+
Michal Kubecek a20a6c
 	for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
Michal Kubecek a20a6c
 		int ret = ip6addrlbl_add(net,
Michal Kubecek a20a6c
 					 ip6addrlbl_init_table[i].prefix,
Michal Kubecek a20a6c
@@ -372,14 +356,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net)
Michal Kubecek a20a6c
 	struct hlist_node *n;
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 	/* Remove all labels belonging to the exiting net */
Michal Kubecek a20a6c
-	spin_lock(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
-	hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
-		if (net_eq(ip6addrlbl_net(p), net)) {
Michal Kubecek a20a6c
-			hlist_del_rcu(&p->list);
Michal Kubecek a20a6c
-			ip6addrlbl_put(p);
Michal Kubecek a20a6c
-		}
Michal Kubecek a20a6c
+	spin_lock(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
+		hlist_del_rcu(&p->list);
Michal Kubecek a20a6c
+		ip6addrlbl_put(p);
Michal Kubecek a20a6c
 	}
Michal Kubecek a20a6c
-	spin_unlock(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
+	spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 static struct pernet_operations ipv6_addr_label_ops = {
Michal Kubecek a20a6c
@@ -389,8 +371,6 @@ static struct pernet_operations ipv6_addr_label_ops = {
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 int __init ipv6_addr_label_init(void)
Michal Kubecek a20a6c
 {
Michal Kubecek a20a6c
-	spin_lock_init(&ip6addrlbl_table.lock);
Michal Kubecek a20a6c
-
Michal Kubecek a20a6c
 	return register_pernet_subsys(&ipv6_addr_label_ops);
Michal Kubecek a20a6c
 }
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
@@ -497,11 +477,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
Michal Kubecek a20a6c
 	int err;
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 	rcu_read_lock();
Michal Kubecek a20a6c
-	hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
-		if (idx >= s_idx &&
Michal Kubecek a20a6c
-		    net_eq(ip6addrlbl_net(p), net)) {
Michal Kubecek a20a6c
+	hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
Michal Kubecek a20a6c
+		if (idx >= s_idx) {
Michal Kubecek a20a6c
 			err = ip6addrlbl_fill(skb, p,
Michal Kubecek a20a6c
-					      ip6addrlbl_table.seq,
Michal Kubecek a20a6c
+					      net->ipv6.ip6addrlbl_table.seq,
Michal Kubecek a20a6c
 					      NETLINK_CB(cb->skb).portid,
Michal Kubecek a20a6c
 					      cb->nlh->nlmsg_seq,
Michal Kubecek a20a6c
 					      RTM_NEWADDRLABEL,
Michal Kubecek a20a6c
@@ -558,7 +537,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
Michal Kubecek a20a6c
 	p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
Michal Kubecek a20a6c
 	if (p && !ip6addrlbl_hold(p))
Michal Kubecek a20a6c
 		p = NULL;
Michal Kubecek a20a6c
-	lseq = ip6addrlbl_table.seq;
Michal Kubecek a20a6c
+	lseq = net->ipv6.ip6addrlbl_table.seq;
Michal Kubecek a20a6c
 	rcu_read_unlock();
Michal Kubecek a20a6c
 
Michal Kubecek a20a6c
 	if (!p) {