From: Firo Yang <firo.yang@suse.com>
Date: Mon, 20 Apr 2020 17:10:01 +0200
Subject: net: fix race condition in __inet_lookup_established()
Patch-mainline: Never, mainline solution (v5.5-rc3) too intrusive
References: bsc#1151794 bsc#1180624

Listening and established sockets share the same slab cache which has
the SLAB_TYPESAFE_BY_RCU flag set, but this flag only protects against
a slab page being freed and reused for a different slab cache (or
another purpose), not against the memory being reused for a new object
from the same slab cache. Therefore the loop in
__inet_lookup_established() can jump from an established socket to a
listening one.
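
A minimal user-space sketch of the hazard (illustration only, not
kernel code; the one-slot "cache" and the fake_sock type are made up
for the example): a type-stable cache may hand the same memory out
again as a new object, so a stale pointer still points at a valid
object of the right type, just no longer at the one that was looked
up, and a lockless reader has to re-check the lookup keys.

    #include <assert.h>
    #include <stdio.h>

    struct fake_sock { unsigned int hash; int is_listener; };

    /* One-slot "slab cache": freeing and reallocating hands out the
     * same memory again, as SLAB_TYPESAFE_BY_RCU allows under RCU. */
    static struct fake_sock slot;

    static struct fake_sock *cache_alloc(unsigned int hash, int listener)
    {
        slot.hash = hash;           /* same address, new identity */
        slot.is_listener = listener;
        return &slot;
    }

    int main(void)
    {
        /* reader looks up an established socket ... */
        struct fake_sock *stale = cache_alloc(1234, 0);

        /* ... which is freed and its slot reused for a listener */
        cache_alloc(80, 1);

        /* stale still points at a valid fake_sock, hence the need
         * to re-validate instead of trusting the old lookup */
        assert(stale->hash != 1234);
        printf("reused: hash=%u listener=%d\n",
               stale->hash, stale->is_listener);
        return 0;
    }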

Since commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt
under synflood"), the listener hashtable no longer uses "nulls" lists,
so after such a switch the sk_nulls_for_each_rcu() loop in
__inet_lookup_established() would hit a NULL end marker which it would
fail to recognize. Analogously, __inet_lookup_listener() and
inet_diag_dump_icsk() may hit the opposite switch, from a listener
socket to an established one.
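
For reference, the "nulls" end marker convention can be shown with a
simplified user-space sketch (struct hnode stands in for the kernel's
hlist_nulls_node; the encoding mirrors include/linux/list_nulls.h):

    #include <stdio.h>

    struct hnode { struct hnode *next; };

    /* A nulls list ends with an odd pointer (bit 0 set) encoding a
     * value, rather than with NULL as a regular hlist does. */
    #define NULLS_MARKER(v) \
        ((struct hnode *)(1UL | (((unsigned long)(v)) << 1)))
    #define is_a_nulls(p)   ((unsigned long)(p) & 1)

    int main(void)
    {
        struct hnode a, b;

        a.next = &b;
        b.next = NULLS_MARKER(42);  /* nulls list termination */

        for (struct hnode *p = &a; !is_a_nulls(p); p = p->next)
            printf("visited %p\n", (void *)p);

        /* Had b.next been NULL (regular hlist termination), the loop
         * would not stop: is_a_nulls(NULL) is false and the following
         * p->next dereference would fault. This is the failure mode
         * the checks below guard against. */
        return 0;
    }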

The upstream solution, commit 8dbd76e79a16 ("tcp/dccp: fix possible race
__inet_lookup_established()"), is rather intrusive and would break kABI
in a way which would be impossible to work around reliably. Therefore we
use a simpler patch which is safer at the expense of a minor performance
penalty.

For testing purposes, add a temporary debugging message whenever a race
resulting in a switch from an established socket to a listener or vice
versa is detected.

Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
Signed-off-by: Firo Yang <firo.yang@suse.com>
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
 net/ipv4/inet_diag.c       |  2 ++
 net/ipv4/inet_hashtables.c | 17 ++++++++++++++---
 net/ipv4/tcp_ipv4.c        | 13 +++++++++++++
 3 files changed, 29 insertions(+), 3 deletions(-)

--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -886,6 +886,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			sk_for_each(sk, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
+				if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+					break;
 				if (!net_eq(sock_net(sk), net))
 					continue;
 
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -213,12 +213,21 @@ struct sock *__inet_lookup_listener(struct net *net,
 {
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
-	int score, hiscore = 0, matches = 0, reuseport = 0;
+	int score, hiscore, matches, reuseport;
 	bool exact_dif = inet_exact_dif_match(net, skb);
-	struct sock *sk, *result = NULL;
-	u32 phash = 0;
+	struct sock *sk, *result;
+	u32 phash;
+
+begin:
+	hiscore = 0;
+	matches = 0;
+	reuseport = 0;
+	result = NULL;
+	phash = 0;
 
 	sk_for_each_rcu(sk, &ilb->head) {
+		if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+			goto begin;
 		score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
 		if (score > hiscore) {
 			reuseport = sk->sk_reuseport;
@@ -284,6 +293,8 @@ struct sock *__inet_lookup_established(struct net *net,
 
 begin:
 	sk_nulls_for_each_rcu(sk, node, &head->chain) {
+		if (unlikely(!node))
+			goto begin;
 		if (sk->sk_hash != hash)
 			continue;
 		if (likely(INET_MATCH(sk, net, acookie,
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1924,13 +1924,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	++st->offset;
 
 	sk = sk_next(sk);
+	if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+		goto next;
 get_sk:
 	sk_for_each_from(sk) {
+		if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+			break;
 		if (!net_eq(sock_net(sk), net))
 			continue;
 		if (sk->sk_family == st->family)
 			return sk;
 	}
+next:
 	spin_unlock(&ilb->lock);
 	st->offset = 0;
 	if (++st->bucket < INET_LHTABLE_SIZE)
@@ -1980,7 +1985,10 @@ static void *established_get_first(struct seq_file *seq)
 			continue;
 
 		spin_lock_bh(lock);
+retry:
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+			if (unlikely(!node))
+				goto retry;
 			if (sk->sk_family != st->family ||
 			    !net_eq(sock_net(sk), net)) {
 				continue;
@@ -2005,12 +2013,17 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	++st->offset;
 
 	sk = sk_nulls_next(sk);
+	if (unlikely(!sk))
+		goto next;
 
 	sk_nulls_for_each_from(sk, node) {
+		if (unlikely(!node))
+			break;
 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
 			return sk;
 	}
 
+next:
 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 	++st->bucket;
 	return established_get_first(seq);