From: Firo Yang <firo.yang@suse.com>
Date: Mon, 20 Apr 2020 17:10:01 +0200
Subject: net: fix race condition in __inet_lookup_established()
Patch-mainline: Never, mainline solution (v5.5-rc3) too intrusive
References: bsc#1151794 bsc#1180624

Listening and established sockets share the same slab cache which has
SLAB_TYPESAFE_BY_RCU flag set but this only protects from a slab page being
freed and reused for a different slab cache (or other purpose), not from
being reused for a new object from the same slab cache. Therefore the loop
in __inet_lookup_established() can skip from an established socket to
a listening one.

Since commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under
synflood"), listener hashtable no longer uses "nulls" lists so that after
such switch, sk_nulls_for_each_rcu() loop in __inet_lookup_established()
would hit NULL as end marker which it would fail to recognize. Analogously,
__inet_lookup_listener() and inet_diag_dump_icsk() may hit an opposite
switch from a listener socket to an established one.

The upstream solution, commit 8dbd76e79a16 ("tcp/dccp: fix possible race
__inet_lookup_established()"), is rather intrusive and would break kABI
in a way which would be impossible to work around reliably. Therefore we
use a simpler patch which is safer at the expense of a minor performance
penalty.

For testing purposes, add a temporary debugging message whenever a race
resulting in a switch from an established socket to a listener or vice
versa is detected.

Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
Signed-off-by: Firo Yang <firo.yang@suse.com>
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
net/ipv4/inet_diag.c | 2 ++
net/ipv4/inet_hashtables.c | 17 ++++++++++++++---
net/ipv4/tcp_ipv4.c | 13 +++++++++++++
3 files changed, 29 insertions(+), 3 deletions(-)
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -886,6 +886,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
+ if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+ break;
if (!net_eq(sock_net(sk), net))
continue;
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -213,12 +213,21 @@ struct sock *__inet_lookup_listener(struct net *net,
{
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore = 0, matches = 0, reuseport = 0;
+ int score, hiscore, matches, reuseport;
bool exact_dif = inet_exact_dif_match(net, skb);
- struct sock *sk, *result = NULL;
- u32 phash = 0;
+ struct sock *sk, *result;
+ u32 phash;
+
+begin:
+ hiscore = 0;
+ matches = 0;
+ reuseport = 0;
+ result = NULL;
+ phash = 0;
sk_for_each_rcu(sk, &ilb->head) {
+ if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+ goto begin;
score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
if (score > hiscore) {
reuseport = sk->sk_reuseport;
@@ -284,6 +293,8 @@ struct sock *__inet_lookup_established(struct net *net,
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (unlikely(!node))
+ goto begin;
if (sk->sk_hash != hash)
continue;
if (likely(INET_MATCH(sk, net, acookie,
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1924,13 +1924,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->offset;
sk = sk_next(sk);
+ if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+ goto next;
get_sk:
sk_for_each_from(sk) {
+ if (unlikely(is_a_nulls(&sk->sk_nulls_node)))
+ break;
if (!net_eq(sock_net(sk), net))
continue;
if (sk->sk_family == st->family)
return sk;
}
+next:
spin_unlock(&ilb->lock);
st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE)
@@ -1980,7 +1985,10 @@ static void *established_get_first(struct seq_file *seq)
continue;
spin_lock_bh(lock);
+retry:
sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+ if (unlikely(!node))
+ goto retry;
if (sk->sk_family != st->family ||
!net_eq(sock_net(sk), net)) {
continue;
@@ -2005,12 +2013,17 @@ static void *established_get_next(struct seq_file *seq, void *cur)
++st->offset;
sk = sk_nulls_next(sk);
+ if (!&sk->sk_node)
+ goto next;
sk_nulls_for_each_from(sk, node) {
+ if (unlikely(!node))
+ break;
if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
return sk;
}
+next:
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
++st->bucket;
return established_get_first(seq);