From 32c37dd0911741fd94b073dc78f1bfbd31e647f5 Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Aug 03 2018 05:43:33 +0000
Subject: Merge branch 'users/mkubecek/SLE15/1102340' into SLE15_EMBARGO

Pull net security fixes from Michal Kubecek

---

diff --git a/patches.fixes/net-add-rb_to_skb-and-other-rb-tree-helpers.patch b/patches.fixes/net-add-rb_to_skb-and-other-rb-tree-helpers.patch
new file mode 100644
index 0000000..d7eebc6
--- /dev/null
+++ b/patches.fixes/net-add-rb_to_skb-and-other-rb-tree-helpers.patch
@@ -0,0 +1,256 @@
+From: Eric Dumazet
+Date: Thu, 5 Oct 2017 22:21:21 -0700
+Subject: net: add rb_to_skb() and other rb tree helpers
+Patch-mainline: v4.15-rc1
+Git-commit: 18a4c0eab2623cc95be98a1e6af1ad18e7695977
+References: bsc#1102340
+
+Generalize the private netem_rb_to_skb() helper.
+
+TCP rtx queue will soon be converted to rb-tree,
+so we will need skb_rbtree_walk() helpers.
+
+Signed-off-by: Eric Dumazet
+Signed-off-by: David S. Miller
+Acked-by: Michal Kubecek
+
+---
+ include/linux/skbuff.h | 18 ++++++++++++++++++
+ net/ipv4/tcp_fastopen.c | 8 +++-----
+ net/ipv4/tcp_input.c | 33 ++++++++++++---------------------
+ net/sched/sch_netem.c | 14 ++++----------
+ 4 files changed, 37 insertions(+), 36 deletions(-)
+
+diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
+index 4c68aaae5a1a..b33582631782 100644
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -3016,6 +3016,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
+ 	return __skb_grow(skb, len);
+ }
+
++#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
++#define skb_rb_first(root) rb_to_skb(rb_first(root))
++#define skb_rb_last(root) rb_to_skb(rb_last(root))
++#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode))
++#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode))
++
+ #define skb_queue_walk(queue, skb) \
+ 	for (skb = (queue)->next; \
+ 	     skb != (struct sk_buff *)(queue); \
+@@ -3030,6 +3036,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
+ 	for (; skb != (struct sk_buff *)(queue); \
+ 	     skb = skb->next)
+
++#define skb_rbtree_walk(skb, root) \
++	for (skb = skb_rb_first(root); skb != NULL; \
++	     skb = skb_rb_next(skb))
++
++#define skb_rbtree_walk_from(skb) \
++	for (; skb != NULL; \
++	     skb = skb_rb_next(skb))
++
++#define skb_rbtree_walk_from_safe(skb, tmp) \
++	for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \
++	     skb = tmp)
++
+ #define skb_queue_walk_from_safe(queue, skb, tmp) \
+ 	for (tmp = skb->next; \
+ 	     skb != (struct sk_buff *)(queue); \
+diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
+index 4af82b914dd4..4c9945cf6b10 100644
+--- a/net/ipv4/tcp_fastopen.c
++++ b/net/ipv4/tcp_fastopen.c
+@@ -458,17 +458,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
+ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+-	struct rb_node *p;
+-	struct sk_buff *skb;
+ 	struct dst_entry *dst;
++	struct sk_buff *skb;
+
+ 	if (!tp->syn_fastopen)
+ 		return;
+
+ 	if (!tp->data_segs_in) {
+-		p = rb_first(&tp->out_of_order_queue);
+-		if (p && !rb_next(p)) {
+-			skb = rb_entry(p, struct sk_buff, rbnode);
++		skb = skb_rb_first(&tp->out_of_order_queue);
++		if (skb && !skb_rb_next(skb)) {
+ 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ 				tcp_fastopen_active_disable(sk);
+ 				return;
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 9800228d1169..7349eb7cee93 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4319,7 +4319,7 @@ static void tcp_ofo_queue(struct sock *sk)
+
+ 	p = rb_first(&tp->out_of_order_queue);
+ 	while (p) {
+-		skb = rb_entry(p, struct sk_buff, rbnode);
++		skb = rb_to_skb(p);
+ 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ 			break;
+
+@@ -4383,7 +4383,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+-	struct rb_node **p, *q, *parent;
++	struct rb_node **p, *parent;
+ 	struct sk_buff *skb1;
+ 	u32 seq, end_seq;
+ 	bool fragstolen;
+@@ -4441,7 +4441,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 	parent = NULL;
+ 	while (*p) {
+ 		parent = *p;
+-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
++		skb1 = rb_to_skb(parent);
+ 		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+ 			p = &parent->rb_left;
+ 			continue;
+@@ -4485,9 +4485,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+
+ merge_right:
+ 	/* Remove other segments covered by skb. */
+-	while ((q = rb_next(&skb->rbnode)) != NULL) {
+-		skb1 = rb_entry(q, struct sk_buff, rbnode);
+-
++	while ((skb1 = skb_rb_next(skb)) != NULL) {
+ 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ 			break;
+ 		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+@@ -4502,7 +4500,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 		tcp_drop(sk, skb1);
+ 	}
+ 	/* If there is no skb after us, we are the last_skb ! */
+-	if (!q)
++	if (!skb1)
+ 		tp->ooo_last_skb = skb;
+
+ add_sack:
+@@ -4687,7 +4685,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
+ 	if (list)
+ 		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+-	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
++	return skb_rb_next(skb);
+ }
+
+ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+@@ -4716,7 +4714,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+
+ 	while (*p) {
+ 		parent = *p;
+-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
++		skb1 = rb_to_skb(parent);
+ 		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ 			p = &parent->rb_left;
+ 		else
+@@ -4835,26 +4833,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	struct sk_buff *skb, *head;
+-	struct rb_node *p;
+ 	u32 start, end;
+
+-	p = rb_first(&tp->out_of_order_queue);
+-	skb = rb_entry_safe(p, struct sk_buff, rbnode);
++	skb = skb_rb_first(&tp->out_of_order_queue);
+ new_range:
+ 	if (!skb) {
+-		p = rb_last(&tp->out_of_order_queue);
+-		/* Note: This is possible p is NULL here. We do not
+-		 * use rb_entry_safe(), as ooo_last_skb is valid only
+-		 * if rbtree is not empty.
+-		 */
+-		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
++		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
+ 		return;
+ 	}
+ 	start = TCP_SKB_CB(skb)->seq;
+ 	end = TCP_SKB_CB(skb)->end_seq;
+
+ 	for (head = skb;;) {
+-		skb = tcp_skb_next(skb, NULL);
++		skb = skb_rb_next(skb);
+
+ 		/* Range is terminated when we see a gap or when
+ 		 * we are at the queue end.
+@@ -4897,14 +4888,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
+ 	do {
+ 		prev = rb_prev(node);
+ 		rb_erase(node, &tp->out_of_order_queue);
+-		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
++		tcp_drop(sk, rb_to_skb(node));
+ 		sk_mem_reclaim(sk);
+ 		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ 		    !tcp_under_memory_pressure(sk))
+ 			break;
+ 		node = prev;
+ 	} while (node);
+-	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
++	tp->ooo_last_skb = rb_to_skb(prev);
+
+ 	/* Reset SACK state. A conforming SACK implementation will
+ 	 * do the same at a timeout based retransmit. When a connection
+diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
+index 4fbb0c82996f..bce14d8cc674 100644
+--- a/net/sched/sch_netem.c
++++ b/net/sched/sch_netem.c
+@@ -149,12 +149,6 @@ struct netem_skb_cb {
+ 	ktime_t tstamp_save;
+ };
+
+-
+-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+-{
+-	return rb_entry(rb, struct sk_buff, rbnode);
+-}
+-
+ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
+ {
+ 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
+@@ -365,7 +359,7 @@ static void tfifo_reset(struct Qdisc *sch)
+ 	struct rb_node *p;
+
+ 	while ((p = rb_first(&q->t_root))) {
+-		struct sk_buff *skb = netem_rb_to_skb(p);
++		struct sk_buff *skb = rb_to_skb(p);
+
+ 		rb_erase(p, &q->t_root);
+ 		rtnl_kfree_skbs(skb, skb);
+@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+ 		struct sk_buff *skb;
+
+ 		parent = *p;
+-		skb = netem_rb_to_skb(parent);
++		skb = rb_to_skb(parent);
+ 		if (tnext >= netem_skb_cb(skb)->time_to_send)
+ 			p = &parent->rb_right;
+ 		else
+@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ 			struct sk_buff *t_skb;
+ 			struct netem_skb_cb *t_last;
+
+-			t_skb = netem_rb_to_skb(rb_last(&q->t_root));
++			t_skb = skb_rb_last(&q->t_root);
+ 			t_last = netem_skb_cb(t_skb);
+ 			if (!last ||
+ 			    t_last->time_to_send > last->time_to_send) {
+@@ -618,7 +612,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+ 	if (p) {
+ 		psched_time_t time_to_send;
+
+-		skb = netem_rb_to_skb(p);
++		skb = rb_to_skb(p);
+
+ 		/* if more time remaining? */
+ 		time_to_send = netem_skb_cb(skb)->time_to_send;
+--
+2.18.0
+
diff --git a/patches.fixes/tcp-add-tcp_ooo_try_coalesce-helper.patch b/patches.fixes/tcp-add-tcp_ooo_try_coalesce-helper.patch
new file mode 100644
index 0000000..892f80a
--- /dev/null
+++ b/patches.fixes/tcp-add-tcp_ooo_try_coalesce-helper.patch
@@ -0,0 +1,76 @@
+From: Eric Dumazet
+Date: Mon, 23 Jul 2018 09:28:21 -0700
+Subject: tcp: add tcp_ooo_try_coalesce() helper
+Patch-mainline: v4.18-rc7
+Git-commit: 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c
+References: CVE-2018-5390 bsc#1102340
+
+In case an skb in out_of_order_queue is the result of
+multiple skbs coalescing, we would like to get a proper gso_segs
+counter tracking, so that future tcp_drop() can report an accurate
+number.
+
+I chose not to implement this tracking for skbs in the receive queue,
+since they are not dropped unless the socket is disconnected.
+
+Signed-off-by: Eric Dumazet
+Acked-by: Soheil Hassas Yeganeh
+Acked-by: Yuchung Cheng
+Signed-off-by: David S. Miller
+Acked-by: Michal Kubecek
+
+---
+ net/ipv4/tcp_input.c | 23 +++++++++++++++++++++--
+ 1 file changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index ff05616420b7..85881e5e5d8e 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4300,6 +4300,23 @@ static bool tcp_try_coalesce(struct sock *sk,
+ 	return true;
+ }
+
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++			     struct sk_buff *to,
++			     struct sk_buff *from,
++			     bool *fragstolen)
++{
++	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++	/* In case tcp_drop() is called later, update to->gso_segs */
++	if (res) {
++		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++	}
++	return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ 	sk_drops_add(sk, skb);
+@@ -4423,7 +4440,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 	/* In the typical case, we are adding an skb to the end of the list.
+ 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ 	 */
+-	if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
++	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++				 skb, &fragstolen)) {
+ coalesce_done:
+ 		tcp_grow_window(sk, skb);
+ 		kfree_skb_partial(skb, fragstolen);
+@@ -4473,7 +4491,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+-	} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
++	} else if (tcp_ooo_try_coalesce(sk, skb1,
++					skb, &fragstolen)) {
+ 		goto coalesce_done;
+ 	}
+ 	p = &parent->rb_right;
+--
+2.18.0
+
diff --git a/patches.fixes/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch b/patches.fixes/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
new file mode 100644
index 0000000..f7b5cca
--- /dev/null
+++ b/patches.fixes/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
@@ -0,0 +1,50 @@
+From: Eric Dumazet
+Date: Mon, 23 Jul 2018 09:28:18 -0700
+Subject: tcp: avoid collapses in tcp_prune_queue() if possible
+Patch-mainline: v4.18-rc7
+Git-commit: f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7
+References: CVE-2018-5390 bsc#1102340
+
+Right after a TCP flow is created, receiving tiny out of order
+packets always hits the condition:
+
+if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	tcp_clamp_window(sk);
+
+tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
+(guarded by tcp_rmem[2]).
+
+Calling tcp_collapse_ofo_queue() in this case is not useful,
+and offers an O(N^2) attack surface to malicious peers.
+
+Better not attempt anything before full queue capacity is reached,
+forcing the attacker to spend lots of resources and allowing us to
+more easily detect the abuse.
+
+Signed-off-by: Eric Dumazet
+Acked-by: Soheil Hassas Yeganeh
+Acked-by: Yuchung Cheng
+Signed-off-by: David S. Miller
+Acked-by: Michal Kubecek
+
+---
+ net/ipv4/tcp_input.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index d68bc1ddd0c8..0fb37b9b1b37 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4934,6 +4934,9 @@ static int tcp_prune_queue(struct sock *sk)
+ 	else if (tcp_under_memory_pressure(sk))
+ 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
++	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
++		return 0;
++
+ 	tcp_collapse_ofo_queue(sk);
+ 	if (!skb_queue_empty(&sk->sk_receive_queue))
+ 		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
+--
+2.18.0
+
diff --git a/patches.fixes/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch b/patches.fixes/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
new file mode 100644
index 0000000..f1cec39
--- /dev/null
+++ b/patches.fixes/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
@@ -0,0 +1,46 @@
+From: Eric Dumazet
+Date: Mon, 23 Jul 2018 09:28:20 -0700
+Subject: tcp: call tcp_drop() from tcp_data_queue_ofo()
+Patch-mainline: v4.18-rc7
+Git-commit: 8541b21e781a22dce52a74fef0b9bed00404a1cd
+References: CVE-2018-5390 bsc#1102340
+
+In order to be able to give better diagnostics and detect
+malicious traffic, we need to have better sk->sk_drops tracking.
+
+Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+Signed-off-by: Eric Dumazet
+Acked-by: Soheil Hassas Yeganeh
+Acked-by: Yuchung Cheng
+Signed-off-by: David S. Miller
+Acked-by: Michal Kubecek
+
+---
+ net/ipv4/tcp_input.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 7c39961279c3..ff05616420b7 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4451,7 +4451,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 		/* All the bits are present. Drop. */
+ 		NET_INC_STATS(sock_net(sk),
+ 			      LINUX_MIB_TCPOFOMERGE);
+-		__kfree_skb(skb);
++		tcp_drop(sk, skb);
+ 		skb = NULL;
+ 		tcp_dsack_set(sk, seq, end_seq);
+ 		goto add_sack;
+@@ -4470,7 +4470,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			TCP_SKB_CB(skb1)->end_seq);
+ 		NET_INC_STATS(sock_net(sk),
+ 			      LINUX_MIB_TCPOFOMERGE);
+-		__kfree_skb(skb1);
++		tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+ 	} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+--
+2.18.0
+
diff --git a/patches.fixes/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch b/patches.fixes/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch
new file mode 100644
index 0000000..52839d7
--- /dev/null
+++ b/patches.fixes/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch
@@ -0,0 +1,76 @@
+From: Eric Dumazet
+Date: Mon, 23 Jul 2018 09:28:19 -0700
+Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+Patch-mainline: v4.18-rc7
+Git-commit: 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf
+References: CVE-2018-5390 bsc#1102340
+
+In case an attacker feeds tiny packets completely out of order,
+tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+expensive copies, but not changing socket memory usage at all.
+
+1) Do not attempt to collapse tiny skbs.
+2) Add logic to exit early when too many tiny skbs are detected.
+
+We prefer not doing aggressive collapsing (which copies packets)
+for pathological flows, and revert to tcp_prune_ofo_queue() which
+will be less expensive.
+
+In the future, we might add the possibility of terminating flows
+that are proven to be malicious.
+
+Signed-off-by: Eric Dumazet
+Acked-by: Soheil Hassas Yeganeh
+Signed-off-by: David S. Miller
+Acked-by: Michal Kubecek
+
+---
+ net/ipv4/tcp_input.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 0fb37b9b1b37..7c39961279c3 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4832,6 +4832,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
++	u32 range_truesize, sum_tiny = 0;
+ 	struct sk_buff *skb, *head;
+ 	u32 start, end;
+
+@@ -4843,6 +4844,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ 	}
+ 	start = TCP_SKB_CB(skb)->seq;
+ 	end = TCP_SKB_CB(skb)->end_seq;
++	range_truesize = skb->truesize;
+
+ 	for (head = skb;;) {
+ 		skb = skb_rb_next(skb);
+@@ -4853,11 +4855,20 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ 		if (!skb ||
+ 		    after(TCP_SKB_CB(skb)->seq, end) ||
+ 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
+-			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+-				     head, skb, start, end);
++			/* Do not attempt collapsing tiny skbs */
++			if (range_truesize != head->truesize ||
++			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
++				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
++					     head, skb, start, end);
++			} else {
++				sum_tiny += range_truesize;
++				if (sum_tiny > sk->sk_rcvbuf >> 3)
++					return;
++			}
+ 			goto new_range;
+ 		}
+
++		range_truesize += skb->truesize;
+ 		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+ 			start = TCP_SKB_CB(skb)->seq;
+ 		if (after(TCP_SKB_CB(skb)->end_seq, end))
+--
+2.18.0
+
diff --git a/patches.fixes/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch b/patches.fixes/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
new file mode 100644
index 0000000..27e8912
--- /dev/null
+++ b/patches.fixes/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
@@ -0,0 +1,80 @@
+From: Eric Dumazet
+Date: Mon, 23 Jul 2018 09:28:17 -0700
+Subject: tcp: free batches of packets in tcp_prune_ofo_queue()
+Patch-mainline: v4.18-rc7
+Git-commit: 72cd43ba64fc172a443410ce01645895850844c8
+References: CVE-2018-5390 bsc#1102340
+
+Juha-Matti Tilli reported that malicious peers could inject tiny
+packets in out_of_order_queue, forcing very expensive calls
+to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
+every incoming packet. out_of_order_queue rb-tree can contain
+thousands of nodes; iterating over all of them is not nice.
+
+Before linux-4.9, we would have pruned all packets in ofo_queue
+in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
+truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.
+
+Since we plan to increase tcp_rmem[2] in the future to cope with
+modern BDP, we cannot revert to the old behavior without great pain.
+
+Strategy taken in this patch is to purge ~12.5 % of the queue capacity.
+
+Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+Signed-off-by: Eric Dumazet
+Reported-by: Juha-Matti Tilli
+Acked-by: Yuchung Cheng
+Acked-by: Soheil Hassas Yeganeh
+Signed-off-by: David S. Miller
+Acked-by: Michal Kubecek
+
+---
+ net/ipv4/tcp_input.c | 15 +++++++++++----
+ 1 file changed, 11 insertions(+), 4 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 7349eb7cee93..d68bc1ddd0c8 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4872,6 +4872,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+  * 2) not add too big latencies if thousands of packets sit there.
+  *    (But if application shrinks SO_RCVBUF, we could still end up
+  *    freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+  *
+  * Return true if queue has shrunk.
+  */
+@@ -4879,20 +4880,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	struct rb_node *node, *prev;
++	int goal;
+
+ 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ 		return false;
+
+ 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++	goal = sk->sk_rcvbuf >> 3;
+ 	node = &tp->ooo_last_skb->rbnode;
+ 	do {
+ 		prev = rb_prev(node);
+ 		rb_erase(node, &tp->out_of_order_queue);
++		goal -= rb_to_skb(node)->truesize;
+ 		tcp_drop(sk, rb_to_skb(node));
+-		sk_mem_reclaim(sk);
+-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+-		    !tcp_under_memory_pressure(sk))
+-			break;
++		if (!prev || goal <= 0) {
++			sk_mem_reclaim(sk);
++			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++			    !tcp_under_memory_pressure(sk))
++				break;
++			goal = sk->sk_rcvbuf >> 3;
++		}
+ 		node = prev;
+ 	} while (node);
+ 	tp->ooo_last_skb = rb_to_skb(prev);
+--
+2.18.0
+
diff --git a/series.conf b/series.conf
index 088f9e8..ff02ee6 100644
--- a/series.conf
+++ b/series.conf
@@ -8447,6 +8447,7 @@
 	patches.drivers/i40evf-enable-support-for-VF-VLAN-tag-stripping-cont.patch
 	patches.drivers/i40e-ignore-skb-xmit_more-when-deciding-to-set-RS-bi.patch
 	patches.drivers/i40e-i40evf-organize-and-re-number-feature-flags.patch
+	patches.fixes/net-add-rb_to_skb-and-other-rb-tree-helpers.patch
 	patches.drivers/bnxt_en-don-t-consider-building-bnxt_tc.o-if-option-.patch
 	patches.suse/msft-hv-1477-hv_netvsc-Change-the-hash-level-variable-to-bit-flag.patch
 	patches.suse/msft-hv-1478-hv_netvsc-Add-ethtool-handler-to-set-and-get-TCP-has.patch
@@ -15243,6 +15244,11 @@
 	patches.drivers/can-xilinx_can-fix-RX-overflow-interrupt-not-being-e
 	patches.drivers/can-xilinx_can-fix-incorrect-clear-of-non-processed-
 	patches.drivers/can-xilinx_can-fix-power-management-handling
+	patches.fixes/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
+	patches.fixes/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
+	patches.fixes/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch
+	patches.fixes/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
+	patches.fixes/tcp-add-tcp_ooo_try_coalesce-helper.patch
 	patches.drivers/net-mlx4_core-Save-the-qpn-from-the-input-modifier-i.patch
 	patches.drivers/qmi_wwan-fix-interface-number-for-DW5821e-production
 	patches.drivers/driver-core-Partially-revert-driver-core-correct-dev
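
An illustrative note on the helpers added by the first patch above (a minimal
sketch, not part of the merged series; the example_* names are hypothetical):
skb_rbtree_walk() visits an skb rb-tree in order, and
skb_rbtree_walk_from_safe() caches the successor before each loop body runs,
so the current skb may be erased mid-walk.

#include <linux/rbtree.h>
#include <linux/skbuff.h>

/* Hypothetical: sum the truesize of every skb queued in an rb-tree,
 * the same accounting pattern tcp_collapse_ofo_queue() uses per range.
 */
static unsigned int example_rbtree_truesize(struct rb_root *root)
{
	struct sk_buff *skb;
	unsigned int sum = 0;

	skb_rbtree_walk(skb, root)	/* skb_rb_first() .. skb_rb_next() */
		sum += skb->truesize;
	return sum;
}

/* Hypothetical: free skbs until roughly "goal" bytes of truesize are
 * released. The _safe variant computes the next node before the body
 * runs, so rb_erase() plus kfree_skb() on the current skb is legal.
 */
static void example_rbtree_prune(struct rb_root *root, int goal)
{
	struct sk_buff *skb = skb_rb_first(root);
	struct sk_buff *tmp;

	skb_rbtree_walk_from_safe(skb, tmp) {
		if (goal <= 0)
			break;
		goal -= skb->truesize;
		rb_erase(&skb->rbnode, root);
		kfree_skb(skb);
	}
}

tcp_prune_ofo_queue() achieves the same batched effect by walking backwards
with rb_prev() from ooo_last_skb instead, which lets it update ooo_last_skb
when it stops.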