diff --git a/blacklist.conf b/blacklist.conf index e2a3e6f..a10c53d 100644 --- a/blacklist.conf +++ b/blacklist.conf @@ -218,4 +218,5 @@ cd36c3a21a400cac9c457394b9adf94e0027c136 # duplicate to 33ba43ed0afc f6c4fd506cb626e4346aa81688f255e593a7c5a0 # we don't do CONFIG_X86_32 f2078904810373211fb15f91888fba14c01a4acc # we don't do 5level page tables a237f762681e2a394ca67f21df2feb2b76a3609b # Documentation +040ee69226f8a96b7943645d68f41d5d44b5ff7d # too intrusive prerequisites diff --git a/patches.drivers/net-mlx5e-Keep-updating-ethtool-statistics-when-the-.patch b/patches.drivers/net-mlx5e-Keep-updating-ethtool-statistics-when-the-.patch new file mode 100644 index 0000000..794f14b --- /dev/null +++ b/patches.drivers/net-mlx5e-Keep-updating-ethtool-statistics-when-the-.patch @@ -0,0 +1,35 @@ +From: Gal Pressman +Date: Tue, 26 Dec 2017 13:44:49 +0200 +Subject: net/mlx5e: Keep updating ethtool statistics when the interface is + down +Patch-mainline: v4.15-rc9 +Git-commit: e556f6dd47eda62cbb046fa92e03265245a1537f +References: bsc#1046303 FATE#322944 + +ethtool statistics should be updated even when the interface is down +since it shows more than just netdev counters, which might change while +the logical link is down. +One useful use case, for example, is when running RoCE traffic over the +interface (while the logical link is down, but physical link is up) and +examining rx_prioX_bytes. 
+ +Fixes: f62b8bb8f2d3 ("net/mlx5: Extend mlx5_core to support ConnectX-4 Ethernet functionality") +Signed-off-by: Gal Pressman +Signed-off-by: Saeed Mahameed +Acked-by: Thomas Bogendoerfer +--- + drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +@@ -349,8 +349,7 @@ void mlx5e_ethtool_get_ethtool_stats(str + return; + + mutex_lock(&priv->state_lock); +- if (test_bit(MLX5E_STATE_OPENED, &priv->state)) +- mlx5e_update_stats(priv, true); ++ mlx5e_update_stats(priv, true); + channels = &priv->channels; + mutex_unlock(&priv->state_lock); + diff --git a/patches.fixes/IPv4-early-demux-can-return-an-error-code.patch b/patches.fixes/IPv4-early-demux-can-return-an-error-code.patch new file mode 100644 index 0000000..088957a --- /dev/null +++ b/patches.fixes/IPv4-early-demux-can-return-an-error-code.patch @@ -0,0 +1,225 @@ +From: Paolo Abeni +Date: Thu, 28 Sep 2017 15:51:36 +0200 +Subject: IPv4: early demux can return an error code +Patch-mainline: v4.14-rc4 +Git-commit: 7487449c86c65202b3b725c4524cb48dd65e4e6f +References: bsc#1076830 + +Currently no error is emitted, but this infrastructure will +used by the next patch to allow source address validation +for mcast sockets. +Since early demux can do a route lookup and an ipv4 route +lookup can return an error code this is consistent with the +current ipv4 route infrastructure. + +Signed-off-by: Paolo Abeni +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + include/net/protocol.h | 4 ++-- + include/net/tcp.h | 2 +- + include/net/udp.h | 2 +- + net/ipv4/ip_input.c | 25 +++++++++++++++---------- + net/ipv4/tcp_ipv4.c | 9 +++++---- + net/ipv4/udp.c | 11 ++++++----- + 6 files changed, 30 insertions(+), 23 deletions(-) + +diff --git a/include/net/protocol.h b/include/net/protocol.h +index 65ba335b0e7e..4fc75f7ae23b 100644 +--- a/include/net/protocol.h ++++ b/include/net/protocol.h +@@ -39,8 +39,8 @@ + + /* This is used to register protocols. */ + struct net_protocol { +- void (*early_demux)(struct sk_buff *skb); +- void (*early_demux_handler)(struct sk_buff *skb); ++ int (*early_demux)(struct sk_buff *skb); ++ int (*early_demux_handler)(struct sk_buff *skb); + int (*handler)(struct sk_buff *skb); + void (*err_handler)(struct sk_buff *skb, u32 info); + unsigned int no_policy:1, +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 82e9251323cc..9c0515f33ae9 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32); + + void tcp_shutdown(struct sock *sk, int how); + +-void tcp_v4_early_demux(struct sk_buff *skb); ++int tcp_v4_early_demux(struct sk_buff *skb); + int tcp_v4_rcv(struct sk_buff *skb); + + int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); +diff --git a/include/net/udp.h b/include/net/udp.h +index f7eb9ecee482..7e12b8588f4c 100644 +--- a/include/net/udp.h ++++ b/include/net/udp.h +@@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, + return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); + } + +-void udp_v4_early_demux(struct sk_buff *skb); ++int udp_v4_early_demux(struct sk_buff *skb); + bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); + int udp_get_port(struct sock *sk, unsigned short snum, + int (*saddr_cmp)(const struct sock *, +diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c +index fa2dc8f692c6..57fc13c6ab2b 100644 
+--- a/net/ipv4/ip_input.c ++++ b/net/ipv4/ip_input.c +@@ -311,9 +311,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb) + static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) + { + const struct iphdr *iph = ip_hdr(skb); +- struct rtable *rt; ++ int (*edemux)(struct sk_buff *skb); + struct net_device *dev = skb->dev; +- void (*edemux)(struct sk_buff *skb); ++ struct rtable *rt; ++ int err; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing +@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) + + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { +- edemux(skb); ++ err = edemux(skb); ++ if (unlikely(err)) ++ goto drop_error; + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } +@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) + * how the packet travels inside Linux networking. 
+ */ + if (!skb_valid_dst(skb)) { +- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, +- iph->tos, dev); +- if (unlikely(err)) { +- if (err == -EXDEV) +- __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); +- goto drop; +- } ++ err = ip_route_input_noref(skb, iph->daddr, iph->saddr, ++ iph->tos, dev); ++ if (unlikely(err)) ++ goto drop_error; + } + + #ifdef CONFIG_IP_ROUTE_CLASSID +@@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) + drop: + kfree_skb(skb); + return NET_RX_DROP; ++ ++drop_error: ++ if (err == -EXDEV) ++ __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); ++ goto drop; + } + + /* +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index c9845859fe94..51e440c54944 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1448,23 +1448,23 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + } + EXPORT_SYMBOL(tcp_v4_do_rcv); + +-void tcp_v4_early_demux(struct sk_buff *skb) ++int tcp_v4_early_demux(struct sk_buff *skb) + { + const struct iphdr *iph; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) +- return; ++ return 0; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) +- return; ++ return 0; + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) +- return; ++ return 0; + + sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + iph->saddr, th->source, +@@ -1483,6 +1483,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) + skb_dst_set_noref(skb, dst); + } + } ++ return 0; + } + + bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 018ef826f37b..25c108218b54 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2158,7 +2158,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, + return NULL; + } + +-void udp_v4_early_demux(struct sk_buff *skb) ++int udp_v4_early_demux(struct sk_buff *skb) + { + struct net 
*net = dev_net(skb->dev); + const struct iphdr *iph; +@@ -2170,7 +2170,7 @@ void udp_v4_early_demux(struct sk_buff *skb) + + /* validate the packet */ + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) +- return; ++ return 0; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); +@@ -2180,14 +2180,14 @@ void udp_v4_early_demux(struct sk_buff *skb) + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + if (!in_dev) +- return; ++ return 0; + + /* we are supposed to accept bcast packets */ + if (skb->pkt_type == PACKET_MULTICAST) { + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) +- return; ++ return 0; + } + + sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, +@@ -2198,7 +2198,7 @@ void udp_v4_early_demux(struct sk_buff *skb) + } + + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) +- return; ++ return 0; + + skb->sk = sk; + skb->destructor = sock_efree; +@@ -2213,6 +2213,7 @@ void udp_v4_early_demux(struct sk_buff *skb) + */ + skb_dst_set_noref(skb, dst); + } ++ return 0; + } + + int udp_rcv(struct sk_buff *skb) +-- +2.16.0 + diff --git a/patches.fixes/KEYS-Fix-race-between-updating-and-finding-a-negativ.patch b/patches.fixes/KEYS-Fix-race-between-updating-and-finding-a-negativ.patch new file mode 100644 index 0000000..6a5b768 --- /dev/null +++ b/patches.fixes/KEYS-Fix-race-between-updating-and-finding-a-negativ.patch @@ -0,0 +1,496 @@ +From 363b02dab09b3226f3bd1420dad9c72b79a42a76 Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Wed, 4 Oct 2017 16:43:25 +0100 +Subject: [PATCH] KEYS: Fix race between updating and finding a negative key + +References: CVE-2017-15951, bsc#1065615, bsc#1071927 +Patch-mainline: v4.14-rc6 +Git-commit: 363b02dab09b3226f3bd1420dad9c72b79a42a76 + +Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection +error into one field such that: + + (1) The instantiation state can be modified/read atomically. 
+ + (2) The error can be accessed atomically with the state. + + (3) The error isn't stored unioned with the payload pointers. + +This deals with the problem that the state is spread over three different +objects (two bits and a separate variable) and reading or updating them +atomically isn't practical, given that not only can uninstantiated keys +change into instantiated or rejected keys, but rejected keys can also turn +into instantiated keys - and someone accessing the key might not be using +any locking. + +The main side effect of this problem is that what was held in the payload +may change, depending on the state. For instance, you might observe the +key to be in the rejected state. You then read the cached error, but if +the key semaphore wasn't locked, the key might've become instantiated +between the two reads - and you might now have something in hand that isn't +actually an error code. + +The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative error +code if the key is negatively instantiated. The key_is_instantiated() +function is replaced with key_is_positive() to avoid confusion as negative +keys are also 'instantiated'. + +Additionally, barriering is included: + + (1) Order payload-set before state-set during instantiation. + + (2) Order state-read before payload-read when using the key. + +Further separate barriering is necessary if RCU is being used to access the +payload content after reading the payload pointers. 
+ +Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data") +Cc: stable@vger.kernel.org # v4.4+ +Reported-by: Eric Biggers +Signed-off-by: David Howells +Reviewed-by: Eric Biggers +Acked-by: Michal Suchanek +--- + include/linux/key.h | 45 ++++++++++++++++++++------------ + net/dns_resolver/dns_key.c | 2 +- + security/keys/big_key.c | 4 +-- + security/keys/encrypted-keys/encrypted.c | 2 +- + security/keys/gc.c | 8 +++--- + security/keys/key.c | 31 ++++++++++++++-------- + security/keys/keyctl.c | 9 +++---- + security/keys/keyring.c | 10 +++---- + security/keys/proc.c | 7 +++-- + security/keys/process_keys.c | 2 +- + security/keys/request_key.c | 7 +++-- + security/keys/request_key_auth.c | 2 +- + security/keys/trusted.c | 2 +- + security/keys/user_defined.c | 4 +-- + 14 files changed, 79 insertions(+), 56 deletions(-) + +diff --git a/include/linux/key.h b/include/linux/key.h +index 78e25aabedaf..7d9c3a8e0e4d 100644 +--- a/include/linux/key.h ++++ b/include/linux/key.h +@@ -138,6 +138,11 @@ struct key_restriction { + struct key_type *keytype; + }; + ++enum key_state { ++ KEY_IS_UNINSTANTIATED, ++ KEY_IS_POSITIVE, /* Positively instantiated */ ++}; ++ + /*****************************************************************************/ + /* + * authentication token / access credential / keyring +@@ -169,6 +174,7 @@ struct key { + * - may not match RCU dereferenced payload + * - payload should contain own length + */ ++ short state; /* Key state (+) or rejection error (-) */ + + #ifdef KEY_DEBUGGING + unsigned magic; +@@ -176,17 +182,15 @@ struct key { + #endif + + unsigned long flags; /* status flags (change with bitops) */ +-#define KEY_FLAG_INSTANTIATED 0 /* set if key has been instantiated */ +-#define KEY_FLAG_DEAD 1 /* set if key type has been deleted */ +-#define KEY_FLAG_REVOKED 2 /* set if key had been revoked */ +-#define KEY_FLAG_IN_QUOTA 3 /* set if key consumes quota */ +-#define KEY_FLAG_USER_CONSTRUCT 4 /* set if key is being 
constructed in userspace */ +-#define KEY_FLAG_NEGATIVE 5 /* set if key is negative */ +-#define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ +-#define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ +-#define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ +-#define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ +-#define KEY_FLAG_KEEP 10 /* set if key should not be removed */ ++#define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ ++#define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ ++#define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ ++#define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ ++#define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ ++#define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ ++#define KEY_FLAG_BUILTIN 6 /* set if key is built in to the kernel */ ++#define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ ++#define KEY_FLAG_KEEP 8 /* set if key should not be removed */ + + /* the key type and key description string + * - the desc is used to match a key against search criteria +@@ -212,7 +216,6 @@ struct key { + struct list_head name_link; + struct assoc_array keys; + }; +- int reject_error; + }; + + /* This is set on a keyring to restrict the addition of a link to a key +@@ -351,17 +354,27 @@ extern void key_set_timeout(struct key *, unsigned); + #define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ + #define KEY_NEED_ALL 0x3f /* All the above permissions */ + ++static inline short key_read_state(const struct key *key) ++{ ++ /* Barrier versus mark_key_instantiated(). 
*/ ++ return smp_load_acquire(&key->state); ++} ++ + /** +- * key_is_instantiated - Determine if a key has been positively instantiated ++ * key_is_positive - Determine if a key has been positively instantiated + * @key: The key to check. + * + * Return true if the specified key has been positively instantiated, false + * otherwise. + */ +-static inline bool key_is_instantiated(const struct key *key) ++static inline bool key_is_positive(const struct key *key) ++{ ++ return key_read_state(key) == KEY_IS_POSITIVE; ++} ++ ++static inline bool key_is_negative(const struct key *key) + { +- return test_bit(KEY_FLAG_INSTANTIATED, &key->flags) && +- !test_bit(KEY_FLAG_NEGATIVE, &key->flags); ++ return key_read_state(key) < 0; + } + + #define dereference_key_rcu(KEY) \ +diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c +index 8737412c7b27..e1d4d898a007 100644 +--- a/net/dns_resolver/dns_key.c ++++ b/net/dns_resolver/dns_key.c +@@ -224,7 +224,7 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data) + static void dns_resolver_describe(const struct key *key, struct seq_file *m) + { + seq_puts(m, key->description); +- if (key_is_instantiated(key)) { ++ if (key_is_positive(key)) { + int err = PTR_ERR(key->payload.data[dns_key_error]); + + if (err) +diff --git a/security/keys/big_key.c b/security/keys/big_key.c +index f3e71b44901c..b83703a624d8 100644 +--- a/security/keys/big_key.c ++++ b/security/keys/big_key.c +@@ -240,7 +240,7 @@ void big_key_revoke(struct key *key) + + /* clear the quota */ + key_payload_reserve(key, 0); +- if (key_is_instantiated(key) && ++ if (key_is_positive(key) && + (size_t)key->payload.data[big_key_len] > BIG_KEY_FILE_THRESHOLD) + vfs_truncate(path, 0); + } +@@ -272,7 +272,7 @@ void big_key_describe(const struct key *key, struct seq_file *m) + + seq_puts(m, key->description); + +- if (key_is_instantiated(key)) ++ if (key_is_positive(key)) + seq_printf(m, ": %zu [%s]", + datalen, + datalen > BIG_KEY_FILE_THRESHOLD 
? "file" : "buff"); +diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c +index bd64a3991e26..fea053611618 100644 +--- a/security/keys/encrypted-keys/encrypted.c ++++ b/security/keys/encrypted-keys/encrypted.c +@@ -854,7 +854,7 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep) + size_t datalen = prep->datalen; + int ret = 0; + +- if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) ++ if (key_is_negative(key)) + return -ENOKEY; + if (datalen <= 0 || datalen > 32767 || !prep->data) + return -EINVAL; +diff --git a/security/keys/gc.c b/security/keys/gc.c +index 87cb260e4890..f01d48cb3de1 100644 +--- a/security/keys/gc.c ++++ b/security/keys/gc.c +@@ -129,15 +129,15 @@ static noinline void key_gc_unused_keys(struct list_head *keys) + while (!list_empty(keys)) { + struct key *key = + list_entry(keys->next, struct key, graveyard_link); ++ short state = key->state; ++ + list_del(&key->graveyard_link); + + kdebug("- %u", key->serial); + key_check(key); + + /* Throw away the key data if the key is instantiated */ +- if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags) && +- !test_bit(KEY_FLAG_NEGATIVE, &key->flags) && +- key->type->destroy) ++ if (state == KEY_IS_POSITIVE && key->type->destroy) + key->type->destroy(key); + + security_key_free(key); +@@ -151,7 +151,7 @@ static noinline void key_gc_unused_keys(struct list_head *keys) + } + + atomic_dec(&key->user->nkeys); +- if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) ++ if (state != KEY_IS_UNINSTANTIATED) + atomic_dec(&key->user->nikeys); + + key_user_put(key->user); +diff --git a/security/keys/key.c b/security/keys/key.c +index bec11eefe04d..146584383b1d 100644 +--- a/security/keys/key.c ++++ b/security/keys/key.c +@@ -400,6 +400,18 @@ int key_payload_reserve(struct key *key, size_t datalen) + EXPORT_SYMBOL(key_payload_reserve); + + /* ++ * Change the key state to being instantiated. 
++ */ ++static void mark_key_instantiated(struct key *key, int reject_error) ++{ ++ /* Commit the payload before setting the state; barrier versus ++ * key_read_state(). ++ */ ++ smp_store_release(&key->state, ++ (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); ++} ++ ++/* + * Instantiate a key and link it into the target keyring atomically. Must be + * called with the target keyring's semaphore writelocked. The target key's + * semaphore need not be locked as instantiation is serialised by +@@ -422,14 +434,14 @@ static int __key_instantiate_and_link(struct key *key, + mutex_lock(&key_construction_mutex); + + /* can't instantiate twice */ +- if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { ++ if (key->state == KEY_IS_UNINSTANTIATED) { + /* instantiate the key */ + ret = key->type->instantiate(key, prep); + + if (ret == 0) { + /* mark the key as being instantiated */ + atomic_inc(&key->user->nikeys); +- set_bit(KEY_FLAG_INSTANTIATED, &key->flags); ++ mark_key_instantiated(key, 0); + + if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) + awaken = 1; +@@ -575,13 +587,10 @@ int key_reject_and_link(struct key *key, + mutex_lock(&key_construction_mutex); + + /* can't instantiate twice */ +- if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { ++ if (key->state == KEY_IS_UNINSTANTIATED) { + /* mark the key as being negatively instantiated */ + atomic_inc(&key->user->nikeys); +- key->reject_error = -error; +- smp_wmb(); +- set_bit(KEY_FLAG_NEGATIVE, &key->flags); +- set_bit(KEY_FLAG_INSTANTIATED, &key->flags); ++ mark_key_instantiated(key, -error); + now = current_kernel_time(); + key->expiry = now.tv_sec + timeout; + key_schedule_gc(key->expiry + key_gc_delay); +@@ -750,8 +759,8 @@ static inline key_ref_t __key_update(key_ref_t key_ref, + + ret = key->type->update(key, prep); + if (ret == 0) +- /* updating a negative key instantiates it */ +- clear_bit(KEY_FLAG_NEGATIVE, &key->flags); ++ /* Updating a negative key positively instantiates it */ ++ 
mark_key_instantiated(key, 0); + + up_write(&key->sem); + +@@ -994,8 +1003,8 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) + + ret = key->type->update(key, &prep); + if (ret == 0) +- /* updating a negative key instantiates it */ +- clear_bit(KEY_FLAG_NEGATIVE, &key->flags); ++ /* Updating a negative key positively instantiates it */ ++ mark_key_instantiated(key, 0); + + up_write(&key->sem); + +diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c +index 6a82090c7fc1..2eb624c0aefc 100644 +--- a/security/keys/keyctl.c ++++ b/security/keys/keyctl.c +@@ -766,10 +766,9 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) + + key = key_ref_to_ptr(key_ref); + +- if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { +- ret = -ENOKEY; +- goto error2; +- } ++ ret = key_read_state(key); ++ if (ret < 0) ++ goto error2; /* Negatively instantiated */ + + /* see if we can read it directly */ + ret = key_permission(key_ref, KEY_NEED_READ); +@@ -901,7 +900,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) + atomic_dec(&key->user->nkeys); + atomic_inc(&newowner->nkeys); + +- if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { ++ if (key->state != KEY_IS_UNINSTANTIATED) { + atomic_dec(&key->user->nikeys); + atomic_inc(&newowner->nikeys); + } +diff --git a/security/keys/keyring.c b/security/keys/keyring.c +index b9893fb68e4a..78e2c7b04013 100644 +--- a/security/keys/keyring.c ++++ b/security/keys/keyring.c +@@ -414,7 +414,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) + else + seq_puts(m, "[anon]"); + +- if (key_is_instantiated(keyring)) { ++ if (key_is_positive(keyring)) { + if (keyring->keys.nr_leaves_on_tree != 0) + seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); + else +@@ -552,7 +552,8 @@ static int keyring_search_iterator(const void *object, void *iterator_data) + { + struct keyring_search_context *ctx = iterator_data; + const struct key *key = 
keyring_ptr_to_key(object); +- unsigned long kflags = key->flags; ++ unsigned long kflags = READ_ONCE(key->flags); ++ short state = READ_ONCE(key->state); + + kenter("{%d}", key->serial); + +@@ -596,9 +597,8 @@ static int keyring_search_iterator(const void *object, void *iterator_data) + + if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { + /* we set a different error code if we pass a negative key */ +- if (kflags & (1 << KEY_FLAG_NEGATIVE)) { +- smp_rmb(); +- ctx->result = ERR_PTR(key->reject_error); ++ if (state < 0) { ++ ctx->result = ERR_PTR(state); + kleave(" = %d [neg]", ctx->skipped_ret); + goto skipped; + } +diff --git a/security/keys/proc.c b/security/keys/proc.c +index bf08d02b6646..e6aa1b257578 100644 +--- a/security/keys/proc.c ++++ b/security/keys/proc.c +@@ -182,6 +182,7 @@ static int proc_keys_show(struct seq_file *m, void *v) + unsigned long timo; + key_ref_t key_ref, skey_ref; + char xbuf[16]; ++ short state; + int rc; + + struct keyring_search_context ctx = { +@@ -240,17 +241,19 @@ static int proc_keys_show(struct seq_file *m, void *v) + sprintf(xbuf, "%luw", timo / (60*60*24*7)); + } + ++ state = key_read_state(key); ++ + #define showflag(KEY, LETTER, FLAG) \ + (test_bit(FLAG, &(KEY)->flags) ? LETTER : '-') + + seq_printf(m, "%08x %c%c%c%c%c%c%c %5d %4s %08x %5d %5d %-9.9s ", + key->serial, +- showflag(key, 'I', KEY_FLAG_INSTANTIATED), ++ state != KEY_IS_UNINSTANTIATED ? 'I' : '-', + showflag(key, 'R', KEY_FLAG_REVOKED), + showflag(key, 'D', KEY_FLAG_DEAD), + showflag(key, 'Q', KEY_FLAG_IN_QUOTA), + showflag(key, 'U', KEY_FLAG_USER_CONSTRUCT), +- showflag(key, 'N', KEY_FLAG_NEGATIVE), ++ state < 0 ? 
'N' : '-', + showflag(key, 'i', KEY_FLAG_INVALIDATED), + refcount_read(&key->usage), + xbuf, +diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c +index 86bced9fdbdf..241f57c59371 100644 +--- a/security/keys/process_keys.c ++++ b/security/keys/process_keys.c +@@ -728,7 +728,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, + + ret = -EIO; + if (!(lflags & KEY_LOOKUP_PARTIAL) && +- !test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) ++ key_read_state(key) == KEY_IS_UNINSTANTIATED) + goto invalid_key; + + /* check the permissions */ +diff --git a/security/keys/request_key.c b/security/keys/request_key.c +index 3d2272c06cae..5054e3196073 100644 +--- a/security/keys/request_key.c ++++ b/security/keys/request_key.c +@@ -623,10 +623,9 @@ int wait_for_key_construction(struct key *key, bool intr) + intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + if (ret) + return -ERESTARTSYS; +- if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { +- smp_rmb(); +- return key->reject_error; +- } ++ ret = key_read_state(key); ++ if (ret < 0) ++ return ret; + return key_validate(key); + } + EXPORT_SYMBOL(wait_for_key_construction); +diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c +index 0f062156dfb2..4e48ca3690a5 100644 +--- a/security/keys/request_key_auth.c ++++ b/security/keys/request_key_auth.c +@@ -73,7 +73,7 @@ static void request_key_auth_describe(const struct key *key, + + seq_puts(m, "key:"); + seq_puts(m, key->description); +- if (key_is_instantiated(key)) ++ if (key_is_positive(key)) + seq_printf(m, " pid:%d ci:%zu", rka->pid, rka->callout_len); + } + +diff --git a/security/keys/trusted.c b/security/keys/trusted.c +index 5a775f2fbe15..e0fcb17068f5 100644 +--- a/security/keys/trusted.c ++++ b/security/keys/trusted.c +@@ -1066,7 +1066,7 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep) + char *datablob; + int ret = 0; + +- if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) ++ if 
(key_is_negative(key)) + return -ENOKEY; + p = key->payload.data[0]; + if (!p->migratable) +diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c +index 3d8c68eba516..9f558bedba23 100644 +--- a/security/keys/user_defined.c ++++ b/security/keys/user_defined.c +@@ -114,7 +114,7 @@ int user_update(struct key *key, struct key_preparsed_payload *prep) + + /* attach the new data, displacing the old */ + key->expiry = prep->expiry; +- if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags)) ++ if (key_is_positive(key)) + zap = dereference_key_locked(key); + rcu_assign_keypointer(key, prep->payload.data[0]); + prep->payload.data[0] = NULL; +@@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(user_destroy); + void user_describe(const struct key *key, struct seq_file *m) + { + seq_puts(m, key->description); +- if (key_is_instantiated(key)) ++ if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); + } + +-- +2.13.6 + diff --git a/patches.fixes/KEYS-don-t-let-add_key-update-an-uninstantiated-key.patch b/patches.fixes/KEYS-don-t-let-add_key-update-an-uninstantiated-key.patch new file mode 100644 index 0000000..60754b7 --- /dev/null +++ b/patches.fixes/KEYS-don-t-let-add_key-update-an-uninstantiated-key.patch @@ -0,0 +1,125 @@ +From 60ff5b2f547af3828aebafd54daded44cfb0807a Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Thu, 12 Oct 2017 16:00:41 +0100 +Subject: [PATCH] KEYS: don't let add_key() update an uninstantiated key + +References: CVE-2017-15951, bsc#1065615, bsc#1071927 +Patch-mainline: v4.14-rc6 +Git-commit: 60ff5b2f547af3828aebafd54daded44cfb0807a + +Currently, when passed a key that already exists, add_key() will call the +key's ->update() method if such exists. But this is heavily broken in the +case where the key is uninstantiated because it doesn't call +__key_instantiate_and_link(). 
Consequently, it doesn't do most of the +things that are supposed to happen when the key is instantiated, such as +setting the instantiation state, clearing KEY_FLAG_USER_CONSTRUCT and +awakening tasks waiting on it, and incrementing key->user->nikeys. + +It also never takes key_construction_mutex, which means that +->instantiate() can run concurrently with ->update() on the same key. In +the case of the "user" and "logon" key types this causes a memory leak, at +best. Maybe even worse, the ->update() methods of the "encrypted" and +"trusted" key types actually just dereference a NULL pointer when passed an +uninstantiated key. + +Change key_create_or_update() to wait interruptibly for the key to finish +construction before continuing. + +This patch only affects *uninstantiated* keys. For now we still allow a +negatively instantiated key to be updated (thereby positively +instantiating it), although that's broken too (the next patch fixes it) +and I'm not sure that anyone actually uses that functionality either. 
+ +Here is a simple reproducer for the bug using the "encrypted" key type +(requires CONFIG_ENCRYPTED_KEYS=y), though as noted above the bug +pertained to more than just the "encrypted" key type: + + #include + #include + #include + + int main(void) + { + int ringid = keyctl_join_session_keyring(NULL); + + if (fork()) { + for (;;) { + const char payload[] = "update user:foo 32"; + + usleep(rand() % 10000); + add_key("encrypted", "desc", payload, sizeof(payload), ringid); + keyctl_clear(ringid); + } + } else { + for (;;) + request_key("encrypted", "desc", "callout_info", ringid); + } + } + +It causes: + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 + IP: encrypted_update+0xb0/0x170 + PGD 7a178067 P4D 7a178067 PUD 77269067 PMD 0 + PREEMPT SMP + CPU: 0 PID: 340 Comm: reproduce Tainted: G D 4.14.0-rc1-00025-g428490e38b2e #796 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 + task: ffff8a467a39a340 task.stack: ffffb15c40770000 + RIP: 0010:encrypted_update+0xb0/0x170 + RSP: 0018:ffffb15c40773de8 EFLAGS: 00010246 + RAX: 0000000000000000 RBX: ffff8a467a275b00 RCX: 0000000000000000 + RDX: 0000000000000005 RSI: ffff8a467a275b14 RDI: ffffffffb742f303 + RBP: ffffb15c40773e20 R08: 0000000000000000 R09: ffff8a467a275b17 + R10: 0000000000000020 R11: 0000000000000000 R12: 0000000000000000 + R13: 0000000000000000 R14: ffff8a4677057180 R15: ffff8a467a275b0f + FS: 00007f5d7fb08700(0000) GS:ffff8a467f200000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000018 CR3: 0000000077262005 CR4: 00000000001606f0 + Call Trace: + key_create_or_update+0x2bc/0x460 + SyS_add_key+0x10c/0x1d0 + entry_SYSCALL_64_fastpath+0x1f/0xbe + RIP: 0033:0x7f5d7f211259 + RSP: 002b:00007ffed03904c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000f8 + RAX: ffffffffffffffda RBX: 000000003b2a7955 RCX: 00007f5d7f211259 + RDX: 00000000004009e4 RSI: 00000000004009ff RDI: 0000000000400a04 + RBP: 0000000068db8bad R08: 
000000003b2a7955 R09: 0000000000000004 + R10: 000000000000001a R11: 0000000000000246 R12: 0000000000400868 + R13: 00007ffed03905d0 R14: 0000000000000000 R15: 0000000000000000 + Code: 77 28 e8 64 34 1f 00 45 31 c0 31 c9 48 8d 55 c8 48 89 df 48 8d 75 d0 e8 ff f9 ff ff 85 c0 41 89 c4 0f 88 84 00 00 00 4c 8b 7d c8 <49> 8b 75 18 4c 89 ff e8 24 f8 ff ff 85 c0 41 89 c4 78 6d 49 8b + RIP: encrypted_update+0xb0/0x170 RSP: ffffb15c40773de8 + CR2: 0000000000000018 + +Cc: # v2.6.12+ +Reported-by: Eric Biggers +Signed-off-by: David Howells +cc: Eric Biggers +Acked-by: Michal Suchanek +--- + security/keys/key.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/security/keys/key.c b/security/keys/key.c +index 9385e7cc710f..83bf4b4afd49 100644 +--- a/security/keys/key.c ++++ b/security/keys/key.c +@@ -945,6 +945,16 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, + */ + __key_link_end(keyring, &index_key, edit); + ++ key = key_ref_to_ptr(key_ref); ++ if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) { ++ ret = wait_for_key_construction(key, true); ++ if (ret < 0) { ++ key_ref_put(key_ref); ++ key_ref = ERR_PTR(ret); ++ goto error_free_prep; ++ } ++ } ++ + key_ref = __key_update(key_ref, &prep); + goto error_free_prep; + } +-- +2.13.6 + diff --git a/patches.fixes/dccp-do-not-use-tcp_time_stamp.patch b/patches.fixes/dccp-do-not-use-tcp_time_stamp.patch new file mode 100644 index 0000000..729d421 --- /dev/null +++ b/patches.fixes/dccp-do-not-use-tcp_time_stamp.patch @@ -0,0 +1,75 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:02 -0700 +Subject: dccp: do not use tcp_time_stamp +Patch-mainline: v4.13-rc1 +Git-commit: d011b9a448907833a19b2f0a34381419f8ca9b23 +References: bsc#1061739 + +Use our own macro instead of abusing tcp_time_stamp + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/dccp/ccids/ccid2.c | 8 ++++---- + net/dccp/ccids/ccid2.h | 2 +- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c +index 5e3a7302f774..e1295d5f2c56 100644 +--- a/net/dccp/ccids/ccid2.c ++++ b/net/dccp/ccids/ccid2.c +@@ -233,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) + { + struct dccp_sock *dp = dccp_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); +- const u32 now = ccid2_time_stamp; ++ const u32 now = ccid2_jiffies32; + struct ccid2_seq *next; + + /* slow-start after idle periods (RFC 2581, RFC 2861) */ +@@ -466,7 +466,7 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ +- ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); ++ ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); + } + + static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +@@ -478,7 +478,7 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) + return; + } + +- hc->tx_last_cong = ccid2_time_stamp; ++ hc->tx_last_cong = ccid2_jiffies32; + + hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; + hc->tx_ssthresh = max(hc->tx_cwnd, 2U); +@@ -731,7 +731,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) + + hc->tx_rto = DCCP_TIMEOUT_INIT; + hc->tx_rpdupack = -1; +- hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp; ++ hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; + hc->tx_cwnd_used = 0; + setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); +diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h +index 18c97543e522..6e50ef2898fb 100644 +--- a/net/dccp/ccids/ccid2.h ++++ b/net/dccp/ccids/ccid2.h +@@ -27,7 +27,7 @@ + * CCID-2 timestamping faces the same issues as TCP timestamping. + * Hence we reuse/share as much of the code as possible. + */ +-#define ccid2_time_stamp tcp_time_stamp ++#define ccid2_jiffies32 ((u32)jiffies) + + /* NUMDUPACK parameter from RFC 4341, p. 6 */ + #define NUMDUPACK 3 +-- +2.15.1 + diff --git a/patches.fixes/esp-Fix-error-handling-on-layer-2-xmit.patch b/patches.fixes/esp-Fix-error-handling-on-layer-2-xmit.patch new file mode 100644 index 0000000..2f1535a --- /dev/null +++ b/patches.fixes/esp-Fix-error-handling-on-layer-2-xmit.patch @@ -0,0 +1,50 @@ +From: Steffen Klassert +Date: Mon, 7 Aug 2017 08:31:07 +0200 +Subject: esp: Fix error handling on layer 2 xmit. +Patch-mainline: v4.13 +Git-commit: 4ff0308f06da5016aafb05330ed37809b54f81ae +References: bsc#1076830 + +esp_output_tail() and esp6_output_tail() can return negative +and positive error values. We currently treat only negative +values as errors, fix this to treat both cases as error. 
+ +Fixes: fca11ebde3f0 ("esp4: Reorganize esp_output") +Fixes: 383d0350f2cc ("esp6: Reorganize esp_output") +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/ipv4/esp4_offload.c | 2 +- + net/ipv6/esp6_offload.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c +index e0666016a764..50112324fa5c 100644 +--- a/net/ipv4/esp4_offload.c ++++ b/net/ipv4/esp4_offload.c +@@ -257,7 +257,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + err = esp_output_tail(x, skb, &esp); +- if (err < 0) ++ if (err) + return err; + + secpath_reset(skb); +diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c +index f02f131f6435..1cf437f75b0b 100644 +--- a/net/ipv6/esp6_offload.c ++++ b/net/ipv6/esp6_offload.c +@@ -286,7 +286,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + err = esp6_output_tail(x, skb, &esp); +- if (err < 0) ++ if (err) + return err; + + secpath_reset(skb); +-- +2.15.1 + diff --git a/patches.fixes/fou-fix-some-member-types-in-guehdr.patch b/patches.fixes/fou-fix-some-member-types-in-guehdr.patch new file mode 100644 index 0000000..6f8eaf9 --- /dev/null +++ b/patches.fixes/fou-fix-some-member-types-in-guehdr.patch @@ -0,0 +1,78 @@ +From: Xin Long +Date: Sun, 10 Dec 2017 16:56:00 +0800 +Subject: fou: fix some member types in guehdr +Patch-mainline: v4.15-rc4 +Git-commit: 200809716aed1cac586fcac4c0551a688439be1f +References: bsc#1076830 + +guehdr struct is used to build or parse gue packets, which +are always in big endian. It's better to define all guehdr +members as __beXX types. + +Also, in validate_gue_flags it's not good to use a __be32 +variable for both Standard flags(__be16) and Private flags +(__be32), and pass it to other functions.
+ +This patch could fix a bunch of sparse warnings from fou. + +Fixes: 5024c33ac354 ("gue: Add infrastructure for flags and options") +Signed-off-by: Xin Long +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/gue.h | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/include/net/gue.h b/include/net/gue.h +index 3f28ec7f1c7f..4180b8eb54d5 100644 +--- a/include/net/gue.h ++++ b/include/net/gue.h +@@ -43,10 +43,10 @@ struct guehdr { + #else + #error "Please fix " + #endif +- __u8 proto_ctype; +- __u16 flags; ++ __u8 proto_ctype; ++ __be16 flags; + }; +- __u32 word; ++ __be32 word; + }; + }; + +@@ -83,11 +83,10 @@ static inline size_t guehdr_priv_flags_len(__be32 flags) + * if there is an unknown standard or private flags, or the options length for + * the flags exceeds the options length specific in hlen of the GUE header. + */ +-static inline int validate_gue_flags(struct guehdr *guehdr, +- size_t optlen) ++static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen) + { ++ __be16 flags = guehdr->flags; + size_t len; +- __be32 flags = guehdr->flags; + + if (flags & ~GUE_FLAGS_ALL) + return 1; +@@ -100,12 +99,13 @@ static inline int validate_gue_flags(struct guehdr *guehdr, + /* Private flags are last four bytes accounted in + * guehdr_flags_len + */ +- flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); ++ __be32 pflags = *(__be32 *)((void *)&guehdr[1] + ++ len - GUE_LEN_PRIV); + +- if (flags & ~GUE_PFLAGS_ALL) ++ if (pflags & ~GUE_PFLAGS_ALL) + return 1; + +- len += guehdr_priv_flags_len(flags); ++ len += guehdr_priv_flags_len(pflags); + if (len > optlen) + return 1; + } +-- +2.15.1 + diff --git a/patches.fixes/ip-options-explicitly-provide-net-ns-to-__ip_options.patch b/patches.fixes/ip-options-explicitly-provide-net-ns-to-__ip_options.patch new file mode 100644 index 0000000..3d47a7f --- /dev/null +++ b/patches.fixes/ip-options-explicitly-provide-net-ns-to-__ip_options.patch 
@@ -0,0 +1,197 @@ +From: Paolo Abeni +Date: Thu, 3 Aug 2017 18:07:06 +0200 +Subject: ip/options: explicitly provide net ns to __ip_options_echo() +Patch-mainline: v4.14-rc1 +Git-commit: 91ed1e666a4ea2e260452a7d7d311ac5ae852cba +References: bsc#1076830 + +__ip_options_echo() uses the current network namespace, and +currently retrives it via skb->dst->dev. + +This commit adds an explicit 'net' argument to __ip_options_echo() +and update all the call sites to provide it, usually via a simpler +sock_net(). + +After this change, __ip_options_echo() no more needs to access +skb->dst and we can drop a couple of hack to preserve such +info in the rx path. + +Signed-off-by: Paolo Abeni +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/ip.h | 9 +++++---- + include/net/tcp.h | 5 +++-- + net/ipv4/icmp.c | 4 ++-- + net/ipv4/ip_options.c | 6 +++--- + net/ipv4/ip_output.c | 2 +- + net/ipv4/ip_sockglue.c | 7 ++++--- + net/ipv4/syncookies.c | 2 +- + net/ipv4/tcp_ipv4.c | 2 +- + 8 files changed, 20 insertions(+), 17 deletions(-) + +diff --git a/include/net/ip.h b/include/net/ip.h +index 0cf7f5a65fe6..44beac1debb2 100644 +--- a/include/net/ip.h ++++ b/include/net/ip.h +@@ -567,11 +567,12 @@ int ip_forward(struct sk_buff *skb); + void ip_options_build(struct sk_buff *skb, struct ip_options *opt, + __be32 daddr, struct rtable *rt, int is_frag); + +-int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, +- const struct ip_options *sopt); +-static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) ++int __ip_options_echo(struct net *net, struct ip_options *dopt, ++ struct sk_buff *skb, const struct ip_options *sopt); ++static inline int ip_options_echo(struct net *net, struct ip_options *dopt, ++ struct sk_buff *skb) + { +- return __ip_options_echo(dopt, skb, &IPCB(skb)->opt); ++ return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt); + } + + void ip_options_fragment(struct sk_buff *skb); +diff --git a/include/net/tcp.h 
b/include/net/tcp.h +index 82e9251323cc..0f403b065f24 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1896,7 +1896,8 @@ extern void tcp_rack_reo_timeout(struct sock *sk); + /* + * Save and compile IPv4 options, return a pointer to it + */ +-static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) ++static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, ++ struct sk_buff *skb) + { + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; + struct ip_options_rcu *dopt = NULL; +@@ -1905,7 +1906,7 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) + int opt_size = sizeof(*dopt) + opt->optlen; + + dopt = kmalloc(opt_size, GFP_ATOMIC); +- if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { ++ if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; + } +diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c +index 9144fa7df2ad..7cf046f2ea05 100644 +--- a/net/ipv4/icmp.c ++++ b/net/ipv4/icmp.c +@@ -412,7 +412,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) + int type = icmp_param->data.icmph.type; + int code = icmp_param->data.icmph.code; + +- if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) ++ if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) + return; + + /* Needed by both icmp_global_allow and icmp_xmit_lock */ +@@ -694,7 +694,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) + iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); + +- if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) ++ if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) + goto out_unlock; + + +diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c +index 93157f2f4758..1894d4775172 100644 +--- a/net/ipv4/ip_options.c ++++ b/net/ipv4/ip_options.c +@@ -86,8 +86,8 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, + * NOTE: dopt cannot point to skb. 
+ */ + +-int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, +- const struct ip_options *sopt) ++int __ip_options_echo(struct net *net, struct ip_options *dopt, ++ struct sk_buff *skb, const struct ip_options *sopt) + { + unsigned char *sptr, *dptr; + int soffset, doffset; +@@ -140,7 +140,7 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + __be32 addr; + + memcpy(&addr, dptr+soffset-1, 4); +- if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { ++ if (inet_addr_type(net, addr) != RTN_UNICAST) { + dopt->ts_needtime = 1; + soffset += 8; + } +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index 55d2c2a9f599..62cb99ec12fe 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -1520,7 +1520,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, + int err; + int oif; + +- if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) ++ if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) + return; + + ipc.addr = daddr; +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index ec4fe3d4b5c9..054dbab27d8c 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -80,7 +80,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) + } + + +-static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) ++static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, ++ struct sk_buff *skb) + { + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; +@@ -88,7 +89,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) + if (IPCB(skb)->opt.optlen == 0) + return; + +- if (ip_options_echo(opt, skb)) { ++ if (ip_options_echo(net, opt, skb)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } +@@ -204,7 +205,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, + } + + if (flags & IP_CMSG_RETOPTS) { +- ip_cmsg_recv_retopts(msg, skb); ++ 
ip_cmsg_recv_retopts(sock_net(sk), msg, skb); + + flags &= ~IP_CMSG_RETOPTS; + if (!flags) +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c +index bd8fd3df5117..e6e5ee76ee29 100644 +--- a/net/ipv4/syncookies.c ++++ b/net/ipv4/syncookies.c +@@ -352,7 +352,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ +- ireq->opt = tcp_v4_save_options(skb); ++ ireq->opt = tcp_v4_save_options(sock_net(sk), skb); + + if (security_inet_conn_request(sk, skb, req)) { + reqsk_free(req); +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index c9845859fe94..a787203eab98 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1213,7 +1213,7 @@ static void tcp_v4_init_req(struct request_sock *req, + + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); +- ireq->opt = tcp_v4_save_options(skb); ++ ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); + } + + static struct dst_entry *tcp_v4_route_req(const struct sock *sk, +-- +2.15.1 + diff --git a/patches.fixes/ip_tunnel-fix-ip-tunnel-lookup-in-collect_md-mode.patch b/patches.fixes/ip_tunnel-fix-ip-tunnel-lookup-in-collect_md-mode.patch new file mode 100644 index 0000000..27e86ad --- /dev/null +++ b/patches.fixes/ip_tunnel-fix-ip-tunnel-lookup-in-collect_md-mode.patch @@ -0,0 +1,42 @@ +From: Haishuang Yan +Date: Tue, 12 Sep 2017 17:47:56 +0800 +Subject: ip_tunnel: fix ip tunnel lookup in collect_md mode +Patch-mainline: v4.14-rc1 +Git-commit: 833a8b405465e935a1ff7ab086b54a3ef90437ca +References: bsc#1076830 + +In collect_md mode, if the tun dev is down, it still can call +ip_tunnel_rcv to receive on packets, and the rx statistics increase +improperly. 
+ +When the md tunnel is down, it's not necessary to increase RX drops +for the tunnel device, packets would be received on fallback tunnel, +and the RX drops on fallback device will be increased as expected. + +Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.") +Cc: Pravin B Shelar +Signed-off-by: Haishuang Yan +Acked-by: Pravin B Shelar +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/ip_tunnel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c +index e1856bfa753d..e9805ad664ac 100644 +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -176,7 +176,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, + return cand; + + t = rcu_dereference(itn->collect_md_tun); +- if (t) ++ if (t && t->dev->flags & IFF_UP) + return t; + + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) +-- +2.15.1 + diff --git a/patches.fixes/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_m.patch b/patches.fixes/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_m.patch new file mode 100644 index 0000000..d7a04c5 --- /dev/null +++ b/patches.fixes/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_m.patch @@ -0,0 +1,39 @@ +From: Haishuang Yan +Date: Thu, 7 Sep 2017 14:08:34 +0800 +Subject: ip_tunnel: fix setting ttl and tos value in collect_md mode +Patch-mainline: v4.14-rc1 +Git-commit: 0f693f1995cf002432b70f43ce73f79bf8d0b6c9 +References: bsc#1076830 + +ttl and tos variables are declared and assigned, but are not used in +iptunnel_xmit() function. + +Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") +Cc: Alexei Starovoitov +Signed-off-by: Haishuang Yan +Acked-by: Alexei Starovoitov +Signed-off-by: David S.
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/ip_tunnel.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c +index 129d1a3616f8..e1856bfa753d 100644 +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) + ip_rt_put(rt); + goto tx_dropped; + } +- iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, +- key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); ++ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, ++ df, !net_eq(tunnel->net, dev_net(dev))); + return; + tx_error: + dev->stats.tx_errors++; +-- +2.15.1 + diff --git a/patches.fixes/ipv6-Do-not-consider-linkdown-nexthops-during-multip.patch b/patches.fixes/ipv6-Do-not-consider-linkdown-nexthops-during-multip.patch new file mode 100644 index 0000000..86faa18 --- /dev/null +++ b/patches.fixes/ipv6-Do-not-consider-linkdown-nexthops-during-multip.patch @@ -0,0 +1,47 @@ +From: Ido Schimmel +Date: Tue, 21 Nov 2017 09:50:12 +0200 +Subject: ipv6: Do not consider linkdown nexthops during multipath +Patch-mainline: v4.15-rc1 +Git-commit: bbfcd77631573ac4a9f57eb6169e04256a111bc1 +References: bsc#1076830 + +When the 'ignore_routes_with_linkdown' sysctl is set, we should not +consider linkdown nexthops during route lookup. + +While the code correctly verifies that the initially selected route +('match') has a carrier, it does not perform the same check in the +subsequent multipath selection, resulting in a potential packet loss. + +In case the chosen route does not have a carrier and the sysctl is set, +choose the initially selected route. + +Fixes: 35103d11173b ("net: ipv6 sysctl option to ignore routes when nexthop link is down") +Signed-off-by: Ido Schimmel +Acked-by: David Ahern +Acked-by: Andy Gospodarek +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv6/route.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 70d23147c6ef..528e9d973806 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -472,6 +472,11 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, + &match->rt6i_siblings, rt6i_siblings) { + route_choosen--; + if (route_choosen == 0) { ++ struct inet6_dev *idev = sibling->rt6i_idev; ++ ++ if (!netif_carrier_ok(sibling->dst.dev) && ++ idev->cnf.ignore_routes_with_linkdown) ++ break; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; +-- +2.15.1 + diff --git a/patches.fixes/ipv6-avoid-zeroing-per-cpu-data-again.patch b/patches.fixes/ipv6-avoid-zeroing-per-cpu-data-again.patch new file mode 100644 index 0000000..d57b95a --- /dev/null +++ b/patches.fixes/ipv6-avoid-zeroing-per-cpu-data-again.patch @@ -0,0 +1,48 @@ +From: Eric Dumazet +Date: Mon, 9 Oct 2017 06:01:37 -0700 +Subject: ipv6: avoid zeroing per cpu data again +Patch-mainline: v4.15-rc1 +Git-commit: bfd8e5a407133e58a92a38ccf3d0ba6db81f22d8 +References: bsc#1076830 + +per cpu allocations are already zeroed, no need to clear them again. + +Fixes: d52d3997f843f ("ipv6: Create percpu rt6_info") +Signed-off-by: Eric Dumazet +Cc: Martin KaFai Lau +Cc: Tejun Heo +Acked-by: Tejun Heo +Acked-by: Martin KaFai Lau +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv6/route.c | 12 +----------- + 1 file changed, 1 insertion(+), 11 deletions(-) + +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 528e9d973806..166217e4bef3 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -369,17 +369,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, + + if (rt) { + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); +- if (rt->rt6i_pcpu) { +- int cpu; +- +- for_each_possible_cpu(cpu) { +- struct rt6_info **p; +- +- p = per_cpu_ptr(rt->rt6i_pcpu, cpu); +- /* no one shares rt */ +- *p = NULL; +- } +- } else { ++ if (!rt->rt6i_pcpu) { + dst_release_immediate(&rt->dst); + return NULL; + } +-- +2.15.1 + diff --git a/patches.fixes/ipv6-fix-possible-mem-leaks-in-ipv6_make_skb.patch b/patches.fixes/ipv6-fix-possible-mem-leaks-in-ipv6_make_skb.patch new file mode 100644 index 0000000..ecda5b2 --- /dev/null +++ b/patches.fixes/ipv6-fix-possible-mem-leaks-in-ipv6_make_skb.patch @@ -0,0 +1,42 @@ +From: Eric Dumazet +Date: Wed, 10 Jan 2018 03:45:49 -0800 +Subject: ipv6: fix possible mem leaks in ipv6_make_skb() +Patch-mainline: v4.15-rc8 +Git-commit: 862c03ee1deb7e19e0f9931682e0294ecd1fcaf9 +References: bsc#1076830 + +ip6_setup_cork() might return an error, while memory allocations have +been done and must be rolled back. + +Fixes: 6422398c2ab0 ("ipv6: introduce ipv6_make_skb") +Signed-off-by: Eric Dumazet +Cc: Vlad Yasevich +Reported-by: Mike Maloney +Acked-by: Mike Maloney +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv6/ip6_output.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c +index 22f38f49e40f..bdfd52d5e7fc 100644 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1728,9 +1728,10 @@ struct sk_buff *ip6_make_skb(struct sock *sk, + cork.base.opt = NULL; + v6_cork.opt = NULL; + err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); +- if (err) ++ if (err) { ++ ip6_cork_release(&cork, &v6_cork); + return ERR_PTR(err); +- ++ } + if (ipc6->dontfrag < 0) + ipc6->dontfrag = inet6_sk(sk)->dontfrag; + +-- +2.15.1 + diff --git a/patches.fixes/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch b/patches.fixes/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch new file mode 100644 index 0000000..2e0a39e --- /dev/null +++ b/patches.fixes/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch @@ -0,0 +1,38 @@ +From: Eric Dumazet +Date: Thu, 11 Jan 2018 22:31:18 -0800 +Subject: ipv6: ip6_make_skb() needs to clear cork.base.dst +Patch-mainline: v4.15-rc9 +Git-commit: 95ef498d977bf44ac094778fd448b98af158a3e6 +References: bsc#1076830 + +In my last patch, I missed fact that cork.base.dst was not initialized +in ip6_make_skb() : + +If ip6_setup_cork() returns an error, we might attempt a dst_release() +on some random pointer. + +Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv6/ip6_output.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c +index bdfd52d5e7fc..5b1e853439ea 100644 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1726,6 +1726,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, + cork.base.flags = 0; + cork.base.addr = 0; + cork.base.opt = NULL; ++ cork.base.dst = NULL; + v6_cork.opt = NULL; + err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); + if (err) { +-- +2.16.0 + diff --git a/patches.fixes/net-netfilter-nf_conntrack_core-Fix-net_conntrack_lo.patch b/patches.fixes/net-netfilter-nf_conntrack_core-Fix-net_conntrack_lo.patch new file mode 100644 index 0000000..9a23fae --- /dev/null +++ b/patches.fixes/net-netfilter-nf_conntrack_core-Fix-net_conntrack_lo.patch @@ -0,0 +1,120 @@ +From: Manfred Spraul +Date: Thu, 6 Jul 2017 20:45:59 +0200 +Subject: net/netfilter/nf_conntrack_core: Fix net_conntrack_lock() +Patch-mainline: v4.14-rc1 +Git-commit: 3ef0c7a730de0bae03d86c19570af764fa3c4445 +References: bsc#1076830 + +As we want to remove spin_unlock_wait() and replace it with explicit +spin_lock()/spin_unlock() calls, we can use this to simplify the +locking. + +In addition: +- Reading nf_conntrack_locks_all needs ACQUIRE memory ordering. +- The new code avoids the backwards loop. + +Only slightly tested, I did not manage to trigger calls to +nf_conntrack_all_lock(). + +V2: With improved comments, to clearly show how the barriers + pair. + +Fixes: b16c29191dc8 ("netfilter: nf_conntrack: use safer way to lock all buckets") +Signed-off-by: Manfred Spraul +Cc: +Cc: Alan Stern +Cc: Sasha Levin +Cc: Pablo Neira Ayuso +Cc: netfilter-devel@vger.kernel.org +Signed-off-by: Paul E. 
McKenney +Acked-by: Michal Kubecek + +--- + net/netfilter/nf_conntrack_core.c | 52 ++++++++++++++++++++++----------------- + 1 file changed, 29 insertions(+), 23 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c +index e847dbaa0c6b..e0560c41e371 100644 +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -96,19 +96,26 @@ static struct conntrack_gc_work conntrack_gc_work; + + void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) + { ++ /* 1) Acquire the lock */ + spin_lock(lock); +- while (unlikely(nf_conntrack_locks_all)) { +- spin_unlock(lock); + +- /* +- * Order the 'nf_conntrack_locks_all' load vs. the +- * spin_unlock_wait() loads below, to ensure +- * that 'nf_conntrack_locks_all_lock' is indeed held: +- */ +- smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ +- spin_unlock_wait(&nf_conntrack_locks_all_lock); +- spin_lock(lock); +- } ++ /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics ++ * It pairs with the smp_store_release() in nf_conntrack_all_unlock() ++ */ ++ if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) ++ return; ++ ++ /* fast path failed, unlock */ ++ spin_unlock(lock); ++ ++ /* Slow path 1) get global lock */ ++ spin_lock(&nf_conntrack_locks_all_lock); ++ ++ /* Slow path 2) get the lock we want */ ++ spin_lock(lock); ++ ++ /* Slow path 3) release the global lock */ ++ spin_unlock(&nf_conntrack_locks_all_lock); + } + EXPORT_SYMBOL_GPL(nf_conntrack_lock); + +@@ -149,28 +156,27 @@ static void nf_conntrack_all_lock(void) + int i; + + spin_lock(&nf_conntrack_locks_all_lock); +- nf_conntrack_locks_all = true; + +- /* +- * Order the above store of 'nf_conntrack_locks_all' against +- * the spin_unlock_wait() loads below, such that if +- * nf_conntrack_lock() observes 'nf_conntrack_locks_all' +- * we must observe nf_conntrack_locks[] held: +- */ +- smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ ++ nf_conntrack_locks_all = true; + + 
for (i = 0; i < CONNTRACK_LOCKS; i++) { +- spin_unlock_wait(&nf_conntrack_locks[i]); ++ spin_lock(&nf_conntrack_locks[i]); ++ ++ /* This spin_unlock provides the "release" to ensure that ++ * nf_conntrack_locks_all==true is visible to everyone that ++ * acquired spin_lock(&nf_conntrack_locks[]). ++ */ ++ spin_unlock(&nf_conntrack_locks[i]); + } + } + + static void nf_conntrack_all_unlock(void) + { +- /* +- * All prior stores must be complete before we clear ++ /* All prior stores must be complete before we clear + * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() + * might observe the false value but not the entire +- * critical section: ++ * critical section. ++ * It pairs with the smp_load_acquire() in nf_conntrack_lock() + */ + smp_store_release(&nf_conntrack_locks_all, false); + spin_unlock(&nf_conntrack_locks_all_lock); +-- +2.15.1 + diff --git a/patches.fixes/net-packet-Fix-Tx-queue-selection-for-AF_PACKET.patch b/patches.fixes/net-packet-Fix-Tx-queue-selection-for-AF_PACKET.patch new file mode 100644 index 0000000..516e0f1 --- /dev/null +++ b/patches.fixes/net-packet-Fix-Tx-queue-selection-for-AF_PACKET.patch @@ -0,0 +1,76 @@ +From: =?UTF-8?q?Iv=C3=A1n=20Briano?= +Date: Thu, 13 Jul 2017 09:46:58 -0700 +Subject: net/packet: Fix Tx queue selection for AF_PACKET +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Patch-mainline: v4.13-rc2 +Git-commit: ccd4eb49f3392ebf989d58bd013a7bf44cdca4d6 +References: bsc#1076830 + +When PACKET_QDISC_BYPASS is not used, Tx queue selection will be done +before the packet is enqueued, taking into account any mappings set by +a queuing discipline such as mqprio without hardware offloading. This +selection may be affected by a previously saved queue_mapping, either on +the Rx path, or done before the packet reaches the device, as it's +currently the case for AF_PACKET. 
+ +In order for queue selection to work as expected when using traffic +control, there can't be another selection done before that point is +reached, so move the call to packet_pick_tx_queue to +packet_direct_xmit, leaving the default xmit path as it was before +PACKET_QDISC_BYPASS was introduced. + +A forward declaration of packet_pick_tx_queue() is introduced to avoid +the need to reorder the functions within the file. + +Fixes: d346a3fae3ff ("packet: introduce PACKET_QDISC_BYPASS socket option") +Signed-off-by: Iván Briano +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/packet/af_packet.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index 7abb9180667f..3310711b37a9 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -216,6 +216,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *, + static void prb_fill_vlan_info(struct tpacket_kbdq_core *, + struct tpacket3_hdr *); + static void packet_flush_mclist(struct sock *sk); ++static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); + + struct packet_skb_cb { + union { +@@ -262,6 +263,7 @@ static int packet_direct_xmit(struct sk_buff *skb) + if (skb != orig_skb) + goto drop; + ++ packet_pick_tx_queue(dev, skb); + txq = skb_get_tx_queue(dev, skb); + + local_bh_disable(); +@@ -2757,8 +2759,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) + goto tpacket_error; + } + +- packet_pick_tx_queue(dev, skb); +- + skb->destructor = tpacket_destruct_skb; + __packet_set_status(po, ph, TP_STATUS_SENDING); + packet_inc_pending(&po->tx_ring); +@@ -2941,8 +2941,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) + skb->priority = sk->sk_priority; + skb->mark = sockc.mark; + +- packet_pick_tx_queue(dev, skb); +- + if (po->has_vnet_hdr) { + err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); + if (err) +-- +2.15.1 + diff --git 
a/patches.fixes/net-packet-fix-a-race-in-packet_bind-and-packet_noti.patch b/patches.fixes/net-packet-fix-a-race-in-packet_bind-and-packet_noti.patch new file mode 100644 index 0000000..ba3821f --- /dev/null +++ b/patches.fixes/net-packet-fix-a-race-in-packet_bind-and-packet_noti.patch @@ -0,0 +1,96 @@ +From: Eric Dumazet +Date: Tue, 28 Nov 2017 08:03:30 -0800 +Subject: net/packet: fix a race in packet_bind() and packet_notifier() +Patch-mainline: v4.15-rc2 +Git-commit: 15fe076edea787807a7cdc168df832544b58eba6 +References: bsc#1076830 + +syzbot reported crashes [1] and provided a C repro easing bug hunting. + +When/if packet_do_bind() calls __unregister_prot_hook() and releases +po->bind_lock, another thread can run packet_notifier() and process an +NETDEV_UP event. + +This calls register_prot_hook() and hooks again the socket right before +first thread is able to grab again po->bind_lock. + +Fixes this issue by temporarily setting po->num to 0, as suggested by +David Miller. + +[1] +dev_remove_pack: ffff8801bf16fa80 not found +------------[ cut here ]------------ +kernel BUG at net/core/dev.c:7945! 
( BUG_ON(!list_empty(&dev->ptype_all)); ) +invalid opcode: 0000 [#1] SMP KASAN +Dumping ftrace buffer: + (ftrace buffer empty) +Modules linked in: +device syz0 entered promiscuous mode +CPU: 0 PID: 3161 Comm: syzkaller404108 Not tainted 4.14.0+ #190 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +task: ffff8801cc57a500 task.stack: ffff8801cc588000 +RIP: 0010:netdev_run_todo+0x772/0xae0 net/core/dev.c:7945 +RSP: 0018:ffff8801cc58f598 EFLAGS: 00010293 +RAX: ffff8801cc57a500 RBX: dffffc0000000000 RCX: ffffffff841f75b2 +RDX: 0000000000000000 RSI: 1ffff100398b1ede RDI: ffff8801bf1f8810 +device syz0 entered promiscuous mode +RBP: ffff8801cc58f898 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801bf1f8cd8 +R13: ffff8801cc58f870 R14: ffff8801bf1f8780 R15: ffff8801cc58f7f0 +FS: 0000000001716880(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000020b13000 CR3: 0000000005e25000 CR4: 00000000001406f0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + rtnl_unlock+0xe/0x10 net/core/rtnetlink.c:106 + tun_detach drivers/net/tun.c:670 [inline] + tun_chr_close+0x49/0x60 drivers/net/tun.c:2845 + __fput+0x333/0x7f0 fs/file_table.c:210 + ____fput+0x15/0x20 fs/file_table.c:244 + task_work_run+0x199/0x270 kernel/task_work.c:113 + exit_task_work include/linux/task_work.h:22 [inline] + do_exit+0x9bb/0x1ae0 kernel/exit.c:865 + do_group_exit+0x149/0x400 kernel/exit.c:968 + SYSC_exit_group kernel/exit.c:979 [inline] + SyS_exit_group+0x1d/0x20 kernel/exit.c:977 + entry_SYSCALL_64_fastpath+0x1f/0x96 +RIP: 0033:0x44ad19 + +Fixes: 30f7ea1c2b5f ("packet: race condition in packet_bind") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Cc: Francesco Ruggeri +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/packet/af_packet.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index 23c4653ef0f0..65f63250bdb8 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -3110,6 +3110,10 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, + if (need_rehook) { + if (po->running) { + rcu_read_unlock(); ++ /* prevents packet_notifier() from calling ++ * register_prot_hook() ++ */ ++ po->num = 0; + __unregister_prot_hook(sk, true); + rcu_read_lock(); + dev_curr = po->prot_hook.dev; +@@ -3118,6 +3122,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, + dev->ifindex); + } + ++ BUG_ON(po->running); + po->num = proto; + po->prot_hook.type = proto; + +-- +2.15.1 + diff --git a/patches.fixes/net-vrf-correct-FRA_L3MDEV-encode-type.patch b/patches.fixes/net-vrf-correct-FRA_L3MDEV-encode-type.patch new file mode 100644 index 0000000..ac9ded8 --- /dev/null +++ b/patches.fixes/net-vrf-correct-FRA_L3MDEV-encode-type.patch @@ -0,0 +1,37 @@ +From: Jeff Barnhill <0xeffeff@gmail.com> +Date: Wed, 1 Nov 2017 14:58:09 +0000 +Subject: net: vrf: correct FRA_L3MDEV encode type +Patch-mainline: v4.14-rc8 +Git-commit: 18129a24983906eaf2a2d448ce4b83e27091ebe2 +References: bsc#1076830 + +FRA_L3MDEV is defined as U8, but is being added as a U32 attribute. On +big endian architecture, this results in the l3mdev entry not being +added to the FIB rules. + +Fixes: 1aa6c4f6b8cd8 ("net: vrf: Add l3mdev rules on first device create") +Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> +Acked-by: David Ahern +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + drivers/net/vrf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c +index b8fa968d5e3d..9b8b9ea73dbb 100644 +--- a/drivers/net/vrf.c ++++ b/drivers/net/vrf.c +@@ -1271,7 +1271,7 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) + frh->family = family; + frh->action = FR_ACT_TO_TBL; + +- if (nla_put_u32(skb, FRA_L3MDEV, 1)) ++ if (nla_put_u8(skb, FRA_L3MDEV, 1)) + goto nla_put_failure; + + if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) +-- +2.15.1 + diff --git a/patches.fixes/netfilter-ebt_nflog-fix-unexpected-truncated-packet.patch b/patches.fixes/netfilter-ebt_nflog-fix-unexpected-truncated-packet.patch new file mode 100644 index 0000000..1fde2a5 --- /dev/null +++ b/patches.fixes/netfilter-ebt_nflog-fix-unexpected-truncated-packet.patch @@ -0,0 +1,36 @@ +From: Liping Zhang +Date: Sun, 18 Jun 2017 10:51:09 +0800 +Subject: netfilter: ebt_nflog: fix unexpected truncated packet +Patch-mainline: v4.13-rc1 +Git-commit: 91af6ba7ff16bd7e5919aedfe70aad73a3375619 +References: bsc#1076830 + +"struct nf_loginfo li;" is a local variable, so we should set the flags +to 0 explicitly, else, packets maybe truncated unexpectedly when copied +to the userspace. 
+ +Fixes: 7643507fe8b5 ("netfilter: xt_NFLOG: nflog-range does not truncate packets") +Cc: Vishwanath Pai +Signed-off-by: Liping Zhang +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/bridge/netfilter/ebt_nflog.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c +index c1dc48686200..da1c2fdc08c8 100644 +--- a/net/bridge/netfilter/ebt_nflog.c ++++ b/net/bridge/netfilter/ebt_nflog.c +@@ -30,6 +30,7 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) + li.u.ulog.copy_len = info->len; + li.u.ulog.group = info->group; + li.u.ulog.qthreshold = info->threshold; ++ li.u.ulog.flags = 0; + + nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par), + xt_out(par), &li, "%s", info->prefix); +-- +2.15.1 + diff --git a/patches.fixes/netfilter-ipt_CLUSTERIP-fix-use-after-free-of-proc-e.patch b/patches.fixes/netfilter-ipt_CLUSTERIP-fix-use-after-free-of-proc-e.patch new file mode 100644 index 0000000..9086c4f --- /dev/null +++ b/patches.fixes/netfilter-ipt_CLUSTERIP-fix-use-after-free-of-proc-e.patch @@ -0,0 +1,63 @@ +From: Sabrina Dubroca +Date: Tue, 18 Jul 2017 14:56:17 +0200 +Subject: netfilter: ipt_CLUSTERIP: fix use-after-free of proc entry +Patch-mainline: v4.13 +Git-commit: 3840538ad384fb7891adeeaf36624f870c51fc0e +References: bsc#1076830 + +When we delete a netns with a CLUSTERIP rule, clusterip_net_exit() is +called first, removing /proc/net/ipt_CLUSTERIP. +Then clusterip_config_entry_put() is called from clusterip_tg_destroy(), +and tries to remove its entry under /proc/net/ipt_CLUSTERIP/. + +Fix this by checking that the parent directory of the entry to remove +hasn't already been deleted. 
+ +The following triggers a KASAN splat (stealing the reproducer from +202f59afd441, thanks to Jianlin Shi and Xin Long): + + ip netns add test + ip link add veth0_in type veth peer name veth0_out + ip link set veth0_in netns test + ip netns exec test ip link set lo up + ip netns exec test ip link set veth0_in up + ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ + CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ + --local-node 1 --hashmode sourceip-sourceport + ip netns del test + +Fixes: ce4ff76c15a8 ("netfilter: ipt_CLUSTERIP: make proc directory per net namespace") +Signed-off-by: Sabrina Dubroca +Reviewed-by: Xin Long +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c +index 038f293c2376..80e74193f522 100644 +--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c ++++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c +@@ -116,7 +116,8 @@ clusterip_config_entry_put(struct clusterip_config *c) + * functions are also incrementing the refcount on their own, + * so it's safe to remove the entry even if it's in use. 
*/ + #ifdef CONFIG_PROC_FS +- proc_remove(c->pde); ++ if (cn->procdir) ++ proc_remove(c->pde); + #endif + return; + } +@@ -764,6 +765,7 @@ static void clusterip_net_exit(struct net *net) + #ifdef CONFIG_PROC_FS + struct clusterip_net *cn = net_generic(net, clusterip_net_id); + proc_remove(cn->procdir); ++ cn->procdir = NULL; + #endif + } + +-- +2.15.1 + diff --git a/patches.fixes/netfilter-ipvs-fix-the-issue-that-sctp_conn_schedule.patch b/patches.fixes/netfilter-ipvs-fix-the-issue-that-sctp_conn_schedule.patch new file mode 100644 index 0000000..d7b4fd5 --- /dev/null +++ b/patches.fixes/netfilter-ipvs-fix-the-issue-that-sctp_conn_schedule.patch @@ -0,0 +1,57 @@ +From: Xin Long +Date: Sun, 20 Aug 2017 13:38:07 +0800 +Subject: netfilter: ipvs: fix the issue that sctp_conn_schedule drops non-INIT packet +Patch-mainline: v4.14-rc1 +Git-commit: 1cc4a018669f2fb18c10010f1a7ab3f6fb688cef +References: bsc#1076830 + +Commit 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP +packets") changed to check packet type early. It introduced a side +effect: if it's not a INIT packet, ports will be set as NULL, and +the packet will be dropped later. + +It caused that sctp couldn't create connection when ipvs module is +loaded and any scheduler is registered on server. + +Li Shuang reproduced it by running the cmds on sctp server: + # ipvsadm -A -t 1.1.1.1:80 -s rr + # ipvsadm -D -t 1.1.1.1:80 +then the server couldn't work any more. + +This patch is to return 1 when it's not an INIT packet. It means ipvs +will accept it without creating a conn for it, just like what it does +for tcp. 
+ +Fixes: 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") +Reported-by: Li Shuang +Signed-off-by: Xin Long +Signed-off-by: Simon Horman +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/netfilter/ipvs/ip_vs_proto_sctp.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c +index 56f8e4b204ff..0dbe2517d1c8 100644 +--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c ++++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c +@@ -25,9 +25,12 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + sch = skb_header_pointer( + skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); +- if (sch && (sch->type == SCTP_CID_INIT || +- sysctl_sloppy_sctp(ipvs))) ++ if (sch) { ++ if (!(sysctl_sloppy_sctp(ipvs) || ++ sch->type == SCTP_CID_INIT)) ++ return 1; + ports = &sh->source; ++ } + } + } else { + ports = skb_header_pointer( +-- +2.15.1 + diff --git a/patches.fixes/netfilter-nf_tables-Fix-nft-limit-burst-handling.patch b/patches.fixes/netfilter-nf_tables-Fix-nft-limit-burst-handling.patch new file mode 100644 index 0000000..0ee008d --- /dev/null +++ b/patches.fixes/netfilter-nf_tables-Fix-nft-limit-burst-handling.patch @@ -0,0 +1,77 @@ +From: andy zhou +Date: Mon, 21 Aug 2017 12:38:53 -0700 +Subject: netfilter: nf_tables: Fix nft limit burst handling +Patch-mainline: v4.13 +Git-commit: c26844eda9d4fdbd266660e3b3de2d0270e3a1ed +References: bsc#1076830 + +Current implementation treats the burst configuration the same as +rate configuration. This can cause the per packet cost to be lower +than configured. In effect, this bug causes the token bucket to be +refilled at a higher rate than what user has specified. + +This patch changes the implementation so that the token bucket size +is controlled by "rate + burst", while maintain the token bucket +refill rate the same as user specified. 
+ +Fixes: 96518518cc41 ("netfilter: add nftables") +Signed-off-by: Andy Zhou +Acked-by: Joe Stringer +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/netfilter/nft_limit.c | 25 ++++++++++++++----------- + 1 file changed, 14 insertions(+), 11 deletions(-) + +diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c +index 18dd57a52651..14538b1d4d11 100644 +--- a/net/netfilter/nft_limit.c ++++ b/net/netfilter/nft_limit.c +@@ -65,19 +65,23 @@ static int nft_limit_init(struct nft_limit *limit, + limit->nsecs = unit * NSEC_PER_SEC; + if (limit->rate == 0 || limit->nsecs < unit) + return -EOVERFLOW; +- limit->tokens = limit->tokens_max = limit->nsecs; +- +- if (tb[NFTA_LIMIT_BURST]) { +- u64 rate; + ++ if (tb[NFTA_LIMIT_BURST]) + limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); ++ else ++ limit->burst = 0; ++ ++ if (limit->rate + limit->burst < limit->rate) ++ return -EOVERFLOW; + +- rate = limit->rate + limit->burst; +- if (rate < limit->rate) +- return -EOVERFLOW; ++ /* The token bucket size limits the number of tokens can be ++ * accumulated. tokens_max specifies the bucket size. ++ * tokens_max = unit * (rate + burst) / rate. ++ */ ++ limit->tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), ++ limit->rate); ++ limit->tokens_max = limit->tokens; + +- limit->rate = rate; +- } + if (tb[NFTA_LIMIT_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + +@@ -95,9 +99,8 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, + { + u32 flags = limit->invert ? 
NFT_LIMIT_F_INV : 0; + u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); +- u64 rate = limit->rate - limit->burst; + +- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate), ++ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate), + NFTA_LIMIT_PAD) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), + NFTA_LIMIT_PAD) || +-- +2.15.1 + diff --git a/patches.fixes/netfilter-nf_tables-fix-update-chain-error.patch b/patches.fixes/netfilter-nf_tables-fix-update-chain-error.patch new file mode 100644 index 0000000..a69bf05 --- /dev/null +++ b/patches.fixes/netfilter-nf_tables-fix-update-chain-error.patch @@ -0,0 +1,54 @@ +From: JingPiao Chen +Date: Sat, 23 Sep 2017 17:10:44 +0800 +Subject: netfilter: nf_tables: fix update chain error +Patch-mainline: v4.14-rc5 +Git-commit: 0d18779be13766b33c69cbc26df38383598da373 +References: bsc#1076830 + + # nft add table filter + # nft add chain filter c1 + # nft rename chain filter c1 c2 + +Error: Could not process rule: No such file or directory +rename chain filter c1 c2 +^^^^^^^^^^^^^^^^^^^^^^^^^^ + + # nft add chain filter c2 + # nft rename chain filter c1 c2 + # nft list table filter + +table ip filter { + chain c2 { + } + + chain c2 { + } +} + +Fixes: 664b0f8cd8 ("netfilter: nf_tables: add generation mask to chains") +Signed-off-by: JingPiao Chen +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/netfilter/nf_tables_api.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index da314be0c048..3a4128cfcb73 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -1429,8 +1429,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, + chain2 = nf_tables_chain_lookup(table, + nla[NFTA_CHAIN_NAME], + genmask); +- if (IS_ERR(chain2)) +- return PTR_ERR(chain2); ++ if (!IS_ERR(chain2)) ++ return -EEXIST; + } + + if (nla[NFTA_CHAIN_COUNTERS]) { +-- +2.15.1 + diff --git 
a/patches.fixes/netfilter-xt_bpf-Fix-XT_BPF_MODE_FD_PINNED-mode-of-x.patch b/patches.fixes/netfilter-xt_bpf-Fix-XT_BPF_MODE_FD_PINNED-mode-of-x.patch new file mode 100644 index 0000000..f5174ed --- /dev/null +++ b/patches.fixes/netfilter-xt_bpf-Fix-XT_BPF_MODE_FD_PINNED-mode-of-x.patch @@ -0,0 +1,152 @@ +From: Shmulik Ladkani +Date: Mon, 9 Oct 2017 15:27:15 +0300 +Subject: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1' +Patch-mainline: v4.14-rc5 +Git-commit: 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 +References: bsc#1076830 + +Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced +support for attaching an eBPF object by an fd, with the +'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each +IPT_SO_SET_REPLACE call. + +However this breaks subsequent iptables calls: + + # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT + # iptables -A INPUT -s 5.6.7.8 -j ACCEPT + iptables: Invalid argument. Run `dmesg' for more information. + +That's because iptables works by loading existing rules using +IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with +the replacement set. + +However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number +(from the initial "iptables -m bpf" invocation) - so when 2nd invocation +occurs, userspace passes a bogus fd number, which leads to +'bpf_mt_check_v1' to fail. + +One suggested solution [1] was to hack iptables userspace, to perform a +"entries fixup" immediately after IPT_SO_GET_ENTRIES, by opening a new, +process-local fd per every 'xt_bpf_info_v1' entry seen. + +However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to +deprecate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects. + +This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given +'.fd' and instead perform an in-kernel lookup for the bpf object given +the provided '.path'. 
+ +It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named +XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is +expected to provide the path of the pinned object. + +Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved. + +References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2 + [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2 + +Reported-by: Rafael Buchbinder +Signed-off-by: Shmulik Ladkani +Acked-by: Willem de Bruijn +Acked-by: Daniel Borkmann +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + include/linux/bpf.h | 5 +++++ + include/uapi/linux/netfilter/xt_bpf.h | 1 + + kernel/bpf/inode.c | 1 + + net/netfilter/xt_bpf.c | 22 ++++++++++++++++++++-- + 4 files changed, 27 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index e07aed080d32..bbbbb12e8727 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -355,6 +355,11 @@ static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) + static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) + { + } ++ ++static inline int bpf_obj_get_user(const char __user *pathname) ++{ ++ return -EOPNOTSUPP; ++} + #endif /* CONFIG_BPF_SYSCALL */ + + /* verifier prototypes for helper functions called from eBPF programs */ +diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h +index b97725af2ac0..da161b56c79e 100644 +--- a/include/uapi/linux/netfilter/xt_bpf.h ++++ b/include/uapi/linux/netfilter/xt_bpf.h +@@ -23,6 +23,7 @@ enum xt_bpf_modes { + XT_BPF_MODE_FD_PINNED, + XT_BPF_MODE_FD_ELF, + }; ++#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED + + struct xt_bpf_info_v1 { + __u16 mode; +diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c +index e833ed914358..be1dde967208 100644 +--- a/kernel/bpf/inode.c ++++ b/kernel/bpf/inode.c +@@ -363,6 +363,7 @@ int bpf_obj_get_user(const char __user *pathname) + 
putname(pname); + return ret; + } ++EXPORT_SYMBOL_GPL(bpf_obj_get_user); + + static void bpf_evict_inode(struct inode *inode) + { +diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c +index 38986a95216c..29123934887b 100644 +--- a/net/netfilter/xt_bpf.c ++++ b/net/netfilter/xt_bpf.c +@@ -8,6 +8,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) + return 0; + } + ++static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) ++{ ++ mm_segment_t oldfs = get_fs(); ++ int retval, fd; ++ ++ set_fs(KERNEL_DS); ++ fd = bpf_obj_get_user(path); ++ set_fs(oldfs); ++ if (fd < 0) ++ return fd; ++ ++ retval = __bpf_mt_check_fd(fd, ret); ++ sys_close(fd); ++ return retval; ++} ++ + static int bpf_mt_check(const struct xt_mtchk_param *par) + { + struct xt_bpf_info *info = par->matchinfo; +@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par) + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); +- else if (info->mode == XT_BPF_MODE_FD_PINNED || +- info->mode == XT_BPF_MODE_FD_ELF) ++ else if (info->mode == XT_BPF_MODE_FD_ELF) + return __bpf_mt_check_fd(info->fd, &info->filter); ++ else if (info->mode == XT_BPF_MODE_PATH_PINNED) ++ return __bpf_mt_check_path(info->path, &info->filter); + else + return -EINVAL; + } +-- +2.15.1 + diff --git a/patches.fixes/netfilter-xt_bpf-add-overflow-checks.patch b/patches.fixes/netfilter-xt_bpf-add-overflow-checks.patch new file mode 100644 index 0000000..d7cca33 --- /dev/null +++ b/patches.fixes/netfilter-xt_bpf-add-overflow-checks.patch @@ -0,0 +1,83 @@ +From: Jann Horn +Date: Fri, 1 Dec 2017 01:46:07 +0100 +Subject: netfilter: xt_bpf: add overflow checks +Patch-mainline: v4.15-rc4 +Git-commit: 6ab405114b0b229151ef06f4e31c7834dd09d0c0 +References: bsc#1076830 + +Check whether inputs from userspace are too long (explicit length field too +big or string not 
null-terminated) to avoid out-of-bounds reads. + +As far as I can tell, this can at worst lead to very limited kernel heap +memory disclosure or oopses. + +This bug can be triggered by an unprivileged user even if the xt_bpf module +is not loaded: iptables is available in network namespaces, and the xt_bpf +module can be autoloaded. + +Triggering the bug with a classic BPF filter with fake length 0x1000 causes +the following KASAN report: + +================================================================== +BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0 +Read of size 32768 at addr ffff8801eff2c494 by task test/4627 + +CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1 +[...] +Call Trace: + dump_stack+0x5c/0x85 + print_address_description+0x6a/0x260 + kasan_report+0x254/0x370 + ? bpf_prog_create+0x84/0xf0 + memcpy+0x1f/0x50 + bpf_prog_create+0x84/0xf0 + bpf_mt_check+0x90/0xd6 [xt_bpf] +[...] +Allocated by task 4627: + kasan_kmalloc+0xa0/0xd0 + __kmalloc_node+0x47/0x60 + xt_alloc_table_info+0x41/0x70 [x_tables] +[...] +The buggy address belongs to the object at ffff8801eff2c3c0 + which belongs to the cache kmalloc-2048 of size 2048 +The buggy address is located 212 bytes inside of + 2048-byte region [ffff8801eff2c3c0, ffff8801eff2cbc0) +[...] 
+================================================================== + +Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match") +Signed-off-by: Jann Horn +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/netfilter/xt_bpf.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c +index 29123934887b..5185ff0f8f58 100644 +--- a/net/netfilter/xt_bpf.c ++++ b/net/netfilter/xt_bpf.c +@@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, + { + struct sock_fprog_kern program; + ++ if (len > XT_BPF_MAX_NUM_INSTR) ++ return -EINVAL; ++ + program.len = len; + program.filter = insns; + +@@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) + mm_segment_t oldfs = get_fs(); + int retval, fd; + ++ if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) ++ return -EINVAL; ++ + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path); + set_fs(oldfs); +-- +2.15.1 + diff --git a/patches.fixes/netfilter-xt_socket-Restore-mark-from-full-sockets-o.patch b/patches.fixes/netfilter-xt_socket-Restore-mark-from-full-sockets-o.patch new file mode 100644 index 0000000..5ba8856 --- /dev/null +++ b/patches.fixes/netfilter-xt_socket-Restore-mark-from-full-sockets-o.patch @@ -0,0 +1,87 @@ +From: Subash Abhinov Kasiviswanathan +Date: Thu, 21 Sep 2017 19:01:36 -0600 +Subject: netfilter: xt_socket: Restore mark from full sockets only +Patch-mainline: v4.14-rc5 +Git-commit: 89fcbb564f4a64c439d597c2702f990eed49c8a1 +References: bsc#1076830 + +An out of bounds error was detected on an ARM64 target with +Android based kernel 4.9. This occurs while trying to +restore mark on a skb from an inet request socket. 
+ +BUG: KASAN: slab-out-of-bounds in socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248 +Read of size 4 at addr ffffffc06a8d824c by task syz-fuzzer/1532 +CPU: 7 PID: 1532 Comm: syz-fuzzer Tainted: G W O 4.9.41+ #1 +Call trace: +[] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:76 +[] show_stack+0x28/0x38 arch/arm64/kernel/traps.c:226 +[] __dump_stack lib/dump_stack.c:15 [inline] +[] dump_stack+0xe4/0x134 lib/dump_stack.c:51 +[] print_address_description+0x68/0x258 mm/kasan/report.c:248 +[] kasan_report_error mm/kasan/report.c:347 [inline] +[] kasan_report.part.2+0x228/0x2f0 mm/kasan/report.c:371 +[] kasan_report+0x5c/0x70 mm/kasan/report.c:372 +[] check_memory_region_inline mm/kasan/kasan.c:308 [inline] +[] __asan_load4+0x88/0xa0 mm/kasan/kasan.c:740 +[] socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248 +[] socket_mt4_v1_v2_v3+0x3c/0x48 net/netfilter/xt_socket.c:272 +[] ipt_do_table+0x54c/0xad8 net/ipv4/netfilter/ip_tables.c:311 +[] iptable_mangle_hook+0x6c/0x220 net/ipv4/netfilter/iptable_mangle.c:90 +... 
+Allocated by task 1532: + save_stack_trace_tsk+0x0/0x2a0 arch/arm64/kernel/stacktrace.c:131 + save_stack_trace+0x28/0x38 arch/arm64/kernel/stacktrace.c:215 + save_stack mm/kasan/kasan.c:495 [inline] + set_track mm/kasan/kasan.c:507 [inline] + kasan_kmalloc+0xd8/0x188 mm/kasan/kasan.c:599 + kasan_slab_alloc+0x14/0x20 mm/kasan/kasan.c:537 + slab_post_alloc_hook mm/slab.h:417 [inline] + slab_alloc_node mm/slub.c:2728 [inline] + slab_alloc mm/slub.c:2736 [inline] + kmem_cache_alloc+0x14c/0x2e8 mm/slub.c:2741 + reqsk_alloc include/net/request_sock.h:87 [inline] + inet_reqsk_alloc+0x4c/0x238 net/ipv4/tcp_input.c:6236 + tcp_conn_request+0x2b0/0xea8 net/ipv4/tcp_input.c:6341 + tcp_v4_conn_request+0xe0/0x100 net/ipv4/tcp_ipv4.c:1256 + tcp_rcv_state_process+0x384/0x18a8 net/ipv4/tcp_input.c:5926 + tcp_v4_do_rcv+0x2f0/0x3e0 net/ipv4/tcp_ipv4.c:1430 + tcp_v4_rcv+0x1278/0x1350 net/ipv4/tcp_ipv4.c:1709 + ip_local_deliver_finish+0x174/0x3e0 net/ipv4/ip_input.c:216 + +v1->v2: Change socket_mt6_v1_v2_v3() as well as mentioned by Eric +v2->v3: Put the correct fixes tag + +Fixes: 01555e74bde5 ("netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag") +Signed-off-by: Subash Abhinov Kasiviswanathan +Signed-off-by: Pablo Neira Ayuso +Acked-by: Michal Kubecek + +--- + net/netfilter/xt_socket.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c +index e75ef39669c5..575d2153e3b8 100644 +--- a/net/netfilter/xt_socket.c ++++ b/net/netfilter/xt_socket.c +@@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, + transparent = nf_sk_is_transparent(sk); + + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && +- transparent) ++ transparent && sk_fullsock(sk)) + pskb->mark = sk->sk_mark; + + if (sk != skb->sk) +@@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) + transparent = nf_sk_is_transparent(sk); + + if (info->flags & 
XT_SOCKET_RESTORESKMARK && !wildcard && +- transparent) ++ transparent && sk_fullsock(sk)) + pskb->mark = sk->sk_mark; + + if (sk != skb->sk) +-- +2.15.1 + diff --git a/patches.fixes/openvswitch-Fix-pop_vlan-action-for-double-tagged-fr.patch b/patches.fixes/openvswitch-Fix-pop_vlan-action-for-double-tagged-fr.patch new file mode 100644 index 0000000..e7a9518 --- /dev/null +++ b/patches.fixes/openvswitch-Fix-pop_vlan-action-for-double-tagged-fr.patch @@ -0,0 +1,63 @@ +From: Eric Garver +Date: Wed, 20 Dec 2017 15:09:22 -0500 +Subject: openvswitch: Fix pop_vlan action for double tagged frames +Patch-mainline: v4.15-rc5 +Git-commit: c48e74736fccf25fb32bb015426359e1c2016e3b +References: bsc#1076830 + +skb_vlan_pop() expects skb->protocol to be a valid TPID for double +tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop() +shift the true ethertype into position for us. + +Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets") +Signed-off-by: Eric Garver +Reviewed-by: Jiri Benc +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/openvswitch/flow.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c +index 3f76cb765e5b..1d20807aeddd 100644 +--- a/net/openvswitch/flow.c ++++ b/net/openvswitch/flow.c +@@ -532,6 +532,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) + return -EINVAL; + + skb_reset_network_header(skb); ++ key->eth.type = skb->protocol; + } else { + eth = eth_hdr(skb); + ether_addr_copy(key->eth.src, eth->h_source); +@@ -545,15 +546,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + +- skb->protocol = parse_ethertype(skb); +- if (unlikely(skb->protocol == htons(0))) ++ key->eth.type = parse_ethertype(skb); ++ if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; + ++ /* Multiple tagged packets need to retain TPID to satisfy ++ * skb_vlan_pop(), which will later shift the ethertype into ++ * skb->protocol. ++ */ ++ if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) ++ skb->protocol = key->eth.cvlan.tpid; ++ else ++ skb->protocol = key->eth.type; ++ + skb_reset_network_header(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + } + skb_reset_mac_len(skb); +- key->eth.type = skb->protocol; + + /* Network layer. 
*/ + if (key->eth.type == htons(ETH_P_IP)) { +-- +2.15.1 + diff --git a/patches.fixes/packet-avoid-panic-in-packet_getsockopt.patch b/patches.fixes/packet-avoid-panic-in-packet_getsockopt.patch new file mode 100644 index 0000000..357fe88 --- /dev/null +++ b/patches.fixes/packet-avoid-panic-in-packet_getsockopt.patch @@ -0,0 +1,89 @@ +From: Eric Dumazet +Date: Wed, 18 Oct 2017 16:14:52 -0700 +Subject: packet: avoid panic in packet_getsockopt() +Patch-mainline: v4.14-rc6 +Git-commit: 509c7a1ecc8601f94ffba8a00889fefb239c00c6 +References: bsc#1076830 + +syzkaller got crashes in packet_getsockopt() processing +PACKET_ROLLOVER_STATS command while another thread was managing +to change po->rollover + +Using RCU will fix this bug. We might later add proper RCU annotations +for sparse sake. + +In v2: I replaced kfree(rollover) in fanout_add() to kfree_rcu() +variant, as spotted by John. + +Fixes: a9b6391814d5 ("packet: rollover statistics") +Signed-off-by: Eric Dumazet +Cc: Willem de Bruijn +Cc: John Sperbeck +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/packet/af_packet.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index 65f63250bdb8..c4bc46e112ac 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -1772,7 +1772,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) + + out: + if (err && rollover) { +- kfree(rollover); ++ kfree_rcu(rollover, rcu); + po->rollover = NULL; + } + mutex_unlock(&fanout_mutex); +@@ -1799,8 +1799,10 @@ static struct packet_fanout *fanout_release(struct sock *sk) + else + f = NULL; + +- if (po->rollover) ++ if (po->rollover) { + kfree_rcu(po->rollover, rcu); ++ po->rollover = NULL; ++ } + } + mutex_unlock(&fanout_mutex); + +@@ -3861,6 +3863,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, + void *data = &val; + union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; ++ struct packet_rollover *rollover; + + if (level != SOL_PACKET) + return -ENOPROTOOPT; +@@ -3939,13 +3942,18 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, + 0); + break; + case PACKET_ROLLOVER_STATS: +- if (!po->rollover) ++ rcu_read_lock(); ++ rollover = rcu_dereference(po->rollover); ++ if (rollover) { ++ rstats.tp_all = atomic_long_read(&rollover->num); ++ rstats.tp_huge = atomic_long_read(&rollover->num_huge); ++ rstats.tp_failed = atomic_long_read(&rollover->num_failed); ++ data = &rstats; ++ lv = sizeof(rstats); ++ } ++ rcu_read_unlock(); ++ if (!rollover) + return -EINVAL; +- rstats.tp_all = atomic_long_read(&po->rollover->num); +- rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); +- rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); +- data = &rstats; +- lv = sizeof(rstats); + break; + case PACKET_TX_HAS_OFF: + val = po->tp_tx_has_off; +-- +2.15.1 + diff --git a/patches.fixes/packet-fix-crash-in-fanout_demux_rollover.patch 
b/patches.fixes/packet-fix-crash-in-fanout_demux_rollover.patch new file mode 100644 index 0000000..01a9602 --- /dev/null +++ b/patches.fixes/packet-fix-crash-in-fanout_demux_rollover.patch @@ -0,0 +1,162 @@ +From: Mike Maloney +Date: Tue, 28 Nov 2017 10:44:29 -0500 +Subject: packet: fix crash in fanout_demux_rollover() +Patch-mainline: v4.15-rc2 +Git-commit: 57f015f5eccf25fd4a3336fe3cbbee920a8fba6f +References: bsc#1076830 + +syzkaller found a race condition fanout_demux_rollover() while removing +a packet socket from a fanout group. + +po->rollover is read and operated on during packet_rcv_fanout(), via +fanout_demux_rollover(), but the pointer is currently cleared before the +synchronization in packet_release(). It is safer to delay the cleanup +until after synchronize_net() has been called, ensuring all calls to +packet_rcv_fanout() for this socket have finished. + +To further simplify synchronization around the rollover structure, set +po->rollover in fanout_add() only if there are no errors. This removes +the need for rcu in the struct and in the call to +packet_getsockopt(..., PACKET_ROLLOVER_STATS, ...). 
+ +Crashing stack trace: + fanout_demux_rollover+0xb6/0x4d0 net/packet/af_packet.c:1392 + packet_rcv_fanout+0x649/0x7c8 net/packet/af_packet.c:1487 + dev_queue_xmit_nit+0x835/0xc10 net/core/dev.c:1953 + xmit_one net/core/dev.c:2975 [inline] + dev_hard_start_xmit+0x16b/0xac0 net/core/dev.c:2995 + __dev_queue_xmit+0x17a4/0x2050 net/core/dev.c:3476 + dev_queue_xmit+0x17/0x20 net/core/dev.c:3509 + neigh_connected_output+0x489/0x720 net/core/neighbour.c:1379 + neigh_output include/net/neighbour.h:482 [inline] + ip6_finish_output2+0xad1/0x22a0 net/ipv6/ip6_output.c:120 + ip6_finish_output+0x2f9/0x920 net/ipv6/ip6_output.c:146 + NF_HOOK_COND include/linux/netfilter.h:239 [inline] + ip6_output+0x1f4/0x850 net/ipv6/ip6_output.c:163 + dst_output include/net/dst.h:459 [inline] + NF_HOOK.constprop.35+0xff/0x630 include/linux/netfilter.h:250 + mld_sendpack+0x6a8/0xcc0 net/ipv6/mcast.c:1660 + mld_send_initial_cr.part.24+0x103/0x150 net/ipv6/mcast.c:2072 + mld_send_initial_cr net/ipv6/mcast.c:2056 [inline] + ipv6_mc_dad_complete+0x99/0x130 net/ipv6/mcast.c:2079 + addrconf_dad_completed+0x595/0x970 net/ipv6/addrconf.c:4039 + addrconf_dad_work+0xac9/0x1160 net/ipv6/addrconf.c:3971 + process_one_work+0xbf0/0x1bc0 kernel/workqueue.c:2113 + worker_thread+0x223/0x1990 kernel/workqueue.c:2247 + kthread+0x35e/0x430 kernel/kthread.c:231 + ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:432 + +Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") +Fixes: 509c7a1ecc860 ("packet: avoid panic in packet_getsockopt()") +Reported-by: syzbot +Signed-off-by: Mike Maloney +Reviewed-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/packet/af_packet.c | 32 ++++++++++---------------------- + net/packet/internal.h | 1 - + 2 files changed, 10 insertions(+), 23 deletions(-) + +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index c4bc46e112ac..7abb9180667f 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -1700,7 +1700,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) + atomic_long_set(&rollover->num, 0); + atomic_long_set(&rollover->num_huge, 0); + atomic_long_set(&rollover->num_failed, 0); +- po->rollover = rollover; + } + + if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { +@@ -1758,6 +1757,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) + if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) { + __dev_remove_pack(&po->prot_hook); + po->fanout = match; ++ po->rollover = rollover; ++ rollover = NULL; + atomic_inc(&match->sk_ref); + __fanout_link(sk, po); + err = 0; +@@ -1771,10 +1772,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) + } + + out: +- if (err && rollover) { +- kfree_rcu(rollover, rcu); +- po->rollover = NULL; +- } ++ kfree(rollover); + mutex_unlock(&fanout_mutex); + return err; + } +@@ -1798,11 +1796,6 @@ static struct packet_fanout *fanout_release(struct sock *sk) + list_del(&f->list); + else + f = NULL; +- +- if (po->rollover) { +- kfree_rcu(po->rollover, rcu); +- po->rollover = NULL; +- } + } + mutex_unlock(&fanout_mutex); + +@@ -3044,6 +3037,7 @@ static int packet_release(struct socket *sock) + synchronize_net(); + + if (f) { ++ kfree(po->rollover); + fanout_release_data(f); + kfree(f); + } +@@ -3863,7 +3857,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, + void *data = &val; + union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; +- struct packet_rollover *rollover; + + if (level != SOL_PACKET) + return -ENOPROTOOPT; +@@ -3942,18 +3935,13 @@ static int packet_getsockopt(struct socket *sock, int level, 
int optname, + 0); + break; + case PACKET_ROLLOVER_STATS: +- rcu_read_lock(); +- rollover = rcu_dereference(po->rollover); +- if (rollover) { +- rstats.tp_all = atomic_long_read(&rollover->num); +- rstats.tp_huge = atomic_long_read(&rollover->num_huge); +- rstats.tp_failed = atomic_long_read(&rollover->num_failed); +- data = &rstats; +- lv = sizeof(rstats); +- } +- rcu_read_unlock(); +- if (!rollover) ++ if (!po->rollover) + return -EINVAL; ++ rstats.tp_all = atomic_long_read(&po->rollover->num); ++ rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); ++ rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); ++ data = &rstats; ++ lv = sizeof(rstats); + break; + case PACKET_TX_HAS_OFF: + val = po->tp_tx_has_off; +diff --git a/net/packet/internal.h b/net/packet/internal.h +index 9ee46314b7d7..d55bfc34d6b3 100644 +--- a/net/packet/internal.h ++++ b/net/packet/internal.h +@@ -92,7 +92,6 @@ struct packet_fanout { + + struct packet_rollover { + int sock; +- struct rcu_head rcu; + atomic_long_t num; + atomic_long_t num_huge; + atomic_long_t num_failed; +-- +2.15.1 + diff --git a/patches.fixes/route-also-update-fnhe_genid-when-updating-a-route-c.patch b/patches.fixes/route-also-update-fnhe_genid-when-updating-a-route-c.patch new file mode 100644 index 0000000..77030c7 --- /dev/null +++ b/patches.fixes/route-also-update-fnhe_genid-when-updating-a-route-c.patch @@ -0,0 +1,66 @@ +From: Xin Long +Date: Fri, 17 Nov 2017 14:27:18 +0800 +Subject: route: also update fnhe_genid when updating a route cache +Patch-mainline: v4.15-rc1 +Git-commit: cebe84c6190d741045a322f5343f717139993c08 +References: bsc#1076830 + +Now when ip route flush cache and it turn out all fnhe_genid != genid. +If a redirect/pmtu icmp packet comes and the old fnhe is found and all +it's members but fnhe_genid will be updated. + +Then next time when it looks up route and tries to rebind this fnhe to +the new dst, the fnhe will be flushed due to fnhe_genid != genid. 
It +causes this redirect/pmtu icmp packet actually not to be applied. + +This patch is to also reset fnhe_genid when updating a route cache. + +Fixes: 5aad1de5ea2c ("ipv4: use separate genid for next hop exceptions") +Acked-by: Hannes Frederic Sowa +Signed-off-by: Xin Long +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/route.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/route.c b/net/ipv4/route.c +index 3651be58b128..d56a0491550e 100644 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -649,9 +649,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe; + struct rtable *rt; ++ u32 genid, hval; + unsigned int i; + int depth; +- u32 hval = fnhe_hashfun(daddr); ++ ++ genid = fnhe_genid(dev_net(nh->nh_dev)); ++ hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + +@@ -674,6 +677,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + } + + if (fnhe) { ++ if (fnhe->fnhe_genid != genid) ++ fnhe->fnhe_genid = genid; + if (gw) + fnhe->fnhe_gw = gw; + if (pmtu) { +@@ -698,7 +703,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); + } +- fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); ++ fnhe->fnhe_genid = genid; + fnhe->fnhe_daddr = daddr; + fnhe->fnhe_gw = gw; + fnhe->fnhe_pmtu = pmtu; +-- +2.15.1 + diff --git a/patches.fixes/sctp-Fix-a-big-endian-bug-in-sctp_diag_dump.patch b/patches.fixes/sctp-Fix-a-big-endian-bug-in-sctp_diag_dump.patch new file mode 100644 index 0000000..1698a79 --- /dev/null +++ b/patches.fixes/sctp-Fix-a-big-endian-bug-in-sctp_diag_dump.patch @@ -0,0 +1,48 @@ +From: Dan Carpenter +Date: Mon, 25 Sep 2017 13:19:26 +0300 +Subject: sctp: Fix a big endian bug in sctp_diag_dump() +Patch-mainline: v4.14-rc4 +Git-commit: 
c2cc187e53011c1c4931055984657da9085c763b +References: bsc#1076830 + +The sctp_for_each_transport() function takes a pointer to int. The +cb->args[] array holds longs so it's only using the high 32 bits. It +works on little endian systems but will break on big endian 64 bit +machines. + +Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") +Signed-off-by: Dan Carpenter +Acked-by: Neil Horman +Reviewed-by: Xin Long +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/sctp/sctp_diag.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c +index 7008a992749b..83b04d73f1fa 100644 +--- a/net/sctp/sctp_diag.c ++++ b/net/sctp/sctp_diag.c +@@ -464,6 +464,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), + }; ++ int pos = cb->args[2]; + + /* eps hashtable dumps + * args: +@@ -494,7 +495,8 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + goto done; + + sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, +- net, (int *)&cb->args[2], &commp); ++ net, &pos, &commp); ++ cb->args[2] = pos; + + done: + cb->args[1] = cb->args[4]; +-- +2.16.0 + diff --git a/patches.fixes/sctp-Replace-use-of-sockets_allocated-with-specified.patch b/patches.fixes/sctp-Replace-use-of-sockets_allocated-with-specified.patch new file mode 100644 index 0000000..21fb41e --- /dev/null +++ b/patches.fixes/sctp-Replace-use-of-sockets_allocated-with-specified.patch @@ -0,0 +1,47 @@ +From: Tonghao Zhang +Date: Fri, 22 Dec 2017 10:15:20 -0800 +Subject: sctp: Replace use of sockets_allocated with specified macro. +Patch-mainline: v4.15-rc6 +Git-commit: 8cb38a602478e9f806571f6920b0a3298aabf042 +References: bsc#1076830 + +The patch(180d8cd942ce) replaces all uses of struct sock fields' +memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem +to accessor macros. 
But the sockets_allocated field of sctp sock is +not replaced at all. Then replace it now for unifying the code. + +Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.") +Cc: Glauber Costa +Signed-off-by: Tonghao Zhang +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/sctp/socket.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/sctp/socket.c b/net/sctp/socket.c +index 26b3de0d826f..1957cd1b7d3f 100644 +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -4378,7 +4378,7 @@ static int sctp_init_sock(struct sock *sk) + SCTP_DBG_OBJCNT_INC(sock); + + local_bh_disable(); +- percpu_counter_inc(&sctp_sockets_allocated); ++ sk_sockets_allocated_inc(sk); + sock_prot_inuse_add(net, sk->sk_prot, 1); + + /* Nothing can fail after this block, otherwise +@@ -4422,7 +4422,7 @@ static void sctp_destroy_sock(struct sock *sk) + } + sctp_endpoint_free(sp->ep); + local_bh_disable(); +- percpu_counter_dec(&sctp_sockets_allocated); ++ sk_sockets_allocated_dec(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + local_bh_enable(); + } +-- +2.15.1 + diff --git a/patches.fixes/sctp-fix-an-use-after-free-issue-in-sctp_sock_dump.patch b/patches.fixes/sctp-fix-an-use-after-free-issue-in-sctp_sock_dump.patch new file mode 100644 index 0000000..ea0ba5e --- /dev/null +++ b/patches.fixes/sctp-fix-an-use-after-free-issue-in-sctp_sock_dump.patch @@ -0,0 +1,209 @@ +From: Xin Long +Date: Fri, 15 Sep 2017 11:02:21 +0800 +Subject: sctp: fix an use-after-free issue in sctp_sock_dump +Patch-mainline: v4.14-rc1 +Git-commit: d25adbeb0cdb860fb39e09cdd025e9cfc954c5ab +References: bsc#1076830 + +Commit 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the +dump") tried to fix an use-after-free issue by checking !sctp_sk(sk)->ep +with holding sock and sock lock. + +But Paolo noticed that endpoint could be destroyed in sctp_rcv without +sock lock protection. 
It means the use-after-free issue still could be +triggered when sctp_rcv put and destroy ep after sctp_sock_dump checks +!ep, although it's pretty hard to reproduce. + +I could reproduce it by mdelay in sctp_rcv while msleep in sctp_close +and sctp_sock_dump long time. + +This patch is to add another param cb_done to sctp_for_each_transport +and dump ep->assocs with holding tsp after jumping out of transport's +traversal in it to avoid this issue. + +It can also improve sctp diag dump to make it run faster, as no need +to save sk into cb->args[5] and keep calling sctp_for_each_transport +any more. + +This patch is also to use int * instead of int for the pos argument +in sctp_for_each_transport, which could make position increment only +in sctp_for_each_transport and no need to keep changing cb->args[2] +in sctp_sock_filter and sctp_sock_dump any more. + +Fixes: 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the dump") +Reported-by: Paolo Abeni +Signed-off-by: Xin Long +Acked-by: Marcelo Ricardo Leitner +Acked-by: Neil Horman +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + include/net/sctp/sctp.h | 3 ++- + net/sctp/sctp_diag.c | 32 +++++++++----------------------- + net/sctp/socket.c | 40 +++++++++++++++++++++++++--------------- + 3 files changed, 36 insertions(+), 39 deletions(-) + +diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h +index 06db0c3ec384..7c55b55b7dfa 100644 +--- a/include/net/sctp/sctp.h ++++ b/include/net/sctp/sctp.h +@@ -127,7 +127,8 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), + const union sctp_addr *laddr, + const union sctp_addr *paddr, void *p); + int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), +- struct net *net, int pos, void *p); ++ int (*cb_done)(struct sctp_transport *, void *), ++ struct net *net, int *pos, void *p); + int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); + int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, + struct sctp_info *info); +diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c +index e99518e79b52..7008a992749b 100644 +--- a/net/sctp/sctp_diag.c ++++ b/net/sctp/sctp_diag.c +@@ -279,9 +279,11 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) + return err; + } + +-static int sctp_sock_dump(struct sock *sk, void *p) ++static int sctp_sock_dump(struct sctp_transport *tsp, void *p) + { ++ struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; ++ struct sock *sk = ep->base.sk; + struct sk_buff *skb = commp->skb; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; +@@ -289,9 +291,7 @@ static int sctp_sock_dump(struct sock *sk, void *p) + int err = 0; + + lock_sock(sk); +- if (!sctp_sk(sk)->ep) +- goto release; +- list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) { ++ list_for_each_entry(assoc, &ep->asocs, asocs) { + if (cb->args[4] < cb->args[1]) + goto next; + +@@ -327,40 +327,30 @@ static int sctp_sock_dump(struct sock *sk, void *p) + 
cb->args[4]++; + } + cb->args[1] = 0; +- cb->args[2]++; + cb->args[3] = 0; + cb->args[4] = 0; + release: + release_sock(sk); +- sock_put(sk); + return err; + } + +-static int sctp_get_sock(struct sctp_transport *tsp, void *p) ++static int sctp_sock_filter(struct sctp_transport *tsp, void *p) + { + struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; +- struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct sctp_association *assoc = + list_entry(ep->asocs.next, struct sctp_association, asocs); + + /* find the ep only once through the transports by this condition */ + if (tsp->asoc != assoc) +- goto out; ++ return 0; + + if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) +- goto out; +- +- sock_hold(sk); +- cb->args[5] = (long)sk; ++ return 0; + + return 1; +- +-out: +- cb->args[2]++; +- return 0; + } + + static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) +@@ -503,12 +493,8 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) + goto done; + +-next: +- cb->args[5] = 0; +- sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp); +- +- if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp)) +- goto next; ++ sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, ++ net, (int *)&cb->args[2], &commp); + + done: + cb->args[1] = cb->args[4]; +diff --git a/net/sctp/socket.c b/net/sctp/socket.c +index 1957cd1b7d3f..9949c8c50b17 100644 +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -4660,29 +4660,39 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), + EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); + + int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), +- struct net *net, int pos, void *p) { ++ int (*cb_done)(struct sctp_transport *, void *), ++ struct net *net, int *pos, void *p) { + 
struct rhashtable_iter hti; +- void *obj; +- int err; +- +- err = sctp_transport_walk_start(&hti); +- if (err) +- return err; ++ struct sctp_transport *tsp; ++ int ret; + +- obj = sctp_transport_get_idx(net, &hti, pos + 1); +- for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) { +- struct sctp_transport *transport = obj; ++again: ++ ret = sctp_transport_walk_start(&hti); ++ if (ret) ++ return ret; + +- if (!sctp_transport_hold(transport)) ++ tsp = sctp_transport_get_idx(net, &hti, *pos + 1); ++ for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { ++ if (!sctp_transport_hold(tsp)) + continue; +- err = cb(transport, p); +- sctp_transport_put(transport); +- if (err) ++ ret = cb(tsp, p); ++ if (ret) + break; ++ (*pos)++; ++ sctp_transport_put(tsp); + } + sctp_transport_walk_stop(&hti); + +- return err; ++ if (ret) { ++ if (cb_done && !cb_done(tsp, p)) { ++ (*pos)++; ++ sctp_transport_put(tsp); ++ goto again; ++ } ++ sctp_transport_put(tsp); ++ } ++ ++ return ret; + } + EXPORT_SYMBOL_GPL(sctp_for_each_transport); + +-- +2.15.1 + diff --git a/patches.fixes/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_.patch b/patches.fixes/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_.patch new file mode 100644 index 0000000..f8fca5e --- /dev/null +++ b/patches.fixes/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_.patch @@ -0,0 +1,58 @@ +From: Xin Long +Date: Sun, 10 Dec 2017 15:40:51 +0800 +Subject: sctp: make sure stream nums can match optlen in sctp_setsockopt_reset_streams +Patch-mainline: v4.15-rc4 +Git-commit: 2342b8d95bcae5946e1b9b8d58645f37500ef2e7 +References: bsc#1076830 + +Now in sctp_setsockopt_reset_streams, it only does the check +optlen < sizeof(*params) for optlen. But it's not enough, as +params->srs_number_streams should also match optlen. 
+ +If the streams in params->srs_stream_list are less than stream +nums in params->srs_number_streams, later when dereferencing +the stream list, it could cause a slab-out-of-bounds crash, as +reported by syzbot. + +This patch is to fix it by also checking the stream numbers in +sctp_setsockopt_reset_streams to make sure at least it's not +greater than the streams in the list. + +Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter") +Reported-by: Dmitry Vyukov +Signed-off-by: Xin Long +Acked-by: Marcelo Ricardo Leitner +Acked-by: Neil Horman +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/sctp/socket.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/net/sctp/socket.c b/net/sctp/socket.c +index 19630459712d..26b3de0d826f 100644 +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -3835,13 +3835,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, + struct sctp_association *asoc; + int retval = -EINVAL; + +- if (optlen < sizeof(struct sctp_reset_streams)) ++ if (optlen < sizeof(*params)) + return -EINVAL; + + params = memdup_user(optval, optlen); + if (IS_ERR(params)) + return PTR_ERR(params); + ++ if (params->srs_number_streams * sizeof(__u16) > ++ optlen - sizeof(*params)) ++ goto out; ++ + asoc = sctp_id2assoc(sk, params->srs_assoc_id); + if (!asoc) + goto out; +-- +2.15.1 + diff --git a/patches.fixes/tap-double-free-in-error-path-in-tap_open.patch b/patches.fixes/tap-double-free-in-error-path-in-tap_open.patch new file mode 100644 index 0000000..51a5531 --- /dev/null +++ b/patches.fixes/tap-double-free-in-error-path-in-tap_open.patch @@ -0,0 +1,69 @@ +From: Girish Moodalbail +Date: Wed, 25 Oct 2017 00:23:04 -0700 +Subject: tap: double-free in error path in tap_open() +Patch-mainline: v4.14-rc7 +Git-commit: 78e0ea6791d7baafb8a0ca82b1bd0c7b3453c919 +References: bsc#1076830 + +Double free of skb_array in tap module is causing kernel panic. 
When +tap_set_queue() fails we free skb_array right away by calling +skb_array_cleanup(). However, later on skb_array_cleanup() is called +again by tap_sock_destruct through sock_put(). This patch fixes that +issue. + +Fixes: 362899b8725b35e3 (macvtap: switch to use skb array) +Signed-off-by: Girish Moodalbail +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + drivers/net/tap.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/drivers/net/tap.c b/drivers/net/tap.c +index 7ba9b3eaa29e..ad8072c29997 100644 +--- a/drivers/net/tap.c ++++ b/drivers/net/tap.c +@@ -517,6 +517,10 @@ static int tap_open(struct inode *inode, struct file *file) + &tap_proto, 0); + if (!q) + goto err; ++ if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) { ++ sk_free(&q->sk); ++ goto err; ++ } + + RCU_INIT_POINTER(q->sock.wq, &q->wq); + init_waitqueue_head(&q->wq.wait); +@@ -540,22 +544,18 @@ static int tap_open(struct inode *inode, struct file *file) + if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) + sock_set_flag(&q->sk, SOCK_ZEROCOPY); + +- err = -ENOMEM; +- if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) +- goto err_array; +- + err = tap_set_queue(tap, file, q); +- if (err) +- goto err_queue; ++ if (err) { ++ /* tap_sock_destruct() will take care of freeing skb_array */ ++ goto err_put; ++ } + + dev_put(tap->dev); + + rtnl_unlock(); + return err; + +-err_queue: +- skb_array_cleanup(&q->skb_array); +-err_array: ++err_put: + sock_put(&q->sk); + err: + if (tap) +-- +2.15.1 + diff --git a/patches.fixes/tcp-bic-cubic-use-tcp_jiffies32-instead-of-tcp_time_.patch b/patches.fixes/tcp-bic-cubic-use-tcp_jiffies32-instead-of-tcp_time_.patch new file mode 100644 index 0000000..0715254 --- /dev/null +++ b/patches.fixes/tcp-bic-cubic-use-tcp_jiffies32-instead-of-tcp_time_.patch @@ -0,0 +1,93 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:06 
-0700 +Subject: tcp: bic, cubic: use tcp_jiffies32 instead of tcp_time_stamp +Patch-mainline: v4.13-rc1 +Git-commit: ac35f562203a45a04f79f412509df48857f928be +References: bsc#1061739 + +Use tcp_jiffies32 instead of tcp_time_stamp, since +tcp_time_stamp will soon be only used for TCP TS option. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_bic.c | 6 +++--- + net/ipv4/tcp_cubic.c | 12 ++++++------ + 2 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c +index 36087bca9f48..609965f0e298 100644 +--- a/net/ipv4/tcp_bic.c ++++ b/net/ipv4/tcp_bic.c +@@ -84,14 +84,14 @@ static void bictcp_init(struct sock *sk) + static inline void bictcp_update(struct bictcp *ca, u32 cwnd) + { + if (ca->last_cwnd == cwnd && +- (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) ++ (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; +- ca->last_time = tcp_time_stamp; ++ ca->last_time = tcp_jiffies32; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ +- ca->epoch_start = tcp_time_stamp; ++ ca->epoch_start = tcp_jiffies32; + + /* start off normal */ + if (cwnd <= low_window) { +diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c +index 2052ca740916..57ae5b5ae643 100644 +--- a/net/ipv4/tcp_cubic.c ++++ b/net/ipv4/tcp_cubic.c +@@ -231,21 +231,21 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) + ca->ack_cnt += acked; /* count the number of ACKed packets */ + + if (ca->last_cwnd == cwnd && +- (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) ++ (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) + return; + + /* The CUBIC function can update ca->cnt at most once per jiffy. + * On all cwnd reduction events, ca->epoch_start is set to 0, + * which will force a recalculation of ca->cnt. 
+ */ +- if (ca->epoch_start && tcp_time_stamp == ca->last_time) ++ if (ca->epoch_start && tcp_jiffies32 == ca->last_time) + goto tcp_friendliness; + + ca->last_cwnd = cwnd; +- ca->last_time = tcp_time_stamp; ++ ca->last_time = tcp_jiffies32; + + if (ca->epoch_start == 0) { +- ca->epoch_start = tcp_time_stamp; /* record beginning */ ++ ca->epoch_start = tcp_jiffies32; /* record beginning */ + ca->ack_cnt = acked; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + +@@ -276,7 +276,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) + * if the cwnd < 1 million packets !!! + */ + +- t = (s32)(tcp_time_stamp - ca->epoch_start); ++ t = (s32)(tcp_jiffies32 - ca->epoch_start); + t += msecs_to_jiffies(ca->delay_min >> 3); + /* change the unit from HZ to bictcp_HZ */ + t <<= BICTCP_HZ; +@@ -448,7 +448,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) + return; + + /* Discard delay samples right after fast recovery */ +- if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) ++ if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) + return; + + delay = (sample->rtt_us << 3) / USEC_PER_MSEC; +-- +2.15.1 + diff --git a/patches.fixes/tcp-dccp-fix-ireq-opt-races.patch b/patches.fixes/tcp-dccp-fix-ireq-opt-races.patch new file mode 100644 index 0000000..b5eac18 --- /dev/null +++ b/patches.fixes/tcp-dccp-fix-ireq-opt-races.patch @@ -0,0 +1,427 @@ +From: Eric Dumazet +Date: Fri, 20 Oct 2017 09:04:13 -0700 +Subject: tcp/dccp: fix ireq->opt races +Patch-mainline: v4.14-rc6 +Git-commit: c92e8c02fe664155ac4234516e32544bec0f113d +References: bsc#1076830 + +syzkaller found another bug in DCCP/TCP stacks [1] + +For the reasons explained in commit ce1050089c96 ("tcp/dccp: fix +ireq->pktopts race"), we need to make sure we do not access +ireq->opt unless we own the request sock. + +Note the opt field is renamed to ireq_opt to ease grep games. 
+ +[1] +BUG: KASAN: use-after-free in ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474 +Read of size 1 at addr ffff8801c951039c by task syz-executor5/3295 + +CPU: 1 PID: 3295 Comm: syz-executor5 Not tainted 4.14.0-rc4+ #80 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +Call Trace: + __dump_stack lib/dump_stack.c:16 [inline] + dump_stack+0x194/0x257 lib/dump_stack.c:52 + print_address_description+0x73/0x250 mm/kasan/report.c:252 + kasan_report_error mm/kasan/report.c:351 [inline] + kasan_report+0x25b/0x340 mm/kasan/report.c:409 + __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:427 + ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474 + tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1135 + tcp_send_ack.part.37+0x3bb/0x650 net/ipv4/tcp_output.c:3587 + tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3557 + __tcp_ack_snd_check+0x2c6/0x4b0 net/ipv4/tcp_input.c:5072 + tcp_ack_snd_check net/ipv4/tcp_input.c:5085 [inline] + tcp_rcv_state_process+0x2eff/0x4850 net/ipv4/tcp_input.c:6071 + tcp_child_process+0x342/0x990 net/ipv4/tcp_minisocks.c:816 + tcp_v4_rcv+0x1827/0x2f80 net/ipv4/tcp_ipv4.c:1682 + ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 + NF_HOOK include/linux/netfilter.h:249 [inline] + ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 + dst_input include/net/dst.h:464 [inline] + ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 + NF_HOOK include/linux/netfilter.h:249 [inline] + ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 + __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 + __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 + netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 + netif_receive_skb+0xae/0x390 net/core/dev.c:4611 + tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 + tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 + tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 + call_write_iter include/linux/fs.h:1770 [inline] + new_sync_write 
fs/read_write.c:468 [inline] + __vfs_write+0x68a/0x970 fs/read_write.c:481 + vfs_write+0x18f/0x510 fs/read_write.c:543 + SYSC_write fs/read_write.c:588 [inline] + SyS_write+0xef/0x220 fs/read_write.c:580 + entry_SYSCALL_64_fastpath+0x1f/0xbe +RIP: 0033:0x40c341 +RSP: 002b:00007f469523ec10 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 +RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 000000000040c341 +RDX: 0000000000000037 RSI: 0000000020004000 RDI: 0000000000000015 +RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 +R10: 00000000000f4240 R11: 0000000000000293 R12: 00000000004b7fd1 +R13: 00000000ffffffff R14: 0000000020000000 R15: 0000000000025000 + +Allocated by task 3295: + save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 + save_stack+0x43/0xd0 mm/kasan/kasan.c:447 + set_track mm/kasan/kasan.c:459 [inline] + kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 + __do_kmalloc mm/slab.c:3725 [inline] + __kmalloc+0x162/0x760 mm/slab.c:3734 + kmalloc include/linux/slab.h:498 [inline] + tcp_v4_save_options include/net/tcp.h:1962 [inline] + tcp_v4_init_req+0x2d3/0x3e0 net/ipv4/tcp_ipv4.c:1271 + tcp_conn_request+0xf6d/0x3410 net/ipv4/tcp_input.c:6283 + tcp_v4_conn_request+0x157/0x210 net/ipv4/tcp_ipv4.c:1313 + tcp_rcv_state_process+0x8ea/0x4850 net/ipv4/tcp_input.c:5857 + tcp_v4_do_rcv+0x55c/0x7d0 net/ipv4/tcp_ipv4.c:1482 + tcp_v4_rcv+0x2d10/0x2f80 net/ipv4/tcp_ipv4.c:1711 + ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 + NF_HOOK include/linux/netfilter.h:249 [inline] + ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 + dst_input include/net/dst.h:464 [inline] + ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 + NF_HOOK include/linux/netfilter.h:249 [inline] + ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 + __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 + __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 + netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 + netif_receive_skb+0xae/0x390 net/core/dev.c:4611 + 
tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 + tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 + tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 + call_write_iter include/linux/fs.h:1770 [inline] + new_sync_write fs/read_write.c:468 [inline] + __vfs_write+0x68a/0x970 fs/read_write.c:481 + vfs_write+0x18f/0x510 fs/read_write.c:543 + SYSC_write fs/read_write.c:588 [inline] + SyS_write+0xef/0x220 fs/read_write.c:580 + entry_SYSCALL_64_fastpath+0x1f/0xbe + +Freed by task 3306: + save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 + save_stack+0x43/0xd0 mm/kasan/kasan.c:447 + set_track mm/kasan/kasan.c:459 [inline] + kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 + __cache_free mm/slab.c:3503 [inline] + kfree+0xca/0x250 mm/slab.c:3820 + inet_sock_destruct+0x59d/0x950 net/ipv4/af_inet.c:157 + __sk_destruct+0xfd/0x910 net/core/sock.c:1560 + sk_destruct+0x47/0x80 net/core/sock.c:1595 + __sk_free+0x57/0x230 net/core/sock.c:1603 + sk_free+0x2a/0x40 net/core/sock.c:1614 + sock_put include/net/sock.h:1652 [inline] + inet_csk_complete_hashdance+0xd5/0xf0 net/ipv4/inet_connection_sock.c:959 + tcp_check_req+0xf4d/0x1620 net/ipv4/tcp_minisocks.c:765 + tcp_v4_rcv+0x17f6/0x2f80 net/ipv4/tcp_ipv4.c:1675 + ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 + NF_HOOK include/linux/netfilter.h:249 [inline] + ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 + dst_input include/net/dst.h:464 [inline] + ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 + NF_HOOK include/linux/netfilter.h:249 [inline] + ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 + __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 + __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 + netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 + netif_receive_skb+0xae/0x390 net/core/dev.c:4611 + tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 + tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 + tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 + call_write_iter 
include/linux/fs.h:1770 [inline] + new_sync_write fs/read_write.c:468 [inline] + __vfs_write+0x68a/0x970 fs/read_write.c:481 + vfs_write+0x18f/0x510 fs/read_write.c:543 + SYSC_write fs/read_write.c:588 [inline] + SyS_write+0xef/0x220 fs/read_write.c:580 + entry_SYSCALL_64_fastpath+0x1f/0xbe + +Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") +Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/inet_sock.h | 2 +- + net/dccp/ipv4.c | 13 ++++++++----- + net/ipv4/cipso_ipv4.c | 24 +++++++----------------- + net/ipv4/inet_connection_sock.c | 8 +++----- + net/ipv4/syncookies.c | 2 +- + net/ipv4/tcp_input.c | 2 +- + net/ipv4/tcp_ipv4.c | 22 +++++++++++++--------- + 7 files changed, 34 insertions(+), 39 deletions(-) + +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index aa95053dfc78..425752f768d2 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -96,7 +96,7 @@ struct inet_request_sock { + kmemcheck_bitfield_end(flags); + u32 ir_mark; + union { +- struct ip_options_rcu *opt; ++ struct ip_options_rcu __rcu *ireq_opt; + #if IS_ENABLED(CONFIG_IPV6) + struct { + struct ipv6_txoptions *ipv6_opt; +diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c +index 97368f229876..310fdd16df7e 100644 +--- a/net/dccp/ipv4.c ++++ b/net/dccp/ipv4.c +@@ -414,8 +414,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); + newinet->inet_saddr = ireq->ir_loc_addr; +- newinet->inet_opt = ireq->opt; +- ireq->opt = NULL; ++ RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = ip_hdr(skb)->ttl; + newinet->inet_id = jiffies; +@@ -430,7 +429,10 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, + if (__inet_inherit_port(sk, newsk) 
< 0) + goto put_and_exit; + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); +- ++ if (*own_req) ++ ireq->ireq_opt = NULL; ++ else ++ newinet->inet_opt = NULL; + return newsk; + + exit_overflow: +@@ -441,6 +443,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); + return NULL; + put_and_exit: ++ newinet->inet_opt = NULL; + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); + goto exit; +@@ -492,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req + ireq->ir_rmt_addr); + err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, + ireq->ir_rmt_addr, +- ireq->opt); ++ rcu_dereference(ireq->ireq_opt)); + err = net_xmit_eval(err); + } + +@@ -548,7 +551,7 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) + static void dccp_v4_reqsk_destructor(struct request_sock *req) + { + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); +- kfree(inet_rsk(req)->opt); ++ kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); + } + + void dccp_syn_ack_timeout(const struct request_sock *req) +diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c +index ae206163c273..972353cd1778 100644 +--- a/net/ipv4/cipso_ipv4.c ++++ b/net/ipv4/cipso_ipv4.c +@@ -1943,7 +1943,7 @@ int cipso_v4_req_setattr(struct request_sock *req, + buf = NULL; + + req_inet = inet_rsk(req); +- opt = xchg(&req_inet->opt, opt); ++ opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); + if (opt) + kfree_rcu(opt, rcu); + +@@ -1965,11 +1965,13 @@ int cipso_v4_req_setattr(struct request_sock *req, + * values on failure. 
+ * + */ +-static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) ++static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) + { ++ struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); + int hdr_delta = 0; +- struct ip_options_rcu *opt = *opt_ptr; + ++ if (!opt || opt->opt.cipso == 0) ++ return 0; + if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { + u8 cipso_len; + u8 cipso_off; +@@ -2031,14 +2033,10 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) + */ + void cipso_v4_sock_delattr(struct sock *sk) + { +- int hdr_delta; +- struct ip_options_rcu *opt; + struct inet_sock *sk_inet; ++ int hdr_delta; + + sk_inet = inet_sk(sk); +- opt = rcu_dereference_protected(sk_inet->inet_opt, 1); +- if (!opt || opt->opt.cipso == 0) +- return; + + hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); + if (sk_inet->is_icsk && hdr_delta > 0) { +@@ -2058,15 +2056,7 @@ void cipso_v4_sock_delattr(struct sock *sk) + */ + void cipso_v4_req_delattr(struct request_sock *req) + { +- struct ip_options_rcu *opt; +- struct inet_request_sock *req_inet; +- +- req_inet = inet_rsk(req); +- opt = req_inet->opt; +- if (!opt || opt->opt.cipso == 0) +- return; +- +- cipso_v4_delopt(&req_inet->opt); ++ cipso_v4_delopt(&inet_rsk(req)->ireq_opt); + } + + /** +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index e53a9a862489..ff862662e8c2 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -536,9 +536,10 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, + { + const struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = read_pnet(&ireq->ireq_net); +- struct ip_options_rcu *opt = ireq->opt; ++ struct ip_options_rcu *opt; + struct rtable *rt; + ++ opt = rcu_dereference(ireq->ireq_opt); + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), +@@ -572,10 +573,9 @@ struct 
dst_entry *inet_csk_route_child_sock(const struct sock *sk, + struct flowi4 *fl4; + struct rtable *rt; + ++ opt = rcu_dereference(ireq->ireq_opt); + fl4 = &newinet->cork.fl.u.ip4; + +- rcu_read_lock(); +- opt = rcu_dereference(newinet->inet_opt); + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), +@@ -588,13 +588,11 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, + goto no_route; + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + goto route_err; +- rcu_read_unlock(); + return &rt->dst; + + route_err: + ip_rt_put(rt); + no_route: +- rcu_read_unlock(); + __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; + } +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c +index e6e5ee76ee29..853e0e369579 100644 +--- a/net/ipv4/syncookies.c ++++ b/net/ipv4/syncookies.c +@@ -352,7 +352,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ +- ireq->opt = tcp_v4_save_options(sock_net(sk), skb); ++ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); + + if (security_inet_conn_request(sk, skb, req)) { + reqsk_free(req); +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 242795293050..760c623c3518 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -6163,7 +6163,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct inet_request_sock *ireq = inet_rsk(req); + + kmemcheck_annotate_bitfield(ireq, flags); +- ireq->opt = NULL; ++ ireq->ireq_opt = NULL; + #if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; + #endif +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index a787203eab98..95705860a577 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -875,7 +875,7 @@ static int tcp_v4_send_synack(const 
struct sock *sk, struct dst_entry *dst, + + err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, + ireq->ir_rmt_addr, +- ireq->opt); ++ rcu_dereference(ireq->ireq_opt)); + err = net_xmit_eval(err); + } + +@@ -887,7 +887,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, + */ + static void tcp_v4_reqsk_destructor(struct request_sock *req) + { +- kfree(inet_rsk(req)->opt); ++ kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); + } + + #ifdef CONFIG_TCP_MD5SIG +@@ -1210,10 +1210,11 @@ static void tcp_v4_init_req(struct request_sock *req, + struct sk_buff *skb) + { + struct inet_request_sock *ireq = inet_rsk(req); ++ struct net *net = sock_net(sk_listener); + + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); +- ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); ++ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); + } + + static struct dst_entry *tcp_v4_route_req(const struct sock *sk, +@@ -1300,10 +1301,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); + newsk->sk_bound_dev_if = ireq->ir_iif; +- newinet->inet_saddr = ireq->ir_loc_addr; +- inet_opt = ireq->opt; +- rcu_assign_pointer(newinet->inet_opt, inet_opt); +- ireq->opt = NULL; ++ newinet->inet_saddr = ireq->ir_loc_addr; ++ inet_opt = rcu_dereference(ireq->ireq_opt); ++ RCU_INIT_POINTER(newinet->inet_opt, inet_opt); + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = ip_hdr(skb)->ttl; + newinet->rcv_tos = ip_hdr(skb)->tos; +@@ -1348,9 +1348,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); +- if (*own_req) ++ if (likely(*own_req)) { + tcp_move_syn(newtp, req); +- ++ ireq->ireq_opt = NULL; ++ } else { ++ 
newinet->inet_opt = NULL; ++ } + return newsk; + + exit_overflow: +@@ -1361,6 +1364,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + tcp_listendrop(sk); + return NULL; + put_and_exit: ++ newinet->inet_opt = NULL; + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto exit; +-- +2.15.1 + diff --git a/patches.fixes/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch b/patches.fixes/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch new file mode 100644 index 0000000..8a564a4 --- /dev/null +++ b/patches.fixes/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch @@ -0,0 +1,52 @@ +From: Eric Dumazet +Date: Sun, 22 Oct 2017 12:33:57 -0700 +Subject: tcp/dccp: fix lockdep splat in inet_csk_route_req() +Patch-mainline: v4.14-rc7 +Git-commit: a6ca7abe53633d08eea1c6756cb49c9b2d4c90bf +References: bsc#1076830 + +This patch fixes the following lockdep splat in inet_csk_route_req() + + lockdep_rcu_suspicious + inet_csk_route_req + tcp_v4_send_synack + tcp_rtx_synack + inet_rtx_syn_ack + tcp_fastopen_synack_time + tcp_retransmit_timer + tcp_write_timer_handler + tcp_write_timer + call_timer_fn + +Thread running inet_csk_route_req() owns a reference on the request +socket, so we have the guarantee ireq->ireq_opt wont be changed or +freed. + +lockdep can enforce this invariant for us. + +Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races") +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/inet_connection_sock.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ff862662e8c2..446be2ddb014 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -539,7 +539,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, + struct ip_options_rcu *opt; + struct rtable *rt; + +- opt = rcu_dereference(ireq->ireq_opt); ++ opt = rcu_dereference_protected(ireq->ireq_opt, ++ refcount_read(&req->rsk_refcnt) > 0); + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), +-- +2.16.0 + diff --git a/patches.fixes/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch b/patches.fixes/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch new file mode 100644 index 0000000..33ee687 --- /dev/null +++ b/patches.fixes/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch @@ -0,0 +1,125 @@ +From: Eric Dumazet +Date: Tue, 24 Oct 2017 08:20:31 -0700 +Subject: tcp/dccp: fix other lockdep splats accessing ireq_opt +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Patch-mainline: v4.14-rc7 +Git-commit: 06f877d613be3621604c2520ec0351d9fbdca15f +References: bsc#1076830 + +In my first attempt to fix the lockdep splat, I forgot we could +enter inet_csk_route_req() with a freshly allocated request socket, +for which refcount has not yet been elevated, due to complex +SLAB_TYPESAFE_BY_RCU rules. + +We either are in rcu_read_lock() section _or_ we own a refcount on the +request. + +Correct RCU verb to use here is rcu_dereference_check(), although it is +not possible to prove we actually own a reference on a shared +refcount :/ + +In v2, I added ireq_opt_deref() helper and use in three places, to fix other +possible splats. 
+ +[ 49.844590] lockdep_rcu_suspicious+0xea/0xf3 +[ 49.846487] inet_csk_route_req+0x53/0x14d +[ 49.848334] tcp_v4_route_req+0xe/0x10 +[ 49.850174] tcp_conn_request+0x31c/0x6a0 +[ 49.851992] ? __lock_acquire+0x614/0x822 +[ 49.854015] tcp_v4_conn_request+0x5a/0x79 +[ 49.855957] ? tcp_v4_conn_request+0x5a/0x79 +[ 49.858052] tcp_rcv_state_process+0x98/0xdcc +[ 49.859990] ? sk_filter_trim_cap+0x2f6/0x307 +[ 49.862085] tcp_v4_do_rcv+0xfc/0x145 +[ 49.864055] ? tcp_v4_do_rcv+0xfc/0x145 +[ 49.866173] tcp_v4_rcv+0x5ab/0xaf9 +[ 49.868029] ip_local_deliver_finish+0x1af/0x2e7 +[ 49.870064] ip_local_deliver+0x1b2/0x1c5 +[ 49.871775] ? inet_del_offload+0x45/0x45 +[ 49.873916] ip_rcv_finish+0x3f7/0x471 +[ 49.875476] ip_rcv+0x3f1/0x42f +[ 49.876991] ? ip_local_deliver_finish+0x2e7/0x2e7 +[ 49.878791] __netif_receive_skb_core+0x6d3/0x950 +[ 49.880701] ? process_backlog+0x7e/0x216 +[ 49.882589] __netif_receive_skb+0x1d/0x5e +[ 49.884122] process_backlog+0x10c/0x216 +[ 49.885812] net_rx_action+0x147/0x3df + +Fixes: a6ca7abe53633 ("tcp/dccp: fix lockdep splat in inet_csk_route_req()") +Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races") +Signed-off-by: Eric Dumazet +Reported-by: kernel test robot +Reported-by: Maciej Żenczykowski +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + include/net/inet_sock.h | 6 ++++++ + net/dccp/ipv4.c | 2 +- + net/ipv4/inet_connection_sock.c | 4 ++-- + net/ipv4/tcp_ipv4.c | 2 +- + 4 files changed, 10 insertions(+), 4 deletions(-) + +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index 425752f768d2..db8162dd8c0b 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -132,6 +132,12 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, + return sk->sk_bound_dev_if; + } + ++static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) ++{ ++ return rcu_dereference_check(ireq->ireq_opt, ++ refcount_read(&ireq->req.rsk_refcnt) > 0); ++} ++ + struct inet_cork { + unsigned int flags; + __be32 addr; +diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c +index 310fdd16df7e..02f56423fb01 100644 +--- a/net/dccp/ipv4.c ++++ b/net/dccp/ipv4.c +@@ -495,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req + ireq->ir_rmt_addr); + err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, + ireq->ir_rmt_addr, +- rcu_dereference(ireq->ireq_opt)); ++ ireq_opt_deref(ireq)); + err = net_xmit_eval(err); + } + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index 446be2ddb014..6883482c15e1 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -539,8 +539,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, + struct ip_options_rcu *opt; + struct rtable *rt; + +- opt = rcu_dereference_protected(ireq->ireq_opt, +- refcount_read(&req->rsk_refcnt) > 0); ++ opt = ireq_opt_deref(ireq); ++ + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index fca1cdeb6588..dbedc3f39b56 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -875,7 +875,7 @@ static int 
tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, + + err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, + ireq->ir_rmt_addr, +- rcu_dereference(ireq->ireq_opt)); ++ ireq_opt_deref(ireq)); + err = net_xmit_eval(err); + } + +-- +2.16.0 + diff --git a/patches.fixes/tcp-do-tcp_mstamp_refresh-before-retransmits-on-TSQ-.patch b/patches.fixes/tcp-do-tcp_mstamp_refresh-before-retransmits-on-TSQ-.patch new file mode 100644 index 0000000..4df1d31 --- /dev/null +++ b/patches.fixes/tcp-do-tcp_mstamp_refresh-before-retransmits-on-TSQ-.patch @@ -0,0 +1,54 @@ +From: Koichiro Den +Date: Sun, 22 Oct 2017 13:13:16 +0900 +Subject: tcp: do tcp_mstamp_refresh before retransmits on TSQ handler +Patch-mainline: v4.14-rc7 +Git-commit: 3a91d29f20276fa7cd4d0c9c7f3e78b30708159d +References: bsc#1061739 + +When retransmission on TSQ handler was introduced in the commit +f9616c35a0d7 ("tcp: implement TSQ for retransmits"), the retransmitted +skbs' timestamps were updated on the actual transmission. In the later +commit 385e20706fac ("tcp: use tp->tcp_mstamp in output path"), it stops +being done so. In the commit, the comment says "We try to refresh +tp->tcp_mstamp only when necessary", and at present tcp_tsq_handler and +tcp_v4_mtu_reduced applies to this. About the latter, it's okay since +it's rare enough. + +About the former, even though possible retransmissions on the tasklet +comes just after the destructor run in NET_RX softirq handling, the time +between them could be nonnegligibly large to the extent that +tcp_rack_advance or rto rearming be affected if other (remaining) RX, +BLOCK and (preceding) TASKLET sofirq handlings are unexpectedly heavy. + +So in the same way as tcp_write_timer_handler does, doing tcp_mstamp_refresh +ensures the accuracy of algorithms relying on it. + +Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") +Signed-off-by: Koichiro Den +Reviewed-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_output.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 8e28f4d0acc7..f3a4051ced5f 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -737,8 +737,10 @@ static void tcp_tsq_handler(struct sock *sk) + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->lost_out > tp->retrans_out && +- tp->snd_cwnd > tcp_packets_in_flight(tp)) ++ tp->snd_cwnd > tcp_packets_in_flight(tp)) { ++ tcp_mstamp_refresh(tp); + tcp_xmit_retransmit_queue(sk); ++ } + + tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, + 0, GFP_ATOMIC); +-- +2.15.1 + diff --git a/patches.fixes/tcp-fix-TCP_SYNCNT-flakes.patch b/patches.fixes/tcp-fix-TCP_SYNCNT-flakes.patch new file mode 100644 index 0000000..221f03d --- /dev/null +++ b/patches.fixes/tcp-fix-TCP_SYNCNT-flakes.patch @@ -0,0 +1,118 @@ +From: Eric Dumazet +Date: Tue, 23 May 2017 12:38:35 -0700 +Subject: tcp: fix TCP_SYNCNT flakes +Patch-mainline: v4.13-rc1 +Git-commit: ce682ef6e3e019f98cafbdc7058668e0ea8f4a13 +References: bsc#1061739 + +After the mentioned commit, some of our packetdrill tests became flaky. + +TCP_SYNCNT socket option can limit the number of SYN retransmits. + +retransmits_timed_out() has to compare times computations based on +local_clock() while timers are based on jiffies. With NTP adjustments +and roundings we can observe 999 ms delay for 1000 ms timers. +We end up sending one extra SYN packet. + +Gimmick added in commit 6fa12c850314 ("Revert Backoff [v3]: Calculate +TCP's connection close threshold as a time value") makes no +real sense for TCP_SYN_SENT sockets where no RTO backoff can happen at +all. 
+ +Lets use a simpler logic for TCP_SYN_SENT sockets and remove @syn_set +parameter from retransmits_timed_out() + +Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") +Signed-off-by: Eric Dumazet +Signed-off-by: Yuchung Cheng +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_timer.c | 26 +++++++++++--------------- + 1 file changed, 11 insertions(+), 15 deletions(-) + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index b7b444b71e7c..655dd8d7f064 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -139,21 +139,17 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) + * @timeout: A custom timeout value. + * If set to 0 the default timeout is calculated and used. + * Using TCP_RTO_MIN and the number of unsuccessful retransmits. +- * @syn_set: true if the SYN Bit was set. + * + * The default "timeout" value this function can calculate and use + * is equivalent to the timeout of a TCP Connection + * after "boundary" unsuccessful, exponentially backed-off +- * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if +- * syn_set flag is set. +- * ++ * retransmissions with an initial RTO of TCP_RTO_MIN. + */ + static bool retransmits_timed_out(struct sock *sk, + unsigned int boundary, +- unsigned int timeout, +- bool syn_set) ++ unsigned int timeout) + { +- unsigned int rto_base = syn_set ? 
TCP_TIMEOUT_INIT : TCP_RTO_MIN; ++ const unsigned int rto_base = TCP_RTO_MIN; + unsigned int linear_backoff_thresh, start_ts; + + if (!inet_csk(sk)->icsk_retransmits) +@@ -181,8 +177,8 @@ static int tcp_write_timeout(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); ++ bool expired, do_reset; + int retry_until; +- bool do_reset, syn_set = false; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if (icsk->icsk_retransmits) { +@@ -196,9 +192,9 @@ static int tcp_write_timeout(struct sock *sk) + sk_rethink_txhash(sk); + } + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; +- syn_set = true; ++ expired = icsk->icsk_retransmits >= retry_until; + } else { +- if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { ++ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { + /* Some middle-boxes may black-hole Fast Open _after_ + * the handshake. Therefore we conservatively disable + * Fast Open on this path on recurring timeouts after +@@ -224,15 +220,15 @@ static int tcp_write_timeout(struct sock *sk) + + retry_until = tcp_orphan_retries(sk, alive); + do_reset = alive || +- !retransmits_timed_out(sk, retry_until, 0, 0); ++ !retransmits_timed_out(sk, retry_until, 0); + + if (tcp_out_of_resources(sk, do_reset)) + return 1; + } ++ expired = retransmits_timed_out(sk, retry_until, ++ icsk->icsk_user_timeout); + } +- +- if (retransmits_timed_out(sk, retry_until, +- syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { ++ if (expired) { + /* Has it gone just too far? 
*/ + tcp_write_err(sk); + return 1; +@@ -528,7 +524,7 @@ void tcp_retransmit_timer(struct sock *sk) + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); +- if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) ++ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) + __sk_dst_reset(sk); + + out:; +-- +2.15.1 + diff --git a/patches.fixes/tcp-fix-potential-underestimation-on-rcv_rtt.patch b/patches.fixes/tcp-fix-potential-underestimation-on-rcv_rtt.patch new file mode 100644 index 0000000..8138778 --- /dev/null +++ b/patches.fixes/tcp-fix-potential-underestimation-on-rcv_rtt.patch @@ -0,0 +1,62 @@ +From: Wei Wang +Date: Tue, 12 Dec 2017 16:28:58 -0800 +Subject: tcp: fix potential underestimation on rcv_rtt +Patch-mainline: v4.15-rc4 +Git-commit: 9ee11bd03cb1a5c3ca33c2bb70e7ed325f68890f +References: bsc#1076830 + +When ms timestamp is used, current logic uses 1us in +tcp_rcv_rtt_update() when the real rcv_rtt is within 1 - 999us. +This could cause rcv_rtt underestimation. +Fix it by always using a min value of 1ms if ms timestamp is used. + +Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") +Signed-off-by: Wei Wang +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_input.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 36d5c5e3c8dc..242795293050 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -522,9 +522,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) + u32 new_sample = tp->rcv_rtt_est.rtt_us; + long m = sample; + +- if (m == 0) +- m = 1; +- + if (new_sample != 0) { + /* If we sample in larger samples in the non-timestamp + * case, we could grossly overestimate the RTT especially +@@ -561,6 +558,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) + if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) + return; + delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); ++ if (!delta_us) ++ delta_us = 1; + tcp_rcv_rtt_update(tp, delta_us, 1); + + new_measure: +@@ -577,8 +576,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; +- u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ++ u32 delta_us; + ++ if (!delta) ++ delta = 1; ++ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + tcp_rcv_rtt_update(tp, delta_us, 0); + } + } +-- +2.15.1 + diff --git a/patches.fixes/tcp-fix-tcp_fastretrans_alert-warning.patch b/patches.fixes/tcp-fix-tcp_fastretrans_alert-warning.patch new file mode 100644 index 0000000..bf6c3f4 --- /dev/null +++ b/patches.fixes/tcp-fix-tcp_fastretrans_alert-warning.patch @@ -0,0 +1,66 @@ +From: Yuchung Cheng +Date: Tue, 7 Nov 2017 15:33:43 -0800 +Subject: tcp: fix tcp_fastretrans_alert warning +Patch-mainline: v4.14 +Git-commit: 0eb96bf754d7fa6635aa0b0f6650c74b8a6b1cc9 +References: bsc#1076830 + +This patch fixes the cause of an WARNING indicatng TCP has pending +retransmission in Open state in tcp_fastretrans_alert(). 
+ +The root cause is a bad interaction between path mtu probing, +if enabled, and the RACK loss detection. Upon receiving a SACK +above the sequence of the MTU probing packet, RACK could mark the +probe packet lost in tcp_fastretrans_alert(), prior to calling +tcp_simple_retransmit(). + +tcp_simple_retransmit() only enters Loss state if it newly marks +the probe packet lost. If the probe packet is already identified as +lost by RACK, the sender remains in Open state with some packets +marked lost and retransmitted. Then the next SACK would trigger +the warning. The likely scenario is that the probe packet was +lost due to its size or network congestion. The actual impact of +this warning is small by potentially entering fast recovery an +ACK later. + +The simple fix is always entering recovery (Loss) state if some +packet is marked lost during path MTU probing. + +Fixes: a0370b3f3f2c ("tcp: enable RACK loss detection to trigger recovery") +Reported-by: Oleksandr Natalenko +Reported-by: Alexei Starovoitov +Reported-by: Roman Gushchin +Signed-off-by: Yuchung Cheng +Reviewed-by: Eric Dumazet +Acked-by: Neal Cardwell +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_input.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index a62863c27be9..bfbccb2a02f3 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -2612,7 +2612,6 @@ void tcp_simple_retransmit(struct sock *sk) + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk); +- u32 prior_lost = tp->lost_out; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) +@@ -2629,7 +2628,7 @@ void tcp_simple_retransmit(struct sock *sk) + + tcp_clear_retrans_hints_partial(tp); + +- if (prior_lost == tp->lost_out) ++ if (!tp->lost_out) + return; + + if (tcp_is_reno(tp)) +-- +2.15.1 + diff --git a/patches.fixes/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch b/patches.fixes/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch new file mode 100644 index 0000000..7ece60c --- /dev/null +++ b/patches.fixes/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch @@ -0,0 +1,86 @@ +From: Eric Dumazet +Date: Mon, 30 Oct 2017 23:08:20 -0700 +Subject: tcp: fix tcp_mtu_probe() vs highest_sack +Patch-mainline: v4.14-rc8 +Git-commit: 2b7cda9c35d3b940eb9ce74b30bbd5eb30db493d +References: bsc#1076830 + +Based on SNMP values provided by Roman, Yuchung made the observation +that some crashes in tcp_sacktag_walk() might be caused by MTU probing. + +Looking at tcp_mtu_probe(), I found that when a new skb was placed +in front of the write queue, we were not updating tcp highest sack. + +If one skb is freed because all its content was copied to the new skb +(for MTU probing), then tp->highest_sack could point to a now freed skb. + +Bad things would then happen, including infinite loops. + +This patch renames tcp_highest_sack_combine() and uses it +from tcp_mtu_probe() to fix the bug. 
+ +Note that I also removed one test against tp->sacked_out, +since we want to replace tp->highest_sack regardless of whatever +condition, since keeping a stale pointer to freed skb is a recipe +for disaster. + +Fixes: a47e5a988a57 ("[TCP]: Convert highest_sack to sk_buff to allow direct access") +Signed-off-by: Eric Dumazet +Reported-by: Alexei Starovoitov +Reported-by: Roman Gushchin +Reported-by: Oleksandr Natalenko +Acked-by: Alexei Starovoitov +Acked-by: Neal Cardwell +Acked-by: Yuchung Cheng +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/tcp.h | 6 +++--- + net/ipv4/tcp_output.c | 3 ++- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 7aeda647e4d2..d70e5aa0d98f 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1731,12 +1731,12 @@ static inline void tcp_highest_sack_reset(struct sock *sk) + tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk); + } + +-/* Called when old skb is about to be deleted (to be combined with new skb) */ +-static inline void tcp_highest_sack_combine(struct sock *sk, ++/* Called when old skb is about to be deleted and replaced by new skb */ ++static inline void tcp_highest_sack_replace(struct sock *sk, + struct sk_buff *old, + struct sk_buff *new) + { +- if (tcp_sk(sk)->sacked_out && (old == tcp_sk(sk)->highest_sack)) ++ if (old == tcp_highest_sack(sk)) + tcp_sk(sk)->highest_sack = new; + } + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 4371f27205aa..5135975d92e9 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2094,6 +2094,7 @@ static int tcp_mtu_probe(struct sock *sk) + nskb->ip_summed = skb->ip_summed; + + tcp_insert_write_queue_before(nskb, skb, sk); ++ tcp_highest_sack_replace(sk, skb, nskb); + + len = 0; + tcp_for_write_queue_from_safe(skb, next, sk) { +@@ -2706,7 +2707,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) + else if (!skb_shift(skb, next_skb, 
next_skb_size)) + return false; + } +- tcp_highest_sack_combine(sk, next_skb, skb); ++ tcp_highest_sack_replace(sk, next_skb, skb); + + tcp_unlink_write_queue(next_skb, sk); + +-- +2.15.1 + diff --git a/patches.fixes/tcp-fix-tcp_probe_timer-for-TCP_USER_TIMEOUT.patch b/patches.fixes/tcp-fix-tcp_probe_timer-for-TCP_USER_TIMEOUT.patch new file mode 100644 index 0000000..06796e3 --- /dev/null +++ b/patches.fixes/tcp-fix-tcp_probe_timer-for-TCP_USER_TIMEOUT.patch @@ -0,0 +1,39 @@ +From: Eric Dumazet +Date: Sun, 21 May 2017 10:39:00 -0700 +Subject: tcp: fix tcp_probe_timer() for TCP_USER_TIMEOUT +Patch-mainline: v4.13-rc1 +Git-commit: 4ab688793e086ef6d1744a0f803fe9770a1ae5d0 +References: bsc#1061739 + +TCP_USER_TIMEOUT is still converted to jiffies value in +icsk_user_timeout + +So we need to make a conversion for the cases HZ != 1000 + +Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_timer.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 3f6aa6270b3f..b7b444b71e7c 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -329,7 +329,8 @@ static void tcp_probe_timer(struct sock *sk) + if (!start_ts) + tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + else if (icsk->icsk_user_timeout && +- (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) ++ (s32)(tcp_time_stamp(tp) - start_ts) > ++ jiffies_to_msecs(icsk->icsk_user_timeout)) + goto abort; + + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; +-- +2.15.1 + diff --git a/patches.fixes/tcp-fix-tcp_rearm_rto.patch b/patches.fixes/tcp-fix-tcp_rearm_rto.patch new file mode 100644 index 0000000..d3cc15c --- /dev/null +++ b/patches.fixes/tcp-fix-tcp_rearm_rto.patch @@ -0,0 +1,50 @@ +From: Eric Dumazet +Date: Thu, 18 May 2017 09:15:58 -0700 +Subject: tcp: fix tcp_rearm_rto() +Patch-mainline: v4.13-rc1 +Git-commit: b17b8a20c5cd4a264601eacf1fda29008047d05a +References: bsc#1061739 + +skbs in (re)transmit queue no longer have a copy of jiffies +at the time of the transmit : skb->skb_mstamp is now in usec unit, +with no correlation to tcp_jiffies32. + +We have to convert rto from jiffies to usec, compute a time difference +in usec, then convert the delta to HZ units. + +Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_input.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index c6c680b9c7be..a62863c27be9 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3003,13 +3003,13 @@ void tcp_rearm_rto(struct sock *sk) + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + struct sk_buff *skb = tcp_write_queue_head(sk); +- const u32 rto_time_stamp = +- tcp_skb_timestamp(skb) + rto; +- s32 delta = (s32)(rto_time_stamp - tcp_jiffies32); +- /* delta may not be positive if the socket is locked ++ u64 rto_time_stamp = skb->skb_mstamp + ++ jiffies_to_usecs(rto); ++ s64 delta_us = rto_time_stamp - tp->tcp_mstamp; ++ /* delta_us may not be positive if the socket is locked + * when the retrans timer fires and is rescheduled. + */ +- rto = max(delta, 1); ++ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX); +-- +2.15.1 + diff --git a/patches.fixes/tcp-introduce-tcp_jiffies32.patch b/patches.fixes/tcp-introduce-tcp_jiffies32.patch new file mode 100644 index 0000000..34f7033 --- /dev/null +++ b/patches.fixes/tcp-introduce-tcp_jiffies32.patch @@ -0,0 +1,57 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:01 -0700 +Subject: tcp: introduce tcp_jiffies32 +Patch-mainline: v4.13-rc1 +Git-commit: ec66eda82d4b0c552bf40005d8f53b63b2b07de4 +References: bsc#1061739 + +We abuse tcp_time_stamp for two different cases : + +1) base to generate TCP Timestamp options (RFC 7323) + +2) A 32bit version of jiffies since some TCP fields + are 32bit wide to save memory. + +Since we want in the future to have 1ms TCP TS clock, +regardless of HZ value, we want to cleanup things. + +tcp_jiffies32 is the truncated jiffies value, +which will be used only in places where we want a 'host' +timestamp. 
+ +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/tcp.h | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 3dd09e0f68ee..297a2123cabf 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -699,11 +699,14 @@ u32 __tcp_select_window(struct sock *sk); + + void tcp_send_window_probe(struct sock *sk); + +-/* TCP timestamps are only 32-bits, this causes a slight +- * complication on 64-bit systems since we store a snapshot +- * of jiffies in the buffer control blocks below. We decided +- * to use only the low 32-bits of jiffies and hide the ugly +- * casts with the following macro. ++/* TCP uses 32bit jiffies to save some space. ++ * Note that this is different from tcp_time_stamp, which ++ * historically has been the same until linux-4.13. ++ */ ++#define tcp_jiffies32 ((u32)jiffies) ++ ++/* Generator for TCP TS option (RFC 7323) ++ * Currently tied to 'jiffies' but will soon be driven by 1 ms clock. + */ + #define tcp_time_stamp ((__u32)(jiffies)) + +-- +2.15.1 + diff --git a/patches.fixes/tcp-md5sig-Use-skb-s-saddr-when-replying-to-an-incom.patch b/patches.fixes/tcp-md5sig-Use-skb-s-saddr-when-replying-to-an-incom.patch new file mode 100644 index 0000000..48c70a2 --- /dev/null +++ b/patches.fixes/tcp-md5sig-Use-skb-s-saddr-when-replying-to-an-incom.patch @@ -0,0 +1,60 @@ +From: Christoph Paasch +Date: Mon, 11 Dec 2017 00:05:46 -0800 +Subject: tcp md5sig: Use skb's saddr when replying to an incoming segment +Patch-mainline: v4.15-rc4 +Git-commit: 30791ac41927ebd3e75486f9504b6d2280463bf0 +References: bsc#1076830 + +The MD5-key that belongs to a connection is identified by the peer's +IP-address. When we are in tcp_v4(6)_reqsk_send_ack(), we are replying +to an incoming segment from tcp_check_req() that failed the seq-number +checks. 
+ +Thus, to find the correct key, we need to use the skb's saddr and not +the daddr. + +This bug seems to have been there since quite a while, but probably got +unnoticed because the consequences are not catastrophic. We will call +tcp_v4_reqsk_send_ack only to send a challenge-ACK back to the peer, +thus the connection doesn't really fail. + +Fixes: 9501f9722922 ("tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash().") +Signed-off-by: Christoph Paasch +Reviewed-by: Eric Dumazet +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_ipv4.c | 2 +- + net/ipv6/tcp_ipv6.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index f4efef73b5da..c9845859fe94 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -842,7 +842,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + req->ts_recent, + 0, +- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, ++ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, + AF_INET), + inet_rsk(req)->no_srccheck ? 
IP_REPLY_ARG_NOSRCCHECK : 0, + ip_hdr(skb)->tos); +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 2904ba1dc965..61722535e249 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -973,7 +973,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + req->ts_recent, sk->sk_bound_dev_if, +- tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), ++ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), + 0, 0); + } + +-- +2.15.1 + diff --git a/patches.fixes/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch b/patches.fixes/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch new file mode 100644 index 0000000..b46dd34 --- /dev/null +++ b/patches.fixes/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch @@ -0,0 +1,52 @@ +From: Eric Dumazet +Date: Tue, 12 Dec 2017 18:22:52 -0800 +Subject: tcp: refresh tcp_mstamp from timers callbacks +Patch-mainline: v4.15-rc4 +Git-commit: 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 +References: bsc#1061739 + +Only the retransmit timer currently refreshes tcp_mstamp + +We should do the same for delayed acks and keepalives. + +Even if RFC 7323 does not request it, this is consistent to what linux +did in the past, when TS values were based on jiffies. + +Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") +Signed-off-by: Eric Dumazet +Cc: Soheil Hassas Yeganeh +Cc: Mike Maloney +Cc: Neal Cardwell +Acked-by: Neal Cardwell +Acked-by: Soheil Hassas Yeganeh +Acked-by: Mike Maloney +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_timer.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 655dd8d7f064..e9af1879cd53 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk) + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } ++ tcp_mstamp_refresh(tcp_sk(sk)); + tcp_send_ack(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); + } +@@ -627,6 +628,7 @@ static void tcp_keepalive_timer (unsigned long data) + goto out; + } + ++ tcp_mstamp_refresh(tp); + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { + if (tp->linger2 >= 0) { + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; +-- +2.15.1 + diff --git a/patches.fixes/tcp-refresh-tp-timestamp-before-tcp_mtu_probe.patch b/patches.fixes/tcp-refresh-tp-timestamp-before-tcp_mtu_probe.patch new file mode 100644 index 0000000..90b64c7 --- /dev/null +++ b/patches.fixes/tcp-refresh-tp-timestamp-before-tcp_mtu_probe.patch @@ -0,0 +1,45 @@ +From: Eric Dumazet +Date: Thu, 26 Oct 2017 21:21:40 -0700 +Subject: tcp: refresh tp timestamp before tcp_mtu_probe() +Patch-mainline: v4.14-rc7 +Git-commit: ee1836aec4f5a977c1699a311db4d9027ef21ac8 +References: bsc#1061739 + +In the unlikely event tcp_mtu_probe() is sending a packet, we +want tp->tcp_mstamp being as accurate as possible. + +This means we need to call tcp_mstamp_refresh() a bit earlier in +tcp_write_xmit(). + +Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_output.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index f3a4051ced5f..4371f27205aa 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2270,6 +2270,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + + sent_pkts = 0; + ++ tcp_mstamp_refresh(tp); + if (!push_one) { + /* Do MTU probing. */ + result = tcp_mtu_probe(sk); +@@ -2281,7 +2282,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + } + + max_segs = tcp_tso_segs(sk, mss_now); +- tcp_mstamp_refresh(tp); + while ((skb = tcp_send_head(sk))) { + unsigned int limit; + +-- +2.15.1 + diff --git a/patches.fixes/tcp-replace-misc-tcp_time_stamp-to-tcp_jiffies32.patch b/patches.fixes/tcp-replace-misc-tcp_time_stamp-to-tcp_jiffies32.patch new file mode 100644 index 0000000..a33768b --- /dev/null +++ b/patches.fixes/tcp-replace-misc-tcp_time_stamp-to-tcp_jiffies32.patch @@ -0,0 +1,95 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:13 -0700 +Subject: tcp: replace misc tcp_time_stamp to tcp_jiffies32 +Patch-mainline: v4.13-rc1 +Git-commit: ac9517fcf310327fa3e3b0d8366e4b11236b1b4b +References: bsc#1061739 + +After this patch, all uses of tcp_time_stamp will require +a change when we introduce 1 ms and/or 1 us TCP TS option. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp.c | 2 +- + net/ipv4/tcp_htcp.c | 2 +- + net/ipv4/tcp_input.c | 2 +- + net/ipv4/tcp_minisocks.c | 2 +- + net/ipv4/tcp_output.c | 4 ++-- + 5 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index d0725dc266d1..286169347383 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -385,7 +385,7 @@ void tcp_init_sock(struct sock *sk) + + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); +- minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); ++ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control +diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c +index 4a4d8e76738f..3eb78cde6ff0 100644 +--- a/net/ipv4/tcp_htcp.c ++++ b/net/ipv4/tcp_htcp.c +@@ -104,7 +104,7 @@ static void measure_achieved_throughput(struct sock *sk, + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); +- u32 now = tcp_time_stamp; ++ u32 now = tcp_jiffies32; + + if (icsk->icsk_ca_state == TCP_CA_Open) + ca->pkts_acked = sample->pkts_acked; +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 79873ace320e..75e7deaa185c 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -2912,7 +2912,7 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) + struct tcp_sock *tp = tcp_sk(sk); + u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; + +- minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, ++ minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, + rtt_us ? 
: jiffies_to_usecs(1)); + } + +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 4fcae3ef6894..42f203001229 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -444,7 +444,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); +- minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); ++ minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 74a37e14a00c..10ba45072d1a 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2418,10 +2418,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) + timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + + /* If RTO is shorter, just schedule TLP in its place. */ +- tlp_time_stamp = tcp_time_stamp + timeout; ++ tlp_time_stamp = tcp_jiffies32 + timeout; + rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; + if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { +- s32 delta = rto_time_stamp - tcp_time_stamp; ++ s32 delta = rto_time_stamp - tcp_jiffies32; + if (delta > 0) + timeout = delta; + } +-- +2.15.1 + diff --git a/patches.fixes/tcp-switch-TCP-TS-option-RFC-7323-to-1ms-clock.patch b/patches.fixes/tcp-switch-TCP-TS-option-RFC-7323-to-1ms-clock.patch new file mode 100644 index 0000000..bb33c43 --- /dev/null +++ b/patches.fixes/tcp-switch-TCP-TS-option-RFC-7323-to-1ms-clock.patch @@ -0,0 +1,1108 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:14 -0700 +Subject: tcp: switch TCP TS option (RFC 7323) to 1ms clock +Patch-mainline: v4.13-rc1 +Git-commit: 9a568de4818dea9a05af141046bd3e589245ab83 +References: bsc#1061739 + +TCP Timestamps option is defined in RFC 7323 + +Traditionally on linux, it has been tied to the internal +'jiffies' variable, because it had been a cheap and good enough +generator. 
+ +For TCP flows on the Internet, 1 ms resolution would be much better +than 4ms or 10ms (HZ=250 or HZ=100 respectively) + +For TCP flows in the DC, Google has used usec resolution for more +than two years with great success [1] + +Receive size autotuning (DRS) is indeed more precise and converges +faster to optimal window size. + +This patch converts tp->tcp_mstamp to a plain u64 value storing +a 1 usec TCP clock. + +This choice will allow us to upstream the 1 usec TS option as +discussed in IETF 97. + +[1] https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/linux/skbuff.h | 62 +------------------------- + include/linux/tcp.h | 22 ++++----- + include/net/tcp.h | 59 ++++++++++++++++++++---- + net/ipv4/syncookies.c | 8 ++-- + net/ipv4/tcp.c | 4 +- + net/ipv4/tcp_bbr.c | 22 ++++----- + net/ipv4/tcp_input.c | 96 ++++++++++++++++++++-------------------- + net/ipv4/tcp_ipv4.c | 17 +++---- + net/ipv4/tcp_lp.c | 12 ++--- + net/ipv4/tcp_minisocks.c | 4 +- + net/ipv4/tcp_output.c | 16 +++---- + net/ipv4/tcp_rate.c | 16 +++---- + net/ipv4/tcp_recovery.c | 23 +++++----- + net/ipv4/tcp_timer.c | 8 ++-- + net/ipv6/syncookies.c | 2 +- + net/ipv6/tcp_ipv6.c | 4 +- + net/netfilter/nf_synproxy_core.c | 2 +- + 17 files changed, 178 insertions(+), 199 deletions(-) + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 4aba51863fd4..5e32a9a442b4 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -508,66 +508,6 @@ typedef unsigned int sk_buff_data_t; + typedef unsigned char *sk_buff_data_t; + #endif + +-/** +- * struct skb_mstamp - multi resolution time stamps +- * @stamp_us: timestamp in us resolution +- * @stamp_jiffies: timestamp in jiffies +- */ +-struct skb_mstamp { +- union { +- u64 v64; +- struct { +- u32 stamp_us; +- u32 stamp_jiffies; +- }; +- }; +-}; +- +-/** +- * 
skb_mstamp_get - get current timestamp +- * @cl: place to store timestamps +- */ +-static inline void skb_mstamp_get(struct skb_mstamp *cl) +-{ +- u64 val = local_clock(); +- +- do_div(val, NSEC_PER_USEC); +- cl->stamp_us = (u32)val; +- cl->stamp_jiffies = (u32)jiffies; +-} +- +-/** +- * skb_mstamp_delta - compute the difference in usec between two skb_mstamp +- * @t1: pointer to newest sample +- * @t0: pointer to oldest sample +- */ +-static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, +- const struct skb_mstamp *t0) +-{ +- s32 delta_us = t1->stamp_us - t0->stamp_us; +- u32 delta_jiffies = t1->stamp_jiffies - t0->stamp_jiffies; +- +- /* If delta_us is negative, this might be because interval is too big, +- * or local_clock() drift is too big : fallback using jiffies. +- */ +- if (delta_us <= 0 || +- delta_jiffies >= (INT_MAX / (USEC_PER_SEC / HZ))) +- +- delta_us = jiffies_to_usecs(delta_jiffies); +- +- return delta_us; +-} +- +-static inline bool skb_mstamp_after(const struct skb_mstamp *t1, +- const struct skb_mstamp *t0) +-{ +- s32 diff = t1->stamp_jiffies - t0->stamp_jiffies; +- +- if (!diff) +- diff = t1->stamp_us - t0->stamp_us; +- return diff > 0; +-} +- + /** + * struct sk_buff - socket buffer + * @next: Next buffer in list +@@ -649,7 +589,7 @@ struct sk_buff { + + union { + ktime_t tstamp; +- struct skb_mstamp skb_mstamp; ++ u64 skb_mstamp; + }; + }; + struct rb_node rbnode; /* used in netem & tcp stack */ +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index c1b647306e1e..32fb37cfb0d1 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -123,7 +123,7 @@ struct tcp_request_sock_ops; + struct tcp_request_sock { + struct inet_request_sock req; + const struct tcp_request_sock_ops *af_specific; +- struct skb_mstamp snt_synack; /* first SYNACK sent time */ ++ u64 snt_synack; /* first SYNACK sent time */ + bool tfo_listener; + u32 txhash; + u32 rcv_isn; +@@ -202,7 +202,7 @@ struct tcp_sock { + + /* Information of the most 
recently (s)acked skb */ + struct tcp_rack { +- struct skb_mstamp mstamp; /* (Re)sent time of the skb */ ++ u64 mstamp; /* (Re)sent time of the skb */ + u32 rtt_us; /* Associated RTT */ + u32 end_seq; /* Ending TCP sequence of the skb */ + u8 advanced; /* mstamp advanced since last lost marking */ +@@ -231,7 +231,7 @@ struct tcp_sock { + u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ + + /* RTT measurement */ +- struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */ ++ u64 tcp_mstamp; /* most recent packet received/sent */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ +@@ -271,8 +271,8 @@ struct tcp_sock { + u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 lost; /* Total data packets lost incl. rexmits */ + u32 app_limited; /* limited until "delivered" reaches this val */ +- struct skb_mstamp first_tx_mstamp; /* start of window send phase */ +- struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ ++ u64 first_tx_mstamp; /* start of window send phase */ ++ u64 delivered_mstamp; /* time we reached "delivered" */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ + +@@ -326,16 +326,16 @@ struct tcp_sock { + + /* Receiver side RTT estimation */ + struct { +- u32 rtt_us; +- u32 seq; +- struct skb_mstamp time; ++ u32 rtt_us; ++ u32 seq; ++ u64 time; + } rcv_rtt_est; + + /* Receiver queue space */ + struct { +- int space; +- u32 seq; +- struct skb_mstamp time; ++ int space; ++ u32 seq; ++ u64 time; + } rcvq_space; + + /* TCP-specific MTU probe information. 
*/ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 517975b11ae0..7aeda647e4d2 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -518,7 +518,7 @@ static inline u32 tcp_cookie_time(void) + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, + u16 *mssp); + __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); +-__u32 cookie_init_timestamp(struct request_sock *req); ++u64 cookie_init_timestamp(struct request_sock *req); + bool cookie_timestamp_decode(struct tcp_options_received *opt); + bool cookie_ecn_ok(const struct tcp_options_received *opt, + const struct net *net, const struct dst_entry *dst); +@@ -705,14 +705,55 @@ void tcp_send_window_probe(struct sock *sk); + */ + #define tcp_jiffies32 ((u32)jiffies) + +-/* Generator for TCP TS option (RFC 7323) +- * Currently tied to 'jiffies' but will soon be driven by 1 ms clock. ++/* ++ * Deliver a 32bit value for TCP timestamp option (RFC 7323) ++ * It is no longer tied to jiffies, but to 1 ms clock. ++ * Note: double check if you want to use tcp_jiffies32 instead of this. ++ */ ++#define TCP_TS_HZ 1000 ++ ++static inline u64 tcp_clock_ns(void) ++{ ++ return local_clock(); ++} ++ ++static inline u64 tcp_clock_us(void) ++{ ++ return div_u64(tcp_clock_ns(), NSEC_PER_USEC); ++} ++ ++/* This should only be used in contexts where tp->tcp_mstamp is up to date */ ++static inline u32 tcp_time_stamp(const struct tcp_sock *tp) ++{ ++ return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); ++} ++ ++/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ ++static inline u32 tcp_time_stamp_raw(void) ++{ ++ return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); ++} ++ ++ ++/* Refresh 1us clock of a TCP socket, ++ * ensuring monotically increasing values. 
+ */ +-#define tcp_time_stamp ((__u32)(jiffies)) ++static inline void tcp_mstamp_refresh(struct tcp_sock *tp) ++{ ++ u64 val = tcp_clock_us(); ++ ++ if (val > tp->tcp_mstamp) ++ tp->tcp_mstamp = val; ++} ++ ++static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) ++{ ++ return max_t(s64, t1 - t0, 0); ++} + + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) + { +- return skb->skb_mstamp.stamp_jiffies; ++ return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ); + } + + +@@ -777,9 +818,9 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- struct skb_mstamp first_tx_mstamp; ++ u64 first_tx_mstamp; + /* when we reached the "delivered" count */ +- struct skb_mstamp delivered_mstamp; ++ u64 delivered_mstamp; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -895,7 +936,7 @@ struct ack_sample { + * A sample is invalid if "delivered" or "interval_us" is negative. 
+ */ + struct rate_sample { +- struct skb_mstamp prior_mstamp; /* starting timestamp for interval */ ++ u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + s32 delivered; /* number of packets delivered over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ +@@ -1850,7 +1891,7 @@ void tcp_init(void); + /* tcp_recovery.c */ + extern void tcp_rack_mark_lost(struct sock *sk); + extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, +- const struct skb_mstamp *xmit_time); ++ u64 xmit_time); + extern void tcp_rack_reo_timeout(struct sock *sk); + + /* +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c +index 53921757349e..bd8fd3df5117 100644 +--- a/net/ipv4/syncookies.c ++++ b/net/ipv4/syncookies.c +@@ -66,10 +66,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + * Since subsequent timestamps use the normal tcp_time_stamp value, we + * must make sure that the resulting initial timestamp is <= tcp_time_stamp. + */ +-__u32 cookie_init_timestamp(struct request_sock *req) ++u64 cookie_init_timestamp(struct request_sock *req) + { + struct inet_request_sock *ireq; +- u32 ts, ts_now = tcp_time_stamp; ++ u32 ts, ts_now = tcp_time_stamp_raw(); + u32 options = 0; + + ireq = inet_rsk(req); +@@ -88,7 +88,7 @@ __u32 cookie_init_timestamp(struct request_sock *req) + ts <<= TSBITS; + ts |= options; + } +- return ts; ++ return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); + } + + +@@ -344,7 +344,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsval : 0; +- treq->snt_synack.v64 = 0; ++ treq->snt_synack = 0; + treq->tfo_listener = false; + + ireq->ir_iif = inet_request_bound_dev_if(sk, skb); +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 286169347383..a853ad7d3e01 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2612,7 +2612,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + if (!tp->repair) + err = -EPERM; + else +- tp->tsoffset = val - tcp_time_stamp; ++ tp->tsoffset = val - tcp_time_stamp_raw(); + break; + case TCP_REPAIR_WINDOW: + err = tcp_repair_set_window(tp, optval, optlen); +@@ -2978,7 +2978,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, + break; + + case TCP_TIMESTAMP: +- val = tcp_time_stamp + tp->tsoffset; ++ val = tcp_time_stamp_raw() + tp->tsoffset; + break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 816e6a1f6e9e..69ee877574d0 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -91,7 +91,7 @@ struct bbr { + struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ + u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ +- struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ ++ u64 cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? 
*/ +@@ -442,7 +442,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_full_length = +- skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > ++ tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > + bbr->min_rtt_us; + u32 inflight, bw; + +@@ -528,7 +528,7 @@ static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; ++ bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); + bbr->lt_last_delivered = tp->delivered; + bbr->lt_last_lost = tp->lost; + bbr->lt_rtt_cnt = 0; +@@ -582,7 +582,7 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) + struct bbr *bbr = inet_csk_ca(sk); + u32 lost, delivered; + u64 bw; +- s32 t; ++ u32 t; + + if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ + if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +@@ -634,15 +634,15 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) + return; + + /* Find average delivery rate in this sampling interval. */ +- t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); +- if (t < 1) +- return; /* interval is less than one jiffy, so wait */ +- t = jiffies_to_usecs(t); +- /* Interval long enough for jiffies_to_usecs() to return a bogus 0? 
*/ +- if (t < 1) { ++ t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; ++ if ((s32)t < 1) ++ return; /* interval is less than one ms, so wait */ ++ /* Check if can multiply without overflow */ ++ if (t >= ~0U / USEC_PER_MSEC) { + bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ + return; + } ++ t *= USEC_PER_MSEC; + bw = (u64)delivered * BW_UNIT; + do_div(bw, t); + bbr_lt_bw_interval_done(sk, bw); +@@ -852,7 +852,7 @@ static void bbr_init(struct sock *sk) + bbr->idle_restart = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp.v64 = 0; ++ bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; + bbr_reset_lt_bw_sampling(sk); + bbr_reset_startup_mode(sk); +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 75e7deaa185c..c6c680b9c7be 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -442,7 +442,7 @@ void tcp_init_buffer_space(struct sock *sk) + tcp_sndbuf_expand(sk); + + tp->rcvq_space.space = tp->rcv_wnd; +- skb_mstamp_get(&tp->tcp_mstamp); ++ tcp_mstamp_refresh(tp); + tp->rcvq_space.time = tp->tcp_mstamp; + tp->rcvq_space.seq = tp->copied_seq; + +@@ -556,11 +556,11 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) + { + u32 delta_us; + +- if (tp->rcv_rtt_est.time.v64 == 0) ++ if (tp->rcv_rtt_est.time == 0) + goto new_measure; + if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) + return; +- delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); ++ delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); + tcp_rcv_rtt_update(tp, delta_us, 1); + + new_measure: +@@ -572,13 +572,15 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + const struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ + if (tp->rx_opt.rcv_tsecr && + (TCP_SKB_CB(skb)->end_seq - +- TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) +- tcp_rcv_rtt_update(tp, +- jiffies_to_usecs(tcp_time_stamp - +- tp->rx_opt.rcv_tsecr), +- 0); ++ TCP_SKB_CB(skb)->seq >= 
inet_csk(sk)->icsk_ack.rcv_mss)) { ++ u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; ++ u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ++ ++ tcp_rcv_rtt_update(tp, delta_us, 0); ++ } + } + + /* +@@ -591,7 +593,7 @@ void tcp_rcv_space_adjust(struct sock *sk) + int time; + int copied; + +- time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); ++ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) + return; + +@@ -1135,8 +1137,8 @@ struct tcp_sacktag_state { + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ +- struct skb_mstamp first_sackt; +- struct skb_mstamp last_sackt; ++ u64 first_sackt; ++ u64 last_sackt; + struct rate_sample *rate; + int flag; + }; +@@ -1201,7 +1203,7 @@ static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, + int dup_sack, int pcount, +- const struct skb_mstamp *xmit_time) ++ u64 xmit_time) + { + struct tcp_sock *tp = tcp_sk(sk); + int fack_count = state->fack_count; +@@ -1243,9 +1245,9 @@ static u8 tcp_sacktag_one(struct sock *sk, + state->reord); + if (!after(end_seq, tp->high_seq)) + state->flag |= FLAG_ORIG_SACK_ACKED; +- if (state->first_sackt.v64 == 0) +- state->first_sackt = *xmit_time; +- state->last_sackt = *xmit_time; ++ if (state->first_sackt == 0) ++ state->first_sackt = xmit_time; ++ state->last_sackt = xmit_time; + } + + if (sacked & TCPCB_LOST) { +@@ -1305,7 +1307,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount, +- &skb->skb_mstamp); ++ skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); + + if (skb == tp->lost_skb_hint) +@@ -1357,8 +1359,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, + 
tcp_advance_highest_sack(sk, skb); + + tcp_skb_collapse_tstamp(prev, skb); +- if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) +- TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; ++ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) ++ TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); +@@ -1588,7 +1590,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb), +- &skb->skb_mstamp); ++ skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); + + if (!before(TCP_SKB_CB(skb)->seq, +@@ -2937,9 +2939,12 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, + * See draft-ietf-tcplw-high-performance-00, section 3.3. + */ + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && +- flag & FLAG_ACKED) +- seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - +- tp->rx_opt.rcv_tsecr); ++ flag & FLAG_ACKED) { ++ u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; ++ u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ++ ++ seq_rtt_us = ca_rtt_us = delta_us; ++ } + if (seq_rtt_us < 0) + return false; + +@@ -2961,12 +2966,8 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) + { + long rtt_us = -1L; + +- if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { +- struct skb_mstamp now; +- +- skb_mstamp_get(&now); +- rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); +- } ++ if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) ++ rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); + + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); + } +@@ -3004,7 +3005,7 @@ void tcp_rearm_rto(struct sock *sk) + struct sk_buff *skb = tcp_write_queue_head(sk); + const u32 rto_time_stamp = + tcp_skb_timestamp(skb) + rto; +- s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); ++ s32 delta = (s32)(rto_time_stamp - 
tcp_jiffies32); + /* delta may not be positive if the socket is locked + * when the retrans timer fires and is rescheduled. + */ +@@ -3060,9 +3061,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + struct tcp_sacktag_state *sack) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +- struct skb_mstamp first_ackt, last_ackt; ++ u64 first_ackt, last_ackt; + struct tcp_sock *tp = tcp_sk(sk); +- struct skb_mstamp *now = &tp->tcp_mstamp; + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; + bool fully_acked = true; +@@ -3075,7 +3075,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + bool rtt_update; + int flag = 0; + +- first_ackt.v64 = 0; ++ first_ackt = 0; + + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); +@@ -3106,8 +3106,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + flag |= FLAG_RETRANS_DATA_ACKED; + } else if (!(sacked & TCPCB_SACKED_ACKED)) { + last_ackt = skb->skb_mstamp; +- WARN_ON_ONCE(last_ackt.v64 == 0); +- if (!first_ackt.v64) ++ WARN_ON_ONCE(last_ackt == 0); ++ if (!first_ackt) + first_ackt = last_ackt; + + last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; +@@ -3122,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + tp->delivered += acked_pcount; + if (!tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, sacked, scb->end_seq, +- &skb->skb_mstamp); ++ skb->skb_mstamp); + } + if (sacked & TCPCB_LOST) + tp->lost_out -= acked_pcount; +@@ -3165,13 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + flag |= FLAG_SACK_RENEGING; + +- if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { +- seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); +- ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); ++ if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { ++ 
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ++ ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); + } +- if (sack->first_sackt.v64) { +- sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); +- ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); ++ if (sack->first_sackt) { ++ sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); ++ ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); + } + sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, +@@ -3201,7 +3201,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + tp->fackets_out -= min(pkts_acked, tp->fackets_out); + + } else if (skb && rtt_update && sack_rtt_us >= 0 && +- sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { ++ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { + /* Do not re-arm RTO if the sack RTT is measured from data sent + * after when the head was last (re)transmitted. Otherwise the + * timeout may continue to extend in loss recovery. +@@ -3553,7 +3553,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + int acked = 0; /* Number of packets newly acked */ + int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + +- sack_state.first_sackt.v64 = 0; ++ sack_state.first_sackt = 0; + sack_state.rate = &rs; + + /* We very likely will need to access write queue head. 
*/ +@@ -5321,7 +5321,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + { + struct tcp_sock *tp = tcp_sk(sk); + +- skb_mstamp_get(&tp->tcp_mstamp); ++ tcp_mstamp_refresh(tp); + if (unlikely(!sk->sk_rx_dst)) + inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); + /* +@@ -5609,7 +5609,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, +- tcp_time_stamp)) { ++ tcp_time_stamp(tp))) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_PAWSACTIVEREJECTED); + goto reset_and_undo; +@@ -5854,7 +5854,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + + case TCP_SYN_SENT: + tp->rx_opt.saw_tstamp = 0; +- skb_mstamp_get(&tp->tcp_mstamp); ++ tcp_mstamp_refresh(tp); + queued = tcp_rcv_synsent_state_process(sk, skb, th); + if (queued >= 0) + return queued; +@@ -5866,7 +5866,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + return 0; + } + +- skb_mstamp_get(&tp->tcp_mstamp); ++ tcp_mstamp_refresh(tp); + tp->rx_opt.saw_tstamp = 0; + req = tp->fastopen_rsk; + if (req) { +@@ -6135,7 +6135,7 @@ static void tcp_openreq_init(struct request_sock *req, + req->cookie_ts = 0; + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; +- skb_mstamp_get(&tcp_rsk(req)->snt_synack); ++ tcp_rsk(req)->snt_synack = tcp_clock_us(); + tcp_rsk(req)->last_oow_ack_time = 0; + req->mss = rx_opt->mss_clamp; + req->ts_recent = rx_opt->saw_tstamp ? 
rx_opt->rcv_tsval : 0; +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 2946ada75fa3..f4efef73b5da 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -374,8 +374,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + struct sock *sk; + struct sk_buff *skb; + struct request_sock *fastopen; +- __u32 seq, snd_una; +- __u32 remaining; ++ u32 seq, snd_una; ++ s32 remaining; ++ u32 delta_us; + int err; + struct net *net = dev_net(icmp_skb->dev); + +@@ -481,12 +482,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + skb = tcp_write_queue_head(sk); + BUG_ON(!skb); + +- skb_mstamp_get(&tp->tcp_mstamp); ++ tcp_mstamp_refresh(tp); ++ delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); + remaining = icsk->icsk_rto - +- min(icsk->icsk_rto, +- tcp_time_stamp - tcp_skb_timestamp(skb)); ++ usecs_to_jiffies(delta_us); + +- if (remaining) { ++ if (remaining > 0) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { +@@ -810,7 +811,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) + tcp_v4_send_ack(sk, skb, + tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, +- tcp_time_stamp + tcptw->tw_ts_offset, ++ tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcptw->tw_ts_recent, + tw->tw_bound_dev_if, + tcp_twsk_md5_key(tcptw), +@@ -838,7 +839,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + tcp_v4_send_ack(sk, skb, seq, + tcp_rsk(req)->rcv_nxt, + req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, +- tcp_time_stamp + tcp_rsk(req)->ts_off, ++ tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + req->ts_recent, + 0, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, +diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c +index ef3122abb373..ae10ed64fe13 100644 +--- a/net/ipv4/tcp_lp.c ++++ b/net/ipv4/tcp_lp.c +@@ -37,7 +37,7 @@ + #include + + /* resolution of owd */ +-#define LP_RESOL 1000 ++#define LP_RESOL TCP_TS_HZ + + /** + * 
enum tcp_lp_state +@@ -147,9 +147,9 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) + tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + +- m = HZ * (tp->rx_opt.rcv_tsval - +- lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - +- lp->local_ref_time); ++ m = TCP_TS_HZ * ++ (tp->rx_opt.rcv_tsval - lp->remote_ref_time) / ++ (tp->rx_opt.rcv_tsecr - lp->local_ref_time); + if (m < 0) + m = -m; + +@@ -194,7 +194,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) + if (lp->flag & LP_VALID_RHZ) { + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - +- tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); ++ tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ); + if (owd < 0) + owd = -owd; + } +@@ -264,7 +264,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) + { + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); +- u32 now = tcp_time_stamp; ++ u32 now = tcp_time_stamp(tp); + u32 delta; + + if (sample->rtt_us > 0) +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 42f203001229..42de43a5c0d0 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -454,7 +454,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + newtp->fackets_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + newtp->tlp_high_seq = 0; +- newtp->lsndtime = treq->snt_synack.stamp_jiffies; ++ newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = treq->txhash; + newtp->last_oow_ack_time = 0; + newtp->total_retrans = req->num_retrans; +@@ -525,7 +525,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; +- newtp->rack.mstamp.v64 = 0; ++ newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 10ba45072d1a..8e28f4d0acc7 100644 +--- a/net/ipv4/tcp_output.c ++++ 
b/net/ipv4/tcp_output.c +@@ -1962,7 +1962,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + + head = tcp_write_queue_head(sk); + +- age = skb_mstamp_us_delta(&tp->tcp_mstamp, &head->skb_mstamp); ++ age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); + /* If next ACK is likely to come too late (half srtt), do not defer */ + if (age < (tp->srtt_us >> 4)) + goto send_now; +@@ -2279,7 +2279,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + } + + max_segs = tcp_tso_segs(sk, mss_now); +- skb_mstamp_get(&tp->tcp_mstamp); ++ tcp_mstamp_refresh(tp); + while ((skb = tcp_send_head(sk))) { + unsigned int limit; + +@@ -3095,7 +3095,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) + skb_reserve(skb, MAX_TCP_HEADER); + tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + TCPHDR_ACK | TCPHDR_RST); +- skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); ++ tcp_mstamp_refresh(tcp_sk(sk)); + /* Send it off. */ + if (tcp_transmit_skb(sk, skb, 0, priority)) + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); +@@ -3191,10 +3191,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, + memset(&opts, 0, sizeof(opts)); + #ifdef CONFIG_SYN_COOKIES + if (unlikely(req->cookie_ts)) +- skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); ++ skb->skb_mstamp = cookie_init_timestamp(req); + else + #endif +- skb_mstamp_get(&skb->skb_mstamp); ++ skb->skb_mstamp = tcp_clock_us(); + + #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); +@@ -3456,8 +3456,8 @@ int tcp_connect(struct sock *sk) + return -ENOBUFS; + + tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); +- skb_mstamp_get(&tp->tcp_mstamp); +- tp->retrans_stamp = tp->tcp_mstamp.stamp_jiffies; ++ tcp_mstamp_refresh(tp); ++ tp->retrans_stamp = tcp_time_stamp(tp); + tcp_connect_queue_skb(sk, buff); + tcp_ecn_send_syn(sk, buff); + +@@ -3618,7 +3618,7 @@ void tcp_send_window_probe(struct sock *sk) + { + if (sk->sk_state == 
TCP_ESTABLISHED) { + tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; +- skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); ++ tcp_mstamp_refresh(tcp_sk(sk)); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); + } + } +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index c6a9fa894646..ad99569d4c1e 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -78,7 +78,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + +- if (!scb->tx.delivered_mstamp.v64) ++ if (!scb->tx.delivered_mstamp) + return; + + if (!rs->prior_delivered || +@@ -89,9 +89,9 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = skb_mstamp_us_delta( +- &skb->skb_mstamp, +- &scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp_us_delta( ++ skb->skb_mstamp, ++ scb->tx.first_tx_mstamp); + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = skb->skb_mstamp; +@@ -101,7 +101,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) +- scb->tx.delivered_mstamp.v64 = 0; ++ scb->tx.delivered_mstamp = 0; + } + + /* Update the connection delivery information and generate a rate sample. */ +@@ -125,7 +125,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available. */ +- if (!rs->prior_mstamp.v64) { ++ if (!rs->prior_mstamp) { + rs->delivered = -1; + rs->interval_us = -1; + return; +@@ -138,8 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. 
+ */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, +- &rs->prior_mstamp); /* ack phase */ ++ ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Normally we expect interval_us >= min-rtt. +diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c +index cd72b3d3879e..fe9a493d0208 100644 +--- a/net/ipv4/tcp_recovery.c ++++ b/net/ipv4/tcp_recovery.c +@@ -17,12 +17,9 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) + } + } + +-static bool tcp_rack_sent_after(const struct skb_mstamp *t1, +- const struct skb_mstamp *t2, +- u32 seq1, u32 seq2) ++static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + { +- return skb_mstamp_after(t1, t2) || +- (t1->v64 == t2->v64 && after(seq1, seq2)); ++ return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + + /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): +@@ -72,14 +69,14 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) + scb->sacked & TCPCB_SACKED_ACKED) + continue; + +- if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, ++ if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) { + /* Step 3 in draft-cheng-tcpm-rack-00.txt: + * A packet is lost if its elapsed time is beyond + * the recent RTT plus the reordering window. 
+ */ +- u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, +- &skb->skb_mstamp); ++ u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, ++ skb->skb_mstamp); + s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; + + if (remaining < 0) { +@@ -127,16 +124,16 @@ void tcp_rack_mark_lost(struct sock *sk) + * draft-cheng-tcpm-rack-00.txt + */ + void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, +- const struct skb_mstamp *xmit_time) ++ u64 xmit_time) + { + u32 rtt_us; + +- if (tp->rack.mstamp.v64 && +- !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, ++ if (tp->rack.mstamp && ++ !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) + return; + +- rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); ++ rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); + if (sacked & TCPCB_RETRANS) { + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior +@@ -152,7 +149,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, + return; + } + tp->rack.rtt_us = rtt_us; +- tp->rack.mstamp = *xmit_time; ++ tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + tp->rack.advanced = 1; + } +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 5e4041be61b1..3f6aa6270b3f 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -153,8 +153,8 @@ static bool retransmits_timed_out(struct sock *sk, + unsigned int timeout, + bool syn_set) + { +- unsigned int linear_backoff_thresh, start_ts; + unsigned int rto_base = syn_set ? 
TCP_TIMEOUT_INIT : TCP_RTO_MIN; ++ unsigned int linear_backoff_thresh, start_ts; + + if (!inet_csk(sk)->icsk_retransmits) + return false; +@@ -172,7 +172,7 @@ static bool retransmits_timed_out(struct sock *sk, + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } +- return (tcp_time_stamp - start_ts) >= timeout; ++ return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); + } + + /* A write timeout has occurred. Process the after effects. */ +@@ -329,7 +329,7 @@ static void tcp_probe_timer(struct sock *sk) + if (!start_ts) + tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + else if (icsk->icsk_user_timeout && +- (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) ++ (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) + goto abort; + + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; +@@ -549,7 +549,7 @@ void tcp_write_timer_handler(struct sock *sk) + goto out; + } + +- skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); ++ tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + + switch (event) { +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c +index f067dbff43e6..3f0ab6a798ba 100644 +--- a/net/ipv6/syncookies.c ++++ b/net/ipv6/syncookies.c +@@ -211,7 +211,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsval : 0; +- treq->snt_synack.v64 = 0; ++ treq->snt_synack = 0; + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = cookie; + treq->ts_off = 0; +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index f9366f817097..2904ba1dc965 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -949,7 +949,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) + + tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, +- tcp_time_stamp + tcptw->tw_ts_offset, ++ tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); + +@@ -971,7 +971,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, + req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, +- tcp_time_stamp + tcp_rsk(req)->ts_off, ++ tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + req->ts_recent, sk->sk_bound_dev_if, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + 0, 0); +diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c +index a504e87c6ddf..49bd8bb16b18 100644 +--- a/net/netfilter/nf_synproxy_core.c ++++ b/net/netfilter/nf_synproxy_core.c +@@ -152,7 +152,7 @@ void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts) + { + opts->tsecr = opts->tsval; +- opts->tsval = tcp_time_stamp & ~0x3f; ++ opts->tsval = tcp_time_stamp_raw() & ~0x3f; + + if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + opts->tsval |= opts->wscale; +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-IPCB-instead-of-TCP_SKB_CB-in-inet_exact_dif.patch b/patches.fixes/tcp-use-IPCB-instead-of-TCP_SKB_CB-in-inet_exact_dif.patch new file mode 100644 index 0000000..1c92460 --- /dev/null +++ b/patches.fixes/tcp-use-IPCB-instead-of-TCP_SKB_CB-in-inet_exact_dif.patch @@ -0,0 +1,41 @@ 
+From: David Ahern +Date: Sun, 3 Dec 2017 09:33:00 -0800 +Subject: tcp: use IPCB instead of TCP_SKB_CB in inet_exact_dif_match() +Patch-mainline: v4.15-rc3 +Git-commit: b4d1605a8ea608fd7dc45b926a05d75d340bde4b +References: bsc#1076830 + +After this fix : ("tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()"), +socket lookups happen while skb->cb[] has not been mangled yet by TCP. + +Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") +Signed-off-by: David Ahern +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/tcp.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index d70e5aa0d98f..82e9251323cc 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -846,12 +846,11 @@ static inline int tcp_v6_iif(const struct sk_buff *skb) + } + #endif + +-/* TCP_SKB_CB reference means this can not be used from early demux */ + static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) + { + #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_tcp_l3mdev_accept && +- skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) ++ skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) + return true; + #endif + return false; +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-current-time-in-tcp_rcv_space_adjust.patch b/patches.fixes/tcp-use-current-time-in-tcp_rcv_space_adjust.patch new file mode 100644 index 0000000..ac2f719 --- /dev/null +++ b/patches.fixes/tcp-use-current-time-in-tcp_rcv_space_adjust.patch @@ -0,0 +1,40 @@ +From: Eric Dumazet +Date: Wed, 6 Dec 2017 11:08:19 -0800 +Subject: tcp: use current time in tcp_rcv_space_adjust() +Patch-mainline: v4.15-rc3 +Git-commit: 8632385022f2b05a6ca0b9e0f95575865de0e2ce +References: bsc#1076830 + +When I switched rcv_rtt_est to high resolution timestamps, I forgot +that tp->tcp_mstamp needed to be refreshed in tcp_rcv_space_adjust() + +Using an old timestamp leads 
to autotuning lags. + +Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") +Signed-off-by: Eric Dumazet +Cc: Wei Wang +Cc: Neal Cardwell +Cc: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_input.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index bfbccb2a02f3..36d5c5e3c8dc 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -593,6 +593,7 @@ void tcp_rcv_space_adjust(struct sock *sk) + int time; + int copied; + ++ tcp_mstamp_refresh(tp); + time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) + return; +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-tcp_jiffies32-for-rcv_tstamp-and-lrcvtime.patch b/patches.fixes/tcp-use-tcp_jiffies32-for-rcv_tstamp-and-lrcvtime.patch new file mode 100644 index 0000000..297eba5 --- /dev/null +++ b/patches.fixes/tcp-use-tcp_jiffies32-for-rcv_tstamp-and-lrcvtime.patch @@ -0,0 +1,111 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:07 -0700 +Subject: tcp: use tcp_jiffies32 for rcv_tstamp and lrcvtime +Patch-mainline: v4.13-rc1 +Git-commit: 70eabf0e1b8fe11519f793416655266605f700b9 +References: bsc#1061739 + +Use tcp_jiffies32 instead of tcp_time_stamp, since +tcp_time_stamp will soon be only used for TCP TS option. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + include/net/tcp.h | 4 ++-- + net/ipv4/tcp_input.c | 6 +++--- + net/ipv4/tcp_minisocks.c | 2 +- + net/ipv4/tcp_output.c | 2 +- + net/ipv4/tcp_timer.c | 2 +- + 5 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index fbbbae44138f..517975b11ae0 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1295,8 +1295,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) + { + const struct inet_connection_sock *icsk = &tp->inet_conn; + +- return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime, +- tcp_time_stamp - tp->rcv_tstamp); ++ return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, ++ tcp_jiffies32 - tp->rcv_tstamp); + } + + static inline int tcp_fin_time(const struct sock *sk) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index dd7afa580c38..edd37690326b 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -673,7 +673,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) + + tcp_rcv_rtt_measure(tp); + +- now = tcp_time_stamp; ++ now = tcp_jiffies32; + + if (!icsk->icsk_ack.ato) { + /* The _first_ data packet received, initialize +@@ -3637,7 +3637,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + */ + sk->sk_err_soft = 0; + icsk->icsk_probes_out = 0; +- tp->rcv_tstamp = tcp_time_stamp; ++ tp->rcv_tstamp = tcp_jiffies32; + if (!prior_packets) + goto no_queue; + +@@ -5491,7 +5491,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); +- icsk->icsk_ack.lrcvtime = tcp_time_stamp; ++ icsk->icsk_ack.lrcvtime = tcp_jiffies32; + + if (skb) { + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 8d799e68ec55..4fcae3ef6894 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -446,7 +446,7 
@@ struct sock *tcp_create_openreq_child(const struct sock *sk, + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; +- newicsk->icsk_ack.lrcvtime = tcp_time_stamp; ++ newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + newtp->packets_out = 0; + newtp->retrans_out = 0; +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index a833c6014cbc..60a6a0d28b66 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -3324,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk) + if (likely(!tp->repair)) + tp->rcv_nxt = 0; + else +- tp->rcv_tstamp = tcp_time_stamp; ++ tp->rcv_tstamp = tcp_jiffies32; + tp->rcv_wup = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index b0a471e1718b..bb1857a8bde2 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -439,7 +439,7 @@ void tcp_retransmit_timer(struct sock *sk) + tp->snd_una, tp->snd_nxt); + } + #endif +- if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { ++ if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(sk); + goto out; + } +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-tcp_jiffies32-in-__tcp_oow_rate_limited.patch b/patches.fixes/tcp-use-tcp_jiffies32-in-__tcp_oow_rate_limited.patch new file mode 100644 index 0000000..80a1ea8 --- /dev/null +++ b/patches.fixes/tcp-use-tcp_jiffies32-in-__tcp_oow_rate_limited.patch @@ -0,0 +1,43 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:10 -0700 +Subject: tcp: use tcp_jiffies32 in __tcp_oow_rate_limited() +Patch-mainline: v4.13-rc1 +Git-commit: 594208afe40c448faca967235691ec04fe9f57e3 +References: bsc#1061739 + +This place wants to use tcp_jiffies32, this is good enough. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_input.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index edd37690326b..79873ace320e 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3390,7 +3390,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) + { + if (*last_oow_ack_time) { +- s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); ++ s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS(net, mib_idx); +@@ -3398,7 +3398,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + } + } + +- *last_oow_ack_time = tcp_time_stamp; ++ *last_oow_ack_time = tcp_jiffies32; + + return false; /* not rate-limited: go ahead, send dupack now! */ + } +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-tcp_jiffies32-to-feed-probe_timestamp.patch b/patches.fixes/tcp-use-tcp_jiffies32-to-feed-probe_timestamp.patch new file mode 100644 index 0000000..8fe1042 --- /dev/null +++ b/patches.fixes/tcp-use-tcp_jiffies32-to-feed-probe_timestamp.patch @@ -0,0 +1,67 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:08 -0700 +Subject: tcp: use tcp_jiffies32 to feed probe_timestamp +Patch-mainline: v4.13-rc1 +Git-commit: c74df29a8d119a09ccc5e50265e3383c76278f3d +References: bsc#1061739 + +Use tcp_jiffies32 instead of tcp_time_stamp, since +tcp_time_stamp will soon be only used for TCP TS option. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_output.c | 6 +++--- + net/ipv4/tcp_timer.c | 2 +- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 60a6a0d28b66..f8139a834414 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1475,7 +1475,7 @@ void tcp_mtup_init(struct sock *sk) + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); + icsk->icsk_mtup.probe_size = 0; + if (icsk->icsk_mtup.enabled) +- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; ++ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; + } + EXPORT_SYMBOL(tcp_mtup_init); + +@@ -1987,7 +1987,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) + s32 delta; + + interval = net->ipv4.sysctl_tcp_probe_interval; +- delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; ++ delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; + if (unlikely(delta >= interval * HZ)) { + int mss = tcp_current_mss(sk); + +@@ -1999,7 +1999,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + + /* Update probe time stamp */ +- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; ++ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; + } + } + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index bb1857a8bde2..5e4041be61b1 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -115,7 +115,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) + if (net->ipv4.sysctl_tcp_mtu_probing) { + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; +- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; ++ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + struct net *net = sock_net(sk); +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-lsndtime.patch 
b/patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-lsndtime.patch new file mode 100644 index 0000000..dc211dd --- /dev/null +++ b/patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-lsndtime.patch @@ -0,0 +1,135 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:03 -0700 +Subject: tcp: use tcp_jiffies32 to feed tp->lsndtime +Patch-mainline: v4.13-rc1 +Git-commit: d635fbe27ebee0f4b845abe5e9620c9400785a5c +References: bsc#1061739 + +Use tcp_jiffies32 instead of tcp_time_stamp to feed +tp->lsndtime. + +tcp_time_stamp will soon be a litle bit more expensive +than simply reading 'jiffies'. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + include/net/tcp.h | 2 +- + net/ipv4/tcp.c | 2 +- + net/ipv4/tcp_cubic.c | 2 +- + net/ipv4/tcp_input.c | 4 ++-- + net/ipv4/tcp_output.c | 4 ++-- + net/ipv4/tcp_timer.c | 4 ++-- + 6 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 297a2123cabf..fbbbae44138f 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1233,7 +1233,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || + ca_ops->cong_control) + return; +- delta = tcp_time_stamp - tp->lsndtime; ++ delta = tcp_jiffies32 - tp->lsndtime; + if (delta > inet_csk(sk)->icsk_rto) + tcp_cwnd_restart(sk, delta); + } +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index bd494a6d1597..451fe49a83d9 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2747,7 +2747,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_retrans = tp->retrans_out; + info->tcpi_fackets = tp->fackets_out; + +- now = tcp_time_stamp; ++ now = tcp_jiffies32; + info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); + info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); +diff 
--git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c +index 0683ba447d77..2052ca740916 100644 +--- a/net/ipv4/tcp_cubic.c ++++ b/net/ipv4/tcp_cubic.c +@@ -155,7 +155,7 @@ static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) + { + if (event == CA_EVENT_TX_START) { + struct bictcp *ca = inet_csk_ca(sk); +- u32 now = tcp_time_stamp; ++ u32 now = tcp_jiffies32; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 126d545a2542..a6ff82c6a933 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5508,7 +5508,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ +- tp->lsndtime = tcp_time_stamp; ++ tp->lsndtime = tcp_jiffies32; + + tcp_init_buffer_space(sk); + +@@ -5949,7 +5949,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + tcp_update_pacing_rate(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data packet */ +- tp->lsndtime = tcp_time_stamp; ++ tp->lsndtime = tcp_jiffies32; + + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 97ec0a50aa73..e3d1d04ea5fc 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -160,7 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, + struct sock *sk) + { + struct inet_connection_sock *icsk = inet_csk(sk); +- const u32 now = tcp_time_stamp; ++ const u32 now = tcp_jiffies32; + + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); +@@ -1918,7 +1918,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + /* Avoid bursty behavior by allowing defer + * only if the last write was recent. 
+ */ +- if ((s32)(tcp_time_stamp - tp->lsndtime) > 0) ++ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) + goto send_now; + + in_flight = tcp_packets_in_flight(tp); +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 4dd6a6c3e7fc..b0a471e1718b 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -63,7 +63,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ +- if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) ++ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + shift++; + + /* If some dubious ICMP arrived, penalize even more. */ +@@ -73,7 +73,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) + if (tcp_check_oom(sk, shift)) { + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ +- if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || ++ if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + /* 2. Window is closed. */ + (!tp->snd_wnd && !tp->packets_out)) + do_reset = true; +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-snd_cwnd_stamp.patch b/patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-snd_cwnd_stamp.patch new file mode 100644 index 0000000..98e0a83 --- /dev/null +++ b/patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-snd_cwnd_stamp.patch @@ -0,0 +1,146 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:04 -0700 +Subject: tcp: use tcp_jiffies32 to feed tp->snd_cwnd_stamp +Patch-mainline: v4.13-rc1 +Git-commit: c2203cf75ed7dfab8dfc7ac915a726880ee7512f +References: bsc#1061739 + +Use tcp_jiffies32 instead of tcp_time_stamp to feed +tp->snd_cwnd_stamp. + +tcp_time_stamp will soon be a little bit more expensive +than simply reading 'jiffies'. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S.
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_input.c | 14 +++++++------- + net/ipv4/tcp_metrics.c | 2 +- + net/ipv4/tcp_output.c | 8 ++++---- + 3 files changed, 12 insertions(+), 12 deletions(-) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index a6ff82c6a933..dd7afa580c38 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -464,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk) + tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + } + + /* 5. Recalculate window clamp after socket hit its memory bounds. */ +@@ -1955,7 +1955,7 @@ void tcp_enter_loss(struct sock *sk) + } + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + + tp->retrans_out = 0; + tp->lost_out = 0; +@@ -2384,7 +2384,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) + tcp_ecn_withdraw_cwr(tp); + } + } +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + tp->undo_marker = 0; + } + +@@ -2521,7 +2521,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { + tp->snd_cwnd = tp->snd_ssthresh; +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + } + tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); + } +@@ -2591,7 +2591,7 @@ static void tcp_mtup_probe_success(struct sock *sk) + tcp_mss_to_mtu(sk, tp->mss_cache) / + icsk->icsk_mtup.probe_size; + tp->snd_cwnd_cnt = 0; +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + + icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; +@@ -2977,7 +2977,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) + const 
struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); +- tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; ++ tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; + } + + /* Restart timer after forward progress on connection. +@@ -5003,7 +5003,7 @@ static void tcp_new_space(struct sock *sk) + + if (tcp_should_expand_sndbuf(sk)) { + tcp_sndbuf_expand(sk); +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + } + + sk->sk_write_space(sk); +diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c +index 653bbd67e3a3..102b2c90bb80 100644 +--- a/net/ipv4/tcp_metrics.c ++++ b/net/ipv4/tcp_metrics.c +@@ -524,7 +524,7 @@ void tcp_init_metrics(struct sock *sk) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + } + + bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index e3d1d04ea5fc..a833c6014cbc 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -151,7 +151,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) + while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) + cwnd >>= 1; + tp->snd_cwnd = max(cwnd, restart_cwnd); +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + tp->snd_cwnd_used = 0; + } + +@@ -1576,7 +1576,7 @@ static void tcp_cwnd_application_limited(struct sock *sk) + } + tp->snd_cwnd_used = 0; + } +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + } + + static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) +@@ -1597,14 +1597,14 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) + if (tcp_is_cwnd_limited(sk)) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; +- tp->snd_cwnd_stamp = tcp_time_stamp; ++ tp->snd_cwnd_stamp = tcp_jiffies32; + } else { + /* Network starves. 
*/ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if (sysctl_tcp_slow_start_after_idle && +- (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && ++ (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && + !ca_ops->cong_control) + tcp_cwnd_application_limited(sk); + +-- +2.15.1 + diff --git a/patches.fixes/tcp-use-tp-tcp_mstamp-in-output-path.patch b/patches.fixes/tcp-use-tp-tcp_mstamp-in-output-path.patch new file mode 100644 index 0000000..adbbd6e --- /dev/null +++ b/patches.fixes/tcp-use-tp-tcp_mstamp-in-output-path.patch @@ -0,0 +1,177 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:00 -0700 +Subject: tcp: use tp->tcp_mstamp in output path +Patch-mainline: v4.13-rc1 +Git-commit: 385e20706facd376f27863bd55b7cc7720d3f27b +References: bsc#1061739 + +Idea is to later convert tp->tcp_mstamp to a full u64 counter +using usec resolution, so that we can later have fine +grained TCP TS clock (RFC 7323), regardless of HZ value. + +We try to refresh tp->tcp_mstamp only when necessary. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_ipv4.c | 1 + + net/ipv4/tcp_output.c | 21 +++++++++++---------- + net/ipv4/tcp_recovery.c | 1 - + net/ipv4/tcp_timer.c | 3 ++- + 4 files changed, 14 insertions(+), 12 deletions(-) + +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 12f1e36342a6..2946ada75fa3 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -481,6 +481,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + skb = tcp_write_queue_head(sk); + BUG_ON(!skb); + ++ skb_mstamp_get(&tp->tcp_mstamp); + remaining = icsk->icsk_rto - + min(icsk->icsk_rto, + tcp_time_stamp - tcp_skb_timestamp(skb)); +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 0aa68ef26eb1..97ec0a50aa73 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -997,8 +997,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + BUG_ON(!skb || !tcp_skb_pcount(skb)); + tp = tcp_sk(sk); + ++ skb->skb_mstamp = tp->tcp_mstamp; + if (clone_it) { +- skb_mstamp_get(&skb->skb_mstamp); + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; + tcp_rate_skb_sent(sk, skb); +@@ -1906,7 +1906,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + const struct inet_connection_sock *icsk = inet_csk(sk); + u32 age, send_win, cong_win, limit, in_flight; + struct tcp_sock *tp = tcp_sk(sk); +- struct skb_mstamp now; + struct sk_buff *head; + int win_divisor; + +@@ -1962,8 +1961,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + } + + head = tcp_write_queue_head(sk); +- skb_mstamp_get(&now); +- age = skb_mstamp_us_delta(&now, &head->skb_mstamp); ++ ++ age = skb_mstamp_us_delta(&tp->tcp_mstamp, &head->skb_mstamp); + /* If next ACK is likely to come too late (half srtt), do not defer */ + if (age < (tp->srtt_us >> 4)) + goto send_now; +@@ -2280,6 +2279,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + } + + max_segs = 
tcp_tso_segs(sk, mss_now); ++ skb_mstamp_get(&tp->tcp_mstamp); + while ((skb = tcp_send_head(sk))) { + unsigned int limit; + +@@ -2291,7 +2291,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { + /* "skb_mstamp" is used as a start point for the retransmit timer */ +- skb_mstamp_get(&skb->skb_mstamp); ++ skb->skb_mstamp = tp->tcp_mstamp; + goto repair; /* Skip network transmission */ + } + +@@ -2879,7 +2879,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) + skb_headroom(skb) >= 0xFFFF)) { + struct sk_buff *nskb; + +- skb_mstamp_get(&skb->skb_mstamp); ++ skb->skb_mstamp = tp->tcp_mstamp; + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; +@@ -3095,7 +3095,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) + skb_reserve(skb, MAX_TCP_HEADER); + tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + TCPHDR_ACK | TCPHDR_RST); +- skb_mstamp_get(&skb->skb_mstamp); ++ skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + /* Send it off. */ + if (tcp_transmit_skb(sk, skb, 0, priority)) + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); +@@ -3456,7 +3456,8 @@ int tcp_connect(struct sock *sk) + return -ENOBUFS; + + tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); +- tp->retrans_stamp = tcp_time_stamp; ++ skb_mstamp_get(&tp->tcp_mstamp); ++ tp->retrans_stamp = tp->tcp_mstamp.stamp_jiffies; + tcp_connect_queue_skb(sk, buff); + tcp_ecn_send_syn(sk, buff); + +@@ -3575,7 +3576,6 @@ void tcp_send_ack(struct sock *sk) + skb_set_tcp_pure_ack(buff); + + /* Send it off, this clears delayed acks for us. */ +- skb_mstamp_get(&buff->skb_mstamp); + tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); + } + EXPORT_SYMBOL_GPL(tcp_send_ack); +@@ -3609,15 +3609,16 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) + * send it. 
+ */ + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); +- skb_mstamp_get(&skb->skb_mstamp); + NET_INC_STATS(sock_net(sk), mib); + return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); + } + ++/* Called from setsockopt( ... TCP_REPAIR ) */ + void tcp_send_window_probe(struct sock *sk) + { + if (sk->sk_state == TCP_ESTABLISHED) { + tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; ++ skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); + } + } +diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c +index 362b8c75bfab..cd72b3d3879e 100644 +--- a/net/ipv4/tcp_recovery.c ++++ b/net/ipv4/tcp_recovery.c +@@ -166,7 +166,6 @@ void tcp_rack_reo_timeout(struct sock *sk) + u32 timeout, prior_inflight; + + prior_inflight = tcp_packets_in_flight(tp); +- skb_mstamp_get(&tp->tcp_mstamp); + tcp_rack_detect_loss(sk, &timeout); + if (prior_inflight != tcp_packets_in_flight(tp)) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 02d271f24af8..4dd6a6c3e7fc 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -327,7 +327,7 @@ static void tcp_probe_timer(struct sock *sk) + */ + start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + if (!start_ts) +- skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); ++ tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + else if (icsk->icsk_user_timeout && + (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + goto abort; +@@ -549,6 +549,7 @@ void tcp_write_timer_handler(struct sock *sk) + goto out; + } + ++ skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + event = icsk->icsk_pending; + + switch (event) { +-- +2.15.1 + diff --git a/patches.fixes/tcp-uses-jiffies_32-to-feed-tp-chrono_start.patch b/patches.fixes/tcp-uses-jiffies_32-to-feed-tp-chrono_start.patch new file mode 100644 index 0000000..c68f156 --- /dev/null +++ b/patches.fixes/tcp-uses-jiffies_32-to-feed-tp-chrono_start.patch @@ -0,0 +1,48 @@ +From: Eric 
Dumazet +Date: Tue, 16 May 2017 14:00:09 -0700 +Subject: tcp: uses jiffies_32 to feed tp->chrono_start +Patch-mainline: v4.13-rc1 +Git-commit: 628174ccc45f648b83374d0a5bd554b0a88522ce +References: bsc#1061739 + +tcp_time_stamp will no longer be tied to jiffies. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp.c | 2 +- + net/ipv4/tcp_output.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 451fe49a83d9..d0725dc266d1 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2663,7 +2663,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { + stats[i] = tp->chrono_stat[i - 1]; + if (i == tp->chrono_type) +- stats[i] += tcp_time_stamp - tp->chrono_start; ++ stats[i] += tcp_jiffies32 - tp->chrono_start; + stats[i] *= USEC_PER_SEC / HZ; + total += stats[i]; + } +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index f8139a834414..74a37e14a00c 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2202,7 +2202,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, + + static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) + { +- const u32 now = tcp_time_stamp; ++ const u32 now = tcp_jiffies32; + + if (tp->chrono_type > TCP_CHRONO_UNSPEC) + tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; +-- +2.15.1 + diff --git a/patches.fixes/tcp_bbr-use-tcp_jiffies32-instead-of-tcp_time_stamp.patch b/patches.fixes/tcp_bbr-use-tcp_jiffies32-instead-of-tcp_time_stamp.patch new file mode 100644 index 0000000..c6915da --- /dev/null +++ b/patches.fixes/tcp_bbr-use-tcp_jiffies32-instead-of-tcp_time_stamp.patch @@ -0,0 +1,70 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:05 -0700 +Subject: tcp_bbr: use tcp_jiffies32 instead of tcp_time_stamp +Patch-mainline: v4.13-rc1 
+Git-commit: 2660bfa84e9236016f3a4f21b7864431d9663338 +References: bsc#1061739 + +Use tcp_jiffies32 instead of tcp_time_stamp, since +tcp_time_stamp will soon be only used for TCP TS option. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_bbr.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index d9eae8104c2c..816e6a1f6e9e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -761,12 +761,12 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + bool filter_expired; + + /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_time_stamp, ++ filter_expired = after(tcp_jiffies32, + bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_time_stamp; ++ bbr->min_rtt_stamp = tcp_jiffies32; + } + + if (bbr_probe_rtt_mode_ms > 0 && filter_expired && +@@ -785,7 +785,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { +- bbr->probe_rtt_done_stamp = tcp_time_stamp + ++ bbr->probe_rtt_done_stamp = tcp_jiffies32 + + msecs_to_jiffies(bbr_probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; +@@ -793,8 +793,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done && +- after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { +- bbr->min_rtt_stamp = tcp_time_stamp; ++ after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { ++ bbr->min_rtt_stamp = tcp_jiffies32; + bbr->restore_cwnd = 1; /* snap to prior_cwnd */ + bbr_reset_mode(sk); + } +@@ -840,7 +840,7 @@ static void bbr_init(struct sock *sk) + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_time_stamp; ++ bbr->min_rtt_stamp = tcp_jiffies32; + + minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + +-- +2.15.1 + diff --git a/patches.fixes/tcp_lp-cache-tcp_time_stamp.patch b/patches.fixes/tcp_lp-cache-tcp_time_stamp.patch new file mode 100644 index 0000000..4327b3b --- /dev/null +++ b/patches.fixes/tcp_lp-cache-tcp_time_stamp.patch @@ -0,0 +1,57 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:12 -0700 +Subject: tcp_lp: cache tcp_time_stamp +Patch-mainline: v4.13-rc1 +Git-commit: 46bf466f08c9db0db1b77d3ecb5694926c73583a +References: bsc#1061739 + +tcp_time_stamp will become slightly more expensive soon, +cache its value. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_lp.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c +index d6fb6c067af4..ef3122abb373 100644 +--- a/net/ipv4/tcp_lp.c ++++ b/net/ipv4/tcp_lp.c +@@ -264,18 +264,19 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) + { + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); ++ u32 now = tcp_time_stamp; + u32 delta; + + if (sample->rtt_us > 0) + tcp_lp_rtt_sample(sk, sample->rtt_us); + + /* calc inference */ +- delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr; ++ delta = now - tp->rx_opt.rcv_tsecr; + if ((s32)delta > 0) + lp->inference = 3 * delta; + + /* test if within inference */ +- if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) ++ if (lp->last_drop && (now - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; +@@ -312,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ +- lp->last_drop = tcp_time_stamp; ++ lp->last_drop = now; + } + + static struct tcp_congestion_ops tcp_lp __read_mostly = { +-- +2.15.1 + diff --git a/patches.fixes/tcp_westwood-use-tcp_jiffies32-instead-of-tcp_time_s.patch b/patches.fixes/tcp_westwood-use-tcp_jiffies32-instead-of-tcp_time_s.patch new file mode 100644 index 0000000..58a5a98 --- /dev/null +++ b/patches.fixes/tcp_westwood-use-tcp_jiffies32-instead-of-tcp_time_s.patch @@ -0,0 +1,53 @@ +From: Eric Dumazet +Date: Tue, 16 May 2017 14:00:11 -0700 +Subject: tcp_westwood: use tcp_jiffies32 instead of tcp_time_stamp +Patch-mainline: v4.13-rc1 +Git-commit: ad5ad69e6b48a7e5cc0391cc57c9e8a93a0c969c +References: bsc#1061739 + +This CC does not need 1 ms tcp_time_stamp and can use +the jiffy based 'timestamp'. 
+ +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/tcp_westwood.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c +index 9775453b8d17..bec9cafbe3f9 100644 +--- a/net/ipv4/tcp_westwood.c ++++ b/net/ipv4/tcp_westwood.c +@@ -68,7 +68,7 @@ static void tcp_westwood_init(struct sock *sk) + w->cumul_ack = 0; + w->reset_rtt_min = 1; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; +- w->rtt_win_sx = tcp_time_stamp; ++ w->rtt_win_sx = tcp_jiffies32; + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 1; + } +@@ -116,7 +116,7 @@ static void tcp_westwood_pkts_acked(struct sock *sk, + static void westwood_update_window(struct sock *sk) + { + struct westwood *w = inet_csk_ca(sk); +- s32 delta = tcp_time_stamp - w->rtt_win_sx; ++ s32 delta = tcp_jiffies32 - w->rtt_win_sx; + + /* Initialize w->snd_una with the first acked sequence number in order + * to fix mismatch between tp->snd_una and w->snd_una for the first +@@ -140,7 +140,7 @@ static void westwood_update_window(struct sock *sk) + westwood_filter(w, delta); + + w->bk = 0; +- w->rtt_win_sx = tcp_time_stamp; ++ w->rtt_win_sx = tcp_jiffies32; + } + } + +-- +2.15.1 + diff --git a/patches.fixes/udp-fix-bcast-packet-reception.patch b/patches.fixes/udp-fix-bcast-packet-reception.patch new file mode 100644 index 0000000..414a012 --- /dev/null +++ b/patches.fixes/udp-fix-bcast-packet-reception.patch @@ -0,0 +1,62 @@ +From: Paolo Abeni +Date: Mon, 9 Oct 2017 14:52:10 +0200 +Subject: udp: fix bcast packet reception +Patch-mainline: v4.14-rc5 +Git-commit: 996b44fcef8f216ea0b6b6e74468c5a77b5e341f +References: bsc#1076830 + +The commit bc044e8db796 ("udp: perform source validation for +mcast early demux") does not take into account that broadcast packets +lands in the same code path and they need different checks for the +source address - notably, zero source address are 
valid for bcast +and invalid for mcast. + +As a result, 2nd and later broadcast packets with 0 source address +landing to the same socket are dropped. This breaks dhcp servers. + +Since we don't have stringent performance requirements for ingress +broadcast traffic, fix it by disabling UDP early demux for such traffic. + +Reported-by: Hannes Frederic Sowa +Fixes: bc044e8db796 ("udp: perform source validation for mcast early demux") +Signed-off-by: Paolo Abeni +Signed-off-by: David S. Miller +Acked-by: Michal Kubecek + +--- + net/ipv4/udp.c | 14 +++++--------- + 1 file changed, 5 insertions(+), 9 deletions(-) + +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 16fc46096c92..a86c0d84f7a1 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2176,20 +2176,16 @@ int udp_v4_early_demux(struct sk_buff *skb) + iph = ip_hdr(skb); + uh = udp_hdr(skb); + +- if (skb->pkt_type == PACKET_BROADCAST || +- skb->pkt_type == PACKET_MULTICAST) { ++ if (skb->pkt_type == PACKET_MULTICAST) { + in_dev = __in_dev_get_rcu(skb->dev); + + if (!in_dev) + return 0; + +- /* we are supposed to accept bcast packets */ +- if (skb->pkt_type == PACKET_MULTICAST) { +- ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, +- iph->protocol); +- if (!ours) +- return 0; +- } ++ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, ++ iph->protocol); ++ if (!ours) ++ return 0; + + sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, + uh->source, iph->saddr, dif); +-- +2.16.0 + diff --git a/patches.fixes/udp-perform-source-validation-for-mcast-early-demux.patch b/patches.fixes/udp-perform-source-validation-for-mcast-early-demux.patch new file mode 100644 index 0000000..d5e5788 --- /dev/null +++ b/patches.fixes/udp-perform-source-validation-for-mcast-early-demux.patch @@ -0,0 +1,198 @@ +From: Paolo Abeni +Date: Thu, 28 Sep 2017 15:51:37 +0200 +Subject: udp: perform source validation for mcast early demux +Patch-mainline: v4.14-rc4 +Git-commit: bc044e8db7962e727a75b591b9851ff2ac5cf846
+References: bsc#1076830 + +The UDP early demux can leverage the rx dst cache even for +multicast unconnected sockets. + +In such scenario the ipv4 source address is validated only on +the first packet in the given flow. After that, when we fetch +the dst entry from the socket rx cache, we stop enforcing +the rp_filter and we even start accepting any kind of martian +addresses. + +Disabling the dst cache for unconnected multicast socket will +cause large performance regression, nearly reducing by half the +max ingress tput. + +Instead we factor out a route helper to completely validate an +skb source address for multicast packets and we call it from +the UDP early demux for mcast packets landing on unconnected +sockets, after successful fetching the related cached dst entry. + +This still gives a measurable, but limited performance +regression: + + rp_filter = 0 rp_filter = 1 +edmux disabled: 1182 Kpps 1127 Kpps +edmux before: 2238 Kpps 2238 Kpps +edmux after: 2037 Kpps 2019 Kpps + +The above figures are on top of current net tree. +Applying the net-next commit 6e617de84e87 ("net: avoid a full +fib lookup when rp_filter is disabled.") the delta with +rp_filter == 0 will decrease even more. + +Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") +Signed-off-by: Paolo Abeni +Signed-off-by: David S.
Miller +Acked-by: Michal Kubecek + +--- + include/net/route.h | 4 +++- + net/ipv4/route.c | 46 ++++++++++++++++++++++++++-------------------- + net/ipv4/udp.c | 13 ++++++++++++- + 3 files changed, 41 insertions(+), 22 deletions(-) + +diff --git a/include/net/route.h b/include/net/route.h +index b4ce75e0cb6f..51ae143140ea 100644 +--- a/include/net/route.h ++++ b/include/net/route.h +@@ -172,7 +172,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 + fl4->fl4_gre_key = gre_key; + return ip_route_output_key(net, fl4); + } +- ++int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ++ u8 tos, struct net_device *dev, ++ struct in_device *in_dev, u32 *itag); + int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin); + +diff --git a/net/ipv4/route.c b/net/ipv4/route.c +index 5f87e489cf32..3651be58b128 100644 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -1518,43 +1518,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, + EXPORT_SYMBOL(rt_dst_alloc); + + /* called in rcu_read_lock() section */ +-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, +- u8 tos, struct net_device *dev, int our) ++int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ++ u8 tos, struct net_device *dev, ++ struct in_device *in_dev, u32 *itag) + { +- struct rtable *rth; +- struct in_device *in_dev = __in_dev_get_rcu(dev); +- unsigned int flags = RTCF_MULTICAST; +- u32 itag = 0; + int err; + + /* Primary sanity checks. 
*/ +- + if (!in_dev) + return -EINVAL; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + skb->protocol != htons(ETH_P_IP)) +- goto e_inval; ++ return -EINVAL; + + if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) +- goto e_inval; ++ return -EINVAL; + + if (ipv4_is_zeronet(saddr)) { + if (!ipv4_is_local_multicast(daddr)) +- goto e_inval; ++ return -EINVAL; + } else { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, +- in_dev, &itag); ++ in_dev, itag); + if (err < 0) +- goto e_err; ++ return err; + } ++ return 0; ++} ++ ++/* called in rcu_read_lock() section */ ++static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, ++ u8 tos, struct net_device *dev, int our) ++{ ++ struct in_device *in_dev = __in_dev_get_rcu(dev); ++ unsigned int flags = RTCF_MULTICAST; ++ struct rtable *rth; ++ u32 itag = 0; ++ int err; ++ ++ err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); ++ if (err) ++ return err; ++ + if (our) + flags |= RTCF_LOCAL; + + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); + if (!rth) +- goto e_nobufs; ++ return -ENOBUFS; + + #ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +@@ -1570,13 +1583,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + + skb_dst_set(skb, &rth->dst); + return 0; +- +-e_nobufs: +- return -ENOBUFS; +-e_inval: +- return -EINVAL; +-e_err: +- return err; + } + + +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 25c108218b54..16fc46096c92 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2161,6 +2161,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, + int udp_v4_early_demux(struct sk_buff *skb) + { + struct net *net = dev_net(skb->dev); ++ struct in_device *in_dev = NULL; + const struct iphdr *iph; + const struct udphdr *uh; + struct sock *sk = NULL; +@@ -2177,7 +2178,7 @@ int udp_v4_early_demux(struct sk_buff *skb) + + if 
(skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { +- struct in_device *in_dev = __in_dev_get_rcu(skb->dev); ++ in_dev = __in_dev_get_rcu(skb->dev); + + if (!in_dev) + return 0; +@@ -2207,11 +2208,21 @@ int udp_v4_early_demux(struct sk_buff *skb) + if (dst) + dst = dst_check(dst, 0); + if (dst) { ++ u32 itag = 0; ++ + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); ++ ++ /* for unconnected multicast sockets we need to validate ++ * the source on each packet ++ */ ++ if (!inet_sk(sk)->inet_daddr && in_dev) ++ return ip_mc_validate_source(skb, iph->daddr, ++ iph->saddr, iph->tos, ++ skb->dev, in_dev, &itag); + } + return 0; + } +-- +2.16.0 + diff --git a/patches.fixes/udpv6-Fix-the-checksum-computation-when-HW-checksum-.patch b/patches.fixes/udpv6-Fix-the-checksum-computation-when-HW-checksum-.patch new file mode 100644 index 0000000..fff6fb3 --- /dev/null +++ b/patches.fixes/udpv6-Fix-the-checksum-computation-when-HW-checksum-.patch @@ -0,0 +1,39 @@ +From: Subash Abhinov Kasiviswanathan +Date: Wed, 13 Sep 2017 19:30:51 -0600 +Subject: udpv6: Fix the checksum computation when HW checksum does not apply +Patch-mainline: v4.14-rc2 +Git-commit: 63ecc3d9436f8012e49dc846d6cb0a85a3433517 +References: bsc#1076830 + +While trying an ESP transport mode encryption for UDPv6 packets of +datagram size 1436 with MTU 1500, checksum error was observed in +the secondary fragment. + +This error occurs due to the UDP payload checksum being missed out +when computing the full checksum for these packets in +udp6_hwcsum_outgoing(). + +Fixes: d39d938c8228 ("ipv6: Introduce udpv6_send_skb()") +Signed-off-by: Subash Abhinov Kasiviswanathan +Signed-off-by: David S. 
Miller +Acked-by: Michal Kubecek + +--- + net/ipv6/udp.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index cf5f5bd1742c..f722d9fedb44 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -996,6 +996,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, + */ + offset = skb_transport_offset(skb); + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); ++ csum = skb->csum; + + skb->ip_summed = CHECKSUM_NONE; + +-- +2.15.1 + diff --git a/patches.fixes/vti-fix-NULL-dereference-in-xfrm_input.patch b/patches.fixes/vti-fix-NULL-dereference-in-xfrm_input.patch new file mode 100644 index 0000000..0fe35bb --- /dev/null +++ b/patches.fixes/vti-fix-NULL-dereference-in-xfrm_input.patch @@ -0,0 +1,79 @@ +From: Alexey Kodanev +Date: Tue, 12 Sep 2017 14:53:46 +0300 +Subject: vti: fix NULL dereference in xfrm_input() +Patch-mainline: v4.14-rc5 +Git-commit: 23e9fcfef1f3d10675acce023592796851bcaf1a +References: bsc#1076830 + +Can be reproduced with LTP tests: + # icmp-uni-vti.sh -p ah -a sha256 -m tunnel -S fffffffe -k 1 -s 10 + +IPv4: + RIP: 0010:xfrm_input+0x7f9/0x870 + ... + Call Trace: + + vti_input+0xaa/0x110 [ip_vti] + ? skb_free_head+0x21/0x40 + vti_rcv+0x33/0x40 [ip_vti] + xfrm4_ah_rcv+0x33/0x60 + ip_local_deliver_finish+0x94/0x1e0 + ip_local_deliver+0x6f/0xe0 + ? ip_route_input_noref+0x28/0x50 + ... + + # icmp-uni-vti.sh -6 -p ah -a sha256 -m tunnel -S fffffffe -k 1 -s 10 +IPv6: + RIP: 0010:xfrm_input+0x7f9/0x870 + ... + Call Trace: + + xfrm6_rcv_tnl+0x3c/0x40 + vti6_rcv+0xd5/0xe0 [ip6_vti] + xfrm6_ah_rcv+0x33/0x60 + ip6_input_finish+0xee/0x460 + ip6_input+0x3f/0xb0 + ip6_rcv_finish+0x45/0xa0 + ipv6_rcv+0x34b/0x540 + +xfrm_input() invokes xfrm_rcv_cb() -> vti_rcv_cb(), the last callback +might call skb_scrub_packet(), which in turn can reset secpath. + +Fix it by adding a check that skb->sp is not NULL. 
+ +Fixes: 7e9e9202bccc ("xfrm: Clear RX SKB secpath xfrm_offload") +Signed-off-by: Alexey Kodanev +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_input.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index 43e002347618..b1acfcb9edf7 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -450,7 +450,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + nf_reset(skb); + + if (decaps) { +- skb->sp->olen = 0; ++ if (skb->sp) ++ skb->sp->olen = 0; + skb_dst_drop(skb); + gro_cells_receive(&gro_cells, skb); + return 0; +@@ -461,7 +462,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + + err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); + if (xfrm_gro) { +- skb->sp->olen = 0; ++ if (skb->sp) ++ skb->sp->olen = 0; + skb_dst_drop(skb); + gro_cells_receive(&gro_cells, skb); + return err; +-- +2.16.0 + diff --git a/patches.fixes/xfrm-Clear-RX-SKB-secpath-xfrm_offload.patch b/patches.fixes/xfrm-Clear-RX-SKB-secpath-xfrm_offload.patch new file mode 100644 index 0000000..fe25d96 --- /dev/null +++ b/patches.fixes/xfrm-Clear-RX-SKB-secpath-xfrm_offload.patch @@ -0,0 +1,50 @@ +From: Ilan Tayari +Date: Tue, 1 Aug 2017 12:49:09 +0300 +Subject: xfrm: Clear RX SKB secpath xfrm_offload +Patch-mainline: v4.14-rc1 +Git-commit: 7e9e9202bccc3a8224ae10ad5d69cac8627f9c7b +References: bsc#1076830 + +If an incoming packet undergoes XFRM crypto-offload, its secpath is +filled with xfrm_offload struct denoting offload information. + +If the SKB is then forwarded to a device which supports crypto- +offload, the stack wrongfully attempts to offload it (even though +the output SA may not exist on the device) due to the leftover +secpath xo. + +Clear the ingress xo by zeroizing secpath->olen just before +delivering the decapsulated packet to the network stack. 
+ +Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") +Signed-off-by: Ilan Tayari +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_input.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index f2f7ad03cc3d..fc40de07fcc9 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -434,6 +434,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + nf_reset(skb); + + if (decaps) { ++ skb->sp->olen = 0; + skb_dst_drop(skb); + gro_cells_receive(&gro_cells, skb); + return 0; +@@ -444,6 +445,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + + err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); + if (xfrm_gro) { ++ skb->sp->olen = 0; + skb_dst_drop(skb); + gro_cells_receive(&gro_cells, skb); + return err; +-- +2.15.1 + diff --git a/patches.fixes/xfrm-Clear-sk_dst_cache-when-applying-per-socket-pol.patch b/patches.fixes/xfrm-Clear-sk_dst_cache-when-applying-per-socket-pol.patch new file mode 100644 index 0000000..25fc0de --- /dev/null +++ b/patches.fixes/xfrm-Clear-sk_dst_cache-when-applying-per-socket-pol.patch @@ -0,0 +1,57 @@ +From: Jonathan Basseri +Date: Wed, 25 Oct 2017 09:52:27 -0700 +Subject: xfrm: Clear sk_dst_cache when applying per-socket policy. +Patch-mainline: v4.14-rc8 +Git-commit: 2b06cdf3e688b98fcc9945873b5d42792bd4eee0 +References: bsc#1076830 + +If a socket has a valid dst cache, then xfrm_lookup_route will get +skipped. However, the cache is not invalidated when applying policy to a +socket (i.e. IPV6_XFRM_POLICY). The result is that new policies are +sometimes ignored on those sockets. (Note: This was broken for IPv4 and +IPv6 at different times.) + +This can be demonstrated like so, +1. Create UDP socket. +2. connect() the socket. +3. Apply an outbound XFRM policy to the socket. (setsockopt) +4. send() data on the socket. 
+ +Packets will continue to be sent in the clear instead of matching an +xfrm or returning a no-match error (EAGAIN). This affects calls to +send() and not sendto(). + +Invalidating the sk_dst_cache is necessary to correctly apply xfrm +policies. Since we do this in xfrm_user_policy(), the sk_lock was +already acquired in either do_ip_setsockopt() or do_ipv6_setsockopt(), +and we may call __sk_dst_reset(). + +Performance impact should be negligible, since this code is only called +when changing xfrm policy, and only affects the socket in question. + +Fixes: 00bc0ef5880d ("ipv6: Skip XFRM lookup if dst_entry in socket cache is valid") +Tested: https://android-review.googlesource.com/517555 +Tested: https://android-review.googlesource.com/418659 +Signed-off-by: Jonathan Basseri +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_state.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index 2e291bc5f1fc..0d18f19492d1 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -2046,6 +2046,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen + if (err >= 0) { + xfrm_sk_policy_insert(sk, err, pol); + xfrm_pol_put(pol); ++ __sk_dst_reset(sk); + err = 0; + } + +-- +2.15.1 + diff --git a/patches.fixes/xfrm-Fix-GSO-for-IPsec-with-GRE-tunnel.patch b/patches.fixes/xfrm-Fix-GSO-for-IPsec-with-GRE-tunnel.patch new file mode 100644 index 0000000..9a3345a --- /dev/null +++ b/patches.fixes/xfrm-Fix-GSO-for-IPsec-with-GRE-tunnel.patch @@ -0,0 +1,48 @@ +From: Steffen Klassert +Date: Mon, 30 Oct 2017 10:04:04 +0100 +Subject: xfrm: Fix GSO for IPsec with GRE tunnel. +Patch-mainline: v4.14-rc8 +Git-commit: 73b9fc49b4c0116a04eda3979f64ed9b540b153c +References: bsc#1076830 + +We reset the encapsulation field of the skb too early +in xfrm_output. As a result, the GRE GSO handler does +not segment the packets. This leads to a performance +drop down. 
We fix this by resetting the encapsulation +field right before we do the transformation, when +the inner headers become invalid. + +Fixes: f1bd7d659ef0 ("xfrm: Add encapsulation header offsets while SKB is not encrypted") +Reported-by: Vicente De Luca +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_output.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c +index 8c0b6722aaa8..626424645030 100644 +--- a/net/xfrm/xfrm_output.c ++++ b/net/xfrm/xfrm_output.c +@@ -102,6 +102,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) + if (xfrm_offload(skb)) { + x->type_offload->encap(x, skb); + } else { ++ /* Inner headers are invalid now. */ ++ skb->encapsulation = 0; ++ + err = x->type->output(x, skb); + if (err == -EINPROGRESS) + goto out; +@@ -205,7 +208,6 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) + int err; + + secpath_reset(skb); +- skb->encapsulation = 0; + + if (xfrm_dev_offload_ok(skb, x)) { + struct sec_path *sp; +-- +2.15.1 + diff --git a/patches.fixes/xfrm-Fix-negative-device-refcount-on-offload-failure.patch b/patches.fixes/xfrm-Fix-negative-device-refcount-on-offload-failure.patch new file mode 100644 index 0000000..0a12777 --- /dev/null +++ b/patches.fixes/xfrm-Fix-negative-device-refcount-on-offload-failure.patch @@ -0,0 +1,35 @@ +From: Steffen Klassert +Date: Mon, 4 Sep 2017 10:59:55 +0200 +Subject: xfrm: Fix negative device refcount on offload failure. +Patch-mainline: v4.14-rc5 +Git-commit: 67a63387b1417b5954eedb15f638f1f0bee3da49 +References: bsc#1076830 + +Reset the offload device at the xfrm_state if the device was +not able to offload the state. Otherwise we drop the device +refcount twice. 
+ +Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") +Reported-by: Shannon Nelson +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_device.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c +index 5aba03685d7d..0e6ca5f9ac01 100644 +--- a/net/xfrm/xfrm_device.c ++++ b/net/xfrm/xfrm_device.c +@@ -90,6 +90,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + } + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { ++ xso->dev = NULL; + dev_put(dev); + return 0; + } +-- +2.15.1 + diff --git a/patches.fixes/xfrm-Fix-xfrm_input-to-verify-state-is-valid-when-en.patch b/patches.fixes/xfrm-Fix-xfrm_input-to-verify-state-is-valid-when-en.patch new file mode 100644 index 0000000..a9c330f --- /dev/null +++ b/patches.fixes/xfrm-Fix-xfrm_input-to-verify-state-is-valid-when-en.patch @@ -0,0 +1,56 @@ +From: Aviv Heller +Date: Tue, 28 Nov 2017 19:55:40 +0200 +Subject: xfrm: Fix xfrm_input() to verify state is valid when (encap_type < 0) +Patch-mainline: v4.15-rc6 +Git-commit: 4ce3dbe397d7b6b15f272ae757c78c35e9e4b61d +References: bsc#1076830 + +Code path when (encap_type < 0) does not verify the state is valid +before progressing. + +This will result in a crash if, for instance, x->km.state == +XFRM_STATE_ACQ. 
+ +Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") +Signed-off-by: Aviv Heller +Signed-off-by: Yevgeny Kliteynik +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_input.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index 9de4b1dbc0ae..f2f7ad03cc3d 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -206,7 +206,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + xfrm_address_t *daddr; + struct xfrm_mode *inner_mode; + u32 mark = skb->mark; +- unsigned int family; ++ unsigned int family = AF_UNSPEC; + int decaps = 0; + int async = 0; + bool xfrm_gro = false; +@@ -215,6 +215,16 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + + if (encap_type < 0) { + x = xfrm_input_state(skb); ++ ++ if (unlikely(x->km.state != XFRM_STATE_VALID)) { ++ if (x->km.state == XFRM_STATE_ACQ) ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); ++ else ++ XFRM_INC_STATS(net, ++ LINUX_MIB_XFRMINSTATEINVALID); ++ goto drop; ++ } ++ + family = x->outer_mode->afinfo->family; + + /* An encap_type of -1 indicates async resumption. 
*/ +-- +2.15.1 + diff --git a/patches.fixes/xfrm-Reinject-transport-mode-packets-through-tasklet.patch b/patches.fixes/xfrm-Reinject-transport-mode-packets-through-tasklet.patch new file mode 100644 index 0000000..d3bd287 --- /dev/null +++ b/patches.fixes/xfrm-Reinject-transport-mode-packets-through-tasklet.patch @@ -0,0 +1,208 @@ +From: Herbert Xu +Date: Fri, 15 Dec 2017 16:40:44 +1100 +Subject: xfrm: Reinject transport-mode packets through tasklet +Patch-mainline: v4.15-rc6 +Git-commit: acf568ee859f098279eadf551612f103afdacb4e +References: bsc#1076830 + +This is an old bugbear of mine: + +https://www.mail-archive.com/netdev@vger.kernel.org/msg03894.html + +By crafting special packets, it is possible to cause recursion +in our kernel when processing transport-mode packets at levels +that are only limited by packet size. + +The easiest one is with DNAT, but an even worse one is where +UDP encapsulation is used in which case you just have to insert +an UDP encapsulation header in between each level of recursion. + +This patch avoids this problem by reinjecting transport-mode packets +through a tasklet. 
+ +Fixes: b05e106698d9 ("[IPV4/6]: Netfilter IPsec input hooks") +Signed-off-by: Herbert Xu +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + include/net/xfrm.h | 3 +++ + net/ipv4/xfrm4_input.c | 12 ++++++++++- + net/ipv6/xfrm6_input.c | 10 ++++++++- + net/xfrm/xfrm_input.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 80 insertions(+), 2 deletions(-) + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index 62f5a259e597..bd47fb34e809 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -1568,6 +1568,9 @@ int xfrm_init_state(struct xfrm_state *x); + int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); + int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); + int xfrm_input_resume(struct sk_buff *skb, int nexthdr); ++int xfrm_trans_queue(struct sk_buff *skb, ++ int (*finish)(struct net *, struct sock *, ++ struct sk_buff *)); + int xfrm_output_resume(struct sk_buff *skb, int err); + int xfrm_output(struct sock *sk, struct sk_buff *skb); + int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); +diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c +index 1fc684111ce6..c794a9aa15f5 100644 +--- a/net/ipv4/xfrm4_input.c ++++ b/net/ipv4/xfrm4_input.c +@@ -22,6 +22,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) + return xfrm4_extract_header(skb); + } + ++static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ return dst_input(skb); ++} ++ + static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) + { +@@ -32,7 +38,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, + iph->tos, skb->dev)) + goto drop; + } +- return dst_input(skb); ++ ++ if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) ++ goto drop; ++ ++ return 0; + drop: + kfree_skb(skb); + return NET_RX_DROP; +diff --git a/net/ipv6/xfrm6_input.c 
b/net/ipv6/xfrm6_input.c +index 3ef5d913e7a3..7c5e582b1af8 100644 +--- a/net/ipv6/xfrm6_input.c ++++ b/net/ipv6/xfrm6_input.c +@@ -31,6 +31,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + } + EXPORT_SYMBOL(xfrm6_rcv_spi); + ++static int xfrm6_transport_finish2(struct net *net, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ if (xfrm_trans_queue(skb, ip6_rcv_finish)) ++ __kfree_skb(skb); ++ return -1; ++} ++ + int xfrm6_transport_finish(struct sk_buff *skb, int async) + { + struct xfrm_offload *xo = xfrm_offload(skb); +@@ -53,7 +61,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) + + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, +- ip6_rcv_finish); ++ xfrm6_transport_finish2); + return -1; + } + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index fc40de07fcc9..43e002347618 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -7,15 +7,29 @@ + * + */ + ++#include ++#include + #include + #include + #include ++#include + #include + #include + #include + #include + #include + ++struct xfrm_trans_tasklet { ++ struct tasklet_struct tasklet; ++ struct sk_buff_head queue; ++}; ++ ++struct xfrm_trans_cb { ++ int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); ++}; ++ ++#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) ++ + static struct kmem_cache *secpath_cachep __read_mostly; + + static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); +@@ -24,6 +38,8 @@ static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; + static struct gro_cells gro_cells; + static struct net_device xfrm_napi_dev; + ++static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); ++ + int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) + { + int err = 0; +@@ -469,9 +485,41 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr) + } + EXPORT_SYMBOL(xfrm_input_resume); + ++static void 
xfrm_trans_reinject(unsigned long data) ++{ ++ struct xfrm_trans_tasklet *trans = (void *)data; ++ struct sk_buff_head queue; ++ struct sk_buff *skb; ++ ++ __skb_queue_head_init(&queue); ++ skb_queue_splice_init(&trans->queue, &queue); ++ ++ while ((skb = __skb_dequeue(&queue))) ++ XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); ++} ++ ++int xfrm_trans_queue(struct sk_buff *skb, ++ int (*finish)(struct net *, struct sock *, ++ struct sk_buff *)) ++{ ++ struct xfrm_trans_tasklet *trans; ++ ++ trans = this_cpu_ptr(&xfrm_trans_tasklet); ++ ++ if (skb_queue_len(&trans->queue) >= netdev_max_backlog) ++ return -ENOBUFS; ++ ++ XFRM_TRANS_SKB_CB(skb)->finish = finish; ++ skb_queue_tail(&trans->queue, skb); ++ tasklet_schedule(&trans->tasklet); ++ return 0; ++} ++EXPORT_SYMBOL(xfrm_trans_queue); ++ + void __init xfrm_input_init(void) + { + int err; ++ int i; + + init_dummy_netdev(&xfrm_napi_dev); + err = gro_cells_init(&gro_cells, &xfrm_napi_dev); +@@ -482,4 +530,13 @@ void __init xfrm_input_init(void) + sizeof(struct sec_path), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); ++ ++ for_each_possible_cpu(i) { ++ struct xfrm_trans_tasklet *trans; ++ ++ trans = &per_cpu(xfrm_trans_tasklet, i); ++ __skb_queue_head_init(&trans->queue); ++ tasklet_init(&trans->tasklet, xfrm_trans_reinject, ++ (unsigned long)trans); ++ } + } +-- +2.15.1 + diff --git a/patches.fixes/xfrm-Use-__skb_queue_tail-in-xfrm_trans_queue.patch b/patches.fixes/xfrm-Use-__skb_queue_tail-in-xfrm_trans_queue.patch new file mode 100644 index 0000000..5bb9e2f --- /dev/null +++ b/patches.fixes/xfrm-Use-__skb_queue_tail-in-xfrm_trans_queue.patch @@ -0,0 +1,38 @@ +From: Herbert Xu +Date: Thu, 4 Jan 2018 22:25:07 +1100 +Subject: xfrm: Use __skb_queue_tail in xfrm_trans_queue +Patch-mainline: v4.15-rc9 +Git-commit: d16b46e4fd8bc6063624605f25b8c0835bb1fbe3 +References: bsc#1076830 + +We do not need locking in xfrm_trans_queue because it is designed +to use per-CPU buffers. 
However, the original code incorrectly +used skb_queue_tail which takes the lock. This patch switches +it to __skb_queue_tail instead. + +Reported-and-tested-by: Artem Savkov +Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets...") +Signed-off-by: Herbert Xu +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index b1acfcb9edf7..2ad91eb793fc 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -512,7 +512,7 @@ int xfrm_trans_queue(struct sk_buff *skb, + return -ENOBUFS; + + XFRM_TRANS_SKB_CB(skb)->finish = finish; +- skb_queue_tail(&trans->queue, skb); ++ __skb_queue_tail(&trans->queue, skb); + tasklet_schedule(&trans->tasklet); + return 0; + } +-- +2.16.0 + diff --git a/patches.fixes/xfrm_user-fix-info-leak-in-build_aevent.patch b/patches.fixes/xfrm_user-fix-info-leak-in-build_aevent.patch new file mode 100644 index 0000000..516210c --- /dev/null +++ b/patches.fixes/xfrm_user-fix-info-leak-in-build_aevent.patch @@ -0,0 +1,36 @@ +From: Mathias Krause +Date: Sat, 26 Aug 2017 17:09:00 +0200 +Subject: xfrm_user: fix info leak in build_aevent() +Patch-mainline: v4.13 +Git-commit: 931e79d7a7ddee4709c56b39de169a36804589a1 +References: bsc#1076830 + +The memory reserved to dump the ID of the xfrm state includes a padding +byte in struct xfrm_usersa_id added by the compiler for alignment. To +prevent the heap info leak, memset(0) the sa_id before filling it. 
+ +Cc: Jamal Hadi Salim +Fixes: d51d081d6504 ("[IPSEC]: Sync series - user") +Signed-off-by: Mathias Krause +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_user.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index 98a4c4065e83..37b0335521c5 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -1871,6 +1871,7 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct + return -EMSGSIZE; + + id = nlmsg_data(nlh); ++ memset(&id->sa_id, 0, sizeof(id->sa_id)); + memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr)); + id->sa_id.spi = x->id.spi; + id->sa_id.family = x->props.family; +-- +2.15.1 + diff --git a/patches.fixes/xfrm_user-fix-info-leak-in-xfrm_notify_sa.patch b/patches.fixes/xfrm_user-fix-info-leak-in-xfrm_notify_sa.patch new file mode 100644 index 0000000..7e7648c --- /dev/null +++ b/patches.fixes/xfrm_user-fix-info-leak-in-xfrm_notify_sa.patch @@ -0,0 +1,37 @@ +From: Mathias Krause +Date: Sat, 26 Aug 2017 17:08:58 +0200 +Subject: xfrm_user: fix info leak in xfrm_notify_sa() +Patch-mainline: v4.13 +Git-commit: 50329c8a340c9dea60d837645fcf13fc36bfb84d +References: bsc#1076830 + +The memory reserved to dump the ID of the xfrm state includes a padding +byte in struct xfrm_usersa_id added by the compiler for alignment. To +prevent the heap info leak, memset(0) the whole struct before filling +it. 
+ +Cc: Herbert Xu +Fixes: 0603eac0d6b7 ("[IPSEC]: Add XFRMA_SA/XFRMA_POLICY for delete notification") +Signed-off-by: Mathias Krause +Signed-off-by: Steffen Klassert +Acked-by: Michal Kubecek + +--- + net/xfrm/xfrm_user.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index ff7a3b958f05..98a4c4065e83 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -2699,6 +2699,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c) + struct nlattr *attr; + + id = nlmsg_data(nlh); ++ memset(id, 0, sizeof(*id)); + memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr)); + id->spi = x->id.spi; + id->family = x->props.family; +-- +2.15.1 + diff --git a/series.conf b/series.conf index a611d55..38e612e 100644 --- a/series.conf +++ b/series.conf @@ -2835,10 +2835,26 @@ patches.drivers/net-sched-push-tp-down-to-action-init.patch patches.drivers/net-sched-add-termination-action-to-allow-goto-chain.patch patches.drivers/sch_dsmark-Fix-uninitialized-variable-warning.patch + patches.fixes/tcp-use-tp-tcp_mstamp-in-output-path.patch + patches.fixes/tcp-introduce-tcp_jiffies32.patch + patches.fixes/dccp-do-not-use-tcp_time_stamp.patch + patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-lsndtime.patch + patches.fixes/tcp-use-tcp_jiffies32-to-feed-tp-snd_cwnd_stamp.patch + patches.fixes/tcp_bbr-use-tcp_jiffies32-instead-of-tcp_time_stamp.patch + patches.fixes/tcp-bic-cubic-use-tcp_jiffies32-instead-of-tcp_time_.patch + patches.fixes/tcp-use-tcp_jiffies32-for-rcv_tstamp-and-lrcvtime.patch + patches.fixes/tcp-use-tcp_jiffies32-to-feed-probe_timestamp.patch + patches.fixes/tcp-uses-jiffies_32-to-feed-tp-chrono_start.patch + patches.fixes/tcp-use-tcp_jiffies32-in-__tcp_oow_rate_limited.patch + patches.fixes/tcp_westwood-use-tcp_jiffies32-instead-of-tcp_time_s.patch + patches.fixes/tcp_lp-cache-tcp_time_stamp.patch + patches.fixes/tcp-replace-misc-tcp_time_stamp-to-tcp_jiffies32.patch + 
patches.fixes/tcp-switch-TCP-TS-option-RFC-7323-to-1ms-clock.patch patches.drivers/udp-make-function-udp_skb_dtor_locked-static.patch patches.drivers/net-make-struct-net_device-tx_queue_len-unsigned-int.patch patches.drivers/net-fix-__skb_try_recv_from_queue-to-return-the-old-.patch patches.drivers/liquidio-make-the-spinlock-octeon_devices_lock-stati.patch + patches.fixes/tcp-fix-tcp_rearm_rto.patch patches.drivers/qed-Utilize-FW-8.20.0.0.patch patches.drivers/ibmvnic-076-fix-missing-unlock-on-error-in-__ibmvnic_reset.patch patches.drivers/qed-Remove-unused-including-linux-version.h.patch @@ -2869,6 +2885,7 @@ patches.drivers/net-fix-documentation-of-struct-scm_timestamping.patch patches.drivers/net-allow-simultaneous-SW-and-HW-transmit-timestampi.patch patches.drivers/net-ethernet-update-drivers-to-make-both-SW-and-HW-T.patch + patches.fixes/tcp-fix-tcp_probe_timer-for-TCP_USER_TIMEOUT.patch patches.drivers/net-Define-SCM_TIMESTAMPING_PKTINFO-on-all-architect.patch patches.drivers/net-Fix-parisc-SCM_TIMESTAMPING_PKTINFO-value.patch patches.drivers/nfp-add-nfp_cppcore_pcie_unit-helper.patch @@ -2900,6 +2917,7 @@ patches.drivers/qed-Replace-set_id-api-with-set_name.patch patches.drivers/net-flow_dissector-add-support-for-dissection-of-tcp.patch patches.drivers/net-sched-flower-add-support-for-matching-on-tcp-fla.patch + patches.fixes/tcp-fix-TCP_SYNCNT-flakes.patch patches.drivers/net-IB-mlx5-Replace-mlx5_vzalloc-with-kvzalloc.patch patches.drivers/net-mlx5-Update-the-list-of-the-PCI-supported-device.patch patches.drivers/net-mlx5-Introduce-trigger_health_work-function.patch @@ -3921,6 +3939,7 @@ patches.drivers/libnvdimm-pmem-disable-dax-flushing-when-pmem-is-fro.patch patches.drivers/IB-core-Fix-uninitialized-variable-use-in-check_qp_p.patch patches.drivers/IB-core-Fix-static-analysis-warning-in-ib_policy_cha.patch + patches.fixes/netfilter-ebt_nflog-fix-unexpected-truncated-packet.patch patches.drivers/nfp-flower-add-missing-clean-up-call-to-avoid-memory.patch 
patches.drivers/net-hns-Fix-a-wrong-op-phy-C45-code.patch patches.drivers/net-hns-Fix-a-skb-used-after-free-bug.patch @@ -4170,6 +4189,7 @@ patches.drivers/net-hns-add-acpi-function-of-xge-led-control.patch patches.drivers/cxgb4-ptp_clock_register-returns-error-pointers.patch patches.drivers/net-bridge-fix-dest-lookup-when-vlan-proto-doesn-t-m.patch + patches.fixes/net-packet-Fix-Tx-queue-selection-for-AF_PACKET.patch patches.drivers/bnx2x-fix-format-overflow-warning.patch patches.drivers/liquidio-fix-possible-eeprom-format-string-overflow.patch patches.drivers/mlx4_en-remove-unnecessary-returned-value-check.patch @@ -4400,6 +4420,7 @@ patches.drivers/0025-dm-mpath-retry-BLK_STS_RESOURCE-errors.patch patches.drivers/nvme-rdma-default-MR-page-size-to-4k.patch patches.drivers/nvme-pci-use-dma-memory-for-the-host-memory-buffer-d.patch + patches.fixes/esp-Fix-error-handling-on-layer-2-xmit.patch patches.drivers/net-sched-fix-use-after-free-when-tcf_chain_destroy-.patch patches.drivers/net-sched-don-t-do-tcf_chain_flush-from-tcf_chain_de.patch patches.drivers/nfp-don-t-hold-PF-lock-while-enabling-SR-IOV.patch @@ -4409,11 +4430,15 @@ patches.drivers/bnxt_en-Fix-.ndo_setup_tc-to-include-XDP-rings.patch patches.drivers/bnxt_en-Free-MSIX-vectors-when-unregistering-the-dev.patch patches.drivers/bnxt_en-Do-not-setup-MAC-address-in-bnxt_hwrm_func_q.patch + patches.fixes/netfilter-ipt_CLUSTERIP-fix-use-after-free-of-proc-e.patch + patches.fixes/netfilter-nf_tables-Fix-nft-limit-burst-handling.patch patches.drivers/0145-iwlwifi-pcie-move-rx-workqueue-initialization-to-iwl.patch patches.drivers/nfp-fix-unchecked-flow-dissector-use.patch patches.drivers/nfp-fix-supported-key-layers-calculation.patch patches.drivers/nfp-remove-incorrect-mask-check-for-vlan-matching.patch patches.fixes/net-xfrm-don-t-double-hold-dst-when-sk_policy-in-use.patch + patches.fixes/xfrm_user-fix-info-leak-in-xfrm_notify_sa.patch + patches.fixes/xfrm_user-fix-info-leak-in-build_aevent.patch 
patches.drivers/nfp-double-free-on-error-in-probe.patch patches.drivers/drivers-net-xgene-Correct-probe-sequence-handling.patch patches.drivers/sch_htb-fix-crash-on-init-failure.patch @@ -4798,6 +4823,7 @@ patches.drivers/IB-core-Add-completion-queue-cq-object-actions.patch patches.drivers/IB-core-Assign-root-to-all-drivers.patch patches.drivers/IB-core-Expose-ioctl-interface-through-experimental-.patch + patches.fixes/net-netfilter-nf_conntrack_core-Fix-net_conntrack_lo.patch patches.drivers/s390-mm-tag-pages.patch patches.drivers/s390-mm-no-dat-tlb-flush.patch patches.drivers/s390-mm-guest-asce-tlb-flush.patch @@ -4923,6 +4949,7 @@ patches.drivers/ibmvnic-109-Implement-.get_channels.patch patches.drivers/net-hns-Fix-for-__udivdi3-compiler-error.patch patches.drivers/net-sched-change-names-of-action-number-helpers-to-b.patch + patches.fixes/ip-options-explicitly-provide-net-ns-to-__ip_options.patch patches.drivers/liquidio-add-missing-strings-in-oct_dev_state_str-ar.patch patches.drivers/liquidio-moved-console_bitmask-module-param-to-lio_m.patch patches.drivers/net-sched-make-type-an-argument-for-ndo_setup_tc.patch @@ -5094,6 +5121,7 @@ patches.drivers/net-mlx5e-Use-size_t-to-store-byte-offset-in-statist.patch patches.drivers/cxgb4-cxgbvf-Handle-32-bit-fw-port-capabilities.patch patches.drivers/liquidio-fix-use-of-pf-in-pass-through-mode-in-a-vir.patch + patches.fixes/xfrm-Clear-RX-SKB-secpath-xfrm_offload.patch patches.drivers/qlogic-make-device_attribute-const.patch patches.drivers/net-check-type-when-freeing-metadata-dst.patch patches.drivers/liquidio-move-macro-definition-to-a-proper-place.patch @@ -5496,6 +5524,8 @@ patches.drivers/cfg80211-honor-NL80211_RRF_NO_HT40-MINUS-PLUS.patch patches.drivers/net-sched-fix-memleak-for-chain-zero.patch patches.drivers/0068-iwlwifi-mvm-only-send-LEDS_CMD-when-the-FW-supports-.patch + patches.fixes/netfilter-ipvs-fix-the-issue-that-sctp_conn_schedule.patch + 
patches.fixes/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_m.patch patches.drivers/nvme-add-support-for-FW-activation-without-reset.patch patches.drivers/nvme-define-NVME_NSID_ALL.patch patches.drivers/nvme-add-support-for-NVMe-1.3-Timestamp-Feature.patch @@ -5597,6 +5627,7 @@ patches.drivers/0058-fs-fix-kernel_write-prototype patches.drivers/Input-ucb1400_ts-fix-suspend-and-resume-handling.patch patches.drivers/net_sched-fix-reference-counting-of-tc-filter-chain.patch + patches.fixes/ip_tunnel-fix-ip-tunnel-lookup-in-collect_md-mode.patch patches.drivers/be2net-fix-TSO6-GSO-issue-causing-TX-stall-on-Lancer.patch patches.drivers/net-sched-fix-use-after-free-in-tcf_action_destroy-a.patch patches.drivers/nfp-add-whitelist-of-supported-flow-dissector.patch @@ -5605,6 +5636,7 @@ patches.drivers/net_sched-gen_estimator-fix-scaling-error-in-bytes-p.patch patches.drivers/tg3-clean-up-redundant-initialization-of-tnapi.patch patches.drivers/qed-remove-unnecessary-call-to-memset.patch + patches.fixes/sctp-fix-an-use-after-free-issue-in-sctp_sock_dump.patch patches.drivers/bpf-verifier-reject-BPF_ALU64-BPF_END.patch patches.fixes/0001-md-raid5-fix-a-race-condition-in-stripe-batch.patch patches.drivers/0006-md-raid5-preserve-STRIPE_ON_UNPLUG_LIST-in-break_str.patch @@ -5615,6 +5647,7 @@ patches.arch/s390-sles15-01-09-s390-mm-fix-write-access-check-in-gup_huge_pmd.patch patches.arch/s390-sles15-01-07-01-alternative-topology.patch patches.arch/s390-sles15-01-07-02-dynamic-topology.patch + patches.fixes/udpv6-Fix-the-checksum-computation-when-HW-checksum-.patch patches.drivers/net-sched-cls_matchall-fix-crash-when-used-with-clas.patch patches.drivers/bnxt_en-check-for-ingress-qdisc-in-flower-offload.patch patches.fixes/nl80211-check-for-the-required-netlink-attributes-pr.patch @@ -5706,6 +5739,7 @@ patches.drivers/iwlwifi-mvm-use-IWL_HCMD_NOCOPY-for-MCAST_FILTER_CMD.patch patches.drivers/0001-iwlwifi-mvm-initialize-status-in-iwl_mvm_add_int_sta.patch 
patches.drivers/iwlwifi-mvm-fix-reorder-buffer-for-9000-devices.patch + patches.fixes/sctp-Fix-a-big-endian-bug-in-sctp_diag_dump.patch patches.fixes/inetpeer-fix-RCU-lookup-again.patch patches.fixes/packet-in-packet_do_bind-test-fanout-with-bind_lock-.patch patches.drivers/net-mlx5e-IPoIB-Fix-access-to-invalid-memory-address.patch @@ -5719,6 +5753,8 @@ patches.drivers/net-mlx5e-Fix-calculated-checksum-offloads-counters.patch patches.drivers/net-mlx5-Fix-static-checker-warning-on-steering-trac.patch patches.drivers/net-mlx5-Fix-wrong-indentation-in-enable-SRIOV-code.patch + patches.fixes/IPv4-early-demux-can-return-an-error-code.patch + patches.fixes/udp-perform-source-validation-for-mcast-early-demux.patch patches.drivers/socket-bpf-fix-possible-use-after-free.patch patches.drivers/net-rtnetlink-fix-info-leak-in-RTM_GETSTATS-call.patch patches.drivers/bpf-fix-bpf_tail_call-x64-JIT.patch @@ -5745,12 +5781,18 @@ patches.drivers/gso-fix-payload-length-when-gso_size-is-zero.patch patches.fixes/ipv6-Fix-traffic-triggered-IPsec-connections.patch patches.fixes/ipv4-Fix-traffic-triggered-IPsec-connections.patch + patches.fixes/xfrm-Fix-negative-device-refcount-on-offload-failure.patch + patches.fixes/vti-fix-NULL-dereference-in-xfrm_input.patch + patches.fixes/udp-fix-bcast-packet-reception.patch patches.drivers/ixgbe-Return-error-when-getting-PHY-address-if-PHY-a.patch patches.drivers/ixgbe-fix-masking-of-bits-read-from-IXGBE_VXLANCTRL-.patch patches.drivers/0008-Revert-commit-1a8b6d76dc5b-net-add-one-common-config.patch patches.drivers/0009-ixgbe-Use-new-PCI_DEV_FLAGS_NO_RELAXED_ORDERING-flag.patch patches.drivers/ixgbe-incorrect-XDP-ring-accounting-in-ethtool-tx_fr.patch + patches.fixes/netfilter-xt_socket-Restore-mark-from-full-sockets-o.patch + patches.fixes/netfilter-nf_tables-fix-update-chain-error.patch patches.fixes/netfilter-ebtables-fix-race-condition-in-frame_filte.patch + patches.fixes/netfilter-xt_bpf-Fix-XT_BPF_MODE_FD_PINNED-mode-of-x.patch 
patches.drivers/ALSA-seq-Fix-copy_from_user-call-inside-lock patches.drivers/ALSA-line6-Fix-missing-initialization-before-error-p patches.drivers/ALSA-line6-Fix-NULL-dereference-at-podhd_disconnect @@ -5800,6 +5842,8 @@ patches.drivers/bpf-disallow-arithmetic-operations-on-context-pointe.patch patches.drivers/bpf-do-not-test-for-PCPU_MIN_UNIT_SIZE-before-percpu.patch patches.fixes/sctp-do-not-peel-off-an-assoc-from-one-netns-to-anot.patch + patches.fixes/tcp-dccp-fix-ireq-opt-races.patch + patches.fixes/packet-avoid-panic-in-packet_getsockopt.patch patches.drivers/bpf-fix-off-by-one-for-range-markings-with-L-T-E-pat.patch patches.drivers/bpf-fix-pattern-matches-for-direct-packet-access.patch patches.drivers/soreuseport-fix-initialization-race.patch @@ -5816,8 +5860,12 @@ patches.drivers/kvm-fix-detection-of-guest-machine-checks.patch patches.arch/s390-sles15-01-08-zfcp-fix-erp_action-use-before-initialize-in-REC-action-trace.patch patches.drivers/drm-i915-perf-fix-perf-enable-disable-ioctls-with-32 + patches.fixes/tcp-do-tcp_mstamp_refresh-before-retransmits-on-TSQ-.patch + patches.fixes/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch patches.fixes/ipsec-Fix-aborted-xfrm-policy-dump-crash.patch patches.drivers/nfp-refuse-offloading-filters-that-redirects-to-uppe.patch + patches.fixes/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch + patches.fixes/tap-double-free-in-error-path-in-tap_open.patch patches.fixes/mac80211-use-constant-time-comparison-with-keys.patch patches.drivers/mac80211-validate-user-rate-mask-before-configuring-.patch patches.fixes/mac80211-don-t-compare-TKIP-TX-MIC-key-in-reinstall-.patch @@ -5831,9 +5879,13 @@ patches.drivers/ixgbe-Fix-Tx-map-failure-path.patch patches.drivers/i40e-Fix-incorrect-use-of-tx_itr_setting-when-checki.patch patches.drivers/i40e-Add-programming-descriptors-to-cleaned_count.patch + patches.fixes/tcp-refresh-tp-timestamp-before-tcp_mtu_probe.patch 
patches.drivers/net_sched-avoid-matching-qdisc-with-zero-handle.patch patches.drivers/RDMA-nldev-Enforce-device-index-check-for-port-callb.patch patches.drivers/net-hns-set-correct-return-value.patch + patches.fixes/xfrm-Clear-sk_dst_cache-when-applying-per-socket-pol.patch + patches.fixes/xfrm-Fix-GSO-for-IPsec-with-GRE-tunnel.patch + patches.fixes/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch patches.drivers/nvme-rdma-fix-possible-hang-when-issuing-commands-du.patch patches.drivers/nvme-Fix-setting-logical-block-format-when-revalidat.patch patches.fixes/keys-return-full-count-in-keyring_read-if-buffer-is-too-small @@ -5842,6 +5894,7 @@ patches.drivers/ASoC-topology-Fix-a-potential-NULL-pointer-dereferen patches.drivers/ASoC-topology-Fix-a-potential-memory-leak-in-soc_tpl patches.drivers/ASoC-adau17x1-Workaround-for-noise-bug-in-ADC + patches.fixes/net-vrf-correct-FRA_L3MDEV-encode-type.patch patches.drivers/drm-i915-Hold-rcu_read_lock-when-iterating-over-the- patches.drivers/net-mlx5e-core-en_fs-fix-pointer-dereference-after-f.patch patches.drivers/net-usb-asix-fill-null-ptr-deref-in-asix_suspend.patch @@ -5850,6 +5903,7 @@ patches.drivers/ALSA-seq-Avoid-invalid-lockdep-class-warning patches.drivers/ALSA-seq-Fix-OSS-sysex-delivery-in-OSS-emulation patches.drivers/ALSA-hda-fix-headset-mic-problem-for-Dell-machines-alc274 + patches.fixes/tcp-fix-tcp_fastretrans_alert-warning.patch patches.drivers/net-mlx5-Loop-over-temp-list-to-release-delay-events.patch patches.drivers/net-mlx5-Cancel-health-poll-before-sending-panic-tea.patch patches.drivers/net-mlx5e-Fix-napi-poll-with-zero-budget.patch @@ -6174,6 +6228,7 @@ patches.drivers/qed-Add-support-for-freeing-two-ll2-buffers-for-corn.patch patches.drivers/qed-Add-support-for-MPA-header-being-split-over-two-.patch patches.drivers/qed-Add-iWARP-support-for-fpdu-spanned-over-more-tha.patch + patches.fixes/ipv6-avoid-zeroing-per-cpu-data-again.patch patches.drivers/net-mlx4-Fix-endianness-issue-in-qp-context-params.patch 
patches.drivers/net-mlx4_core-Fix-cast-warning-in-fw.c.patch patches.drivers/net-mlx4_en-Use-__force-to-fix-a-sparse-warning-in-T.patch @@ -6464,6 +6519,7 @@ patches.drivers/nfp-fix-flower-offload-metadata-flag-usage.patch patches.drivers/nfp-fix-vlan-receive-MAC-statistics-typo.patch patches.drivers/nfp-inherit-the-max_mtu-from-the-PF-netdev.patch + patches.fixes/route-also-update-fnhe_genid-when-updating-a-route-c.patch patches.drivers/ibmvnic-122-fix-dma_mapping_error-call.patch patches.drivers/platform-x86-fujitsu-laptop-Fix-radio-LED-detection patches.drivers/platform-x86-sony-laptop-Fix-error-handling-in-sony_ @@ -6483,6 +6539,7 @@ patches.drivers/ALSA-hda-realtek-Fix-ALC700-family-no-sound-issue patches.drivers/ALSA-hda-Add-Raven-PCI-ID patches.drivers/net-sched-fix-crash-when-deleting-secondary-chains.patch + patches.fixes/ipv6-Do-not-consider-linkdown-nexthops-during-multip.patch patches.fixes/net-accept-UFO-datagrams-from-tuntap-and-packet.patch patches.fixes/bpf-fix-branch-pruning-logic.patch patches.drivers/i40e-fix-the-calculation-of-VFs-mac-addresses.patch @@ -6497,6 +6554,8 @@ patches.drivers/net-thunderx-Fix-TCP-UDP-checksum-offload-for-IPv6-p.patch patches.fixes/net-openvswitch-datapath-fix-data-type-in-queue_gso_.patch patches.drivers/bnxt_en-Fix-an-error-handling-path-in-bnxt_get_modul.patch + patches.fixes/packet-fix-crash-in-fanout_demux_rollover.patch + patches.fixes/net-packet-fix-a-race-in-packet_bind-and-packet_noti.patch patches.drivers/net-sched-cbq-create-block-for-q-link.block.patch patches.fixes/mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch patches.fixes/device-dax-implement-split-to-catch-invalid-munmap-a.patch @@ -6529,6 +6588,7 @@ patches.drivers/s390-qeth-fix-thinko-in-IPv4-multicast-address-track.patch patches.drivers/s390-qeth-fix-GSO-throughput-regression.patch patches.drivers/s390-qeth-build-max-size-GSO-skbs-on-L2-devices.patch + patches.fixes/tcp-use-IPCB-instead-of-TCP_SKB_CB-in-inet_exact_dif.patch 
patches.drivers/serial-8250_pci-Add-Amazon-PCI-serial-device-ID patches.drivers/isa-Prevent-NULL-dereference-in-isa_bus-driver-callb patches.drivers/IB-hfi1-Initialize-bth1-in-16B-rc-ack-builder.patch @@ -6551,6 +6611,7 @@ patches.drivers/net_sched-red-Avoid-illegal-values.patch patches.fixes/dccp-CVE-2017-8824-use-after-free-in-DCCP-code.patch patches.drivers/net-thunderx-Fix-TCP-UDP-checksum-offload-for-IPv4-p.patch + patches.fixes/tcp-use-current-time-in-tcp_rcv_space_adjust.patch patches.drivers/sfc-pass-valid-pointers-from-efx_enqueue_unwind.patch patches.drivers/bnxt_en-Fix-sources-of-spurious-netpoll-warnings.patch patches.drivers/iwlwifi-mvm-don-t-use-transmit-queue-hang-detection-.patch @@ -6565,8 +6626,14 @@ patches.fixes/scsi-scsi_devinfo-cleanly-zero-pad-devinfo-strings.patch patches.fixes/scsi-bfa-fix-type-conversion-warning.patch patches.fixes/netlink-Add-netns-check-on-taps.patch + patches.fixes/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_.patch + patches.fixes/fou-fix-some-member-types-in-guehdr.patch + patches.fixes/tcp-md5sig-Use-skb-s-saddr-when-replying-to-an-incom.patch patches.fixes/netfilter-nfnetlink_cthelper-Add-missing-permission-.patch + patches.fixes/netfilter-xt_bpf-add-overflow-checks.patch patches.fixes/netfilter-xt_osf-Add-missing-permission-checks.patch + patches.fixes/tcp-fix-potential-underestimation-on-rcv_rtt.patch + patches.fixes/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch patches.drivers/net-mlx4_en-Fix-selftest-for-small-MTUs.patch patches.drivers/net-mlx4_core-Fix-wrong-calculation-of-free-counters.patch patches.drivers/net-mlx4_en-Fill-all-counters-under-one-call-of-stat.patch @@ -6611,6 +6678,7 @@ patches.drivers/bpf-force-strict-alignment-checks-for-stack-pointers.patch patches.drivers/bpf-don-t-prune-branches-when-a-scalar-is-replaced-w.patch patches.fixes/bpf-fix-integer-overflows.patch + patches.fixes/openvswitch-Fix-pop_vlan-action-for-double-tagged-fr.patch 
patches.drivers/drm-i915-Protect-DDI-port-to-DPLL-map-from-theoretic patches.drivers/ASoC-rsnd-ssiu-clear-SSI_MODE-for-non-TDM-Extended-m patches.drivers/ASoC-rockchip-disable-clock-on-error @@ -6637,6 +6705,9 @@ patches.drivers/IB-uverbs-Fix-command-checking-as-part-of-ib_uverbs_.patch patches.drivers/IB-core-Verify-that-QP-is-security-enabled-in-create.patch patches.drivers/IB-mlx5-Fix-mlx5_ib_alloc_mr-error-flow.patch + patches.fixes/xfrm-Fix-xfrm_input-to-verify-state-is-valid-when-en.patch + patches.fixes/xfrm-Reinject-transport-mode-packets-through-tasklet.patch + patches.fixes/sctp-Replace-use-of-sockets_allocated-with-specified.patch patches.drivers/scsi-core-check-for-device-state-in-__scsi_remove_ta.patch patches.drivers/drm-i915-Apply-Display-WA-1183-on-skl-kbl-and-cfl patches.drivers/crypto-chelsio-select-CRYPTO_GF128MUL.patch @@ -6660,17 +6731,21 @@ patches.suse/bpf-prevent-out-of-bounds-speculation.patch patches.drivers/0001-iwlwifi-pcie-fix-DMA-memory-mapping-unmapping.patch patches.drivers/nfp-always-unmask-aux-interrupts-at-init.patch + patches.fixes/ipv6-fix-possible-mem-leaks-in-ipv6_make_skb.patch patches.drivers/ALSA-hda-Apply-headphone-noise-quirk-for-another-Del patches.drivers/ALSA-hda-Apply-the-existing-quirk-to-iMac-14-1 patches.drivers/ALSA-pcm-Remove-yet-superfluous-WARN_ON patches.drivers/ALSA-seq-Make-ioctls-race-free + patches.fixes/xfrm-Use-__skb_queue_tail-in-xfrm_trans_queue.patch patches.drivers/net-ib-mlx5-Don-t-disable-local-loopback-multicast-t.patch patches.drivers/net-mlx5-Fix-get-vector-affinity-helper-function.patch patches.drivers/net-mlx5-Fix-memory-leak-in-bad-flow-of-mlx5_alloc_i.patch patches.drivers/net-mlx5-Fix-mlx5_get_uars_page-to-return-error-code.patch patches.drivers/net-mlx5-Fix-error-handling-in-load-one.patch + patches.drivers/net-mlx5e-Keep-updating-ethtool-statistics-when-the-.patch patches.drivers/net-mlx5e-Don-t-override-netdev-features-field-unles.patch 
patches.suse/bpf-array-fix-overflow-in-max_entries-and-undefined-.patch + patches.fixes/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch patches.drivers/ibmvnic-Fix-pending-MAC-address-changes.patch patches.drivers/nfp-use-the-correct-index-for-link-speed-table.patch patches.drivers/IB-hfi1-Prevent-a-NULL-dereference.patch @@ -6679,8 +6754,6 @@ patches.drivers/ibmvnic-Fix-IP-offload-control-buffer.patch patches.drivers/ibmvnic-Fix-IPv6-packet-descriptors.patch patches.drivers/net-mlx5e-Fix-fixpoint-divide-exception-in-mlx5e_am_.patch - - # davem/net patches.drivers/ibmvnic-Modify-buffer-size-and-number-of-queues-on-f.patch patches.drivers/ibmvnic-Revert-to-previous-mtu-when-unsupported-valu.patch patches.drivers/ibmvnic-Allocate-and-request-vpd-in-init_resources.patch @@ -7062,6 +7135,9 @@ # ########################################################## + patches.fixes/KEYS-don-t-let-add_key-update-an-uninstantiated-key.patch + patches.fixes/KEYS-Fix-race-between-updating-and-finding-a-negativ.patch + ########################################################## # crypto ##########################################################