|
Jiri Wiesner |
283756 |
From 33392e1beedca210c4d20be1c30a6ab80c2a05f0 Mon Sep 17 00:00:00 2001
|
|
Jiri Wiesner |
283756 |
From: Jiri Wiesner <jwiesner@suse.com>
|
|
Jiri Wiesner |
283756 |
Date: Fri, 21 Aug 2020 10:42:13 +0200
|
|
Jiri Wiesner |
283756 |
Subject: bonding: fix active-backup failover for current ARP slave
|
|
Jiri Wiesner |
283756 |
Git-commit: 0410d07190961ac526f05085765a8d04d926545b
|
|
Takashi Iwai |
7f88fb |
Patch-mainline: v5.9-rc2
|
|
Jiri Wiesner |
283756 |
References: bsc#1174771
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
When the ARP monitor is used for link detection, ARP replies are
|
|
Jiri Wiesner |
283756 |
validated for all slaves (arp_validate=3) and fail_over_mac is set to
|
|
Jiri Wiesner |
283756 |
active, two slaves of an active-backup bond may get stuck in a state
|
|
Jiri Wiesner |
283756 |
where both of them are active and pass packets that they receive to
|
|
Jiri Wiesner |
283756 |
the bond. This state makes IPv6 duplicate address detection fail. The
|
|
Jiri Wiesner |
283756 |
state is reached thus:
|
|
Jiri Wiesner |
283756 |
1. The current active slave goes down because the ARP target
|
|
Jiri Wiesner |
283756 |
is not reachable.
|
|
Jiri Wiesner |
283756 |
2. The current ARP slave is chosen and made active.
|
|
Jiri Wiesner |
283756 |
3. A new slave is enslaved. This new slave becomes the current active
|
|
Jiri Wiesner |
283756 |
slave and can reach the ARP target.
|
|
Jiri Wiesner |
283756 |
As a result, the current ARP slave stays active after the enslave
|
|
Jiri Wiesner |
283756 |
action has finished and the log is littered with "PROBE BAD" messages:
|
|
Jiri Wiesner |
283756 |
> bond0: PROBE: c_arp ens10 && cas ens11 BAD
|
|
Jiri Wiesner |
283756 |
The workaround is to remove the slave with "going back" status from
|
|
Jiri Wiesner |
283756 |
the bond and re-enslave it. This issue was encountered when DPDK PMD
|
|
Jiri Wiesner |
283756 |
interfaces were being enslaved to an active-backup bond.
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
It would be possible to fix the issue in bond_enslave() or
|
|
Jiri Wiesner |
283756 |
bond_change_active_slave() but the ARP monitor was fixed instead to
|
|
Jiri Wiesner |
283756 |
keep most of the actions changing the current ARP slave in the ARP
|
|
Jiri Wiesner |
283756 |
monitor code. The current ARP slave is set as inactive and backup
|
|
Jiri Wiesner |
283756 |
during the commit phase. A new state, BOND_LINK_FAIL, has been
|
|
Jiri Wiesner |
283756 |
introduced for slaves in the context of the ARP monitor. This allows
|
|
Jiri Wiesner |
283756 |
administrators to see how slaves are rotated for sending ARP requests
|
|
Jiri Wiesner |
283756 |
and how attempts are made to find a new active slave.
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
Fixes: b2220cad583c9 ("bonding: refactor ARP active-backup monitor")
|
|
Jiri Wiesner |
283756 |
Signed-off-by: Jiri Wiesner <jwiesner@suse.com>
|
|
Jiri Wiesner |
283756 |
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Jiri Wiesner |
283756 |
---
|
|
Jiri Wiesner |
283756 |
drivers/net/bonding/bond_main.c | 18 ++++++++++++++++--
|
|
Jiri Wiesner |
283756 |
1 file changed, 16 insertions(+), 2 deletions(-)
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
|
|
Jiri Wiesner |
283756 |
index d10805e5e623..061254a592c0 100644
|
|
Jiri Wiesner |
283756 |
--- a/drivers/net/bonding/bond_main.c
|
|
Jiri Wiesner |
283756 |
+++ b/drivers/net/bonding/bond_main.c
|
|
Jiri Wiesner |
283756 |
@@ -2824,6 +2824,9 @@ static int bond_ab_arp_inspect(struct bonding *bond)
|
|
Jiri Wiesner |
283756 |
if (bond_time_in_interval(bond, last_rx, 1)) {
|
|
Jiri Wiesner |
283756 |
bond_propose_link_state(slave, BOND_LINK_UP);
|
|
Jiri Wiesner |
283756 |
commit++;
|
|
Jiri Wiesner |
283756 |
+ } else if (slave->link == BOND_LINK_BACK) {
|
|
Jiri Wiesner |
283756 |
+ bond_propose_link_state(slave, BOND_LINK_FAIL);
|
|
Jiri Wiesner |
283756 |
+ commit++;
|
|
Jiri Wiesner |
283756 |
}
|
|
Jiri Wiesner |
283756 |
continue;
|
|
Jiri Wiesner |
283756 |
}
|
|
Jiri Wiesner |
283756 |
@@ -2932,6 +2935,19 @@ static void bond_ab_arp_commit(struct bonding *bond)
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
continue;
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
+ case BOND_LINK_FAIL:
|
|
Jiri Wiesner |
283756 |
+ bond_set_slave_link_state(slave, BOND_LINK_FAIL,
|
|
Jiri Wiesner |
283756 |
+ BOND_SLAVE_NOTIFY_NOW);
|
|
Jiri Wiesner |
283756 |
+ bond_set_slave_inactive_flags(slave,
|
|
Jiri Wiesner |
283756 |
+ BOND_SLAVE_NOTIFY_NOW);
|
|
Jiri Wiesner |
283756 |
+
|
|
Jiri Wiesner |
283756 |
+ /* A slave has just been enslaved and has become
|
|
Jiri Wiesner |
283756 |
+ * the current active slave.
|
|
Jiri Wiesner |
283756 |
+ */
|
|
Jiri Wiesner |
283756 |
+ if (rtnl_dereference(bond->curr_active_slave))
|
|
Jiri Wiesner |
283756 |
+ RCU_INIT_POINTER(bond->current_arp_slave, NULL);
|
|
Jiri Wiesner |
283756 |
+ continue;
|
|
Jiri Wiesner |
283756 |
+
|
|
Jiri Wiesner |
283756 |
default:
|
|
Jiri Wiesner |
283756 |
slave_err(bond->dev, slave->dev,
|
|
Jiri Wiesner |
283756 |
"impossible: link_new_state %d on slave\n",
|
|
Jiri Wiesner |
283756 |
@@ -2982,8 +2998,6 @@ static bool bond_ab_arp_probe(struct bonding *bond)
|
|
Jiri Wiesner |
283756 |
return should_notify_rtnl;
|
|
Jiri Wiesner |
283756 |
}
|
|
Jiri Wiesner |
283756 |
|
|
Jiri Wiesner |
283756 |
- bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_LATER);
|
|
Jiri Wiesner |
283756 |
-
|
|
Jiri Wiesner |
283756 |
bond_for_each_slave_rcu(bond, slave, iter) {
|
|
Jiri Wiesner |
283756 |
if (!found && !before && bond_slave_is_up(slave))
|
|
Jiri Wiesner |
283756 |
before = slave;
|
|
Jiri Wiesner |
283756 |
--
|
|
Jiri Wiesner |
283756 |
2.26.2
|
|
Jiri Wiesner |
283756 |
|