From: Martin Habets <martinh@xilinx.com>
Date: Mon, 9 May 2022 16:31:43 +0100
Subject: sfc: Copy shared files needed for Siena (part 2)
Patch-mainline: v5.19-rc1
Git-commit: d48523cb88e0703055c1b33e61eb644a7976f92b
References: jsc#PED-1565

These are the files starting with m through w.
No changes are made here; those will be done in subsequent commits.

Signed-off-by: Martin Habets <habetsm.xilinx@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sfc/siena/mcdi.c             | 2375 ++++++++++++++++++++++
 drivers/net/ethernet/sfc/siena/mcdi.h             |  388 +++
 drivers/net/ethernet/sfc/siena/mcdi_mon.c         |  531 ++++
 drivers/net/ethernet/sfc/siena/mcdi_port.c        |  117 +
 drivers/net/ethernet/sfc/siena/mcdi_port.h        |   18 
 drivers/net/ethernet/sfc/siena/mcdi_port_common.c | 1301 ++++++++++++
 drivers/net/ethernet/sfc/siena/mcdi_port_common.h |   67 
 drivers/net/ethernet/sfc/siena/mtd.c              |  124 +
 drivers/net/ethernet/sfc/siena/net_driver.h       | 1716 +++++++++++++++
 drivers/net/ethernet/sfc/siena/nic.c              |  580 +++++
 drivers/net/ethernet/sfc/siena/nic.h              |  392 +++
 drivers/net/ethernet/sfc/siena/nic_common.h       |  262 ++
 drivers/net/ethernet/sfc/siena/ptp.c              | 2210 ++++++++++++++++++++
 drivers/net/ethernet/sfc/siena/ptp.h              |   45 
 drivers/net/ethernet/sfc/siena/rx.c               |  399 +++
 drivers/net/ethernet/sfc/siena/rx_common.c        | 1086 ++++++++++
 drivers/net/ethernet/sfc/siena/rx_common.h        |  116 +
 drivers/net/ethernet/sfc/siena/selftest.c         |  807 +++++++
 drivers/net/ethernet/sfc/siena/selftest.h         |   52 
 drivers/net/ethernet/sfc/siena/sriov.c            |   72 
 drivers/net/ethernet/sfc/siena/sriov.h            |   25 
 drivers/net/ethernet/sfc/siena/tx.c               |  643 +++++
 drivers/net/ethernet/sfc/siena/tx.h               |   47 
 drivers/net/ethernet/sfc/siena/tx_common.c        |  449 ++++
 drivers/net/ethernet/sfc/siena/tx_common.h        |   45 
 drivers/net/ethernet/sfc/siena/vfdi.h             |  252 ++
 drivers/net/ethernet/sfc/siena/workarounds.h      |   34 
 27 files changed, 14153 insertions(+)
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi.c
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi.h
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi_mon.c
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi_port.c
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi_port.h
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi_port_common.c
 create mode 100644 drivers/net/ethernet/sfc/siena/mcdi_port_common.h
 create mode 100644 drivers/net/ethernet/sfc/siena/mtd.c
 create mode 100644 drivers/net/ethernet/sfc/siena/net_driver.h
 create mode 100644 drivers/net/ethernet/sfc/siena/nic.c
 create mode 100644 drivers/net/ethernet/sfc/siena/nic.h
 create mode 100644 drivers/net/ethernet/sfc/siena/nic_common.h
 create mode 100644 drivers/net/ethernet/sfc/siena/ptp.c
 create mode 100644 drivers/net/ethernet/sfc/siena/ptp.h
 create mode 100644 drivers/net/ethernet/sfc/siena/rx.c
 create mode 100644 drivers/net/ethernet/sfc/siena/rx_common.c
 create mode 100644 drivers/net/ethernet/sfc/siena/rx_common.h
 create mode 100644 drivers/net/ethernet/sfc/siena/selftest.c
 create mode 100644 drivers/net/ethernet/sfc/siena/selftest.h
 create mode 100644 drivers/net/ethernet/sfc/siena/sriov.c
 create mode 100644 drivers/net/ethernet/sfc/siena/sriov.h
 create mode 100644 drivers/net/ethernet/sfc/siena/tx.c
 create mode 100644 drivers/net/ethernet/sfc/siena/tx.h
 create mode 100644 drivers/net/ethernet/sfc/siena/tx_common.c
 create mode 100644 drivers/net/ethernet/sfc/siena/tx_common.h
 create mode 100644 drivers/net/ethernet/sfc/siena/vfdi.h
 create mode 100644 drivers/net/ethernet/sfc/siena/workarounds.h

--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi.c
@@ -0,0 +1,2375 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2008-2013 Solarflare Communications Inc.
+ */
+
+#include <linux/delay.h>
+#include <linux/moduleparam.h>
+#include <linux/atomic.h>
+#include "net_driver.h"
+#include "nic.h"
+#include "io.h"
+#include "farch_regs.h"
+#include "mcdi_pcol.h"
+
+/**************************************************************************
+ *
+ * Management-Controller-to-Driver Interface
+ *
+ **************************************************************************
+ */
+
+#define MCDI_RPC_TIMEOUT       (10 * HZ)
+
+/* A reboot/assertion causes the MCDI status word to be set after the
+ * command word is set or a REBOOT event is sent. If we notice a reboot
+ * via these mechanisms then wait 250ms for the status word to be set.
+ */
+#define MCDI_STATUS_DELAY_US		100
+#define MCDI_STATUS_DELAY_COUNT		2500
+#define MCDI_STATUS_SLEEP_MS						\
+	(MCDI_STATUS_DELAY_US * MCDI_STATUS_DELAY_COUNT / 1000)
+
+#define SEQ_MASK							\
+	EFX_MASK32(EFX_WIDTH(MCDI_HEADER_SEQ))
+
+struct efx_mcdi_async_param {
+	struct list_head list;
+	unsigned int cmd;
+	size_t inlen;
+	size_t outlen;
+	bool quiet;
+	efx_mcdi_async_completer *complete;
+	unsigned long cookie;
+	/* followed by request/response buffer */
+};
+
+static void efx_mcdi_timeout_async(struct timer_list *t);
+static int efx_mcdi_drv_attach(struct efx_nic *efx, bool driver_operating,
+			       bool *was_attached_out);
+static bool efx_mcdi_poll_once(struct efx_nic *efx);
+static void efx_mcdi_abandon(struct efx_nic *efx);
+
+#ifdef CONFIG_SFC_MCDI_LOGGING
+static bool mcdi_logging_default;
+module_param(mcdi_logging_default, bool, 0644);
+MODULE_PARM_DESC(mcdi_logging_default,
+		 "Enable MCDI logging on newly-probed functions");
+#endif
+
+int efx_mcdi_init(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi;
+	bool already_attached;
+	int rc = -ENOMEM;
+
+	efx->mcdi = kzalloc(sizeof(*efx->mcdi), GFP_KERNEL);
+	if (!efx->mcdi)
+		goto fail;
+
+	mcdi = efx_mcdi(efx);
+	mcdi->efx = efx;
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	/* consuming code assumes buffer is page-sized */
+	mcdi->logging_buffer = (char *)__get_free_page(GFP_KERNEL);
+	if (!mcdi->logging_buffer)
+		goto fail1;
+	mcdi->logging_enabled = mcdi_logging_default;
+#endif
+	init_waitqueue_head(&mcdi->wq);
+	init_waitqueue_head(&mcdi->proxy_rx_wq);
+	spin_lock_init(&mcdi->iface_lock);
+	mcdi->state = MCDI_STATE_QUIESCENT;
+	mcdi->mode = MCDI_MODE_POLL;
+	spin_lock_init(&mcdi->async_lock);
+	INIT_LIST_HEAD(&mcdi->async_list);
+	timer_setup(&mcdi->async_timer, efx_mcdi_timeout_async, 0);
+
+	(void) efx_mcdi_poll_reboot(efx);
+	mcdi->new_epoch = true;
+
+	/* Recover from a failed assertion before probing */
+	rc = efx_mcdi_handle_assertion(efx);
+	if (rc)
+		goto fail2;
+
+	/* Let the MC (and BMC, if this is a LOM) know that the driver
+	 * is loaded. We should do this before we reset the NIC.
+	 */
+	rc = efx_mcdi_drv_attach(efx, true, &already_attached);
+	if (rc) {
+		netif_err(efx, probe, efx->net_dev,
+			  "Unable to register driver with MCPU\n");
+		goto fail2;
+	}
+	if (already_attached)
+		/* Not a fatal error */
+		netif_err(efx, probe, efx->net_dev,
+			  "Host already registered with MCPU\n");
+
+	if (efx->mcdi->fn_flags &
+	    (1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_PRIMARY))
+		efx->primary = efx;
+
+	return 0;
+fail2:
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	free_page((unsigned long)mcdi->logging_buffer);
+fail1:
+#endif
+	kfree(efx->mcdi);
+	efx->mcdi = NULL;
+fail:
+	return rc;
+}
+
+void efx_mcdi_detach(struct efx_nic *efx)
+{
+	if (!efx->mcdi)
+		return;
+
+	BUG_ON(efx->mcdi->iface.state != MCDI_STATE_QUIESCENT);
+
+	/* Relinquish the device (back to the BMC, if this is a LOM) */
+	efx_mcdi_drv_attach(efx, false, NULL);
+}
+
+void efx_mcdi_fini(struct efx_nic *efx)
+{
+	if (!efx->mcdi)
+		return;
+
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	free_page((unsigned long)efx->mcdi->iface.logging_buffer);
+#endif
+
+	kfree(efx->mcdi);
+}
+
+static void efx_mcdi_send_request(struct efx_nic *efx, unsigned cmd,
+				  const efx_dword_t *inbuf, size_t inlen)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	char *buf = mcdi->logging_buffer; /* page-sized */
+#endif
+	efx_dword_t hdr[2];
+	size_t hdr_len;
+	u32 xflags, seqno;
+
+	BUG_ON(mcdi->state == MCDI_STATE_QUIESCENT);
+
+	/* Serialise with efx_mcdi_ev_cpl() and efx_mcdi_ev_death() */
+	spin_lock_bh(&mcdi->iface_lock);
+	++mcdi->seqno;
+	seqno = mcdi->seqno & SEQ_MASK;
+	spin_unlock_bh(&mcdi->iface_lock);
+
+	xflags = 0;
+	if (mcdi->mode == MCDI_MODE_EVENTS)
+		xflags |= MCDI_HEADER_XFLAGS_EVREQ;
+
+	if (efx->type->mcdi_max_ver == 1) {
+		/* MCDI v1 */
+		EFX_POPULATE_DWORD_7(hdr[0],
+				     MCDI_HEADER_RESPONSE, 0,
+				     MCDI_HEADER_RESYNC, 1,
+				     MCDI_HEADER_CODE, cmd,
+				     MCDI_HEADER_DATALEN, inlen,
+				     MCDI_HEADER_SEQ, seqno,
+				     MCDI_HEADER_XFLAGS, xflags,
+				     MCDI_HEADER_NOT_EPOCH, !mcdi->new_epoch);
+		hdr_len = 4;
+	} else {
+		/* MCDI v2 */
+		BUG_ON(inlen > MCDI_CTL_SDU_LEN_MAX_V2);
+		EFX_POPULATE_DWORD_7(hdr[0],
+				     MCDI_HEADER_RESPONSE, 0,
+				     MCDI_HEADER_RESYNC, 1,
+				     MCDI_HEADER_CODE, MC_CMD_V2_EXTN,
+				     MCDI_HEADER_DATALEN, 0,
+				     MCDI_HEADER_SEQ, seqno,
+				     MCDI_HEADER_XFLAGS, xflags,
+				     MCDI_HEADER_NOT_EPOCH, !mcdi->new_epoch);
+		EFX_POPULATE_DWORD_2(hdr[1],
+				     MC_CMD_V2_EXTN_IN_EXTENDED_CMD, cmd,
+				     MC_CMD_V2_EXTN_IN_ACTUAL_LEN, inlen);
+		hdr_len = 8;
+	}
+
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	if (mcdi->logging_enabled && !WARN_ON_ONCE(!buf)) {
+		int bytes = 0;
+		int i;
+		/* Lengths should always be a whole number of dwords, so scream
+		 * if they're not.
+		 */
+		WARN_ON_ONCE(hdr_len % 4);
+		WARN_ON_ONCE(inlen % 4);
+
+		/* We own the logging buffer, as only one MCDI can be in
+		 * progress on a NIC at any one time.  So no need for locking.
+		 */
+		for (i = 0; i < hdr_len / 4 && bytes < PAGE_SIZE; i++)
+			bytes += scnprintf(buf + bytes, PAGE_SIZE - bytes,
+					   " %08x",
+					   le32_to_cpu(hdr[i].u32[0]));
+
+		for (i = 0; i < inlen / 4 && bytes < PAGE_SIZE; i++)
+			bytes += scnprintf(buf + bytes, PAGE_SIZE - bytes,
+					   " %08x",
+					   le32_to_cpu(inbuf[i].u32[0]));
+
+		netif_info(efx, hw, efx->net_dev, "MCDI RPC REQ:%s\n", buf);
+	}
+#endif
+
+	efx->type->mcdi_request(efx, hdr, hdr_len, inbuf, inlen);
+
+	mcdi->new_epoch = false;
+}
+
+static int efx_mcdi_errno(unsigned int mcdi_err)
+{
+	switch (mcdi_err) {
+	case 0:
+		return 0;
+#define TRANSLATE_ERROR(name)					\
+	case MC_CMD_ERR_ ## name:				\
+		return -name;
+	TRANSLATE_ERROR(EPERM);
+	TRANSLATE_ERROR(ENOENT);
+	TRANSLATE_ERROR(EINTR);
+	TRANSLATE_ERROR(EAGAIN);
+	TRANSLATE_ERROR(EACCES);
+	TRANSLATE_ERROR(EBUSY);
+	TRANSLATE_ERROR(EINVAL);
+	TRANSLATE_ERROR(EDEADLK);
+	TRANSLATE_ERROR(ENOSYS);
+	TRANSLATE_ERROR(ETIME);
+	TRANSLATE_ERROR(EALREADY);
+	TRANSLATE_ERROR(ENOSPC);
+#undef TRANSLATE_ERROR
+	case MC_CMD_ERR_ENOTSUP:
+		return -EOPNOTSUPP;
+	case MC_CMD_ERR_ALLOC_FAIL:
+		return -ENOBUFS;
+	case MC_CMD_ERR_MAC_EXIST:
+		return -EADDRINUSE;
+	default:
+		return -EPROTO;
+	}
+}
+
+static void efx_mcdi_read_response_header(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	unsigned int respseq, respcmd, error;
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	char *buf = mcdi->logging_buffer; /* page-sized */
+#endif
+	efx_dword_t hdr;
+
+	efx->type->mcdi_read_response(efx, &hdr, 0, 4);
+	respseq = EFX_DWORD_FIELD(hdr, MCDI_HEADER_SEQ);
+	respcmd = EFX_DWORD_FIELD(hdr, MCDI_HEADER_CODE);
+	error = EFX_DWORD_FIELD(hdr, MCDI_HEADER_ERROR);
+
+	if (respcmd != MC_CMD_V2_EXTN) {
+		mcdi->resp_hdr_len = 4;
+		mcdi->resp_data_len = EFX_DWORD_FIELD(hdr, MCDI_HEADER_DATALEN);
+	} else {
+		efx->type->mcdi_read_response(efx, &hdr, 4, 4);
+		mcdi->resp_hdr_len = 8;
+		mcdi->resp_data_len =
+			EFX_DWORD_FIELD(hdr, MC_CMD_V2_EXTN_IN_ACTUAL_LEN);
+	}
+
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	if (mcdi->logging_enabled && !WARN_ON_ONCE(!buf)) {
+		size_t hdr_len, data_len;
+		int bytes = 0;
+		int i;
+
+		WARN_ON_ONCE(mcdi->resp_hdr_len % 4);
+		hdr_len = mcdi->resp_hdr_len / 4;
+		/* MCDI_DECLARE_BUF ensures that the underlying buffer is padded
+		 * to dword size, and the MCDI buffer is always dword size
+		 */
+		data_len = DIV_ROUND_UP(mcdi->resp_data_len, 4);
+
+		/* We own the logging buffer, as only one MCDI can be in
+		 * progress on a NIC at any one time.  So no need for locking.
+		 */
+		for (i = 0; i < hdr_len && bytes < PAGE_SIZE; i++) {
+			efx->type->mcdi_read_response(efx, &hdr, (i * 4), 4);
+			bytes += scnprintf(buf + bytes, PAGE_SIZE - bytes,
+					   " %08x", le32_to_cpu(hdr.u32[0]));
+		}
+
+		for (i = 0; i < data_len && bytes < PAGE_SIZE; i++) {
+			efx->type->mcdi_read_response(efx, &hdr,
+					mcdi->resp_hdr_len + (i * 4), 4);
+			bytes += scnprintf(buf + bytes, PAGE_SIZE - bytes,
+					   " %08x", le32_to_cpu(hdr.u32[0]));
+		}
+
+		netif_info(efx, hw, efx->net_dev, "MCDI RPC RESP:%s\n", buf);
+	}
+#endif
+
+	mcdi->resprc_raw = 0;
+	if (error && mcdi->resp_data_len == 0) {
+		netif_err(efx, hw, efx->net_dev, "MC rebooted\n");
+		mcdi->resprc = -EIO;
+	} else if ((respseq ^ mcdi->seqno) & SEQ_MASK) {
+		netif_err(efx, hw, efx->net_dev,
+			  "MC response mismatch tx seq 0x%x rx seq 0x%x\n",
+			  respseq, mcdi->seqno);
+		mcdi->resprc = -EIO;
+	} else if (error) {
+		efx->type->mcdi_read_response(efx, &hdr, mcdi->resp_hdr_len, 4);
+		mcdi->resprc_raw = EFX_DWORD_FIELD(hdr, EFX_DWORD_0);
+		mcdi->resprc = efx_mcdi_errno(mcdi->resprc_raw);
+	} else {
+		mcdi->resprc = 0;
+	}
+}
+
+static bool efx_mcdi_poll_once(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	rmb();
+	if (!efx->type->mcdi_poll_response(efx))
+		return false;
+
+	spin_lock_bh(&mcdi->iface_lock);
+	efx_mcdi_read_response_header(efx);
+	spin_unlock_bh(&mcdi->iface_lock);
+
+	return true;
+}
+
+static int efx_mcdi_poll(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	unsigned long time, finish;
+	unsigned int spins;
+	int rc;
+
+	/* Check for a reboot atomically with respect to efx_mcdi_copyout() */
+	rc = efx_mcdi_poll_reboot(efx);
+	if (rc) {
+		spin_lock_bh(&mcdi->iface_lock);
+		mcdi->resprc = rc;
+		mcdi->resp_hdr_len = 0;
+		mcdi->resp_data_len = 0;
+		spin_unlock_bh(&mcdi->iface_lock);
+		return 0;
+	}
+
+	/* Poll for completion. Poll quickly (once per microsecond) for the
+	 * first jiffy, because MCDI responses are generally fast. After that,
+	 * back off and poll approximately once per jiffy.
+	 */
+	spins = USER_TICK_USEC;
+	finish = jiffies + MCDI_RPC_TIMEOUT;
+
+	while (1) {
+		if (spins != 0) {
+			--spins;
+			udelay(1);
+		} else {
+			schedule_timeout_uninterruptible(1);
+		}
+
+		time = jiffies;
+
+		if (efx_mcdi_poll_once(efx))
+			break;
+
+		if (time_after(time, finish))
+			return -ETIMEDOUT;
+	}
+
+	/* Return rc=0 like wait_event_timeout() */
+	return 0;
+}
+
+/* Test and clear MC-rebooted flag for this port/function; reset
+ * software state as necessary.
+ */
+int efx_mcdi_poll_reboot(struct efx_nic *efx)
+{
+	if (!efx->mcdi)
+		return 0;
+
+	return efx->type->mcdi_poll_reboot(efx);
+}
+
+static bool efx_mcdi_acquire_async(struct efx_mcdi_iface *mcdi)
+{
+	return cmpxchg(&mcdi->state,
+		       MCDI_STATE_QUIESCENT, MCDI_STATE_RUNNING_ASYNC) ==
+		MCDI_STATE_QUIESCENT;
+}
+
+static void efx_mcdi_acquire_sync(struct efx_mcdi_iface *mcdi)
+{
+	/* Wait until the interface becomes QUIESCENT and we win the race
+	 * to mark it RUNNING_SYNC.
+	 */
+	wait_event(mcdi->wq,
+		   cmpxchg(&mcdi->state,
+			   MCDI_STATE_QUIESCENT, MCDI_STATE_RUNNING_SYNC) ==
+		   MCDI_STATE_QUIESCENT);
+}
+
+static int efx_mcdi_await_completion(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	if (wait_event_timeout(mcdi->wq, mcdi->state == MCDI_STATE_COMPLETED,
+			       MCDI_RPC_TIMEOUT) == 0)
+		return -ETIMEDOUT;
+
+	/* Check if efx_mcdi_set_mode() switched us back to polled completions,
+	 * in which case poll for completions directly. If efx_mcdi_ev_cpl()
+	 * completed the request first, then we'll just end up completing the
+	 * request again, which is safe.
+	 *
+	 * We need an smp_rmb() to synchronise with efx_mcdi_mode_poll(), which
+	 * wait_event_timeout() implicitly provides.
+	 */
+	if (mcdi->mode == MCDI_MODE_POLL)
+		return efx_mcdi_poll(efx);
+
+	return 0;
+}
+
+/* If the interface is RUNNING_SYNC, switch to COMPLETED and wake the
+ * requester.  Return whether this was done.  Does not take any locks.
+ */
+static bool efx_mcdi_complete_sync(struct efx_mcdi_iface *mcdi)
+{
+	if (cmpxchg(&mcdi->state,
+		    MCDI_STATE_RUNNING_SYNC, MCDI_STATE_COMPLETED) ==
+	    MCDI_STATE_RUNNING_SYNC) {
+		wake_up(&mcdi->wq);
+		return true;
+	}
+
+	return false;
+}
+
+static void efx_mcdi_release(struct efx_mcdi_iface *mcdi)
+{
+	if (mcdi->mode == MCDI_MODE_EVENTS) {
+		struct efx_mcdi_async_param *async;
+		struct efx_nic *efx = mcdi->efx;
+
+		/* Process the asynchronous request queue */
+		spin_lock_bh(&mcdi->async_lock);
+		async = list_first_entry_or_null(
+			&mcdi->async_list, struct efx_mcdi_async_param, list);
+		if (async) {
+			mcdi->state = MCDI_STATE_RUNNING_ASYNC;
+			efx_mcdi_send_request(efx, async->cmd,
+					      (const efx_dword_t *)(async + 1),
+					      async->inlen);
+			mod_timer(&mcdi->async_timer,
+				  jiffies + MCDI_RPC_TIMEOUT);
+		}
+		spin_unlock_bh(&mcdi->async_lock);
+
+		if (async)
+			return;
+	}
+
+	mcdi->state = MCDI_STATE_QUIESCENT;
+	wake_up(&mcdi->wq);
+}
+
+/* If the interface is RUNNING_ASYNC, switch to COMPLETED, call the
+ * asynchronous completion function, and release the interface.
+ * Return whether this was done.  Must be called in bh-disabled
+ * context.  Will take iface_lock and async_lock.
+ */
+static bool efx_mcdi_complete_async(struct efx_mcdi_iface *mcdi, bool timeout)
+{
+	struct efx_nic *efx = mcdi->efx;
+	struct efx_mcdi_async_param *async;
+	size_t hdr_len, data_len, err_len;
+	efx_dword_t *outbuf;
+	MCDI_DECLARE_BUF_ERR(errbuf);
+	int rc;
+
+	if (cmpxchg(&mcdi->state,
+		    MCDI_STATE_RUNNING_ASYNC, MCDI_STATE_COMPLETED) !=
+	    MCDI_STATE_RUNNING_ASYNC)
+		return false;
+
+	spin_lock(&mcdi->iface_lock);
+	if (timeout) {
+		/* Ensure that if the completion event arrives later,
+		 * the seqno check in efx_mcdi_ev_cpl() will fail
+		 */
+		++mcdi->seqno;
+		++mcdi->credits;
+		rc = -ETIMEDOUT;
+		hdr_len = 0;
+		data_len = 0;
+	} else {
+		rc = mcdi->resprc;
+		hdr_len = mcdi->resp_hdr_len;
+		data_len = mcdi->resp_data_len;
+	}
+	spin_unlock(&mcdi->iface_lock);
+
+	/* Stop the timer.  In case the timer function is running, we
+	 * must wait for it to return so that there is no possibility
+	 * of it aborting the next request.
+	 */
+	if (!timeout)
+		del_timer_sync(&mcdi->async_timer);
+
+	spin_lock(&mcdi->async_lock);
+	async = list_first_entry(&mcdi->async_list,
+				 struct efx_mcdi_async_param, list);
+	list_del(&async->list);
+	spin_unlock(&mcdi->async_lock);
+
+	outbuf = (efx_dword_t *)(async + 1);
+	efx->type->mcdi_read_response(efx, outbuf, hdr_len,
+				      min(async->outlen, data_len));
+	if (!timeout && rc && !async->quiet) {
+		err_len = min(sizeof(errbuf), data_len);
+		efx->type->mcdi_read_response(efx, errbuf, hdr_len,
+					      sizeof(errbuf));
+		efx_mcdi_display_error(efx, async->cmd, async->inlen, errbuf,
+				       err_len, rc);
+	}
+
+	if (async->complete)
+		async->complete(efx, async->cookie, rc, outbuf,
+				min(async->outlen, data_len));
+	kfree(async);
+
+	efx_mcdi_release(mcdi);
+
+	return true;
+}
+
+static void efx_mcdi_ev_cpl(struct efx_nic *efx, unsigned int seqno,
+			    unsigned int datalen, unsigned int mcdi_err)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	bool wake = false;
+
+	spin_lock(&mcdi->iface_lock);
+
+	if ((seqno ^ mcdi->seqno) & SEQ_MASK) {
+		if (mcdi->credits)
+			/* The request has been cancelled */
+			--mcdi->credits;
+		else
+			netif_err(efx, hw, efx->net_dev,
+				  "MC response mismatch tx seq 0x%x rx "
+				  "seq 0x%x\n", seqno, mcdi->seqno);
+	} else {
+		if (efx->type->mcdi_max_ver >= 2) {
+			/* MCDI v2 responses don't fit in an event */
+			efx_mcdi_read_response_header(efx);
+		} else {
+			mcdi->resprc = efx_mcdi_errno(mcdi_err);
+			mcdi->resp_hdr_len = 4;
+			mcdi->resp_data_len = datalen;
+		}
+
+		wake = true;
+	}
+
+	spin_unlock(&mcdi->iface_lock);
+
+	if (wake) {
+		if (!efx_mcdi_complete_async(mcdi, false))
+			(void) efx_mcdi_complete_sync(mcdi);
+
+		/* If the interface isn't RUNNING_ASYNC or
+		 * RUNNING_SYNC then we've received a duplicate
+		 * completion after we've already transitioned back to
+		 * QUIESCENT. [A subsequent invocation would increment
+		 * seqno, so would have failed the seqno check].
+		 */
+	}
+}
+
+static void efx_mcdi_timeout_async(struct timer_list *t)
+{
+	struct efx_mcdi_iface *mcdi = from_timer(mcdi, t, async_timer);
+
+	efx_mcdi_complete_async(mcdi, true);
+}
+
+static int
+efx_mcdi_check_supported(struct efx_nic *efx, unsigned int cmd, size_t inlen)
+{
+	if (efx->type->mcdi_max_ver < 0 ||
+	     (efx->type->mcdi_max_ver < 2 &&
+	      cmd > MC_CMD_CMD_SPACE_ESCAPE_7))
+		return -EINVAL;
+
+	if (inlen > MCDI_CTL_SDU_LEN_MAX_V2 ||
+	    (efx->type->mcdi_max_ver < 2 &&
+	     inlen > MCDI_CTL_SDU_LEN_MAX_V1))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static bool efx_mcdi_get_proxy_handle(struct efx_nic *efx,
+				      size_t hdr_len, size_t data_len,
+				      u32 *proxy_handle)
+{
+	MCDI_DECLARE_BUF_ERR(testbuf);
+	const size_t buflen = sizeof(testbuf);
+
+	if (!proxy_handle || data_len < buflen)
+		return false;
+
+	efx->type->mcdi_read_response(efx, testbuf, hdr_len, buflen);
+	if (MCDI_DWORD(testbuf, ERR_CODE) == MC_CMD_ERR_PROXY_PENDING) {
+		*proxy_handle = MCDI_DWORD(testbuf, ERR_PROXY_PENDING_HANDLE);
+		return true;
+	}
+
+	return false;
+}
+
+static int _efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned int cmd,
+				size_t inlen,
+				efx_dword_t *outbuf, size_t outlen,
+				size_t *outlen_actual, bool quiet,
+				u32 *proxy_handle, int *raw_rc)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	MCDI_DECLARE_BUF_ERR(errbuf);
+	int rc;
+
+	if (mcdi->mode == MCDI_MODE_POLL)
+		rc = efx_mcdi_poll(efx);
+	else
+		rc = efx_mcdi_await_completion(efx);
+
+	if (rc != 0) {
+		netif_err(efx, hw, efx->net_dev,
+			  "MC command 0x%x inlen %d mode %d timed out\n",
+			  cmd, (int)inlen, mcdi->mode);
+
+		if (mcdi->mode == MCDI_MODE_EVENTS && efx_mcdi_poll_once(efx)) {
+			netif_err(efx, hw, efx->net_dev,
+				  "MCDI request was completed without an event\n");
+			rc = 0;
+		}
+
+		efx_mcdi_abandon(efx);
+
+		/* Close the race with efx_mcdi_ev_cpl() executing just too late
+		 * and completing a request we've just cancelled, by ensuring
+		 * that the seqno check therein fails.
+		 */
+		spin_lock_bh(&mcdi->iface_lock);
+		++mcdi->seqno;
+		++mcdi->credits;
+		spin_unlock_bh(&mcdi->iface_lock);
+	}
+
+	if (proxy_handle)
+		*proxy_handle = 0;
+
+	if (rc != 0) {
+		if (outlen_actual)
+			*outlen_actual = 0;
+	} else {
+		size_t hdr_len, data_len, err_len;
+
+		/* At the very least we need a memory barrier here to ensure
+		 * we pick up changes from efx_mcdi_ev_cpl(). Protect against
+		 * a spurious efx_mcdi_ev_cpl() running concurrently by
+		 * acquiring the iface_lock. */
+		spin_lock_bh(&mcdi->iface_lock);
+		rc = mcdi->resprc;
+		if (raw_rc)
+			*raw_rc = mcdi->resprc_raw;
+		hdr_len = mcdi->resp_hdr_len;
+		data_len = mcdi->resp_data_len;
+		err_len = min(sizeof(errbuf), data_len);
+		spin_unlock_bh(&mcdi->iface_lock);
+
+		BUG_ON(rc > 0);
+
+		efx->type->mcdi_read_response(efx, outbuf, hdr_len,
+					      min(outlen, data_len));
+		if (outlen_actual)
+			*outlen_actual = data_len;
+
+		efx->type->mcdi_read_response(efx, errbuf, hdr_len, err_len);
+
+		if (cmd == MC_CMD_REBOOT && rc == -EIO) {
+			/* Don't reset if MC_CMD_REBOOT returns EIO */
+		} else if (rc == -EIO || rc == -EINTR) {
+			netif_err(efx, hw, efx->net_dev, "MC reboot detected\n");
+			netif_dbg(efx, hw, efx->net_dev, "MC rebooted during command %d rc %d\n",
+				  cmd, -rc);
+			if (efx->type->mcdi_reboot_detected)
+				efx->type->mcdi_reboot_detected(efx);
+			efx_schedule_reset(efx, RESET_TYPE_MC_FAILURE);
+		} else if (proxy_handle && (rc == -EPROTO) &&
+			   efx_mcdi_get_proxy_handle(efx, hdr_len, data_len,
+						     proxy_handle)) {
+			mcdi->proxy_rx_status = 0;
+			mcdi->proxy_rx_handle = 0;
+			mcdi->state = MCDI_STATE_PROXY_WAIT;
+		} else if (rc && !quiet) {
+			efx_mcdi_display_error(efx, cmd, inlen, errbuf, err_len,
+					       rc);
+		}
+
+		if (rc == -EIO || rc == -EINTR) {
+			msleep(MCDI_STATUS_SLEEP_MS);
+			efx_mcdi_poll_reboot(efx);
+			mcdi->new_epoch = true;
+		}
+	}
+
+	if (!proxy_handle || !*proxy_handle)
+		efx_mcdi_release(mcdi);
+	return rc;
+}
+
+static void efx_mcdi_proxy_abort(struct efx_mcdi_iface *mcdi)
+{
+	if (mcdi->state == MCDI_STATE_PROXY_WAIT) {
+		/* Interrupt the proxy wait. */
+		mcdi->proxy_rx_status = -EINTR;
+		wake_up(&mcdi->proxy_rx_wq);
+	}
+}
+
+static void efx_mcdi_ev_proxy_response(struct efx_nic *efx,
+				       u32 handle, int status)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	WARN_ON(mcdi->state != MCDI_STATE_PROXY_WAIT);
+
+	mcdi->proxy_rx_status = efx_mcdi_errno(status);
+	/* Ensure the status is written before we update the handle, since the
+	 * latter is used to check if we've finished.
+	 */
+	wmb();
+	mcdi->proxy_rx_handle = handle;
+	wake_up(&mcdi->proxy_rx_wq);
+}
+
+static int efx_mcdi_proxy_wait(struct efx_nic *efx, u32 handle, bool quiet)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	int rc;
+
+	/* Wait for a proxy event, or timeout. */
+	rc = wait_event_timeout(mcdi->proxy_rx_wq,
+				mcdi->proxy_rx_handle != 0 ||
+				mcdi->proxy_rx_status == -EINTR,
+				MCDI_RPC_TIMEOUT);
+
+	if (rc <= 0) {
+		netif_dbg(efx, hw, efx->net_dev,
+			  "MCDI proxy timeout %d\n", handle);
+		return -ETIMEDOUT;
+	} else if (mcdi->proxy_rx_handle != handle) {
+		netif_warn(efx, hw, efx->net_dev,
+			   "MCDI proxy unexpected handle %d (expected %d)\n",
+			   mcdi->proxy_rx_handle, handle);
+		return -EINVAL;
+	}
+
+	return mcdi->proxy_rx_status;
+}
+
+static int _efx_mcdi_rpc(struct efx_nic *efx, unsigned int cmd,
+			 const efx_dword_t *inbuf, size_t inlen,
+			 efx_dword_t *outbuf, size_t outlen,
+			 size_t *outlen_actual, bool quiet, int *raw_rc)
+{
+	u32 proxy_handle = 0; /* Zero is an invalid proxy handle. */
+	int rc;
+
+	if (inbuf && inlen && (inbuf == outbuf)) {
+		/* The input buffer can't be aliased with the output. */
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	rc = efx_mcdi_rpc_start(efx, cmd, inbuf, inlen);
+	if (rc)
+		return rc;
+
+	rc = _efx_mcdi_rpc_finish(efx, cmd, inlen, outbuf, outlen,
+				  outlen_actual, quiet, &proxy_handle, raw_rc);
+
+	if (proxy_handle) {
+		/* Handle proxy authorisation. This allows approval of MCDI
+		 * operations to be delegated to the admin function, allowing
+		 * fine control over (eg) multicast subscriptions.
+		 */
+		struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+		netif_dbg(efx, hw, efx->net_dev,
+			  "MCDI waiting for proxy auth %d\n",
+			  proxy_handle);
+		rc = efx_mcdi_proxy_wait(efx, proxy_handle, quiet);
+
+		if (rc == 0) {
+			netif_dbg(efx, hw, efx->net_dev,
+				  "MCDI proxy retry %d\n", proxy_handle);
+
+			/* We now retry the original request. */
+			mcdi->state = MCDI_STATE_RUNNING_SYNC;
+			efx_mcdi_send_request(efx, cmd, inbuf, inlen);
+
+			rc = _efx_mcdi_rpc_finish(efx, cmd, inlen,
+						  outbuf, outlen, outlen_actual,
+						  quiet, NULL, raw_rc);
+		} else {
+			netif_cond_dbg(efx, hw, efx->net_dev, rc == -EPERM, err,
+				       "MC command 0x%x failed after proxy auth rc=%d\n",
+				       cmd, rc);
+
+			if (rc == -EINTR || rc == -EIO)
+				efx_schedule_reset(efx, RESET_TYPE_MC_FAILURE);
+			efx_mcdi_release(mcdi);
+		}
+	}
+
+	return rc;
+}
+
+static int _efx_mcdi_rpc_evb_retry(struct efx_nic *efx, unsigned cmd,
+				   const efx_dword_t *inbuf, size_t inlen,
+				   efx_dword_t *outbuf, size_t outlen,
+				   size_t *outlen_actual, bool quiet)
+{
+	int raw_rc = 0;
+	int rc;
+
+	rc = _efx_mcdi_rpc(efx, cmd, inbuf, inlen,
+			   outbuf, outlen, outlen_actual, true, &raw_rc);
+
+	if ((rc == -EPROTO) && (raw_rc == MC_CMD_ERR_NO_EVB_PORT) &&
+	    efx->type->is_vf) {
+		/* If the EVB port isn't available within a VF this may
+		 * mean the PF is still bringing the switch up. We should
+		 * retry our request shortly.
+		 */
+		unsigned long abort_time = jiffies + MCDI_RPC_TIMEOUT;
+		unsigned int delay_us = 10000;
+
+		netif_dbg(efx, hw, efx->net_dev,
+			  "%s: NO_EVB_PORT; will retry request\n",
+			  __func__);
+
+		do {
+			usleep_range(delay_us, delay_us + 10000);
+			rc = _efx_mcdi_rpc(efx, cmd, inbuf, inlen,
+					   outbuf, outlen, outlen_actual,
+					   true, &raw_rc);
+			if (delay_us < 100000)
+				delay_us <<= 1;
+		} while ((rc == -EPROTO) &&
+			 (raw_rc == MC_CMD_ERR_NO_EVB_PORT) &&
+			 time_before(jiffies, abort_time));
+	}
+
+	if (rc && !quiet && !(cmd == MC_CMD_REBOOT && rc == -EIO))
+		efx_mcdi_display_error(efx, cmd, inlen,
+				       outbuf, outlen, rc);
+
+	return rc;
+}
+
+/**
+ * efx_mcdi_rpc - Issue an MCDI command and wait for completion
+ * @efx: NIC through which to issue the command
+ * @cmd: Command type number
+ * @inbuf: Command parameters
+ * @inlen: Length of command parameters, in bytes.  Must be a multiple
+ *	of 4 and no greater than %MCDI_CTL_SDU_LEN_MAX_V1.
+ * @outbuf: Response buffer.  May be %NULL if @outlen is 0.
+ * @outlen: Length of response buffer, in bytes.  If the actual
+ *	response is longer than @outlen & ~3, it will be truncated
+ *	to that length.
+ * @outlen_actual: Pointer through which to return the actual response
+ *	length.  May be %NULL if this is not needed.
+ *
+ * This function may sleep and therefore must be called in an appropriate
+ * context.
+ *
+ * Return: A negative error code, or zero if successful.  The error
+ *	code may come from the MCDI response or may indicate a failure
+ *	to communicate with the MC.  In the former case, the response
+ *	will still be copied to @outbuf and *@outlen_actual will be
+ *	set accordingly.  In the latter case, *@outlen_actual will be
+ *	set to zero.
+ */
+int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd,
+		 const efx_dword_t *inbuf, size_t inlen,
+		 efx_dword_t *outbuf, size_t outlen,
+		 size_t *outlen_actual)
+{
+	return _efx_mcdi_rpc_evb_retry(efx, cmd, inbuf, inlen, outbuf, outlen,
+				       outlen_actual, false);
+}
+
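+/* Example usage (an illustrative sketch, not a call made by this file): a
+ * typical synchronous caller declares a dword-sized buffer with
+ * MCDI_DECLARE_BUF and checks the returned length before trusting the
+ * response, as efx_mcdi_print_fwver() below does:
+ *
+ *	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_VERSION_OUT_LEN);
+ *	size_t outlen;
+ *	int rc;
+ *
+ *	rc = efx_mcdi_rpc(efx, MC_CMD_GET_VERSION, NULL, 0,
+ *			  outbuf, sizeof(outbuf), &outlen);
+ *	if (rc)
+ *		return rc;
+ *	if (outlen < MC_CMD_GET_VERSION_OUT_LEN)
+ *		return -EIO;
+ */
+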
+/* Normally, on receiving an error code in the MCDI response,
+ * efx_mcdi_rpc will log an error message containing (among other
+ * things) the raw error code, by means of efx_mcdi_display_error.
+ * This _quiet version suppresses that; if the caller wishes to log
+ * the error conditionally on the return code, it should call this
+ * function and is then responsible for calling efx_mcdi_display_error
+ * as needed.
+ */
+int efx_mcdi_rpc_quiet(struct efx_nic *efx, unsigned cmd,
+		       const efx_dword_t *inbuf, size_t inlen,
+		       efx_dword_t *outbuf, size_t outlen,
+		       size_t *outlen_actual)
+{
+	return _efx_mcdi_rpc_evb_retry(efx, cmd, inbuf, inlen, outbuf, outlen,
+				       outlen_actual, true);
+}
+
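+/* Example usage (an illustrative sketch; cmd, inbuf and outbuf stand for the
+ * caller's own command and buffers): a caller that expects and tolerates one
+ * particular error, but still wants anything unexpected logged, uses the
+ * _quiet variant and calls efx_mcdi_display_error() itself, much as
+ * efx_mcdi_read_assertion() below does:
+ *
+ *	rc = efx_mcdi_rpc_quiet(efx, cmd, inbuf, inlen,
+ *				outbuf, sizeof(outbuf), &outlen);
+ *	if (rc == -EPERM)
+ *		return 0;
+ *	if (rc)
+ *		efx_mcdi_display_error(efx, cmd, inlen, outbuf, outlen, rc);
+ */
+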
+int efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd,
+		       const efx_dword_t *inbuf, size_t inlen)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	int rc;
+
+	rc = efx_mcdi_check_supported(efx, cmd, inlen);
+	if (rc)
+		return rc;
+
+	if (efx->mc_bist_for_other_fn)
+		return -ENETDOWN;
+
+	if (mcdi->mode == MCDI_MODE_FAIL)
+		return -ENETDOWN;
+
+	efx_mcdi_acquire_sync(mcdi);
+	efx_mcdi_send_request(efx, cmd, inbuf, inlen);
+	return 0;
+}
+
+static int _efx_mcdi_rpc_async(struct efx_nic *efx, unsigned int cmd,
+			       const efx_dword_t *inbuf, size_t inlen,
+			       size_t outlen,
+			       efx_mcdi_async_completer *complete,
+			       unsigned long cookie, bool quiet)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	struct efx_mcdi_async_param *async;
+	int rc;
+
+	rc = efx_mcdi_check_supported(efx, cmd, inlen);
+	if (rc)
+		return rc;
+
+	if (efx->mc_bist_for_other_fn)
+		return -ENETDOWN;
+
+	async = kmalloc(sizeof(*async) + ALIGN(max(inlen, outlen), 4),
+			GFP_ATOMIC);
+	if (!async)
+		return -ENOMEM;
+
+	async->cmd = cmd;
+	async->inlen = inlen;
+	async->outlen = outlen;
+	async->quiet = quiet;
+	async->complete = complete;
+	async->cookie = cookie;
+	memcpy(async + 1, inbuf, inlen);
+
+	spin_lock_bh(&mcdi->async_lock);
+
+	if (mcdi->mode == MCDI_MODE_EVENTS) {
+		list_add_tail(&async->list, &mcdi->async_list);
+
+		/* If this is at the front of the queue, try to start it
+		 * immediately
+		 */
+		if (mcdi->async_list.next == &async->list &&
+		    efx_mcdi_acquire_async(mcdi)) {
+			efx_mcdi_send_request(efx, cmd, inbuf, inlen);
+			mod_timer(&mcdi->async_timer,
+				  jiffies + MCDI_RPC_TIMEOUT);
+		}
+	} else {
+		kfree(async);
+		rc = -ENETDOWN;
+	}
+
+	spin_unlock_bh(&mcdi->async_lock);
+
+	return rc;
+}
+
+/**
+ * efx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously
+ * @efx: NIC through which to issue the command
+ * @cmd: Command type number
+ * @inbuf: Command parameters
+ * @inlen: Length of command parameters, in bytes
+ * @outlen: Length to allocate for response buffer, in bytes
+ * @complete: Function to be called on completion or cancellation.
+ * @cookie: Arbitrary value to be passed to @complete.
+ *
+ * This function does not sleep and therefore may be called in atomic
+ * context.  It will fail if event queues are disabled or if MCDI
+ * event completions have been disabled due to an error.
+ *
+ * If it succeeds, the @complete function will be called exactly once
+ * in atomic context, when one of the following occurs:
+ * (a) the completion event is received (in NAPI context)
+ * (b) event queues are disabled (in the process that disables them)
+ * (c) the request times-out (in timer context)
+ */
+int
+efx_mcdi_rpc_async(struct efx_nic *efx, unsigned int cmd,
+		   const efx_dword_t *inbuf, size_t inlen, size_t outlen,
+		   efx_mcdi_async_completer *complete, unsigned long cookie)
+{
+	return _efx_mcdi_rpc_async(efx, cmd, inbuf, inlen, outlen, complete,
+				   cookie, false);
+}
+
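+/* Example usage (an illustrative sketch; the completer name and cookie value
+ * are placeholders): the caller supplies an efx_mcdi_async_completer, which
+ * runs in atomic context when the request completes, is cancelled or times
+ * out:
+ *
+ *	static void my_complete(struct efx_nic *efx, unsigned long cookie,
+ *				int rc, efx_dword_t *outbuf,
+ *				size_t outlen_actual)
+ *	{
+ *		... use outbuf/outlen_actual, or handle a negative rc ...
+ *	}
+ *
+ *	rc = efx_mcdi_rpc_async(efx, cmd, inbuf, inlen, outlen,
+ *				my_complete, 0);
+ */
+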
+int efx_mcdi_rpc_async_quiet(struct efx_nic *efx, unsigned int cmd,
+			     const efx_dword_t *inbuf, size_t inlen,
+			     size_t outlen, efx_mcdi_async_completer *complete,
+			     unsigned long cookie)
+{
+	return _efx_mcdi_rpc_async(efx, cmd, inbuf, inlen, outlen, complete,
+				   cookie, true);
+}
+
+int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			efx_dword_t *outbuf, size_t outlen,
+			size_t *outlen_actual)
+{
+	return _efx_mcdi_rpc_finish(efx, cmd, inlen, outbuf, outlen,
+				    outlen_actual, false, NULL, NULL);
+}
+
+int efx_mcdi_rpc_finish_quiet(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			      efx_dword_t *outbuf, size_t outlen,
+			      size_t *outlen_actual)
+{
+	return _efx_mcdi_rpc_finish(efx, cmd, inlen, outbuf, outlen,
+				    outlen_actual, true, NULL, NULL);
+}
+
+void efx_mcdi_display_error(struct efx_nic *efx, unsigned cmd,
+			    size_t inlen, efx_dword_t *outbuf,
+			    size_t outlen, int rc)
+{
+	int code = 0, err_arg = 0;
+
+	if (outlen >= MC_CMD_ERR_CODE_OFST + 4)
+		code = MCDI_DWORD(outbuf, ERR_CODE);
+	if (outlen >= MC_CMD_ERR_ARG_OFST + 4)
+		err_arg = MCDI_DWORD(outbuf, ERR_ARG);
+	netif_cond_dbg(efx, hw, efx->net_dev, rc == -EPERM, err,
+		       "MC command 0x%x inlen %zu failed rc=%d (raw=%d) arg=%d\n",
+		       cmd, inlen, rc, code, err_arg);
+}
+
+/* Switch to polled MCDI completions.  This can be called in various
+ * error conditions with various locks held, so it must be lockless.
+ * Caller is responsible for flushing asynchronous requests later.
+ */
+void efx_mcdi_mode_poll(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi;
+
+	if (!efx->mcdi)
+		return;
+
+	mcdi = efx_mcdi(efx);
+	/* If already in polling mode, nothing to do.
+	 * If in fail-fast state, don't switch to polled completion.
+	 * FLR recovery will do that later.
+	 */
+	if (mcdi->mode == MCDI_MODE_POLL || mcdi->mode == MCDI_MODE_FAIL)
+		return;
+
+	/* We can switch from event completion to polled completion, because
+	 * mcdi requests are always completed in shared memory. We do this by
+	 * switching the mode to POLL and then completing the request.
+	 * efx_mcdi_await_completion() will then call efx_mcdi_poll().
+	 *
+	 * We need an smp_wmb() to synchronise with efx_mcdi_await_completion(),
+	 * which efx_mcdi_complete_sync() provides for us.
+	 */
+	mcdi->mode = MCDI_MODE_POLL;
+
+	efx_mcdi_complete_sync(mcdi);
+}
+
+/* Flush any running or queued asynchronous requests, after event processing
+ * is stopped
+ */
+void efx_mcdi_flush_async(struct efx_nic *efx)
+{
+	struct efx_mcdi_async_param *async, *next;
+	struct efx_mcdi_iface *mcdi;
+
+	if (!efx->mcdi)
+		return;
+
+	mcdi = efx_mcdi(efx);
+
+	/* We must be in poll or fail mode so no more requests can be queued */
+	BUG_ON(mcdi->mode == MCDI_MODE_EVENTS);
+
+	del_timer_sync(&mcdi->async_timer);
+
+	/* If a request is still running, make sure we give the MC
+	 * time to complete it so that the response won't overwrite our
+	 * next request.
+	 */
+	if (mcdi->state == MCDI_STATE_RUNNING_ASYNC) {
+		efx_mcdi_poll(efx);
+		mcdi->state = MCDI_STATE_QUIESCENT;
+	}
+
+	/* Nothing else will access the async list now, so it is safe
+	 * to walk it without holding async_lock.  If we hold it while
+	 * calling a completer then lockdep may warn that we have
+	 * acquired locks in the wrong order.
+	 */
+	list_for_each_entry_safe(async, next, &mcdi->async_list, list) {
+		if (async->complete)
+			async->complete(efx, async->cookie, -ENETDOWN, NULL, 0);
+		list_del(&async->list);
+		kfree(async);
+	}
+}
+
+void efx_mcdi_mode_event(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi;
+
+	if (!efx->mcdi)
+		return;
+
+	mcdi = efx_mcdi(efx);
+	/* If already in event completion mode, nothing to do.
+	 * If in fail-fast state, don't switch to event completion.  FLR
+	 * recovery will do that later.
+	 */
+	if (mcdi->mode == MCDI_MODE_EVENTS || mcdi->mode == MCDI_MODE_FAIL)
+		return;
+
+	/* We can't switch from polled to event completion in the middle of a
+	 * request, because the completion method is specified in the request.
+	 * So acquire the interface to serialise the requestors. We don't need
+	 * to acquire the iface_lock to change the mode here, but we do need a
+	 * write memory barrier to ensure that efx_mcdi_rpc() sees it, which
+	 * efx_mcdi_acquire_sync() provides.
+	 */
+	efx_mcdi_acquire_sync(mcdi);
+	mcdi->mode = MCDI_MODE_EVENTS;
+	efx_mcdi_release(mcdi);
+}
+
+static void efx_mcdi_ev_death(struct efx_nic *efx, int rc)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	/* If there is an outstanding MCDI request, it has been terminated
+	 * either by a BADASSERT or REBOOT event. If the mcdi interface is
+	 * in polled mode, then do nothing because the MC reboot handler will
+	 * set the header correctly. However, if the mcdi interface is waiting
+	 * for a CMDDONE event it won't receive it [and since all MCDI events
+	 * are sent to the same queue, we can't be racing with
+	 * efx_mcdi_ev_cpl()]
+	 *
+	 * If there is an outstanding asynchronous request, we can't
+	 * complete it now (efx_mcdi_complete() would deadlock).  The
+	 * reset process will take care of this.
+	 *
+	 * There's a race here with efx_mcdi_send_request(), because
+	 * we might receive a REBOOT event *before* the request has
+	 * been copied out. In polled mode (during startup) this is
+	 * irrelevant, because efx_mcdi_complete_sync() is ignored. In
+	 * event mode, this condition is just an edge-case of
+	 * receiving a REBOOT event after posting the MCDI
+	 * request. Did the mc reboot before or after the copyout? The
+	 * best we can do always is just return failure.
+	 *
+	 * If there is an outstanding proxy response expected it is not going
+	 * to arrive. We should thus abort it.
+	 */
+	spin_lock(&mcdi->iface_lock);
+	efx_mcdi_proxy_abort(mcdi);
+
+	if (efx_mcdi_complete_sync(mcdi)) {
+		if (mcdi->mode == MCDI_MODE_EVENTS) {
+			mcdi->resprc = rc;
+			mcdi->resp_hdr_len = 0;
+			mcdi->resp_data_len = 0;
+			++mcdi->credits;
+		}
+	} else {
+		int count;
+
+		/* Consume the status word since efx_mcdi_rpc_finish() won't */
+		for (count = 0; count < MCDI_STATUS_DELAY_COUNT; ++count) {
+			rc = efx_mcdi_poll_reboot(efx);
+			if (rc)
+				break;
+			udelay(MCDI_STATUS_DELAY_US);
+		}
+
+		/* On EF10, a CODE_MC_REBOOT event can be received without the
+		 * reboot detection in efx_mcdi_poll_reboot() being triggered.
+		 * If zero was returned from the final call to
+		 * efx_mcdi_poll_reboot(), the MC reboot wasn't noticed but the
+		 * MC has definitely rebooted so prepare for the reset.
+		 */
+		if (!rc && efx->type->mcdi_reboot_detected)
+			efx->type->mcdi_reboot_detected(efx);
+
+		mcdi->new_epoch = true;
+
+		/* Nobody was waiting for an MCDI request, so trigger a reset */
+		efx_schedule_reset(efx, RESET_TYPE_MC_FAILURE);
+	}
+
+	spin_unlock(&mcdi->iface_lock);
+}
+
+/* The MC is going down into BIST mode.  Set the BIST flag to block
+ * new MCDI, cancel any outstanding MCDI and schedule a BIST-type reset
+ * (which doesn't actually execute a reset, it waits for the controlling
+ * function to reset it).
+ */
+static void efx_mcdi_ev_bist(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	spin_lock(&mcdi->iface_lock);
+	efx->mc_bist_for_other_fn = true;
+	efx_mcdi_proxy_abort(mcdi);
+
+	if (efx_mcdi_complete_sync(mcdi)) {
+		if (mcdi->mode == MCDI_MODE_EVENTS) {
+			mcdi->resprc = -EIO;
+			mcdi->resp_hdr_len = 0;
+			mcdi->resp_data_len = 0;
+			++mcdi->credits;
+		}
+	}
+	mcdi->new_epoch = true;
+	efx_schedule_reset(efx, RESET_TYPE_MC_BIST);
+	spin_unlock(&mcdi->iface_lock);
+}
+
+/* MCDI timeouts seen, so make all MCDI calls fail-fast and issue an FLR to try
+ * to recover.
+ */
+static void efx_mcdi_abandon(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	if (xchg(&mcdi->mode, MCDI_MODE_FAIL) == MCDI_MODE_FAIL)
+		return; /* it had already been done */
+	netif_dbg(efx, hw, efx->net_dev, "MCDI is timing out; trying to recover\n");
+	efx_schedule_reset(efx, RESET_TYPE_MCDI_TIMEOUT);
+}
+
+static void efx_handle_drain_event(struct efx_nic *efx)
+{
+	if (atomic_dec_and_test(&efx->active_queues))
+		wake_up(&efx->flush_wq);
+
+	WARN_ON(atomic_read(&efx->active_queues) < 0);
+}
+
+/* Called from efx_farch_ev_process and efx_ef10_ev_process for MCDI events */
+void efx_mcdi_process_event(struct efx_channel *channel,
+			    efx_qword_t *event)
+{
+	struct efx_nic *efx = channel->efx;
+	int code = EFX_QWORD_FIELD(*event, MCDI_EVENT_CODE);
+	u32 data = EFX_QWORD_FIELD(*event, MCDI_EVENT_DATA);
+
+	switch (code) {
+	case MCDI_EVENT_CODE_BADSSERT:
+		netif_err(efx, hw, efx->net_dev,
+			  "MC watchdog or assertion failure at 0x%x\n", data);
+		efx_mcdi_ev_death(efx, -EINTR);
+		break;
+
+	case MCDI_EVENT_CODE_PMNOTICE:
+		netif_info(efx, wol, efx->net_dev, "MCDI PM event.\n");
+		break;
+
+	case MCDI_EVENT_CODE_CMDDONE:
+		efx_mcdi_ev_cpl(efx,
+				MCDI_EVENT_FIELD(*event, CMDDONE_SEQ),
+				MCDI_EVENT_FIELD(*event, CMDDONE_DATALEN),
+				MCDI_EVENT_FIELD(*event, CMDDONE_ERRNO));
+		break;
+
+	case MCDI_EVENT_CODE_LINKCHANGE:
+		efx_mcdi_process_link_change(efx, event);
+		break;
+	case MCDI_EVENT_CODE_SENSOREVT:
+		efx_sensor_event(efx, event);
+		break;
+	case MCDI_EVENT_CODE_SCHEDERR:
+		netif_dbg(efx, hw, efx->net_dev,
+			  "MC Scheduler alert (0x%x)\n", data);
+		break;
+	case MCDI_EVENT_CODE_REBOOT:
+	case MCDI_EVENT_CODE_MC_REBOOT:
+		netif_info(efx, hw, efx->net_dev, "MC Reboot\n");
+		efx_mcdi_ev_death(efx, -EIO);
+		break;
+	case MCDI_EVENT_CODE_MC_BIST:
+		netif_info(efx, hw, efx->net_dev, "MC entered BIST mode\n");
+		efx_mcdi_ev_bist(efx);
+		break;
+	case MCDI_EVENT_CODE_MAC_STATS_DMA:
+		/* MAC stats are gathered lazily.  We can ignore this. */
+		break;
+	case MCDI_EVENT_CODE_FLR:
+		if (efx->type->sriov_flr)
+			efx->type->sriov_flr(efx,
+					     MCDI_EVENT_FIELD(*event, FLR_VF));
+		break;
+	case MCDI_EVENT_CODE_PTP_RX:
+	case MCDI_EVENT_CODE_PTP_FAULT:
+	case MCDI_EVENT_CODE_PTP_PPS:
+		efx_ptp_event(efx, event);
+		break;
+	case MCDI_EVENT_CODE_PTP_TIME:
+		efx_time_sync_event(channel, event);
+		break;
+	case MCDI_EVENT_CODE_TX_FLUSH:
+	case MCDI_EVENT_CODE_RX_FLUSH:
+		/* Two flush events will be sent: one to the same event
+		 * queue as completions, and one to event queue 0.
+		 * In the latter case the {RX,TX}_FLUSH_TO_DRIVER
+		 * flag will be set, and we should ignore the event
+		 * because we want to wait for all completions.
+		 */
+		BUILD_BUG_ON(MCDI_EVENT_TX_FLUSH_TO_DRIVER_LBN !=
+			     MCDI_EVENT_RX_FLUSH_TO_DRIVER_LBN);
+		if (!MCDI_EVENT_FIELD(*event, TX_FLUSH_TO_DRIVER))
+			efx_handle_drain_event(efx);
+		break;
+	case MCDI_EVENT_CODE_TX_ERR:
+	case MCDI_EVENT_CODE_RX_ERR:
+		netif_err(efx, hw, efx->net_dev,
+			  "%s DMA error (event: "EFX_QWORD_FMT")\n",
+			  code == MCDI_EVENT_CODE_TX_ERR ? "TX" : "RX",
+			  EFX_QWORD_VAL(*event));
+		efx_schedule_reset(efx, RESET_TYPE_DMA_ERROR);
+		break;
+	case MCDI_EVENT_CODE_PROXY_RESPONSE:
+		efx_mcdi_ev_proxy_response(efx,
+				MCDI_EVENT_FIELD(*event, PROXY_RESPONSE_HANDLE),
+				MCDI_EVENT_FIELD(*event, PROXY_RESPONSE_RC));
+		break;
+	default:
+		netif_err(efx, hw, efx->net_dev,
+			  "Unknown MCDI event " EFX_QWORD_FMT "\n",
+			  EFX_QWORD_VAL(*event));
+	}
+}
+
+/**************************************************************************
+ *
+ * Specific request functions
+ *
+ **************************************************************************
+ */
+
+void efx_mcdi_print_fwver(struct efx_nic *efx, char *buf, size_t len)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_VERSION_OUT_LEN);
+	size_t outlength;
+	const __le16 *ver_words;
+	size_t offset;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_GET_VERSION_IN_LEN != 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_VERSION, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlength);
+	if (rc)
+		goto fail;
+	if (outlength < MC_CMD_GET_VERSION_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	ver_words = (__le16 *)MCDI_PTR(outbuf, GET_VERSION_OUT_VERSION);
+	offset = scnprintf(buf, len, "%u.%u.%u.%u",
+			   le16_to_cpu(ver_words[0]),
+			   le16_to_cpu(ver_words[1]),
+			   le16_to_cpu(ver_words[2]),
+			   le16_to_cpu(ver_words[3]));
+
+	if (efx->type->print_additional_fwver)
+		offset += efx->type->print_additional_fwver(efx, buf + offset,
+							    len - offset);
+
+	/* It's theoretically possible for the string to exceed 31
+	 * characters, though in practice the first three version
+	 * components are short enough that this doesn't happen.
+	 */
+	if (WARN_ON(offset >= len))
+		buf[0] = 0;
+
+	return;
+
+fail:
+	netif_err(efx, probe, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	buf[0] = 0;
+}
+
+static int efx_mcdi_drv_attach(struct efx_nic *efx, bool driver_operating,
+			       bool *was_attached)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_DRV_ATTACH_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_DRV_ATTACH_EXT_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, DRV_ATTACH_IN_NEW_STATE,
+		       driver_operating ? 1 : 0);
+	MCDI_SET_DWORD(inbuf, DRV_ATTACH_IN_UPDATE, 1);
+	MCDI_SET_DWORD(inbuf, DRV_ATTACH_IN_FIRMWARE_ID, MC_CMD_FW_LOW_LATENCY);
+
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_DRV_ATTACH, inbuf, sizeof(inbuf),
+				outbuf, sizeof(outbuf), &outlen);
+	/* If we're not the primary PF, trying to ATTACH with a FIRMWARE_ID
+	 * specified will fail with EPERM, and we have to tell the MC we don't
+	 * care what firmware we get.
+	 */
+	if (rc == -EPERM) {
+		netif_dbg(efx, probe, efx->net_dev,
+			  "efx_mcdi_drv_attach with fw-variant setting failed EPERM, trying without it\n");
+		MCDI_SET_DWORD(inbuf, DRV_ATTACH_IN_FIRMWARE_ID,
+			       MC_CMD_FW_DONT_CARE);
+		rc = efx_mcdi_rpc_quiet(efx, MC_CMD_DRV_ATTACH, inbuf,
+					sizeof(inbuf), outbuf, sizeof(outbuf),
+					&outlen);
+	}
+	if (rc) {
+		efx_mcdi_display_error(efx, MC_CMD_DRV_ATTACH, sizeof(inbuf),
+				       outbuf, outlen, rc);
+		goto fail;
+	}
+	if (outlen < MC_CMD_DRV_ATTACH_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	if (driver_operating) {
+		if (outlen >= MC_CMD_DRV_ATTACH_EXT_OUT_LEN) {
+			efx->mcdi->fn_flags =
+				MCDI_DWORD(outbuf,
+					   DRV_ATTACH_EXT_OUT_FUNC_FLAGS);
+		} else {
+			/* Synthesise flags for Siena */
+			efx->mcdi->fn_flags =
+				1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_LINKCTRL |
+				1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_TRUSTED |
+				(efx_port_num(efx) == 0) <<
+				MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_PRIMARY;
+		}
+	}
+
+	/* We currently assume we have control of the external link
+	 * and are completely trusted by firmware.  Abort probing
+	 * if that's not true for this function.
+	 */
+
+	if (was_attached != NULL)
+		*was_attached = MCDI_DWORD(outbuf, DRV_ATTACH_OUT_OLD_STATE);
+	return 0;
+
+fail:
+	netif_err(efx, probe, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address,
+			   u16 *fw_subtype_list, u32 *capabilities)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_BOARD_CFG_OUT_LENMAX);
+	size_t outlen, i;
+	int port_num = efx_port_num(efx);
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_GET_BOARD_CFG_IN_LEN != 0);
+	/* we need __aligned(2) for ether_addr_copy */
+	BUILD_BUG_ON(MC_CMD_GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT0_OFST & 1);
+	BUILD_BUG_ON(MC_CMD_GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT1_OFST & 1);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_BOARD_CFG, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+
+	if (outlen < MC_CMD_GET_BOARD_CFG_OUT_LENMIN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	if (mac_address)
+		ether_addr_copy(mac_address,
+				port_num ?
+				MCDI_PTR(outbuf, GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT1) :
+				MCDI_PTR(outbuf, GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT0));
+	if (fw_subtype_list) {
+		for (i = 0;
+		     i < MCDI_VAR_ARRAY_LEN(outlen,
+					    GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST);
+		     i++)
+			fw_subtype_list[i] = MCDI_ARRAY_WORD(
+				outbuf, GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST, i);
+		for (; i < MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM; i++)
+			fw_subtype_list[i] = 0;
+	}
+	if (capabilities) {
+		if (port_num)
+			*capabilities = MCDI_DWORD(outbuf,
+					GET_BOARD_CFG_OUT_CAPABILITIES_PORT1);
+		else
+			*capabilities = MCDI_DWORD(outbuf,
+					GET_BOARD_CFG_OUT_CAPABILITIES_PORT0);
+	}
+
+	return 0;
+
+fail:
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d len=%d\n",
+		  __func__, rc, (int)outlen);
+
+	return rc;
+}
+
+int efx_mcdi_log_ctrl(struct efx_nic *efx, bool evq, bool uart, u32 dest_evq)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_LOG_CTRL_IN_LEN);
+	u32 dest = 0;
+	int rc;
+
+	if (uart)
+		dest |= MC_CMD_LOG_CTRL_IN_LOG_DEST_UART;
+	if (evq)
+		dest |= MC_CMD_LOG_CTRL_IN_LOG_DEST_EVQ;
+
+	MCDI_SET_DWORD(inbuf, LOG_CTRL_IN_LOG_DEST, dest);
+	MCDI_SET_DWORD(inbuf, LOG_CTRL_IN_LOG_DEST_EVQ, dest_evq);
+
+	BUILD_BUG_ON(MC_CMD_LOG_CTRL_OUT_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_LOG_CTRL, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+	return rc;
+}
+
+int efx_mcdi_nvram_types(struct efx_nic *efx, u32 *nvram_types_out)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_NVRAM_TYPES_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_NVRAM_TYPES_IN_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_TYPES, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+	if (outlen < MC_CMD_NVRAM_TYPES_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	*nvram_types_out = MCDI_DWORD(outbuf, NVRAM_TYPES_OUT_TYPES);
+	return 0;
+
+fail:
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n",
+		  __func__, rc);
+	return rc;
+}
+
+/* This function finds types using the new NVRAM_PARTITIONS mcdi. */
+static int efx_new_mcdi_nvram_types(struct efx_nic *efx, u32 *number,
+				    u32 *nvram_types)
+{
+	efx_dword_t *outbuf = kzalloc(MC_CMD_NVRAM_PARTITIONS_OUT_LENMAX_MCDI2,
+				      GFP_KERNEL);
+	size_t outlen;
+	int rc;
+
+	if (!outbuf)
+		return -ENOMEM;
+
+	BUILD_BUG_ON(MC_CMD_NVRAM_PARTITIONS_IN_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_PARTITIONS, NULL, 0,
+			  outbuf, MC_CMD_NVRAM_PARTITIONS_OUT_LENMAX_MCDI2, &outlen);
+	if (rc)
+		goto fail;
+
+	*number = MCDI_DWORD(outbuf, NVRAM_PARTITIONS_OUT_NUM_PARTITIONS);
+
+	memcpy(nvram_types, MCDI_PTR(outbuf, NVRAM_PARTITIONS_OUT_TYPE_ID),
+	       *number * sizeof(u32));
+
+fail:
+	kfree(outbuf);
+	return rc;
+}
+
+int efx_mcdi_nvram_info(struct efx_nic *efx, unsigned int type,
+			size_t *size_out, size_t *erase_size_out,
+			bool *protected_out)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_INFO_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_NVRAM_INFO_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_INFO_IN_TYPE, type);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_INFO, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+	if (outlen < MC_CMD_NVRAM_INFO_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	*size_out = MCDI_DWORD(outbuf, NVRAM_INFO_OUT_SIZE);
+	*erase_size_out = MCDI_DWORD(outbuf, NVRAM_INFO_OUT_ERASESIZE);
+	*protected_out = !!(MCDI_DWORD(outbuf, NVRAM_INFO_OUT_FLAGS) &
+				(1 << MC_CMD_NVRAM_INFO_OUT_PROTECTED_LBN));
+	return 0;
+
+fail:
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+static int efx_mcdi_nvram_test(struct efx_nic *efx, unsigned int type)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_TEST_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_NVRAM_TEST_OUT_LEN);
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_TEST_IN_TYPE, type);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_TEST, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc)
+		return rc;
+
+	switch (MCDI_DWORD(outbuf, NVRAM_TEST_OUT_RESULT)) {
+	case MC_CMD_NVRAM_TEST_PASS:
+	case MC_CMD_NVRAM_TEST_NOTSUPP:
+		return 0;
+	default:
+		return -EIO;
+	}
+}
+
+/* This function tests nvram partitions using the new mcdi partition lookup scheme */
+int efx_new_mcdi_nvram_test_all(struct efx_nic *efx)
+{
+	u32 *nvram_types = kzalloc(MC_CMD_NVRAM_PARTITIONS_OUT_LENMAX_MCDI2,
+				   GFP_KERNEL);
+	unsigned int number;
+	int rc, i;
+
+	if (!nvram_types)
+		return -ENOMEM;
+
+	rc = efx_new_mcdi_nvram_types(efx, &number, nvram_types);
+	if (rc)
+		goto fail;
+
+	/* Require at least one check */
+	rc = -EAGAIN;
+
+	for (i = 0; i < number; i++) {
+		if (nvram_types[i] == NVRAM_PARTITION_TYPE_PARTITION_MAP ||
+		    nvram_types[i] == NVRAM_PARTITION_TYPE_DYNAMIC_CONFIG)
+			continue;
+
+		rc = efx_mcdi_nvram_test(efx, nvram_types[i]);
+		if (rc)
+			goto fail;
+	}
+
+fail:
+	kfree(nvram_types);
+	return rc;
+}
+
+int efx_mcdi_nvram_test_all(struct efx_nic *efx)
+{
+	u32 nvram_types;
+	unsigned int type;
+	int rc;
+
+	rc = efx_mcdi_nvram_types(efx, &nvram_types);
+	if (rc)
+		goto fail1;
+
+	type = 0;
+	while (nvram_types != 0) {
+		if (nvram_types & 1) {
+			rc = efx_mcdi_nvram_test(efx, type);
+			if (rc)
+				goto fail2;
+		}
+		type++;
+		nvram_types >>= 1;
+	}
+
+	return 0;
+
+fail2:
+	netif_err(efx, hw, efx->net_dev, "%s: failed type=%u\n",
+		  __func__, type);
+fail1:
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+/* Returns 1 if an assertion was read, 0 if no assertion had fired,
+ * negative on error.
+ */
+static int efx_mcdi_read_assertion(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_GET_ASSERTS_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_ASSERTS_OUT_LEN);
+	unsigned int flags, index;
+	const char *reason;
+	size_t outlen;
+	int retry;
+	int rc;
+
+	/* Attempt to read any stored assertion state before we reboot
+	 * the mcfw out of the assertion handler. Retry twice, once
+	 * because a boot-time assertion might cause this command to fail
+	 * with EINTR, and once again because GET_ASSERTS can race with
+	 * MC_CMD_REBOOT running on the other port. */
+	retry = 2;
+	do {
+		MCDI_SET_DWORD(inbuf, GET_ASSERTS_IN_CLEAR, 1);
+		rc = efx_mcdi_rpc_quiet(efx, MC_CMD_GET_ASSERTS,
+					inbuf, MC_CMD_GET_ASSERTS_IN_LEN,
+					outbuf, sizeof(outbuf), &outlen);
+		if (rc == -EPERM)
+			return 0;
+	} while ((rc == -EINTR || rc == -EIO) && retry-- > 0);
+
+	if (rc) {
+		efx_mcdi_display_error(efx, MC_CMD_GET_ASSERTS,
+				       MC_CMD_GET_ASSERTS_IN_LEN, outbuf,
+				       outlen, rc);
+		return rc;
+	}
+	if (outlen < MC_CMD_GET_ASSERTS_OUT_LEN)
+		return -EIO;
+
+	/* Print out any recorded assertion state */
+	flags = MCDI_DWORD(outbuf, GET_ASSERTS_OUT_GLOBAL_FLAGS);
+	if (flags == MC_CMD_GET_ASSERTS_FLAGS_NO_FAILS)
+		return 0;
+
+	reason = (flags == MC_CMD_GET_ASSERTS_FLAGS_SYS_FAIL)
+		? "system-level assertion"
+		: (flags == MC_CMD_GET_ASSERTS_FLAGS_THR_FAIL)
+		? "thread-level assertion"
+		: (flags == MC_CMD_GET_ASSERTS_FLAGS_WDOG_FIRED)
+		? "watchdog reset"
+		: "unknown assertion";
+	netif_err(efx, hw, efx->net_dev,
+		  "MCPU %s at PC = 0x%.8x in thread 0x%.8x\n", reason,
+		  MCDI_DWORD(outbuf, GET_ASSERTS_OUT_SAVED_PC_OFFS),
+		  MCDI_DWORD(outbuf, GET_ASSERTS_OUT_THREAD_OFFS));
+
+	/* Print out the registers */
+	for (index = 0;
+	     index < MC_CMD_GET_ASSERTS_OUT_GP_REGS_OFFS_NUM;
+	     index++)
+		netif_err(efx, hw, efx->net_dev, "R%.2d (?): 0x%.8x\n",
+			  1 + index,
+			  MCDI_ARRAY_DWORD(outbuf, GET_ASSERTS_OUT_GP_REGS_OFFS,
+					   index));
+
+	return 1;
+}
+
+static int efx_mcdi_exit_assertion(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_REBOOT_IN_LEN);
+	int rc;
+
+	/* If the MC is running debug firmware, it might now be
+	 * waiting for a debugger to attach, but we just want it to
+	 * reboot.  We set a flag that makes the command a no-op if it
+	 * has already done so.
+	 * The MCDI will thus return either 0 or -EIO.
+	 */
+	BUILD_BUG_ON(MC_CMD_REBOOT_OUT_LEN != 0);
+	MCDI_SET_DWORD(inbuf, REBOOT_IN_FLAGS,
+		       MC_CMD_REBOOT_FLAGS_AFTER_ASSERTION);
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_REBOOT, inbuf, MC_CMD_REBOOT_IN_LEN,
+				NULL, 0, NULL);
+	if (rc == -EIO)
+		rc = 0;
+	if (rc)
+		efx_mcdi_display_error(efx, MC_CMD_REBOOT, MC_CMD_REBOOT_IN_LEN,
+				       NULL, 0, rc);
+	return rc;
+}
+
+int efx_mcdi_handle_assertion(struct efx_nic *efx)
+{
+	int rc;
+
+	rc = efx_mcdi_read_assertion(efx);
+	if (rc <= 0)
+		return rc;
+
+	return efx_mcdi_exit_assertion(efx);
+}
+
+int efx_mcdi_set_id_led(struct efx_nic *efx, enum efx_led_mode mode)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_SET_ID_LED_IN_LEN);
+
+	BUILD_BUG_ON(EFX_LED_OFF != MC_CMD_LED_OFF);
+	BUILD_BUG_ON(EFX_LED_ON != MC_CMD_LED_ON);
+	BUILD_BUG_ON(EFX_LED_DEFAULT != MC_CMD_LED_DEFAULT);
+
+	BUILD_BUG_ON(MC_CMD_SET_ID_LED_OUT_LEN != 0);
+
+	MCDI_SET_DWORD(inbuf, SET_ID_LED_IN_STATE, mode);
+
+	return efx_mcdi_rpc(efx, MC_CMD_SET_ID_LED, inbuf, sizeof(inbuf), NULL, 0, NULL);
+}
+
+static int efx_mcdi_reset_func(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_ENTITY_RESET_IN_LEN);
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_ENTITY_RESET_OUT_LEN != 0);
+	MCDI_POPULATE_DWORD_1(inbuf, ENTITY_RESET_IN_FLAG,
+			      ENTITY_RESET_IN_FUNCTION_RESOURCE_RESET, 1);
+	rc = efx_mcdi_rpc(efx, MC_CMD_ENTITY_RESET, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+	return rc;
+}
+
+static int efx_mcdi_reset_mc(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_REBOOT_IN_LEN);
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_REBOOT_OUT_LEN != 0);
+	MCDI_SET_DWORD(inbuf, REBOOT_IN_FLAGS, 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_REBOOT, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+	/* A successful reboot kills the MCDI response, so -EIO means success */
+	if (rc == -EIO)
+		return 0;
+	if (rc == 0)
+		rc = -EIO;
+	return rc;
+}
+
+enum reset_type efx_mcdi_map_reset_reason(enum reset_type reason)
+{
+	return RESET_TYPE_RECOVER_OR_ALL;
+}
+
+int efx_mcdi_reset(struct efx_nic *efx, enum reset_type method)
+{
+	int rc;
+
+	/* If MCDI is down, we can't handle_assertion */
+	if (method == RESET_TYPE_MCDI_TIMEOUT) {
+		rc = pci_reset_function(efx->pci_dev);
+		if (rc)
+			return rc;
+		/* Re-enable polled MCDI completion */
+		if (efx->mcdi) {
+			struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+			mcdi->mode = MCDI_MODE_POLL;
+		}
+		return 0;
+	}
+
+	/* Recover from a failed assertion pre-reset */
+	rc = efx_mcdi_handle_assertion(efx);
+	if (rc)
+		return rc;
+
+	if (method == RESET_TYPE_DATAPATH)
+		return 0;
+	else if (method == RESET_TYPE_WORLD)
+		return efx_mcdi_reset_mc(efx);
+	else
+		return efx_mcdi_reset_func(efx);
+}
+
+static int efx_mcdi_wol_filter_set(struct efx_nic *efx, u32 type,
+				   const u8 *mac, int *id_out)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_WOL_FILTER_SET_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_WOL_FILTER_SET_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, WOL_FILTER_SET_IN_WOL_TYPE, type);
+	MCDI_SET_DWORD(inbuf, WOL_FILTER_SET_IN_FILTER_MODE,
+		       MC_CMD_FILTER_MODE_SIMPLE);
+	ether_addr_copy(MCDI_PTR(inbuf, WOL_FILTER_SET_IN_MAGIC_MAC), mac);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_WOL_FILTER_SET, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+
+	if (outlen < MC_CMD_WOL_FILTER_SET_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	*id_out = (int)MCDI_DWORD(outbuf, WOL_FILTER_SET_OUT_FILTER_ID);
+
+	return 0;
+
+fail:
+	*id_out = -1;
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+
+}
+
+
+int
+efx_mcdi_wol_filter_set_magic(struct efx_nic *efx,  const u8 *mac, int *id_out)
+{
+	return efx_mcdi_wol_filter_set(efx, MC_CMD_WOL_TYPE_MAGIC, mac, id_out);
+}
+
+
+int efx_mcdi_wol_filter_get_magic(struct efx_nic *efx, int *id_out)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_WOL_FILTER_GET_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_WOL_FILTER_GET, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+
+	if (outlen < MC_CMD_WOL_FILTER_GET_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	*id_out = (int)MCDI_DWORD(outbuf, WOL_FILTER_GET_OUT_FILTER_ID);
+
+	return 0;
+
+fail:
+	*id_out = -1;
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+
+int efx_mcdi_wol_filter_remove(struct efx_nic *efx, int id)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_WOL_FILTER_REMOVE_IN_LEN);
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, WOL_FILTER_REMOVE_IN_FILTER_ID, (u32)id);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_WOL_FILTER_REMOVE, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+	return rc;
+}
+
+int efx_mcdi_flush_rxqs(struct efx_nic *efx)
+{
+	struct efx_channel *channel;
+	struct efx_rx_queue *rx_queue;
+	MCDI_DECLARE_BUF(inbuf,
+			 MC_CMD_FLUSH_RX_QUEUES_IN_LEN(EFX_MAX_CHANNELS));
+	int rc, count;
+
+	BUILD_BUG_ON(EFX_MAX_CHANNELS >
+		     MC_CMD_FLUSH_RX_QUEUES_IN_QID_OFST_MAXNUM);
+
+	count = 0;
+	efx_for_each_channel(channel, efx) {
+		efx_for_each_channel_rx_queue(rx_queue, channel) {
+			if (rx_queue->flush_pending) {
+				rx_queue->flush_pending = false;
+				atomic_dec(&efx->rxq_flush_pending);
+				MCDI_SET_ARRAY_DWORD(
+					inbuf, FLUSH_RX_QUEUES_IN_QID_OFST,
+					count, efx_rx_queue_index(rx_queue));
+				count++;
+			}
+		}
+	}
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_FLUSH_RX_QUEUES, inbuf,
+			  MC_CMD_FLUSH_RX_QUEUES_IN_LEN(count), NULL, 0, NULL);
+	WARN_ON(rc < 0);
+
+	return rc;
+}
+
+int efx_mcdi_wol_filter_reset(struct efx_nic *efx)
+{
+	int rc;
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_WOL_FILTER_RESET, NULL, 0, NULL, 0, NULL);
+	return rc;
+}
+
+int efx_mcdi_set_workaround(struct efx_nic *efx, u32 type, bool enabled,
+			    unsigned int *flags)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_WORKAROUND_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_WORKAROUND_EXT_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_WORKAROUND_OUT_LEN != 0);
+	MCDI_SET_DWORD(inbuf, WORKAROUND_IN_TYPE, type);
+	MCDI_SET_DWORD(inbuf, WORKAROUND_IN_ENABLED, enabled);
+	rc = efx_mcdi_rpc(efx, MC_CMD_WORKAROUND, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		return rc;
+
+	if (!flags)
+		return 0;
+
+	if (outlen >= MC_CMD_WORKAROUND_EXT_OUT_LEN)
+		*flags = MCDI_DWORD(outbuf, WORKAROUND_EXT_OUT_FLAGS);
+	else
+		*flags = 0;
+
+	return 0;
+}
+
+int efx_mcdi_get_workarounds(struct efx_nic *efx, unsigned int *impl_out,
+			     unsigned int *enabled_out)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_WORKAROUNDS_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_WORKAROUNDS, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+
+	if (outlen < MC_CMD_GET_WORKAROUNDS_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	if (impl_out)
+		*impl_out = MCDI_DWORD(outbuf, GET_WORKAROUNDS_OUT_IMPLEMENTED);
+
+	if (enabled_out)
+		*enabled_out = MCDI_DWORD(outbuf, GET_WORKAROUNDS_OUT_ENABLED);
+
+	return 0;
+
+fail:
+	/* Older firmware lacks GET_WORKAROUNDS and this isn't especially
+	 * terrifying.  The call site will have to deal with it though.
+	 */
+	netif_cond_dbg(efx, hw, efx->net_dev, rc == -ENOSYS, err,
+		       "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+#ifdef CONFIG_SFC_MTD
+
+#define EFX_MCDI_NVRAM_LEN_MAX 128
+
+static int efx_mcdi_nvram_update_start(struct efx_nic *efx, unsigned int type)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_UPDATE_START_V2_IN_LEN);
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_UPDATE_START_IN_TYPE, type);
+	MCDI_POPULATE_DWORD_1(inbuf, NVRAM_UPDATE_START_V2_IN_FLAGS,
+			      NVRAM_UPDATE_START_V2_IN_FLAG_REPORT_VERIFY_RESULT,
+			      1);
+
+	BUILD_BUG_ON(MC_CMD_NVRAM_UPDATE_START_OUT_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_UPDATE_START, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+
+	return rc;
+}
+
+static int efx_mcdi_nvram_read(struct efx_nic *efx, unsigned int type,
+			       loff_t offset, u8 *buffer, size_t length)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_READ_IN_V2_LEN);
+	MCDI_DECLARE_BUF(outbuf,
+			 MC_CMD_NVRAM_READ_OUT_LEN(EFX_MCDI_NVRAM_LEN_MAX));
+	size_t outlen;
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_READ_IN_TYPE, type);
+	MCDI_SET_DWORD(inbuf, NVRAM_READ_IN_OFFSET, offset);
+	MCDI_SET_DWORD(inbuf, NVRAM_READ_IN_LENGTH, length);
+	MCDI_SET_DWORD(inbuf, NVRAM_READ_IN_V2_MODE,
+		       MC_CMD_NVRAM_READ_IN_V2_DEFAULT);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_READ, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		return rc;
+
+	memcpy(buffer, MCDI_PTR(outbuf, NVRAM_READ_OUT_READ_BUFFER), length);
+	return 0;
+}
+
+static int efx_mcdi_nvram_write(struct efx_nic *efx, unsigned int type,
+				loff_t offset, const u8 *buffer, size_t length)
+{
+	MCDI_DECLARE_BUF(inbuf,
+			 MC_CMD_NVRAM_WRITE_IN_LEN(EFX_MCDI_NVRAM_LEN_MAX));
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_WRITE_IN_TYPE, type);
+	MCDI_SET_DWORD(inbuf, NVRAM_WRITE_IN_OFFSET, offset);
+	MCDI_SET_DWORD(inbuf, NVRAM_WRITE_IN_LENGTH, length);
+	memcpy(MCDI_PTR(inbuf, NVRAM_WRITE_IN_WRITE_BUFFER), buffer, length);
+
+	BUILD_BUG_ON(MC_CMD_NVRAM_WRITE_OUT_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_WRITE, inbuf,
+			  ALIGN(MC_CMD_NVRAM_WRITE_IN_LEN(length), 4),
+			  NULL, 0, NULL);
+	return rc;
+}
+
+static int efx_mcdi_nvram_erase(struct efx_nic *efx, unsigned int type,
+				loff_t offset, size_t length)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_ERASE_IN_LEN);
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_ERASE_IN_TYPE, type);
+	MCDI_SET_DWORD(inbuf, NVRAM_ERASE_IN_OFFSET, offset);
+	MCDI_SET_DWORD(inbuf, NVRAM_ERASE_IN_LENGTH, length);
+
+	BUILD_BUG_ON(MC_CMD_NVRAM_ERASE_OUT_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_ERASE, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+	return rc;
+}
+
+static int efx_mcdi_nvram_update_finish(struct efx_nic *efx, unsigned int type)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_UPDATE_FINISH_V2_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_NVRAM_UPDATE_FINISH_V2_OUT_LEN);
+	size_t outlen;
+	int rc, rc2;
+
+	MCDI_SET_DWORD(inbuf, NVRAM_UPDATE_FINISH_IN_TYPE, type);
+	/* Always set this flag. Old firmware ignores it */
+	MCDI_POPULATE_DWORD_1(inbuf, NVRAM_UPDATE_FINISH_V2_IN_FLAGS,
+			      NVRAM_UPDATE_FINISH_V2_IN_FLAG_REPORT_VERIFY_RESULT,
+			      1);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_UPDATE_FINISH, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (!rc && outlen >= MC_CMD_NVRAM_UPDATE_FINISH_V2_OUT_LEN) {
+		rc2 = MCDI_DWORD(outbuf, NVRAM_UPDATE_FINISH_V2_OUT_RESULT_CODE);
+		if (rc2 != MC_CMD_NVRAM_VERIFY_RC_SUCCESS)
+			netif_err(efx, drv, efx->net_dev,
+				  "NVRAM update failed verification with code 0x%x\n",
+				  rc2);
+		switch (rc2) {
+		case MC_CMD_NVRAM_VERIFY_RC_SUCCESS:
+			break;
+		case MC_CMD_NVRAM_VERIFY_RC_CMS_CHECK_FAILED:
+		case MC_CMD_NVRAM_VERIFY_RC_MESSAGE_DIGEST_CHECK_FAILED:
+		case MC_CMD_NVRAM_VERIFY_RC_SIGNATURE_CHECK_FAILED:
+		case MC_CMD_NVRAM_VERIFY_RC_TRUSTED_APPROVERS_CHECK_FAILED:
+		case MC_CMD_NVRAM_VERIFY_RC_SIGNATURE_CHAIN_CHECK_FAILED:
+			rc = -EIO;
+			break;
+		case MC_CMD_NVRAM_VERIFY_RC_INVALID_CMS_FORMAT:
+		case MC_CMD_NVRAM_VERIFY_RC_BAD_MESSAGE_DIGEST:
+			rc = -EINVAL;
+			break;
+		case MC_CMD_NVRAM_VERIFY_RC_NO_VALID_SIGNATURES:
+		case MC_CMD_NVRAM_VERIFY_RC_NO_TRUSTED_APPROVERS:
+		case MC_CMD_NVRAM_VERIFY_RC_NO_SIGNATURE_MATCH:
+			rc = -EPERM;
+			break;
+		default:
+			netif_err(efx, drv, efx->net_dev,
+				  "Unknown response to NVRAM_UPDATE_FINISH\n");
+			rc = -EIO;
+		}
+	}
+
+	return rc;
+}
+
+int efx_mcdi_mtd_read(struct mtd_info *mtd, loff_t start,
+		      size_t len, size_t *retlen, u8 *buffer)
+{
+	struct efx_mcdi_mtd_partition *part = to_efx_mcdi_mtd_partition(mtd);
+	struct efx_nic *efx = mtd->priv;
+	loff_t offset = start;
+	loff_t end = min_t(loff_t, start + len, mtd->size);
+	size_t chunk;
+	int rc = 0;
+
+	while (offset < end) {
+		chunk = min_t(size_t, end - offset, EFX_MCDI_NVRAM_LEN_MAX);
+		rc = efx_mcdi_nvram_read(efx, part->nvram_type, offset,
+					 buffer, chunk);
+		if (rc)
+			goto out;
+		offset += chunk;
+		buffer += chunk;
+	}
+out:
+	*retlen = offset - start;
+	return rc;
+}
+
+int efx_mcdi_mtd_erase(struct mtd_info *mtd, loff_t start, size_t len)
+{
+	struct efx_mcdi_mtd_partition *part = to_efx_mcdi_mtd_partition(mtd);
+	struct efx_nic *efx = mtd->priv;
+	loff_t offset = start & ~((loff_t)(mtd->erasesize - 1));
+	loff_t end = min_t(loff_t, start + len, mtd->size);
+	size_t chunk = part->common.mtd.erasesize;
+	int rc = 0;
+
+	if (!part->updating) {
+		rc = efx_mcdi_nvram_update_start(efx, part->nvram_type);
+		if (rc)
+			goto out;
+		part->updating = true;
+	}
+
+	/* The MCDI interface can erase multiple blocks at once, but erasing
+	 * may be slow, so make one call per erase block here to avoid
+	 * tripping the MCDI RPC timeout. */
+	while (offset < end) {
+		rc = efx_mcdi_nvram_erase(efx, part->nvram_type, offset,
+					  chunk);
+		if (rc)
+			goto out;
+		offset += chunk;
+	}
+out:
+	return rc;
+}
+
+int efx_mcdi_mtd_write(struct mtd_info *mtd, loff_t start,
+		       size_t len, size_t *retlen, const u8 *buffer)
+{
+	struct efx_mcdi_mtd_partition *part = to_efx_mcdi_mtd_partition(mtd);
+	struct efx_nic *efx = mtd->priv;
+	loff_t offset = start;
+	loff_t end = min_t(loff_t, start + len, mtd->size);
+	size_t chunk;
+	int rc = 0;
+
+	if (!part->updating) {
+		rc = efx_mcdi_nvram_update_start(efx, part->nvram_type);
+		if (rc)
+			goto out;
+		part->updating = true;
+	}
+
+	while (offset < end) {
+		chunk = min_t(size_t, end - offset, EFX_MCDI_NVRAM_LEN_MAX);
+		rc = efx_mcdi_nvram_write(efx, part->nvram_type, offset,
+					  buffer, chunk);
+		if (rc)
+			goto out;
+		offset += chunk;
+		buffer += chunk;
+	}
+out:
+	*retlen = offset - start;
+	return rc;
+}
+
+int efx_mcdi_mtd_sync(struct mtd_info *mtd)
+{
+	struct efx_mcdi_mtd_partition *part = to_efx_mcdi_mtd_partition(mtd);
+	struct efx_nic *efx = mtd->priv;
+	int rc = 0;
+
+	if (part->updating) {
+		part->updating = false;
+		rc = efx_mcdi_nvram_update_finish(efx, part->nvram_type);
+	}
+
+	return rc;
+}
+
+void efx_mcdi_mtd_rename(struct efx_mtd_partition *part)
+{
+	struct efx_mcdi_mtd_partition *mcdi_part =
+		container_of(part, struct efx_mcdi_mtd_partition, common);
+	struct efx_nic *efx = part->mtd.priv;
+
+	snprintf(part->name, sizeof(part->name), "%s %s:%02x",
+		 efx->name, part->type_name, mcdi_part->fw_subtype);
+}
+
+#endif /* CONFIG_SFC_MTD */
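[ Usage note: the workaround helpers above (efx_mcdi_set_workaround() and
  efx_mcdi_get_workarounds()) report an "implemented" and an "enabled" mask.
  The sketch below is illustrative only; the function name is invented and
  MC_CMD_GET_WORKAROUNDS_OUT_BUG35388 is merely assumed as an example bit
  from mcdi_pcol.h. Real callers live in the per-NIC code. ]

	static bool example_workaround_usable(struct efx_nic *efx)
	{
		unsigned int implemented, enabled;

		/* Both masks come back from MC_CMD_GET_WORKAROUNDS */
		if (efx_mcdi_get_workarounds(efx, &implemented, &enabled))
			return false;

		/* A workaround can be relied on only if the firmware both
		 * implements it and currently has it enabled.
		 */
		return (implemented & MC_CMD_GET_WORKAROUNDS_OUT_BUG35388) &&
		       (enabled & MC_CMD_GET_WORKAROUNDS_OUT_BUG35388);
	}

[ efx_mcdi_set_workaround() flips a single workaround on or off and, when the
  extended response is available, reports the EXT_OUT flags word. ]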
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi.h
@@ -0,0 +1,388 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2008-2013 Solarflare Communications Inc.
+ */
+
+#ifndef EFX_MCDI_H
+#define EFX_MCDI_H
+
+/**
+ * enum efx_mcdi_state - MCDI request handling state
+ * @MCDI_STATE_QUIESCENT: No pending MCDI requests. If the caller holds the
+ *	mcdi @iface_lock then they are able to move to %MCDI_STATE_RUNNING
+ * @MCDI_STATE_RUNNING_SYNC: There is a synchronous MCDI request pending.
+ *	Only the thread that moved into this state is allowed to move out of it.
+ * @MCDI_STATE_RUNNING_ASYNC: There is an asynchronous MCDI request pending.
+ * @MCDI_STATE_PROXY_WAIT: An MCDI request has completed with a response that
+ *	indicates we must wait for a proxy try again message.
+ * @MCDI_STATE_COMPLETED: An MCDI request has completed, but the owning thread
+ *	has not yet consumed the result. For all other threads, equivalent to
+ *	%MCDI_STATE_RUNNING.
+ */
+enum efx_mcdi_state {
+	MCDI_STATE_QUIESCENT,
+	MCDI_STATE_RUNNING_SYNC,
+	MCDI_STATE_RUNNING_ASYNC,
+	MCDI_STATE_PROXY_WAIT,
+	MCDI_STATE_COMPLETED,
+};
+
+/**
+ * enum efx_mcdi_mode - MCDI transaction mode
+ * @MCDI_MODE_POLL: poll for MCDI completion, until timeout
+ * @MCDI_MODE_EVENTS: wait for an mcdi_event.  On timeout, poll once
+ * @MCDI_MODE_FAIL: we think MCDI is dead, so fail-fast all calls
+ */
+enum efx_mcdi_mode {
+	MCDI_MODE_POLL,
+	MCDI_MODE_EVENTS,
+	MCDI_MODE_FAIL,
+};
+
+/**
+ * struct efx_mcdi_iface - MCDI protocol context
+ * @efx: The associated NIC.
+ * @state: Request handling state. Waited for by @wq.
+ * @mode: Poll for mcdi completion, or wait for an mcdi_event.
+ * @wq: Wait queue for threads waiting for @state != %MCDI_STATE_RUNNING
+ * @new_epoch: Indicates start of day or start of MC reboot recovery
+ * @iface_lock: Serialises access to @seqno, @credits and response metadata
+ * @seqno: The next sequence number to use for mcdi requests.
+ * @credits: Number of spurious MCDI completion events allowed before we
+ *     trigger a fatal error
+ * @resprc: Response error/success code (Linux numbering)
+ * @resp_hdr_len: Response header length
+ * @resp_data_len: Response data (SDU or error) length
+ * @async_lock: Serialises access to @async_list while event processing is
+ *	enabled
+ * @async_list: Queue of asynchronous requests
+ * @async_timer: Timer for asynchronous request timeout
+ * @logging_buffer: buffer that may be used to build MCDI tracing messages
+ * @logging_enabled: whether to trace MCDI
+ * @proxy_rx_handle: Most recently received proxy authorisation handle
+ * @proxy_rx_status: Status of most recent proxy authorisation
+ * @proxy_rx_wq: Wait queue for updates to proxy_rx_handle
+ */
+struct efx_mcdi_iface {
+	struct efx_nic *efx;
+	enum efx_mcdi_state state;
+	enum efx_mcdi_mode mode;
+	wait_queue_head_t wq;
+	spinlock_t iface_lock;
+	bool new_epoch;
+	unsigned int credits;
+	unsigned int seqno;
+	int resprc;
+	int resprc_raw;
+	size_t resp_hdr_len;
+	size_t resp_data_len;
+	spinlock_t async_lock;
+	struct list_head async_list;
+	struct timer_list async_timer;
+#ifdef CONFIG_SFC_MCDI_LOGGING
+	char *logging_buffer;
+	bool logging_enabled;
+#endif
+	unsigned int proxy_rx_handle;
+	int proxy_rx_status;
+	wait_queue_head_t proxy_rx_wq;
+};
+
+struct efx_mcdi_mon {
+	struct efx_buffer dma_buf;
+	struct mutex update_lock;
+	unsigned long last_update;
+	struct device *device;
+	struct efx_mcdi_mon_attribute *attrs;
+	struct attribute_group group;
+	const struct attribute_group *groups[2];
+	unsigned int n_attrs;
+};
+
+struct efx_mcdi_mtd_partition {
+	struct efx_mtd_partition common;
+	bool updating;
+	u16 nvram_type;
+	u16 fw_subtype;
+};
+
+#define to_efx_mcdi_mtd_partition(mtd)				\
+	container_of(mtd, struct efx_mcdi_mtd_partition, common.mtd)
+
+/**
+ * struct efx_mcdi_data - extra state for NICs that implement MCDI
+ * @iface: Interface/protocol state
+ * @hwmon: Hardware monitor state
+ * @fn_flags: Flags for this function, as returned by %MC_CMD_DRV_ATTACH.
+ */
+struct efx_mcdi_data {
+	struct efx_mcdi_iface iface;
+#ifdef CONFIG_SFC_MCDI_MON
+	struct efx_mcdi_mon hwmon;
+#endif
+	u32 fn_flags;
+};
+
+static inline struct efx_mcdi_iface *efx_mcdi(struct efx_nic *efx)
+{
+	EFX_WARN_ON_PARANOID(!efx->mcdi);
+	return &efx->mcdi->iface;
+}
+
+#ifdef CONFIG_SFC_MCDI_MON
+static inline struct efx_mcdi_mon *efx_mcdi_mon(struct efx_nic *efx)
+{
+	EFX_WARN_ON_PARANOID(!efx->mcdi);
+	return &efx->mcdi->hwmon;
+}
+#endif
+
+int efx_mcdi_init(struct efx_nic *efx);
+void efx_mcdi_detach(struct efx_nic *efx);
+void efx_mcdi_fini(struct efx_nic *efx);
+
+int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd, const efx_dword_t *inbuf,
+		 size_t inlen, efx_dword_t *outbuf, size_t outlen,
+		 size_t *outlen_actual);
+int efx_mcdi_rpc_quiet(struct efx_nic *efx, unsigned cmd,
+		       const efx_dword_t *inbuf, size_t inlen,
+		       efx_dword_t *outbuf, size_t outlen,
+		       size_t *outlen_actual);
+
+int efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd,
+		       const efx_dword_t *inbuf, size_t inlen);
+int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			efx_dword_t *outbuf, size_t outlen,
+			size_t *outlen_actual);
+int efx_mcdi_rpc_finish_quiet(struct efx_nic *efx, unsigned cmd,
+			      size_t inlen, efx_dword_t *outbuf,
+			      size_t outlen, size_t *outlen_actual);
+
+typedef void efx_mcdi_async_completer(struct efx_nic *efx,
+				      unsigned long cookie, int rc,
+				      efx_dword_t *outbuf,
+				      size_t outlen_actual);
+int efx_mcdi_rpc_async(struct efx_nic *efx, unsigned int cmd,
+		       const efx_dword_t *inbuf, size_t inlen, size_t outlen,
+		       efx_mcdi_async_completer *complete,
+		       unsigned long cookie);
+int efx_mcdi_rpc_async_quiet(struct efx_nic *efx, unsigned int cmd,
+			     const efx_dword_t *inbuf, size_t inlen,
+			     size_t outlen,
+			     efx_mcdi_async_completer *complete,
+			     unsigned long cookie);
+
+void efx_mcdi_display_error(struct efx_nic *efx, unsigned cmd,
+			    size_t inlen, efx_dword_t *outbuf,
+			    size_t outlen, int rc);
+
+int efx_mcdi_poll_reboot(struct efx_nic *efx);
+void efx_mcdi_mode_poll(struct efx_nic *efx);
+void efx_mcdi_mode_event(struct efx_nic *efx);
+void efx_mcdi_flush_async(struct efx_nic *efx);
+
+void efx_mcdi_process_event(struct efx_channel *channel, efx_qword_t *event);
+void efx_mcdi_sensor_event(struct efx_nic *efx, efx_qword_t *ev);
+
+/* We expect that 16- and 32-bit fields in MCDI requests and responses
+ * are appropriately aligned, but 64-bit fields are only
+ * 32-bit-aligned.  Also, on Siena we must copy to the MC shared
+ * memory strictly 32 bits at a time, so add any necessary padding.
+ */
+#define MCDI_TX_BUF_LEN(_len) DIV_ROUND_UP((_len), 4)
+#define _MCDI_DECLARE_BUF(_name, _len)					\
+	efx_dword_t _name[DIV_ROUND_UP(_len, 4)]
+#define MCDI_DECLARE_BUF(_name, _len)					\
+	_MCDI_DECLARE_BUF(_name, _len) = {{{0}}}
+#define MCDI_DECLARE_BUF_ERR(_name)					\
+	MCDI_DECLARE_BUF(_name, 8)
+#define _MCDI_PTR(_buf, _offset)					\
+	((u8 *)(_buf) + (_offset))
+#define MCDI_PTR(_buf, _field)						\
+	_MCDI_PTR(_buf, MC_CMD_ ## _field ## _OFST)
+#define _MCDI_CHECK_ALIGN(_ofst, _align)				\
+	((_ofst) + BUILD_BUG_ON_ZERO((_ofst) & (_align - 1)))
+#define _MCDI_DWORD(_buf, _field)					\
+	((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2))
+
+#define MCDI_BYTE(_buf, _field)						\
+	((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1),	\
+	 *MCDI_PTR(_buf, _field))
+#define MCDI_WORD(_buf, _field)						\
+	((u16)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2) +	\
+	 le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field)))
+#define MCDI_SET_DWORD(_buf, _field, _value)				\
+	EFX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), EFX_DWORD_0, _value)
+#define MCDI_DWORD(_buf, _field)					\
+	EFX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), EFX_DWORD_0)
+#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1)		\
+	EFX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1)
+#define MCDI_POPULATE_DWORD_2(_buf, _field, _name1, _value1,		\
+			      _name2, _value2)				\
+	EFX_POPULATE_DWORD_2(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1,		\
+			     MC_CMD_ ## _name2, _value2)
+#define MCDI_POPULATE_DWORD_3(_buf, _field, _name1, _value1,		\
+			      _name2, _value2, _name3, _value3)		\
+	EFX_POPULATE_DWORD_3(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1,		\
+			     MC_CMD_ ## _name2, _value2,		\
+			     MC_CMD_ ## _name3, _value3)
+#define MCDI_POPULATE_DWORD_4(_buf, _field, _name1, _value1,		\
+			      _name2, _value2, _name3, _value3,		\
+			      _name4, _value4)				\
+	EFX_POPULATE_DWORD_4(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1,		\
+			     MC_CMD_ ## _name2, _value2,		\
+			     MC_CMD_ ## _name3, _value3,		\
+			     MC_CMD_ ## _name4, _value4)
+#define MCDI_POPULATE_DWORD_5(_buf, _field, _name1, _value1,		\
+			      _name2, _value2, _name3, _value3,		\
+			      _name4, _value4, _name5, _value5)		\
+	EFX_POPULATE_DWORD_5(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1,		\
+			     MC_CMD_ ## _name2, _value2,		\
+			     MC_CMD_ ## _name3, _value3,		\
+			     MC_CMD_ ## _name4, _value4,		\
+			     MC_CMD_ ## _name5, _value5)
+#define MCDI_POPULATE_DWORD_6(_buf, _field, _name1, _value1,		\
+			      _name2, _value2, _name3, _value3,		\
+			      _name4, _value4, _name5, _value5,		\
+			      _name6, _value6)				\
+	EFX_POPULATE_DWORD_6(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1,		\
+			     MC_CMD_ ## _name2, _value2,		\
+			     MC_CMD_ ## _name3, _value3,		\
+			     MC_CMD_ ## _name4, _value4,		\
+			     MC_CMD_ ## _name5, _value5,		\
+			     MC_CMD_ ## _name6, _value6)
+#define MCDI_POPULATE_DWORD_7(_buf, _field, _name1, _value1,		\
+			      _name2, _value2, _name3, _value3,		\
+			      _name4, _value4, _name5, _value5,		\
+			      _name6, _value6, _name7, _value7)		\
+	EFX_POPULATE_DWORD_7(*_MCDI_DWORD(_buf, _field),		\
+			     MC_CMD_ ## _name1, _value1,		\
+			     MC_CMD_ ## _name2, _value2,		\
+			     MC_CMD_ ## _name3, _value3,		\
+			     MC_CMD_ ## _name4, _value4,		\
+			     MC_CMD_ ## _name5, _value5,		\
+			     MC_CMD_ ## _name6, _value6,		\
+			     MC_CMD_ ## _name7, _value7)
+#define MCDI_SET_QWORD(_buf, _field, _value)				\
+	do {								\
+		EFX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0],	\
+				     EFX_DWORD_0, (u32)(_value));	\
+		EFX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1],	\
+				     EFX_DWORD_0, (u64)(_value) >> 32);	\
+	} while (0)
+#define MCDI_QWORD(_buf, _field)					\
+	(EFX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], EFX_DWORD_0) |	\
+	(u64)EFX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], EFX_DWORD_0) << 32)
+#define MCDI_FIELD(_ptr, _type, _field)					\
+	EFX_EXTRACT_DWORD(						\
+		*(efx_dword_t *)					\
+		_MCDI_PTR(_ptr, MC_CMD_ ## _type ## _ ## _field ## _OFST & ~3),\
+		MC_CMD_ ## _type ## _ ## _field ## _LBN & 0x1f,	\
+		(MC_CMD_ ## _type ## _ ## _field ## _LBN & 0x1f) +	\
+		MC_CMD_ ## _type ## _ ## _field ## _WIDTH - 1)
+
+#define _MCDI_ARRAY_PTR(_buf, _field, _index, _align)			\
+	(_MCDI_PTR(_buf, _MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, _align))\
+	 + (_index) * _MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _LEN, _align))
+#define MCDI_DECLARE_STRUCT_PTR(_name)					\
+	efx_dword_t *_name
+#define MCDI_ARRAY_STRUCT_PTR(_buf, _field, _index)			\
+	((efx_dword_t *)_MCDI_ARRAY_PTR(_buf, _field, _index, 4))
+#define MCDI_VAR_ARRAY_LEN(_len, _field)				\
+	min_t(size_t, MC_CMD_ ## _field ## _MAXNUM,			\
+	      ((_len) - MC_CMD_ ## _field ## _OFST) / MC_CMD_ ## _field ## _LEN)
+#define MCDI_ARRAY_WORD(_buf, _field, _index)				\
+	(BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2) +		\
+	 le16_to_cpu(*(__force const __le16 *)				\
+		     _MCDI_ARRAY_PTR(_buf, _field, _index, 2)))
+#define _MCDI_ARRAY_DWORD(_buf, _field, _index)				\
+	(BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 4) +		\
+	 (efx_dword_t *)_MCDI_ARRAY_PTR(_buf, _field, _index, 4))
+#define MCDI_SET_ARRAY_DWORD(_buf, _field, _index, _value)		\
+	EFX_SET_DWORD_FIELD(*_MCDI_ARRAY_DWORD(_buf, _field, _index),	\
+			    EFX_DWORD_0, _value)
+#define MCDI_ARRAY_DWORD(_buf, _field, _index)				\
+	EFX_DWORD_FIELD(*_MCDI_ARRAY_DWORD(_buf, _field, _index), EFX_DWORD_0)
+#define _MCDI_ARRAY_QWORD(_buf, _field, _index)				\
+	(BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 8) +		\
+	 (efx_dword_t *)_MCDI_ARRAY_PTR(_buf, _field, _index, 4))
+#define MCDI_SET_ARRAY_QWORD(_buf, _field, _index, _value)		\
+	do {								\
+		EFX_SET_DWORD_FIELD(_MCDI_ARRAY_QWORD(_buf, _field, _index)[0],\
+				    EFX_DWORD_0, (u32)(_value));	\
+		EFX_SET_DWORD_FIELD(_MCDI_ARRAY_QWORD(_buf, _field, _index)[1],\
+				    EFX_DWORD_0, (u64)(_value) >> 32);	\
+	} while (0)
+#define MCDI_ARRAY_FIELD(_buf, _field1, _type, _index, _field2)		\
+	MCDI_FIELD(MCDI_ARRAY_STRUCT_PTR(_buf, _field1, _index),	\
+		   _type ## _TYPEDEF, _field2)
+
+#define MCDI_EVENT_FIELD(_ev, _field)			\
+	EFX_QWORD_FIELD(_ev, MCDI_EVENT_ ## _field)
+
+#define MCDI_CAPABILITY(field)						\
+	MC_CMD_GET_CAPABILITIES_V8_OUT_ ## field ## _LBN
+
+#define MCDI_CAPABILITY_OFST(field) \
+	MC_CMD_GET_CAPABILITIES_V8_OUT_ ## field ## _OFST
+
+#define efx_has_cap(efx, field) \
+	efx->type->check_caps(efx, \
+			      MCDI_CAPABILITY(field), \
+			      MCDI_CAPABILITY_OFST(field))
+
+void efx_mcdi_print_fwver(struct efx_nic *efx, char *buf, size_t len);
+int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address,
+			   u16 *fw_subtype_list, u32 *capabilities);
+int efx_mcdi_log_ctrl(struct efx_nic *efx, bool evq, bool uart, u32 dest_evq);
+int efx_mcdi_nvram_types(struct efx_nic *efx, u32 *nvram_types_out);
+int efx_mcdi_nvram_info(struct efx_nic *efx, unsigned int type,
+			size_t *size_out, size_t *erase_size_out,
+			bool *protected_out);
+int efx_new_mcdi_nvram_test_all(struct efx_nic *efx);
+int efx_mcdi_nvram_test_all(struct efx_nic *efx);
+int efx_mcdi_handle_assertion(struct efx_nic *efx);
+int efx_mcdi_set_id_led(struct efx_nic *efx, enum efx_led_mode mode);
+int efx_mcdi_wol_filter_set_magic(struct efx_nic *efx, const u8 *mac,
+				  int *id_out);
+int efx_mcdi_wol_filter_get_magic(struct efx_nic *efx, int *id_out);
+int efx_mcdi_wol_filter_remove(struct efx_nic *efx, int id);
+int efx_mcdi_wol_filter_reset(struct efx_nic *efx);
+int efx_mcdi_flush_rxqs(struct efx_nic *efx);
+void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev);
+void efx_mcdi_mac_start_stats(struct efx_nic *efx);
+void efx_mcdi_mac_stop_stats(struct efx_nic *efx);
+void efx_mcdi_mac_pull_stats(struct efx_nic *efx);
+enum reset_type efx_mcdi_map_reset_reason(enum reset_type reason);
+int efx_mcdi_reset(struct efx_nic *efx, enum reset_type method);
+int efx_mcdi_set_workaround(struct efx_nic *efx, u32 type, bool enabled,
+			    unsigned int *flags);
+int efx_mcdi_get_workarounds(struct efx_nic *efx, unsigned int *impl_out,
+			     unsigned int *enabled_out);
+
+#ifdef CONFIG_SFC_MCDI_MON
+int efx_mcdi_mon_probe(struct efx_nic *efx);
+void efx_mcdi_mon_remove(struct efx_nic *efx);
+#else
+static inline int efx_mcdi_mon_probe(struct efx_nic *efx) { return 0; }
+static inline void efx_mcdi_mon_remove(struct efx_nic *efx) {}
+#endif
+
+#ifdef CONFIG_SFC_MTD
+int efx_mcdi_mtd_read(struct mtd_info *mtd, loff_t start, size_t len,
+		      size_t *retlen, u8 *buffer);
+int efx_mcdi_mtd_erase(struct mtd_info *mtd, loff_t start, size_t len);
+int efx_mcdi_mtd_write(struct mtd_info *mtd, loff_t start, size_t len,
+		       size_t *retlen, const u8 *buffer);
+int efx_mcdi_mtd_sync(struct mtd_info *mtd);
+void efx_mcdi_mtd_rename(struct efx_mtd_partition *part);
+#endif
+
+#endif /* EFX_MCDI_H */
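[ Usage note: the request/response macros above are composed into small
  command wrappers throughout the driver. The sketch below is illustrative
  only; the function name is invented and it mirrors efx_mcdi_nvram_test()
  in mcdi.c. The pattern is: declare the buffers, populate the request with
  MCDI_SET_DWORD() or MCDI_POPULATE_DWORD_<n>(), issue efx_mcdi_rpc(), then
  decode the response with MCDI_DWORD() and the related accessors. ]

	static int example_mcdi_nvram_test(struct efx_nic *efx,
					   unsigned int type)
	{
		MCDI_DECLARE_BUF(inbuf, MC_CMD_NVRAM_TEST_IN_LEN);
		MCDI_DECLARE_BUF(outbuf, MC_CMD_NVRAM_TEST_OUT_LEN);
		int rc;

		/* Fill in the request buffer field by field */
		MCDI_SET_DWORD(inbuf, NVRAM_TEST_IN_TYPE, type);

		/* Issue the command and wait for (or poll) the response */
		rc = efx_mcdi_rpc(efx, MC_CMD_NVRAM_TEST, inbuf, sizeof(inbuf),
				  outbuf, sizeof(outbuf), NULL);
		if (rc)
			return rc;

		/* Decode the response with the matching accessor macro
		 * (the real wrapper also treats NOTSUPP as a pass).
		 */
		return MCDI_DWORD(outbuf, NVRAM_TEST_OUT_RESULT) ==
		       MC_CMD_NVRAM_TEST_PASS ? 0 : -EIO;
	}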
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi_mon.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2011-2013 Solarflare Communications Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/hwmon.h>
+#include <linux/stat.h>
+
+#include "net_driver.h"
+#include "mcdi.h"
+#include "mcdi_pcol.h"
+#include "nic.h"
+
+enum efx_hwmon_type {
+	EFX_HWMON_UNKNOWN,
+	EFX_HWMON_TEMP,         /* temperature */
+	EFX_HWMON_COOL,         /* cooling device, probably a heatsink */
+	EFX_HWMON_IN,		/* voltage */
+	EFX_HWMON_CURR,		/* current */
+	EFX_HWMON_POWER,	/* power */
+	EFX_HWMON_TYPES_COUNT
+};
+
+static const char *const efx_hwmon_unit[EFX_HWMON_TYPES_COUNT] = {
+	[EFX_HWMON_TEMP]  = " degC",
+	[EFX_HWMON_COOL]  = " rpm", /* though nonsense for a heatsink */
+	[EFX_HWMON_IN]    = " mV",
+	[EFX_HWMON_CURR]  = " mA",
+	[EFX_HWMON_POWER] = " W",
+};
+
+static const struct {
+	const char *label;
+	enum efx_hwmon_type hwmon_type;
+	int port;
+} efx_mcdi_sensor_type[] = {
+#define SENSOR(name, label, hwmon_type, port)				\
+	[MC_CMD_SENSOR_##name] = { label, EFX_HWMON_ ## hwmon_type, port }
+	SENSOR(CONTROLLER_TEMP,		"Controller board temp.",   TEMP,  -1),
+	SENSOR(PHY_COMMON_TEMP,		"PHY temp.",		    TEMP,  -1),
+	SENSOR(CONTROLLER_COOLING,	"Controller heat sink",	    COOL,  -1),
+	SENSOR(PHY0_TEMP,		"PHY temp.",		    TEMP,  0),
+	SENSOR(PHY0_COOLING,		"PHY heat sink",	    COOL,  0),
+	SENSOR(PHY1_TEMP,		"PHY temp.",		    TEMP,  1),
+	SENSOR(PHY1_COOLING,		"PHY heat sink",	    COOL,  1),
+	SENSOR(IN_1V0,			"1.0V supply",		    IN,    -1),
+	SENSOR(IN_1V2,			"1.2V supply",		    IN,    -1),
+	SENSOR(IN_1V8,			"1.8V supply",		    IN,    -1),
+	SENSOR(IN_2V5,			"2.5V supply",		    IN,    -1),
+	SENSOR(IN_3V3,			"3.3V supply",		    IN,    -1),
+	SENSOR(IN_12V0,			"12.0V supply",		    IN,    -1),
+	SENSOR(IN_1V2A,			"1.2V analogue supply",	    IN,    -1),
+	SENSOR(IN_VREF,			"Ref. voltage",		    IN,    -1),
+	SENSOR(OUT_VAOE,		"AOE FPGA supply",	    IN,    -1),
+	SENSOR(AOE_TEMP,		"AOE FPGA temp.",	    TEMP,  -1),
+	SENSOR(PSU_AOE_TEMP,		"AOE regulator temp.",	    TEMP,  -1),
+	SENSOR(PSU_TEMP,		"Controller regulator temp.",
+								    TEMP,  -1),
+	SENSOR(FAN_0,			"Fan 0",		    COOL,  -1),
+	SENSOR(FAN_1,			"Fan 1",		    COOL,  -1),
+	SENSOR(FAN_2,			"Fan 2",		    COOL,  -1),
+	SENSOR(FAN_3,			"Fan 3",		    COOL,  -1),
+	SENSOR(FAN_4,			"Fan 4",		    COOL,  -1),
+	SENSOR(IN_VAOE,			"AOE input supply",	    IN,    -1),
+	SENSOR(OUT_IAOE,		"AOE output current",	    CURR,  -1),
+	SENSOR(IN_IAOE,			"AOE input current",	    CURR,  -1),
+	SENSOR(NIC_POWER,		"Board power use",	    POWER, -1),
+	SENSOR(IN_0V9,			"0.9V supply",		    IN,    -1),
+	SENSOR(IN_I0V9,			"0.9V supply current",	    CURR,  -1),
+	SENSOR(IN_I1V2,			"1.2V supply current",	    CURR,  -1),
+	SENSOR(IN_0V9_ADC,		"0.9V supply (ext. ADC)",   IN,    -1),
+	SENSOR(CONTROLLER_2_TEMP,	"Controller board temp. 2", TEMP,  -1),
+	SENSOR(VREG_INTERNAL_TEMP,	"Regulator die temp.",	    TEMP,  -1),
+	SENSOR(VREG_0V9_TEMP,		"0.9V regulator temp.",     TEMP,  -1),
+	SENSOR(VREG_1V2_TEMP,		"1.2V regulator temp.",     TEMP,  -1),
+	SENSOR(CONTROLLER_VPTAT,
+			      "Controller PTAT voltage (int. ADC)", IN,    -1),
+	SENSOR(CONTROLLER_INTERNAL_TEMP,
+				 "Controller die temp. (int. ADC)", TEMP,  -1),
+	SENSOR(CONTROLLER_VPTAT_EXTADC,
+			      "Controller PTAT voltage (ext. ADC)", IN,    -1),
+	SENSOR(CONTROLLER_INTERNAL_TEMP_EXTADC,
+				 "Controller die temp. (ext. ADC)", TEMP,  -1),
+	SENSOR(AMBIENT_TEMP,		"Ambient temp.",	    TEMP,  -1),
+	SENSOR(AIRFLOW,			"Air flow raw",		    IN,    -1),
+	SENSOR(VDD08D_VSS08D_CSR,	"0.9V die (int. ADC)",	    IN,    -1),
+	SENSOR(VDD08D_VSS08D_CSR_EXTADC, "0.9V die (ext. ADC)",	    IN,    -1),
+	SENSOR(HOTPOINT_TEMP,  "Controller board temp. (hotpoint)", TEMP,  -1),
+#undef SENSOR
+};
+
+static const char *const sensor_status_names[] = {
+	[MC_CMD_SENSOR_STATE_OK] = "OK",
+	[MC_CMD_SENSOR_STATE_WARNING] = "Warning",
+	[MC_CMD_SENSOR_STATE_FATAL] = "Fatal",
+	[MC_CMD_SENSOR_STATE_BROKEN] = "Device failure",
+	[MC_CMD_SENSOR_STATE_NO_READING] = "No reading",
+};
+
+void efx_mcdi_sensor_event(struct efx_nic *efx, efx_qword_t *ev)
+{
+	unsigned int type, state, value;
+	enum efx_hwmon_type hwmon_type = EFX_HWMON_UNKNOWN;
+	const char *name = NULL, *state_txt, *unit;
+
+	type = EFX_QWORD_FIELD(*ev, MCDI_EVENT_SENSOREVT_MONITOR);
+	state = EFX_QWORD_FIELD(*ev, MCDI_EVENT_SENSOREVT_STATE);
+	value = EFX_QWORD_FIELD(*ev, MCDI_EVENT_SENSOREVT_VALUE);
+
+	/* Deal gracefully with the board having more sensor types than we
+	 * know about, but do not expect new sensor states. */
+	if (type < ARRAY_SIZE(efx_mcdi_sensor_type)) {
+		name = efx_mcdi_sensor_type[type].label;
+		hwmon_type = efx_mcdi_sensor_type[type].hwmon_type;
+	}
+	if (!name)
+		name = "No sensor name available";
+	EFX_WARN_ON_PARANOID(state >= ARRAY_SIZE(sensor_status_names));
+	state_txt = sensor_status_names[state];
+	EFX_WARN_ON_PARANOID(hwmon_type >= EFX_HWMON_TYPES_COUNT);
+	unit = efx_hwmon_unit[hwmon_type];
+	if (!unit)
+		unit = "";
+
+	netif_err(efx, hw, efx->net_dev,
+		  "Sensor %d (%s) reports condition '%s' for value %d%s\n",
+		  type, name, state_txt, value, unit);
+}
+
+#ifdef CONFIG_SFC_MCDI_MON
+
+struct efx_mcdi_mon_attribute {
+	struct device_attribute dev_attr;
+	unsigned int index;
+	unsigned int type;
+	enum efx_hwmon_type hwmon_type;
+	unsigned int limit_value;
+	char name[12];
+};
+
+static int efx_mcdi_mon_update(struct efx_nic *efx)
+{
+	struct efx_mcdi_mon *hwmon = efx_mcdi_mon(efx);
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_READ_SENSORS_EXT_IN_LEN);
+	int rc;
+
+	MCDI_SET_QWORD(inbuf, READ_SENSORS_EXT_IN_DMA_ADDR,
+		       hwmon->dma_buf.dma_addr);
+	MCDI_SET_DWORD(inbuf, READ_SENSORS_EXT_IN_LENGTH, hwmon->dma_buf.len);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_READ_SENSORS,
+			  inbuf, sizeof(inbuf), NULL, 0, NULL);
+	if (rc == 0)
+		hwmon->last_update = jiffies;
+	return rc;
+}
+
+static int efx_mcdi_mon_get_entry(struct device *dev, unsigned int index,
+				  efx_dword_t *entry)
+{
+	struct efx_nic *efx = dev_get_drvdata(dev->parent);
+	struct efx_mcdi_mon *hwmon = efx_mcdi_mon(efx);
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_READ_SENSORS_OUT_LEN != 0);
+
+	mutex_lock(&hwmon->update_lock);
+
+	/* Use cached value if last update was < 1 s ago */
+	if (time_before(jiffies, hwmon->last_update + HZ))
+		rc = 0;
+	else
+		rc = efx_mcdi_mon_update(efx);
+
+	/* Copy out the requested entry */
+	*entry = ((efx_dword_t *)hwmon->dma_buf.addr)[index];
+
+	mutex_unlock(&hwmon->update_lock);
+
+	return rc;
+}
+
+static ssize_t efx_mcdi_mon_show_value(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct efx_mcdi_mon_attribute *mon_attr =
+		container_of(attr, struct efx_mcdi_mon_attribute, dev_attr);
+	efx_dword_t entry;
+	unsigned int value, state;
+	int rc;
+
+	rc = efx_mcdi_mon_get_entry(dev, mon_attr->index, &entry);
+	if (rc)
+		return rc;
+
+	state = EFX_DWORD_FIELD(entry, MC_CMD_SENSOR_VALUE_ENTRY_TYPEDEF_STATE);
+	if (state == MC_CMD_SENSOR_STATE_NO_READING)
+		return -EBUSY;
+
+	value = EFX_DWORD_FIELD(entry, MC_CMD_SENSOR_VALUE_ENTRY_TYPEDEF_VALUE);
+
+	switch (mon_attr->hwmon_type) {
+	case EFX_HWMON_TEMP:
+		/* Convert temperature from degrees to milli-degrees Celsius */
+		value *= 1000;
+		break;
+	case EFX_HWMON_POWER:
+		/* Convert power from watts to microwatts */
+		value *= 1000000;
+		break;
+	default:
+		/* No conversion needed */
+		break;
+	}
+
+	return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t efx_mcdi_mon_show_limit(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct efx_mcdi_mon_attribute *mon_attr =
+		container_of(attr, struct efx_mcdi_mon_attribute, dev_attr);
+	unsigned int value;
+
+	value = mon_attr->limit_value;
+
+	switch (mon_attr->hwmon_type) {
+	case EFX_HWMON_TEMP:
+		/* Convert temperature from degrees to milli-degrees Celsius */
+		value *= 1000;
+		break;
+	case EFX_HWMON_POWER:
+		/* Convert power from watts to microwatts */
+		value *= 1000000;
+		break;
+	default:
+		/* No conversion needed */
+		break;
+	}
+
+	return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t efx_mcdi_mon_show_alarm(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct efx_mcdi_mon_attribute *mon_attr =
+		container_of(attr, struct efx_mcdi_mon_attribute, dev_attr);
+	efx_dword_t entry;
+	int state;
+	int rc;
+
+	rc = efx_mcdi_mon_get_entry(dev, mon_attr->index, &entry);
+	if (rc)
+		return rc;
+
+	state = EFX_DWORD_FIELD(entry, MC_CMD_SENSOR_VALUE_ENTRY_TYPEDEF_STATE);
+	return sprintf(buf, "%d\n", state != MC_CMD_SENSOR_STATE_OK);
+}
+
+static ssize_t efx_mcdi_mon_show_label(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct efx_mcdi_mon_attribute *mon_attr =
+		container_of(attr, struct efx_mcdi_mon_attribute, dev_attr);
+	return sprintf(buf, "%s\n",
+		       efx_mcdi_sensor_type[mon_attr->type].label);
+}
+
+static void
+efx_mcdi_mon_add_attr(struct efx_nic *efx, const char *name,
+		      ssize_t (*reader)(struct device *,
+					struct device_attribute *, char *),
+		      unsigned int index, unsigned int type,
+		      unsigned int limit_value)
+{
+	struct efx_mcdi_mon *hwmon = efx_mcdi_mon(efx);
+	struct efx_mcdi_mon_attribute *attr = &hwmon->attrs[hwmon->n_attrs];
+
+	strlcpy(attr->name, name, sizeof(attr->name));
+	attr->index = index;
+	attr->type = type;
+	if (type < ARRAY_SIZE(efx_mcdi_sensor_type))
+		attr->hwmon_type = efx_mcdi_sensor_type[type].hwmon_type;
+	else
+		attr->hwmon_type = EFX_HWMON_UNKNOWN;
+	attr->limit_value = limit_value;
+	sysfs_attr_init(&attr->dev_attr.attr);
+	attr->dev_attr.attr.name = attr->name;
+	attr->dev_attr.attr.mode = 0444;
+	attr->dev_attr.show = reader;
+	hwmon->group.attrs[hwmon->n_attrs++] = &attr->dev_attr.attr;
+}
+
+int efx_mcdi_mon_probe(struct efx_nic *efx)
+{
+	unsigned int n_temp = 0, n_cool = 0, n_in = 0, n_curr = 0, n_power = 0;
+	struct efx_mcdi_mon *hwmon = efx_mcdi_mon(efx);
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_SENSOR_INFO_EXT_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_SENSOR_INFO_OUT_LENMAX);
+	unsigned int n_pages, n_sensors, n_attrs, page;
+	size_t outlen;
+	char name[12];
+	u32 mask;
+	int rc, i, j, type;
+
+	/* Find out how many sensors are present */
+	n_sensors = 0;
+	page = 0;
+	do {
+		MCDI_SET_DWORD(inbuf, SENSOR_INFO_EXT_IN_PAGE, page);
+
+		rc = efx_mcdi_rpc(efx, MC_CMD_SENSOR_INFO, inbuf, sizeof(inbuf),
+				  outbuf, sizeof(outbuf), &outlen);
+		if (rc)
+			return rc;
+		if (outlen < MC_CMD_SENSOR_INFO_OUT_LENMIN)
+			return -EIO;
+
+		mask = MCDI_DWORD(outbuf, SENSOR_INFO_OUT_MASK);
+		n_sensors += hweight32(mask & ~(1 << MC_CMD_SENSOR_PAGE0_NEXT));
+		++page;
+	} while (mask & (1 << MC_CMD_SENSOR_PAGE0_NEXT));
+	n_pages = page;
+
+	/* Don't create a device if there are none */
+	if (n_sensors == 0)
+		return 0;
+
+	rc = efx_nic_alloc_buffer(
+		efx, &hwmon->dma_buf,
+		n_sensors * MC_CMD_SENSOR_VALUE_ENTRY_TYPEDEF_LEN,
+		GFP_KERNEL);
+	if (rc)
+		return rc;
+
+	mutex_init(&hwmon->update_lock);
+	efx_mcdi_mon_update(efx);
+
+	/* Allocate space for the maximum possible number of
+	 * attributes for this set of sensors:
+	 * value, min, max, crit, alarm and label for each sensor.
+	 */
+	n_attrs = 6 * n_sensors;
+	hwmon->attrs = kcalloc(n_attrs, sizeof(*hwmon->attrs), GFP_KERNEL);
+	if (!hwmon->attrs) {
+		rc = -ENOMEM;
+		goto fail;
+	}
+	hwmon->group.attrs = kcalloc(n_attrs + 1, sizeof(struct attribute *),
+				     GFP_KERNEL);
+	if (!hwmon->group.attrs) {
+		rc = -ENOMEM;
+		goto fail;
+	}
+
+	for (i = 0, j = -1, type = -1; ; i++) {
+		enum efx_hwmon_type hwmon_type;
+		const char *hwmon_prefix;
+		unsigned hwmon_index;
+		u16 min1, max1, min2, max2;
+
+		/* Find next sensor type or exit if there is none */
+		do {
+			type++;
+
+			if ((type % 32) == 0) {
+				page = type / 32;
+				j = -1;
+				if (page == n_pages)
+					goto hwmon_register;
+
+				MCDI_SET_DWORD(inbuf, SENSOR_INFO_EXT_IN_PAGE,
+					       page);
+				rc = efx_mcdi_rpc(efx, MC_CMD_SENSOR_INFO,
+						  inbuf, sizeof(inbuf),
+						  outbuf, sizeof(outbuf),
+						  &outlen);
+				if (rc)
+					goto fail;
+				if (outlen < MC_CMD_SENSOR_INFO_OUT_LENMIN) {
+					rc = -EIO;
+					goto fail;
+				}
+
+				mask = (MCDI_DWORD(outbuf,
+						   SENSOR_INFO_OUT_MASK) &
+					~(1 << MC_CMD_SENSOR_PAGE0_NEXT));
+
+				/* Check again for short response */
+				if (outlen <
+				    MC_CMD_SENSOR_INFO_OUT_LEN(hweight32(mask))) {
+					rc = -EIO;
+					goto fail;
+				}
+			}
+		} while (!(mask & (1 << type % 32)));
+		j++;
+
+		if (type < ARRAY_SIZE(efx_mcdi_sensor_type)) {
+			hwmon_type = efx_mcdi_sensor_type[type].hwmon_type;
+
+			/* Skip sensors specific to a different port */
+			if (hwmon_type != EFX_HWMON_UNKNOWN &&
+			    efx_mcdi_sensor_type[type].port >= 0 &&
+			    efx_mcdi_sensor_type[type].port !=
+			    efx_port_num(efx))
+				continue;
+		} else {
+			hwmon_type = EFX_HWMON_UNKNOWN;
+		}
+
+		switch (hwmon_type) {
+		case EFX_HWMON_TEMP:
+			hwmon_prefix = "temp";
+			hwmon_index = ++n_temp; /* 1-based */
+			break;
+		case EFX_HWMON_COOL:
+			/* This is likely to be a heatsink, but there
+			 * is no convention for representing cooling
+			 * devices other than fans.
+			 */
+			hwmon_prefix = "fan";
+			hwmon_index = ++n_cool; /* 1-based */
+			break;
+		default:
+			hwmon_prefix = "in";
+			hwmon_index = n_in++; /* 0-based */
+			break;
+		case EFX_HWMON_CURR:
+			hwmon_prefix = "curr";
+			hwmon_index = ++n_curr; /* 1-based */
+			break;
+		case EFX_HWMON_POWER:
+			hwmon_prefix = "power";
+			hwmon_index = ++n_power; /* 1-based */
+			break;
+		}
+
+		min1 = MCDI_ARRAY_FIELD(outbuf, SENSOR_ENTRY,
+					SENSOR_INFO_ENTRY, j, MIN1);
+		max1 = MCDI_ARRAY_FIELD(outbuf, SENSOR_ENTRY,
+					SENSOR_INFO_ENTRY, j, MAX1);
+		min2 = MCDI_ARRAY_FIELD(outbuf, SENSOR_ENTRY,
+					SENSOR_INFO_ENTRY, j, MIN2);
+		max2 = MCDI_ARRAY_FIELD(outbuf, SENSOR_ENTRY,
+					SENSOR_INFO_ENTRY, j, MAX2);
+
+		if (min1 != max1) {
+			snprintf(name, sizeof(name), "%s%u_input",
+				 hwmon_prefix, hwmon_index);
+			efx_mcdi_mon_add_attr(
+				efx, name, efx_mcdi_mon_show_value, i, type, 0);
+
+			if (hwmon_type != EFX_HWMON_POWER) {
+				snprintf(name, sizeof(name), "%s%u_min",
+					 hwmon_prefix, hwmon_index);
+				efx_mcdi_mon_add_attr(
+					efx, name, efx_mcdi_mon_show_limit,
+					i, type, min1);
+			}
+
+			snprintf(name, sizeof(name), "%s%u_max",
+				 hwmon_prefix, hwmon_index);
+			efx_mcdi_mon_add_attr(
+				efx, name, efx_mcdi_mon_show_limit,
+				i, type, max1);
+
+			if (min2 != max2) {
+				/* Assume max2 is critical value.
+				 * But we have no good way to expose min2.
+				 */
+				snprintf(name, sizeof(name), "%s%u_crit",
+					 hwmon_prefix, hwmon_index);
+				efx_mcdi_mon_add_attr(
+					efx, name, efx_mcdi_mon_show_limit,
+					i, type, max2);
+			}
+		}
+
+		snprintf(name, sizeof(name), "%s%u_alarm",
+			 hwmon_prefix, hwmon_index);
+		efx_mcdi_mon_add_attr(
+			efx, name, efx_mcdi_mon_show_alarm, i, type, 0);
+
+		if (type < ARRAY_SIZE(efx_mcdi_sensor_type) &&
+		    efx_mcdi_sensor_type[type].label) {
+			snprintf(name, sizeof(name), "%s%u_label",
+				 hwmon_prefix, hwmon_index);
+			efx_mcdi_mon_add_attr(
+				efx, name, efx_mcdi_mon_show_label, i, type, 0);
+		}
+	}
+
+hwmon_register:
+	hwmon->groups[0] = &hwmon->group;
+	hwmon->device = hwmon_device_register_with_groups(&efx->pci_dev->dev,
+							  KBUILD_MODNAME, NULL,
+							  hwmon->groups);
+	if (IS_ERR(hwmon->device)) {
+		rc = PTR_ERR(hwmon->device);
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	efx_mcdi_mon_remove(efx);
+	return rc;
+}
+
+void efx_mcdi_mon_remove(struct efx_nic *efx)
+{
+	struct efx_mcdi_mon *hwmon = efx_mcdi_mon(efx);
+
+	if (hwmon->device)
+		hwmon_device_unregister(hwmon->device);
+	kfree(hwmon->attrs);
+	kfree(hwmon->group.attrs);
+	efx_nic_free_buffer(efx, &hwmon->dma_buf);
+}
+
+#endif /* CONFIG_SFC_MCDI_MON */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi_port.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2009-2013 Solarflare Communications Inc.
+ */
+
+/*
+ * Driver for PHY related operations via MCDI.
+ */
+
+#include <linux/slab.h>
+#include "efx.h"
+#include "mcdi_port.h"
+#include "mcdi.h"
+#include "mcdi_pcol.h"
+#include "nic.h"
+#include "selftest.h"
+#include "mcdi_port_common.h"
+
+static int efx_mcdi_mdio_read(struct net_device *net_dev,
+			      int prtad, int devad, u16 addr)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_MDIO_READ_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_MDIO_READ_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, MDIO_READ_IN_BUS, efx->mdio_bus);
+	MCDI_SET_DWORD(inbuf, MDIO_READ_IN_PRTAD, prtad);
+	MCDI_SET_DWORD(inbuf, MDIO_READ_IN_DEVAD, devad);
+	MCDI_SET_DWORD(inbuf, MDIO_READ_IN_ADDR, addr);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_MDIO_READ, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		return rc;
+
+	if (MCDI_DWORD(outbuf, MDIO_READ_OUT_STATUS) !=
+	    MC_CMD_MDIO_STATUS_GOOD)
+		return -EIO;
+
+	return (u16)MCDI_DWORD(outbuf, MDIO_READ_OUT_VALUE);
+}
+
+static int efx_mcdi_mdio_write(struct net_device *net_dev,
+			       int prtad, int devad, u16 addr, u16 value)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_MDIO_WRITE_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_MDIO_WRITE_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, MDIO_WRITE_IN_BUS, efx->mdio_bus);
+	MCDI_SET_DWORD(inbuf, MDIO_WRITE_IN_PRTAD, prtad);
+	MCDI_SET_DWORD(inbuf, MDIO_WRITE_IN_DEVAD, devad);
+	MCDI_SET_DWORD(inbuf, MDIO_WRITE_IN_ADDR, addr);
+	MCDI_SET_DWORD(inbuf, MDIO_WRITE_IN_VALUE, value);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_MDIO_WRITE, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		return rc;
+
+	if (MCDI_DWORD(outbuf, MDIO_WRITE_OUT_STATUS) !=
+	    MC_CMD_MDIO_STATUS_GOOD)
+		return -EIO;
+
+	return 0;
+}
+
+u32 efx_mcdi_phy_get_caps(struct efx_nic *efx)
+{
+	struct efx_mcdi_phy_data *phy_data = efx->phy_data;
+
+	return phy_data->supported_cap;
+}
+
+bool efx_mcdi_mac_check_fault(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN);
+	size_t outlength;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlength);
+	if (rc)
+		return true;
+
+	return MCDI_DWORD(outbuf, GET_LINK_OUT_MAC_FAULT) != 0;
+}
+
+int efx_mcdi_port_probe(struct efx_nic *efx)
+{
+	int rc;
+
+	/* Set up MDIO structure for PHY */
+	efx->mdio.mode_support = MDIO_SUPPORTS_C45 | MDIO_EMULATE_C22;
+	efx->mdio.mdio_read = efx_mcdi_mdio_read;
+	efx->mdio.mdio_write = efx_mcdi_mdio_write;
+
+	/* Fill out MDIO structure, loopback modes, and initial link state */
+	rc = efx_mcdi_phy_probe(efx);
+	if (rc != 0)
+		return rc;
+
+	return efx_mcdi_mac_init_stats(efx);
+}
+
+void efx_mcdi_port_remove(struct efx_nic *efx)
+{
+	efx_mcdi_phy_remove(efx);
+	efx_mcdi_mac_fini_stats(efx);
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi_port.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2008-2013 Solarflare Communications Inc.
+ * Copyright 2019-2020 Xilinx Inc.
+ */
+
+#ifndef EFX_MCDI_PORT_H
+#define EFX_MCDI_PORT_H
+
+#include "net_driver.h"
+
+u32 efx_mcdi_phy_get_caps(struct efx_nic *efx);
+bool efx_mcdi_mac_check_fault(struct efx_nic *efx);
+int efx_mcdi_port_probe(struct efx_nic *efx);
+void efx_mcdi_port_remove(struct efx_nic *efx);
+
+#endif /* EFX_MCDI_PORT_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi_port_common.c
@@ -0,0 +1,1301 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2018 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include "mcdi_port_common.h"
+#include "efx_common.h"
+#include "nic.h"
+
+int efx_mcdi_get_phy_cfg(struct efx_nic *efx, struct efx_mcdi_phy_data *cfg)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_CFG_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_GET_PHY_CFG_IN_LEN != 0);
+	BUILD_BUG_ON(MC_CMD_GET_PHY_CFG_OUT_NAME_LEN != sizeof(cfg->name));
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_PHY_CFG, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+
+	if (outlen < MC_CMD_GET_PHY_CFG_OUT_LEN) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	cfg->flags = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_FLAGS);
+	cfg->type = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_TYPE);
+	cfg->supported_cap =
+		MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_SUPPORTED_CAP);
+	cfg->channel = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_CHANNEL);
+	cfg->port = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_PRT);
+	cfg->stats_mask = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_STATS_MASK);
+	memcpy(cfg->name, MCDI_PTR(outbuf, GET_PHY_CFG_OUT_NAME),
+	       sizeof(cfg->name));
+	cfg->media = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_MEDIA_TYPE);
+	cfg->mmd_mask = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_MMD_MASK);
+	memcpy(cfg->revision, MCDI_PTR(outbuf, GET_PHY_CFG_OUT_REVISION),
+	       sizeof(cfg->revision));
+
+	return 0;
+
+fail:
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+void efx_link_set_advertising(struct efx_nic *efx,
+			      const unsigned long *advertising)
+{
+	memcpy(efx->link_advertising, advertising,
+	       sizeof(__ETHTOOL_DECLARE_LINK_MODE_MASK()));
+
+	efx->link_advertising[0] |= ADVERTISED_Autoneg;
+	if (advertising[0] & ADVERTISED_Pause)
+		efx->wanted_fc |= (EFX_FC_TX | EFX_FC_RX);
+	else
+		efx->wanted_fc &= ~(EFX_FC_TX | EFX_FC_RX);
+	if (advertising[0] & ADVERTISED_Asym_Pause)
+		efx->wanted_fc ^= EFX_FC_TX;
+}
+
+int efx_mcdi_set_link(struct efx_nic *efx, u32 capabilities,
+		      u32 flags, u32 loopback_mode, u32 loopback_speed)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_SET_LINK_IN_LEN);
+
+	BUILD_BUG_ON(MC_CMD_SET_LINK_OUT_LEN != 0);
+
+	MCDI_SET_DWORD(inbuf, SET_LINK_IN_CAP, capabilities);
+	MCDI_SET_DWORD(inbuf, SET_LINK_IN_FLAGS, flags);
+	MCDI_SET_DWORD(inbuf, SET_LINK_IN_LOOPBACK_MODE, loopback_mode);
+	MCDI_SET_DWORD(inbuf, SET_LINK_IN_LOOPBACK_SPEED, loopback_speed);
+
+	return efx_mcdi_rpc(efx, MC_CMD_SET_LINK, inbuf, sizeof(inbuf),
+			  NULL, 0, NULL);
+}
+
+int efx_mcdi_loopback_modes(struct efx_nic *efx, u64 *loopback_modes)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LOOPBACK_MODES_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_LOOPBACK_MODES, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		goto fail;
+
+	if (outlen < (MC_CMD_GET_LOOPBACK_MODES_OUT_SUGGESTED_OFST +
+		      MC_CMD_GET_LOOPBACK_MODES_OUT_SUGGESTED_LEN)) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	*loopback_modes = MCDI_QWORD(outbuf, GET_LOOPBACK_MODES_OUT_SUGGESTED);
+
+	return 0;
+
+fail:
+	netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc);
+	return rc;
+}
+
+void mcdi_to_ethtool_linkset(u32 media, u32 cap, unsigned long *linkset)
+{
+	#define SET_BIT(name)	__set_bit(ETHTOOL_LINK_MODE_ ## name ## _BIT, \
+					  linkset)
+
+	bitmap_zero(linkset, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	switch (media) {
+	case MC_CMD_MEDIA_KX4:
+		SET_BIT(Backplane);
+		if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN))
+			SET_BIT(1000baseKX_Full);
+		if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN))
+			SET_BIT(10000baseKX4_Full);
+		if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN))
+			SET_BIT(40000baseKR4_Full);
+		break;
+
+	case MC_CMD_MEDIA_XFP:
+	case MC_CMD_MEDIA_SFP_PLUS:
+	case MC_CMD_MEDIA_QSFP_PLUS:
+		SET_BIT(FIBRE);
+		if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) {
+			SET_BIT(1000baseT_Full);
+			SET_BIT(1000baseX_Full);
+		}
+		if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) {
+			SET_BIT(10000baseCR_Full);
+			SET_BIT(10000baseLR_Full);
+			SET_BIT(10000baseSR_Full);
+		}
+		if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) {
+			SET_BIT(40000baseCR4_Full);
+			SET_BIT(40000baseSR4_Full);
+		}
+		if (cap & (1 << MC_CMD_PHY_CAP_100000FDX_LBN)) {
+			SET_BIT(100000baseCR4_Full);
+			SET_BIT(100000baseSR4_Full);
+		}
+		if (cap & (1 << MC_CMD_PHY_CAP_25000FDX_LBN)) {
+			SET_BIT(25000baseCR_Full);
+			SET_BIT(25000baseSR_Full);
+		}
+		if (cap & (1 << MC_CMD_PHY_CAP_50000FDX_LBN))
+			SET_BIT(50000baseCR2_Full);
+		break;
+
+	case MC_CMD_MEDIA_BASE_T:
+		SET_BIT(TP);
+		if (cap & (1 << MC_CMD_PHY_CAP_10HDX_LBN))
+			SET_BIT(10baseT_Half);
+		if (cap & (1 << MC_CMD_PHY_CAP_10FDX_LBN))
+			SET_BIT(10baseT_Full);
+		if (cap & (1 << MC_CMD_PHY_CAP_100HDX_LBN))
+			SET_BIT(100baseT_Half);
+		if (cap & (1 << MC_CMD_PHY_CAP_100FDX_LBN))
+			SET_BIT(100baseT_Full);
+		if (cap & (1 << MC_CMD_PHY_CAP_1000HDX_LBN))
+			SET_BIT(1000baseT_Half);
+		if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN))
+			SET_BIT(1000baseT_Full);
+		if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN))
+			SET_BIT(10000baseT_Full);
+		break;
+	}
+
+	if (cap & (1 << MC_CMD_PHY_CAP_PAUSE_LBN))
+		SET_BIT(Pause);
+	if (cap & (1 << MC_CMD_PHY_CAP_ASYM_LBN))
+		SET_BIT(Asym_Pause);
+	if (cap & (1 << MC_CMD_PHY_CAP_AN_LBN))
+		SET_BIT(Autoneg);
+
+	#undef SET_BIT
+}
+
+u32 ethtool_linkset_to_mcdi_cap(const unsigned long *linkset)
+{
+	u32 result = 0;
+
+	#define TEST_BIT(name)	test_bit(ETHTOOL_LINK_MODE_ ## name ## _BIT, \
+					 linkset)
+
+	if (TEST_BIT(10baseT_Half))
+		result |= (1 << MC_CMD_PHY_CAP_10HDX_LBN);
+	if (TEST_BIT(10baseT_Full))
+		result |= (1 << MC_CMD_PHY_CAP_10FDX_LBN);
+	if (TEST_BIT(100baseT_Half))
+		result |= (1 << MC_CMD_PHY_CAP_100HDX_LBN);
+	if (TEST_BIT(100baseT_Full))
+		result |= (1 << MC_CMD_PHY_CAP_100FDX_LBN);
+	if (TEST_BIT(1000baseT_Half))
+		result |= (1 << MC_CMD_PHY_CAP_1000HDX_LBN);
+	if (TEST_BIT(1000baseT_Full) || TEST_BIT(1000baseKX_Full) ||
+			TEST_BIT(1000baseX_Full))
+		result |= (1 << MC_CMD_PHY_CAP_1000FDX_LBN);
+	if (TEST_BIT(10000baseT_Full) || TEST_BIT(10000baseKX4_Full) ||
+			TEST_BIT(10000baseCR_Full) || TEST_BIT(10000baseLR_Full) ||
+			TEST_BIT(10000baseSR_Full))
+		result |= (1 << MC_CMD_PHY_CAP_10000FDX_LBN);
+	if (TEST_BIT(40000baseCR4_Full) || TEST_BIT(40000baseKR4_Full) ||
+			TEST_BIT(40000baseSR4_Full))
+		result |= (1 << MC_CMD_PHY_CAP_40000FDX_LBN);
+	if (TEST_BIT(100000baseCR4_Full) || TEST_BIT(100000baseSR4_Full))
+		result |= (1 << MC_CMD_PHY_CAP_100000FDX_LBN);
+	if (TEST_BIT(25000baseCR_Full) || TEST_BIT(25000baseSR_Full))
+		result |= (1 << MC_CMD_PHY_CAP_25000FDX_LBN);
+	if (TEST_BIT(50000baseCR2_Full))
+		result |= (1 << MC_CMD_PHY_CAP_50000FDX_LBN);
+	if (TEST_BIT(Pause))
+		result |= (1 << MC_CMD_PHY_CAP_PAUSE_LBN);
+	if (TEST_BIT(Asym_Pause))
+		result |= (1 << MC_CMD_PHY_CAP_ASYM_LBN);
+	if (TEST_BIT(Autoneg))
+		result |= (1 << MC_CMD_PHY_CAP_AN_LBN);
+
+	#undef TEST_BIT
+
+	return result;
+}
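
A minimal usage sketch pairing the two helpers above: translating a PHY
capability mask into an ethtool link-mode bitmap and back. Only capabilities
with an ethtool representation for the given media survive the round trip;
the function name here is illustrative, not part of the driver.

static u32 example_phy_cap_roundtrip(u32 media, u32 cap)
{
	__ETHTOOL_DECLARE_LINK_MODE_MASK(linkset);

	/* MCDI capability bits -> ethtool link modes for this media type */
	mcdi_to_ethtool_linkset(media, cap, linkset);
	/* ...and back again, dropping anything ethtool cannot express */
	return ethtool_linkset_to_mcdi_cap(linkset);
}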
+
+u32 efx_get_mcdi_phy_flags(struct efx_nic *efx)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	enum efx_phy_mode mode, supported;
+	u32 flags;
+
+	/* TODO: Advertise the capabilities supported by this PHY */
+	supported = 0;
+	if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_TXDIS_LBN))
+		supported |= PHY_MODE_TX_DISABLED;
+	if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_LOWPOWER_LBN))
+		supported |= PHY_MODE_LOW_POWER;
+	if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_POWEROFF_LBN))
+		supported |= PHY_MODE_OFF;
+
+	mode = efx->phy_mode & supported;
+
+	flags = 0;
+	if (mode & PHY_MODE_TX_DISABLED)
+		flags |= (1 << MC_CMD_SET_LINK_IN_TXDIS_LBN);
+	if (mode & PHY_MODE_LOW_POWER)
+		flags |= (1 << MC_CMD_SET_LINK_IN_LOWPOWER_LBN);
+	if (mode & PHY_MODE_OFF)
+		flags |= (1 << MC_CMD_SET_LINK_IN_POWEROFF_LBN);
+
+	return flags;
+}
+
+u8 mcdi_to_ethtool_media(u32 media)
+{
+	switch (media) {
+	case MC_CMD_MEDIA_XAUI:
+	case MC_CMD_MEDIA_CX4:
+	case MC_CMD_MEDIA_KX4:
+		return PORT_OTHER;
+
+	case MC_CMD_MEDIA_XFP:
+	case MC_CMD_MEDIA_SFP_PLUS:
+	case MC_CMD_MEDIA_QSFP_PLUS:
+		return PORT_FIBRE;
+
+	case MC_CMD_MEDIA_BASE_T:
+		return PORT_TP;
+
+	default:
+		return PORT_OTHER;
+	}
+}
+
+void efx_mcdi_phy_decode_link(struct efx_nic *efx,
+			      struct efx_link_state *link_state,
+			      u32 speed, u32 flags, u32 fcntl)
+{
+	switch (fcntl) {
+	case MC_CMD_FCNTL_AUTO:
+		WARN_ON(1);	/* This is not a link mode */
+		link_state->fc = EFX_FC_AUTO | EFX_FC_TX | EFX_FC_RX;
+		break;
+	case MC_CMD_FCNTL_BIDIR:
+		link_state->fc = EFX_FC_TX | EFX_FC_RX;
+		break;
+	case MC_CMD_FCNTL_RESPOND:
+		link_state->fc = EFX_FC_RX;
+		break;
+	default:
+		WARN_ON(1);
+		fallthrough;
+	case MC_CMD_FCNTL_OFF:
+		link_state->fc = 0;
+		break;
+	}
+
+	link_state->up = !!(flags & (1 << MC_CMD_GET_LINK_OUT_LINK_UP_LBN));
+	link_state->fd = !!(flags & (1 << MC_CMD_GET_LINK_OUT_FULL_DUPLEX_LBN));
+	link_state->speed = speed;
+}
+
+/* The semantics of the ethtool FEC mode bitmask are not well defined,
+ * particularly the meaning of combinations of bits.  Which means we get to
+ * define our own semantics, as follows:
+ * OFF overrides any other bits, and means "disable all FEC" (with the
+ * exception of 25G KR4/CR4, where it is not possible to reject it if AN
+ * partner requests it).
+ * AUTO on its own means use cable requirements and link partner autoneg with
+ * fw-default preferences for the cable type.
+ * AUTO and either RS or BASER means use the specified FEC type if cable and
+ * link partner support it, otherwise autoneg/fw-default.
+ * RS or BASER alone means use the specified FEC type if cable and link partner
+ * support it and either requests it, otherwise no FEC.
+ * Both RS and BASER (whether AUTO or not) means use FEC if cable and link
+ * partner support it, preferring RS to BASER.
+ */
+u32 ethtool_fec_caps_to_mcdi(u32 supported_cap, u32 ethtool_cap)
+{
+	u32 ret = 0;
+
+	if (ethtool_cap & ETHTOOL_FEC_OFF)
+		return 0;
+
+	if (ethtool_cap & ETHTOOL_FEC_AUTO)
+		ret |= ((1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) |
+			(1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) |
+			(1 << MC_CMD_PHY_CAP_RS_FEC_LBN)) & supported_cap;
+	if (ethtool_cap & ETHTOOL_FEC_RS &&
+	    supported_cap & (1 << MC_CMD_PHY_CAP_RS_FEC_LBN))
+		ret |= (1 << MC_CMD_PHY_CAP_RS_FEC_LBN) |
+		       (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN);
+	if (ethtool_cap & ETHTOOL_FEC_BASER) {
+		if (supported_cap & (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN))
+			ret |= (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) |
+			       (1 << MC_CMD_PHY_CAP_BASER_FEC_REQUESTED_LBN);
+		if (supported_cap & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN))
+			ret |= (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) |
+			       (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_REQUESTED_LBN);
+	}
+	return ret;
+}
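
A few worked cases of the mapping above, assuming a hypothetical PHY whose
supported_cap advertises plain BASER and RS FEC (but not the 25G BASER
variants); the checks are illustrative only, not driver code.

static void example_fec_mapping(void)
{
	u32 sup = (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) |
		  (1 << MC_CMD_PHY_CAP_RS_FEC_LBN);

	/* OFF overrides everything: nothing is advertised or requested */
	WARN_ON(ethtool_fec_caps_to_mcdi(sup, ETHTOOL_FEC_OFF | ETHTOOL_FEC_RS) != 0);

	/* AUTO alone advertises the supported abilities, no *_REQUESTED bits */
	WARN_ON(ethtool_fec_caps_to_mcdi(sup, ETHTOOL_FEC_AUTO) != sup);

	/* RS alone both advertises and requests RS */
	WARN_ON(ethtool_fec_caps_to_mcdi(sup, ETHTOOL_FEC_RS) !=
		((1 << MC_CMD_PHY_CAP_RS_FEC_LBN) |
		 (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN)));
}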
+
+/* Invert ethtool_fec_caps_to_mcdi.  There are two combinations that function
+ * can never produce, (baser xor rs) and neither req; the implementation below
+ * maps both of those to AUTO.  This should never matter, and it's not clear
+ * what a better mapping would be anyway.
+ */
+u32 mcdi_fec_caps_to_ethtool(u32 caps, bool is_25g)
+{
+	bool rs = caps & (1 << MC_CMD_PHY_CAP_RS_FEC_LBN),
+	     rs_req = caps & (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN),
+	     baser = is_25g ? caps & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN)
+			    : caps & (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN),
+	     baser_req = is_25g ? caps & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_REQUESTED_LBN)
+				: caps & (1 << MC_CMD_PHY_CAP_BASER_FEC_REQUESTED_LBN);
+
+	if (!baser && !rs)
+		return ETHTOOL_FEC_OFF;
+	return (rs_req ? ETHTOOL_FEC_RS : 0) |
+	       (baser_req ? ETHTOOL_FEC_BASER : 0) |
+	       (baser == baser_req && rs == rs_req ? 0 : ETHTOOL_FEC_AUTO);
+}
+
+/* Verify that the forced flow control settings (!EFX_FC_AUTO) are
+ * supported by the link partner. Warn the user if this isn't the case
+ */
+void efx_mcdi_phy_check_fcntl(struct efx_nic *efx, u32 lpa)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	u32 rmtadv;
+
+	/* The link partner capabilities are only relevant if the
+	 * link supports flow control autonegotiation
+	 */
+	if (~phy_cfg->supported_cap & (1 << MC_CMD_PHY_CAP_AN_LBN))
+		return;
+
+	/* If flow control autoneg is supported and enabled, then fine */
+	if (efx->wanted_fc & EFX_FC_AUTO)
+		return;
+
+	rmtadv = 0;
+	if (lpa & (1 << MC_CMD_PHY_CAP_PAUSE_LBN))
+		rmtadv |= ADVERTISED_Pause;
+	if (lpa & (1 << MC_CMD_PHY_CAP_ASYM_LBN))
+		rmtadv |=  ADVERTISED_Asym_Pause;
+
+	if ((efx->wanted_fc & EFX_FC_TX) && rmtadv == ADVERTISED_Asym_Pause)
+		netif_err(efx, link, efx->net_dev,
+			  "warning: link partner doesn't support pause frames");
+}
+
+bool efx_mcdi_phy_poll(struct efx_nic *efx)
+{
+	struct efx_link_state old_state = efx->link_state;
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN);
+	int rc;
+
+	WARN_ON(!mutex_is_locked(&efx->mac_lock));
+
+	BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0,
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc)
+		efx->link_state.up = false;
+	else
+		efx_mcdi_phy_decode_link(
+			efx, &efx->link_state,
+			MCDI_DWORD(outbuf, GET_LINK_OUT_LINK_SPEED),
+			MCDI_DWORD(outbuf, GET_LINK_OUT_FLAGS),
+			MCDI_DWORD(outbuf, GET_LINK_OUT_FCNTL));
+
+	return !efx_link_state_equal(&efx->link_state, &old_state);
+}
+
+int efx_mcdi_phy_probe(struct efx_nic *efx)
+{
+	struct efx_mcdi_phy_data *phy_data;
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN);
+	u32 caps;
+	int rc;
+
+	/* Initialise and populate phy_data */
+	phy_data = kzalloc(sizeof(*phy_data), GFP_KERNEL);
+	if (phy_data == NULL)
+		return -ENOMEM;
+
+	rc = efx_mcdi_get_phy_cfg(efx, phy_data);
+	if (rc != 0)
+		goto fail;
+
+	/* Read initial link advertisement */
+	BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0,
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc)
+		goto fail;
+
+	/* Fill out nic state */
+	efx->phy_data = phy_data;
+	efx->phy_type = phy_data->type;
+
+	efx->mdio_bus = phy_data->channel;
+	efx->mdio.prtad = phy_data->port;
+	efx->mdio.mmds = phy_data->mmd_mask & ~(1 << MC_CMD_MMD_CLAUSE22);
+	efx->mdio.mode_support = 0;
+	if (phy_data->mmd_mask & (1 << MC_CMD_MMD_CLAUSE22))
+		efx->mdio.mode_support |= MDIO_SUPPORTS_C22;
+	if (phy_data->mmd_mask & ~(1 << MC_CMD_MMD_CLAUSE22))
+		efx->mdio.mode_support |= MDIO_SUPPORTS_C45 | MDIO_EMULATE_C22;
+
+	caps = MCDI_DWORD(outbuf, GET_LINK_OUT_CAP);
+	if (caps & (1 << MC_CMD_PHY_CAP_AN_LBN))
+		mcdi_to_ethtool_linkset(phy_data->media, caps,
+					efx->link_advertising);
+	else
+		phy_data->forced_cap = caps;
+
+	/* Assert that we can map efx -> mcdi loopback modes */
+	BUILD_BUG_ON(LOOPBACK_NONE != MC_CMD_LOOPBACK_NONE);
+	BUILD_BUG_ON(LOOPBACK_DATA != MC_CMD_LOOPBACK_DATA);
+	BUILD_BUG_ON(LOOPBACK_GMAC != MC_CMD_LOOPBACK_GMAC);
+	BUILD_BUG_ON(LOOPBACK_XGMII != MC_CMD_LOOPBACK_XGMII);
+	BUILD_BUG_ON(LOOPBACK_XGXS != MC_CMD_LOOPBACK_XGXS);
+	BUILD_BUG_ON(LOOPBACK_XAUI != MC_CMD_LOOPBACK_XAUI);
+	BUILD_BUG_ON(LOOPBACK_GMII != MC_CMD_LOOPBACK_GMII);
+	BUILD_BUG_ON(LOOPBACK_SGMII != MC_CMD_LOOPBACK_SGMII);
+	BUILD_BUG_ON(LOOPBACK_XGBR != MC_CMD_LOOPBACK_XGBR);
+	BUILD_BUG_ON(LOOPBACK_XFI != MC_CMD_LOOPBACK_XFI);
+	BUILD_BUG_ON(LOOPBACK_XAUI_FAR != MC_CMD_LOOPBACK_XAUI_FAR);
+	BUILD_BUG_ON(LOOPBACK_GMII_FAR != MC_CMD_LOOPBACK_GMII_FAR);
+	BUILD_BUG_ON(LOOPBACK_SGMII_FAR != MC_CMD_LOOPBACK_SGMII_FAR);
+	BUILD_BUG_ON(LOOPBACK_XFI_FAR != MC_CMD_LOOPBACK_XFI_FAR);
+	BUILD_BUG_ON(LOOPBACK_GPHY != MC_CMD_LOOPBACK_GPHY);
+	BUILD_BUG_ON(LOOPBACK_PHYXS != MC_CMD_LOOPBACK_PHYXS);
+	BUILD_BUG_ON(LOOPBACK_PCS != MC_CMD_LOOPBACK_PCS);
+	BUILD_BUG_ON(LOOPBACK_PMAPMD != MC_CMD_LOOPBACK_PMAPMD);
+	BUILD_BUG_ON(LOOPBACK_XPORT != MC_CMD_LOOPBACK_XPORT);
+	BUILD_BUG_ON(LOOPBACK_XGMII_WS != MC_CMD_LOOPBACK_XGMII_WS);
+	BUILD_BUG_ON(LOOPBACK_XAUI_WS != MC_CMD_LOOPBACK_XAUI_WS);
+	BUILD_BUG_ON(LOOPBACK_XAUI_WS_FAR != MC_CMD_LOOPBACK_XAUI_WS_FAR);
+	BUILD_BUG_ON(LOOPBACK_XAUI_WS_NEAR != MC_CMD_LOOPBACK_XAUI_WS_NEAR);
+	BUILD_BUG_ON(LOOPBACK_GMII_WS != MC_CMD_LOOPBACK_GMII_WS);
+	BUILD_BUG_ON(LOOPBACK_XFI_WS != MC_CMD_LOOPBACK_XFI_WS);
+	BUILD_BUG_ON(LOOPBACK_XFI_WS_FAR != MC_CMD_LOOPBACK_XFI_WS_FAR);
+	BUILD_BUG_ON(LOOPBACK_PHYXS_WS != MC_CMD_LOOPBACK_PHYXS_WS);
+
+	rc = efx_mcdi_loopback_modes(efx, &efx->loopback_modes);
+	if (rc != 0)
+		goto fail;
+	/* The MC indicates that LOOPBACK_NONE is a valid loopback mode,
+	 * but by convention we don't include it here.
+	 */
+	efx->loopback_modes &= ~(1 << LOOPBACK_NONE);
+
+	/* Set the initial link mode */
+	efx_mcdi_phy_decode_link(efx, &efx->link_state,
+				 MCDI_DWORD(outbuf, GET_LINK_OUT_LINK_SPEED),
+				 MCDI_DWORD(outbuf, GET_LINK_OUT_FLAGS),
+				 MCDI_DWORD(outbuf, GET_LINK_OUT_FCNTL));
+
+	/* Record the initial FEC configuration (or nearest approximation
+	 * representable in the ethtool configuration space)
+	 */
+	efx->fec_config = mcdi_fec_caps_to_ethtool(caps,
+						   efx->link_state.speed == 25000 ||
+						   efx->link_state.speed == 50000);
+
+	/* Default to Autonegotiated flow control if the PHY supports it */
+	efx->wanted_fc = EFX_FC_RX | EFX_FC_TX;
+	if (phy_data->supported_cap & (1 << MC_CMD_PHY_CAP_AN_LBN))
+		efx->wanted_fc |= EFX_FC_AUTO;
+	efx_link_set_wanted_fc(efx, efx->wanted_fc);
+
+	return 0;
+
+fail:
+	kfree(phy_data);
+	return rc;
+}
+
+void efx_mcdi_phy_remove(struct efx_nic *efx)
+{
+	struct efx_mcdi_phy_data *phy_data = efx->phy_data;
+
+	efx->phy_data = NULL;
+	kfree(phy_data);
+}
+
+void efx_mcdi_phy_get_link_ksettings(struct efx_nic *efx, struct ethtool_link_ksettings *cmd)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN);
+	int rc;
+
+	cmd->base.speed = efx->link_state.speed;
+	cmd->base.duplex = efx->link_state.fd;
+	cmd->base.port = mcdi_to_ethtool_media(phy_cfg->media);
+	cmd->base.phy_address = phy_cfg->port;
+	cmd->base.autoneg = !!(efx->link_advertising[0] & ADVERTISED_Autoneg);
+	cmd->base.mdio_support = (efx->mdio.mode_support &
+			      (MDIO_SUPPORTS_C45 | MDIO_SUPPORTS_C22));
+
+	mcdi_to_ethtool_linkset(phy_cfg->media, phy_cfg->supported_cap,
+				cmd->link_modes.supported);
+	memcpy(cmd->link_modes.advertising, efx->link_advertising,
+	       sizeof(__ETHTOOL_DECLARE_LINK_MODE_MASK()));
+
+	BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0,
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc)
+		return;
+	mcdi_to_ethtool_linkset(phy_cfg->media,
+				MCDI_DWORD(outbuf, GET_LINK_OUT_LP_CAP),
+				cmd->link_modes.lp_advertising);
+}
+
+int efx_mcdi_phy_set_link_ksettings(struct efx_nic *efx, const struct ethtool_link_ksettings *cmd)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	u32 caps;
+	int rc;
+
+	if (cmd->base.autoneg) {
+		caps = (ethtool_linkset_to_mcdi_cap(cmd->link_modes.advertising) |
+			1 << MC_CMD_PHY_CAP_AN_LBN);
+	} else if (cmd->base.duplex) {
+		switch (cmd->base.speed) {
+		case 10:     caps = 1 << MC_CMD_PHY_CAP_10FDX_LBN;     break;
+		case 100:    caps = 1 << MC_CMD_PHY_CAP_100FDX_LBN;    break;
+		case 1000:   caps = 1 << MC_CMD_PHY_CAP_1000FDX_LBN;   break;
+		case 10000:  caps = 1 << MC_CMD_PHY_CAP_10000FDX_LBN;  break;
+		case 40000:  caps = 1 << MC_CMD_PHY_CAP_40000FDX_LBN;  break;
+		case 100000: caps = 1 << MC_CMD_PHY_CAP_100000FDX_LBN; break;
+		case 25000:  caps = 1 << MC_CMD_PHY_CAP_25000FDX_LBN;  break;
+		case 50000:  caps = 1 << MC_CMD_PHY_CAP_50000FDX_LBN;  break;
+		default:     return -EINVAL;
+		}
+	} else {
+		switch (cmd->base.speed) {
+		case 10:     caps = 1 << MC_CMD_PHY_CAP_10HDX_LBN;     break;
+		case 100:    caps = 1 << MC_CMD_PHY_CAP_100HDX_LBN;    break;
+		case 1000:   caps = 1 << MC_CMD_PHY_CAP_1000HDX_LBN;   break;
+		default:     return -EINVAL;
+		}
+	}
+
+	caps |= ethtool_fec_caps_to_mcdi(phy_cfg->supported_cap, efx->fec_config);
+
+	rc = efx_mcdi_set_link(efx, caps, efx_get_mcdi_phy_flags(efx),
+			       efx->loopback_mode, 0);
+	if (rc)
+		return rc;
+
+	if (cmd->base.autoneg) {
+		efx_link_set_advertising(efx, cmd->link_modes.advertising);
+		phy_cfg->forced_cap = 0;
+	} else {
+		efx_link_clear_advertising(efx);
+		phy_cfg->forced_cap = caps;
+	}
+	return 0;
+}
+
+int efx_mcdi_phy_get_fecparam(struct efx_nic *efx, struct ethtool_fecparam *fec)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_V2_LEN);
+	u32 caps, active, speed; /* MCDI format */
+	bool is_25g = false;
+	size_t outlen;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		return rc;
+	if (outlen < MC_CMD_GET_LINK_OUT_V2_LEN)
+		return -EOPNOTSUPP;
+
+	/* behaviour for 25G/50G links depends on 25G BASER bit */
+	speed = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_LINK_SPEED);
+	is_25g = speed == 25000 || speed == 50000;
+
+	caps = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_CAP);
+	fec->fec = mcdi_fec_caps_to_ethtool(caps, is_25g);
+	/* BASER is never supported on 100G */
+	if (speed == 100000)
+		fec->fec &= ~ETHTOOL_FEC_BASER;
+
+	active = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_FEC_TYPE);
+	switch (active) {
+	case MC_CMD_FEC_NONE:
+		fec->active_fec = ETHTOOL_FEC_OFF;
+		break;
+	case MC_CMD_FEC_BASER:
+		fec->active_fec = ETHTOOL_FEC_BASER;
+		break;
+	case MC_CMD_FEC_RS:
+		fec->active_fec = ETHTOOL_FEC_RS;
+		break;
+	default:
+		netif_warn(efx, hw, efx->net_dev,
+			   "Firmware reports unrecognised FEC_TYPE %u\n",
+			   active);
+		/* We don't know what firmware has picked.  AUTO is as good a
+		 * "can't happen" value as any other.
+		 */
+		fec->active_fec = ETHTOOL_FEC_AUTO;
+		break;
+	}
+
+	return 0;
+}
+
+/* Basic validation to ensure that the caps we are going to attempt to set are
+ * in fact supported by the adapter.  Note that 'no FEC' is always supported.
+ */
+static int ethtool_fec_supported(u32 supported_cap, u32 ethtool_cap)
+{
+	if (ethtool_cap & ETHTOOL_FEC_OFF)
+		return 0;
+
+	if (ethtool_cap &&
+	    !ethtool_fec_caps_to_mcdi(supported_cap, ethtool_cap))
+		return -EINVAL;
+	return 0;
+}
+
+int efx_mcdi_phy_set_fecparam(struct efx_nic *efx, const struct ethtool_fecparam *fec)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	u32 caps;
+	int rc;
+
+	rc = ethtool_fec_supported(phy_cfg->supported_cap, fec->fec);
+	if (rc)
+		return rc;
+
+	/* Work out what efx_mcdi_phy_set_link_ksettings() would produce from
+	 * saved advertising bits
+	 */
+	if (test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, efx->link_advertising))
+		caps = (ethtool_linkset_to_mcdi_cap(efx->link_advertising) |
+			1 << MC_CMD_PHY_CAP_AN_LBN);
+	else
+		caps = phy_cfg->forced_cap;
+
+	caps |= ethtool_fec_caps_to_mcdi(phy_cfg->supported_cap, fec->fec);
+	rc = efx_mcdi_set_link(efx, caps, efx_get_mcdi_phy_flags(efx),
+			       efx->loopback_mode, 0);
+	if (rc)
+		return rc;
+
+	/* Record the new FEC setting for subsequent set_link calls */
+	efx->fec_config = fec->fec;
+	return 0;
+}
+
+int efx_mcdi_phy_test_alive(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_STATE_OUT_LEN);
+	size_t outlen;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_GET_PHY_STATE_IN_LEN != 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_PHY_STATE, NULL, 0,
+			  outbuf, sizeof(outbuf), &outlen);
+	if (rc)
+		return rc;
+
+	if (outlen < MC_CMD_GET_PHY_STATE_OUT_LEN)
+		return -EIO;
+	if (MCDI_DWORD(outbuf, GET_PHY_STATE_OUT_STATE) != MC_CMD_PHY_STATE_OK)
+		return -EINVAL;
+
+	return 0;
+}
+
+int efx_mcdi_port_reconfigure(struct efx_nic *efx)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	u32 caps = (efx->link_advertising[0] ?
+		    ethtool_linkset_to_mcdi_cap(efx->link_advertising) :
+		    phy_cfg->forced_cap);
+
+	caps |= ethtool_fec_caps_to_mcdi(phy_cfg->supported_cap, efx->fec_config);
+
+	return efx_mcdi_set_link(efx, caps, efx_get_mcdi_phy_flags(efx),
+				 efx->loopback_mode, 0);
+}
+
+static const char *const mcdi_sft9001_cable_diag_names[] = {
+	"cable.pairA.length",
+	"cable.pairB.length",
+	"cable.pairC.length",
+	"cable.pairD.length",
+	"cable.pairA.status",
+	"cable.pairB.status",
+	"cable.pairC.status",
+	"cable.pairD.status",
+};
+
+static int efx_mcdi_bist(struct efx_nic *efx, unsigned int bist_mode,
+			 int *results)
+{
+	unsigned int retry, i, count = 0;
+	size_t outlen;
+	u32 status;
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_START_BIST_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_POLL_BIST_OUT_SFT9001_LEN);
+	u8 *ptr;
+	int rc;
+
+	BUILD_BUG_ON(MC_CMD_START_BIST_OUT_LEN != 0);
+	MCDI_SET_DWORD(inbuf, START_BIST_IN_TYPE, bist_mode);
+	rc = efx_mcdi_rpc(efx, MC_CMD_START_BIST,
+			  inbuf, MC_CMD_START_BIST_IN_LEN, NULL, 0, NULL);
+	if (rc)
+		goto out;
+
+	/* Wait up to 10s for BIST to finish */
+	for (retry = 0; retry < 100; ++retry) {
+		BUILD_BUG_ON(MC_CMD_POLL_BIST_IN_LEN != 0);
+		rc = efx_mcdi_rpc(efx, MC_CMD_POLL_BIST, NULL, 0,
+				  outbuf, sizeof(outbuf), &outlen);
+		if (rc)
+			goto out;
+
+		status = MCDI_DWORD(outbuf, POLL_BIST_OUT_RESULT);
+		if (status != MC_CMD_POLL_BIST_RUNNING)
+			goto finished;
+
+		msleep(100);
+	}
+
+	rc = -ETIMEDOUT;
+	goto out;
+
+finished:
+	results[count++] = (status == MC_CMD_POLL_BIST_PASSED) ? 1 : -1;
+
+	/* SFT9001 specific cable diagnostics output */
+	if (efx->phy_type == PHY_TYPE_SFT9001B &&
+	    (bist_mode == MC_CMD_PHY_BIST_CABLE_SHORT ||
+	     bist_mode == MC_CMD_PHY_BIST_CABLE_LONG)) {
+		ptr = MCDI_PTR(outbuf, POLL_BIST_OUT_SFT9001_CABLE_LENGTH_A);
+		if (status == MC_CMD_POLL_BIST_PASSED &&
+		    outlen >= MC_CMD_POLL_BIST_OUT_SFT9001_LEN) {
+			for (i = 0; i < 8; i++) {
+				results[count + i] =
+					EFX_DWORD_FIELD(((efx_dword_t *)ptr)[i],
+							EFX_DWORD_0);
+			}
+		}
+		count += 8;
+	}
+	rc = count;
+
+out:
+	return rc;
+}
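
As a hedged sketch of how the results written by efx_mcdi_bist() might be
interpreted for an SFT9001B cable test: results[0] is the overall verdict
(1 pass, -1 fail) and, when the test passes, results[1..8] line up with
mcdi_sft9001_cable_diag_names[]. The function below is illustrative only.

static void example_report_cable_bist(struct efx_nic *efx, const int *results)
{
	size_t i;

	netif_info(efx, drv, efx->net_dev, "cable BIST %s\n",
		   results[0] == 1 ? "passed" : "failed");
	for (i = 0; i < ARRAY_SIZE(mcdi_sft9001_cable_diag_names); i++)
		netif_info(efx, drv, efx->net_dev, "%s = %d\n",
			   mcdi_sft9001_cable_diag_names[i], results[1 + i]);
}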
+
+int efx_mcdi_phy_run_tests(struct efx_nic *efx, int *results, unsigned int flags)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+	u32 mode;
+	int rc;
+
+	if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_BIST_LBN)) {
+		rc = efx_mcdi_bist(efx, MC_CMD_PHY_BIST, results);
+		if (rc < 0)
+			return rc;
+
+		results += rc;
+	}
+
+	/* If we support both LONG and SHORT, then run each in response to
+	 * break or not. Otherwise, run the one we support
+	 */
+	mode = 0;
+	if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_BIST_CABLE_SHORT_LBN)) {
+		if ((flags & ETH_TEST_FL_OFFLINE) &&
+		    (phy_cfg->flags &
+		     (1 << MC_CMD_GET_PHY_CFG_OUT_BIST_CABLE_LONG_LBN)))
+			mode = MC_CMD_PHY_BIST_CABLE_LONG;
+		else
+			mode = MC_CMD_PHY_BIST_CABLE_SHORT;
+	} else if (phy_cfg->flags &
+		   (1 << MC_CMD_GET_PHY_CFG_OUT_BIST_CABLE_LONG_LBN))
+		mode = MC_CMD_PHY_BIST_CABLE_LONG;
+
+	if (mode != 0) {
+		rc = efx_mcdi_bist(efx, mode, results);
+		if (rc < 0)
+			return rc;
+		results += rc;
+	}
+
+	return 0;
+}
+
+const char *efx_mcdi_phy_test_name(struct efx_nic *efx, unsigned int index)
+{
+	struct efx_mcdi_phy_data *phy_cfg = efx->phy_data;
+
+	if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_BIST_LBN)) {
+		if (index == 0)
+			return "bist";
+		--index;
+	}
+
+	if (phy_cfg->flags & ((1 << MC_CMD_GET_PHY_CFG_OUT_BIST_CABLE_SHORT_LBN) |
+			      (1 << MC_CMD_GET_PHY_CFG_OUT_BIST_CABLE_LONG_LBN))) {
+		if (index == 0)
+			return "cable";
+		--index;
+
+		if (efx->phy_type == PHY_TYPE_SFT9001B) {
+			if (index < ARRAY_SIZE(mcdi_sft9001_cable_diag_names))
+				return mcdi_sft9001_cable_diag_names[index];
+			index -= ARRAY_SIZE(mcdi_sft9001_cable_diag_names);
+		}
+	}
+
+	return NULL;
+}
+
+#define SFP_PAGE_SIZE		128
+#define SFF_DIAG_TYPE_OFFSET	92
+#define SFF_DIAG_ADDR_CHANGE	BIT(2)
+#define SFF_8079_NUM_PAGES	2
+#define SFF_8472_NUM_PAGES	4
+#define SFF_8436_NUM_PAGES	5
+#define SFF_DMT_LEVEL_OFFSET	94
+
+/**
+ * efx_mcdi_phy_get_module_eeprom_page() - Get a single page of module eeprom
+ * @efx:	NIC context
+ * @page:	EEPROM page number
+ * @data:	Destination data pointer
+ * @offset:	Offset in page to copy from in to data
+ * @space:	Space available in data
+ *
+ * Return:
+ *   >=0 - amount of data copied
+ *   <0  - error
+ */
+static int efx_mcdi_phy_get_module_eeprom_page(struct efx_nic *efx,
+					       unsigned int page,
+					       u8 *data, ssize_t offset,
+					       ssize_t space)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_MEDIA_INFO_OUT_LENMAX);
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_GET_PHY_MEDIA_INFO_IN_LEN);
+	unsigned int payload_len;
+	unsigned int to_copy;
+	size_t outlen;
+	int rc;
+
+	if (offset > SFP_PAGE_SIZE)
+		return -EINVAL;
+
+	to_copy = min(space, SFP_PAGE_SIZE - offset);
+
+	MCDI_SET_DWORD(inbuf, GET_PHY_MEDIA_INFO_IN_PAGE, page);
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_GET_PHY_MEDIA_INFO,
+				inbuf, sizeof(inbuf),
+				outbuf, sizeof(outbuf),
+				&outlen);
+
+	if (rc)
+		return rc;
+
+	if (outlen < (MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATA_OFST +
+			SFP_PAGE_SIZE))
+		return -EIO;
+
+	payload_len = MCDI_DWORD(outbuf, GET_PHY_MEDIA_INFO_OUT_DATALEN);
+	if (payload_len != SFP_PAGE_SIZE)
+		return -EIO;
+
+	memcpy(data, MCDI_PTR(outbuf, GET_PHY_MEDIA_INFO_OUT_DATA) + offset,
+	       to_copy);
+
+	return to_copy;
+}
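
A hedged usage sketch of the helper above: the copy is clipped to the
128-byte page, so requesting more space than remains past the offset returns
at most SFP_PAGE_SIZE - offset bytes. The wrapper name is illustrative.

static int example_read_upper_half_of_page0(struct efx_nic *efx, u8 *buf)
{
	/* offset 64 into page 0 with 128 bytes of space: at most 64 copied */
	return efx_mcdi_phy_get_module_eeprom_page(efx, 0, buf, 64, 128);
}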
+
+static int efx_mcdi_phy_get_module_eeprom_byte(struct efx_nic *efx,
+					       unsigned int page,
+					       u8 byte)
+{
+	u8 data;
+	int rc;
+
+	rc = efx_mcdi_phy_get_module_eeprom_page(efx, page, &data, byte, 1);
+	if (rc == 1)
+		return data;
+
+	return rc;
+}
+
+static int efx_mcdi_phy_diag_type(struct efx_nic *efx)
+{
+	/* Page zero of the EEPROM includes the diagnostic type at byte 92. */
+	return efx_mcdi_phy_get_module_eeprom_byte(efx, 0,
+						   SFF_DIAG_TYPE_OFFSET);
+}
+
+static int efx_mcdi_phy_sff_8472_level(struct efx_nic *efx)
+{
+	/* Page zero of the EEPROM includes the DMT level at byte 94. */
+	return efx_mcdi_phy_get_module_eeprom_byte(efx, 0,
+						   SFF_DMT_LEVEL_OFFSET);
+}
+
+static u32 efx_mcdi_phy_module_type(struct efx_nic *efx)
+{
+	struct efx_mcdi_phy_data *phy_data = efx->phy_data;
+
+	if (phy_data->media != MC_CMD_MEDIA_QSFP_PLUS)
+		return phy_data->media;
+
+	/* A QSFP+ NIC may actually have an SFP+ module attached.
+	 * The ID is page 0, byte 0.
+	 */
+	switch (efx_mcdi_phy_get_module_eeprom_byte(efx, 0, 0)) {
+	case 0x3:
+		return MC_CMD_MEDIA_SFP_PLUS;
+	case 0xc:
+	case 0xd:
+		return MC_CMD_MEDIA_QSFP_PLUS;
+	default:
+		return 0;
+	}
+}
+
+int efx_mcdi_phy_get_module_eeprom(struct efx_nic *efx, struct ethtool_eeprom *ee, u8 *data)
+{
+	int rc;
+	ssize_t space_remaining = ee->len;
+	unsigned int page_off;
+	bool ignore_missing;
+	int num_pages;
+	int page;
+
+	switch (efx_mcdi_phy_module_type(efx)) {
+	case MC_CMD_MEDIA_SFP_PLUS:
+		num_pages = efx_mcdi_phy_sff_8472_level(efx) > 0 ?
+				SFF_8472_NUM_PAGES : SFF_8079_NUM_PAGES;
+		page = 0;
+		ignore_missing = false;
+		break;
+	case MC_CMD_MEDIA_QSFP_PLUS:
+		num_pages = SFF_8436_NUM_PAGES;
+		page = -1; /* We obtain the lower page by asking for -1. */
+		ignore_missing = true; /* Ignore missing pages after page 0. */
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	page_off = ee->offset % SFP_PAGE_SIZE;
+	page += ee->offset / SFP_PAGE_SIZE;
+
+	while (space_remaining && (page < num_pages)) {
+		rc = efx_mcdi_phy_get_module_eeprom_page(efx, page,
+							 data, page_off,
+							 space_remaining);
+
+		if (rc > 0) {
+			space_remaining -= rc;
+			data += rc;
+			page_off = 0;
+			page++;
+		} else if (rc == 0) {
+			space_remaining = 0;
+		} else if (ignore_missing && (page > 0)) {
+			int intended_size = SFP_PAGE_SIZE - page_off;
+
+			space_remaining -= intended_size;
+			if (space_remaining < 0) {
+				space_remaining = 0;
+			} else {
+				memset(data, 0, intended_size);
+				data += intended_size;
+				page_off = 0;
+				page++;
+				rc = 0;
+			}
+		} else {
+			return rc;
+		}
+	}
+
+	return 0;
+}
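
A worked example of the paging arithmetic above, for illustration only:

/*
 *   ee->offset = 160, SFP_PAGE_SIZE = 128:
 *     page_off = 160 % 128 = 32
 *     page    += 160 / 128 = 1
 *
 * For SFP+ modules (page starts at 0) the read begins 32 bytes into page 1;
 * for QSFP+ (page starts at -1, the lower page) the same ethtool offset
 * lands 32 bytes into page 0.
 */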
+
+int efx_mcdi_phy_get_module_info(struct efx_nic *efx, struct ethtool_modinfo *modinfo)
+{
+	int sff_8472_level;
+	int diag_type;
+
+	switch (efx_mcdi_phy_module_type(efx)) {
+	case MC_CMD_MEDIA_SFP_PLUS:
+		sff_8472_level = efx_mcdi_phy_sff_8472_level(efx);
+
+		/* If we can't read the diagnostics level we have none. */
+		if (sff_8472_level < 0)
+			return -EOPNOTSUPP;
+
+		/* Check if this module requires the (unsupported) address
+		 * change operation.
+		 */
+		diag_type = efx_mcdi_phy_diag_type(efx);
+
+		if (sff_8472_level == 0 ||
+		    (diag_type & SFF_DIAG_ADDR_CHANGE)) {
+			modinfo->type = ETH_MODULE_SFF_8079;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
+		} else {
+			modinfo->type = ETH_MODULE_SFF_8472;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		}
+		break;
+
+	case MC_CMD_MEDIA_QSFP_PLUS:
+		modinfo->type = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static unsigned int efx_calc_mac_mtu(struct efx_nic *efx)
+{
+	return EFX_MAX_FRAME_LEN(efx->net_dev->mtu);
+}
+
+int efx_mcdi_set_mac(struct efx_nic *efx)
+{
+	u32 fcntl;
+	MCDI_DECLARE_BUF(cmdbytes, MC_CMD_SET_MAC_IN_LEN);
+
+	BUILD_BUG_ON(MC_CMD_SET_MAC_OUT_LEN != 0);
+
+	/* This has no effect on EF10 */
+	ether_addr_copy(MCDI_PTR(cmdbytes, SET_MAC_IN_ADDR),
+			efx->net_dev->dev_addr);
+
+	MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_MTU, efx_calc_mac_mtu(efx));
+	MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_DRAIN, 0);
+
+	/* Set simple MAC filter for Siena */
+	MCDI_POPULATE_DWORD_1(cmdbytes, SET_MAC_IN_REJECT,
+			      SET_MAC_IN_REJECT_UNCST, efx->unicast_filter);
+
+	MCDI_POPULATE_DWORD_1(cmdbytes, SET_MAC_IN_FLAGS,
+			      SET_MAC_IN_FLAG_INCLUDE_FCS,
+			      !!(efx->net_dev->features & NETIF_F_RXFCS));
+
+	switch (efx->wanted_fc) {
+	case EFX_FC_RX | EFX_FC_TX:
+		fcntl = MC_CMD_FCNTL_BIDIR;
+		break;
+	case EFX_FC_RX:
+		fcntl = MC_CMD_FCNTL_RESPOND;
+		break;
+	default:
+		fcntl = MC_CMD_FCNTL_OFF;
+		break;
+	}
+	if (efx->wanted_fc & EFX_FC_AUTO)
+		fcntl = MC_CMD_FCNTL_AUTO;
+	if (efx->fc_disable)
+		fcntl = MC_CMD_FCNTL_OFF;
+
+	MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_FCNTL, fcntl);
+
+	return efx_mcdi_rpc(efx, MC_CMD_SET_MAC, cmdbytes, sizeof(cmdbytes),
+			    NULL, 0, NULL);
+}
+
+int efx_mcdi_set_mtu(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_SET_MAC_EXT_IN_LEN);
+
+	BUILD_BUG_ON(MC_CMD_SET_MAC_OUT_LEN != 0);
+
+	MCDI_SET_DWORD(inbuf, SET_MAC_EXT_IN_MTU, efx_calc_mac_mtu(efx));
+
+	MCDI_POPULATE_DWORD_1(inbuf, SET_MAC_EXT_IN_CONTROL,
+			      SET_MAC_EXT_IN_CFG_MTU, 1);
+
+	return efx_mcdi_rpc(efx, MC_CMD_SET_MAC, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+enum efx_stats_action {
+	EFX_STATS_ENABLE,
+	EFX_STATS_DISABLE,
+	EFX_STATS_PULL,
+};
+
+static int efx_mcdi_mac_stats(struct efx_nic *efx,
+			      enum efx_stats_action action, int clear)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_MAC_STATS_IN_LEN);
+	int rc;
+	int change = action == EFX_STATS_PULL ? 0 : 1;
+	int enable = action == EFX_STATS_ENABLE ? 1 : 0;
+	int period = action == EFX_STATS_ENABLE ? 1000 : 0;
+	dma_addr_t dma_addr = efx->stats_buffer.dma_addr;
+	u32 dma_len = action != EFX_STATS_DISABLE ?
+		efx->num_mac_stats * sizeof(u64) : 0;
+
+	BUILD_BUG_ON(MC_CMD_MAC_STATS_OUT_DMA_LEN != 0);
+
+	MCDI_SET_QWORD(inbuf, MAC_STATS_IN_DMA_ADDR, dma_addr);
+	MCDI_POPULATE_DWORD_7(inbuf, MAC_STATS_IN_CMD,
+			      MAC_STATS_IN_DMA, !!enable,
+			      MAC_STATS_IN_CLEAR, clear,
+			      MAC_STATS_IN_PERIODIC_CHANGE, change,
+			      MAC_STATS_IN_PERIODIC_ENABLE, enable,
+			      MAC_STATS_IN_PERIODIC_CLEAR, 0,
+			      MAC_STATS_IN_PERIODIC_NOEVENT, 1,
+			      MAC_STATS_IN_PERIOD_MS, period);
+	MCDI_SET_DWORD(inbuf, MAC_STATS_IN_DMA_LEN, dma_len);
+
+	if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0)
+		MCDI_SET_DWORD(inbuf, MAC_STATS_IN_PORT_ID, efx->vport_id);
+
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_MAC_STATS, inbuf, sizeof(inbuf),
+				NULL, 0, NULL);
+	/* Expect ENOENT if DMA queues have not been set up */
+	if (rc && (rc != -ENOENT || atomic_read(&efx->active_queues)))
+		efx_mcdi_display_error(efx, MC_CMD_MAC_STATS, sizeof(inbuf),
+				       NULL, 0, rc);
+	return rc;
+}
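
A summary of how the three actions translate into the MC_CMD_MAC_STATS
arguments set up above (illustrative only; n_stats is efx->num_mac_stats,
CHANGE/ENABLE are the MAC_STATS_IN_PERIODIC_CHANGE/ENABLE fields):

/*
 *   action              DMA  CHANGE  ENABLE  PERIOD_MS  DMA_LEN
 *   EFX_STATS_ENABLE     1     1       1       1000     n_stats * 8
 *   EFX_STATS_DISABLE    0     1       0          0     0
 *   EFX_STATS_PULL       0     0       0          0     n_stats * 8
 */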
+
+void efx_mcdi_mac_start_stats(struct efx_nic *efx)
+{
+	__le64 *dma_stats = efx->stats_buffer.addr;
+
+	dma_stats[efx->num_mac_stats - 1] = EFX_MC_STATS_GENERATION_INVALID;
+
+	efx_mcdi_mac_stats(efx, EFX_STATS_ENABLE, 0);
+}
+
+void efx_mcdi_mac_stop_stats(struct efx_nic *efx)
+{
+	efx_mcdi_mac_stats(efx, EFX_STATS_DISABLE, 0);
+}
+
+#define EFX_MAC_STATS_WAIT_US 100
+#define EFX_MAC_STATS_WAIT_ATTEMPTS 10
+
+void efx_mcdi_mac_pull_stats(struct efx_nic *efx)
+{
+	__le64 *dma_stats = efx->stats_buffer.addr;
+	int attempts = EFX_MAC_STATS_WAIT_ATTEMPTS;
+
+	dma_stats[efx->num_mac_stats - 1] = EFX_MC_STATS_GENERATION_INVALID;
+	efx_mcdi_mac_stats(efx, EFX_STATS_PULL, 0);
+
+	while (dma_stats[efx->num_mac_stats - 1] ==
+				EFX_MC_STATS_GENERATION_INVALID &&
+			attempts-- != 0)
+		udelay(EFX_MAC_STATS_WAIT_US);
+}
+
+int efx_mcdi_mac_init_stats(struct efx_nic *efx)
+{
+	int rc;
+
+	if (!efx->num_mac_stats)
+		return 0;
+
+	/* Allocate buffer for stats */
+	rc = efx_nic_alloc_buffer(efx, &efx->stats_buffer,
+				  efx->num_mac_stats * sizeof(u64), GFP_KERNEL);
+	if (rc) {
+		netif_warn(efx, probe, efx->net_dev,
+			   "failed to allocate DMA buffer: %d\n", rc);
+		return rc;
+	}
+
+	netif_dbg(efx, probe, efx->net_dev,
+		  "stats buffer at %llx (virt %p phys %llx)\n",
+		  (u64) efx->stats_buffer.dma_addr,
+		  efx->stats_buffer.addr,
+		  (u64) virt_to_phys(efx->stats_buffer.addr));
+
+	return 0;
+}
+
+void efx_mcdi_mac_fini_stats(struct efx_nic *efx)
+{
+	efx_nic_free_buffer(efx, &efx->stats_buffer);
+}
+
+/* Get physical port number (EF10 only; on Siena it is the same as the PF number) */
+int efx_mcdi_port_get_number(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PORT_ASSIGNMENT_OUT_LEN);
+	int rc;
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_GET_PORT_ASSIGNMENT, NULL, 0,
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc)
+		return rc;
+
+	return MCDI_DWORD(outbuf, GET_PORT_ASSIGNMENT_OUT_PORT);
+}
+
+static unsigned int efx_mcdi_event_link_speed[] = {
+	[MCDI_EVENT_LINKCHANGE_SPEED_100M] = 100,
+	[MCDI_EVENT_LINKCHANGE_SPEED_1G] = 1000,
+	[MCDI_EVENT_LINKCHANGE_SPEED_10G] = 10000,
+	[MCDI_EVENT_LINKCHANGE_SPEED_40G] = 40000,
+	[MCDI_EVENT_LINKCHANGE_SPEED_25G] = 25000,
+	[MCDI_EVENT_LINKCHANGE_SPEED_50G] = 50000,
+	[MCDI_EVENT_LINKCHANGE_SPEED_100G] = 100000,
+};
+
+void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev)
+{
+	u32 flags, fcntl, speed, lpa;
+
+	speed = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_SPEED);
+	EFX_WARN_ON_PARANOID(speed >= ARRAY_SIZE(efx_mcdi_event_link_speed));
+	speed = efx_mcdi_event_link_speed[speed];
+
+	flags = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_LINK_FLAGS);
+	fcntl = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_FCNTL);
+	lpa = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_LP_CAP);
+
+	/* efx->link_state is only modified by efx_mcdi_phy_get_link(),
+	 * which is only run after flushing the event queues. Therefore, it
+	 * is safe to modify the link state outside of the mac_lock here.
+	 */
+	efx_mcdi_phy_decode_link(efx, &efx->link_state, speed, flags, fcntl);
+
+	efx_mcdi_phy_check_fcntl(efx, lpa);
+
+	efx_link_status_changed(efx);
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mcdi_port_common.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2018 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+#ifndef EFX_MCDI_PORT_COMMON_H
+#define EFX_MCDI_PORT_COMMON_H
+
+#include "net_driver.h"
+#include "mcdi.h"
+#include "mcdi_pcol.h"
+
+struct efx_mcdi_phy_data {
+	u32 flags;
+	u32 type;
+	u32 supported_cap;
+	u32 channel;
+	u32 port;
+	u32 stats_mask;
+	u8 name[20];
+	u32 media;
+	u32 mmd_mask;
+	u8 revision[20];
+	u32 forced_cap;
+};
+
+int efx_mcdi_get_phy_cfg(struct efx_nic *efx, struct efx_mcdi_phy_data *cfg);
+void efx_link_set_advertising(struct efx_nic *efx,
+			      const unsigned long *advertising);
+int efx_mcdi_set_link(struct efx_nic *efx, u32 capabilities,
+		      u32 flags, u32 loopback_mode, u32 loopback_speed);
+int efx_mcdi_loopback_modes(struct efx_nic *efx, u64 *loopback_modes);
+void mcdi_to_ethtool_linkset(u32 media, u32 cap, unsigned long *linkset);
+u32 ethtool_linkset_to_mcdi_cap(const unsigned long *linkset);
+u32 efx_get_mcdi_phy_flags(struct efx_nic *efx);
+u8 mcdi_to_ethtool_media(u32 media);
+void efx_mcdi_phy_decode_link(struct efx_nic *efx,
+			      struct efx_link_state *link_state,
+			      u32 speed, u32 flags, u32 fcntl);
+u32 ethtool_fec_caps_to_mcdi(u32 supported_cap, u32 ethtool_cap);
+u32 mcdi_fec_caps_to_ethtool(u32 caps, bool is_25g);
+void efx_mcdi_phy_check_fcntl(struct efx_nic *efx, u32 lpa);
+bool efx_mcdi_phy_poll(struct efx_nic *efx);
+int efx_mcdi_phy_probe(struct efx_nic *efx);
+void efx_mcdi_phy_remove(struct efx_nic *efx);
+void efx_mcdi_phy_get_link_ksettings(struct efx_nic *efx, struct ethtool_link_ksettings *cmd);
+int efx_mcdi_phy_set_link_ksettings(struct efx_nic *efx, const struct ethtool_link_ksettings *cmd);
+int efx_mcdi_phy_get_fecparam(struct efx_nic *efx, struct ethtool_fecparam *fec);
+int efx_mcdi_phy_set_fecparam(struct efx_nic *efx, const struct ethtool_fecparam *fec);
+int efx_mcdi_phy_test_alive(struct efx_nic *efx);
+int efx_mcdi_port_reconfigure(struct efx_nic *efx);
+int efx_mcdi_phy_run_tests(struct efx_nic *efx, int *results, unsigned int flags);
+const char *efx_mcdi_phy_test_name(struct efx_nic *efx, unsigned int index);
+int efx_mcdi_phy_get_module_eeprom(struct efx_nic *efx, struct ethtool_eeprom *ee, u8 *data);
+int efx_mcdi_phy_get_module_info(struct efx_nic *efx, struct ethtool_modinfo *modinfo);
+int efx_mcdi_set_mac(struct efx_nic *efx);
+int efx_mcdi_set_mtu(struct efx_nic *efx);
+int efx_mcdi_mac_init_stats(struct efx_nic *efx);
+void efx_mcdi_mac_fini_stats(struct efx_nic *efx);
+int efx_mcdi_port_get_number(struct efx_nic *efx);
+void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev);
+
+#endif
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/mtd.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2013 Solarflare Communications Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+
+#include "net_driver.h"
+#include "efx.h"
+
+#define to_efx_mtd_partition(mtd)				\
+	container_of(mtd, struct efx_mtd_partition, mtd)
+
+/* MTD interface */
+
+static int efx_mtd_erase(struct mtd_info *mtd, struct erase_info *erase)
+{
+	struct efx_nic *efx = mtd->priv;
+
+	return efx->type->mtd_erase(mtd, erase->addr, erase->len);
+}
+
+static void efx_mtd_sync(struct mtd_info *mtd)
+{
+	struct efx_mtd_partition *part = to_efx_mtd_partition(mtd);
+	struct efx_nic *efx = mtd->priv;
+	int rc;
+
+	rc = efx->type->mtd_sync(mtd);
+	if (rc)
+		pr_err("%s: %s sync failed (%d)\n",
+		       part->name, part->dev_type_name, rc);
+}
+
+static void efx_mtd_remove_partition(struct efx_mtd_partition *part)
+{
+	int rc;
+
+	for (;;) {
+		rc = mtd_device_unregister(&part->mtd);
+		if (rc != -EBUSY)
+			break;
+		ssleep(1);
+	}
+	WARN_ON(rc);
+	list_del(&part->node);
+}
+
+int efx_mtd_add(struct efx_nic *efx, struct efx_mtd_partition *parts,
+		size_t n_parts, size_t sizeof_part)
+{
+	struct efx_mtd_partition *part;
+	size_t i;
+
+	for (i = 0; i < n_parts; i++) {
+		part = (struct efx_mtd_partition *)((char *)parts +
+						    i * sizeof_part);
+
+		part->mtd.writesize = 1;
+
+		if (!(part->mtd.flags & MTD_NO_ERASE))
+			part->mtd.flags |= MTD_WRITEABLE;
+
+		part->mtd.owner = THIS_MODULE;
+		part->mtd.priv = efx;
+		part->mtd.name = part->name;
+		part->mtd._erase = efx_mtd_erase;
+		part->mtd._read = efx->type->mtd_read;
+		part->mtd._write = efx->type->mtd_write;
+		part->mtd._sync = efx_mtd_sync;
+
+		efx->type->mtd_rename(part);
+
+		if (mtd_device_register(&part->mtd, NULL, 0))
+			goto fail;
+
+		/* Add to list in order - efx_mtd_remove() depends on this */
+		list_add_tail(&part->node, &efx->mtd_list);
+	}
+
+	return 0;
+
+fail:
+	while (i--) {
+		part = (struct efx_mtd_partition *)((char *)parts +
+						    i * sizeof_part);
+		efx_mtd_remove_partition(part);
+	}
+	/* Failure is unlikely here, but probably means we're out of memory */
+	return -ENOMEM;
+}
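
The (char *) stride arithmetic above exists because callers embed struct
efx_mtd_partition at the start of a larger, type-specific partition structure
and pass that structure's size as sizeof_part. A hedged sketch of the calling
convention follows, with hypothetical struct and field names.

struct example_mcdi_mtd_partition {
	struct efx_mtd_partition common;	/* must be the first member */
	unsigned int nvram_type;		/* per-type extra state */
};

static int example_register_parts(struct efx_nic *efx,
				  struct example_mcdi_mtd_partition *parts,
				  size_t n_parts)
{
	return efx_mtd_add(efx, &parts[0].common, n_parts, sizeof(parts[0]));
}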
+
+void efx_mtd_remove(struct efx_nic *efx)
+{
+	struct efx_mtd_partition *parts, *part, *next;
+
+	WARN_ON(efx_dev_registered(efx));
+
+	if (list_empty(&efx->mtd_list))
+		return;
+
+	parts = list_first_entry(&efx->mtd_list, struct efx_mtd_partition,
+				 node);
+
+	list_for_each_entry_safe(part, next, &efx->mtd_list, node)
+		efx_mtd_remove_partition(part);
+
+	kfree(parts);
+}
+
+void efx_mtd_rename(struct efx_nic *efx)
+{
+	struct efx_mtd_partition *part;
+
+	ASSERT_RTNL();
+
+	list_for_each_entry(part, &efx->mtd_list, node)
+		efx->type->mtd_rename(part);
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/net_driver.h
@@ -0,0 +1,1716 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2005-2013 Solarflare Communications Inc.
+ */
+
+/* Common definitions for all Efx net driver code */
+
+#ifndef EFX_NET_DRIVER_H
+#define EFX_NET_DRIVER_H
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_vlan.h>
+#include <linux/timer.h>
+#include <linux/mdio.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/mtd/mtd.h>
+#include <net/busy_poll.h>
+#include <net/xdp.h>
+
+#include "enum.h"
+#include "bitfield.h"
+#include "filter.h"
+
+/**************************************************************************
+ *
+ * Build definitions
+ *
+ **************************************************************************/
+
+#ifdef DEBUG
+#define EFX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x)
+#define EFX_WARN_ON_PARANOID(x) WARN_ON(x)
+#else
+#define EFX_WARN_ON_ONCE_PARANOID(x) do {} while (0)
+#define EFX_WARN_ON_PARANOID(x) do {} while (0)
+#endif
+
+/**************************************************************************
+ *
+ * Efx data structures
+ *
+ **************************************************************************/
+
+#define EFX_MAX_CHANNELS 32U
+#define EFX_MAX_RX_QUEUES EFX_MAX_CHANNELS
+#define EFX_EXTRA_CHANNEL_IOV	0
+#define EFX_EXTRA_CHANNEL_PTP	1
+#define EFX_MAX_EXTRA_CHANNELS	2U
+
+/* Checksum generation is a per-queue option in hardware, so each
+ * queue visible to the networking core is backed by two hardware TX
+ * queues. */
+#define EFX_MAX_TX_TC		2
+#define EFX_MAX_CORE_TX_QUEUES	(EFX_MAX_TX_TC * EFX_MAX_CHANNELS)
+#define EFX_TXQ_TYPE_OUTER_CSUM	1	/* Outer checksum offload */
+#define EFX_TXQ_TYPE_INNER_CSUM	2	/* Inner checksum offload */
+#define EFX_TXQ_TYPE_HIGHPRI	4	/* High-priority (for TC) */
+#define EFX_TXQ_TYPES		8
+/* HIGHPRI is Siena-only, and INNER_CSUM is EF10, so no need for both */
+#define EFX_MAX_TXQ_PER_CHANNEL	4
+#define EFX_MAX_TX_QUEUES	(EFX_MAX_TXQ_PER_CHANNEL * EFX_MAX_CHANNELS)
+
+/* Maximum possible MTU the driver supports */
+#define EFX_MAX_MTU (9 * 1024)
+
+/* Minimum MTU, from RFC791 (IP) */
+#define EFX_MIN_MTU 68
+
+/* Maximum total header length for TSOv2 */
+#define EFX_TSO2_MAX_HDRLEN	208
+
+/* Size of an RX scatter buffer.  Small enough to pack 2 into a 4K page,
+ * and should be a multiple of the cache line size.
+ */
+#define EFX_RX_USR_BUF_SIZE	(2048 - 256)
+
+/* If possible, we should ensure cache line alignment at start and end
+ * of every buffer.  Otherwise, we just need to ensure 4-byte
+ * alignment of the network header.
+ */
+#if NET_IP_ALIGN == 0
+#define EFX_RX_BUF_ALIGNMENT	L1_CACHE_BYTES
+#else
+#define EFX_RX_BUF_ALIGNMENT	4
+#endif
+
+/* Non-standard XDP_PACKET_HEADROOM and tailroom to satisfy XDP_REDIRECT and
+ * still fit two standard MTU size packets into a single 4K page.
+ */
+#define EFX_XDP_HEADROOM	128
+#define EFX_XDP_TAILROOM	SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
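
Worked numbers for the sizing constants above, for illustration:

/*
 *   EFX_RX_USR_BUF_SIZE = 2048 - 256 = 1792 bytes
 *     - a multiple of common cache-line sizes (1792 = 28 * 64 = 14 * 128)
 *     - two such buffers (3584 bytes) fit in a 4096-byte page with 512 bytes
 *       left over for per-page state and per-buffer headroom
 *
 *   EFX_XDP_HEADROOM = 128, i.e. half of the usual XDP_PACKET_HEADROOM (256);
 *   the reduced headroom (plus the tailroom above) is what still allows two
 *   standard-MTU packets per 4K page.
 */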
+
+/* Forward declare Precision Time Protocol (PTP) support structure. */
+struct efx_ptp_data;
+struct hwtstamp_config;
+
+struct efx_self_tests;
+
+/**
+ * struct efx_buffer - A general-purpose DMA buffer
+ * @addr: host base address of the buffer
+ * @dma_addr: DMA base address of the buffer
+ * @len: Buffer length, in bytes
+ *
+ * The NIC uses these buffers for its interrupt status registers and
+ * MAC stats dumps.
+ */
+struct efx_buffer {
+	void *addr;
+	dma_addr_t dma_addr;
+	unsigned int len;
+};
+
+/**
+ * struct efx_special_buffer - DMA buffer entered into buffer table
+ * @buf: Standard &struct efx_buffer
+ * @index: Buffer index within controller's buffer table
+ * @entries: Number of buffer table entries
+ *
+ * The NIC has a buffer table that maps buffers of size %EFX_BUF_SIZE.
+ * Event and descriptor rings are addressed via one or more buffer
+ * table entries (and so can be physically non-contiguous, although we
+ * currently do not take advantage of that).  On Falcon and Siena we
+ * have to take care of allocating and initialising the entries
+ * ourselves.  On later hardware this is managed by the firmware and
+ * @index and @entries are left as 0.
+ */
+struct efx_special_buffer {
+	struct efx_buffer buf;
+	unsigned int index;
+	unsigned int entries;
+};
+
+/**
+ * struct efx_tx_buffer - buffer state for a TX descriptor
+ * @skb: When @flags & %EFX_TX_BUF_SKB, the associated socket buffer to be
+ *	freed when descriptor completes
+ * @xdpf: When @flags & %EFX_TX_BUF_XDP, the XDP frame information; its @data
+ *	member is the associated buffer to drop a page reference on.
+ * @option: When @flags & %EFX_TX_BUF_OPTION, an EF10-specific option
+ *	descriptor.
+ * @dma_addr: DMA address of the fragment.
+ * @flags: Flags for allocation and DMA mapping type
+ * @len: Length of this fragment.
+ *	This field is zero when the queue slot is empty.
+ * @unmap_len: Length of this fragment to unmap
+ * @dma_offset: Offset of @dma_addr from the address of the backing DMA mapping.
+ * Only valid if @unmap_len != 0.
+ */
+struct efx_tx_buffer {
+	union {
+		const struct sk_buff *skb;
+		struct xdp_frame *xdpf;
+	};
+	union {
+		efx_qword_t option;    /* EF10 */
+		dma_addr_t dma_addr;
+	};
+	unsigned short flags;
+	unsigned short len;
+	unsigned short unmap_len;
+	unsigned short dma_offset;
+};
+#define EFX_TX_BUF_CONT		1	/* not last descriptor of packet */
+#define EFX_TX_BUF_SKB		2	/* buffer is last part of skb */
+#define EFX_TX_BUF_MAP_SINGLE	8	/* buffer was mapped with dma_map_single() */
+#define EFX_TX_BUF_OPTION	0x10	/* empty buffer for option descriptor */
+#define EFX_TX_BUF_XDP		0x20	/* buffer was sent with XDP */
+#define EFX_TX_BUF_TSO_V3	0x40	/* empty buffer for a TSO_V3 descriptor */
+
+/**
+ * struct efx_tx_queue - An Efx TX queue
+ *
+ * This is a ring buffer of TX fragments.
+ * Since the TX completion path always executes on the same
+ * CPU and the xmit path can operate on different CPUs,
+ * performance is increased by ensuring that the completion
+ * path and the xmit path operate on different cache lines.
+ * This is particularly important if the xmit path is always
+ * executing on one CPU which is different from the completion
+ * path.  There is also a cache line for members which are
+ * read but not written on the fast path.
+ *
+ * @efx: The associated Efx NIC
+ * @queue: DMA queue number
+ * @label: Label for TX completion events.
+ *	Is our index within @channel->tx_queue array.
+ * @type: configuration type of this TX queue.  A bitmask of %EFX_TXQ_TYPE_* flags.
+ * @tso_version: Version of TSO in use for this queue.
+ * @tso_encap: Is encapsulated TSO supported? Supported in TSOv2 on 8000 series.
+ * @channel: The associated channel
+ * @core_txq: The networking core TX queue structure
+ * @buffer: The software buffer ring
+ * @cb_page: Array of pages of copy buffers.  Carved up according to
+ *	%EFX_TX_CB_ORDER into %EFX_TX_CB_SIZE-sized chunks.
+ * @txd: The hardware descriptor ring
+ * @ptr_mask: The size of the ring minus 1.
+ * @piobuf: PIO buffer region for this TX queue (shared with its partner).
+ *	Size of the region is efx_piobuf_size.
+ * @piobuf_offset: Buffer offset to be specified in PIO descriptors
+ * @initialised: Has hardware queue been initialised?
+ * @timestamping: Is timestamping enabled for this channel?
+ * @xdp_tx: Is this an XDP tx queue?
+ * @read_count: Current read pointer.
+ *	This is the number of buffers that have been removed from both rings.
+ * @old_write_count: The value of @write_count when last checked.
+ *	This is here for performance reasons.  The xmit path will
+ *	only get the up-to-date value of @write_count if this
+ *	variable indicates that the queue is empty.  This is to
+ *	avoid cache-line ping-pong between the xmit path and the
+ *	completion path.
+ * @merge_events: Number of TX merged completion events
+ * @completed_timestamp_major: Top part of the most recent tx timestamp.
+ * @completed_timestamp_minor: Low part of the most recent tx timestamp.
+ * @insert_count: Current insert pointer
+ *	This is the number of buffers that have been added to the
+ *	software ring.
+ * @write_count: Current write pointer
+ *	This is the number of buffers that have been added to the
+ *	hardware ring.
+ * @packet_write_count: Completable write pointer
+ *	This is the write pointer of the last packet written.
+ *	Normally this will equal @write_count, but as option descriptors
+ *	don't produce completion events, they won't update this.
+ *	Filled in iff @efx->type->option_descriptors; only used for PIO.
+ *	Thus, this is written and used on EF10, and neither on farch.
+ * @old_read_count: The value of read_count when last checked.
+ *	This is here for performance reasons.  The xmit path will
+ *	only get the up-to-date value of read_count if this
+ *	variable indicates that the queue is full.  This is to
+ *	avoid cache-line ping-pong between the xmit path and the
+ *	completion path.
+ * @tso_bursts: Number of times TSO xmit invoked by kernel
+ * @tso_long_headers: Number of packets with headers too long for standard
+ *	blocks
+ * @tso_packets: Number of packets via the TSO xmit path
+ * @tso_fallbacks: Number of times TSO fallback used
+ * @pushes: Number of times the TX push feature has been used
+ * @pio_packets: Number of times the TX PIO feature has been used
+ * @xmit_pending: Are any packets waiting to be pushed to the NIC
+ * @cb_packets: Number of times the TX copybreak feature has been used
+ * @notify_count: Count of notified descriptors to the NIC
+ * @empty_read_count: If the completion path has seen the queue as empty
+ *	and the transmission path has not yet checked this, the value of
+ *	@read_count bitwise-added to %EFX_EMPTY_COUNT_VALID; otherwise 0.
+ */
+struct efx_tx_queue {
+	/* Members which don't change on the fast path */
+	struct efx_nic *efx ____cacheline_aligned_in_smp;
+	unsigned int queue;
+	unsigned int label;
+	unsigned int type;
+	unsigned int tso_version;
+	bool tso_encap;
+	struct efx_channel *channel;
+	struct netdev_queue *core_txq;
+	struct efx_tx_buffer *buffer;
+	struct efx_buffer *cb_page;
+	struct efx_special_buffer txd;
+	unsigned int ptr_mask;
+	void __iomem *piobuf;
+	unsigned int piobuf_offset;
+	bool initialised;
+	bool timestamping;
+	bool xdp_tx;
+
+	/* Members used mainly on the completion path */
+	unsigned int read_count ____cacheline_aligned_in_smp;
+	unsigned int old_write_count;
+	unsigned int merge_events;
+	unsigned int bytes_compl;
+	unsigned int pkts_compl;
+	u32 completed_timestamp_major;
+	u32 completed_timestamp_minor;
+
+	/* Members used only on the xmit path */
+	unsigned int insert_count ____cacheline_aligned_in_smp;
+	unsigned int write_count;
+	unsigned int packet_write_count;
+	unsigned int old_read_count;
+	unsigned int tso_bursts;
+	unsigned int tso_long_headers;
+	unsigned int tso_packets;
+	unsigned int tso_fallbacks;
+	unsigned int pushes;
+	unsigned int pio_packets;
+	bool xmit_pending;
+	unsigned int cb_packets;
+	unsigned int notify_count;
+	/* Statistics to supplement MAC stats */
+	unsigned long tx_packets;
+
+	/* Members shared between paths and sometimes updated */
+	unsigned int empty_read_count ____cacheline_aligned_in_smp;
+#define EFX_EMPTY_COUNT_VALID 0x80000000
+	atomic_t flush_outstanding;
+};
+
+#define EFX_TX_CB_ORDER	7
+#define EFX_TX_CB_SIZE	((1 << EFX_TX_CB_ORDER) - NET_IP_ALIGN)
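
A worked instance of the macro above, for illustration:

/*
 * With EFX_TX_CB_ORDER == 7, each copy-break buffer is
 * (1 << 7) - NET_IP_ALIGN bytes: 128 where NET_IP_ALIGN == 0 (e.g. x86),
 * 126 where NET_IP_ALIGN == 2.
 */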
+
+/**
+ * struct efx_rx_buffer - An Efx RX data buffer
+ * @dma_addr: DMA base address of the buffer
+ * @page: The associated page buffer.
+ *	Will be %NULL if the buffer slot is currently free.
+ * @page_offset: If pending: offset in @page of DMA base address.
+ *	If completed: offset in @page of Ethernet header.
+ * @len: If pending: length for DMA descriptor.
+ *	If completed: received length, excluding hash prefix.
+ * @flags: Flags for buffer and packet state.  These are only set on the
+ *	first buffer of a scattered packet.
+ */
+struct efx_rx_buffer {
+	dma_addr_t dma_addr;
+	struct page *page;
+	u16 page_offset;
+	u16 len;
+	u16 flags;
+};
+#define EFX_RX_BUF_LAST_IN_PAGE	0x0001
+#define EFX_RX_PKT_CSUMMED	0x0002
+#define EFX_RX_PKT_DISCARD	0x0004
+#define EFX_RX_PKT_TCP		0x0040
+#define EFX_RX_PKT_PREFIX_LEN	0x0080	/* length is in prefix only */
+#define EFX_RX_PKT_CSUM_LEVEL	0x0200
+
+/**
+ * struct efx_rx_page_state - Page-based rx buffer state
+ *
+ * Inserted at the start of every page allocated for receive buffers.
+ * Used to facilitate sharing dma mappings between recycled rx buffers
+ * and those passed up to the kernel.
+ *
+ * @dma_addr: The dma address of this page.
+ */
+struct efx_rx_page_state {
+	dma_addr_t dma_addr;
+
+	unsigned int __pad[] ____cacheline_aligned;
+};
+
+/**
+ * struct efx_rx_queue - An Efx RX queue
+ * @efx: The associated Efx NIC
+ * @core_index:  Index of network core RX queue.  Will be >= 0 iff this
+ *	is associated with a real RX queue.
+ * @buffer: The software buffer ring
+ * @rxd: The hardware descriptor ring
+ * @ptr_mask: The size of the ring minus 1.
+ * @refill_enabled: Enable refill whenever fill level is low
+ * @flush_pending: Set when a RX flush is pending. Has the same lifetime as
+ *	@rxq_flush_pending.
+ * @added_count: Number of buffers added to the receive queue.
+ * @notified_count: Number of buffers given to NIC (<= @added_count).
+ * @removed_count: Number of buffers removed from the receive queue.
+ * @scatter_n: Used by NIC specific receive code.
+ * @scatter_len: Used by NIC specific receive code.
+ * @page_ring: The ring to store DMA mapped pages for reuse.
+ * @page_add: Counter to calculate the write pointer for the recycle ring.
+ * @page_remove: Counter to calculate the read pointer for the recycle ring.
+ * @page_recycle_count: The number of pages that have been recycled.
+ * @page_recycle_failed: The number of pages that couldn't be recycled because
+ *      the kernel still held a reference to them.
+ * @page_recycle_full: The number of pages that were released because the
+ *      recycle ring was full.
+ * @page_ptr_mask: The number of pages in the RX recycle ring minus 1.
+ * @max_fill: RX descriptor maximum fill level (<= ring size)
+ * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill
+ *	(<= @max_fill)
+ * @min_fill: RX descriptor minimum non-zero fill level.
+ *	This records the minimum fill level observed when a ring
+ *	refill was triggered.
+ * @recycle_count: RX buffer recycle counter.
+ * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
+ * @xdp_rxq_info: XDP specific RX queue information.
+ * @xdp_rxq_info_valid: Is xdp_rxq_info valid data?
+ */
+struct efx_rx_queue {
+	struct efx_nic *efx;
+	int core_index;
+	struct efx_rx_buffer *buffer;
+	struct efx_special_buffer rxd;
+	unsigned int ptr_mask;
+	bool refill_enabled;
+	bool flush_pending;
+
+	unsigned int added_count;
+	unsigned int notified_count;
+	unsigned int removed_count;
+	unsigned int scatter_n;
+	unsigned int scatter_len;
+	struct page **page_ring;
+	unsigned int page_add;
+	unsigned int page_remove;
+	unsigned int page_recycle_count;
+	unsigned int page_recycle_failed;
+	unsigned int page_recycle_full;
+	unsigned int page_ptr_mask;
+	unsigned int max_fill;
+	unsigned int fast_fill_trigger;
+	unsigned int min_fill;
+	unsigned int min_overfill;
+	unsigned int recycle_count;
+	struct timer_list slow_fill;
+	unsigned int slow_fill_count;
+	/* Statistics to supplement MAC stats */
+	unsigned long rx_packets;
+	struct xdp_rxq_info xdp_rxq_info;
+	bool xdp_rxq_info_valid;
+};
+
+enum efx_sync_events_state {
+	SYNC_EVENTS_DISABLED = 0,
+	SYNC_EVENTS_QUIESCENT,
+	SYNC_EVENTS_REQUESTED,
+	SYNC_EVENTS_VALID,
+};
+
+/**
+ * struct efx_channel - An Efx channel
+ *
+ * A channel comprises an event queue, at least one TX queue, at least
+ * one RX queue, and an associated tasklet for processing the event
+ * queue.
+ *
+ * @efx: Associated Efx NIC
+ * @channel: Channel instance number
+ * @type: Channel type definition
+ * @eventq_init: Event queue initialised flag
+ * @enabled: Channel enabled indicator
+ * @irq: IRQ number (MSI and MSI-X only)
+ * @irq_moderation_us: IRQ moderation value (in microseconds)
+ * @napi_dev: Net device used with NAPI
+ * @napi_str: NAPI control structure
+ * @state: state for NAPI vs busy polling
+ * @state_lock: lock protecting @state
+ * @eventq: Event queue buffer
+ * @eventq_mask: Event queue pointer mask
+ * @eventq_read_ptr: Event queue read pointer
+ * @event_test_cpu: Last CPU to handle interrupt or test event for this channel
+ * @irq_count: Number of IRQs since last adaptive moderation decision
+ * @irq_mod_score: IRQ moderation score
+ * @rfs_filter_count: number of accelerated RFS filters currently in place;
+ *	equals the count of @rps_flow_id slots filled
+ * @rfs_last_expiry: value of jiffies last time some accelerated RFS filters
+ *	were checked for expiry
+ * @rfs_expire_index: next accelerated RFS filter ID to check for expiry
+ * @n_rfs_succeeded: number of successful accelerated RFS filter insertions
+ * @n_rfs_failed: number of failed accelerated RFS filter insertions
+ * @filter_work: Work item for efx_filter_rfs_expire()
+ * @rps_flow_id: Flow IDs of filters allocated for accelerated RFS,
+ *      indexed by filter ID
+ * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors
+ * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors
+ * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors
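+ * @n_rx_outer_ip_hdr_chksum_err: Count of RX outer IP header checksum errors
+ * @n_rx_outer_tcp_udp_chksum_err: Count of RX outer TCP and UDP checksum
+ *	errors
+ * @n_rx_inner_ip_hdr_chksum_err: Count of RX inner IP header checksum errors
+ * @n_rx_inner_tcp_udp_chksum_err: Count of RX inner TCP and UDP checksum
+ *	errors
+ * @n_rx_eth_crc_err: Count of RX CRC errors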
+ * @n_rx_mcast_mismatch: Count of unmatched multicast frames
+ * @n_rx_frm_trunc: Count of RX_FRM_TRUNC errors
+ * @n_rx_overlength: Count of RX_OVERLENGTH errors
+ * @n_skbuff_leaks: Count of skbuffs leaked due to RX overrun
+ * @n_rx_nodesc_trunc: Number of RX packets truncated and then dropped due to
+ *	lack of descriptors
+ * @n_rx_merge_events: Number of RX merged completion events
+ * @n_rx_merge_packets: Number of RX packets completed by merged events
+ * @n_rx_xdp_drops: Count of RX packets intentionally dropped due to XDP
+ * @n_rx_xdp_bad_drops: Count of RX packets dropped due to XDP errors
+ * @n_rx_xdp_tx: Count of RX packets retransmitted due to XDP
+ * @n_rx_xdp_redirect: Count of RX packets redirected to a different NIC by XDP
+ * @rx_pkt_n_frags: Number of fragments in next packet to be delivered by
+ *	__efx_rx_packet(), or zero if there is none
+ * @rx_pkt_index: Ring index of first buffer for next packet to be delivered
+ *	by __efx_rx_packet(), if @rx_pkt_n_frags != 0
+ * @rx_list: list of SKBs from current RX, awaiting processing
+ * @rx_queue: RX queue for this channel
+ * @tx_queue: TX queues for this channel
+ * @tx_queue_by_type: pointers into @tx_queue, or %NULL, indexed by txq type
+ * @sync_events_state: Current state of sync events on this channel
+ * @sync_timestamp_major: Major part of the last ptp sync event
+ * @sync_timestamp_minor: Minor part of the last ptp sync event
+ */
+struct efx_channel {
+	struct efx_nic *efx;
+	int channel;
+	const struct efx_channel_type *type;
+	bool eventq_init;
+	bool enabled;
+	int irq;
+	unsigned int irq_moderation_us;
+	struct net_device *napi_dev;
+	struct napi_struct napi_str;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned long busy_poll_state;
+#endif
+	struct efx_special_buffer eventq;
+	unsigned int eventq_mask;
+	unsigned int eventq_read_ptr;
+	int event_test_cpu;
+
+	unsigned int irq_count;
+	unsigned int irq_mod_score;
+#ifdef CONFIG_RFS_ACCEL
+	unsigned int rfs_filter_count;
+	unsigned int rfs_last_expiry;
+	unsigned int rfs_expire_index;
+	unsigned int n_rfs_succeeded;
+	unsigned int n_rfs_failed;
+	struct delayed_work filter_work;
+#define RPS_FLOW_ID_INVALID 0xFFFFFFFF
+	u32 *rps_flow_id;
+#endif
+
+	unsigned int n_rx_tobe_disc;
+	unsigned int n_rx_ip_hdr_chksum_err;
+	unsigned int n_rx_tcp_udp_chksum_err;
+	unsigned int n_rx_outer_ip_hdr_chksum_err;
+	unsigned int n_rx_outer_tcp_udp_chksum_err;
+	unsigned int n_rx_inner_ip_hdr_chksum_err;
+	unsigned int n_rx_inner_tcp_udp_chksum_err;
+	unsigned int n_rx_eth_crc_err;
+	unsigned int n_rx_mcast_mismatch;
+	unsigned int n_rx_frm_trunc;
+	unsigned int n_rx_overlength;
+	unsigned int n_skbuff_leaks;
+	unsigned int n_rx_nodesc_trunc;
+	unsigned int n_rx_merge_events;
+	unsigned int n_rx_merge_packets;
+	unsigned int n_rx_xdp_drops;
+	unsigned int n_rx_xdp_bad_drops;
+	unsigned int n_rx_xdp_tx;
+	unsigned int n_rx_xdp_redirect;
+
+	unsigned int rx_pkt_n_frags;
+	unsigned int rx_pkt_index;
+
+	struct list_head *rx_list;
+
+	struct efx_rx_queue rx_queue;
+	struct efx_tx_queue tx_queue[EFX_MAX_TXQ_PER_CHANNEL];
+	struct efx_tx_queue *tx_queue_by_type[EFX_TXQ_TYPES];
+
+	enum efx_sync_events_state sync_events_state;
+	u32 sync_timestamp_major;
+	u32 sync_timestamp_minor;
+};
+
+/**
+ * struct efx_msi_context - Context for each MSI
+ * @efx: The associated NIC
+ * @index: Index of the channel/IRQ
+ * @name: Name of the channel/IRQ
+ *
+ * Unlike &struct efx_channel, this is never reallocated and is always
+ * safe for the IRQ handler to access.
+ */
+struct efx_msi_context {
+	struct efx_nic *efx;
+	unsigned int index;
+	char name[IFNAMSIZ + 6];
+};
+
+/**
+ * struct efx_channel_type - distinguishes traffic and extra channels
+ * @handle_no_channel: Handle failure to allocate an extra channel
+ * @pre_probe: Set up extra state prior to initialisation
+ * @post_remove: Tear down extra state after finalisation, if allocated.
+ *	May be called on channels that have not been probed.
+ * @get_name: Generate the channel's name (used for its IRQ handler)
+ * @copy: Copy the channel state prior to reallocation.  May be %NULL if
+ *	reallocation is not supported.
+ * @receive_skb: Handle an skb ready to be passed to netif_receive_skb()
+ * @want_txqs: Determine whether this channel should have TX queues
+ *	created.  If %NULL, TX queues are not created.
+ * @keep_eventq: Flag for whether event queue should be kept initialised
+ *	while the device is stopped
+ * @want_pio: Flag for whether PIO buffers should be linked to this
+ *	channel's TX queues.
+ */
+struct efx_channel_type {
+	void (*handle_no_channel)(struct efx_nic *);
+	int (*pre_probe)(struct efx_channel *);
+	void (*post_remove)(struct efx_channel *);
+	void (*get_name)(struct efx_channel *, char *buf, size_t len);
+	struct efx_channel *(*copy)(const struct efx_channel *);
+	bool (*receive_skb)(struct efx_channel *, struct sk_buff *);
+	bool (*want_txqs)(struct efx_channel *);
+	bool keep_eventq;
+	bool want_pio;
+};
+
+enum efx_led_mode {
+	EFX_LED_OFF	= 0,
+	EFX_LED_ON	= 1,
+	EFX_LED_DEFAULT	= 2
+};
+
+#define STRING_TABLE_LOOKUP(val, member) \
+	((val) < member ## _max) ? member ## _names[val] : "(invalid)"
+
+extern const char *const efx_loopback_mode_names[];
+extern const unsigned int efx_loopback_mode_max;
+#define LOOPBACK_MODE(efx) \
+	STRING_TABLE_LOOKUP((efx)->loopback_mode, efx_loopback_mode)
+
+enum efx_int_mode {
+	/* Be careful if altering to correct macro below */
+	EFX_INT_MODE_MSIX = 0,
+	EFX_INT_MODE_MSI = 1,
+	EFX_INT_MODE_LEGACY = 2,
+	EFX_INT_MODE_MAX	/* Insert any new items before this */
+};
+#define EFX_INT_MODE_USE_MSI(x) (((x)->interrupt_mode) <= EFX_INT_MODE_MSI)
+
+enum nic_state {
+	STATE_UNINIT = 0,	/* device being probed/removed or is frozen */
+	STATE_READY = 1,	/* hardware ready and netdev registered */
+	STATE_DISABLED = 2,	/* device disabled due to hardware errors */
+	STATE_RECOVERY = 3,	/* device recovering from PCI error */
+};
+
+/* Forward declaration */
+struct efx_nic;
+
+/* Pseudo bit-mask flow control field */
+#define EFX_FC_RX	FLOW_CTRL_RX
+#define EFX_FC_TX	FLOW_CTRL_TX
+#define EFX_FC_AUTO	4
+
+/**
+ * struct efx_link_state - Current state of the link
+ * @up: Link is up
+ * @fd: Link is full-duplex
+ * @fc: Actual flow control flags
+ * @speed: Link speed (Mbps)
+ */
+struct efx_link_state {
+	bool up;
+	bool fd;
+	u8 fc;
+	unsigned int speed;
+};
+
+static inline bool efx_link_state_equal(const struct efx_link_state *left,
+					const struct efx_link_state *right)
+{
+	return left->up == right->up && left->fd == right->fd &&
+		left->fc == right->fc && left->speed == right->speed;
+}
+
+/**
+ * enum efx_phy_mode - PHY operating mode flags
+ * @PHY_MODE_NORMAL: on and should pass traffic
+ * @PHY_MODE_TX_DISABLED: on with TX disabled
+ * @PHY_MODE_LOW_POWER: set to low power through MDIO
+ * @PHY_MODE_OFF: switched off through external control
+ * @PHY_MODE_SPECIAL: on but will not pass traffic
+ */
+enum efx_phy_mode {
+	PHY_MODE_NORMAL		= 0,
+	PHY_MODE_TX_DISABLED	= 1,
+	PHY_MODE_LOW_POWER	= 2,
+	PHY_MODE_OFF		= 4,
+	PHY_MODE_SPECIAL	= 8,
+};
+
+static inline bool efx_phy_mode_disabled(enum efx_phy_mode mode)
+{
+	return !!(mode & ~PHY_MODE_TX_DISABLED);
+}
+
+/**
+ * struct efx_hw_stat_desc - Description of a hardware statistic
+ * @name: Name of the statistic as visible through ethtool, or %NULL if
+ *	it should not be exposed
+ * @dma_width: Width in bits (0 for non-DMA statistics)
+ * @offset: Offset within stats (ignored for non-DMA statistics)
+ */
+struct efx_hw_stat_desc {
+	const char *name;
+	u16 dma_width;
+	u16 offset;
+};
+
+/* Number of bits used in a multicast filter hash address */
+#define EFX_MCAST_HASH_BITS 8
+
+/* Number of (single-bit) entries in a multicast filter hash */
+#define EFX_MCAST_HASH_ENTRIES (1 << EFX_MCAST_HASH_BITS)
+
+/* An Efx multicast filter hash */
+union efx_multicast_hash {
+	u8 byte[EFX_MCAST_HASH_ENTRIES / 8];
+	efx_oword_t oword[EFX_MCAST_HASH_ENTRIES / sizeof(efx_oword_t) / 8];
+};
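+
+/* Illustrative sizing, for reference only: EFX_MCAST_HASH_BITS = 8 gives
+ * 1 << 8 = 256 single-bit entries, i.e. 256 / 8 = 32 bytes, which is the
+ * same storage as two 16-byte efx_oword_t registers.
+ */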
+
+struct vfdi_status;
+
+/* The reserved RSS context value */
+#define EFX_MCDI_RSS_CONTEXT_INVALID	0xffffffff
+/**
+ * struct efx_rss_context - A user-defined RSS context for filtering
+ * @list: node of linked list on which this struct is stored
+ * @context_id: the RSS_CONTEXT_ID returned by MC firmware, or
+ *	%EFX_MCDI_RSS_CONTEXT_INVALID if this context is not present on the NIC.
+ *	For Siena, 0 if RSS is active, else %EFX_MCDI_RSS_CONTEXT_INVALID.
+ * @user_id: the rss_context ID exposed to userspace over ethtool.
+ * @rx_hash_udp_4tuple: UDP 4-tuple hashing enabled
+ * @rx_hash_key: Toeplitz hash key for this RSS context
+ * @indir_table: Indirection table for this RSS context
+ */
+struct efx_rss_context {
+	struct list_head list;
+	u32 context_id;
+	u32 user_id;
+	bool rx_hash_udp_4tuple;
+	u8 rx_hash_key[40];
+	u32 rx_indir_table[128];
+};
+
+#ifdef CONFIG_RFS_ACCEL
+/* Order of these is important, since filter_id >= %EFX_ARFS_FILTER_ID_PENDING
+ * is used to test if filter does or will exist.
+ */
+#define EFX_ARFS_FILTER_ID_PENDING	-1
+#define EFX_ARFS_FILTER_ID_ERROR	-2
+#define EFX_ARFS_FILTER_ID_REMOVING	-3
+/**
+ * struct efx_arfs_rule - record of an ARFS filter and its IDs
+ * @node: linkage into hash table
+ * @spec: details of the filter (used as key for hash table).  Use efx->type to
+ *	determine which member to use.
+ * @rxq_index: channel to which the filter will steer traffic.
+ * @arfs_id: filter ID which was returned to ARFS
+ * @filter_id: index in software filter table.  May be
+ *	%EFX_ARFS_FILTER_ID_PENDING if filter was not inserted yet,
+ *	%EFX_ARFS_FILTER_ID_ERROR if filter insertion failed, or
+ *	%EFX_ARFS_FILTER_ID_REMOVING if expiry is currently removing the filter.
+ */
+struct efx_arfs_rule {
+	struct hlist_node node;
+	struct efx_filter_spec spec;
+	u16 rxq_index;
+	u16 arfs_id;
+	s32 filter_id;
+};
+
+/* Size chosen so that the table is one page (4kB) */
+#define EFX_ARFS_HASH_TABLE_SIZE	512
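+
+/* Illustrative arithmetic: each bucket is a struct hlist_head, i.e. a single
+ * pointer (8 bytes on 64-bit), so 512 * 8 = 4096 bytes, one 4kB page.
+ */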
+
+/**
+ * struct efx_async_filter_insertion - Request to asynchronously insert a filter
+ * @net_dev: Reference to the netdevice
+ * @spec: The filter to insert
+ * @work: Workitem for this request
+ * @rxq_index: Identifies the channel for which this request was made
+ * @flow_id: Identifies the kernel-side flow for which this request was made
+ */
+struct efx_async_filter_insertion {
+	struct net_device *net_dev;
+	struct efx_filter_spec spec;
+	struct work_struct work;
+	u16 rxq_index;
+	u32 flow_id;
+};
+
+/* Maximum number of ARFS workitems that may be in flight on an efx_nic */
+#define EFX_RPS_MAX_IN_FLIGHT	8
+#endif /* CONFIG_RFS_ACCEL */
+
+enum efx_xdp_tx_queues_mode {
+	EFX_XDP_TX_QUEUES_DEDICATED,	/* one queue per core, locking not needed */
+	EFX_XDP_TX_QUEUES_SHARED,	/* each queue used by more than 1 core */
+	EFX_XDP_TX_QUEUES_BORROWED	/* queues borrowed from net stack */
+};
+
+/**
+ * struct efx_nic - an Efx NIC
+ * @name: Device name (net device name or bus id before net device registered)
+ * @pci_dev: The PCI device
+ * @node: List node for maintaining primary/secondary function lists
+ * @primary: &struct efx_nic instance for the primary function of this
+ *	controller.  May be the same structure, and may be %NULL if no
+ *	primary function is bound.  Serialised by rtnl_lock.
+ * @secondary_list: List of &struct efx_nic instances for the secondary PCI
+ *	functions of the controller, if this is for the primary function.
+ *	Serialised by rtnl_lock.
+ * @type: Controller type attributes
+ * @legacy_irq: IRQ number
+ * @workqueue: Workqueue for port reconfigures and the HW monitor.
+ *	Work items do not hold and must not acquire RTNL.
+ * @workqueue_name: Name of workqueue
+ * @reset_work: Scheduled reset workitem
+ * @membase_phys: Memory BAR value as physical address
+ * @membase: Memory BAR value
+ * @vi_stride: step between per-VI registers / memory regions
+ * @interrupt_mode: Interrupt mode
+ * @timer_quantum_ns: Interrupt timer quantum, in nanoseconds
+ * @timer_max_ns: Interrupt timer maximum value, in nanoseconds
+ * @irq_rx_adaptive: Adaptive IRQ moderation enabled for RX event queues
+ * @irqs_hooked: Channel interrupts are hooked
+ * @irq_rx_mod_step_us: Step size for IRQ moderation for RX event queues
+ * @irq_rx_moderation_us: IRQ moderation time for RX event queues
+ * @msg_enable: Log message enable flags
+ * @state: Device state number (%STATE_*). Serialised by the rtnl_lock.
+ * @reset_pending: Bitmask for pending resets
+ * @tx_queue: TX DMA queues
+ * @rx_queue: RX DMA queues
+ * @channel: Channels
+ * @msi_context: Context for each MSI
+ * @extra_channel_types: Types of extra (non-traffic) channels that
+ *	should be allocated for this NIC
+ * @xdp_tx_queue_count: Number of entries in %xdp_tx_queues.
+ * @xdp_tx_queues: Array of pointers to tx queues used for XDP transmit.
+ * @xdp_txq_queues_mode: XDP TX queues sharing strategy.
+ * @rxq_entries: Size of receive queues requested by user.
+ * @txq_entries: Size of transmit queues requested by user.
+ * @txq_stop_thresh: TX queue fill level at or above which we stop it.
+ * @txq_wake_thresh: TX queue fill level at or below which we wake it.
+ * @tx_dc_base: Base qword address in SRAM of TX queue descriptor caches
+ * @rx_dc_base: Base qword address in SRAM of RX queue descriptor caches
+ * @sram_lim_qw: Qword address limit of SRAM
+ * @next_buffer_table: First available buffer table id
+ * @n_channels: Number of channels in use
+ * @n_rx_channels: Number of channels used for RX (= number of RX queues)
+ * @n_tx_channels: Number of channels used for TX
+ * @n_extra_tx_channels: Number of extra channels with TX queues
+ * @tx_queues_per_channel: number of TX queues probed on each channel
+ * @n_xdp_channels: Number of channels used for XDP TX
+ * @xdp_channel_offset: Offset of zeroth channel used for XDP TX.
+ * @xdp_tx_per_channel: Max number of TX queues on an XDP TX channel.
+ * @rx_ip_align: RX DMA address offset to have IP header aligned in
+ *	accordance with NET_IP_ALIGN
+ * @rx_dma_len: Current maximum RX DMA length
+ * @rx_buffer_order: Order (log2) of number of pages for each RX buffer
+ * @rx_buffer_truesize: Amortised allocation size of an RX buffer,
+ *	for use in sk_buff::truesize
+ * @rx_prefix_size: Size of RX prefix before packet data
+ * @rx_packet_hash_offset: Offset of RX flow hash from start of packet data
+ *	(valid only if @rx_prefix_size != 0; always negative)
+ * @rx_packet_len_offset: Offset of RX packet length from start of packet data
+ *	(valid only for NICs that set %EFX_RX_PKT_PREFIX_LEN; always negative)
+ * @rx_packet_ts_offset: Offset of timestamp from start of packet data
+ *	(valid only if channel->sync_timestamps_enabled; always negative)
+ * @rx_scatter: Scatter mode enabled for receives
+ * @rss_context: Main RSS context.  Its @list member is the head of the list of
+ *	RSS contexts created by user requests
+ * @rss_lock: Protects custom RSS context software state in @rss_context.list
+ * @vport_id: The function's vport ID, only relevant for PFs
+ * @int_error_count: Number of internal errors seen recently
+ * @int_error_expire: Time at which error count will be expired
+ * @must_realloc_vis: Flag: VIs have yet to be reallocated after MC reboot
+ * @irq_soft_enabled: Are IRQs soft-enabled? If not, IRQ handler will
+ *	acknowledge but do nothing else.
+ * @irq_status: Interrupt status buffer
+ * @irq_zero_count: Number of legacy IRQs seen with queue flags == 0
+ * @irq_level: IRQ level/index for IRQs not triggered by an event queue
+ * @selftest_work: Work item for asynchronous self-test
+ * @mtd_list: List of MTDs attached to the NIC
+ * @nic_data: Hardware dependent state
+ * @mcdi: Management-Controller-to-Driver Interface state
+ * @mac_lock: MAC access lock. Protects @port_enabled, @phy_mode,
+ *	efx_monitor() and efx_reconfigure_port()
+ * @port_enabled: Port enabled indicator.
+ *	Serialises efx_stop_all(), efx_start_all(), efx_monitor() and
+ *	efx_mac_work() with kernel interfaces. Safe to read under any
+ *	one of the rtnl_lock, mac_lock, or netif_tx_lock, but all three must
+ *	be held to modify it.
+ * @port_initialized: Port initialized?
+ * @net_dev: Operating system network device. Consider holding the rtnl lock
+ * @fixed_features: Features which cannot be turned off
+ * @num_mac_stats: Number of MAC stats reported by firmware (MAC_STATS_NUM_STATS
+ *	field of %MC_CMD_GET_CAPABILITIES_V4 response, or %MC_CMD_MAC_NSTATS)
+ * @stats_buffer: DMA buffer for statistics
+ * @phy_type: PHY type
+ * @phy_data: PHY private data (including PHY-specific stats)
+ * @mdio: PHY MDIO interface
+ * @mdio_bus: PHY MDIO bus ID (only used by Siena)
+ * @phy_mode: PHY operating mode. Serialised by @mac_lock.
+ * @link_advertising: Autonegotiation advertising flags
+ * @fec_config: Forward Error Correction configuration flags.  For bit positions
+ *	see &enum ethtool_fec_config_bits.
+ * @link_state: Current state of the link
+ * @n_link_state_changes: Number of times the link has changed state
+ * @unicast_filter: Flag for Falcon-arch simple unicast filter.
+ *	Protected by @mac_lock.
+ * @multicast_hash: Multicast hash table for Falcon-arch.
+ *	Protected by @mac_lock.
+ * @wanted_fc: Wanted flow control flags
+ * @fc_disable: When non-zero flow control is disabled. Typically used to
+ *	ensure that network back pressure doesn't delay dma queue flushes.
+ *	Serialised by the rtnl lock.
+ * @mac_work: Work item for changing MAC promiscuity and multicast hash
+ * @loopback_mode: Loopback status
+ * @loopback_modes: Supported loopback mode bitmask
+ * @loopback_selftest: Offline self-test private state
+ * @xdp_prog: Current XDP programme for this interface
+ * @filter_sem: Filter table rw_semaphore, protects existence of @filter_state
+ * @filter_state: Architecture-dependent filter table state
+ * @rps_mutex: Protects RPS state of all channels
+ * @rps_slot_map: bitmap of in-flight entries in @rps_slot
+ * @rps_slot: array of ARFS insertion requests for efx_filter_rfs_work()
+ * @rps_hash_lock: Protects ARFS filter mapping state (@rps_hash_table and
+ *	@rps_next_id).
+ * @rps_hash_table: Mapping between ARFS filters and their various IDs
+ * @rps_next_id: next arfs_id for an ARFS filter
+ * @active_queues: Count of RX and TX queues that haven't been flushed and drained.
+ * @rxq_flush_pending: Count of receive queues that need to be flushed.
+ *	Decremented when efx_flush_rx_queue() is called.
+ * @rxq_flush_outstanding: Count of RX flushes started but not yet
+ *	completed (either success or failure). Not used when MCDI is used to
+ *	flush receive queues.
+ * @flush_wq: wait queue used by efx_nic_flush_queues() to wait for flush completions.
+ * @vf_count: Number of VFs intended to be enabled.
+ * @vf_init_count: Number of VFs that have been fully initialised.
+ * @vi_scale: log2 number of vnics per VF.
+ * @ptp_data: PTP state data
+ * @ptp_warned: has this NIC seen and warned about unexpected PTP events?
+ * @vpd_sn: Serial number read from VPD
+ * @xdp_rxq_info_failed: Have any of the rx queues failed to initialise their
+ *      xdp_rxq_info structures?
+ * @netdev_notifier: Netdevice notifier.
+ * @mem_bar: The BAR that is mapped into membase.
+ * @reg_base: Offset from the start of the bar to the function control window.
+ * @monitor_work: Hardware monitor workitem
+ * @biu_lock: BIU (bus interface unit) lock
+ * @last_irq_cpu: Last CPU to handle a possible test interrupt.  This
+ *	field is used by efx_test_interrupts() to verify that an
+ *	interrupt has occurred.
+ * @stats_lock: Statistics update lock. Must be held when calling
+ *	efx_nic_type::{update,start,stop}_stats.
+ * @n_rx_noskb_drops: Count of RX packets dropped due to failure to allocate an skb
+ *
+ * This is stored in the private area of the &struct net_device.
+ */
+struct efx_nic {
+	/* The following fields should be written very rarely */
+
+	char name[IFNAMSIZ];
+	struct list_head node;
+	struct efx_nic *primary;
+	struct list_head secondary_list;
+	struct pci_dev *pci_dev;
+	unsigned int port_num;
+	const struct efx_nic_type *type;
+	int legacy_irq;
+	bool eeh_disabled_legacy_irq;
+	struct workqueue_struct *workqueue;
+	char workqueue_name[16];
+	struct work_struct reset_work;
+	resource_size_t membase_phys;
+	void __iomem *membase;
+
+	unsigned int vi_stride;
+
+	enum efx_int_mode interrupt_mode;
+	unsigned int timer_quantum_ns;
+	unsigned int timer_max_ns;
+	bool irq_rx_adaptive;
+	bool irqs_hooked;
+	unsigned int irq_mod_step_us;
+	unsigned int irq_rx_moderation_us;
+	u32 msg_enable;
+
+	enum nic_state state;
+	unsigned long reset_pending;
+
+	struct efx_channel *channel[EFX_MAX_CHANNELS];
+	struct efx_msi_context msi_context[EFX_MAX_CHANNELS];
+	const struct efx_channel_type *
+	extra_channel_type[EFX_MAX_EXTRA_CHANNELS];
+
+	unsigned int xdp_tx_queue_count;
+	struct efx_tx_queue **xdp_tx_queues;
+	enum efx_xdp_tx_queues_mode xdp_txq_queues_mode;
+
+	unsigned rxq_entries;
+	unsigned txq_entries;
+	unsigned int txq_stop_thresh;
+	unsigned int txq_wake_thresh;
+
+	unsigned tx_dc_base;
+	unsigned rx_dc_base;
+	unsigned sram_lim_qw;
+	unsigned next_buffer_table;
+
+	unsigned int max_channels;
+	unsigned int max_vis;
+	unsigned int max_tx_channels;
+	unsigned n_channels;
+	unsigned n_rx_channels;
+	unsigned rss_spread;
+	unsigned tx_channel_offset;
+	unsigned n_tx_channels;
+	unsigned n_extra_tx_channels;
+	unsigned int tx_queues_per_channel;
+	unsigned int n_xdp_channels;
+	unsigned int xdp_channel_offset;
+	unsigned int xdp_tx_per_channel;
+	unsigned int rx_ip_align;
+	unsigned int rx_dma_len;
+	unsigned int rx_buffer_order;
+	unsigned int rx_buffer_truesize;
+	unsigned int rx_page_buf_step;
+	unsigned int rx_bufs_per_page;
+	unsigned int rx_pages_per_batch;
+	unsigned int rx_prefix_size;
+	int rx_packet_hash_offset;
+	int rx_packet_len_offset;
+	int rx_packet_ts_offset;
+	bool rx_scatter;
+	struct efx_rss_context rss_context;
+	struct mutex rss_lock;
+	u32 vport_id;
+
+	unsigned int_error_count;
+	unsigned long int_error_expire;
+
+	bool must_realloc_vis;
+	bool irq_soft_enabled;
+	struct efx_buffer irq_status;
+	unsigned irq_zero_count;
+	unsigned irq_level;
+	struct delayed_work selftest_work;
+
+#ifdef CONFIG_SFC_MTD
+	struct list_head mtd_list;
+#endif
+
+	void *nic_data;
+	struct efx_mcdi_data *mcdi;
+
+	struct mutex mac_lock;
+	struct work_struct mac_work;
+	bool port_enabled;
+
+	bool mc_bist_for_other_fn;
+	bool port_initialized;
+	struct net_device *net_dev;
+
+	netdev_features_t fixed_features;
+
+	u16 num_mac_stats;
+	struct efx_buffer stats_buffer;
+	u64 rx_nodesc_drops_total;
+	u64 rx_nodesc_drops_while_down;
+	bool rx_nodesc_drops_prev_state;
+
+	unsigned int phy_type;
+	void *phy_data;
+	struct mdio_if_info mdio;
+	unsigned int mdio_bus;
+	enum efx_phy_mode phy_mode;
+
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(link_advertising);
+	u32 fec_config;
+	struct efx_link_state link_state;
+	unsigned int n_link_state_changes;
+
+	bool unicast_filter;
+	union efx_multicast_hash multicast_hash;
+	u8 wanted_fc;
+	unsigned fc_disable;
+
+	atomic_t rx_reset;
+	enum efx_loopback_mode loopback_mode;
+	u64 loopback_modes;
+
+	void *loopback_selftest;
+	/* We access loopback_selftest immediately before running XDP,
+	 * so we want them next to each other.
+	 */
+	struct bpf_prog __rcu *xdp_prog;
+
+	struct rw_semaphore filter_sem;
+	void *filter_state;
+#ifdef CONFIG_RFS_ACCEL
+	struct mutex rps_mutex;
+	unsigned long rps_slot_map;
+	struct efx_async_filter_insertion rps_slot[EFX_RPS_MAX_IN_FLIGHT];
+	spinlock_t rps_hash_lock;
+	struct hlist_head *rps_hash_table;
+	u32 rps_next_id;
+#endif
+
+	atomic_t active_queues;
+	atomic_t rxq_flush_pending;
+	atomic_t rxq_flush_outstanding;
+	wait_queue_head_t flush_wq;
+
+#ifdef CONFIG_SFC_SRIOV
+	unsigned vf_count;
+	unsigned vf_init_count;
+	unsigned vi_scale;
+#endif
+
+	struct efx_ptp_data *ptp_data;
+	bool ptp_warned;
+
+	char *vpd_sn;
+	bool xdp_rxq_info_failed;
+
+	struct notifier_block netdev_notifier;
+
+	unsigned int mem_bar;
+	u32 reg_base;
+
+	/* The following fields may be written more often */
+
+	struct delayed_work monitor_work ____cacheline_aligned_in_smp;
+	spinlock_t biu_lock;
+	int last_irq_cpu;
+	spinlock_t stats_lock;
+	atomic_t n_rx_noskb_drops;
+};
+
+static inline int efx_dev_registered(struct efx_nic *efx)
+{
+	return efx->net_dev->reg_state == NETREG_REGISTERED;
+}
+
+static inline unsigned int efx_port_num(struct efx_nic *efx)
+{
+	return efx->port_num;
+}
+
+struct efx_mtd_partition {
+	struct list_head node;
+	struct mtd_info mtd;
+	const char *dev_type_name;
+	const char *type_name;
+	char name[IFNAMSIZ + 20];
+};
+
+struct efx_udp_tunnel {
+#define TUNNEL_ENCAP_UDP_PORT_ENTRY_INVALID	0xffff
+	u16 type; /* TUNNEL_ENCAP_UDP_PORT_ENTRY_foo, see mcdi_pcol.h */
+	__be16 port;
+};
+
+/**
+ * struct efx_nic_type - Efx device type definition
+ * @mem_bar: Get the memory BAR
+ * @mem_map_size: Get memory BAR mapped size
+ * @probe: Probe the controller
+ * @remove: Free resources allocated by probe()
+ * @init: Initialise the controller
+ * @dimension_resources: Dimension controller resources (buffer table,
+ *	and VIs once the available interrupt resources are clear)
+ * @fini: Shut down the controller
+ * @monitor: Periodic function for polling link state and hardware monitor
+ * @map_reset_reason: Map ethtool reset reason to a reset method
+ * @map_reset_flags: Map ethtool reset flags to a reset method, if possible
+ * @reset: Reset the controller hardware and possibly the PHY.  This will
+ *	be called while the controller is uninitialised.
+ * @probe_port: Probe the MAC and PHY
+ * @remove_port: Free resources allocated by probe_port()
+ * @handle_global_event: Handle a "global" event (may be %NULL)
+ * @fini_dmaq: Flush and finalise DMA queues (RX and TX queues)
+ * @prepare_flush: Prepare the hardware for flushing the DMA queues
+ *	(for Falcon architecture)
+ * @finish_flush: Clean up after flushing the DMA queues (for Falcon
+ *	architecture)
+ * @prepare_flr: Prepare for an FLR
+ * @finish_flr: Clean up after an FLR
+ * @describe_stats: Describe statistics for ethtool
+ * @update_stats: Update statistics not provided by event handling.
+ *	Either argument may be %NULL.
+ * @update_stats_atomic: Update statistics while in atomic context, if that
+ *	is more limiting than @update_stats.  Otherwise, leave %NULL and
+ *	driver core will call @update_stats.
+ * @start_stats: Start the regular fetching of statistics
+ * @pull_stats: Pull stats from the NIC and wait until they arrive.
+ * @stop_stats: Stop the regular fetching of statistics
+ * @push_irq_moderation: Apply interrupt moderation value
+ * @reconfigure_port: Push loopback/power/txdis changes to the MAC and PHY
+ * @prepare_enable_fc_tx: Prepare MAC to enable pause frame TX (may be %NULL)
+ * @reconfigure_mac: Push MAC address, MTU, flow control and filter settings
+ *	to the hardware.  Serialised by the mac_lock.
+ * @check_mac_fault: Check MAC fault state. True if fault present.
+ * @get_wol: Get WoL configuration from driver state
+ * @set_wol: Push WoL configuration to the NIC
+ * @resume_wol: Synchronise WoL state between driver and MC (e.g. after resume)
+ * @get_fec_stats: Get standard FEC statistics.
+ * @test_chip: Test registers.  May use efx_farch_test_registers(), and is
+ *	expected to reset the NIC.
+ * @test_nvram: Test validity of NVRAM contents
+ * @mcdi_request: Send an MCDI request with the given header and SDU.
+ *	The SDU length may be any value from 0 up to the protocol-
+ *	defined maximum, but its buffer will be padded to a multiple
+ *	of 4 bytes.
+ * @mcdi_poll_response: Test whether an MCDI response is available.
+ * @mcdi_read_response: Read the MCDI response PDU.  The offset will
+ *	be a multiple of 4.  The length may not be, but the buffer
+ *	will be padded so it is safe to round up.
+ * @mcdi_poll_reboot: Test whether the MCDI has rebooted.  If so,
+ *	return an appropriate error code for aborting any current
+ *	request; otherwise return 0.
+ * @irq_enable_master: Enable IRQs on the NIC.  Each event queue must
+ *	be separately enabled after this.
+ * @irq_test_generate: Generate a test IRQ
+ * @irq_disable_non_ev: Disable non-event IRQs on the NIC.  Each event
+ *	queue must be separately disabled before this.
+ * @irq_handle_msi: Handle MSI for a channel.  The @dev_id argument is
+ *	a pointer to the &struct efx_msi_context for the channel.
+ * @irq_handle_legacy: Handle legacy interrupt.  The @dev_id argument
+ *	is a pointer to the &struct efx_nic.
+ * @tx_probe: Allocate resources for TX queue (and select TXQ type)
+ * @tx_init: Initialise TX queue on the NIC
+ * @tx_remove: Free resources for TX queue
+ * @tx_write: Write TX descriptors and doorbell
+ * @tx_enqueue: Add an SKB to TX queue
+ * @rx_push_rss_config: Write RSS hash key and indirection table to the NIC
+ * @rx_pull_rss_config: Read RSS hash key and indirection table back from the NIC
+ * @rx_push_rss_context_config: Write RSS hash key and indirection table for
+ *	user RSS context to the NIC
+ * @rx_pull_rss_context_config: Read RSS hash key and indirection table for user
+ *	RSS context back from the NIC
+ * @rx_probe: Allocate resources for RX queue
+ * @rx_init: Initialise RX queue on the NIC
+ * @rx_remove: Free resources for RX queue
+ * @rx_write: Write RX descriptors and doorbell
+ * @rx_defer_refill: Generate a refill reminder event
+ * @rx_packet: Receive the queued RX buffer on a channel
+ * @rx_buf_hash_valid: Determine whether the RX prefix contains a valid hash
+ * @ev_probe: Allocate resources for event queue
+ * @ev_init: Initialise event queue on the NIC
+ * @ev_fini: Deinitialise event queue on the NIC
+ * @ev_remove: Free resources for event queue
+ * @ev_process: Process events for a queue, up to the given NAPI quota
+ * @ev_read_ack: Acknowledge read events on a queue, rearming its IRQ
+ * @ev_test_generate: Generate a test event
+ * @filter_table_probe: Probe filter capabilities and set up filter software state
+ * @filter_table_restore: Restore filters removed from hardware
+ * @filter_table_remove: Remove filters from hardware and tear down software state
+ * @filter_update_rx_scatter: Update filters after change to rx scatter setting
+ * @filter_insert: add or replace a filter
+ * @filter_remove_safe: remove a filter by ID, carefully
+ * @filter_get_safe: retrieve a filter by ID, carefully
+ * @filter_clear_rx: Remove all RX filters whose priority is less than or
+ *	equal to the given priority and is not %EFX_FILTER_PRI_AUTO
+ * @filter_count_rx_used: Get the number of filters in use at a given priority
+ * @filter_get_rx_id_limit: Get maximum value of a filter id, plus 1
+ * @filter_get_rx_ids: Get list of RX filters at a given priority
+ * @filter_rfs_expire_one: Consider expiring a filter inserted for RFS.
+ *	This must check whether the specified table entry is used by RFS
+ *	and that rps_may_expire_flow() returns true for it.
+ * @mtd_probe: Probe and add MTD partitions associated with this net device,
+ *	 using efx_mtd_add()
+ * @mtd_rename: Set an MTD partition name using the net device name
+ * @mtd_read: Read from an MTD partition
+ * @mtd_erase: Erase part of an MTD partition
+ * @mtd_write: Write to an MTD partition
+ * @mtd_sync: Wait for write-back to complete on MTD partition.  This
+ *	also notifies the driver that a writer has finished using this
+ *	partition.
+ * @ptp_write_host_time: Send host time to MC as part of sync protocol
+ * @ptp_set_ts_sync_events: Enable or disable sync events for inline RX
+ *	timestamping, possibly only temporarily for the purposes of a reset.
+ * @ptp_set_ts_config: Set hardware timestamp configuration.  The flags
+ *	and tx_type will already have been validated but this operation
+ *	must validate and update rx_filter.
+ * @get_phys_port_id: Get the underlying physical port id.
+ * @set_mac_address: Set the MAC address of the device
+ * @tso_versions: Returns mask of firmware-assisted TSO versions supported.
+ *	If %NULL, then device does not support any TSO version.
+ * @udp_tnl_push_ports: Push the list of UDP tunnel ports to the NIC if required.
+ * @udp_tnl_has_port: Check if a port has been added as UDP tunnel
+ * @print_additional_fwver: Dump NIC-specific additional FW version info
+ * @sensor_event: Handle a sensor event from MCDI
+ * @rx_recycle_ring_size: Size of the RX recycle ring
+ * @revision: Hardware architecture revision
+ * @txd_ptr_tbl_base: TX descriptor ring base address
+ * @rxd_ptr_tbl_base: RX descriptor ring base address
+ * @buf_tbl_base: Buffer table base address
+ * @evq_ptr_tbl_base: Event queue pointer table base address
+ * @evq_rptr_tbl_base: Event queue read-pointer table base address
+ * @max_dma_mask: Maximum possible DMA mask
+ * @rx_prefix_size: Size of RX prefix before packet data
+ * @rx_hash_offset: Offset of RX flow hash within prefix
+ * @rx_ts_offset: Offset of timestamp within prefix
+ * @rx_buffer_padding: Size of padding at end of RX packet
+ * @can_rx_scatter: NIC is able to scatter packets to multiple buffers
+ * @always_rx_scatter: NIC will always scatter packets to multiple buffers
+ * @option_descriptors: NIC supports TX option descriptors
+ * @min_interrupt_mode: Lowest capability interrupt mode supported
+ *	from &enum efx_int_mode.
+ * @timer_period_max: Maximum period of interrupt timer (in ticks)
+ * @offload_features: net_device feature flags for protocol offload
+ *	features implemented in hardware
+ * @mcdi_max_ver: Maximum MCDI version supported
+ * @hwtstamp_filters: Mask of hardware timestamp filter types supported
+ */
+struct efx_nic_type {
+	bool is_vf;
+	unsigned int (*mem_bar)(struct efx_nic *efx);
+	unsigned int (*mem_map_size)(struct efx_nic *efx);
+	int (*probe)(struct efx_nic *efx);
+	void (*remove)(struct efx_nic *efx);
+	int (*init)(struct efx_nic *efx);
+	int (*dimension_resources)(struct efx_nic *efx);
+	void (*fini)(struct efx_nic *efx);
+	void (*monitor)(struct efx_nic *efx);
+	enum reset_type (*map_reset_reason)(enum reset_type reason);
+	int (*map_reset_flags)(u32 *flags);
+	int (*reset)(struct efx_nic *efx, enum reset_type method);
+	int (*probe_port)(struct efx_nic *efx);
+	void (*remove_port)(struct efx_nic *efx);
+	bool (*handle_global_event)(struct efx_channel *channel, efx_qword_t *);
+	int (*fini_dmaq)(struct efx_nic *efx);
+	void (*prepare_flush)(struct efx_nic *efx);
+	void (*finish_flush)(struct efx_nic *efx);
+	void (*prepare_flr)(struct efx_nic *efx);
+	void (*finish_flr)(struct efx_nic *efx);
+	size_t (*describe_stats)(struct efx_nic *efx, u8 *names);
+	size_t (*update_stats)(struct efx_nic *efx, u64 *full_stats,
+			       struct rtnl_link_stats64 *core_stats);
+	size_t (*update_stats_atomic)(struct efx_nic *efx, u64 *full_stats,
+				      struct rtnl_link_stats64 *core_stats);
+	void (*start_stats)(struct efx_nic *efx);
+	void (*pull_stats)(struct efx_nic *efx);
+	void (*stop_stats)(struct efx_nic *efx);
+	void (*push_irq_moderation)(struct efx_channel *channel);
+	int (*reconfigure_port)(struct efx_nic *efx);
+	void (*prepare_enable_fc_tx)(struct efx_nic *efx);
+	int (*reconfigure_mac)(struct efx_nic *efx, bool mtu_only);
+	bool (*check_mac_fault)(struct efx_nic *efx);
+	void (*get_wol)(struct efx_nic *efx, struct ethtool_wolinfo *wol);
+	int (*set_wol)(struct efx_nic *efx, u32 type);
+	void (*resume_wol)(struct efx_nic *efx);
+	void (*get_fec_stats)(struct efx_nic *efx,
+			      struct ethtool_fec_stats *fec_stats);
+	unsigned int (*check_caps)(const struct efx_nic *efx,
+				   u8 flag,
+				   u32 offset);
+	int (*test_chip)(struct efx_nic *efx, struct efx_self_tests *tests);
+	int (*test_nvram)(struct efx_nic *efx);
+	void (*mcdi_request)(struct efx_nic *efx,
+			     const efx_dword_t *hdr, size_t hdr_len,
+			     const efx_dword_t *sdu, size_t sdu_len);
+	bool (*mcdi_poll_response)(struct efx_nic *efx);
+	void (*mcdi_read_response)(struct efx_nic *efx, efx_dword_t *pdu,
+				   size_t pdu_offset, size_t pdu_len);
+	int (*mcdi_poll_reboot)(struct efx_nic *efx);
+	void (*mcdi_reboot_detected)(struct efx_nic *efx);
+	void (*irq_enable_master)(struct efx_nic *efx);
+	int (*irq_test_generate)(struct efx_nic *efx);
+	void (*irq_disable_non_ev)(struct efx_nic *efx);
+	irqreturn_t (*irq_handle_msi)(int irq, void *dev_id);
+	irqreturn_t (*irq_handle_legacy)(int irq, void *dev_id);
+	int (*tx_probe)(struct efx_tx_queue *tx_queue);
+	void (*tx_init)(struct efx_tx_queue *tx_queue);
+	void (*tx_remove)(struct efx_tx_queue *tx_queue);
+	void (*tx_write)(struct efx_tx_queue *tx_queue);
+	netdev_tx_t (*tx_enqueue)(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
+	unsigned int (*tx_limit_len)(struct efx_tx_queue *tx_queue,
+				     dma_addr_t dma_addr, unsigned int len);
+	int (*rx_push_rss_config)(struct efx_nic *efx, bool user,
+				  const u32 *rx_indir_table, const u8 *key);
+	int (*rx_pull_rss_config)(struct efx_nic *efx);
+	int (*rx_push_rss_context_config)(struct efx_nic *efx,
+					  struct efx_rss_context *ctx,
+					  const u32 *rx_indir_table,
+					  const u8 *key);
+	int (*rx_pull_rss_context_config)(struct efx_nic *efx,
+					  struct efx_rss_context *ctx);
+	void (*rx_restore_rss_contexts)(struct efx_nic *efx);
+	int (*rx_probe)(struct efx_rx_queue *rx_queue);
+	void (*rx_init)(struct efx_rx_queue *rx_queue);
+	void (*rx_remove)(struct efx_rx_queue *rx_queue);
+	void (*rx_write)(struct efx_rx_queue *rx_queue);
+	void (*rx_defer_refill)(struct efx_rx_queue *rx_queue);
+	void (*rx_packet)(struct efx_channel *channel);
+	bool (*rx_buf_hash_valid)(const u8 *prefix);
+	int (*ev_probe)(struct efx_channel *channel);
+	int (*ev_init)(struct efx_channel *channel);
+	void (*ev_fini)(struct efx_channel *channel);
+	void (*ev_remove)(struct efx_channel *channel);
+	int (*ev_process)(struct efx_channel *channel, int quota);
+	void (*ev_read_ack)(struct efx_channel *channel);
+	void (*ev_test_generate)(struct efx_channel *channel);
+	int (*filter_table_probe)(struct efx_nic *efx);
+	void (*filter_table_restore)(struct efx_nic *efx);
+	void (*filter_table_remove)(struct efx_nic *efx);
+	void (*filter_update_rx_scatter)(struct efx_nic *efx);
+	s32 (*filter_insert)(struct efx_nic *efx,
+			     struct efx_filter_spec *spec, bool replace);
+	int (*filter_remove_safe)(struct efx_nic *efx,
+				  enum efx_filter_priority priority,
+				  u32 filter_id);
+	int (*filter_get_safe)(struct efx_nic *efx,
+			       enum efx_filter_priority priority,
+			       u32 filter_id, struct efx_filter_spec *);
+	int (*filter_clear_rx)(struct efx_nic *efx,
+			       enum efx_filter_priority priority);
+	u32 (*filter_count_rx_used)(struct efx_nic *efx,
+				    enum efx_filter_priority priority);
+	u32 (*filter_get_rx_id_limit)(struct efx_nic *efx);
+	s32 (*filter_get_rx_ids)(struct efx_nic *efx,
+				 enum efx_filter_priority priority,
+				 u32 *buf, u32 size);
+#ifdef CONFIG_RFS_ACCEL
+	bool (*filter_rfs_expire_one)(struct efx_nic *efx, u32 flow_id,
+				      unsigned int index);
+#endif
+#ifdef CONFIG_SFC_MTD
+	int (*mtd_probe)(struct efx_nic *efx);
+	void (*mtd_rename)(struct efx_mtd_partition *part);
+	int (*mtd_read)(struct mtd_info *mtd, loff_t start, size_t len,
+			size_t *retlen, u8 *buffer);
+	int (*mtd_erase)(struct mtd_info *mtd, loff_t start, size_t len);
+	int (*mtd_write)(struct mtd_info *mtd, loff_t start, size_t len,
+			 size_t *retlen, const u8 *buffer);
+	int (*mtd_sync)(struct mtd_info *mtd);
+#endif
+	void (*ptp_write_host_time)(struct efx_nic *efx, u32 host_time);
+	int (*ptp_set_ts_sync_events)(struct efx_nic *efx, bool en, bool temp);
+	int (*ptp_set_ts_config)(struct efx_nic *efx,
+				 struct hwtstamp_config *init);
+	int (*sriov_configure)(struct efx_nic *efx, int num_vfs);
+	int (*vlan_rx_add_vid)(struct efx_nic *efx, __be16 proto, u16 vid);
+	int (*vlan_rx_kill_vid)(struct efx_nic *efx, __be16 proto, u16 vid);
+	int (*get_phys_port_id)(struct efx_nic *efx,
+				struct netdev_phys_item_id *ppid);
+	int (*sriov_init)(struct efx_nic *efx);
+	void (*sriov_fini)(struct efx_nic *efx);
+	bool (*sriov_wanted)(struct efx_nic *efx);
+	void (*sriov_reset)(struct efx_nic *efx);
+	void (*sriov_flr)(struct efx_nic *efx, unsigned vf_i);
+	int (*sriov_set_vf_mac)(struct efx_nic *efx, int vf_i, const u8 *mac);
+	int (*sriov_set_vf_vlan)(struct efx_nic *efx, int vf_i, u16 vlan,
+				 u8 qos);
+	int (*sriov_set_vf_spoofchk)(struct efx_nic *efx, int vf_i,
+				     bool spoofchk);
+	int (*sriov_get_vf_config)(struct efx_nic *efx, int vf_i,
+				   struct ifla_vf_info *ivi);
+	int (*sriov_set_vf_link_state)(struct efx_nic *efx, int vf_i,
+				       int link_state);
+	int (*vswitching_probe)(struct efx_nic *efx);
+	int (*vswitching_restore)(struct efx_nic *efx);
+	void (*vswitching_remove)(struct efx_nic *efx);
+	int (*get_mac_address)(struct efx_nic *efx, unsigned char *perm_addr);
+	int (*set_mac_address)(struct efx_nic *efx);
+	u32 (*tso_versions)(struct efx_nic *efx);
+	int (*udp_tnl_push_ports)(struct efx_nic *efx);
+	bool (*udp_tnl_has_port)(struct efx_nic *efx, __be16 port);
+	size_t (*print_additional_fwver)(struct efx_nic *efx, char *buf,
+					 size_t len);
+	void (*sensor_event)(struct efx_nic *efx, efx_qword_t *ev);
+	unsigned int (*rx_recycle_ring_size)(const struct efx_nic *efx);
+
+	int revision;
+	unsigned int txd_ptr_tbl_base;
+	unsigned int rxd_ptr_tbl_base;
+	unsigned int buf_tbl_base;
+	unsigned int evq_ptr_tbl_base;
+	unsigned int evq_rptr_tbl_base;
+	u64 max_dma_mask;
+	unsigned int rx_prefix_size;
+	unsigned int rx_hash_offset;
+	unsigned int rx_ts_offset;
+	unsigned int rx_buffer_padding;
+	bool can_rx_scatter;
+	bool always_rx_scatter;
+	bool option_descriptors;
+	unsigned int min_interrupt_mode;
+	unsigned int timer_period_max;
+	netdev_features_t offload_features;
+	int mcdi_max_ver;
+	unsigned int max_rx_ip_filters;
+	u32 hwtstamp_filters;
+	unsigned int rx_hash_key_size;
+};
+
+/**************************************************************************
+ *
+ * Prototypes and inline functions
+ *
+ *************************************************************************/
+
+static inline struct efx_channel *
+efx_get_channel(struct efx_nic *efx, unsigned index)
+{
+	EFX_WARN_ON_ONCE_PARANOID(index >= efx->n_channels);
+	return efx->channel[index];
+}
+
+/* Iterate over all used channels */
+#define efx_for_each_channel(_channel, _efx)				\
+	for (_channel = (_efx)->channel[0];				\
+	     _channel;							\
+	     _channel = (_channel->channel + 1 < (_efx)->n_channels) ?	\
+		     (_efx)->channel[_channel->channel + 1] : NULL)
+
+/* Iterate over all used channels in reverse */
+#define efx_for_each_channel_rev(_channel, _efx)			\
+	for (_channel = (_efx)->channel[(_efx)->n_channels - 1];	\
+	     _channel;							\
+	     _channel = _channel->channel ?				\
+		     (_efx)->channel[_channel->channel - 1] : NULL)
+
+static inline struct efx_channel *
+efx_get_tx_channel(struct efx_nic *efx, unsigned int index)
+{
+	EFX_WARN_ON_ONCE_PARANOID(index >= efx->n_tx_channels);
+	return efx->channel[efx->tx_channel_offset + index];
+}
+
+static inline struct efx_channel *
+efx_get_xdp_channel(struct efx_nic *efx, unsigned int index)
+{
+	EFX_WARN_ON_ONCE_PARANOID(index >= efx->n_xdp_channels);
+	return efx->channel[efx->xdp_channel_offset + index];
+}
+
+static inline bool efx_channel_is_xdp_tx(struct efx_channel *channel)
+{
+	return channel->channel - channel->efx->xdp_channel_offset <
+	       channel->efx->n_xdp_channels;
+}
+
+static inline bool efx_channel_has_tx_queues(struct efx_channel *channel)
+{
+	return true;
+}
+
+static inline unsigned int efx_channel_num_tx_queues(struct efx_channel *channel)
+{
+	if (efx_channel_is_xdp_tx(channel))
+		return channel->efx->xdp_tx_per_channel;
+	return channel->efx->tx_queues_per_channel;
+}
+
+static inline struct efx_tx_queue *
+efx_channel_get_tx_queue(struct efx_channel *channel, unsigned int type)
+{
+	EFX_WARN_ON_ONCE_PARANOID(type >= EFX_TXQ_TYPES);
+	return channel->tx_queue_by_type[type];
+}
+
+static inline struct efx_tx_queue *
+efx_get_tx_queue(struct efx_nic *efx, unsigned int index, unsigned int type)
+{
+	struct efx_channel *channel = efx_get_tx_channel(efx, index);
+
+	return efx_channel_get_tx_queue(channel, type);
+}
+
+/* Iterate over all TX queues belonging to a channel */
+#define efx_for_each_channel_tx_queue(_tx_queue, _channel)		\
+	if (!efx_channel_has_tx_queues(_channel))			\
+		;							\
+	else								\
+		for (_tx_queue = (_channel)->tx_queue;			\
+		     _tx_queue < (_channel)->tx_queue +			\
+				 efx_channel_num_tx_queues(_channel);		\
+		     _tx_queue++)
+
+static inline bool efx_channel_has_rx_queue(struct efx_channel *channel)
+{
+	return channel->rx_queue.core_index >= 0;
+}
+
+static inline struct efx_rx_queue *
+efx_channel_get_rx_queue(struct efx_channel *channel)
+{
+	EFX_WARN_ON_ONCE_PARANOID(!efx_channel_has_rx_queue(channel));
+	return &channel->rx_queue;
+}
+
+/* Iterate over all RX queues belonging to a channel */
+#define efx_for_each_channel_rx_queue(_rx_queue, _channel)		\
+	if (!efx_channel_has_rx_queue(_channel))			\
+		;							\
+	else								\
+		for (_rx_queue = &(_channel)->rx_queue;			\
+		     _rx_queue;						\
+		     _rx_queue = NULL)
+
+static inline struct efx_channel *
+efx_rx_queue_channel(struct efx_rx_queue *rx_queue)
+{
+	return container_of(rx_queue, struct efx_channel, rx_queue);
+}
+
+static inline int efx_rx_queue_index(struct efx_rx_queue *rx_queue)
+{
+	return efx_rx_queue_channel(rx_queue)->channel;
+}
+
+/* Returns a pointer to the specified receive buffer in the RX
+ * descriptor queue.
+ */
+static inline struct efx_rx_buffer *efx_rx_buffer(struct efx_rx_queue *rx_queue,
+						  unsigned int index)
+{
+	return &rx_queue->buffer[index];
+}
+
+static inline struct efx_rx_buffer *
+efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)
+{
+	if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask)))
+		return efx_rx_buffer(rx_queue, 0);
+	else
+		return rx_buf + 1;
+}
+
+/**
+ * EFX_MAX_FRAME_LEN - calculate maximum frame length
+ *
+ * This calculates the maximum frame length that will be used for a
+ * given MTU.  The frame length will be equal to the MTU plus a
+ * constant amount of header space and padding.  This is the quantity
+ * that the net driver will program into the MAC as the maximum frame
+ * length.
+ *
+ * The 10G MAC requires 8-byte alignment on the frame
+ * length, so we round up to the nearest 8.
+ *
+ * Re-clocking by the XGXS on RX can reduce an IPG to 32 bits (half an
+ * XGMII cycle).  If the frame length reaches the maximum value in the
+ * same cycle, the XMAC can miss the IPG altogether.  We work around
+ * this by adding a further 16 bytes.
+ */
+#define EFX_FRAME_PAD	16
+#define EFX_MAX_FRAME_LEN(mtu) \
+	(ALIGN(((mtu) + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN + EFX_FRAME_PAD), 8))
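+
+/* Worked example (illustrative): for an MTU of 1500 this evaluates to
+ * 1500 + 14 (ETH_HLEN) + 4 (VLAN_HLEN) + 4 (ETH_FCS_LEN) + 16 (EFX_FRAME_PAD)
+ * = 1538 bytes, which ALIGN() rounds up to 1544.
+ */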
+
+static inline bool efx_xmit_with_hwtstamp(struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP;
+}
+static inline void efx_xmit_hwtstamp_pending(struct sk_buff *skb)
+{
+	skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+}
+
+/* Get the max fill level of the TX queues on this channel */
+static inline unsigned int
+efx_channel_tx_fill_level(struct efx_channel *channel)
+{
+	struct efx_tx_queue *tx_queue;
+	unsigned int fill_level = 0;
+
+	efx_for_each_channel_tx_queue(tx_queue, channel)
+		fill_level = max(fill_level,
+				 tx_queue->insert_count - tx_queue->read_count);
+
+	return fill_level;
+}
+
+/* Conservative approximation of efx_channel_tx_fill_level using cached value */
+static inline unsigned int
+efx_channel_tx_old_fill_level(struct efx_channel *channel)
+{
+	struct efx_tx_queue *tx_queue;
+	unsigned int fill_level = 0;
+
+	efx_for_each_channel_tx_queue(tx_queue, channel)
+		fill_level = max(fill_level,
+				 tx_queue->insert_count - tx_queue->old_read_count);
+
+	return fill_level;
+}
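+
+/* Note (illustrative): insert_count and the read counts are free-running
+ * unsigned values, so the subtractions in the two helpers above remain
+ * correct across 32-bit wrap-around; e.g. insert_count = 3 and
+ * read_count = 0xfffffffe give a fill level of 5.
+ */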
+
+/* Get all supported features.
+ * If a feature is not fixed, it is present in hw_features.
+ * If a feature is fixed, it is not present in hw_features, but is
+ * always present in features.
+ */
+static inline netdev_features_t efx_supported_features(const struct efx_nic *efx)
+{
+	const struct net_device *net_dev = efx->net_dev;
+
+	return net_dev->features | net_dev->hw_features;
+}
+
+/* Get the current TX queue insert index. */
+static inline unsigned int
+efx_tx_queue_get_insert_index(const struct efx_tx_queue *tx_queue)
+{
+	return tx_queue->insert_count & tx_queue->ptr_mask;
+}
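+
+/* Example (illustrative): assuming the usual power-of-two ring, a 1024-entry
+ * TX queue has ptr_mask = 1023, so insert_count = 1030 maps to slot
+ * 1030 & 1023 = 6.
+ */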
+
+/* Get a TX buffer. */
+static inline struct efx_tx_buffer *
+__efx_tx_queue_get_insert_buffer(const struct efx_tx_queue *tx_queue)
+{
+	return &tx_queue->buffer[efx_tx_queue_get_insert_index(tx_queue)];
+}
+
+/* Get a TX buffer, checking it's not currently in use. */
+static inline struct efx_tx_buffer *
+efx_tx_queue_get_insert_buffer(const struct efx_tx_queue *tx_queue)
+{
+	struct efx_tx_buffer *buffer =
+		__efx_tx_queue_get_insert_buffer(tx_queue);
+
+	EFX_WARN_ON_ONCE_PARANOID(buffer->len);
+	EFX_WARN_ON_ONCE_PARANOID(buffer->flags);
+	EFX_WARN_ON_ONCE_PARANOID(buffer->unmap_len);
+
+	return buffer;
+}
+
+#endif /* EFX_NET_DRIVER_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/nic.c
@@ -0,0 +1,580 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2013 Solarflare Communications Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/cpu_rmap.h>
+#include "net_driver.h"
+#include "bitfield.h"
+#include "efx.h"
+#include "nic.h"
+#include "ef10_regs.h"
+#include "farch_regs.h"
+#include "io.h"
+#include "workarounds.h"
+#include "mcdi_pcol.h"
+
+/**************************************************************************
+ *
+ * Generic buffer handling
+ * These buffers are used for interrupt status, MAC stats, etc.
+ *
+ **************************************************************************/
+
+int efx_nic_alloc_buffer(struct efx_nic *efx, struct efx_buffer *buffer,
+			 unsigned int len, gfp_t gfp_flags)
+{
+	buffer->addr = dma_alloc_coherent(&efx->pci_dev->dev, len,
+					  &buffer->dma_addr, gfp_flags);
+	if (!buffer->addr)
+		return -ENOMEM;
+	buffer->len = len;
+	return 0;
+}
+
+void efx_nic_free_buffer(struct efx_nic *efx, struct efx_buffer *buffer)
+{
+	if (buffer->addr) {
+		dma_free_coherent(&efx->pci_dev->dev, buffer->len,
+				  buffer->addr, buffer->dma_addr);
+		buffer->addr = NULL;
+	}
+}
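+
+/* Usage sketch (illustrative only): a caller needing a small DMA-coherent
+ * scratch area would pair the two helpers above, e.g.
+ *
+ *	struct efx_buffer buf;
+ *
+ *	if (efx_nic_alloc_buffer(efx, &buf, sizeof(efx_oword_t), GFP_KERNEL))
+ *		return -ENOMEM;
+ *	...
+ *	efx_nic_free_buffer(efx, &buf);
+ */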
+
+/* Check whether an event is present in the eventq at the current
+ * read pointer.  Only useful for self-test.
+ */
+bool efx_nic_event_present(struct efx_channel *channel)
+{
+	return efx_event_present(efx_event(channel, channel->eventq_read_ptr));
+}
+
+void efx_nic_event_test_start(struct efx_channel *channel)
+{
+	channel->event_test_cpu = -1;
+	smp_wmb();
+	channel->efx->type->ev_test_generate(channel);
+}
+
+int efx_nic_irq_test_start(struct efx_nic *efx)
+{
+	efx->last_irq_cpu = -1;
+	smp_wmb();
+	return efx->type->irq_test_generate(efx);
+}
+
+/* Hook interrupt handler(s)
+ * Try MSI and then legacy interrupts.
+ */
+int efx_nic_init_interrupt(struct efx_nic *efx)
+{
+	struct efx_channel *channel;
+	unsigned int n_irqs;
+	int rc;
+
+	if (!EFX_INT_MODE_USE_MSI(efx)) {
+		rc = request_irq(efx->legacy_irq,
+				 efx->type->irq_handle_legacy, IRQF_SHARED,
+				 efx->name, efx);
+		if (rc) {
+			netif_err(efx, drv, efx->net_dev,
+				  "failed to hook legacy IRQ %d\n",
+				  efx->pci_dev->irq);
+			goto fail1;
+		}
+		efx->irqs_hooked = true;
+		return 0;
+	}
+
+#ifdef CONFIG_RFS_ACCEL
+	if (efx->interrupt_mode == EFX_INT_MODE_MSIX) {
+		efx->net_dev->rx_cpu_rmap =
+			alloc_irq_cpu_rmap(efx->n_rx_channels);
+		if (!efx->net_dev->rx_cpu_rmap) {
+			rc = -ENOMEM;
+			goto fail1;
+		}
+	}
+#endif
+
+	/* Hook MSI or MSI-X interrupt */
+	n_irqs = 0;
+	efx_for_each_channel(channel, efx) {
+		rc = request_irq(channel->irq, efx->type->irq_handle_msi,
+				 IRQF_PROBE_SHARED, /* Not shared */
+				 efx->msi_context[channel->channel].name,
+				 &efx->msi_context[channel->channel]);
+		if (rc) {
+			netif_err(efx, drv, efx->net_dev,
+				  "failed to hook IRQ %d\n", channel->irq);
+			goto fail2;
+		}
+		++n_irqs;
+
+#ifdef CONFIG_RFS_ACCEL
+		if (efx->interrupt_mode == EFX_INT_MODE_MSIX &&
+		    channel->channel < efx->n_rx_channels) {
+			rc = irq_cpu_rmap_add(efx->net_dev->rx_cpu_rmap,
+					      channel->irq);
+			if (rc)
+				goto fail2;
+		}
+#endif
+	}
+
+	efx->irqs_hooked = true;
+	return 0;
+
+ fail2:
+#ifdef CONFIG_RFS_ACCEL
+	free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap);
+	efx->net_dev->rx_cpu_rmap = NULL;
+#endif
+	efx_for_each_channel(channel, efx) {
+		if (n_irqs-- == 0)
+			break;
+		free_irq(channel->irq, &efx->msi_context[channel->channel]);
+	}
+ fail1:
+	return rc;
+}
+
+void efx_nic_fini_interrupt(struct efx_nic *efx)
+{
+	struct efx_channel *channel;
+
+#ifdef CONFIG_RFS_ACCEL
+	free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap);
+	efx->net_dev->rx_cpu_rmap = NULL;
+#endif
+
+	if (!efx->irqs_hooked)
+		return;
+	if (EFX_INT_MODE_USE_MSI(efx)) {
+		/* Disable MSI/MSI-X interrupts */
+		efx_for_each_channel(channel, efx)
+			free_irq(channel->irq,
+				 &efx->msi_context[channel->channel]);
+	} else {
+		/* Disable legacy interrupt */
+		free_irq(efx->legacy_irq, efx);
+	}
+	efx->irqs_hooked = false;
+}
+
+/* Register dump */
+
+#define REGISTER_REVISION_FA	1
+#define REGISTER_REVISION_FB	2
+#define REGISTER_REVISION_FC	3
+#define REGISTER_REVISION_FZ	3	/* last Falcon arch revision */
+#define REGISTER_REVISION_ED	4
+#define REGISTER_REVISION_EZ	4	/* latest EF10 revision */
+
+struct efx_nic_reg {
+	u32 offset:24;
+	u32 min_revision:3, max_revision:3;
+};
+
+#define REGISTER(name, arch, min_rev, max_rev) {			\
+	arch ## R_ ## min_rev ## max_rev ## _ ## name,			\
+	REGISTER_REVISION_ ## arch ## min_rev,				\
+	REGISTER_REVISION_ ## arch ## max_rev				\
+}
+#define REGISTER_AA(name) REGISTER(name, F, A, A)
+#define REGISTER_AB(name) REGISTER(name, F, A, B)
+#define REGISTER_AZ(name) REGISTER(name, F, A, Z)
+#define REGISTER_BB(name) REGISTER(name, F, B, B)
+#define REGISTER_BZ(name) REGISTER(name, F, B, Z)
+#define REGISTER_CZ(name) REGISTER(name, F, C, Z)
+#define REGISTER_DZ(name) REGISTER(name, E, D, Z)
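+
+/* Expansion sketch (illustrative): REGISTER_BZ(RX_FILTER_CTL) pastes tokens
+ * into { FR_BZ_RX_FILTER_CTL, REGISTER_REVISION_FB, REGISTER_REVISION_FZ },
+ * i.e. the register offset with minimum revision 2 and maximum revision 3.
+ */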
+
+static const struct efx_nic_reg efx_nic_regs[] = {
+	REGISTER_AZ(ADR_REGION),
+	REGISTER_AZ(INT_EN_KER),
+	REGISTER_BZ(INT_EN_CHAR),
+	REGISTER_AZ(INT_ADR_KER),
+	REGISTER_BZ(INT_ADR_CHAR),
+	/* INT_ACK_KER is WO */
+	/* INT_ISR0 is RC */
+	REGISTER_AZ(HW_INIT),
+	REGISTER_CZ(USR_EV_CFG),
+	REGISTER_AB(EE_SPI_HCMD),
+	REGISTER_AB(EE_SPI_HADR),
+	REGISTER_AB(EE_SPI_HDATA),
+	REGISTER_AB(EE_BASE_PAGE),
+	REGISTER_AB(EE_VPD_CFG0),
+	/* EE_VPD_SW_CNTL and EE_VPD_SW_DATA are not used */
+	/* PMBX_DBG_IADDR and PBMX_DBG_IDATA are indirect */
+	/* PCIE_CORE_INDIRECT is indirect */
+	REGISTER_AB(NIC_STAT),
+	REGISTER_AB(GPIO_CTL),
+	REGISTER_AB(GLB_CTL),
+	/* FATAL_INTR_KER and FATAL_INTR_CHAR are partly RC */
+	REGISTER_BZ(DP_CTRL),
+	REGISTER_AZ(MEM_STAT),
+	REGISTER_AZ(CS_DEBUG),
+	REGISTER_AZ(ALTERA_BUILD),
+	REGISTER_AZ(CSR_SPARE),
+	REGISTER_AB(PCIE_SD_CTL0123),
+	REGISTER_AB(PCIE_SD_CTL45),
+	REGISTER_AB(PCIE_PCS_CTL_STAT),
+	/* DEBUG_DATA_OUT is not used */
+	/* DRV_EV is WO */
+	REGISTER_AZ(EVQ_CTL),
+	REGISTER_AZ(EVQ_CNT1),
+	REGISTER_AZ(EVQ_CNT2),
+	REGISTER_AZ(BUF_TBL_CFG),
+	REGISTER_AZ(SRM_RX_DC_CFG),
+	REGISTER_AZ(SRM_TX_DC_CFG),
+	REGISTER_AZ(SRM_CFG),
+	/* BUF_TBL_UPD is WO */
+	REGISTER_AZ(SRM_UPD_EVQ),
+	REGISTER_AZ(SRAM_PARITY),
+	REGISTER_AZ(RX_CFG),
+	REGISTER_BZ(RX_FILTER_CTL),
+	/* RX_FLUSH_DESCQ is WO */
+	REGISTER_AZ(RX_DC_CFG),
+	REGISTER_AZ(RX_DC_PF_WM),
+	REGISTER_BZ(RX_RSS_TKEY),
+	/* RX_NODESC_DROP is RC */
+	REGISTER_AA(RX_SELF_RST),
+	/* RX_DEBUG, RX_PUSH_DROP are not used */
+	REGISTER_CZ(RX_RSS_IPV6_REG1),
+	REGISTER_CZ(RX_RSS_IPV6_REG2),
+	REGISTER_CZ(RX_RSS_IPV6_REG3),
+	/* TX_FLUSH_DESCQ is WO */
+	REGISTER_AZ(TX_DC_CFG),
+	REGISTER_AA(TX_CHKSM_CFG),
+	REGISTER_AZ(TX_CFG),
+	/* TX_PUSH_DROP is not used */
+	REGISTER_AZ(TX_RESERVED),
+	REGISTER_BZ(TX_PACE),
+	/* TX_PACE_DROP_QID is RC */
+	REGISTER_BB(TX_VLAN),
+	REGISTER_BZ(TX_IPFIL_PORTEN),
+	REGISTER_AB(MD_TXD),
+	REGISTER_AB(MD_RXD),
+	REGISTER_AB(MD_CS),
+	REGISTER_AB(MD_PHY_ADR),
+	REGISTER_AB(MD_ID),
+	/* MD_STAT is RC */
+	REGISTER_AB(MAC_STAT_DMA),
+	REGISTER_AB(MAC_CTRL),
+	REGISTER_BB(GEN_MODE),
+	REGISTER_AB(MAC_MC_HASH_REG0),
+	REGISTER_AB(MAC_MC_HASH_REG1),
+	REGISTER_AB(GM_CFG1),
+	REGISTER_AB(GM_CFG2),
+	/* GM_IPG and GM_HD are not used */
+	REGISTER_AB(GM_MAX_FLEN),
+	/* GM_TEST is not used */
+	REGISTER_AB(GM_ADR1),
+	REGISTER_AB(GM_ADR2),
+	REGISTER_AB(GMF_CFG0),
+	REGISTER_AB(GMF_CFG1),
+	REGISTER_AB(GMF_CFG2),
+	REGISTER_AB(GMF_CFG3),
+	REGISTER_AB(GMF_CFG4),
+	REGISTER_AB(GMF_CFG5),
+	REGISTER_BB(TX_SRC_MAC_CTL),
+	REGISTER_AB(XM_ADR_LO),
+	REGISTER_AB(XM_ADR_HI),
+	REGISTER_AB(XM_GLB_CFG),
+	REGISTER_AB(XM_TX_CFG),
+	REGISTER_AB(XM_RX_CFG),
+	REGISTER_AB(XM_MGT_INT_MASK),
+	REGISTER_AB(XM_FC),
+	REGISTER_AB(XM_PAUSE_TIME),
+	REGISTER_AB(XM_TX_PARAM),
+	REGISTER_AB(XM_RX_PARAM),
+	/* XM_MGT_INT_MSK (note no 'A') is RC */
+	REGISTER_AB(XX_PWR_RST),
+	REGISTER_AB(XX_SD_CTL),
+	REGISTER_AB(XX_TXDRV_CTL),
+	/* XX_PRBS_CTL, XX_PRBS_CHK and XX_PRBS_ERR are not used */
+	/* XX_CORE_STAT is partly RC */
+	REGISTER_DZ(BIU_HW_REV_ID),
+	REGISTER_DZ(MC_DB_LWRD),
+	REGISTER_DZ(MC_DB_HWRD),
+};
+
+struct efx_nic_reg_table {
+	u32 offset:24;
+	u32 min_revision:3, max_revision:3;
+	u32 step:6, rows:21;
+};
+
+#define REGISTER_TABLE_DIMENSIONS(_, offset, arch, min_rev, max_rev, step, rows) { \
+	offset,								\
+	REGISTER_REVISION_ ## arch ## min_rev,				\
+	REGISTER_REVISION_ ## arch ## max_rev,				\
+	step, rows							\
+}
+#define REGISTER_TABLE(name, arch, min_rev, max_rev)			\
+	REGISTER_TABLE_DIMENSIONS(					\
+		name, arch ## R_ ## min_rev ## max_rev ## _ ## name,	\
+		arch, min_rev, max_rev,					\
+		arch ## R_ ## min_rev ## max_rev ## _ ## name ## _STEP,	\
+		arch ## R_ ## min_rev ## max_rev ## _ ## name ## _ROWS)
+#define REGISTER_TABLE_AA(name) REGISTER_TABLE(name, F, A, A)
+#define REGISTER_TABLE_AZ(name) REGISTER_TABLE(name, F, A, Z)
+#define REGISTER_TABLE_BB(name) REGISTER_TABLE(name, F, B, B)
+#define REGISTER_TABLE_BZ(name) REGISTER_TABLE(name, F, B, Z)
+#define REGISTER_TABLE_BB_CZ(name)					\
+	REGISTER_TABLE_DIMENSIONS(name, FR_BZ_ ## name, F, B, B,	\
+				  FR_BZ_ ## name ## _STEP,		\
+				  FR_BB_ ## name ## _ROWS),		\
+	REGISTER_TABLE_DIMENSIONS(name, FR_BZ_ ## name, F, C, Z,	\
+				  FR_BZ_ ## name ## _STEP,		\
+				  FR_CZ_ ## name ## _ROWS)
+#define REGISTER_TABLE_CZ(name) REGISTER_TABLE(name, F, C, Z)
+#define REGISTER_TABLE_DZ(name) REGISTER_TABLE(name, E, D, Z)
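+/* e.g. REGISTER_TABLE_BZ(RX_INDIRECTION_TBL) describes the table at
+ * FR_BZ_RX_INDIRECTION_TBL using FR_BZ_RX_INDIRECTION_TBL_STEP and
+ * FR_BZ_RX_INDIRECTION_TBL_ROWS.
+ */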
+
+static const struct efx_nic_reg_table efx_nic_reg_tables[] = {
+	/* DRIVER is not used */
+	/* EVQ_RPTR, TIMER_COMMAND, USR_EV and {RX,TX}_DESC_UPD are WO */
+	REGISTER_TABLE_BB(TX_IPFIL_TBL),
+	REGISTER_TABLE_BB(TX_SRC_MAC_TBL),
+	REGISTER_TABLE_AA(RX_DESC_PTR_TBL_KER),
+	REGISTER_TABLE_BB_CZ(RX_DESC_PTR_TBL),
+	REGISTER_TABLE_AA(TX_DESC_PTR_TBL_KER),
+	REGISTER_TABLE_BB_CZ(TX_DESC_PTR_TBL),
+	REGISTER_TABLE_AA(EVQ_PTR_TBL_KER),
+	REGISTER_TABLE_BB_CZ(EVQ_PTR_TBL),
+	/* We can't reasonably read all of the buffer table (up to 8MB!).
+	 * However, this driver will only use a few entries.  Reading
+	 * 1K entries allows for some expansion of queue count and
+	 * size before we need to change the version. */
+	REGISTER_TABLE_DIMENSIONS(BUF_FULL_TBL_KER, FR_AA_BUF_FULL_TBL_KER,
+				  F, A, A, 8, 1024),
+	REGISTER_TABLE_DIMENSIONS(BUF_FULL_TBL, FR_BZ_BUF_FULL_TBL,
+				  F, B, Z, 8, 1024),
+	REGISTER_TABLE_CZ(RX_MAC_FILTER_TBL0),
+	REGISTER_TABLE_BB_CZ(TIMER_TBL),
+	REGISTER_TABLE_BB_CZ(TX_PACE_TBL),
+	REGISTER_TABLE_BZ(RX_INDIRECTION_TBL),
+	/* TX_FILTER_TBL0 is huge and not used by this driver */
+	REGISTER_TABLE_CZ(TX_MAC_FILTER_TBL0),
+	REGISTER_TABLE_CZ(MC_TREG_SMEM),
+	/* MSIX_PBA_TABLE is not mapped */
+	/* SRM_DBG is not mapped (and is redundant with BUF_FULL_TBL) */
+	REGISTER_TABLE_BZ(RX_FILTER_TBL0),
+	REGISTER_TABLE_DZ(BIU_MC_SFT_STATUS),
+};
+
+size_t efx_nic_get_regs_len(struct efx_nic *efx)
+{
+	const struct efx_nic_reg *reg;
+	const struct efx_nic_reg_table *table;
+	size_t len = 0;
+
+	for (reg = efx_nic_regs;
+	     reg < efx_nic_regs + ARRAY_SIZE(efx_nic_regs);
+	     reg++)
+		if (efx->type->revision >= reg->min_revision &&
+		    efx->type->revision <= reg->max_revision)
+			len += sizeof(efx_oword_t);
+
+	for (table = efx_nic_reg_tables;
+	     table < efx_nic_reg_tables + ARRAY_SIZE(efx_nic_reg_tables);
+	     table++)
+		if (efx->type->revision >= table->min_revision &&
+		    efx->type->revision <= table->max_revision)
+			len += table->rows * min_t(size_t, table->step, 16);
+
+	return len;
+}
+
+void efx_nic_get_regs(struct efx_nic *efx, void *buf)
+{
+	const struct efx_nic_reg *reg;
+	const struct efx_nic_reg_table *table;
+
+	for (reg = efx_nic_regs;
+	     reg < efx_nic_regs + ARRAY_SIZE(efx_nic_regs);
+	     reg++) {
+		if (efx->type->revision >= reg->min_revision &&
+		    efx->type->revision <= reg->max_revision) {
+			efx_reado(efx, (efx_oword_t *)buf, reg->offset);
+			buf += sizeof(efx_oword_t);
+		}
+	}
+
+	for (table = efx_nic_reg_tables;
+	     table < efx_nic_reg_tables + ARRAY_SIZE(efx_nic_reg_tables);
+	     table++) {
+		size_t size, i;
+
+		if (!(efx->type->revision >= table->min_revision &&
+		      efx->type->revision <= table->max_revision))
+			continue;
+
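+		/* Each row contributes at most 16 bytes, matching the space
+		 * reserved per row in efx_nic_get_regs_len().
+		 */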
+		size = min_t(size_t, table->step, 16);
+
+		for (i = 0; i < table->rows; i++) {
+			switch (table->step) {
+			case 4: /* 32-bit SRAM */
+				efx_readd(efx, buf, table->offset + 4 * i);
+				break;
+			case 8: /* 64-bit SRAM */
+				efx_sram_readq(efx,
+					       efx->membase + table->offset,
+					       buf, i);
+				break;
+			case 16: /* 128-bit-readable register */
+				efx_reado_table(efx, buf, table->offset, i);
+				break;
+			case 32: /* 128-bit register, interleaved */
+				efx_reado_table(efx, buf, table->offset, 2 * i);
+				break;
+			default:
+				WARN_ON(1);
+				return;
+			}
+			buf += size;
+		}
+	}
+}
+
+/**
+ * efx_nic_describe_stats - Describe supported statistics for ethtool
+ * @desc: Array of &struct efx_hw_stat_desc describing the statistics
+ * @count: Length of the @desc array
+ * @mask: Bitmask of which elements of @desc are enabled
+ * @names: Buffer to copy names to, or %NULL.  The names are copied
+ *	starting at intervals of %ETH_GSTRING_LEN bytes.
+ *
+ * Returns the number of visible statistics, i.e. the number of set
+ * bits in the first @count bits of @mask for which a name is defined.
+ */
+size_t efx_nic_describe_stats(const struct efx_hw_stat_desc *desc, size_t count,
+			      const unsigned long *mask, u8 *names)
+{
+	size_t visible = 0;
+	size_t index;
+
+	for_each_set_bit(index, mask, count) {
+		if (desc[index].name) {
+			if (names) {
+				strlcpy(names, desc[index].name,
+					ETH_GSTRING_LEN);
+				names += ETH_GSTRING_LEN;
+			}
+			++visible;
+		}
+	}
+
+	return visible;
+}
+
+/**
+ * efx_nic_copy_stats - Copy stats from the DMA buffer into an
+ *	intermediate buffer. This is used to get a consistent
+ *	set of stats while the DMA buffer can be written at any time
+ *	by the NIC.
+ * @efx: The associated NIC.
+ * @dest: Destination buffer. Must be the same size as the DMA buffer.
+ */
+int efx_nic_copy_stats(struct efx_nic *efx, __le64 *dest)
+{
+	__le64 *dma_stats = efx->stats_buffer.addr;
+	__le64 generation_start, generation_end;
+	int rc = 0, retry;
+
+	if (!dest)
+		return 0;
+
+	if (!dma_stats)
+		goto return_zeroes;
+
+	/* If we're unlucky enough to read statistics during the DMA, wait
+	 * up to 10ms for it to finish (typically takes <500us)
+	 */
+	for (retry = 0; retry < 100; ++retry) {
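+		/* Seqlock-style check: read the generation count at the end
+		 * of the buffer, copy the stats, then compare with the count
+		 * at the start.  A mismatch means the MC updated the buffer
+		 * mid-copy, so retry.
+		 */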
+		generation_end = dma_stats[efx->num_mac_stats - 1];
+		if (generation_end == EFX_MC_STATS_GENERATION_INVALID)
+			goto return_zeroes;
+		rmb();
+		memcpy(dest, dma_stats, efx->num_mac_stats * sizeof(__le64));
+		rmb();
+		generation_start = dma_stats[MC_CMD_MAC_GENERATION_START];
+		if (generation_end == generation_start)
+			return 0; /* return good data */
+		udelay(100);
+	}
+
+	rc = -EIO;
+
+return_zeroes:
+	memset(dest, 0, efx->num_mac_stats * sizeof(u64));
+	return rc;
+}
+
+/**
+ * efx_nic_update_stats - Convert statistics DMA buffer to array of u64
+ * @desc: Array of &struct efx_hw_stat_desc describing the DMA buffer
+ *	layout.  DMA widths of 0, 16, 32 and 64 are supported; where
+ *	the width is specified as 0 the corresponding element of
+ *	@stats is not updated.
+ * @count: Length of the @desc array
+ * @mask: Bitmask of which elements of @desc are enabled
+ * @stats: Buffer to update with the converted statistics.  The length
+ *	of this array must be at least @count.
+ * @dma_buf: DMA buffer containing hardware statistics
+ * @accumulate: If set, the converted values will be added rather than
+ *	directly stored to the corresponding elements of @stats
+ */
+void efx_nic_update_stats(const struct efx_hw_stat_desc *desc, size_t count,
+			  const unsigned long *mask,
+			  u64 *stats, const void *dma_buf, bool accumulate)
+{
+	size_t index;
+
+	for_each_set_bit(index, mask, count) {
+		if (desc[index].dma_width) {
+			const void *addr = dma_buf + desc[index].offset;
+			u64 val;
+
+			switch (desc[index].dma_width) {
+			case 16:
+				val = le16_to_cpup((__le16 *)addr);
+				break;
+			case 32:
+				val = le32_to_cpup((__le32 *)addr);
+				break;
+			case 64:
+				val = le64_to_cpup((__le64 *)addr);
+				break;
+			default:
+				WARN_ON(1);
+				val = 0;
+				break;
+			}
+
+			if (accumulate)
+				stats[index] += val;
+			else
+				stats[index] = val;
+		}
+	}
+}
+
+void efx_nic_fix_nodesc_drop_stat(struct efx_nic *efx, u64 *rx_nodesc_drops)
+{
+	/* if down, or this is the first update after coming up */
+	if (!(efx->net_dev->flags & IFF_UP) || !efx->rx_nodesc_drops_prev_state)
+		efx->rx_nodesc_drops_while_down +=
+			*rx_nodesc_drops - efx->rx_nodesc_drops_total;
+	efx->rx_nodesc_drops_total = *rx_nodesc_drops;
+	efx->rx_nodesc_drops_prev_state = !!(efx->net_dev->flags & IFF_UP);
+	*rx_nodesc_drops -= efx->rx_nodesc_drops_while_down;
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/nic.h
@@ -0,0 +1,392 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2013 Solarflare Communications Inc.
+ */
+
+#ifndef EFX_NIC_H
+#define EFX_NIC_H
+
+#include "nic_common.h"
+#include "efx.h"
+
+u32 efx_farch_fpga_ver(struct efx_nic *efx);
+
+enum {
+	PHY_TYPE_NONE = 0,
+	PHY_TYPE_TXC43128 = 1,
+	PHY_TYPE_88E1111 = 2,
+	PHY_TYPE_SFX7101 = 3,
+	PHY_TYPE_QT2022C2 = 4,
+	PHY_TYPE_PM8358 = 6,
+	PHY_TYPE_SFT9001A = 8,
+	PHY_TYPE_QT2025C = 9,
+	PHY_TYPE_SFT9001B = 10,
+};
+
+enum {
+	SIENA_STAT_tx_bytes = GENERIC_STAT_COUNT,
+	SIENA_STAT_tx_good_bytes,
+	SIENA_STAT_tx_bad_bytes,
+	SIENA_STAT_tx_packets,
+	SIENA_STAT_tx_bad,
+	SIENA_STAT_tx_pause,
+	SIENA_STAT_tx_control,
+	SIENA_STAT_tx_unicast,
+	SIENA_STAT_tx_multicast,
+	SIENA_STAT_tx_broadcast,
+	SIENA_STAT_tx_lt64,
+	SIENA_STAT_tx_64,
+	SIENA_STAT_tx_65_to_127,
+	SIENA_STAT_tx_128_to_255,
+	SIENA_STAT_tx_256_to_511,
+	SIENA_STAT_tx_512_to_1023,
+	SIENA_STAT_tx_1024_to_15xx,
+	SIENA_STAT_tx_15xx_to_jumbo,
+	SIENA_STAT_tx_gtjumbo,
+	SIENA_STAT_tx_collision,
+	SIENA_STAT_tx_single_collision,
+	SIENA_STAT_tx_multiple_collision,
+	SIENA_STAT_tx_excessive_collision,
+	SIENA_STAT_tx_deferred,
+	SIENA_STAT_tx_late_collision,
+	SIENA_STAT_tx_excessive_deferred,
+	SIENA_STAT_tx_non_tcpudp,
+	SIENA_STAT_tx_mac_src_error,
+	SIENA_STAT_tx_ip_src_error,
+	SIENA_STAT_rx_bytes,
+	SIENA_STAT_rx_good_bytes,
+	SIENA_STAT_rx_bad_bytes,
+	SIENA_STAT_rx_packets,
+	SIENA_STAT_rx_good,
+	SIENA_STAT_rx_bad,
+	SIENA_STAT_rx_pause,
+	SIENA_STAT_rx_control,
+	SIENA_STAT_rx_unicast,
+	SIENA_STAT_rx_multicast,
+	SIENA_STAT_rx_broadcast,
+	SIENA_STAT_rx_lt64,
+	SIENA_STAT_rx_64,
+	SIENA_STAT_rx_65_to_127,
+	SIENA_STAT_rx_128_to_255,
+	SIENA_STAT_rx_256_to_511,
+	SIENA_STAT_rx_512_to_1023,
+	SIENA_STAT_rx_1024_to_15xx,
+	SIENA_STAT_rx_15xx_to_jumbo,
+	SIENA_STAT_rx_gtjumbo,
+	SIENA_STAT_rx_bad_gtjumbo,
+	SIENA_STAT_rx_overflow,
+	SIENA_STAT_rx_false_carrier,
+	SIENA_STAT_rx_symbol_error,
+	SIENA_STAT_rx_align_error,
+	SIENA_STAT_rx_length_error,
+	SIENA_STAT_rx_internal_error,
+	SIENA_STAT_rx_nodesc_drop_cnt,
+	SIENA_STAT_COUNT
+};
+
+/**
+ * struct siena_nic_data - Siena NIC state
+ * @efx: Pointer back to main interface structure
+ * @wol_filter_id: Wake-on-LAN packet filter id
+ * @stats: Hardware statistics
+ * @vf: Array of &struct siena_vf objects
+ * @vf_buftbl_base: The zeroth buffer table index used to back VF queues.
+ * @vfdi_status: Common VFDI status page to be dmad to VF address space.
+ * @local_addr_list: List of local addresses. Protected by %local_lock.
+ * @local_page_list: List of DMA addressable pages used to broadcast
+ *	%local_addr_list. Protected by %local_lock.
+ * @local_lock: Mutex protecting %local_addr_list and %local_page_list.
+ * @peer_work: Work item to broadcast peer addresses to VMs.
+ */
+struct siena_nic_data {
+	struct efx_nic *efx;
+	int wol_filter_id;
+	u64 stats[SIENA_STAT_COUNT];
+#ifdef CONFIG_SFC_SRIOV
+	struct siena_vf *vf;
+	struct efx_channel *vfdi_channel;
+	unsigned vf_buftbl_base;
+	struct efx_buffer vfdi_status;
+	struct list_head local_addr_list;
+	struct list_head local_page_list;
+	struct mutex local_lock;
+	struct work_struct peer_work;
+#endif
+};
+
+enum {
+	EF10_STAT_port_tx_bytes = GENERIC_STAT_COUNT,
+	EF10_STAT_port_tx_packets,
+	EF10_STAT_port_tx_pause,
+	EF10_STAT_port_tx_control,
+	EF10_STAT_port_tx_unicast,
+	EF10_STAT_port_tx_multicast,
+	EF10_STAT_port_tx_broadcast,
+	EF10_STAT_port_tx_lt64,
+	EF10_STAT_port_tx_64,
+	EF10_STAT_port_tx_65_to_127,
+	EF10_STAT_port_tx_128_to_255,
+	EF10_STAT_port_tx_256_to_511,
+	EF10_STAT_port_tx_512_to_1023,
+	EF10_STAT_port_tx_1024_to_15xx,
+	EF10_STAT_port_tx_15xx_to_jumbo,
+	EF10_STAT_port_rx_bytes,
+	EF10_STAT_port_rx_bytes_minus_good_bytes,
+	EF10_STAT_port_rx_good_bytes,
+	EF10_STAT_port_rx_bad_bytes,
+	EF10_STAT_port_rx_packets,
+	EF10_STAT_port_rx_good,
+	EF10_STAT_port_rx_bad,
+	EF10_STAT_port_rx_pause,
+	EF10_STAT_port_rx_control,
+	EF10_STAT_port_rx_unicast,
+	EF10_STAT_port_rx_multicast,
+	EF10_STAT_port_rx_broadcast,
+	EF10_STAT_port_rx_lt64,
+	EF10_STAT_port_rx_64,
+	EF10_STAT_port_rx_65_to_127,
+	EF10_STAT_port_rx_128_to_255,
+	EF10_STAT_port_rx_256_to_511,
+	EF10_STAT_port_rx_512_to_1023,
+	EF10_STAT_port_rx_1024_to_15xx,
+	EF10_STAT_port_rx_15xx_to_jumbo,
+	EF10_STAT_port_rx_gtjumbo,
+	EF10_STAT_port_rx_bad_gtjumbo,
+	EF10_STAT_port_rx_overflow,
+	EF10_STAT_port_rx_align_error,
+	EF10_STAT_port_rx_length_error,
+	EF10_STAT_port_rx_nodesc_drops,
+	EF10_STAT_port_rx_pm_trunc_bb_overflow,
+	EF10_STAT_port_rx_pm_discard_bb_overflow,
+	EF10_STAT_port_rx_pm_trunc_vfifo_full,
+	EF10_STAT_port_rx_pm_discard_vfifo_full,
+	EF10_STAT_port_rx_pm_trunc_qbb,
+	EF10_STAT_port_rx_pm_discard_qbb,
+	EF10_STAT_port_rx_pm_discard_mapping,
+	EF10_STAT_port_rx_dp_q_disabled_packets,
+	EF10_STAT_port_rx_dp_di_dropped_packets,
+	EF10_STAT_port_rx_dp_streaming_packets,
+	EF10_STAT_port_rx_dp_hlb_fetch,
+	EF10_STAT_port_rx_dp_hlb_wait,
+	EF10_STAT_rx_unicast,
+	EF10_STAT_rx_unicast_bytes,
+	EF10_STAT_rx_multicast,
+	EF10_STAT_rx_multicast_bytes,
+	EF10_STAT_rx_broadcast,
+	EF10_STAT_rx_broadcast_bytes,
+	EF10_STAT_rx_bad,
+	EF10_STAT_rx_bad_bytes,
+	EF10_STAT_rx_overflow,
+	EF10_STAT_tx_unicast,
+	EF10_STAT_tx_unicast_bytes,
+	EF10_STAT_tx_multicast,
+	EF10_STAT_tx_multicast_bytes,
+	EF10_STAT_tx_broadcast,
+	EF10_STAT_tx_broadcast_bytes,
+	EF10_STAT_tx_bad,
+	EF10_STAT_tx_bad_bytes,
+	EF10_STAT_tx_overflow,
+	EF10_STAT_V1_COUNT,
+	EF10_STAT_fec_uncorrected_errors = EF10_STAT_V1_COUNT,
+	EF10_STAT_fec_corrected_errors,
+	EF10_STAT_fec_corrected_symbols_lane0,
+	EF10_STAT_fec_corrected_symbols_lane1,
+	EF10_STAT_fec_corrected_symbols_lane2,
+	EF10_STAT_fec_corrected_symbols_lane3,
+	EF10_STAT_ctpio_vi_busy_fallback,
+	EF10_STAT_ctpio_long_write_success,
+	EF10_STAT_ctpio_missing_dbell_fail,
+	EF10_STAT_ctpio_overflow_fail,
+	EF10_STAT_ctpio_underflow_fail,
+	EF10_STAT_ctpio_timeout_fail,
+	EF10_STAT_ctpio_noncontig_wr_fail,
+	EF10_STAT_ctpio_frm_clobber_fail,
+	EF10_STAT_ctpio_invalid_wr_fail,
+	EF10_STAT_ctpio_vi_clobber_fallback,
+	EF10_STAT_ctpio_unqualified_fallback,
+	EF10_STAT_ctpio_runt_fallback,
+	EF10_STAT_ctpio_success,
+	EF10_STAT_ctpio_fallback,
+	EF10_STAT_ctpio_poison,
+	EF10_STAT_ctpio_erase,
+	EF10_STAT_COUNT
+};
+
+/* Maximum number of TX PIO buffers we may allocate to a function.
+ * This matches the total number of buffers on each SFC9100-family
+ * controller.
+ */
+#define EF10_TX_PIOBUF_COUNT 16
+
+/**
+ * struct efx_ef10_nic_data - EF10 architecture NIC state
+ * @mcdi_buf: DMA buffer for MCDI
+ * @warm_boot_count: Last seen MC warm boot count
+ * @vi_base: Absolute index of first VI in this function
+ * @n_allocated_vis: Number of VIs allocated to this function
+ * @n_piobufs: Number of PIO buffers allocated to this function
+ * @wc_membase: Base address of write-combining mapping of the memory BAR
+ * @pio_write_base: Base address for writing PIO buffers
+ * @pio_write_vi_base: Relative VI number for @pio_write_base
+ * @piobuf_handle: Handle of each PIO buffer allocated
+ * @piobuf_size: size of a single PIO buffer
+ * @must_restore_piobufs: Flag: PIO buffers have yet to be restored after MC
+ *	reboot
+ * @mc_stats: Scratch buffer for converting statistics to the kernel's format
+ * @stats: Hardware statistics
+ * @workaround_35388: Flag: firmware supports workaround for bug 35388
+ * @workaround_26807: Flag: firmware supports workaround for bug 26807
+ * @workaround_61265: Flag: firmware supports workaround for bug 61265
+ * @must_check_datapath_caps: Flag: @datapath_caps needs to be revalidated
+ *	after MC reboot
+ * @datapath_caps: Capabilities of datapath firmware (FLAGS1 field of
+ *	%MC_CMD_GET_CAPABILITIES response)
+ * @datapath_caps2: Further capabilities of datapath firmware (FLAGS2 field of
+ *	%MC_CMD_GET_CAPABILITIES response)
+ * @rx_dpcpu_fw_id: Firmware ID of the RxDPCPU
+ * @tx_dpcpu_fw_id: Firmware ID of the TxDPCPU
+ * @must_probe_vswitching: Flag: vswitching has yet to be setup after MC reboot
+ * @pf_index: The number for this PF, or the parent PF if this is a VF
+#ifdef CONFIG_SFC_SRIOV
+ * @vf: Pointer to VF data structure
+#endif
+ * @vport_mac: The MAC address on the vport, only for PFs; VFs will be zero
+ * @vlan_list: List of VLANs added over the interface. Serialised by vlan_lock.
+ * @vlan_lock: Lock to serialize access to vlan_list.
+ * @udp_tunnels: UDP tunnel port numbers and types.
+ * @udp_tunnels_dirty: flag indicating a reboot occurred while pushing
+ *	@udp_tunnels to hardware and thus the push must be re-done.
+ * @udp_tunnels_lock: Serialises writes to @udp_tunnels and @udp_tunnels_dirty.
+ */
+struct efx_ef10_nic_data {
+	struct efx_buffer mcdi_buf;
+	u16 warm_boot_count;
+	unsigned int vi_base;
+	unsigned int n_allocated_vis;
+	unsigned int n_piobufs;
+	void __iomem *wc_membase, *pio_write_base;
+	unsigned int pio_write_vi_base;
+	unsigned int piobuf_handle[EF10_TX_PIOBUF_COUNT];
+	u16 piobuf_size;
+	bool must_restore_piobufs;
+	__le64 *mc_stats;
+	u64 stats[EF10_STAT_COUNT];
+	bool workaround_35388;
+	bool workaround_26807;
+	bool workaround_61265;
+	bool must_check_datapath_caps;
+	u32 datapath_caps;
+	u32 datapath_caps2;
+	unsigned int rx_dpcpu_fw_id;
+	unsigned int tx_dpcpu_fw_id;
+	bool must_probe_vswitching;
+	unsigned int pf_index;
+	u8 port_id[ETH_ALEN];
+#ifdef CONFIG_SFC_SRIOV
+	unsigned int vf_index;
+	struct ef10_vf *vf;
+#endif
+	u8 vport_mac[ETH_ALEN];
+	struct list_head vlan_list;
+	struct mutex vlan_lock;
+	struct efx_udp_tunnel udp_tunnels[16];
+	bool udp_tunnels_dirty;
+	struct mutex udp_tunnels_lock;
+	u64 licensed_features;
+};
+
+/* TSOv2 */
+int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
+			 bool *data_mapped);
+
+extern const struct efx_nic_type efx_hunt_a0_nic_type;
+extern const struct efx_nic_type efx_hunt_a0_vf_nic_type;
+
+int falcon_probe_board(struct efx_nic *efx, u16 revision_info);
+
+/* Falcon/Siena queue operations */
+int efx_farch_tx_probe(struct efx_tx_queue *tx_queue);
+void efx_farch_tx_init(struct efx_tx_queue *tx_queue);
+void efx_farch_tx_fini(struct efx_tx_queue *tx_queue);
+void efx_farch_tx_remove(struct efx_tx_queue *tx_queue);
+void efx_farch_tx_write(struct efx_tx_queue *tx_queue);
+unsigned int efx_farch_tx_limit_len(struct efx_tx_queue *tx_queue,
+				    dma_addr_t dma_addr, unsigned int len);
+int efx_farch_rx_probe(struct efx_rx_queue *rx_queue);
+void efx_farch_rx_init(struct efx_rx_queue *rx_queue);
+void efx_farch_rx_fini(struct efx_rx_queue *rx_queue);
+void efx_farch_rx_remove(struct efx_rx_queue *rx_queue);
+void efx_farch_rx_write(struct efx_rx_queue *rx_queue);
+void efx_farch_rx_defer_refill(struct efx_rx_queue *rx_queue);
+int efx_farch_ev_probe(struct efx_channel *channel);
+int efx_farch_ev_init(struct efx_channel *channel);
+void efx_farch_ev_fini(struct efx_channel *channel);
+void efx_farch_ev_remove(struct efx_channel *channel);
+int efx_farch_ev_process(struct efx_channel *channel, int quota);
+void efx_farch_ev_read_ack(struct efx_channel *channel);
+void efx_farch_ev_test_generate(struct efx_channel *channel);
+
+/* Falcon/Siena filter operations */
+int efx_farch_filter_table_probe(struct efx_nic *efx);
+void efx_farch_filter_table_restore(struct efx_nic *efx);
+void efx_farch_filter_table_remove(struct efx_nic *efx);
+void efx_farch_filter_update_rx_scatter(struct efx_nic *efx);
+s32 efx_farch_filter_insert(struct efx_nic *efx, struct efx_filter_spec *spec,
+			    bool replace);
+int efx_farch_filter_remove_safe(struct efx_nic *efx,
+				 enum efx_filter_priority priority,
+				 u32 filter_id);
+int efx_farch_filter_get_safe(struct efx_nic *efx,
+			      enum efx_filter_priority priority, u32 filter_id,
+			      struct efx_filter_spec *);
+int efx_farch_filter_clear_rx(struct efx_nic *efx,
+			      enum efx_filter_priority priority);
+u32 efx_farch_filter_count_rx_used(struct efx_nic *efx,
+				   enum efx_filter_priority priority);
+u32 efx_farch_filter_get_rx_id_limit(struct efx_nic *efx);
+s32 efx_farch_filter_get_rx_ids(struct efx_nic *efx,
+				enum efx_filter_priority priority, u32 *buf,
+				u32 size);
+#ifdef CONFIG_RFS_ACCEL
+bool efx_farch_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id,
+				     unsigned int index);
+#endif
+void efx_farch_filter_sync_rx_mode(struct efx_nic *efx);
+
+/* Falcon/Siena interrupts */
+void efx_farch_irq_enable_master(struct efx_nic *efx);
+int efx_farch_irq_test_generate(struct efx_nic *efx);
+void efx_farch_irq_disable_master(struct efx_nic *efx);
+irqreturn_t efx_farch_msi_interrupt(int irq, void *dev_id);
+irqreturn_t efx_farch_legacy_interrupt(int irq, void *dev_id);
+irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx);
+
+/* Global Resources */
+void siena_prepare_flush(struct efx_nic *efx);
+int efx_farch_fini_dmaq(struct efx_nic *efx);
+void efx_farch_finish_flr(struct efx_nic *efx);
+void siena_finish_flush(struct efx_nic *efx);
+void falcon_start_nic_stats(struct efx_nic *efx);
+void falcon_stop_nic_stats(struct efx_nic *efx);
+int falcon_reset_xaui(struct efx_nic *efx);
+void efx_farch_dimension_resources(struct efx_nic *efx, unsigned sram_lim_qw);
+void efx_farch_init_common(struct efx_nic *efx);
+void efx_farch_rx_push_indir_table(struct efx_nic *efx);
+void efx_farch_rx_pull_indir_table(struct efx_nic *efx);
+
+/* Tests */
+struct efx_farch_register_test {
+	unsigned address;
+	efx_oword_t mask;
+};
+
+int efx_farch_test_registers(struct efx_nic *efx,
+			     const struct efx_farch_register_test *regs,
+			     size_t n_regs);
+
+void efx_farch_generate_event(struct efx_nic *efx, unsigned int evq,
+			      efx_qword_t *event);
+
+#endif /* EFX_NIC_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/nic_common.h
@@ -0,0 +1,262 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2013 Solarflare Communications Inc.
+ * Copyright 2019-2020 Xilinx Inc.
+ */
+
+#ifndef EFX_NIC_COMMON_H
+#define EFX_NIC_COMMON_H
+
+#include "net_driver.h"
+#include "efx_common.h"
+#include "mcdi.h"
+#include "ptp.h"
+
+enum {
+	/* Revisions 0-2 were Falcon A0, A1 and B0 respectively.
+	 * They are not supported by this driver but these revision numbers
+	 * form part of the ethtool API for register dumping.
+	 */
+	EFX_REV_SIENA_A0 = 3,
+	EFX_REV_HUNT_A0 = 4,
+	EFX_REV_EF100 = 5,
+};
+
+static inline int efx_nic_rev(struct efx_nic *efx)
+{
+	return efx->type->revision;
+}
+
+/* Read the current event from the event queue */
+static inline efx_qword_t *efx_event(struct efx_channel *channel,
+				     unsigned int index)
+{
+	return ((efx_qword_t *) (channel->eventq.buf.addr)) +
+		(index & channel->eventq_mask);
+}
+
+/* See if an event is present
+ *
+ * We check both the high and low dword of the event for all ones.  We
+ * wrote all ones when we cleared the event, and no valid event can
+ * have all ones in either its high or low dwords.  This approach is
+ * robust against reordering.
+ *
+ * Note that using a single 64-bit comparison is incorrect; even
+ * though the CPU read will be atomic, the DMA write may not be.
+ */
+static inline int efx_event_present(efx_qword_t *event)
+{
+	return !(EFX_DWORD_IS_ALL_ONES(event->dword[0]) |
+		  EFX_DWORD_IS_ALL_ONES(event->dword[1]));
+}
+
+/* Returns a pointer to the specified transmit descriptor in the TX
+ * descriptor queue belonging to the specified channel.
+ */
+static inline efx_qword_t *
+efx_tx_desc(struct efx_tx_queue *tx_queue, unsigned int index)
+{
+	return ((efx_qword_t *) (tx_queue->txd.buf.addr)) + index;
+}
+
+/* Report whether this TX queue would be empty for the given write_count.
+ * May return false negative.
+ */
+static inline bool efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue, unsigned int write_count)
+{
+	unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count);
+
+	if (empty_read_count == 0)
+		return false;
+
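+	/* The queue is still empty only if nothing has been queued since it
+	 * was last observed empty, i.e. write_count (ignoring the VALID flag
+	 * bit) still matches the count recorded at that point.
+	 */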
+	return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0;
+}
+
+int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
+			bool *data_mapped);
+
+/* Decide whether to push a TX descriptor to the NIC vs merely writing
+ * the doorbell.  This can reduce latency when we are adding a single
+ * descriptor to an empty queue, but is otherwise pointless.  Further,
+ * Falcon and Siena have hardware bugs (SF bug 33851) that may be
+ * triggered if we don't check this.
+ * We use the write_count used for the last doorbell push, to get the
+ * NIC's view of the tx queue.
+ */
+static inline bool efx_nic_may_push_tx_desc(struct efx_tx_queue *tx_queue,
+					    unsigned int write_count)
+{
+	bool was_empty = efx_nic_tx_is_empty(tx_queue, write_count);
+
+	tx_queue->empty_read_count = 0;
+	return was_empty && tx_queue->write_count - write_count == 1;
+}
+
+/* Returns a pointer to the specified descriptor in the RX descriptor queue */
+static inline efx_qword_t *
+efx_rx_desc(struct efx_rx_queue *rx_queue, unsigned int index)
+{
+	return ((efx_qword_t *) (rx_queue->rxd.buf.addr)) + index;
+}
+
+/* Alignment of PCIe DMA boundaries (4KB) */
+#define EFX_PAGE_SIZE	4096
+/* Size and alignment of buffer table entries (same) */
+#define EFX_BUF_SIZE	EFX_PAGE_SIZE
+
+/* NIC-generic software stats */
+enum {
+	GENERIC_STAT_rx_noskb_drops,
+	GENERIC_STAT_rx_nodesc_trunc,
+	GENERIC_STAT_COUNT
+};
+
+#define EFX_GENERIC_SW_STAT(ext_name)				\
+	[GENERIC_STAT_ ## ext_name] = { #ext_name, 0, 0 }
+
+/* TX data path */
+static inline int efx_nic_probe_tx(struct efx_tx_queue *tx_queue)
+{
+	return tx_queue->efx->type->tx_probe(tx_queue);
+}
+static inline void efx_nic_init_tx(struct efx_tx_queue *tx_queue)
+{
+	tx_queue->efx->type->tx_init(tx_queue);
+}
+static inline void efx_nic_remove_tx(struct efx_tx_queue *tx_queue)
+{
+	if (tx_queue->efx->type->tx_remove)
+		tx_queue->efx->type->tx_remove(tx_queue);
+}
+static inline void efx_nic_push_buffers(struct efx_tx_queue *tx_queue)
+{
+	tx_queue->efx->type->tx_write(tx_queue);
+}
+
+/* RX data path */
+static inline int efx_nic_probe_rx(struct efx_rx_queue *rx_queue)
+{
+	return rx_queue->efx->type->rx_probe(rx_queue);
+}
+static inline void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
+{
+	rx_queue->efx->type->rx_init(rx_queue);
+}
+static inline void efx_nic_remove_rx(struct efx_rx_queue *rx_queue)
+{
+	rx_queue->efx->type->rx_remove(rx_queue);
+}
+static inline void efx_nic_notify_rx_desc(struct efx_rx_queue *rx_queue)
+{
+	rx_queue->efx->type->rx_write(rx_queue);
+}
+static inline void efx_nic_generate_fill_event(struct efx_rx_queue *rx_queue)
+{
+	rx_queue->efx->type->rx_defer_refill(rx_queue);
+}
+
+/* Event data path */
+static inline int efx_nic_probe_eventq(struct efx_channel *channel)
+{
+	return channel->efx->type->ev_probe(channel);
+}
+static inline int efx_nic_init_eventq(struct efx_channel *channel)
+{
+	return channel->efx->type->ev_init(channel);
+}
+static inline void efx_nic_fini_eventq(struct efx_channel *channel)
+{
+	channel->efx->type->ev_fini(channel);
+}
+static inline void efx_nic_remove_eventq(struct efx_channel *channel)
+{
+	channel->efx->type->ev_remove(channel);
+}
+static inline int
+efx_nic_process_eventq(struct efx_channel *channel, int quota)
+{
+	return channel->efx->type->ev_process(channel, quota);
+}
+static inline void efx_nic_eventq_read_ack(struct efx_channel *channel)
+{
+	channel->efx->type->ev_read_ack(channel);
+}
+
+void efx_nic_event_test_start(struct efx_channel *channel);
+
+bool efx_nic_event_present(struct efx_channel *channel);
+
+static inline void efx_sensor_event(struct efx_nic *efx, efx_qword_t *ev)
+{
+	if (efx->type->sensor_event)
+		efx->type->sensor_event(efx, ev);
+}
+
+static inline unsigned int efx_rx_recycle_ring_size(const struct efx_nic *efx)
+{
+	return efx->type->rx_recycle_ring_size(efx);
+}
+
+/* Some statistics are computed as A - B where A and B each increase
+ * linearly with some hardware counter(s) and the counters are read
+ * asynchronously.  If the counters contributing to B are always read
+ * after those contributing to A, the computed value may be lower than
+ * the true value by some variable amount, and may decrease between
+ * subsequent computations.
+ *
+ * We should never allow statistics to decrease or to exceed the true
+ * value.  Since the computed value will never be greater than the
+ * true value, we can achieve this by only storing the computed value
+ * when it increases.
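+ *
+ * For example, if the stored value is 100 and a racy computation yields
+ * 97, the stored value stays at 100; a later computation of 105 is
+ * stored, so the reported statistic is monotonic.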
+ */
+static inline void efx_update_diff_stat(u64 *stat, u64 diff)
+{
+	if ((s64)(diff - *stat) > 0)
+		*stat = diff;
+}
+
+/* Interrupts */
+int efx_nic_init_interrupt(struct efx_nic *efx);
+int efx_nic_irq_test_start(struct efx_nic *efx);
+void efx_nic_fini_interrupt(struct efx_nic *efx);
+
+static inline int efx_nic_event_test_irq_cpu(struct efx_channel *channel)
+{
+	return READ_ONCE(channel->event_test_cpu);
+}
+static inline int efx_nic_irq_test_irq_cpu(struct efx_nic *efx)
+{
+	return READ_ONCE(efx->last_irq_cpu);
+}
+
+/* Global Resources */
+int efx_nic_alloc_buffer(struct efx_nic *efx, struct efx_buffer *buffer,
+			 unsigned int len, gfp_t gfp_flags);
+void efx_nic_free_buffer(struct efx_nic *efx, struct efx_buffer *buffer);
+
+size_t efx_nic_get_regs_len(struct efx_nic *efx);
+void efx_nic_get_regs(struct efx_nic *efx, void *buf);
+
+#define EFX_MC_STATS_GENERATION_INVALID ((__force __le64)(-1))
+
+size_t efx_nic_describe_stats(const struct efx_hw_stat_desc *desc, size_t count,
+			      const unsigned long *mask, u8 *names);
+int efx_nic_copy_stats(struct efx_nic *efx, __le64 *dest);
+void efx_nic_update_stats(const struct efx_hw_stat_desc *desc, size_t count,
+			  const unsigned long *mask, u64 *stats,
+			  const void *dma_buf, bool accumulate);
+void efx_nic_fix_nodesc_drop_stat(struct efx_nic *efx, u64 *stat);
+static inline size_t efx_nic_update_stats_atomic(struct efx_nic *efx, u64 *full_stats,
+						 struct rtnl_link_stats64 *core_stats)
+{
+	if (efx->type->update_stats_atomic)
+		return efx->type->update_stats_atomic(efx, full_stats, core_stats);
+	return efx->type->update_stats(efx, full_stats, core_stats);
+}
+
+#define EFX_MAX_FLUSH_TIME 5000
+
+#endif /* EFX_NIC_COMMON_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/ptp.c
@@ -0,0 +1,2210 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2011-2013 Solarflare Communications Inc.
+ */
+
+/* Theory of operation:
+ *
+ * PTP support is assisted by firmware running on the MC, which provides
+ * the hardware timestamping capabilities.  Both transmitted and received
+ * PTP event packets are queued onto internal queues for subsequent processing;
+ * this is because the MC operations are relatively long and would block
+ * NAPI/interrupt operation.
+ *
+ * Receive event processing:
+ *	The event contains the packet's UUID and sequence number, together
+ *	with the hardware timestamp.  The PTP receive packet queue is searched
+ *	for this UUID/sequence number and, if found, put on a pending queue.
+ *	Packets not matching are delivered without timestamps (MCDI events will
+ *	always arrive after the actual packet).
+ *	It is important for the operation of the PTP protocol that the ordering
+ *	of packets between the event and general port is maintained.
+ *
+ * Work queue processing:
+ *	If work waiting, synchronise host/hardware time
+ *
+ *	Transmit: send packet through MC, which returns the transmission time
+ *	that is converted to an appropriate timestamp.
+ *
+ *	Receive: the packet's reception time is converted to an appropriate
+ *	timestamp.
+ */
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/pps_kernel.h>
+#include <linux/ptp_clock_kernel.h>
+#include "net_driver.h"
+#include "efx.h"
+#include "mcdi.h"
+#include "mcdi_pcol.h"
+#include "io.h"
+#include "farch_regs.h"
+#include "tx.h"
+#include "nic.h" /* indirectly includes ptp.h */
+
+/* Maximum number of events expected to make up a PTP event */
+#define	MAX_EVENT_FRAGS			3
+
+/* Maximum delay, ms, to begin synchronisation */
+#define	MAX_SYNCHRONISE_WAIT_MS		2
+
+/* How long, at most, to spend synchronising */
+#define	SYNCHRONISE_PERIOD_NS		250000
+
+/* How often to update the shared memory time */
+#define	SYNCHRONISATION_GRANULARITY_NS	200
+
+/* Minimum permitted length of a (corrected) synchronisation time */
+#define	DEFAULT_MIN_SYNCHRONISATION_NS	120
+
+/* Maximum permitted length of a (corrected) synchronisation time */
+#define	MAX_SYNCHRONISATION_NS		1000
+
+/* How many (MC) receive events that can be queued */
+#define	MAX_RECEIVE_EVENTS		8
+
+/* Length of (modified) moving average. */
+#define	AVERAGE_LENGTH			16
+
+/* How long an unmatched event or packet can be held */
+#define PKT_EVENT_LIFETIME_MS		10
+
+/* Offsets into PTP packet for identification.  These offsets are from the
+ * start of the IP header, not the MAC header.  Note that neither PTP V1 nor
+ * PTP V2 permits the use of IPv4 options.
+ */
+#define PTP_DPORT_OFFSET	22
+
+#define PTP_V1_VERSION_LENGTH	2
+#define PTP_V1_VERSION_OFFSET	28
+
+#define PTP_V1_UUID_LENGTH	6
+#define PTP_V1_UUID_OFFSET	50
+
+#define PTP_V1_SEQUENCE_LENGTH	2
+#define PTP_V1_SEQUENCE_OFFSET	58
+
+/* The minimum length of a PTP V1 packet for offsets, etc. to be valid:
+ * includes IP header.
+ */
+#define	PTP_V1_MIN_LENGTH	64
+
+#define PTP_V2_VERSION_LENGTH	1
+#define PTP_V2_VERSION_OFFSET	29
+
+#define PTP_V2_UUID_LENGTH	8
+#define PTP_V2_UUID_OFFSET	48
+
+/* Although PTP V2 UUIDs comprise a ClockIdentity (8) and a PortNumber (2),
+ * the MC only captures the last six bytes of the clock identity. These values
+ * reflect those, not the ones used in the standard.  The standard permits
+ * mapping of V1 UUIDs to V2 UUIDs with these same values.
+ */
+#define PTP_V2_MC_UUID_LENGTH	6
+#define PTP_V2_MC_UUID_OFFSET	50
+
+#define PTP_V2_SEQUENCE_LENGTH	2
+#define PTP_V2_SEQUENCE_OFFSET	58
+
+/* The minimum length of a PTP V2 packet for offsets, etc. to be valid:
+ * includes IP header.
+ */
+#define	PTP_V2_MIN_LENGTH	63
+
+#define	PTP_MIN_LENGTH		63
+
+#define PTP_ADDRESS		0xe0000181	/* 224.0.1.129 */
+#define PTP_EVENT_PORT		319
+#define PTP_GENERAL_PORT	320
+
+/* Annoyingly the format of the version numbers is different between
+ * versions 1 and 2 so it isn't possible to simply look for 1 or 2.
+ */
+#define	PTP_VERSION_V1		1
+
+#define	PTP_VERSION_V2		2
+#define	PTP_VERSION_V2_MASK	0x0f
+
+enum ptp_packet_state {
+	PTP_PACKET_STATE_UNMATCHED = 0,
+	PTP_PACKET_STATE_MATCHED,
+	PTP_PACKET_STATE_TIMED_OUT,
+	PTP_PACKET_STATE_MATCH_UNWANTED
+};
+
+/* The NIC is synchronised with a single word of time comprising
+ * partial seconds and full nanoseconds: 10^9 ~ 2^30, so 2 bits remain for seconds.
+ */
+#define	MC_NANOSECOND_BITS	30
+#define	MC_NANOSECOND_MASK	((1 << MC_NANOSECOND_BITS) - 1)
+#define	MC_SECOND_MASK		((1 << (32 - MC_NANOSECOND_BITS)) - 1)
+
+/* Maximum parts-per-billion adjustment that is acceptable */
+#define MAX_PPB			1000000
+
+/* Precalculate scale word to avoid long long division at runtime */
+/* This is equivalent to 2^66 / 10^9. */
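+/* (10^9 = 2^9 * 1953125, so 2^66 / 10^9 == 2^57 / 1953125 below.) */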
+#define PPB_SCALE_WORD  ((1LL << (57)) / 1953125LL)
+
+/* How much to shift down after scaling to convert to FP40 */
+#define PPB_SHIFT_FP40		26
+/* ... and FP44. */
+#define PPB_SHIFT_FP44		22
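+/* ppb * PPB_SCALE_WORD >> PPB_SHIFT_FP40 yields (ppb / 10^9) * 2^40, i.e.
+ * the fractional frequency in 2^-40 units; the FP44 shift yields 2^-44
+ * units.
+ */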
+
+#define PTP_SYNC_ATTEMPTS	4
+
+/**
+ * struct efx_ptp_match - Matching structure, stored in sk_buff's cb area.
+ * @words: UUID and (partial) sequence number
+ * @expiry: Time after which the packet should be delivered irrespective of
+ *            event arrival.
+ * @state: The state of the packet - whether it is ready for processing or
+ *         whether that is of no interest.
+ */
+struct efx_ptp_match {
+	u32 words[DIV_ROUND_UP(PTP_V1_UUID_LENGTH, 4)];
+	unsigned long expiry;
+	enum ptp_packet_state state;
+};
+
+/**
+ * struct efx_ptp_event_rx - A PTP receive event (from MC)
+ * @link: list of events
+ * @seq0: First part of (PTP) UUID
+ * @seq1: Second part of (PTP) UUID and sequence number
+ * @hwtimestamp: Event timestamp
+ * @expiry: Time at which the packet arrived
+ */
+struct efx_ptp_event_rx {
+	struct list_head link;
+	u32 seq0;
+	u32 seq1;
+	ktime_t hwtimestamp;
+	unsigned long expiry;
+};
+
+/**
+ * struct efx_ptp_timeset - Synchronisation between host and MC
+ * @host_start: Host time immediately before hardware timestamp taken
+ * @major: Hardware timestamp, major
+ * @minor: Hardware timestamp, minor
+ * @host_end: Host time immediately after hardware timestamp taken
+ * @wait: Number of NIC clock ticks between hardware timestamp being read and
+ *          host end time being seen
+ * @window: Difference of host_end and host_start
+ * @valid: Whether this timeset is valid
+ */
+struct efx_ptp_timeset {
+	u32 host_start;
+	u32 major;
+	u32 minor;
+	u32 host_end;
+	u32 wait;
+	u32 window;	/* Derived: end - start, allowing for wrap */
+};
+
+/**
+ * struct efx_ptp_data - Precision Time Protocol (PTP) state
+ * @efx: The NIC context
+ * @channel: The PTP channel (Siena only)
+ * @rx_ts_inline: Flag for whether RX timestamps are inline (else they are
+ *	separate events)
+ * @rxq: Receive SKB queue (awaiting timestamps)
+ * @txq: Transmit SKB queue
+ * @evt_list: List of MC receive events awaiting packets
+ * @evt_free_list: List of free events
+ * @evt_lock: Lock for manipulating evt_list and evt_free_list
+ * @rx_evts: Instantiated events (on evt_list and evt_free_list)
+ * @workwq: Work queue for processing pending PTP operations
+ * @work: Work task
+ * @reset_required: A serious error has occurred and the PTP task needs to be
+ *                  reset (disable, enable).
+ * @rxfilter_event: Receive filter when operating
+ * @rxfilter_general: Receive filter when operating
+ * @rxfilter_installed: Receive filter installed
+ * @config: Current timestamp configuration
+ * @enabled: PTP operation enabled
+ * @mode: Mode in which PTP operating (PTP version)
+ * @ns_to_nic_time: Function to convert from scalar nanoseconds to NIC time
+ * @nic_to_kernel_time: Function to convert from NIC to kernel time
+ * @nic_time: contains time details
+ * @nic_time.minor_max: Wrap point for NIC minor times
+ * @nic_time.sync_event_diff_min: Minimum acceptable difference between time
+ * in packet prefix and last MCDI time sync event i.e. how much earlier than
+ * the last sync event time a packet timestamp can be.
+ * @nic_time.sync_event_diff_max: Maximum acceptable difference between time
+ * in packet prefix and last MCDI time sync event i.e. how much later than
+ * the last sync event time a packet timestamp can be.
+ * @nic_time.sync_event_minor_shift: Shift required to make minor time from
+ * field in MCDI time sync event.
+ * @min_synchronisation_ns: Minimum acceptable corrected sync window
+ * @capabilities: Capabilities flags from the NIC
+ * @ts_corrections: contains corrections details
+ * @ts_corrections.ptp_tx: Required driver correction of PTP packet transmit
+ *                         timestamps
+ * @ts_corrections.ptp_rx: Required driver correction of PTP packet receive
+ *                         timestamps
+ * @ts_corrections.pps_out: PPS output error (information only)
+ * @ts_corrections.pps_in: Required driver correction of PPS input timestamps
+ * @ts_corrections.general_tx: Required driver correction of general packet
+ *                             transmit timestamps
+ * @ts_corrections.general_rx: Required driver correction of general packet
+ *                             receive timestamps
+ * @evt_frags: Partly assembled PTP events
+ * @evt_frag_idx: Current fragment number
+ * @evt_code: Last event code
+ * @start: Address at which MC indicates ready for synchronisation
+ * @host_time_pps: Host time at last PPS
+ * @adjfreq_ppb_shift: Shift required to convert scaled parts-per-billion
+ * frequency adjustment into a fixed point fractional nanosecond format.
+ * @current_adjfreq: Current ppb adjustment.
+ * @phc_clock: Pointer to registered phc device (if primary function)
+ * @phc_clock_info: Registration structure for phc device
+ * @pps_work: pps work task for handling pps events
+ * @pps_workwq: pps work queue
+ * @nic_ts_enabled: Flag indicating if NIC generated TS events are handled
+ * @txbuf: Buffer for use when transmitting (PTP) packets to MC (avoids
+ *         allocations in main data path).
+ * @good_syncs: Number of successful synchronisations.
+ * @fast_syncs: Number of synchronisations requiring short delay
+ * @bad_syncs: Number of failed synchronisations.
+ * @sync_timeouts: Number of synchronisation timeouts
+ * @no_time_syncs: Number of synchronisations with no good times.
+ * @invalid_sync_windows: Number of sync windows with bad durations.
+ * @undersize_sync_windows: Number of corrected sync windows that are too small
+ * @oversize_sync_windows: Number of corrected sync windows that are too large
+ * @rx_no_timestamp: Number of packets received without a timestamp.
+ * @timeset: Last set of synchronisation statistics.
+ * @xmit_skb: Transmit SKB function.
+ */
+struct efx_ptp_data {
+	struct efx_nic *efx;
+	struct efx_channel *channel;
+	bool rx_ts_inline;
+	struct sk_buff_head rxq;
+	struct sk_buff_head txq;
+	struct list_head evt_list;
+	struct list_head evt_free_list;
+	spinlock_t evt_lock;
+	struct efx_ptp_event_rx rx_evts[MAX_RECEIVE_EVENTS];
+	struct workqueue_struct *workwq;
+	struct work_struct work;
+	bool reset_required;
+	u32 rxfilter_event;
+	u32 rxfilter_general;
+	bool rxfilter_installed;
+	struct hwtstamp_config config;
+	bool enabled;
+	unsigned int mode;
+	void (*ns_to_nic_time)(s64 ns, u32 *nic_major, u32 *nic_minor);
+	ktime_t (*nic_to_kernel_time)(u32 nic_major, u32 nic_minor,
+				      s32 correction);
+	struct {
+		u32 minor_max;
+		u32 sync_event_diff_min;
+		u32 sync_event_diff_max;
+		unsigned int sync_event_minor_shift;
+	} nic_time;
+	unsigned int min_synchronisation_ns;
+	unsigned int capabilities;
+	struct {
+		s32 ptp_tx;
+		s32 ptp_rx;
+		s32 pps_out;
+		s32 pps_in;
+		s32 general_tx;
+		s32 general_rx;
+	} ts_corrections;
+	efx_qword_t evt_frags[MAX_EVENT_FRAGS];
+	int evt_frag_idx;
+	int evt_code;
+	struct efx_buffer start;
+	struct pps_event_time host_time_pps;
+	unsigned int adjfreq_ppb_shift;
+	s64 current_adjfreq;
+	struct ptp_clock *phc_clock;
+	struct ptp_clock_info phc_clock_info;
+	struct work_struct pps_work;
+	struct workqueue_struct *pps_workwq;
+	bool nic_ts_enabled;
+	efx_dword_t txbuf[MCDI_TX_BUF_LEN(MC_CMD_PTP_IN_TRANSMIT_LENMAX)];
+
+	unsigned int good_syncs;
+	unsigned int fast_syncs;
+	unsigned int bad_syncs;
+	unsigned int sync_timeouts;
+	unsigned int no_time_syncs;
+	unsigned int invalid_sync_windows;
+	unsigned int undersize_sync_windows;
+	unsigned int oversize_sync_windows;
+	unsigned int rx_no_timestamp;
+	struct efx_ptp_timeset
+	timeset[MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_MAXNUM];
+	void (*xmit_skb)(struct efx_nic *efx, struct sk_buff *skb);
+};
+
+static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta);
+static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta);
+static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts);
+static int efx_phc_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *e_ts);
+static int efx_phc_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *request, int on);
+
+bool efx_ptp_use_mac_tx_timestamps(struct efx_nic *efx)
+{
+	return efx_has_cap(efx, TX_MAC_TIMESTAMPING);
+}
+
+/* PTP 'extra' channel is still a traffic channel, but we only create TX queues
+ * if PTP uses MAC TX timestamps, not if PTP uses the MC directly to transmit.
+ */
+static bool efx_ptp_want_txqs(struct efx_channel *channel)
+{
+	return efx_ptp_use_mac_tx_timestamps(channel->efx);
+}
+
+#define PTP_SW_STAT(ext_name, field_name)				\
+	{ #ext_name, 0, offsetof(struct efx_ptp_data, field_name) }
+#define PTP_MC_STAT(ext_name, mcdi_name)				\
+	{ #ext_name, 32, MC_CMD_PTP_OUT_STATUS_STATS_ ## mcdi_name ## _OFST }
+static const struct efx_hw_stat_desc efx_ptp_stat_desc[] = {
+	PTP_SW_STAT(ptp_good_syncs, good_syncs),
+	PTP_SW_STAT(ptp_fast_syncs, fast_syncs),
+	PTP_SW_STAT(ptp_bad_syncs, bad_syncs),
+	PTP_SW_STAT(ptp_sync_timeouts, sync_timeouts),
+	PTP_SW_STAT(ptp_no_time_syncs, no_time_syncs),
+	PTP_SW_STAT(ptp_invalid_sync_windows, invalid_sync_windows),
+	PTP_SW_STAT(ptp_undersize_sync_windows, undersize_sync_windows),
+	PTP_SW_STAT(ptp_oversize_sync_windows, oversize_sync_windows),
+	PTP_SW_STAT(ptp_rx_no_timestamp, rx_no_timestamp),
+	PTP_MC_STAT(ptp_tx_timestamp_packets, TX),
+	PTP_MC_STAT(ptp_rx_timestamp_packets, RX),
+	PTP_MC_STAT(ptp_timestamp_packets, TS),
+	PTP_MC_STAT(ptp_filter_matches, FM),
+	PTP_MC_STAT(ptp_non_filter_matches, NFM),
+};
+#define PTP_STAT_COUNT ARRAY_SIZE(efx_ptp_stat_desc)
+static const unsigned long efx_ptp_stat_mask[] = {
+	[0 ... BITS_TO_LONGS(PTP_STAT_COUNT) - 1] = ~0UL,
+};
+
+size_t efx_ptp_describe_stats(struct efx_nic *efx, u8 *strings)
+{
+	if (!efx->ptp_data)
+		return 0;
+
+	return efx_nic_describe_stats(efx_ptp_stat_desc, PTP_STAT_COUNT,
+				      efx_ptp_stat_mask, strings);
+}
+
+size_t efx_ptp_update_stats(struct efx_nic *efx, u64 *stats)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_STATUS_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_PTP_OUT_STATUS_LEN);
+	size_t i;
+	int rc;
+
+	if (!efx->ptp_data)
+		return 0;
+
+	/* Copy software statistics */
+	for (i = 0; i < PTP_STAT_COUNT; i++) {
+		if (efx_ptp_stat_desc[i].dma_width)
+			continue;
+		stats[i] = *(unsigned int *)((char *)efx->ptp_data +
+					     efx_ptp_stat_desc[i].offset);
+	}
+
+	/* Fetch MC statistics.  We *must* fill in all statistics or
+	 * risk leaking kernel memory to userland, so if the MCDI
+	 * request fails we pretend we got zeroes.
+	 */
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_STATUS);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc)
+		memset(outbuf, 0, sizeof(outbuf));
+	efx_nic_update_stats(efx_ptp_stat_desc, PTP_STAT_COUNT,
+			     efx_ptp_stat_mask,
+			     stats, _MCDI_PTR(outbuf, 0), false);
+
+	return PTP_STAT_COUNT;
+}
+
+/* For Siena platforms NIC time is s and ns */
+static void efx_ptp_ns_to_s_ns(s64 ns, u32 *nic_major, u32 *nic_minor)
+{
+	struct timespec64 ts = ns_to_timespec64(ns);
+	*nic_major = (u32)ts.tv_sec;
+	*nic_minor = ts.tv_nsec;
+}
+
+static ktime_t efx_ptp_s_ns_to_ktime_correction(u32 nic_major, u32 nic_minor,
+						s32 correction)
+{
+	ktime_t kt = ktime_set(nic_major, nic_minor);
+	if (correction >= 0)
+		kt = ktime_add_ns(kt, (u64)correction);
+	else
+		kt = ktime_sub_ns(kt, (u64)-correction);
+	return kt;
+}
+
+/* To convert from s27 format to ns we multiply then divide by a power of 2.
+ * For the conversion from ns to s27, the operation is also converted to a
+ * multiply and shift.
+ */
+#define S27_TO_NS_SHIFT	(27)
+#define NS_TO_S27_MULT	(((1ULL << 63) + NSEC_PER_SEC / 2) / NSEC_PER_SEC)
+#define NS_TO_S27_SHIFT	(63 - S27_TO_NS_SHIFT)
+#define S27_MINOR_MAX	(1 << S27_TO_NS_SHIFT)
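+/* NS_TO_S27_MULT is round(2^63 / 10^9); multiplying by it and shifting
+ * right by NS_TO_S27_SHIFT (36) is a rounded multiply by 2^27 / 10^9,
+ * i.e. a conversion from nanoseconds to 2^-27 second units.
+ */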
+
+/* For Huntington platforms NIC time is in seconds and fractions of a second
+ * where the minor register only uses 27 bits in units of 2^-27s.
+ */
+static void efx_ptp_ns_to_s27(s64 ns, u32 *nic_major, u32 *nic_minor)
+{
+	struct timespec64 ts = ns_to_timespec64(ns);
+	u32 maj = (u32)ts.tv_sec;
+	u32 min = (u32)(((u64)ts.tv_nsec * NS_TO_S27_MULT +
+			 (1ULL << (NS_TO_S27_SHIFT - 1))) >> NS_TO_S27_SHIFT);
+
+	/* The conversion can result in the minor value exceeding the maximum.
+	 * In this case, round up to the next second.
+	 */
+	if (min >= S27_MINOR_MAX) {
+		min -= S27_MINOR_MAX;
+		maj++;
+	}
+
+	*nic_major = maj;
+	*nic_minor = min;
+}
+
+static inline ktime_t efx_ptp_s27_to_ktime(u32 nic_major, u32 nic_minor)
+{
+	u32 ns = (u32)(((u64)nic_minor * NSEC_PER_SEC +
+			(1ULL << (S27_TO_NS_SHIFT - 1))) >> S27_TO_NS_SHIFT);
+	return ktime_set(nic_major, ns);
+}
+
+static ktime_t efx_ptp_s27_to_ktime_correction(u32 nic_major, u32 nic_minor,
+					       s32 correction)
+{
+	/* Apply the correction and deal with carry */
+	nic_minor += correction;
+	if ((s32)nic_minor < 0) {
+		nic_minor += S27_MINOR_MAX;
+		nic_major--;
+	} else if (nic_minor >= S27_MINOR_MAX) {
+		nic_minor -= S27_MINOR_MAX;
+		nic_major++;
+	}
+
+	return efx_ptp_s27_to_ktime(nic_major, nic_minor);
+}
+
+/* For Medford2 platforms the time is in seconds and quarter nanoseconds. */
+static void efx_ptp_ns_to_s_qns(s64 ns, u32 *nic_major, u32 *nic_minor)
+{
+	struct timespec64 ts = ns_to_timespec64(ns);
+
+	*nic_major = (u32)ts.tv_sec;
+	*nic_minor = ts.tv_nsec * 4;
+}
+
+static ktime_t efx_ptp_s_qns_to_ktime_correction(u32 nic_major, u32 nic_minor,
+						 s32 correction)
+{
+	ktime_t kt;
+
+	nic_minor = DIV_ROUND_CLOSEST(nic_minor, 4);
+	correction = DIV_ROUND_CLOSEST(correction, 4);
+
+	kt = ktime_set(nic_major, nic_minor);
+
+	if (correction >= 0)
+		kt = ktime_add_ns(kt, (u64)correction);
+	else
+		kt = ktime_sub_ns(kt, (u64)-correction);
+	return kt;
+}
+
+struct efx_channel *efx_ptp_channel(struct efx_nic *efx)
+{
+	return efx->ptp_data ? efx->ptp_data->channel : NULL;
+}
+
+static u32 last_sync_timestamp_major(struct efx_nic *efx)
+{
+	struct efx_channel *channel = efx_ptp_channel(efx);
+	u32 major = 0;
+
+	if (channel)
+		major = channel->sync_timestamp_major;
+	return major;
+}
+
+/* The 8000 series and later can provide the time from the MAC, which is only
+ * 48 bits long and provides meta-information in the top 2 bits.
+ */
+static ktime_t
+efx_ptp_mac_nic_to_ktime_correction(struct efx_nic *efx,
+				    struct efx_ptp_data *ptp,
+				    u32 nic_major, u32 nic_minor,
+				    s32 correction)
+{
+	u32 sync_timestamp;
+	ktime_t kt = { 0 };
+	s16 delta;
+
+	if (!(nic_major & 0x80000000)) {
+		WARN_ON_ONCE(nic_major >> 16);
+
+		/* Medford provides 48 bits of timestamp, so we must get the top
+		 * 16 bits from the timesync event state.
+		 *
+		 * We only have the lower 16 bits of the time now, but we do
+		 * have a full resolution timestamp at some point in the past. As
+		 * long as the difference between the (real) now and the sync
+		 * is less than 2^15, then we can reconstruct the difference
+		 * between those two numbers using only the lower 16 bits of
+		 * each.
+		 *
+		 * Put another way
+		 *
+		 * a - b = ((a mod k) - b) mod k
+		 *
+		 * when -k/2 < (a-b) < k/2. In our case k is 2^16. We know
+		 * (a mod k) and b, so we can calculate the delta, a - b.
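+		 * For example, with b = 0x12345 and (a mod k) = 0x2350,
+		 * delta = 0x2350 - 0x2345 = 0xb, so a = b + delta = 0x12350.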
+		 *
+		 */
+		sync_timestamp = last_sync_timestamp_major(efx);
+
+		/* Because delta is s16 this does an implicit mask down to
+		 * 16 bits which is what we need, assuming
+		 * MEDFORD_TX_SECS_EVENT_BITS is 16. delta is signed so that
+		 * we can deal with the (unlikely) case of sync timestamps
+		 * arriving from the future.
+		 */
+		delta = nic_major - sync_timestamp;
+
+		/* Recover the fully specified time now, by applying the offset
+		 * to the (fully specified) sync time.
+		 */
+		nic_major = sync_timestamp + delta;
+
+		kt = ptp->nic_to_kernel_time(nic_major, nic_minor,
+					     correction);
+	}
+	return kt;
+}
+
+ktime_t efx_ptp_nic_to_kernel_time(struct efx_tx_queue *tx_queue)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	ktime_t kt;
+
+	if (efx_ptp_use_mac_tx_timestamps(efx))
+		kt = efx_ptp_mac_nic_to_ktime_correction(efx, ptp,
+				tx_queue->completed_timestamp_major,
+				tx_queue->completed_timestamp_minor,
+				ptp->ts_corrections.general_tx);
+	else
+		kt = ptp->nic_to_kernel_time(
+				tx_queue->completed_timestamp_major,
+				tx_queue->completed_timestamp_minor,
+				ptp->ts_corrections.general_tx);
+	return kt;
+}
+
+/* Get PTP attributes and set up time conversions */
+static int efx_ptp_get_attributes(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_GET_ATTRIBUTES_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_PTP_OUT_GET_ATTRIBUTES_LEN);
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int rc;
+	u32 fmt;
+	size_t out_len;
+
+	/* Get the PTP attributes. If the NIC doesn't support the operation we
+	 * use the default format for compatibility with older NICs i.e.
+	 * seconds and nanoseconds.
+	 */
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_GET_ATTRIBUTES);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+				outbuf, sizeof(outbuf), &out_len);
+	if (rc == 0) {
+		fmt = MCDI_DWORD(outbuf, PTP_OUT_GET_ATTRIBUTES_TIME_FORMAT);
+	} else if (rc == -EINVAL) {
+		fmt = MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_NANOSECONDS;
+	} else if (rc == -EPERM) {
+		pci_info(efx->pci_dev, "no PTP support\n");
+		return rc;
+	} else {
+		efx_mcdi_display_error(efx, MC_CMD_PTP, sizeof(inbuf),
+				       outbuf, sizeof(outbuf), rc);
+		return rc;
+	}
+
+	switch (fmt) {
+	case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_27FRACTION:
+		ptp->ns_to_nic_time = efx_ptp_ns_to_s27;
+		ptp->nic_to_kernel_time = efx_ptp_s27_to_ktime_correction;
+		ptp->nic_time.minor_max = 1 << 27;
+		ptp->nic_time.sync_event_minor_shift = 19;
+		break;
+	case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_NANOSECONDS:
+		ptp->ns_to_nic_time = efx_ptp_ns_to_s_ns;
+		ptp->nic_to_kernel_time = efx_ptp_s_ns_to_ktime_correction;
+		ptp->nic_time.minor_max = 1000000000;
+		ptp->nic_time.sync_event_minor_shift = 22;
+		break;
+	case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_QTR_NANOSECONDS:
+		ptp->ns_to_nic_time = efx_ptp_ns_to_s_qns;
+		ptp->nic_to_kernel_time = efx_ptp_s_qns_to_ktime_correction;
+		ptp->nic_time.minor_max = 4000000000UL;
+		ptp->nic_time.sync_event_minor_shift = 24;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	/* Precalculate acceptable difference between the minor time in the
+	 * packet prefix and the last MCDI time sync event. We expect the
+	 * packet prefix timestamp to be after the sync event by up to one
+	 * sync event interval (0.25s) but we allow it to exceed this by a
+	 * fuzz factor of 0.1s.
+	 */
+	ptp->nic_time.sync_event_diff_min = ptp->nic_time.minor_max
+		- (ptp->nic_time.minor_max / 10);
+	ptp->nic_time.sync_event_diff_max = (ptp->nic_time.minor_max / 4)
+		+ (ptp->nic_time.minor_max / 10);
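+	/* e.g. with minor_max = 10^9 (seconds and nanoseconds format) this
+	 * gives diff_min = 0.9s and diff_max = 0.35s, i.e. a window from
+	 * 0.1s before to 0.35s after the sync event, modulo one second.
+	 */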
+
+	/* MC_CMD_PTP_OP_GET_ATTRIBUTES has been extended twice from an older
+	 * operation MC_CMD_PTP_OP_GET_TIME_FORMAT. The function may now return
+	 * a value to use for the minimum acceptable corrected synchronization
+	 * window and may return further capabilities.
+	 * If we have the extra information, store it. For older firmware that
+	 * does not implement the extended command, use the default value.
+	 */
+	if (rc == 0 &&
+	    out_len >= MC_CMD_PTP_OUT_GET_ATTRIBUTES_CAPABILITIES_OFST)
+		ptp->min_synchronisation_ns =
+			MCDI_DWORD(outbuf,
+				   PTP_OUT_GET_ATTRIBUTES_SYNC_WINDOW_MIN);
+	else
+		ptp->min_synchronisation_ns = DEFAULT_MIN_SYNCHRONISATION_NS;
+
+	if (rc == 0 &&
+	    out_len >= MC_CMD_PTP_OUT_GET_ATTRIBUTES_LEN)
+		ptp->capabilities = MCDI_DWORD(outbuf,
+					PTP_OUT_GET_ATTRIBUTES_CAPABILITIES);
+	else
+		ptp->capabilities = 0;
+
+	/* Set up the shift for conversion between frequency
+	 * adjustments in parts-per-billion and the fixed-point
+	 * fractional ns format that the adapter uses.
+	 */
+	if (ptp->capabilities & (1 << MC_CMD_PTP_OUT_GET_ATTRIBUTES_FP44_FREQ_ADJ_LBN))
+		ptp->adjfreq_ppb_shift = PPB_SHIFT_FP44;
+	else
+		ptp->adjfreq_ppb_shift = PPB_SHIFT_FP40;
+
+	return 0;
+}
+
+/* Get PTP timestamp corrections */
+static int efx_ptp_get_timestamp_corrections(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_GET_TIMESTAMP_CORRECTIONS_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_PTP_OUT_GET_TIMESTAMP_CORRECTIONS_V2_LEN);
+	int rc;
+	size_t out_len;
+
+	/* Get the timestamp corrections from the NIC. If this operation is
+	 * not supported (older NICs) then no correction is required.
+	 */
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP,
+		       MC_CMD_PTP_OP_GET_TIMESTAMP_CORRECTIONS);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+				outbuf, sizeof(outbuf), &out_len);
+	if (rc == 0) {
+		efx->ptp_data->ts_corrections.ptp_tx = MCDI_DWORD(outbuf,
+			PTP_OUT_GET_TIMESTAMP_CORRECTIONS_TRANSMIT);
+		efx->ptp_data->ts_corrections.ptp_rx = MCDI_DWORD(outbuf,
+			PTP_OUT_GET_TIMESTAMP_CORRECTIONS_RECEIVE);
+		efx->ptp_data->ts_corrections.pps_out = MCDI_DWORD(outbuf,
+			PTP_OUT_GET_TIMESTAMP_CORRECTIONS_PPS_OUT);
+		efx->ptp_data->ts_corrections.pps_in = MCDI_DWORD(outbuf,
+			PTP_OUT_GET_TIMESTAMP_CORRECTIONS_PPS_IN);
+
+		if (out_len >= MC_CMD_PTP_OUT_GET_TIMESTAMP_CORRECTIONS_V2_LEN) {
+			efx->ptp_data->ts_corrections.general_tx = MCDI_DWORD(
+				outbuf,
+				PTP_OUT_GET_TIMESTAMP_CORRECTIONS_V2_GENERAL_TX);
+			efx->ptp_data->ts_corrections.general_rx = MCDI_DWORD(
+				outbuf,
+				PTP_OUT_GET_TIMESTAMP_CORRECTIONS_V2_GENERAL_RX);
+		} else {
+			efx->ptp_data->ts_corrections.general_tx =
+				efx->ptp_data->ts_corrections.ptp_tx;
+			efx->ptp_data->ts_corrections.general_rx =
+				efx->ptp_data->ts_corrections.ptp_rx;
+		}
+	} else if (rc == -EINVAL) {
+		efx->ptp_data->ts_corrections.ptp_tx = 0;
+		efx->ptp_data->ts_corrections.ptp_rx = 0;
+		efx->ptp_data->ts_corrections.pps_out = 0;
+		efx->ptp_data->ts_corrections.pps_in = 0;
+		efx->ptp_data->ts_corrections.general_tx = 0;
+		efx->ptp_data->ts_corrections.general_rx = 0;
+	} else {
+		efx_mcdi_display_error(efx, MC_CMD_PTP, sizeof(inbuf), outbuf,
+				       sizeof(outbuf), rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Enable MCDI PTP support. */
+static int efx_ptp_enable(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_ENABLE_LEN);
+	MCDI_DECLARE_BUF_ERR(outbuf);
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ENABLE);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_QUEUE,
+		       efx->ptp_data->channel ?
+		       efx->ptp_data->channel->channel : 0);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_MODE, efx->ptp_data->mode);
+
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+				outbuf, sizeof(outbuf), NULL);
+	rc = (rc == -EALREADY) ? 0 : rc;
+	if (rc)
+		efx_mcdi_display_error(efx, MC_CMD_PTP,
+				       MC_CMD_PTP_IN_ENABLE_LEN,
+				       outbuf, sizeof(outbuf), rc);
+	return rc;
+}
+
+/* Disable MCDI PTP support.
+ *
+ * Note that this function should never rely on the presence of ptp_data -
+ * it may be called before that exists.
+ */
+static int efx_ptp_disable(struct efx_nic *efx)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_DISABLE_LEN);
+	MCDI_DECLARE_BUF_ERR(outbuf);
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_DISABLE);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+	rc = efx_mcdi_rpc_quiet(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+				outbuf, sizeof(outbuf), NULL);
+	rc = (rc == -EALREADY) ? 0 : rc;
+	/* If we get ENOSYS, the NIC doesn't support PTP, and thus this function
+	 * should only have been called during probe.
+	 */
+	if (rc == -ENOSYS || rc == -EPERM)
+		pci_info(efx->pci_dev, "no PTP support\n");
+	else if (rc)
+		efx_mcdi_display_error(efx, MC_CMD_PTP,
+				       MC_CMD_PTP_IN_DISABLE_LEN,
+				       outbuf, sizeof(outbuf), rc);
+	return rc;
+}
+
+static void efx_ptp_deliver_rx_queue(struct sk_buff_head *q)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(q))) {
+		local_bh_disable();
+		netif_receive_skb(skb);
+		local_bh_enable();
+	}
+}
+
+static void efx_ptp_handle_no_channel(struct efx_nic *efx)
+{
+	netif_err(efx, drv, efx->net_dev,
+		  "ERROR: PTP requires MSI-X and 1 additional interrupt "
+		  "vector. PTP disabled\n");
+}
+
+/* Repeatedly send the host time to the MC which will capture the hardware
+ * time.
+ */
+static void efx_ptp_send_times(struct efx_nic *efx,
+			       struct pps_event_time *last_time)
+{
+	struct pps_event_time now;
+	struct timespec64 limit;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int *mc_running = ptp->start.addr;
+
+	pps_get_ts(&now);
+	limit = now.ts_real;
+	timespec64_add_ns(&limit, SYNCHRONISE_PERIOD_NS);
+
+	/* Write host time for specified period or until MC is done */
+	while ((timespec64_compare(&now.ts_real, &limit) < 0) &&
+	       READ_ONCE(*mc_running)) {
+		struct timespec64 update_time;
+		unsigned int host_time;
+
+		/* Don't update continuously to avoid saturating the PCIe bus */
+		update_time = now.ts_real;
+		timespec64_add_ns(&update_time, SYNCHRONISATION_GRANULARITY_NS);
+		do {
+			pps_get_ts(&now);
+		} while ((timespec64_compare(&now.ts_real, &update_time) < 0) &&
+			 READ_ONCE(*mc_running));
+
+		/* Synchronise NIC with single word of time only */
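+		/* The packed word carries tv_nsec in its low MC_NANOSECOND_BITS
+		 * bits and the low-order bits of tv_sec above them, so only the
+		 * fractional second plus a few seconds bits reach the MC.
+		 */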
+		host_time = (now.ts_real.tv_sec << MC_NANOSECOND_BITS |
+			     now.ts_real.tv_nsec);
+		/* Update host time in NIC memory */
+		efx->type->ptp_write_host_time(efx, host_time);
+	}
+	*last_time = now;
+}
+
+/* Read a timeset from the MC's results and partially process it. */
+static void efx_ptp_read_timeset(MCDI_DECLARE_STRUCT_PTR(data),
+				 struct efx_ptp_timeset *timeset)
+{
+	unsigned start_ns, end_ns;
+
+	timeset->host_start = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTSTART);
+	timeset->major = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_MAJOR);
+	timeset->minor = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_MINOR);
+	timeset->host_end = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTEND);
+	timeset->wait = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_WAITNS);
+
+	/* Ignore seconds */
+	start_ns = timeset->host_start & MC_NANOSECOND_MASK;
+	end_ns = timeset->host_end & MC_NANOSECOND_MASK;
+	/* Allow for rollover */
+	if (end_ns < start_ns)
+		end_ns += NSEC_PER_SEC;
+	/* Determine duration of operation */
+	timeset->window = end_ns - start_ns;
+}
+
+/* Process times received from MC.
+ *
+ * Extract times from returned results, and establish the minimum value
+ * seen.  The minimum value represents the "best" possible time and results
+ * much greater than this are rejected - the machine is perhaps too
+ * busy. A number of readings are taken so that, hopefully, at least one good
+ * synchronisation will be seen in the results.
+ */
+static int
+efx_ptp_process_times(struct efx_nic *efx, MCDI_DECLARE_STRUCT_PTR(synch_buf),
+		      size_t response_length,
+		      const struct pps_event_time *last_time)
+{
+	unsigned number_readings =
+		MCDI_VAR_ARRAY_LEN(response_length,
+				   PTP_OUT_SYNCHRONIZE_TIMESET);
+	unsigned i;
+	unsigned ngood = 0;
+	unsigned last_good = 0;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	u32 last_sec;
+	u32 start_sec;
+	struct timespec64 delta;
+	ktime_t mc_time;
+
+	if (number_readings == 0)
+		return -EAGAIN;
+
+	/* Read the set of results and find the last good host-MC
+	 * synchronization result. The MC takes its timestamp when it finishes
+	 * reading the host time, so the corrected window time should be fairly
+	 * constant for a given platform. Increment stats for any results that
+	 * appear to be erroneous.
+	 */
+	for (i = 0; i < number_readings; i++) {
+		s32 window, corrected;
+		struct timespec64 wait;
+
+		efx_ptp_read_timeset(
+			MCDI_ARRAY_STRUCT_PTR(synch_buf,
+					      PTP_OUT_SYNCHRONIZE_TIMESET, i),
+			&ptp->timeset[i]);
+
+		wait = ktime_to_timespec64(
+			ptp->nic_to_kernel_time(0, ptp->timeset[i].wait, 0));
+		window = ptp->timeset[i].window;
+		corrected = window - wait.tv_nsec;
+
+		/* We expect the uncorrected synchronization window to be at
+		 * least as large as the interval between host start and end
+		 * times. If it is smaller than this, it is most likely a
+		 * consequence of the host's time being adjusted.
+		 * Check that the corrected sync window is in a reasonable
+		 * range. If it is out of range it is likely to be because an
+		 * interrupt or other delay occurred between reading the system
+		 * time and writing it to MC memory.
+		 */
+		if (window < SYNCHRONISATION_GRANULARITY_NS) {
+			++ptp->invalid_sync_windows;
+		} else if (corrected >= MAX_SYNCHRONISATION_NS) {
+			++ptp->oversize_sync_windows;
+		} else if (corrected < ptp->min_synchronisation_ns) {
+			++ptp->undersize_sync_windows;
+		} else {
+			ngood++;
+			last_good = i;
+		}
+	}
+
+	if (ngood == 0) {
+		netif_warn(efx, drv, efx->net_dev,
+			   "PTP no suitable synchronisations\n");
+		return -EAGAIN;
+	}
+
+	/* Calculate delay from last good sync (host time) to last_time.
+	 * It is possible that the seconds rolled over between taking
+	 * the start reading and the last value written by the host.  The
+	 * timescales are such that a gap of more than one second is never
+	 * expected.  delta is *not* normalised.
+	 */
+	start_sec = ptp->timeset[last_good].host_start >> MC_NANOSECOND_BITS;
+	last_sec = last_time->ts_real.tv_sec & MC_SECOND_MASK;
+	if (start_sec != last_sec &&
+	    ((start_sec + 1) & MC_SECOND_MASK) != last_sec) {
+		netif_warn(efx, hw, efx->net_dev,
+			   "PTP bad synchronisation seconds\n");
+		return -EAGAIN;
+	}
+	delta.tv_sec = (last_sec - start_sec) & 1;
+	delta.tv_nsec =
+		last_time->ts_real.tv_nsec -
+		(ptp->timeset[last_good].host_start & MC_NANOSECOND_MASK);
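+	/* For example, if the second rolled over between host_start and
+	 * last_time (start_sec 3, last_sec 4), delta.tv_sec is 1 and
+	 * delta.tv_nsec is typically negative at this point.
+	 */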
+
+	/* Convert the NIC time at last good sync into kernel time.
+	 * No correction is required - this time is the output of a
+	 * firmware process.
+	 */
+	mc_time = ptp->nic_to_kernel_time(ptp->timeset[last_good].major,
+					  ptp->timeset[last_good].minor, 0);
+
+	/* Calculate delay from NIC top of second to last_time */
+	delta.tv_nsec += ktime_to_timespec64(mc_time).tv_nsec;
+
+	/* Set PPS timestamp to match NIC top of second */
+	ptp->host_time_pps = *last_time;
+	pps_sub_ts(&ptp->host_time_pps, delta);
+
+	return 0;
+}
+
+/* Synchronize times between the host and the MC */
+static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	MCDI_DECLARE_BUF(synch_buf, MC_CMD_PTP_OUT_SYNCHRONIZE_LENMAX);
+	size_t response_length;
+	int rc;
+	unsigned long timeout;
+	struct pps_event_time last_time = {};
+	unsigned int loops = 0;
+	int *start = ptp->start.addr;
+
+	MCDI_SET_DWORD(synch_buf, PTP_IN_OP, MC_CMD_PTP_OP_SYNCHRONIZE);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_PERIPH_ID, 0);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_NUMTIMESETS,
+		       num_readings);
+	MCDI_SET_QWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR,
+		       ptp->start.dma_addr);
+
+	/* Clear flag that signals MC ready */
+	WRITE_ONCE(*start, 0);
+	rc = efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf,
+				MC_CMD_PTP_IN_SYNCHRONIZE_LEN);
+	EFX_WARN_ON_ONCE_PARANOID(rc);
+
+	/* Wait for start from MCDI (or timeout) */
+	timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS);
+	while (!READ_ONCE(*start) && (time_before(jiffies, timeout))) {
+		udelay(20);	/* Usually start MCDI execution quickly */
+		loops++;
+	}
+
+	if (loops <= 1)
+		++ptp->fast_syncs;
+	if (!time_before(jiffies, timeout))
+		++ptp->sync_timeouts;
+
+	if (READ_ONCE(*start))
+		efx_ptp_send_times(efx, &last_time);
+
+	/* Collect results */
+	rc = efx_mcdi_rpc_finish(efx, MC_CMD_PTP,
+				 MC_CMD_PTP_IN_SYNCHRONIZE_LEN,
+				 synch_buf, sizeof(synch_buf),
+				 &response_length);
+	if (rc == 0) {
+		rc = efx_ptp_process_times(efx, synch_buf, response_length,
+					   &last_time);
+		if (rc == 0)
+			++ptp->good_syncs;
+		else
+			++ptp->no_time_syncs;
+	}
+
+	/* Increment the bad syncs counter if the synchronize fails, whatever
+	 * the reason.
+	 */
+	if (rc != 0)
+		++ptp->bad_syncs;
+
+	return rc;
+}
+
+/* Transmit a PTP packet via the dedicated hardware timestamped queue. */
+static void efx_ptp_xmit_skb_queue(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp_data = efx->ptp_data;
+	u8 type = efx_tx_csum_type_skb(skb);
+	struct efx_tx_queue *tx_queue;
+
+	tx_queue = efx_channel_get_tx_queue(ptp_data->channel, type);
+	if (tx_queue && tx_queue->timestamping) {
+		efx_enqueue_skb(tx_queue, skb);
+	} else {
+		WARN_ONCE(1, "PTP channel has no timestamped tx queue\n");
+		dev_kfree_skb_any(skb);
+	}
+}
+
+/* Transmit a PTP packet, via the MCDI interface, to the wire. */
+static void efx_ptp_xmit_skb_mc(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp_data = efx->ptp_data;
+	struct skb_shared_hwtstamps timestamps;
+	int rc = -EIO;
+	MCDI_DECLARE_BUF(txtime, MC_CMD_PTP_OUT_TRANSMIT_LEN);
+	size_t len;
+
+	MCDI_SET_DWORD(ptp_data->txbuf, PTP_IN_OP, MC_CMD_PTP_OP_TRANSMIT);
+	MCDI_SET_DWORD(ptp_data->txbuf, PTP_IN_PERIPH_ID, 0);
+	MCDI_SET_DWORD(ptp_data->txbuf, PTP_IN_TRANSMIT_LENGTH, skb->len);
+	if (skb_shinfo(skb)->nr_frags != 0) {
+		rc = skb_linearize(skb);
+		if (rc != 0)
+			goto fail;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		rc = skb_checksum_help(skb);
+		if (rc != 0)
+			goto fail;
+	}
+	skb_copy_from_linear_data(skb,
+				  MCDI_PTR(ptp_data->txbuf,
+					   PTP_IN_TRANSMIT_PACKET),
+				  skb->len);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP,
+			  ptp_data->txbuf, MC_CMD_PTP_IN_TRANSMIT_LEN(skb->len),
+			  txtime, sizeof(txtime), &len);
+	if (rc != 0)
+		goto fail;
+
+	memset(&timestamps, 0, sizeof(timestamps));
+	timestamps.hwtstamp = ptp_data->nic_to_kernel_time(
+		MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_MAJOR),
+		MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_MINOR),
+		ptp_data->ts_corrections.ptp_tx);
+
+	skb_tstamp_tx(skb, &timestamps);
+
+	rc = 0;
+
+fail:
+	dev_kfree_skb_any(skb);
+
+	return;
+}
+
+static void efx_ptp_drop_time_expired_events(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct list_head *cursor;
+	struct list_head *next;
+
+	if (ptp->rx_ts_inline)
+		return;
+
+	/* Drop time-expired events */
+	spin_lock_bh(&ptp->evt_lock);
+	list_for_each_safe(cursor, next, &ptp->evt_list) {
+		struct efx_ptp_event_rx *evt;
+
+		evt = list_entry(cursor, struct efx_ptp_event_rx,
+				 link);
+		if (time_after(jiffies, evt->expiry)) {
+			list_move(&evt->link, &ptp->evt_free_list);
+			netif_warn(efx, hw, efx->net_dev,
+				   "PTP rx event dropped\n");
+		}
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+}
+
+static enum ptp_packet_state efx_ptp_match_rx(struct efx_nic *efx,
+					      struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool evts_waiting;
+	struct list_head *cursor;
+	struct list_head *next;
+	struct efx_ptp_match *match;
+	enum ptp_packet_state rc = PTP_PACKET_STATE_UNMATCHED;
+
+	WARN_ON_ONCE(ptp->rx_ts_inline);
+
+	spin_lock_bh(&ptp->evt_lock);
+	evts_waiting = !list_empty(&ptp->evt_list);
+	spin_unlock_bh(&ptp->evt_lock);
+
+	if (!evts_waiting)
+		return PTP_PACKET_STATE_UNMATCHED;
+
+	match = (struct efx_ptp_match *)skb->cb;
+	/* Look for a matching timestamp in the event queue */
+	spin_lock_bh(&ptp->evt_lock);
+	list_for_each_safe(cursor, next, &ptp->evt_list) {
+		struct efx_ptp_event_rx *evt;
+
+		evt = list_entry(cursor, struct efx_ptp_event_rx, link);
+		if ((evt->seq0 == match->words[0]) &&
+		    (evt->seq1 == match->words[1])) {
+			struct skb_shared_hwtstamps *timestamps;
+
+			/* Match - add in hardware timestamp */
+			timestamps = skb_hwtstamps(skb);
+			timestamps->hwtstamp = evt->hwtimestamp;
+
+			match->state = PTP_PACKET_STATE_MATCHED;
+			rc = PTP_PACKET_STATE_MATCHED;
+			list_move(&evt->link, &ptp->evt_free_list);
+			break;
+		}
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+
+	return rc;
+}
+
+/* Process any queued receive events and corresponding packets
+ *
+ * q is returned with all the packets that are ready for delivery.
+ */
+static void efx_ptp_process_events(struct efx_nic *efx, struct sk_buff_head *q)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ptp->rxq))) {
+		struct efx_ptp_match *match;
+
+		match = (struct efx_ptp_match *)skb->cb;
+		if (match->state == PTP_PACKET_STATE_MATCH_UNWANTED) {
+			__skb_queue_tail(q, skb);
+		} else if (efx_ptp_match_rx(efx, skb) ==
+			   PTP_PACKET_STATE_MATCHED) {
+			__skb_queue_tail(q, skb);
+		} else if (time_after(jiffies, match->expiry)) {
+			match->state = PTP_PACKET_STATE_TIMED_OUT;
+			++ptp->rx_no_timestamp;
+			__skb_queue_tail(q, skb);
+		} else {
+			/* Replace unprocessed entry and stop */
+			skb_queue_head(&ptp->rxq, skb);
+			break;
+		}
+	}
+}
+
+/* Complete processing of a received packet */
+static inline void efx_ptp_process_rx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	local_bh_disable();
+	netif_receive_skb(skb);
+	local_bh_enable();
+}
+
+static void efx_ptp_remove_multicast_filters(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	if (ptp->rxfilter_installed) {
+		efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+					  ptp->rxfilter_general);
+		efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+					  ptp->rxfilter_event);
+		ptp->rxfilter_installed = false;
+	}
+}
+
+static int efx_ptp_insert_multicast_filters(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_filter_spec rxfilter;
+	int rc;
+
+	if (!ptp->channel || ptp->rxfilter_installed)
+		return 0;
+
+	/* Must filter on both event and general ports to ensure
+	 * that there is no packet re-ordering.
+	 */
+	efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0,
+			   efx_rx_queue_index(
+				   efx_channel_get_rx_queue(ptp->channel)));
+	rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP,
+				       htonl(PTP_ADDRESS),
+				       htons(PTP_EVENT_PORT));
+	if (rc != 0)
+		return rc;
+
+	rc = efx_filter_insert_filter(efx, &rxfilter, true);
+	if (rc < 0)
+		return rc;
+	ptp->rxfilter_event = rc;
+
+	efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0,
+			   efx_rx_queue_index(
+				   efx_channel_get_rx_queue(ptp->channel)));
+	rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP,
+				       htonl(PTP_ADDRESS),
+				       htons(PTP_GENERAL_PORT));
+	if (rc != 0)
+		goto fail;
+
+	rc = efx_filter_insert_filter(efx, &rxfilter, true);
+	if (rc < 0)
+		goto fail;
+	ptp->rxfilter_general = rc;
+
+	ptp->rxfilter_installed = true;
+	return 0;
+
+fail:
+	efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+				  ptp->rxfilter_event);
+	return rc;
+}
+
+static int efx_ptp_start(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int rc;
+
+	ptp->reset_required = false;
+
+	rc = efx_ptp_insert_multicast_filters(efx);
+	if (rc)
+		return rc;
+
+	rc = efx_ptp_enable(efx);
+	if (rc != 0)
+		goto fail;
+
+	ptp->evt_frag_idx = 0;
+	ptp->current_adjfreq = 0;
+
+	return 0;
+
+fail:
+	efx_ptp_remove_multicast_filters(efx);
+	return rc;
+}
+
+static int efx_ptp_stop(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct list_head *cursor;
+	struct list_head *next;
+	int rc;
+
+	if (ptp == NULL)
+		return 0;
+
+	rc = efx_ptp_disable(efx);
+
+	efx_ptp_remove_multicast_filters(efx);
+
+	/* Make sure RX packets are really delivered */
+	efx_ptp_deliver_rx_queue(&efx->ptp_data->rxq);
+	skb_queue_purge(&efx->ptp_data->txq);
+
+	/* Drop any pending receive events */
+	spin_lock_bh(&efx->ptp_data->evt_lock);
+	list_for_each_safe(cursor, next, &efx->ptp_data->evt_list) {
+		list_move(cursor, &efx->ptp_data->evt_free_list);
+	}
+	spin_unlock_bh(&efx->ptp_data->evt_lock);
+
+	return rc;
+}
+
+static int efx_ptp_restart(struct efx_nic *efx)
+{
+	if (efx->ptp_data && efx->ptp_data->enabled)
+		return efx_ptp_start(efx);
+	return 0;
+}
+
+static void efx_ptp_pps_worker(struct work_struct *work)
+{
+	struct efx_ptp_data *ptp =
+		container_of(work, struct efx_ptp_data, pps_work);
+	struct efx_nic *efx = ptp->efx;
+	struct ptp_clock_event ptp_evt;
+
+	if (efx_ptp_synchronize(efx, PTP_SYNC_ATTEMPTS))
+		return;
+
+	ptp_evt.type = PTP_CLOCK_PPSUSR;
+	ptp_evt.pps_times = ptp->host_time_pps;
+	ptp_clock_event(ptp->phc_clock, &ptp_evt);
+}
+
+static void efx_ptp_worker(struct work_struct *work)
+{
+	struct efx_ptp_data *ptp_data =
+		container_of(work, struct efx_ptp_data, work);
+	struct efx_nic *efx = ptp_data->efx;
+	struct sk_buff *skb;
+	struct sk_buff_head tempq;
+
+	if (ptp_data->reset_required) {
+		efx_ptp_stop(efx);
+		efx_ptp_start(efx);
+		return;
+	}
+
+	efx_ptp_drop_time_expired_events(efx);
+
+	__skb_queue_head_init(&tempq);
+	efx_ptp_process_events(efx, &tempq);
+
+	while ((skb = skb_dequeue(&ptp_data->txq)))
+		ptp_data->xmit_skb(efx, skb);
+
+	while ((skb = __skb_dequeue(&tempq)))
+		efx_ptp_process_rx(efx, skb);
+}
+
+static const struct ptp_clock_info efx_phc_clock_info = {
+	.owner		= THIS_MODULE,
+	.name		= "sfc",
+	.max_adj	= MAX_PPB,
+	.n_alarm	= 0,
+	.n_ext_ts	= 0,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 1,
+	.adjfreq	= efx_phc_adjfreq,
+	.adjtime	= efx_phc_adjtime,
+	.gettime64	= efx_phc_gettime,
+	.settime64	= efx_phc_settime,
+	.enable		= efx_phc_enable,
+};
+
+/* Initialise PTP state. */
+int efx_ptp_probe(struct efx_nic *efx, struct efx_channel *channel)
+{
+	struct efx_ptp_data *ptp;
+	int rc = 0;
+	unsigned int pos;
+
+	ptp = kzalloc(sizeof(struct efx_ptp_data), GFP_KERNEL);
+	efx->ptp_data = ptp;
+	if (!efx->ptp_data)
+		return -ENOMEM;
+
+	ptp->efx = efx;
+	ptp->channel = channel;
+	ptp->rx_ts_inline = efx_nic_rev(efx) >= EFX_REV_HUNT_A0;
+
+	rc = efx_nic_alloc_buffer(efx, &ptp->start, sizeof(int), GFP_KERNEL);
+	if (rc != 0)
+		goto fail1;
+
+	skb_queue_head_init(&ptp->rxq);
+	skb_queue_head_init(&ptp->txq);
+	ptp->workwq = create_singlethread_workqueue("sfc_ptp");
+	if (!ptp->workwq) {
+		rc = -ENOMEM;
+		goto fail2;
+	}
+
+	if (efx_ptp_use_mac_tx_timestamps(efx)) {
+		ptp->xmit_skb = efx_ptp_xmit_skb_queue;
+		/* Request sync events on this channel. */
+		channel->sync_events_state = SYNC_EVENTS_QUIESCENT;
+	} else {
+		ptp->xmit_skb = efx_ptp_xmit_skb_mc;
+	}
+
+	INIT_WORK(&ptp->work, efx_ptp_worker);
+	ptp->config.flags = 0;
+	ptp->config.tx_type = HWTSTAMP_TX_OFF;
+	ptp->config.rx_filter = HWTSTAMP_FILTER_NONE;
+	INIT_LIST_HEAD(&ptp->evt_list);
+	INIT_LIST_HEAD(&ptp->evt_free_list);
+	spin_lock_init(&ptp->evt_lock);
+	for (pos = 0; pos < MAX_RECEIVE_EVENTS; pos++)
+		list_add(&ptp->rx_evts[pos].link, &ptp->evt_free_list);
+
+	/* Get the NIC PTP attributes and set up time conversions */
+	rc = efx_ptp_get_attributes(efx);
+	if (rc < 0)
+		goto fail3;
+
+	/* Get the timestamp corrections */
+	rc = efx_ptp_get_timestamp_corrections(efx);
+	if (rc < 0)
+		goto fail3;
+
+	if (efx->mcdi->fn_flags &
+	    (1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_PRIMARY)) {
+		ptp->phc_clock_info = efx_phc_clock_info;
+		ptp->phc_clock = ptp_clock_register(&ptp->phc_clock_info,
+						    &efx->pci_dev->dev);
+		if (IS_ERR(ptp->phc_clock)) {
+			rc = PTR_ERR(ptp->phc_clock);
+			goto fail3;
+		} else if (ptp->phc_clock) {
+			INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker);
+			ptp->pps_workwq = create_singlethread_workqueue("sfc_pps");
+			if (!ptp->pps_workwq) {
+				rc = -ENOMEM;
+				goto fail4;
+			}
+		}
+	}
+	ptp->nic_ts_enabled = false;
+
+	return 0;
+fail4:
+	ptp_clock_unregister(efx->ptp_data->phc_clock);
+
+fail3:
+	destroy_workqueue(efx->ptp_data->workwq);
+
+fail2:
+	efx_nic_free_buffer(efx, &ptp->start);
+
+fail1:
+	kfree(efx->ptp_data);
+	efx->ptp_data = NULL;
+
+	return rc;
+}
+
+/* Initialise PTP channel.
+ *
+ * Setting core_index to zero causes the queue to be initialised and doesn't
+ * overlap with 'rxq0' because ptp.c doesn't use skb_record_rx_queue.
+ */
+static int efx_ptp_probe_channel(struct efx_channel *channel)
+{
+	struct efx_nic *efx = channel->efx;
+	int rc;
+
+	channel->irq_moderation_us = 0;
+	channel->rx_queue.core_index = 0;
+
+	rc = efx_ptp_probe(efx, channel);
+	/* Failure to probe PTP is not fatal; this channel will just not be
+	 * used for anything.
+	 * In the case of EPERM, efx_ptp_probe will print its own message (in
+	 * efx_ptp_get_attributes()), so we don't need to.
+	 */
+	if (rc && rc != -EPERM)
+		netif_warn(efx, drv, efx->net_dev,
+			   "Failed to probe PTP, rc=%d\n", rc);
+	return 0;
+}
+
+void efx_ptp_remove(struct efx_nic *efx)
+{
+	if (!efx->ptp_data)
+		return;
+
+	(void)efx_ptp_disable(efx);
+
+	cancel_work_sync(&efx->ptp_data->work);
+	if (efx->ptp_data->pps_workwq)
+		cancel_work_sync(&efx->ptp_data->pps_work);
+
+	skb_queue_purge(&efx->ptp_data->rxq);
+	skb_queue_purge(&efx->ptp_data->txq);
+
+	if (efx->ptp_data->phc_clock) {
+		destroy_workqueue(efx->ptp_data->pps_workwq);
+		ptp_clock_unregister(efx->ptp_data->phc_clock);
+	}
+
+	destroy_workqueue(efx->ptp_data->workwq);
+
+	efx_nic_free_buffer(efx, &efx->ptp_data->start);
+	kfree(efx->ptp_data);
+	efx->ptp_data = NULL;
+}
+
+static void efx_ptp_remove_channel(struct efx_channel *channel)
+{
+	efx_ptp_remove(channel->efx);
+}
+
+static void efx_ptp_get_channel_name(struct efx_channel *channel,
+				     char *buf, size_t len)
+{
+	snprintf(buf, len, "%s-ptp", channel->efx->name);
+}
+
+/* Determine whether this packet should be processed by the PTP module
+ * or transmitted conventionally.
+ */
+bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return efx->ptp_data &&
+		efx->ptp_data->enabled &&
+		skb->len >= PTP_MIN_LENGTH &&
+		skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM &&
+		likely(skb->protocol == htons(ETH_P_IP)) &&
+		skb_transport_header_was_set(skb) &&
+		skb_network_header_len(skb) >= sizeof(struct iphdr) &&
+		ip_hdr(skb)->protocol == IPPROTO_UDP &&
+		skb_headlen(skb) >=
+		skb_transport_offset(skb) + sizeof(struct udphdr) &&
+		udp_hdr(skb)->dest == htons(PTP_EVENT_PORT);
+}
+
+/* Receive a PTP packet.  Packets are queued until the arrival of
+ * the receive timestamp from the MC - this will probably occur after the
+ * packet arrival because of the processing in the MC.
+ */
+static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
+	u8 *match_data_012, *match_data_345;
+	unsigned int version;
+	u8 *data;
+
+	match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
+
+	/* Correct version? */
+	if (ptp->mode == MC_CMD_PTP_MODE_V1) {
+		if (!pskb_may_pull(skb, PTP_V1_MIN_LENGTH)) {
+			return false;
+		}
+		data = skb->data;
+		version = ntohs(*(__be16 *)&data[PTP_V1_VERSION_OFFSET]);
+		if (version != PTP_VERSION_V1) {
+			return false;
+		}
+
+		/* PTP V1 uses all six bytes of the UUID to match the packet
+		 * to the timestamp
+		 */
+		match_data_012 = data + PTP_V1_UUID_OFFSET;
+		match_data_345 = data + PTP_V1_UUID_OFFSET + 3;
+	} else {
+		if (!pskb_may_pull(skb, PTP_V2_MIN_LENGTH)) {
+			return false;
+		}
+		data = skb->data;
+		version = data[PTP_V2_VERSION_OFFSET];
+		if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
+			return false;
+		}
+
+		/* The original V2 implementation uses bytes 2-7 of
+		 * the UUID to match the packet to the timestamp. This
+		 * discards two of the bytes of the MAC address used
+		 * to create the UUID (SF bug 33070).  The PTP V2
+		 * enhanced mode fixes this issue and uses bytes 0-2
+		 * and byte 5-7 of the UUID.
+		 */
+		match_data_345 = data + PTP_V2_UUID_OFFSET + 5;
+		if (ptp->mode == MC_CMD_PTP_MODE_V2) {
+			match_data_012 = data + PTP_V2_UUID_OFFSET + 2;
+		} else {
+			match_data_012 = data + PTP_V2_UUID_OFFSET + 0;
+			BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2_ENHANCED);
+		}
+	}
+
+	/* Does this packet require timestamping? */
+	if (ntohs(*(__be16 *)&data[PTP_DPORT_OFFSET]) == PTP_EVENT_PORT) {
+		match->state = PTP_PACKET_STATE_UNMATCHED;
+
+		/* We expect the sequence number to be in the same position in
+		 * the packet for PTP V1 and V2
+		 */
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
+
+		/* Extract UUID/Sequence information */
+		match->words[0] = (match_data_012[0]         |
+				   (match_data_012[1] << 8)  |
+				   (match_data_012[2] << 16) |
+				   (match_data_345[0] << 24));
+		match->words[1] = (match_data_345[1]         |
+				   (match_data_345[2] << 8)  |
+				   (data[PTP_V1_SEQUENCE_OFFSET +
+					 PTP_V1_SEQUENCE_LENGTH - 1] <<
+				    16));
+	} else {
+		match->state = PTP_PACKET_STATE_MATCH_UNWANTED;
+	}
+
+	skb_queue_tail(&ptp->rxq, skb);
+	queue_work(ptp->workwq, &ptp->work);
+
+	return true;
+}
+
+/* Transmit a PTP packet.  This has to be transmitted by the MC
+ * itself, through an MCDI call.  MCDI calls aren't permitted
+ * in the transmit path so defer the actual transmission to a suitable worker.
+ */
+int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	skb_queue_tail(&ptp->txq, skb);
+
+	if ((udp_hdr(skb)->dest == htons(PTP_EVENT_PORT)) &&
+	    (skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM))
+		efx_xmit_hwtstamp_pending(skb);
+	queue_work(ptp->workwq, &ptp->work);
+
+	return NETDEV_TX_OK;
+}
+
+int efx_ptp_get_mode(struct efx_nic *efx)
+{
+	return efx->ptp_data->mode;
+}
+
+int efx_ptp_change_mode(struct efx_nic *efx, bool enable_wanted,
+			unsigned int new_mode)
+{
+	if ((enable_wanted != efx->ptp_data->enabled) ||
+	    (enable_wanted && (efx->ptp_data->mode != new_mode))) {
+		int rc = 0;
+
+		if (enable_wanted) {
+			/* Change of mode requires disable */
+			if (efx->ptp_data->enabled &&
+			    (efx->ptp_data->mode != new_mode)) {
+				efx->ptp_data->enabled = false;
+				rc = efx_ptp_stop(efx);
+				if (rc != 0)
+					return rc;
+			}
+
+			/* Set new operating mode and establish
+			 * baseline synchronisation, which must
+			 * succeed.
+			 */
+			efx->ptp_data->mode = new_mode;
+			if (netif_running(efx->net_dev))
+				rc = efx_ptp_start(efx);
+			if (rc == 0) {
+				rc = efx_ptp_synchronize(efx,
+							 PTP_SYNC_ATTEMPTS * 2);
+				if (rc != 0)
+					efx_ptp_stop(efx);
+			}
+		} else {
+			rc = efx_ptp_stop(efx);
+		}
+
+		if (rc != 0)
+			return rc;
+
+		efx->ptp_data->enabled = enable_wanted;
+	}
+
+	return 0;
+}
+
+static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
+{
+	int rc;
+
+	if ((init->tx_type != HWTSTAMP_TX_OFF) &&
+	    (init->tx_type != HWTSTAMP_TX_ON))
+		return -ERANGE;
+
+	rc = efx->type->ptp_set_ts_config(efx, init);
+	if (rc)
+		return rc;
+
+	efx->ptp_data->config = *init;
+	return 0;
+}
+
+void efx_ptp_get_ts_info(struct efx_nic *efx, struct ethtool_ts_info *ts_info)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_nic *primary = efx->primary;
+
+	ASSERT_RTNL();
+
+	if (!ptp)
+		return;
+
+	ts_info->so_timestamping |= (SOF_TIMESTAMPING_TX_HARDWARE |
+				     SOF_TIMESTAMPING_RX_HARDWARE |
+				     SOF_TIMESTAMPING_RAW_HARDWARE);
+	/* Check licensed features.  If we don't have the license for TX
+	 * timestamps, the NIC will not support them.
+	 */
+	if (efx_ptp_use_mac_tx_timestamps(efx)) {
+		struct efx_ef10_nic_data *nic_data = efx->nic_data;
+
+		if (!(nic_data->licensed_features &
+		      (1 << LICENSED_V3_FEATURES_TX_TIMESTAMPS_LBN)))
+			ts_info->so_timestamping &=
+				~SOF_TIMESTAMPING_TX_HARDWARE;
+	}
+	if (primary && primary->ptp_data && primary->ptp_data->phc_clock)
+		ts_info->phc_index =
+			ptp_clock_index(primary->ptp_data->phc_clock);
+	ts_info->tx_types = 1 << HWTSTAMP_TX_OFF | 1 << HWTSTAMP_TX_ON;
+	ts_info->rx_filters = ptp->efx->type->hwtstamp_filters;
+}
+
+int efx_ptp_set_ts_config(struct efx_nic *efx, struct ifreq *ifr)
+{
+	struct hwtstamp_config config;
+	int rc;
+
+	/* Not a PTP enabled port */
+	if (!efx->ptp_data)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	rc = efx_ptp_ts_init(efx, &config);
+	if (rc != 0)
+		return rc;
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config))
+		? -EFAULT : 0;
+}
+
+int efx_ptp_get_ts_config(struct efx_nic *efx, struct ifreq *ifr)
+{
+	if (!efx->ptp_data)
+		return -EOPNOTSUPP;
+
+	return copy_to_user(ifr->ifr_data, &efx->ptp_data->config,
+			    sizeof(efx->ptp_data->config)) ? -EFAULT : 0;
+}
+
+static void ptp_event_failure(struct efx_nic *efx, int expected_frag_len)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	netif_err(efx, hw, efx->net_dev,
+		"PTP unexpected event length: got %d expected %d\n",
+		ptp->evt_frag_idx, expected_frag_len);
+	ptp->reset_required = true;
+	queue_work(ptp->workwq, &ptp->work);
+}
+
+/* Process a completed receive event.  Put it on the event queue and
+ * start the worker thread.  This is required because events and their
+ * corresponding packets may arrive in either order.
+ */
+static void ptp_event_rx(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	struct efx_ptp_event_rx *evt = NULL;
+
+	if (WARN_ON_ONCE(ptp->rx_ts_inline))
+		return;
+
+	if (ptp->evt_frag_idx != 3) {
+		ptp_event_failure(efx, 3);
+		return;
+	}
+
+	spin_lock_bh(&ptp->evt_lock);
+	if (!list_empty(&ptp->evt_free_list)) {
+		evt = list_first_entry(&ptp->evt_free_list,
+				       struct efx_ptp_event_rx, link);
+		list_del(&evt->link);
+
+		evt->seq0 = EFX_QWORD_FIELD(ptp->evt_frags[2], MCDI_EVENT_DATA);
+		evt->seq1 = (EFX_QWORD_FIELD(ptp->evt_frags[2],
+					     MCDI_EVENT_SRC)        |
+			     (EFX_QWORD_FIELD(ptp->evt_frags[1],
+					      MCDI_EVENT_SRC) << 8) |
+			     (EFX_QWORD_FIELD(ptp->evt_frags[0],
+					      MCDI_EVENT_SRC) << 16));
+		evt->hwtimestamp = efx->ptp_data->nic_to_kernel_time(
+			EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA),
+			EFX_QWORD_FIELD(ptp->evt_frags[1], MCDI_EVENT_DATA),
+			ptp->ts_corrections.ptp_rx);
+		evt->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
+		list_add_tail(&evt->link, &ptp->evt_list);
+
+		queue_work(ptp->workwq, &ptp->work);
+	} else if (net_ratelimit()) {
+		/* Log a rate-limited warning message. */
+		netif_err(efx, rx_err, efx->net_dev, "PTP event queue overflow\n");
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+}
+
+static void ptp_event_fault(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	int code = EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA);
+
+	if (ptp->evt_frag_idx != 1) {
+		ptp_event_failure(efx, 1);
+		return;
+	}
+
+	netif_err(efx, hw, efx->net_dev, "PTP error %d\n", code);
+}
+
+static void ptp_event_pps(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	if (ptp->nic_ts_enabled)
+		queue_work(ptp->pps_workwq, &ptp->pps_work);
+}
+
+void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int code = EFX_QWORD_FIELD(*ev, MCDI_EVENT_CODE);
+
+	if (!ptp) {
+		if (!efx->ptp_warned) {
+			netif_warn(efx, drv, efx->net_dev,
+				   "Received PTP event but PTP not set up\n");
+			efx->ptp_warned = true;
+		}
+		return;
+	}
+
+	if (!ptp->enabled)
+		return;
+
+	if (ptp->evt_frag_idx == 0) {
+		ptp->evt_code = code;
+	} else if (ptp->evt_code != code) {
+		netif_err(efx, hw, efx->net_dev,
+			  "PTP out of sequence event %d\n", code);
+		ptp->evt_frag_idx = 0;
+	}
+
+	ptp->evt_frags[ptp->evt_frag_idx++] = *ev;
+	if (!MCDI_EVENT_FIELD(*ev, CONT)) {
+		/* Process resulting event */
+		switch (code) {
+		case MCDI_EVENT_CODE_PTP_RX:
+			ptp_event_rx(efx, ptp);
+			break;
+		case MCDI_EVENT_CODE_PTP_FAULT:
+			ptp_event_fault(efx, ptp);
+			break;
+		case MCDI_EVENT_CODE_PTP_PPS:
+			ptp_event_pps(efx, ptp);
+			break;
+		default:
+			netif_err(efx, hw, efx->net_dev,
+				  "PTP unknown event %d\n", code);
+			break;
+		}
+		ptp->evt_frag_idx = 0;
+	} else if (MAX_EVENT_FRAGS == ptp->evt_frag_idx) {
+		netif_err(efx, hw, efx->net_dev,
+			  "PTP too many event fragments\n");
+		ptp->evt_frag_idx = 0;
+	}
+}
+
+void efx_time_sync_event(struct efx_channel *channel, efx_qword_t *ev)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	/* When extracting the sync timestamp minor value, we should discard
+	 * the least significant two bits. These are not required in order
+	 * to reconstruct full-range timestamps and they are optionally used
+	 * to report status depending on the options supplied when subscribing
+	 * for sync events.
+	 */
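+	/* As an illustration, with the 27-bit fractional format the shift is
+	 * 19, so the six bits surviving the 0xFC mask below become bits 21-26
+	 * of the reconstructed minor value (a coarse fraction of a second);
+	 * the packet prefix supplies the precise minor time.
+	 */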
+	channel->sync_timestamp_major = MCDI_EVENT_FIELD(*ev, PTP_TIME_MAJOR);
+	channel->sync_timestamp_minor =
+		(MCDI_EVENT_FIELD(*ev, PTP_TIME_MINOR_MS_8BITS) & 0xFC)
+			<< ptp->nic_time.sync_event_minor_shift;
+
+	/* if sync events have been disabled then we want to silently ignore
+	 * this event, so throw away the result.
+	 */
+	(void) cmpxchg(&channel->sync_events_state, SYNC_EVENTS_REQUESTED,
+		       SYNC_EVENTS_VALID);
+}
+
+static inline u32 efx_rx_buf_timestamp_minor(struct efx_nic *efx, const u8 *eh)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	return __le32_to_cpup((const __le32 *)(eh + efx->rx_packet_ts_offset));
+#else
+	const u8 *data = eh + efx->rx_packet_ts_offset;
+	return (u32)data[0]       |
+	       (u32)data[1] << 8  |
+	       (u32)data[2] << 16 |
+	       (u32)data[3] << 24;
+#endif
+}
+
+void __efx_rx_skb_attach_timestamp(struct efx_channel *channel,
+				   struct sk_buff *skb)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	u32 pkt_timestamp_major, pkt_timestamp_minor;
+	u32 diff, carry;
+	struct skb_shared_hwtstamps *timestamps;
+
+	if (channel->sync_events_state != SYNC_EVENTS_VALID)
+		return;
+
+	pkt_timestamp_minor = efx_rx_buf_timestamp_minor(efx, skb_mac_header(skb));
+
+	/* get the difference between the packet and sync timestamps,
+	 * modulo one second
+	 */
+	diff = pkt_timestamp_minor - channel->sync_timestamp_minor;
+	if (pkt_timestamp_minor < channel->sync_timestamp_minor)
+		diff += ptp->nic_time.minor_max;
+
+	/* do we roll over a second boundary and need to carry the one? */
+	carry = (channel->sync_timestamp_minor >= ptp->nic_time.minor_max - diff) ?
+		1 : 0;
+
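+	/* Worked example (illustrative numbers, nanoseconds format): a sync
+	 * minor of 0.9e9 and a packet minor of 0.1e9 give diff = 0.2e9 and
+	 * carry = 1, so the first branch below places the packet in the
+	 * second after the sync event.
+	 */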
+	if (diff <= ptp->nic_time.sync_event_diff_max) {
+		/* packet is ahead of the sync event by a quarter of a second or
+		 * less (allowing for fuzz)
+		 */
+		pkt_timestamp_major = channel->sync_timestamp_major + carry;
+	} else if (diff >= ptp->nic_time.sync_event_diff_min) {
+		/* packet is behind the sync event but within the fuzz factor.
+		 * This means the RX packet and sync event crossed as they were
+		 * placed on the event queue, which can sometimes happen.
+		 */
+		pkt_timestamp_major = channel->sync_timestamp_major - 1 + carry;
+	} else {
+		/* it's outside tolerance in both directions. this might be
+		 * indicative of us missing sync events for some reason, so
+		 * we'll call it an error rather than risk giving a bogus
+		 * timestamp.
+		 */
+		netif_vdbg(efx, drv, efx->net_dev,
+			  "packet timestamp %x too far from sync event %x:%x\n",
+			  pkt_timestamp_minor, channel->sync_timestamp_major,
+			  channel->sync_timestamp_minor);
+		return;
+	}
+
+	/* attach the timestamps to the skb */
+	timestamps = skb_hwtstamps(skb);
+	timestamps->hwtstamp =
+		ptp->nic_to_kernel_time(pkt_timestamp_major,
+					pkt_timestamp_minor,
+					ptp->ts_corrections.general_rx);
+}
+
+static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->efx;
+	MCDI_DECLARE_BUF(inadj, MC_CMD_PTP_IN_ADJUST_LEN);
+	s64 adjustment_ns;
+	int rc;
+
+	if (delta > MAX_PPB)
+		delta = MAX_PPB;
+	else if (delta < -MAX_PPB)
+		delta = -MAX_PPB;
+
+	/* Convert ppb to fixed point ns taking care to round correctly. */
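+	/* (Adding 1 << (shift - 1) before the arithmetic right shift rounds
+	 * to the nearest representable value rather than flooring.)
+	 */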
+	adjustment_ns = ((s64)delta * PPB_SCALE_WORD +
+			 (1 << (ptp_data->adjfreq_ppb_shift - 1))) >>
+			ptp_data->adjfreq_ppb_shift;
+
+	MCDI_SET_DWORD(inadj, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST);
+	MCDI_SET_DWORD(inadj, PTP_IN_PERIPH_ID, 0);
+	MCDI_SET_QWORD(inadj, PTP_IN_ADJUST_FREQ, adjustment_ns);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_SECONDS, 0);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_NANOSECONDS, 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inadj, sizeof(inadj),
+			  NULL, 0, NULL);
+	if (rc != 0)
+		return rc;
+
+	ptp_data->current_adjfreq = adjustment_ns;
+	return 0;
+}
+
+static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	u32 nic_major, nic_minor;
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->efx;
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_ADJUST_LEN);
+
+	efx->ptp_data->ns_to_nic_time(delta, &nic_major, &nic_minor);
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+	MCDI_SET_QWORD(inbuf, PTP_IN_ADJUST_FREQ, ptp_data->current_adjfreq);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_MAJOR, nic_major);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_MINOR, nic_minor);
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->efx;
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_PTP_IN_READ_NIC_TIME_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_PTP_OUT_READ_NIC_TIME_LEN);
+	int rc;
+	ktime_t kt;
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_READ_NIC_TIME);
+	MCDI_SET_DWORD(inbuf, PTP_IN_PERIPH_ID, 0);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc != 0)
+		return rc;
+
+	kt = ptp_data->nic_to_kernel_time(
+		MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_MAJOR),
+		MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_MINOR), 0);
+	*ts = ktime_to_timespec64(kt);
+	return 0;
+}
+
+static int efx_phc_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *e_ts)
+{
+	/* Get the current NIC time via efx_phc_gettime(),
+	 * subtract it from the desired time to get the offset,
+	 * then call efx_phc_adjtime() with that offset.
+	 */
+	int rc;
+	struct timespec64 time_now;
+	struct timespec64 delta;
+
+	rc = efx_phc_gettime(ptp, &time_now);
+	if (rc != 0)
+		return rc;
+
+	delta = timespec64_sub(*e_ts, time_now);
+
+	rc = efx_phc_adjtime(ptp, timespec64_to_ns(&delta));
+	if (rc != 0)
+		return rc;
+
+	return 0;
+}
+
+static int efx_phc_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *request,
+			  int enable)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	if (request->type != PTP_CLK_REQ_PPS)
+		return -EOPNOTSUPP;
+
+	ptp_data->nic_ts_enabled = !!enable;
+	return 0;
+}
+
+static const struct efx_channel_type efx_ptp_channel_type = {
+	.handle_no_channel	= efx_ptp_handle_no_channel,
+	.pre_probe		= efx_ptp_probe_channel,
+	.post_remove		= efx_ptp_remove_channel,
+	.get_name		= efx_ptp_get_channel_name,
+	/* no copy operation; there is no need to reallocate this channel */
+	.receive_skb		= efx_ptp_rx,
+	.want_txqs		= efx_ptp_want_txqs,
+	.keep_eventq		= false,
+};
+
+void efx_ptp_defer_probe_with_channel(struct efx_nic *efx)
+{
+	/* Check whether PTP is implemented on this NIC.  The DISABLE
+	 * operation will succeed if and only if it is implemented.
+	 */
+	if (efx_ptp_disable(efx) == 0)
+		efx->extra_channel_type[EFX_EXTRA_CHANNEL_PTP] =
+			&efx_ptp_channel_type;
+}
+
+void efx_ptp_start_datapath(struct efx_nic *efx)
+{
+	if (efx_ptp_restart(efx))
+		netif_err(efx, drv, efx->net_dev, "Failed to restart PTP.\n");
+	/* re-enable timestamping if it was previously enabled */
+	if (efx->type->ptp_set_ts_sync_events)
+		efx->type->ptp_set_ts_sync_events(efx, true, true);
+}
+
+void efx_ptp_stop_datapath(struct efx_nic *efx)
+{
+	/* temporarily disable timestamping */
+	if (efx->type->ptp_set_ts_sync_events)
+		efx->type->ptp_set_ts_sync_events(efx, false, true);
+	efx_ptp_stop(efx);
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/ptp.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2013 Solarflare Communications Inc.
+ * Copyright 2019-2020 Xilinx Inc.
+ */
+
+#ifndef EFX_PTP_H
+#define EFX_PTP_H
+
+#include <linux/net_tstamp.h>
+#include "net_driver.h"
+
+struct ethtool_ts_info;
+int efx_ptp_probe(struct efx_nic *efx, struct efx_channel *channel);
+void efx_ptp_defer_probe_with_channel(struct efx_nic *efx);
+struct efx_channel *efx_ptp_channel(struct efx_nic *efx);
+void efx_ptp_remove(struct efx_nic *efx);
+int efx_ptp_set_ts_config(struct efx_nic *efx, struct ifreq *ifr);
+int efx_ptp_get_ts_config(struct efx_nic *efx, struct ifreq *ifr);
+void efx_ptp_get_ts_info(struct efx_nic *efx, struct ethtool_ts_info *ts_info);
+bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb);
+int efx_ptp_get_mode(struct efx_nic *efx);
+int efx_ptp_change_mode(struct efx_nic *efx, bool enable_wanted,
+			unsigned int new_mode);
+int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb);
+void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev);
+size_t efx_ptp_describe_stats(struct efx_nic *efx, u8 *strings);
+size_t efx_ptp_update_stats(struct efx_nic *efx, u64 *stats);
+void efx_time_sync_event(struct efx_channel *channel, efx_qword_t *ev);
+void __efx_rx_skb_attach_timestamp(struct efx_channel *channel,
+				   struct sk_buff *skb);
+static inline void efx_rx_skb_attach_timestamp(struct efx_channel *channel,
+					       struct sk_buff *skb)
+{
+	if (channel->sync_events_state == SYNC_EVENTS_VALID)
+		__efx_rx_skb_attach_timestamp(channel, skb);
+}
+void efx_ptp_start_datapath(struct efx_nic *efx);
+void efx_ptp_stop_datapath(struct efx_nic *efx);
+bool efx_ptp_use_mac_tx_timestamps(struct efx_nic *efx);
+ktime_t efx_ptp_nic_to_kernel_time(struct efx_tx_queue *tx_queue);
+
+#endif /* EFX_PTP_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/rx.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2005-2013 Solarflare Communications Inc.
+ */
+
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/prefetch.h>
+#include <linux/moduleparam.h>
+#include <linux/iommu.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/xdp.h>
+#include <linux/bpf_trace.h>
+#include "net_driver.h"
+#include "efx.h"
+#include "rx_common.h"
+#include "filter.h"
+#include "nic.h"
+#include "selftest.h"
+#include "workarounds.h"
+
+/* Preferred number of descriptors to fill at once */
+#define EFX_RX_PREFERRED_BATCH 8U
+
+/* Maximum rx prefix used by any architecture. */
+#define EFX_MAX_RX_PREFIX_SIZE 16
+
+/* Size of buffer allocated for skb header area. */
+#define EFX_SKB_HEADERS  128u
+
+/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */
+#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
+				      EFX_RX_USR_BUF_SIZE)
+
+static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
+				     struct efx_rx_buffer *rx_buf,
+				     int len)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;
+
+	if (likely(len <= max_len))
+		return;
+
+	/* The packet must be discarded, but this is only a fatal error
+	 * if the caller indicated it was
+	 */
+	rx_buf->flags |= EFX_RX_PKT_DISCARD;
+
+	if (net_ratelimit())
+		netif_err(efx, rx_err, efx->net_dev,
+			  "RX queue %d overlength RX event (%#x > %#x)\n",
+			  efx_rx_queue_index(rx_queue), len, max_len);
+
+	efx_rx_queue_channel(rx_queue)->n_rx_overlength++;
+}
+
+/* Allocate and construct an SKB around page fragments */
+static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
+				     struct efx_rx_buffer *rx_buf,
+				     unsigned int n_frags,
+				     u8 *eh, int hdr_len)
+{
+	struct efx_nic *efx = channel->efx;
+	struct sk_buff *skb;
+
+	/* Allocate an SKB to store the headers */
+	skb = netdev_alloc_skb(efx->net_dev,
+			       efx->rx_ip_align + efx->rx_prefix_size +
+			       hdr_len);
+	if (unlikely(skb == NULL)) {
+		atomic_inc(&efx->n_rx_noskb_drops);
+		return NULL;
+	}
+
+	EFX_WARN_ON_ONCE_PARANOID(rx_buf->len < hdr_len);
+
+	memcpy(skb->data + efx->rx_ip_align, eh - efx->rx_prefix_size,
+	       efx->rx_prefix_size + hdr_len);
+	skb_reserve(skb, efx->rx_ip_align + efx->rx_prefix_size);
+	__skb_put(skb, hdr_len);
+
+	/* Append the remaining page(s) onto the frag list */
+	if (rx_buf->len > hdr_len) {
+		rx_buf->page_offset += hdr_len;
+		rx_buf->len -= hdr_len;
+
+		for (;;) {
+			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+					rx_buf->page, rx_buf->page_offset,
+					rx_buf->len, efx->rx_buffer_truesize);
+			rx_buf->page = NULL;
+
+			if (skb_shinfo(skb)->nr_frags == n_frags)
+				break;
+
+			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
+		}
+	} else {
+		__free_pages(rx_buf->page, efx->rx_buffer_order);
+		rx_buf->page = NULL;
+		n_frags = 0;
+	}
+
+	/* Move past the ethernet header */
+	skb->protocol = eth_type_trans(skb, efx->net_dev);
+
+	skb_mark_napi_id(skb, &channel->napi_str);
+
+	return skb;
+}
+
+void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
+		   unsigned int n_frags, unsigned int len, u16 flags)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
+	struct efx_rx_buffer *rx_buf;
+
+	rx_queue->rx_packets++;
+
+	rx_buf = efx_rx_buffer(rx_queue, index);
+	rx_buf->flags |= flags;
+
+	/* Validate the number of fragments and completed length */
+	if (n_frags == 1) {
+		if (!(flags & EFX_RX_PKT_PREFIX_LEN))
+			efx_rx_packet__check_len(rx_queue, rx_buf, len);
+	} else if (unlikely(n_frags > EFX_RX_MAX_FRAGS) ||
+		   unlikely(len <= (n_frags - 1) * efx->rx_dma_len) ||
+		   unlikely(len > n_frags * efx->rx_dma_len) ||
+		   unlikely(!efx->rx_scatter)) {
+		/* If this isn't an explicit discard request, either
+		 * the hardware or the driver is broken.
+		 */
+		WARN_ON(!(len == 0 && rx_buf->flags & EFX_RX_PKT_DISCARD));
+		rx_buf->flags |= EFX_RX_PKT_DISCARD;
+	}
+
+	netif_vdbg(efx, rx_status, efx->net_dev,
+		   "RX queue %d received ids %x-%x len %d %s%s\n",
+		   efx_rx_queue_index(rx_queue), index,
+		   (index + n_frags - 1) & rx_queue->ptr_mask, len,
+		   (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "",
+		   (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : "");
+
+	/* Discard packet, if instructed to do so.  Process the
+	 * previous receive first.
+	 */
+	if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
+		efx_rx_flush_packet(channel);
+		efx_discard_rx_packet(channel, rx_buf, n_frags);
+		return;
+	}
+
+	if (n_frags == 1 && !(flags & EFX_RX_PKT_PREFIX_LEN))
+		rx_buf->len = len;
+
+	/* Release and/or sync the DMA mapping - assumes all RX buffers
+	 * consumed in-order per RX queue.
+	 */
+	efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
+
+	/* Prefetch nice and early so data will (hopefully) be in cache by
+	 * the time we look at it.
+	 */
+	prefetch(efx_rx_buf_va(rx_buf));
+
+	rx_buf->page_offset += efx->rx_prefix_size;
+	rx_buf->len -= efx->rx_prefix_size;
+
+	if (n_frags > 1) {
+		/* Release/sync DMA mapping for additional fragments.
+		 * Fix length for last fragment.
+		 */
+		unsigned int tail_frags = n_frags - 1;
+
+		for (;;) {
+			rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
+			if (--tail_frags == 0)
+				break;
+			efx_sync_rx_buffer(efx, rx_buf, efx->rx_dma_len);
+		}
+		rx_buf->len = len - (n_frags - 1) * efx->rx_dma_len;
+		efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
+	}
+
+	/* All fragments have been DMA-synced, so recycle pages. */
+	rx_buf = efx_rx_buffer(rx_queue, index);
+	efx_recycle_rx_pages(channel, rx_buf, n_frags);
+
+	/* Pipeline receives so that we give time for packet headers to be
+	 * prefetched into cache.
+	 */
+	efx_rx_flush_packet(channel);
+	channel->rx_pkt_n_frags = n_frags;
+	channel->rx_pkt_index = index;
+}
+
+static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
+			   struct efx_rx_buffer *rx_buf,
+			   unsigned int n_frags)
+{
+	struct sk_buff *skb;
+	u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);
+
+	skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len);
+	if (unlikely(skb == NULL)) {
+		struct efx_rx_queue *rx_queue;
+
+		rx_queue = efx_channel_get_rx_queue(channel);
+		efx_free_rx_buffers(rx_queue, rx_buf, n_frags);
+		return;
+	}
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
+
+	/* Set the SKB flags */
+	skb_checksum_none_assert(skb);
+	if (likely(rx_buf->flags & EFX_RX_PKT_CSUMMED)) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		skb->csum_level = !!(rx_buf->flags & EFX_RX_PKT_CSUM_LEVEL);
+	}
+
+	efx_rx_skb_attach_timestamp(channel, skb);
+
+	if (channel->type->receive_skb)
+		if (channel->type->receive_skb(channel, skb))
+			return;
+
+	/* Pass the packet up */
+	if (channel->rx_list != NULL)
+		/* Add to list, will pass up later */
+		list_add_tail(&skb->list, channel->rx_list);
+	else
+		/* No list, so pass it up now */
+		netif_receive_skb(skb);
+}
+
+/** efx_do_xdp: perform XDP processing on a received packet
+ *
+ * Returns true if packet should still be delivered.
+ */
+static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel,
+		       struct efx_rx_buffer *rx_buf, u8 **ehp)
+{
+	u8 rx_prefix[EFX_MAX_RX_PREFIX_SIZE];
+	struct efx_rx_queue *rx_queue;
+	struct bpf_prog *xdp_prog;
+	struct xdp_frame *xdpf;
+	struct xdp_buff xdp;
+	u32 xdp_act;
+	s16 offset;
+	int err;
+
+	xdp_prog = rcu_dereference_bh(efx->xdp_prog);
+	if (!xdp_prog)
+		return true;
+
+	rx_queue = efx_channel_get_rx_queue(channel);
+
+	if (unlikely(channel->rx_pkt_n_frags > 1)) {
+		/* We can't do XDP on fragmented packets - drop. */
+		efx_free_rx_buffers(rx_queue, rx_buf,
+				    channel->rx_pkt_n_frags);
+		if (net_ratelimit())
+			netif_err(efx, rx_err, efx->net_dev,
+				  "XDP is not possible with multiple receive fragments (%d)\n",
+				  channel->rx_pkt_n_frags);
+		channel->n_rx_xdp_bad_drops++;
+		return false;
+	}
+
+	dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr,
+				rx_buf->len, DMA_FROM_DEVICE);
+
+	/* Save the rx prefix. */
+	EFX_WARN_ON_PARANOID(efx->rx_prefix_size > EFX_MAX_RX_PREFIX_SIZE);
+	memcpy(rx_prefix, *ehp - efx->rx_prefix_size,
+	       efx->rx_prefix_size);
+
+	xdp_init_buff(&xdp, efx->rx_page_buf_step, &rx_queue->xdp_rxq_info);
+	/* No support yet for XDP metadata */
+	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
+			 rx_buf->len, false);
+
+	xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	offset = (u8 *)xdp.data - *ehp;
+
+	switch (xdp_act) {
+	case XDP_PASS:
+		/* Fix up rx prefix. */
+		if (offset) {
+			*ehp += offset;
+			rx_buf->page_offset += offset;
+			rx_buf->len -= offset;
+			memcpy(*ehp - efx->rx_prefix_size, rx_prefix,
+			       efx->rx_prefix_size);
+		}
+		break;
+
+	case XDP_TX:
+		/* Buffer ownership passes to tx on success. */
+		xdpf = xdp_convert_buff_to_frame(&xdp);
+		err = efx_xdp_tx_buffers(efx, 1, &xdpf, true);
+		if (unlikely(err != 1)) {
+			efx_free_rx_buffers(rx_queue, rx_buf, 1);
+			if (net_ratelimit())
+				netif_err(efx, rx_err, efx->net_dev,
+					  "XDP TX failed (%d)\n", err);
+			channel->n_rx_xdp_bad_drops++;
+			trace_xdp_exception(efx->net_dev, xdp_prog, xdp_act);
+		} else {
+			channel->n_rx_xdp_tx++;
+		}
+		break;
+
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(efx->net_dev, &xdp, xdp_prog);
+		if (unlikely(err)) {
+			efx_free_rx_buffers(rx_queue, rx_buf, 1);
+			if (net_ratelimit())
+				netif_err(efx, rx_err, efx->net_dev,
+					  "XDP redirect failed (%d)\n", err);
+			channel->n_rx_xdp_bad_drops++;
+			trace_xdp_exception(efx->net_dev, xdp_prog, xdp_act);
+		} else {
+			channel->n_rx_xdp_redirect++;
+		}
+		break;
+
+	default:
+		bpf_warn_invalid_xdp_action(efx->net_dev, xdp_prog, xdp_act);
+		efx_free_rx_buffers(rx_queue, rx_buf, 1);
+		channel->n_rx_xdp_bad_drops++;
+		trace_xdp_exception(efx->net_dev, xdp_prog, xdp_act);
+		break;
+
+	case XDP_ABORTED:
+		trace_xdp_exception(efx->net_dev, xdp_prog, xdp_act);
+		fallthrough;
+	case XDP_DROP:
+		efx_free_rx_buffers(rx_queue, rx_buf, 1);
+		channel->n_rx_xdp_drops++;
+		break;
+	}
+
+	return xdp_act == XDP_PASS;
+}
+
+/* Handle a received packet.  Second half: Touches packet payload. */
+void __efx_rx_packet(struct efx_channel *channel)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_rx_buffer *rx_buf =
+		efx_rx_buffer(&channel->rx_queue, channel->rx_pkt_index);
+	u8 *eh = efx_rx_buf_va(rx_buf);
+
+	/* Read length from the prefix if necessary.  This already
+	 * excludes the length of the prefix itself.
+	 */
+	if (rx_buf->flags & EFX_RX_PKT_PREFIX_LEN)
+		rx_buf->len = le16_to_cpup((__le16 *)
+					   (eh + efx->rx_packet_len_offset));
+
+	/* If we're in loopback test, then pass the packet directly to the
+	 * loopback layer, and free the rx_buf here
+	 */
+	if (unlikely(efx->loopback_selftest)) {
+		struct efx_rx_queue *rx_queue;
+
+		efx_loopback_rx_packet(efx, eh, rx_buf->len);
+		rx_queue = efx_channel_get_rx_queue(channel);
+		efx_free_rx_buffers(rx_queue, rx_buf,
+				    channel->rx_pkt_n_frags);
+		goto out;
+	}
+
+	if (!efx_do_xdp(efx, channel, rx_buf, &eh))
+		goto out;
+
+	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
+		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
+
+	if ((rx_buf->flags & EFX_RX_PKT_TCP) && !channel->type->receive_skb)
+		efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh, 0);
+	else
+		efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags);
+out:
+	channel->rx_pkt_n_frags = 0;
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/rx_common.c
@@ -0,0 +1,1086 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2018 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include "net_driver.h"
+#include <linux/module.h>
+#include <linux/iommu.h>
+#include "efx.h"
+#include "nic.h"
+#include "rx_common.h"
+
+/* This is the percentage fill level below which new RX descriptors
+ * will be added to the RX descriptor ring.
+ */
+static unsigned int rx_refill_threshold;
+module_param(rx_refill_threshold, uint, 0444);
+MODULE_PARM_DESC(rx_refill_threshold,
+		 "RX descriptor ring refill threshold (%)");
+
+/* RX maximum head room required.
+ *
+ * This must be at least 1 to prevent overflow, plus one packet-worth
+ * to allow pipelined receives.
+ */
+#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS)
+
+/* Check the RX page recycle ring for a page that can be reused. */
+static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	struct efx_rx_page_state *state;
+	unsigned int index;
+	struct page *page;
+
+	if (unlikely(!rx_queue->page_ring))
+		return NULL;
+	index = rx_queue->page_remove & rx_queue->page_ptr_mask;
+	page = rx_queue->page_ring[index];
+	if (page == NULL)
+		return NULL;
+
+	rx_queue->page_ring[index] = NULL;
+	/* page_remove cannot exceed page_add. */
+	if (rx_queue->page_remove != rx_queue->page_add)
+		++rx_queue->page_remove;
+
+	/* If page_count is 1 then we hold the only reference to this page. */
+	if (page_count(page) == 1) {
+		++rx_queue->page_recycle_count;
+		return page;
+	} else {
+		state = page_address(page);
+		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
+			       PAGE_SIZE << efx->rx_buffer_order,
+			       DMA_FROM_DEVICE);
+		put_page(page);
+		++rx_queue->page_recycle_failed;
+	}
+
+	return NULL;
+}
+
+/* Attempt to recycle the page if there is an RX recycle ring; the page can
+ * only be added if this is the final RX buffer, to prevent pages being used in
+ * the descriptor ring and appearing in the recycle ring simultaneously.
+ */
+static void efx_recycle_rx_page(struct efx_channel *channel,
+				struct efx_rx_buffer *rx_buf)
+{
+	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
+	struct efx_nic *efx = rx_queue->efx;
+	struct page *page = rx_buf->page;
+	unsigned int index;
+
+	/* Only recycle the page after processing the final buffer. */
+	if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE))
+		return;
+
+	index = rx_queue->page_add & rx_queue->page_ptr_mask;
+	if (rx_queue->page_ring[index] == NULL) {
+		unsigned int read_index = rx_queue->page_remove &
+			rx_queue->page_ptr_mask;
+
+		/* The next slot in the recycle ring is available, but
+		 * increment page_remove if the read pointer currently
+		 * points here.
+		 */
+		if (read_index == index)
+			++rx_queue->page_remove;
+		rx_queue->page_ring[index] = page;
+		++rx_queue->page_add;
+		return;
+	}
+	++rx_queue->page_recycle_full;
+	efx_unmap_rx_buffer(efx, rx_buf);
+	put_page(rx_buf->page);
+}
+
+/* Recycle the pages that are used by buffers that have just been received. */
+void efx_recycle_rx_pages(struct efx_channel *channel,
+			  struct efx_rx_buffer *rx_buf,
+			  unsigned int n_frags)
+{
+	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
+
+	if (unlikely(!rx_queue->page_ring))
+		return;
+
+	do {
+		efx_recycle_rx_page(channel, rx_buf);
+		rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
+	} while (--n_frags);
+}
+
+void efx_discard_rx_packet(struct efx_channel *channel,
+			   struct efx_rx_buffer *rx_buf,
+			   unsigned int n_frags)
+{
+	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
+
+	efx_recycle_rx_pages(channel, rx_buf, n_frags);
+
+	efx_free_rx_buffers(rx_queue, rx_buf, n_frags);
+}
+
+static void efx_init_rx_recycle_ring(struct efx_rx_queue *rx_queue)
+{
+	unsigned int bufs_in_recycle_ring, page_ring_size;
+	struct efx_nic *efx = rx_queue->efx;
+
+	bufs_in_recycle_ring = efx_rx_recycle_ring_size(efx);
+	page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
+					    efx->rx_bufs_per_page);
+	rx_queue->page_ring = kcalloc(page_ring_size,
+				      sizeof(*rx_queue->page_ring), GFP_KERNEL);
+	if (!rx_queue->page_ring)
+		rx_queue->page_ptr_mask = 0;
+	else
+		rx_queue->page_ptr_mask = page_ring_size - 1;
+}
+
+static void efx_fini_rx_recycle_ring(struct efx_rx_queue *rx_queue)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	int i;
+
+	if (unlikely(!rx_queue->page_ring))
+		return;
+
+	/* Unmap and release the pages in the recycle ring. Remove the ring. */
+	for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
+		struct page *page = rx_queue->page_ring[i];
+		struct efx_rx_page_state *state;
+
+		if (page == NULL)
+			continue;
+
+		state = page_address(page);
+		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
+			       PAGE_SIZE << efx->rx_buffer_order,
+			       DMA_FROM_DEVICE);
+		put_page(page);
+	}
+	kfree(rx_queue->page_ring);
+	rx_queue->page_ring = NULL;
+}
+
+static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
+			       struct efx_rx_buffer *rx_buf)
+{
+	/* Release the page reference we hold for the buffer. */
+	if (rx_buf->page)
+		put_page(rx_buf->page);
+
+	/* If this is the last buffer in a page, unmap and free it. */
+	if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) {
+		efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
+		efx_free_rx_buffers(rx_queue, rx_buf, 1);
+	}
+	rx_buf->page = NULL;
+}
+
+int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	unsigned int entries;
+	int rc;
+
+	/* Create the smallest power-of-two aligned ring */
+	entries = max(roundup_pow_of_two(efx->rxq_entries), EFX_MIN_DMAQ_SIZE);
+	EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
+	rx_queue->ptr_mask = entries - 1;
+
+	netif_dbg(efx, probe, efx->net_dev,
+		  "creating RX queue %d size %#x mask %#x\n",
+		  efx_rx_queue_index(rx_queue), efx->rxq_entries,
+		  rx_queue->ptr_mask);
+
+	/* Allocate RX buffers */
+	rx_queue->buffer = kcalloc(entries, sizeof(*rx_queue->buffer),
+				   GFP_KERNEL);
+	if (!rx_queue->buffer)
+		return -ENOMEM;
+
+	rc = efx_nic_probe_rx(rx_queue);
+	if (rc) {
+		kfree(rx_queue->buffer);
+		rx_queue->buffer = NULL;
+	}
+
+	return rc;
+}
+
+void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
+{
+	unsigned int max_fill, trigger, max_trigger;
+	struct efx_nic *efx = rx_queue->efx;
+	int rc = 0;
+
+	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
+		  "initialising RX queue %d\n", efx_rx_queue_index(rx_queue));
+
+	/* Initialise ptr fields */
+	rx_queue->added_count = 0;
+	rx_queue->notified_count = 0;
+	rx_queue->removed_count = 0;
+	rx_queue->min_fill = -1U;
+	efx_init_rx_recycle_ring(rx_queue);
+
+	rx_queue->page_remove = 0;
+	rx_queue->page_add = rx_queue->page_ptr_mask + 1;
+	rx_queue->page_recycle_count = 0;
+	rx_queue->page_recycle_failed = 0;
+	rx_queue->page_recycle_full = 0;
+
+	/* Initialise limit fields */
+	max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
+	max_trigger =
+		max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;
+	if (rx_refill_threshold != 0) {
+		trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
+		if (trigger > max_trigger)
+			trigger = max_trigger;
+	} else {
+		trigger = max_trigger;
+	}
+
+	rx_queue->max_fill = max_fill;
+	rx_queue->fast_fill_trigger = trigger;
+	rx_queue->refill_enabled = true;
+
+	/* Initialise XDP queue information */
+	rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev,
+			      rx_queue->core_index, 0);
+
+	if (rc) {
+		netif_err(efx, rx_err, efx->net_dev,
+			  "Failure to initialise XDP queue information rc=%d\n",
+			  rc);
+		efx->xdp_rxq_info_failed = true;
+	} else {
+		rx_queue->xdp_rxq_info_valid = true;
+	}
+
+	/* Set up RX descriptor ring */
+	efx_nic_init_rx(rx_queue);
+}
+
+void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
+{
+	struct efx_rx_buffer *rx_buf;
+	int i;
+
+	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
+		  "shutting down RX queue %d\n", efx_rx_queue_index(rx_queue));
+
+	del_timer_sync(&rx_queue->slow_fill);
+
+	/* Release RX buffers from the current read ptr to the write ptr */
+	if (rx_queue->buffer) {
+		for (i = rx_queue->removed_count; i < rx_queue->added_count;
+		     i++) {
+			unsigned int index = i & rx_queue->ptr_mask;
+
+			rx_buf = efx_rx_buffer(rx_queue, index);
+			efx_fini_rx_buffer(rx_queue, rx_buf);
+		}
+	}
+
+	efx_fini_rx_recycle_ring(rx_queue);
+
+	if (rx_queue->xdp_rxq_info_valid)
+		xdp_rxq_info_unreg(&rx_queue->xdp_rxq_info);
+
+	rx_queue->xdp_rxq_info_valid = false;
+}
+
+void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
+{
+	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
+		  "destroying RX queue %d\n", efx_rx_queue_index(rx_queue));
+
+	efx_nic_remove_rx(rx_queue);
+
+	kfree(rx_queue->buffer);
+	rx_queue->buffer = NULL;
+}
+
+/* Unmap a DMA-mapped page.  This function is only called for the final RX
+ * buffer in a page.
+ */
+void efx_unmap_rx_buffer(struct efx_nic *efx,
+			 struct efx_rx_buffer *rx_buf)
+{
+	struct page *page = rx_buf->page;
+
+	if (page) {
+		struct efx_rx_page_state *state = page_address(page);
+
+		dma_unmap_page(&efx->pci_dev->dev,
+			       state->dma_addr,
+			       PAGE_SIZE << efx->rx_buffer_order,
+			       DMA_FROM_DEVICE);
+	}
+}
+
+void efx_free_rx_buffers(struct efx_rx_queue *rx_queue,
+			 struct efx_rx_buffer *rx_buf,
+			 unsigned int num_bufs)
+{
+	do {
+		if (rx_buf->page) {
+			put_page(rx_buf->page);
+			rx_buf->page = NULL;
+		}
+		rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
+	} while (--num_bufs);
+}
+
+void efx_rx_slow_fill(struct timer_list *t)
+{
+	struct efx_rx_queue *rx_queue = from_timer(rx_queue, t, slow_fill);
+
+	/* Post an event to cause NAPI to run and refill the queue */
+	efx_nic_generate_fill_event(rx_queue);
+	++rx_queue->slow_fill_count;
+}
+
+void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue)
+{
+	mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(10));
+}
+
+/* efx_init_rx_buffers - create a batch of page-based RX buffers
+ *
+ * @rx_queue:		Efx RX queue
+ *
+ * This allocates a batch of pages, maps them for DMA, and populates
+ * struct efx_rx_buffers for each one. Return a negative error code or
+ * 0 on success. If a single page can be used for multiple buffers,
+ * then the page will either be inserted fully, or not at all.
+ */
+static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
+{
+	unsigned int page_offset, index, count;
+	struct efx_nic *efx = rx_queue->efx;
+	struct efx_rx_page_state *state;
+	struct efx_rx_buffer *rx_buf;
+	dma_addr_t dma_addr;
+	struct page *page;
+
+	count = 0;
+	do {
+		page = efx_reuse_page(rx_queue);
+		if (page == NULL) {
+			page = alloc_pages(__GFP_COMP |
+					   (atomic ? GFP_ATOMIC : GFP_KERNEL),
+					   efx->rx_buffer_order);
+			if (unlikely(page == NULL))
+				return -ENOMEM;
+			dma_addr =
+				dma_map_page(&efx->pci_dev->dev, page, 0,
+					     PAGE_SIZE << efx->rx_buffer_order,
+					     DMA_FROM_DEVICE);
+			if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
+						       dma_addr))) {
+				__free_pages(page, efx->rx_buffer_order);
+				return -EIO;
+			}
+			state = page_address(page);
+			state->dma_addr = dma_addr;
+		} else {
+			state = page_address(page);
+			dma_addr = state->dma_addr;
+		}
+
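+		/* A struct efx_rx_page_state sits at the start of each
+		 * page; RX buffers are laid out after it.
+		 */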
+		dma_addr += sizeof(struct efx_rx_page_state);
+		page_offset = sizeof(struct efx_rx_page_state);
+
+		do {
+			index = rx_queue->added_count & rx_queue->ptr_mask;
+			rx_buf = efx_rx_buffer(rx_queue, index);
+			rx_buf->dma_addr = dma_addr + efx->rx_ip_align +
+					   EFX_XDP_HEADROOM;
+			rx_buf->page = page;
+			rx_buf->page_offset = page_offset + efx->rx_ip_align +
+					      EFX_XDP_HEADROOM;
+			rx_buf->len = efx->rx_dma_len;
+			rx_buf->flags = 0;
+			++rx_queue->added_count;
+			get_page(page);
+			dma_addr += efx->rx_page_buf_step;
+			page_offset += efx->rx_page_buf_step;
+		} while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE);
+
+		rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
+	} while (++count < efx->rx_pages_per_batch);
+
+	return 0;
+}
+
+void efx_rx_config_page_split(struct efx_nic *efx)
+{
+	efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + efx->rx_ip_align +
+				      EFX_XDP_HEADROOM + EFX_XDP_TAILROOM,
+				      EFX_RX_BUF_ALIGNMENT);
+	efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 :
+		((PAGE_SIZE - sizeof(struct efx_rx_page_state)) /
+		efx->rx_page_buf_step);
+	efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) /
+		efx->rx_bufs_per_page;
+	efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
+					       efx->rx_bufs_per_page);
+}
+
+/* efx_fast_push_rx_descriptors - push new RX descriptors quickly
+ * @rx_queue:		RX descriptor queue
+ *
+ * This will aim to fill the RX descriptor queue up to
+ * @rx_queue->max_fill. If there is insufficient atomic
+ * memory to do so, a slow fill will be scheduled.
+ *
+ * The caller must provide serialisation (none is used here). In practice,
+ * this means this function must run from the NAPI handler, or be called
+ * when NAPI is disabled.
+ */
+void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	unsigned int fill_level, batch_size;
+	int space, rc = 0;
+
+	if (!rx_queue->refill_enabled)
+		return;
+
+	/* Calculate current fill level, and exit if we don't need to fill */
+	fill_level = (rx_queue->added_count - rx_queue->removed_count);
+	EFX_WARN_ON_ONCE_PARANOID(fill_level > rx_queue->efx->rxq_entries);
+	if (fill_level >= rx_queue->fast_fill_trigger)
+		goto out;
+
+	/* Record minimum fill level */
+	if (unlikely(fill_level < rx_queue->min_fill)) {
+		if (fill_level)
+			rx_queue->min_fill = fill_level;
+	}
+
+	batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;
+	space = rx_queue->max_fill - fill_level;
+	EFX_WARN_ON_ONCE_PARANOID(space < batch_size);
+
+	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
+		   "RX queue %d fast-filling descriptor ring from"
+		   " level %d to level %d\n",
+		   efx_rx_queue_index(rx_queue), fill_level,
+		   rx_queue->max_fill);
+
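+	/* Refill one batch of buffers per iteration and stop once the
+	 * remaining space cannot hold another full batch.
+	 */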
+	do {
+		rc = efx_init_rx_buffers(rx_queue, atomic);
+		if (unlikely(rc)) {
+			/* Ensure that we don't leave the rx queue empty */
+			efx_schedule_slow_fill(rx_queue);
+			goto out;
+		}
+	} while ((space -= batch_size) >= batch_size);
+
+	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
+		   "RX queue %d fast-filled descriptor ring "
+		   "to level %d\n", efx_rx_queue_index(rx_queue),
+		   rx_queue->added_count - rx_queue->removed_count);
+
+ out:
+	if (rx_queue->notified_count != rx_queue->added_count)
+		efx_nic_notify_rx_desc(rx_queue);
+}
+
+/* Pass a received packet up through GRO.  GRO can handle pages
+ * regardless of checksum state and skbs with a good checksum.
+ */
+void
+efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
+		  unsigned int n_frags, u8 *eh, __wsum csum)
+{
+	struct napi_struct *napi = &channel->napi_str;
+	struct efx_nic *efx = channel->efx;
+	struct sk_buff *skb;
+
+	skb = napi_get_frags(napi);
+	if (unlikely(!skb)) {
+		struct efx_rx_queue *rx_queue;
+
+		rx_queue = efx_channel_get_rx_queue(channel);
+		efx_free_rx_buffers(rx_queue, rx_buf, n_frags);
+		return;
+	}
+
+	if (efx->net_dev->features & NETIF_F_RXHASH &&
+	    efx_rx_buf_hash_valid(efx, eh))
+		skb_set_hash(skb, efx_rx_buf_hash(efx, eh),
+			     PKT_HASH_TYPE_L3);
+	if (csum) {
+		skb->csum = csum;
+		skb->ip_summed = CHECKSUM_COMPLETE;
+	} else {
+		skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
+				  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
+	}
+	skb->csum_level = !!(rx_buf->flags & EFX_RX_PKT_CSUM_LEVEL);
+
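+	/* Attach each buffer's page as an skb fragment; the page
+	 * reference is transferred to the skb.
+	 */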
+	for (;;) {
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
+				   rx_buf->page, rx_buf->page_offset,
+				   rx_buf->len);
+		rx_buf->page = NULL;
+		skb->len += rx_buf->len;
+		if (skb_shinfo(skb)->nr_frags == n_frags)
+			break;
+
+		rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
+	}
+
+	skb->data_len = skb->len;
+	skb->truesize += n_frags * efx->rx_buffer_truesize;
+
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
+
+	napi_gro_frags(napi);
+}
+
+/* RSS contexts.  We're using linked lists and crappy O(n) algorithms, because
+ * (a) this is an infrequent control-plane operation and (b) n is small (max 64)
+ */
+struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx)
+{
+	struct list_head *head = &efx->rss_context.list;
+	struct efx_rss_context *ctx, *new;
+	u32 id = 1; /* Don't use zero, that refers to the master RSS context */
+
+	WARN_ON(!mutex_is_locked(&efx->rss_lock));
+
+	/* Search for first gap in the numbering */
+	list_for_each_entry(ctx, head, list) {
+		if (ctx->user_id != id)
+			break;
+		id++;
+		/* Check for wrap.  If this happens, we have nearly 2^32
+		 * allocated RSS contexts, which seems unlikely.
+		 */
+		if (WARN_ON_ONCE(!id))
+			return NULL;
+	}
+
+	/* Create the new entry */
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+	new->context_id = EFX_MCDI_RSS_CONTEXT_INVALID;
+	new->rx_hash_udp_4tuple = false;
+
+	/* Insert the new entry into the gap */
+	new->user_id = id;
+	list_add_tail(&new->list, &ctx->list);
+	return new;
+}
+
+struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id)
+{
+	struct list_head *head = &efx->rss_context.list;
+	struct efx_rss_context *ctx;
+
+	WARN_ON(!mutex_is_locked(&efx->rss_lock));
+
+	list_for_each_entry(ctx, head, list)
+		if (ctx->user_id == id)
+			return ctx;
+	return NULL;
+}
+
+void efx_free_rss_context_entry(struct efx_rss_context *ctx)
+{
+	list_del(&ctx->list);
+	kfree(ctx);
+}
+
+void efx_set_default_rx_indir_table(struct efx_nic *efx,
+				    struct efx_rss_context *ctx)
+{
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(ctx->rx_indir_table); i++)
+		ctx->rx_indir_table[i] =
+			ethtool_rxfh_indir_default(i, efx->rss_spread);
+}
+
+/**
+ * efx_filter_is_mc_recipient - test whether spec is a multicast recipient
+ * @spec: Specification to test
+ *
+ * Return: %true if the specification is a non-drop RX filter that
+ * matches a local MAC address I/G bit value of 1 or matches a local
+ * IPv4 or IPv6 address value in the respective multicast address
+ * range.  Otherwise %false.
+ */
+bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec)
+{
+	if (!(spec->flags & EFX_FILTER_FLAG_RX) ||
+	    spec->dmaq_id == EFX_FILTER_RX_DMAQ_ID_DROP)
+		return false;
+
+	if (spec->match_flags &
+	    (EFX_FILTER_MATCH_LOC_MAC | EFX_FILTER_MATCH_LOC_MAC_IG) &&
+	    is_multicast_ether_addr(spec->loc_mac))
+		return true;
+
+	if ((spec->match_flags &
+	     (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) ==
+	    (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) {
+		if (spec->ether_type == htons(ETH_P_IP) &&
+		    ipv4_is_multicast(spec->loc_host[0]))
+			return true;
+		if (spec->ether_type == htons(ETH_P_IPV6) &&
+		    ((const u8 *)spec->loc_host)[0] == 0xff)
+			return true;
+	}
+
+	return false;
+}
+
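+/* Specs are compared on their match flags, RX/TX direction flags and the
+ * bytes from outer_vid to the end of the structure; the hash covers those
+ * same trailing bytes.  Both are used by the ARFS hash table below.
+ */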
+bool efx_filter_spec_equal(const struct efx_filter_spec *left,
+			   const struct efx_filter_spec *right)
+{
+	if ((left->match_flags ^ right->match_flags) |
+	    ((left->flags ^ right->flags) &
+	     (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX)))
+		return false;
+
+	return memcmp(&left->outer_vid, &right->outer_vid,
+		      sizeof(struct efx_filter_spec) -
+		      offsetof(struct efx_filter_spec, outer_vid)) == 0;
+}
+
+u32 efx_filter_spec_hash(const struct efx_filter_spec *spec)
+{
+	BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3);
+	return jhash2((const u32 *)&spec->outer_vid,
+		      (sizeof(struct efx_filter_spec) -
+		       offsetof(struct efx_filter_spec, outer_vid)) / 4,
+		      0);
+}
+
+#ifdef CONFIG_RFS_ACCEL
+bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx,
+			bool *force)
+{
+	if (rule->filter_id == EFX_ARFS_FILTER_ID_PENDING) {
+		/* ARFS is currently updating this entry, leave it */
+		return false;
+	}
+	if (rule->filter_id == EFX_ARFS_FILTER_ID_ERROR) {
+		/* ARFS tried and failed to update this, so it's probably out
+		 * of date.  Remove the filter and the ARFS rule entry.
+		 */
+		rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING;
+		*force = true;
+		return true;
+	} else if (WARN_ON(rule->filter_id != filter_idx)) { /* can't happen */
+		/* ARFS has moved on, so old filter is not needed.  Since we did
+		 * not mark the rule with EFX_ARFS_FILTER_ID_REMOVING, it will
+		 * not be removed by efx_rps_hash_del() subsequently.
+		 */
+		*force = true;
+		return true;
+	}
+	/* Remove it iff ARFS wants to. */
+	return true;
+}
+
+static
+struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx,
+				       const struct efx_filter_spec *spec)
+{
+	u32 hash = efx_filter_spec_hash(spec);
+
+	lockdep_assert_held(&efx->rps_hash_lock);
+	if (!efx->rps_hash_table)
+		return NULL;
+	return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE];
+}
+
+struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx,
+					const struct efx_filter_spec *spec)
+{
+	struct efx_arfs_rule *rule;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = efx_rps_hash_bucket(efx, spec);
+	if (!head)
+		return NULL;
+	hlist_for_each(node, head) {
+		rule = container_of(node, struct efx_arfs_rule, node);
+		if (efx_filter_spec_equal(spec, &rule->spec))
+			return rule;
+	}
+	return NULL;
+}
+
+struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx,
+				       const struct efx_filter_spec *spec,
+				       bool *new)
+{
+	struct efx_arfs_rule *rule;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = efx_rps_hash_bucket(efx, spec);
+	if (!head)
+		return NULL;
+	hlist_for_each(node, head) {
+		rule = container_of(node, struct efx_arfs_rule, node);
+		if (efx_filter_spec_equal(spec, &rule->spec)) {
+			*new = false;
+			return rule;
+		}
+	}
+	rule = kmalloc(sizeof(*rule), GFP_ATOMIC);
+	*new = true;
+	if (rule) {
+		memcpy(&rule->spec, spec, sizeof(rule->spec));
+		hlist_add_head(&rule->node, head);
+	}
+	return rule;
+}
+
+void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec)
+{
+	struct efx_arfs_rule *rule;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = efx_rps_hash_bucket(efx, spec);
+	if (WARN_ON(!head))
+		return;
+	hlist_for_each(node, head) {
+		rule = container_of(node, struct efx_arfs_rule, node);
+		if (efx_filter_spec_equal(spec, &rule->spec)) {
+			/* Someone already reused the entry.  We know that if
+			 * this check doesn't fire (i.e. filter_id == REMOVING)
+			 * then the REMOVING mark was put there by our caller,
+			 * because caller is holding a lock on filter table and
+			 * only holders of that lock set REMOVING.
+			 */
+			if (rule->filter_id != EFX_ARFS_FILTER_ID_REMOVING)
+				return;
+			hlist_del(node);
+			kfree(rule);
+			return;
+		}
+	}
+	/* We didn't find it. */
+	WARN_ON(1);
+}
+#endif
+
+int efx_probe_filters(struct efx_nic *efx)
+{
+	int rc;
+
+	mutex_lock(&efx->mac_lock);
+	down_write(&efx->filter_sem);
+	rc = efx->type->filter_table_probe(efx);
+	if (rc)
+		goto out_unlock;
+
+#ifdef CONFIG_RFS_ACCEL
+	if (efx->type->offload_features & NETIF_F_NTUPLE) {
+		struct efx_channel *channel;
+		int i, success = 1;
+
+		efx_for_each_channel(channel, efx) {
+			channel->rps_flow_id =
+				kcalloc(efx->type->max_rx_ip_filters,
+					sizeof(*channel->rps_flow_id),
+					GFP_KERNEL);
+			if (!channel->rps_flow_id)
+				success = 0;
+			else
+				for (i = 0;
+				     i < efx->type->max_rx_ip_filters;
+				     ++i)
+					channel->rps_flow_id[i] =
+						RPS_FLOW_ID_INVALID;
+			channel->rfs_expire_index = 0;
+			channel->rfs_filter_count = 0;
+		}
+
+		if (!success) {
+			efx_for_each_channel(channel, efx)
+				kfree(channel->rps_flow_id);
+			efx->type->filter_table_remove(efx);
+			rc = -ENOMEM;
+			goto out_unlock;
+		}
+	}
+#endif
+out_unlock:
+	up_write(&efx->filter_sem);
+	mutex_unlock(&efx->mac_lock);
+	return rc;
+}
+
+void efx_remove_filters(struct efx_nic *efx)
+{
+#ifdef CONFIG_RFS_ACCEL
+	struct efx_channel *channel;
+
+	efx_for_each_channel(channel, efx) {
+		cancel_delayed_work_sync(&channel->filter_work);
+		kfree(channel->rps_flow_id);
+		channel->rps_flow_id = NULL;
+	}
+#endif
+	down_write(&efx->filter_sem);
+	efx->type->filter_table_remove(efx);
+	up_write(&efx->filter_sem);
+}
+
+#ifdef CONFIG_RFS_ACCEL
+
+static void efx_filter_rfs_work(struct work_struct *data)
+{
+	struct efx_async_filter_insertion *req = container_of(data, struct efx_async_filter_insertion,
+							      work);
+	struct efx_nic *efx = netdev_priv(req->net_dev);
+	struct efx_channel *channel = efx_get_channel(efx, req->rxq_index);
+	int slot_idx = req - efx->rps_slot;
+	struct efx_arfs_rule *rule;
+	u16 arfs_id = 0;
+	int rc;
+
+	rc = efx->type->filter_insert(efx, &req->spec, true);
+	if (rc >= 0)
+		/* Discard 'priority' part of EF10+ filter ID (mcdi_filters) */
+		rc %= efx->type->max_rx_ip_filters;
+	if (efx->rps_hash_table) {
+		spin_lock_bh(&efx->rps_hash_lock);
+		rule = efx_rps_hash_find(efx, &req->spec);
+		/* The rule might have already gone, if someone else's request
+		 * for the same spec was already worked and then expired before
+		 * we got around to our work.  In that case we have nothing
+		 * tying us to an arfs_id, meaning that as soon as the filter
+		 * is considered for expiry it will be removed.
+		 */
+		if (rule) {
+			if (rc < 0)
+				rule->filter_id = EFX_ARFS_FILTER_ID_ERROR;
+			else
+				rule->filter_id = rc;
+			arfs_id = rule->arfs_id;
+		}
+		spin_unlock_bh(&efx->rps_hash_lock);
+	}
+	if (rc >= 0) {
+		/* Remember this so we can check whether to expire the filter
+		 * later.
+		 */
+		mutex_lock(&efx->rps_mutex);
+		if (channel->rps_flow_id[rc] == RPS_FLOW_ID_INVALID)
+			channel->rfs_filter_count++;
+		channel->rps_flow_id[rc] = req->flow_id;
+		mutex_unlock(&efx->rps_mutex);
+
+		if (req->spec.ether_type == htons(ETH_P_IP))
+			netif_info(efx, rx_status, efx->net_dev,
+				   "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d id %u]\n",
+				   (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
+				   req->spec.rem_host, ntohs(req->spec.rem_port),
+				   req->spec.loc_host, ntohs(req->spec.loc_port),
+				   req->rxq_index, req->flow_id, rc, arfs_id);
+		else
+			netif_info(efx, rx_status, efx->net_dev,
+				   "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d id %u]\n",
+				   (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
+				   req->spec.rem_host, ntohs(req->spec.rem_port),
+				   req->spec.loc_host, ntohs(req->spec.loc_port),
+				   req->rxq_index, req->flow_id, rc, arfs_id);
+		channel->n_rfs_succeeded++;
+	} else {
+		if (req->spec.ether_type == htons(ETH_P_IP))
+			netif_dbg(efx, rx_status, efx->net_dev,
+				  "failed to steer %s %pI4:%u:%pI4:%u to queue %u [flow %u rc %d id %u]\n",
+				  (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
+				  req->spec.rem_host, ntohs(req->spec.rem_port),
+				  req->spec.loc_host, ntohs(req->spec.loc_port),
+				  req->rxq_index, req->flow_id, rc, arfs_id);
+		else
+			netif_dbg(efx, rx_status, efx->net_dev,
+				  "failed to steer %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u rc %d id %u]\n",
+				  (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
+				  req->spec.rem_host, ntohs(req->spec.rem_port),
+				  req->spec.loc_host, ntohs(req->spec.loc_port),
+				  req->rxq_index, req->flow_id, rc, arfs_id);
+		channel->n_rfs_failed++;
+		/* We're overloading the NIC's filter tables, so let's do a
+		 * chunk of extra expiry work.
+		 */
+		__efx_filter_rfs_expire(channel, min(channel->rfs_filter_count,
+						     100u));
+	}
+
+	/* Release references */
+	clear_bit(slot_idx, &efx->rps_slot_map);
+	dev_put(req->net_dev);
+}
+
+int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+		   u16 rxq_index, u32 flow_id)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	struct efx_async_filter_insertion *req;
+	struct efx_arfs_rule *rule;
+	struct flow_keys fk;
+	int slot_idx;
+	bool new;
+	int rc;
+
+	/* find a free slot */
+	for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++)
+		if (!test_and_set_bit(slot_idx, &efx->rps_slot_map))
+			break;
+	if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT)
+		return -EBUSY;
+
+	if (flow_id == RPS_FLOW_ID_INVALID) {
+		rc = -EINVAL;
+		goto out_clear;
+	}
+
+	if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) {
+		rc = -EPROTONOSUPPORT;
+		goto out_clear;
+	}
+
+	if (fk.basic.n_proto != htons(ETH_P_IP) && fk.basic.n_proto != htons(ETH_P_IPV6)) {
+		rc = -EPROTONOSUPPORT;
+		goto out_clear;
+	}
+	if (fk.control.flags & FLOW_DIS_IS_FRAGMENT) {
+		rc = -EPROTONOSUPPORT;
+		goto out_clear;
+	}
+
+	req = efx->rps_slot + slot_idx;
+	efx_filter_init_rx(&req->spec, EFX_FILTER_PRI_HINT,
+			   efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
+			   rxq_index);
+	req->spec.match_flags =
+		EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_IP_PROTO |
+		EFX_FILTER_MATCH_LOC_HOST | EFX_FILTER_MATCH_LOC_PORT |
+		EFX_FILTER_MATCH_REM_HOST | EFX_FILTER_MATCH_REM_PORT;
+	req->spec.ether_type = fk.basic.n_proto;
+	req->spec.ip_proto = fk.basic.ip_proto;
+
+	if (fk.basic.n_proto == htons(ETH_P_IP)) {
+		req->spec.rem_host[0] = fk.addrs.v4addrs.src;
+		req->spec.loc_host[0] = fk.addrs.v4addrs.dst;
+	} else {
+		memcpy(req->spec.rem_host, &fk.addrs.v6addrs.src,
+		       sizeof(struct in6_addr));
+		memcpy(req->spec.loc_host, &fk.addrs.v6addrs.dst,
+		       sizeof(struct in6_addr));
+	}
+
+	req->spec.rem_port = fk.ports.src;
+	req->spec.loc_port = fk.ports.dst;
+
+	if (efx->rps_hash_table) {
+		/* Add it to ARFS hash table */
+		spin_lock(&efx->rps_hash_lock);
+		rule = efx_rps_hash_add(efx, &req->spec, &new);
+		if (!rule) {
+			rc = -ENOMEM;
+			goto out_unlock;
+		}
+		if (new)
+			rule->arfs_id = efx->rps_next_id++ % RPS_NO_FILTER;
+		rc = rule->arfs_id;
+		/* Skip if existing or pending filter already does the right thing */
+		if (!new && rule->rxq_index == rxq_index &&
+		    rule->filter_id >= EFX_ARFS_FILTER_ID_PENDING)
+			goto out_unlock;
+		rule->rxq_index = rxq_index;
+		rule->filter_id = EFX_ARFS_FILTER_ID_PENDING;
+		spin_unlock(&efx->rps_hash_lock);
+	} else {
+		/* Without an ARFS hash table, we just use arfs_id 0 for all
+		 * filters.  This means if multiple flows hash to the same
+		 * flow_id, all but the most recently touched will be eligible
+		 * for expiry.
+		 */
+		rc = 0;
+	}
+
+	/* Queue the request */
+	dev_hold(req->net_dev = net_dev);
+	INIT_WORK(&req->work, efx_filter_rfs_work);
+	req->rxq_index = rxq_index;
+	req->flow_id = flow_id;
+	schedule_work(&req->work);
+	return rc;
+out_unlock:
+	spin_unlock(&efx->rps_hash_lock);
+out_clear:
+	clear_bit(slot_idx, &efx->rps_slot_map);
+	return rc;
+}
+
+bool __efx_filter_rfs_expire(struct efx_channel *channel, unsigned int quota)
+{
+	bool (*expire_one)(struct efx_nic *efx, u32 flow_id, unsigned int index);
+	struct efx_nic *efx = channel->efx;
+	unsigned int index, size, start;
+	u32 flow_id;
+
+	if (!mutex_trylock(&efx->rps_mutex))
+		return false;
+	expire_one = efx->type->filter_rfs_expire_one;
+	index = channel->rfs_expire_index;
+	start = index;
+	size = efx->type->max_rx_ip_filters;
+	while (quota) {
+		flow_id = channel->rps_flow_id[index];
+
+		if (flow_id != RPS_FLOW_ID_INVALID) {
+			quota--;
+			if (expire_one(efx, flow_id, index)) {
+				netif_info(efx, rx_status, efx->net_dev,
+					   "expired filter %d [channel %u flow %u]\n",
+					   index, channel->channel, flow_id);
+				channel->rps_flow_id[index] = RPS_FLOW_ID_INVALID;
+				channel->rfs_filter_count--;
+			}
+		}
+		if (++index == size)
+			index = 0;
+		/* If we were called with a quota that exceeds the total number
+		 * of filters in the table (which shouldn't happen, but could
+		 * if two callers race), ensure that we don't loop forever -
+		 * stop when we've examined every row of the table.
+		 */
+		if (index == start)
+			break;
+	}
+
+	channel->rfs_expire_index = index;
+	mutex_unlock(&efx->rps_mutex);
+	return true;
+}
+
+#endif /* CONFIG_RFS_ACCEL */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/rx_common.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2018 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#ifndef EFX_RX_COMMON_H
+#define EFX_RX_COMMON_H
+
+/* Preferred number of descriptors to fill at once */
+#define EFX_RX_PREFERRED_BATCH 8U
+
+/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */
+#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
+				      EFX_RX_USR_BUF_SIZE)
+
+/* Number of RX buffers to recycle pages for.  When creating the RX page recycle
+ * ring, this number is divided by the number of buffers per page to calculate
+ * the number of pages to store in the RX page recycle ring.
+ */
+#define EFX_RECYCLE_RING_SIZE_10G	256
+
+static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
+{
+	return page_address(buf->page) + buf->page_offset;
+}
+
+static inline u32 efx_rx_buf_hash(struct efx_nic *efx, const u8 *eh)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	return __le32_to_cpup((const __le32 *)(eh + efx->rx_packet_hash_offset));
+#else
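+	/* Assemble the little-endian hash value byte by byte on
+	 * architectures without efficient unaligned access.
+	 */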
+	const u8 *data = eh + efx->rx_packet_hash_offset;
+
+	return (u32)data[0]	  |
+	       (u32)data[1] << 8  |
+	       (u32)data[2] << 16 |
+	       (u32)data[3] << 24;
+#endif
+}
+
+void efx_rx_slow_fill(struct timer_list *t);
+
+void efx_recycle_rx_pages(struct efx_channel *channel,
+			  struct efx_rx_buffer *rx_buf,
+			  unsigned int n_frags);
+void efx_discard_rx_packet(struct efx_channel *channel,
+			   struct efx_rx_buffer *rx_buf,
+			   unsigned int n_frags);
+
+int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
+void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
+void efx_fini_rx_queue(struct efx_rx_queue *rx_queue);
+void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);
+void efx_destroy_rx_queue(struct efx_rx_queue *rx_queue);
+
+void efx_init_rx_buffer(struct efx_rx_queue *rx_queue,
+			struct page *page,
+			unsigned int page_offset,
+			u16 flags);
+void efx_unmap_rx_buffer(struct efx_nic *efx, struct efx_rx_buffer *rx_buf);
+
+static inline void efx_sync_rx_buffer(struct efx_nic *efx,
+				      struct efx_rx_buffer *rx_buf,
+				      unsigned int len)
+{
+	dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len,
+				DMA_FROM_DEVICE);
+}
+
+void efx_free_rx_buffers(struct efx_rx_queue *rx_queue,
+			 struct efx_rx_buffer *rx_buf,
+			 unsigned int num_bufs);
+
+void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
+void efx_rx_config_page_split(struct efx_nic *efx);
+void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic);
+
+void
+efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
+		  unsigned int n_frags, u8 *eh, __wsum csum);
+
+struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx);
+struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id);
+void efx_free_rss_context_entry(struct efx_rss_context *ctx);
+void efx_set_default_rx_indir_table(struct efx_nic *efx,
+				    struct efx_rss_context *ctx);
+
+bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec);
+bool efx_filter_spec_equal(const struct efx_filter_spec *left,
+			   const struct efx_filter_spec *right);
+u32 efx_filter_spec_hash(const struct efx_filter_spec *spec);
+
+#ifdef CONFIG_RFS_ACCEL
+bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx,
+			bool *force);
+struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx,
+					const struct efx_filter_spec *spec);
+struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx,
+				       const struct efx_filter_spec *spec,
+				       bool *new);
+void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec);
+
+int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+		   u16 rxq_index, u32 flow_id);
+bool __efx_filter_rfs_expire(struct efx_channel *channel, unsigned int quota);
+#endif
+
+int efx_probe_filters(struct efx_nic *efx);
+void efx_remove_filters(struct efx_nic *efx);
+
+#endif
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/selftest.c
@@ -0,0 +1,807 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2012 Solarflare Communications Inc.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/kernel_stat.h>
+#include <linux/pci.h>
+#include <linux/ethtool.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include "net_driver.h"
+#include "efx.h"
+#include "efx_common.h"
+#include "efx_channels.h"
+#include "nic.h"
+#include "mcdi_port_common.h"
+#include "selftest.h"
+#include "workarounds.h"
+
+/* IRQ latency can be enormous because:
+ * - All IRQs may be disabled on a CPU for a *long* time by e.g. a
+ *   slow serial console or an old IDE driver doing error recovery
+ * - The PREEMPT_RT patches mostly deal with this, but also allow a
+ *   tasklet or normal task to be given higher priority than our IRQ
+ *   threads
+ * Try to avoid blaming the hardware for this.
+ */
+#define IRQ_TIMEOUT HZ
+
+/*
+ * Loopback test packet structure
+ *
+ * The self-test should stress every RSS vector, and unfortunately
+ * Falcon only performs RSS on TCP/UDP packets.
+ */
+struct efx_loopback_payload {
+	struct ethhdr header;
+	struct iphdr ip;
+	struct udphdr udp;
+	__be16 iteration;
+	char msg[64];
+} __packed;
+
+/* Loopback test source MAC address */
+static const u8 payload_source[ETH_ALEN] __aligned(2) = {
+	0x00, 0x0f, 0x53, 0x1b, 0x1b, 0x1b,
+};
+
+static const char payload_msg[] =
+	"Hello world! This is an Efx loopback test in progress!";
+
+/* Interrupt mode names */
+static const unsigned int efx_interrupt_mode_max = EFX_INT_MODE_MAX;
+static const char *const efx_interrupt_mode_names[] = {
+	[EFX_INT_MODE_MSIX]   = "MSI-X",
+	[EFX_INT_MODE_MSI]    = "MSI",
+	[EFX_INT_MODE_LEGACY] = "legacy",
+};
+#define INT_MODE(efx) \
+	STRING_TABLE_LOOKUP(efx->interrupt_mode, efx_interrupt_mode)
+
+/**
+ * struct efx_loopback_state - persistent state during a loopback selftest
+ * @flush:		Drop all packets in efx_loopback_rx_packet
+ * @packet_count:	Number of packets being used in this test
+ * @skbs:		An array of skbs transmitted
+ * @offload_csum:	Checksums are being offloaded
+ * @rx_good:		RX good packet count
+ * @rx_bad:		RX bad packet count
+ * @payload:		Payload used in tests
+ */
+struct efx_loopback_state {
+	bool flush;
+	int packet_count;
+	struct sk_buff **skbs;
+	bool offload_csum;
+	atomic_t rx_good;
+	atomic_t rx_bad;
+	struct efx_loopback_payload payload;
+};
+
+/* How long to wait for all the packets to arrive (in ms) */
+#define LOOPBACK_TIMEOUT_MS 1000
+
+/**************************************************************************
+ *
+ * MII, NVRAM and register tests
+ *
+ **************************************************************************/
+
+static int efx_test_phy_alive(struct efx_nic *efx, struct efx_self_tests *tests)
+{
+	int rc = 0;
+
+	rc = efx_mcdi_phy_test_alive(efx);
+	tests->phy_alive = rc ? -1 : 1;
+
+	return rc;
+}
+
+static int efx_test_nvram(struct efx_nic *efx, struct efx_self_tests *tests)
+{
+	int rc = 0;
+
+	if (efx->type->test_nvram) {
+		rc = efx->type->test_nvram(efx);
+		if (rc == -EPERM)
+			rc = 0;
+		else
+			tests->nvram = rc ? -1 : 1;
+	}
+
+	return rc;
+}
+
+/**************************************************************************
+ *
+ * Interrupt and event queue testing
+ *
+ **************************************************************************/
+
+/* Test generation and receipt of interrupts */
+static int efx_test_interrupts(struct efx_nic *efx,
+			       struct efx_self_tests *tests)
+{
+	unsigned long timeout, wait;
+	int cpu;
+	int rc;
+
+	netif_dbg(efx, drv, efx->net_dev, "testing interrupts\n");
+	tests->interrupt = -1;
+
+	rc = efx_nic_irq_test_start(efx);
+	if (rc == -ENOTSUPP) {
+		netif_dbg(efx, drv, efx->net_dev,
+			  "direct interrupt testing not supported\n");
+		tests->interrupt = 0;
+		return 0;
+	}
+
+	timeout = jiffies + IRQ_TIMEOUT;
+	wait = 1;
+
+	/* Wait for arrival of test interrupt. */
+	netif_dbg(efx, drv, efx->net_dev, "waiting for test interrupt\n");
+	do {
+		schedule_timeout_uninterruptible(wait);
+		cpu = efx_nic_irq_test_irq_cpu(efx);
+		if (cpu >= 0)
+			goto success;
+		wait *= 2;
+	} while (time_before(jiffies, timeout));
+
+	netif_err(efx, drv, efx->net_dev, "timed out waiting for interrupt\n");
+	return -ETIMEDOUT;
+
+ success:
+	netif_dbg(efx, drv, efx->net_dev, "%s test interrupt seen on CPU%d\n",
+		  INT_MODE(efx), cpu);
+	tests->interrupt = 1;
+	return 0;
+}
+
+/* Test generation and receipt of interrupting events */
+static int efx_test_eventq_irq(struct efx_nic *efx,
+			       struct efx_self_tests *tests)
+{
+	struct efx_channel *channel;
+	unsigned int read_ptr[EFX_MAX_CHANNELS];
+	unsigned long napi_ran = 0, dma_pend = 0, int_pend = 0;
+	unsigned long timeout, wait;
+
+	BUILD_BUG_ON(EFX_MAX_CHANNELS > BITS_PER_LONG);
+
+	efx_for_each_channel(channel, efx) {
+		read_ptr[channel->channel] = channel->eventq_read_ptr;
+		set_bit(channel->channel, &dma_pend);
+		set_bit(channel->channel, &int_pend);
+		efx_nic_event_test_start(channel);
+	}
+
+	timeout = jiffies + IRQ_TIMEOUT;
+	wait = 1;
+
+	/* Wait for arrival of interrupts.  NAPI processing may or may
+	 * not complete in time, but we can cope in any case.
+	 */
+	do {
+		schedule_timeout_uninterruptible(wait);
+
+		efx_for_each_channel(channel, efx) {
+			efx_stop_eventq(channel);
+			if (channel->eventq_read_ptr !=
+			    read_ptr[channel->channel]) {
+				set_bit(channel->channel, &napi_ran);
+				clear_bit(channel->channel, &dma_pend);
+				clear_bit(channel->channel, &int_pend);
+			} else {
+				if (efx_nic_event_present(channel))
+					clear_bit(channel->channel, &dma_pend);
+				if (efx_nic_event_test_irq_cpu(channel) >= 0)
+					clear_bit(channel->channel, &int_pend);
+			}
+			efx_start_eventq(channel);
+		}
+
+		wait *= 2;
+	} while ((dma_pend || int_pend) && time_before(jiffies, timeout));
+
+	efx_for_each_channel(channel, efx) {
+		bool dma_seen = !test_bit(channel->channel, &dma_pend);
+		bool int_seen = !test_bit(channel->channel, &int_pend);
+
+		tests->eventq_dma[channel->channel] = dma_seen ? 1 : -1;
+		tests->eventq_int[channel->channel] = int_seen ? 1 : -1;
+
+		if (dma_seen && int_seen) {
+			netif_dbg(efx, drv, efx->net_dev,
+				  "channel %d event queue passed (with%s NAPI)\n",
+				  channel->channel,
+				  test_bit(channel->channel, &napi_ran) ?
+				  "" : "out");
+		} else {
+			/* Report failure and whether either interrupt or DMA
+			 * worked
+			 */
+			netif_err(efx, drv, efx->net_dev,
+				  "channel %d timed out waiting for event queue\n",
+				  channel->channel);
+			if (int_seen)
+				netif_err(efx, drv, efx->net_dev,
+					  "channel %d saw interrupt "
+					  "during event queue test\n",
+					  channel->channel);
+			if (dma_seen)
+				netif_err(efx, drv, efx->net_dev,
+					  "channel %d event was generated, but "
+					  "failed to trigger an interrupt\n",
+					  channel->channel);
+		}
+	}
+
+	return (dma_pend || int_pend) ? -ETIMEDOUT : 0;
+}
+
+static int efx_test_phy(struct efx_nic *efx, struct efx_self_tests *tests,
+			unsigned flags)
+{
+	int rc;
+
+	mutex_lock(&efx->mac_lock);
+	rc = efx_mcdi_phy_run_tests(efx, tests->phy_ext, flags);
+	mutex_unlock(&efx->mac_lock);
+	if (rc == -EPERM)
+		rc = 0;
+	else
+		netif_info(efx, drv, efx->net_dev,
+			   "%s phy selftest\n", rc ? "Failed" : "Passed");
+
+	return rc;
+}
+
+/**************************************************************************
+ *
+ * Loopback testing
+ * NB Only one loopback test can be running at a time.
+ *
+ **************************************************************************/
+
+/* Loopback test RX callback
+ * This is called for each received packet during loopback testing.
+ */
+void efx_loopback_rx_packet(struct efx_nic *efx,
+			    const char *buf_ptr, int pkt_len)
+{
+	struct efx_loopback_state *state = efx->loopback_selftest;
+	struct efx_loopback_payload *received;
+	struct efx_loopback_payload *payload;
+
+	BUG_ON(!buf_ptr);
+
+	/* If we are just flushing, then drop the packet */
+	if ((state == NULL) || state->flush)
+		return;
+
+	payload = &state->payload;
+
+	received = (struct efx_loopback_payload *) buf_ptr;
+	received->ip.saddr = payload->ip.saddr;
+	if (state->offload_csum)
+		received->ip.check = payload->ip.check;
+
+	/* Check that header exists */
+	if (pkt_len < sizeof(received->header)) {
+		netif_err(efx, drv, efx->net_dev,
+			  "saw runt RX packet (length %d) in %s loopback "
+			  "test\n", pkt_len, LOOPBACK_MODE(efx));
+		goto err;
+	}
+
+	/* Check that the ethernet header exists */
+	if (memcmp(&received->header, &payload->header, ETH_HLEN) != 0) {
+		netif_err(efx, drv, efx->net_dev,
+			  "saw non-loopback RX packet in %s loopback test\n",
+			  LOOPBACK_MODE(efx));
+		goto err;
+	}
+
+	/* Check packet length */
+	if (pkt_len != sizeof(*payload)) {
+		netif_err(efx, drv, efx->net_dev,
+			  "saw incorrect RX packet length %d (wanted %d) in "
+			  "%s loopback test\n", pkt_len, (int)sizeof(*payload),
+			  LOOPBACK_MODE(efx));
+		goto err;
+	}
+
+	/* Check that IP header matches */
+	if (memcmp(&received->ip, &payload->ip, sizeof(payload->ip)) != 0) {
+		netif_err(efx, drv, efx->net_dev,
+			  "saw corrupted IP header in %s loopback test\n",
+			  LOOPBACK_MODE(efx));
+		goto err;
+	}
+
+	/* Check that msg and padding matches */
+	if (memcmp(&received->msg, &payload->msg, sizeof(received->msg)) != 0) {
+		netif_err(efx, drv, efx->net_dev,
+			  "saw corrupted RX packet in %s loopback test\n",
+			  LOOPBACK_MODE(efx));
+		goto err;
+	}
+
+	/* Check that iteration matches */
+	if (received->iteration != payload->iteration) {
+		netif_err(efx, drv, efx->net_dev,
+			  "saw RX packet from iteration %d (wanted %d) in "
+			  "%s loopback test\n", ntohs(received->iteration),
+			  ntohs(payload->iteration), LOOPBACK_MODE(efx));
+		goto err;
+	}
+
+	/* Increase correct RX count */
+	netif_vdbg(efx, drv, efx->net_dev,
+		   "got loopback RX in %s loopback test\n", LOOPBACK_MODE(efx));
+
+	atomic_inc(&state->rx_good);
+	return;
+
+ err:
+#ifdef DEBUG
+	if (atomic_read(&state->rx_bad) == 0) {
+		netif_err(efx, drv, efx->net_dev, "received packet:\n");
+		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 0x10, 1,
+			       buf_ptr, pkt_len, 0);
+		netif_err(efx, drv, efx->net_dev, "expected packet:\n");
+		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 0x10, 1,
+			       &state->payload, sizeof(state->payload), 0);
+	}
+#endif
+	atomic_inc(&state->rx_bad);
+}
+
+/* Initialise an efx_selftest_state for a new iteration */
+static void efx_iterate_state(struct efx_nic *efx)
+{
+	struct efx_loopback_state *state = efx->loopback_selftest;
+	struct net_device *net_dev = efx->net_dev;
+	struct efx_loopback_payload *payload = &state->payload;
+
+	/* Initialise the layer II header */
+	ether_addr_copy((u8 *)&payload->header.h_dest, net_dev->dev_addr);
+	ether_addr_copy((u8 *)&payload->header.h_source, payload_source);
+	payload->header.h_proto = htons(ETH_P_IP);
+
+	/* saddr set later and used as incrementing count */
+	payload->ip.daddr = htonl(INADDR_LOOPBACK);
+	payload->ip.ihl = 5;
+	payload->ip.check = (__force __sum16) htons(0xdead);
+	payload->ip.tot_len = htons(sizeof(*payload) - sizeof(struct ethhdr));
+	payload->ip.version = IPVERSION;
+	payload->ip.protocol = IPPROTO_UDP;
+
+	/* Initialise udp header */
+	payload->udp.source = 0;
+	payload->udp.len = htons(sizeof(*payload) - sizeof(struct ethhdr) -
+				 sizeof(struct iphdr));
+	payload->udp.check = 0;	/* checksum ignored */
+
+	/* Fill out payload */
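+	/* The iteration tag lets the RX side detect packets left over
+	 * from a previous iteration of the test.
+	 */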
+	payload->iteration = htons(ntohs(payload->iteration) + 1);
+	memcpy(&payload->msg, payload_msg, sizeof(payload_msg));
+
+	/* Fill out remaining state members */
+	atomic_set(&state->rx_good, 0);
+	atomic_set(&state->rx_bad, 0);
+	smp_wmb();
+}
+
+static int efx_begin_loopback(struct efx_tx_queue *tx_queue)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	struct efx_loopback_state *state = efx->loopback_selftest;
+	struct efx_loopback_payload *payload;
+	struct sk_buff *skb;
+	int i;
+	netdev_tx_t rc;
+
+	/* Transmit N copies of buffer */
+	for (i = 0; i < state->packet_count; i++) {
+		/* Allocate an skb, holding an extra reference for
+		 * transmit completion counting */
+		skb = alloc_skb(sizeof(state->payload), GFP_KERNEL);
+		if (!skb)
+			return -ENOMEM;
+		state->skbs[i] = skb;
+		skb_get(skb);
+
+		/* Copy the payload in, incrementing the source address to
+		 * exercise the rss vectors */
+		payload = skb_put(skb, sizeof(state->payload));
+		memcpy(payload, &state->payload, sizeof(state->payload));
+		payload->ip.saddr = htonl(INADDR_LOOPBACK | (i << 2));
+
+		/* Ensure everything we've written is visible to the
+		 * interrupt handler. */
+		smp_wmb();
+
+		netif_tx_lock_bh(efx->net_dev);
+		rc = efx_enqueue_skb(tx_queue, skb);
+		netif_tx_unlock_bh(efx->net_dev);
+
+		if (rc != NETDEV_TX_OK) {
+			netif_err(efx, drv, efx->net_dev,
+				  "TX queue %d could not transmit packet %d of "
+				  "%d in %s loopback test\n", tx_queue->label,
+				  i + 1, state->packet_count,
+				  LOOPBACK_MODE(efx));
+
+			/* Defer cleaning up the other skbs for the caller */
+			kfree_skb(skb);
+			return -EPIPE;
+		}
+	}
+
+	return 0;
+}
+
+static int efx_poll_loopback(struct efx_nic *efx)
+{
+	struct efx_loopback_state *state = efx->loopback_selftest;
+
+	return atomic_read(&state->rx_good) == state->packet_count;
+}
+
+static int efx_end_loopback(struct efx_tx_queue *tx_queue,
+			    struct efx_loopback_self_tests *lb_tests)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	struct efx_loopback_state *state = efx->loopback_selftest;
+	struct sk_buff *skb;
+	int tx_done = 0, rx_good, rx_bad;
+	int i, rc = 0;
+
+	netif_tx_lock_bh(efx->net_dev);
+
+	/* Count the number of tx completions, and decrement the refcnt. Any
+	 * skbs not already completed will be free'd when the queue is flushed */
+	for (i = 0; i < state->packet_count; i++) {
+		skb = state->skbs[i];
+		if (skb && !skb_shared(skb))
+			++tx_done;
+		dev_kfree_skb(skb);
+	}
+
+	netif_tx_unlock_bh(efx->net_dev);
+
+	/* Check TX completion and received packet counts */
+	rx_good = atomic_read(&state->rx_good);
+	rx_bad = atomic_read(&state->rx_bad);
+	if (tx_done != state->packet_count) {
+		/* Don't free the skbs; they will be picked up on TX
+		 * overflow or channel teardown.
+		 */
+		netif_err(efx, drv, efx->net_dev,
+			  "TX queue %d saw only %d out of an expected %d "
+			  "TX completion events in %s loopback test\n",
+			  tx_queue->label, tx_done, state->packet_count,
+			  LOOPBACK_MODE(efx));
+		rc = -ETIMEDOUT;
+		/* Allow to fall through so we see the RX errors as well */
+	}
+
+	/* We may always be up to a flush away from our desired packet total */
+	if (rx_good != state->packet_count) {
+		netif_dbg(efx, drv, efx->net_dev,
+			  "TX queue %d saw only %d out of an expected %d "
+			  "received packets in %s loopback test\n",
+			  tx_queue->label, rx_good, state->packet_count,
+			  LOOPBACK_MODE(efx));
+		rc = -ETIMEDOUT;
+		/* Fall through */
+	}
+
+	/* Update loopback test structure */
+	lb_tests->tx_sent[tx_queue->label] += state->packet_count;
+	lb_tests->tx_done[tx_queue->label] += tx_done;
+	lb_tests->rx_good += rx_good;
+	lb_tests->rx_bad += rx_bad;
+
+	return rc;
+}
+
+static int
+efx_test_loopback(struct efx_tx_queue *tx_queue,
+		  struct efx_loopback_self_tests *lb_tests)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	struct efx_loopback_state *state = efx->loopback_selftest;
+	int i, begin_rc, end_rc;
+
+	for (i = 0; i < 3; i++) {
+		/* Determine how many packets to send */
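+		/* Bursts of 1, 16 and 256 packets, capped at a third of
+		 * the TX ring.
+		 */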
+		state->packet_count = efx->txq_entries / 3;
+		state->packet_count = min(1 << (i << 2), state->packet_count);
+		state->skbs = kcalloc(state->packet_count,
+				      sizeof(state->skbs[0]), GFP_KERNEL);
+		if (!state->skbs)
+			return -ENOMEM;
+		state->flush = false;
+
+		netif_dbg(efx, drv, efx->net_dev,
+			  "TX queue %d (hw %d) testing %s loopback with %d packets\n",
+			  tx_queue->label, tx_queue->queue, LOOPBACK_MODE(efx),
+			  state->packet_count);
+
+		efx_iterate_state(efx);
+		begin_rc = efx_begin_loopback(tx_queue);
+
+		/* This will normally complete very quickly, but be
+		 * prepared to wait much longer. */
+		msleep(1);
+		if (!efx_poll_loopback(efx)) {
+			msleep(LOOPBACK_TIMEOUT_MS);
+			efx_poll_loopback(efx);
+		}
+
+		end_rc = efx_end_loopback(tx_queue, lb_tests);
+		kfree(state->skbs);
+
+		if (begin_rc || end_rc) {
+			/* Wait a while to ensure there are no packets
+			 * floating around after a failure. */
+			schedule_timeout_uninterruptible(HZ / 10);
+			return begin_rc ? begin_rc : end_rc;
+		}
+	}
+
+	netif_dbg(efx, drv, efx->net_dev,
+		  "TX queue %d passed %s loopback test with a burst length "
+		  "of %d packets\n", tx_queue->label, LOOPBACK_MODE(efx),
+		  state->packet_count);
+
+	return 0;
+}
+
+/* Wait for link up. On Falcon, we would prefer to rely on efx_monitor, but
+ * any contention on the mac lock (via e.g. efx_mac_mcast_work) causes it
+ * to delay and retry. Therefore, it's safer to just poll directly. Wait
+ * for link up and any faults to dissipate. */
+static int efx_wait_for_link(struct efx_nic *efx)
+{
+	struct efx_link_state *link_state = &efx->link_state;
+	int count, link_up_count = 0;
+	bool link_up;
+
+	for (count = 0; count < 40; count++) {
+		schedule_timeout_uninterruptible(HZ / 10);
+
+		if (efx->type->monitor != NULL) {
+			mutex_lock(&efx->mac_lock);
+			efx->type->monitor(efx);
+			mutex_unlock(&efx->mac_lock);
+		}
+
+		mutex_lock(&efx->mac_lock);
+		link_up = link_state->up;
+		if (link_up)
+			link_up = !efx->type->check_mac_fault(efx);
+		mutex_unlock(&efx->mac_lock);
+
+		if (link_up) {
+			if (++link_up_count == 2)
+				return 0;
+		} else {
+			link_up_count = 0;
+		}
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int efx_test_loopbacks(struct efx_nic *efx, struct efx_self_tests *tests,
+			      unsigned int loopback_modes)
+{
+	enum efx_loopback_mode mode;
+	struct efx_loopback_state *state;
+	struct efx_channel *channel =
+		efx_get_channel(efx, efx->tx_channel_offset);
+	struct efx_tx_queue *tx_queue;
+	int rc = 0;
+
+	/* Set the port loopback_selftest member. From this point on
+	 * all received packets will be dropped. Mark the state as
+	 * "flushing" so all inflight packets are dropped */
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (state == NULL)
+		return -ENOMEM;
+	BUG_ON(efx->loopback_selftest);
+	state->flush = true;
+	efx->loopback_selftest = state;
+
+	/* Test all supported loopback modes */
+	for (mode = LOOPBACK_NONE; mode <= LOOPBACK_TEST_MAX; mode++) {
+		if (!(loopback_modes & (1 << mode)))
+			continue;
+
+		/* Move the port into the specified loopback mode. */
+		state->flush = true;
+		mutex_lock(&efx->mac_lock);
+		efx->loopback_mode = mode;
+		rc = __efx_reconfigure_port(efx);
+		mutex_unlock(&efx->mac_lock);
+		if (rc) {
+			netif_err(efx, drv, efx->net_dev,
+				  "unable to move into %s loopback\n",
+				  LOOPBACK_MODE(efx));
+			goto out;
+		}
+
+		rc = efx_wait_for_link(efx);
+		if (rc) {
+			netif_err(efx, drv, efx->net_dev,
+				  "loopback %s never came up\n",
+				  LOOPBACK_MODE(efx));
+			goto out;
+		}
+
+		/* Test all enabled types of TX queue */
+		efx_for_each_channel_tx_queue(tx_queue, channel) {
+			state->offload_csum = (tx_queue->type &
+					       EFX_TXQ_TYPE_OUTER_CSUM);
+			rc = efx_test_loopback(tx_queue,
+					       &tests->loopback[mode]);
+			if (rc)
+				goto out;
+		}
+	}
+
+ out:
+	/* Remove the flush. The caller will remove the loopback setting */
+	state->flush = true;
+	efx->loopback_selftest = NULL;
+	wmb();
+	kfree(state);
+
+	if (rc == -EPERM)
+		rc = 0;
+
+	return rc;
+}
+
+/**************************************************************************
+ *
+ * Entry point
+ *
+ *************************************************************************/
+
+int efx_selftest(struct efx_nic *efx, struct efx_self_tests *tests,
+		 unsigned flags)
+{
+	enum efx_loopback_mode loopback_mode = efx->loopback_mode;
+	int phy_mode = efx->phy_mode;
+	int rc_test = 0, rc_reset, rc;
+
+	efx_selftest_async_cancel(efx);
+
+	/* Online (i.e. non-disruptive) testing
+	 * This checks interrupt generation, event delivery and PHY presence. */
+
+	rc = efx_test_phy_alive(efx, tests);
+	if (rc && !rc_test)
+		rc_test = rc;
+
+	rc = efx_test_nvram(efx, tests);
+	if (rc && !rc_test)
+		rc_test = rc;
+
+	rc = efx_test_interrupts(efx, tests);
+	if (rc && !rc_test)
+		rc_test = rc;
+
+	rc = efx_test_eventq_irq(efx, tests);
+	if (rc && !rc_test)
+		rc_test = rc;
+
+	if (rc_test)
+		return rc_test;
+
+	if (!(flags & ETH_TEST_FL_OFFLINE))
+		return efx_test_phy(efx, tests, flags);
+
+	/* Offline (i.e. disruptive) testing
+	 * This checks MAC and PHY loopback on the specified port. */
+
+	/* Detach the device so the kernel doesn't transmit during the
+	 * loopback test and the watchdog timeout doesn't fire.
+	 */
+	efx_device_detach_sync(efx);
+
+	if (efx->type->test_chip) {
+		rc_reset = efx->type->test_chip(efx, tests);
+		if (rc_reset) {
+			netif_err(efx, hw, efx->net_dev,
+				  "Unable to recover from chip test\n");
+			efx_schedule_reset(efx, RESET_TYPE_DISABLE);
+			return rc_reset;
+		}
+
+		if ((tests->memory < 0 || tests->registers < 0) && !rc_test)
+			rc_test = -EIO;
+	}
+
+	/* Ensure that the phy is powered and out of loopback
+	 * for the bist and loopback tests */
+	mutex_lock(&efx->mac_lock);
+	efx->phy_mode &= ~PHY_MODE_LOW_POWER;
+	efx->loopback_mode = LOOPBACK_NONE;
+	__efx_reconfigure_port(efx);
+	mutex_unlock(&efx->mac_lock);
+
+	rc = efx_test_phy(efx, tests, flags);
+	if (rc && !rc_test)
+		rc_test = rc;
+
+	rc = efx_test_loopbacks(efx, tests, efx->loopback_modes);
+	if (rc && !rc_test)
+		rc_test = rc;
+
+	/* restore the PHY to the previous state */
+	mutex_lock(&efx->mac_lock);
+	efx->phy_mode = phy_mode;
+	efx->loopback_mode = loopback_mode;
+	__efx_reconfigure_port(efx);
+	mutex_unlock(&efx->mac_lock);
+
+	efx_device_attach_if_not_resetting(efx);
+
+	return rc_test;
+}
+
+void efx_selftest_async_start(struct efx_nic *efx)
+{
+	struct efx_channel *channel;
+
+	efx_for_each_channel(channel, efx)
+		efx_nic_event_test_start(channel);
+	schedule_delayed_work(&efx->selftest_work, IRQ_TIMEOUT);
+}
+
+void efx_selftest_async_cancel(struct efx_nic *efx)
+{
+	cancel_delayed_work_sync(&efx->selftest_work);
+}
+
+static void efx_selftest_async_work(struct work_struct *data)
+{
+	struct efx_nic *efx = container_of(data, struct efx_nic,
+					   selftest_work.work);
+	struct efx_channel *channel;
+	int cpu;
+
+	efx_for_each_channel(channel, efx) {
+		cpu = efx_nic_event_test_irq_cpu(channel);
+		if (cpu < 0)
+			netif_err(efx, ifup, efx->net_dev,
+				  "channel %d failed to trigger an interrupt\n",
+				  channel->channel);
+		else
+			netif_dbg(efx, ifup, efx->net_dev,
+				  "channel %d triggered interrupt on CPU %d\n",
+				  channel->channel, cpu);
+	}
+}
+
+void efx_selftest_async_init(struct efx_nic *efx)
+{
+	INIT_DELAYED_WORK(&efx->selftest_work, efx_selftest_async_work);
+}
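For context, efx_selftest() above is shaped to sit behind an ethtool .self_test callback. A minimal sketch of such a wrapper, assuming standard ethtool semantics; the function below is illustrative only and not part of this patch:

static void example_ethtool_self_test(struct net_device *net_dev,
				      struct ethtool_test *test, u64 *data)
{
	struct efx_nic *efx = netdev_priv(net_dev);
	struct efx_self_tests *tests;

	tests = kzalloc(sizeof(*tests), GFP_KERNEL);
	if (!tests) {
		test->flags |= ETH_TEST_FL_FAILED;
		return;
	}

	/* ETH_TEST_FL_OFFLINE in test->flags selects the disruptive
	 * chip and loopback tests in addition to the online ones.
	 */
	if (efx_selftest(efx, tests, test->flags))
		test->flags |= ETH_TEST_FL_FAILED;

	/* A real wrapper would flatten all of struct efx_self_tests into
	 * the u64 result array; only the first result is shown here.
	 */
	data[0] = tests->phy_alive;

	kfree(tests);
}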
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/selftest.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2012 Solarflare Communications Inc.
+ */
+
+#ifndef EFX_SELFTEST_H
+#define EFX_SELFTEST_H
+
+#include "net_driver.h"
+
+/*
+ * Self tests
+ */
+
+struct efx_loopback_self_tests {
+	int tx_sent[EFX_MAX_TXQ_PER_CHANNEL];
+	int tx_done[EFX_MAX_TXQ_PER_CHANNEL];
+	int rx_good;
+	int rx_bad;
+};
+
+#define EFX_MAX_PHY_TESTS 20
+
+/* Efx self test results
+ * For fields which are not counters, 1 indicates success and -1
+ * indicates failure; 0 indicates test could not be run.
+ */
+struct efx_self_tests {
+	/* online tests */
+	int phy_alive;
+	int nvram;
+	int interrupt;
+	int eventq_dma[EFX_MAX_CHANNELS];
+	int eventq_int[EFX_MAX_CHANNELS];
+	/* offline tests */
+	int memory;
+	int registers;
+	int phy_ext[EFX_MAX_PHY_TESTS];
+	struct efx_loopback_self_tests loopback[LOOPBACK_TEST_MAX + 1];
+};
+
+void efx_loopback_rx_packet(struct efx_nic *efx, const char *buf_ptr,
+			    int pkt_len);
+int efx_selftest(struct efx_nic *efx, struct efx_self_tests *tests,
+		 unsigned flags);
+void efx_selftest_async_init(struct efx_nic *efx);
+void efx_selftest_async_start(struct efx_nic *efx);
+void efx_selftest_async_cancel(struct efx_nic *efx);
+
+#endif /* EFX_SELFTEST_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/sriov.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2014-2015 Solarflare Communications Inc.
+ */
+#include <linux/module.h>
+#include "net_driver.h"
+#include "nic.h"
+#include "sriov.h"
+
+int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+
+	if (efx->type->sriov_set_vf_mac)
+		return efx->type->sriov_set_vf_mac(efx, vf_i, mac);
+	else
+		return -EOPNOTSUPP;
+}
+
+int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan,
+			  u8 qos, __be16 vlan_proto)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+
+	if (efx->type->sriov_set_vf_vlan) {
+		if ((vlan & ~VLAN_VID_MASK) ||
+		    (qos & ~(VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT)))
+			return -EINVAL;
+
+		if (vlan_proto != htons(ETH_P_8021Q))
+			return -EPROTONOSUPPORT;
+
+		return efx->type->sriov_set_vf_vlan(efx, vf_i, vlan, qos);
+	} else {
+		return -EOPNOTSUPP;
+	}
+}
+
+int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i,
+			      bool spoofchk)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+
+	if (efx->type->sriov_set_vf_spoofchk)
+		return efx->type->sriov_set_vf_spoofchk(efx, vf_i, spoofchk);
+	else
+		return -EOPNOTSUPP;
+}
+
+int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i,
+			    struct ifla_vf_info *ivi)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+
+	if (efx->type->sriov_get_vf_config)
+		return efx->type->sriov_get_vf_config(efx, vf_i, ivi);
+	else
+		return -EOPNOTSUPP;
+}
+
+int efx_sriov_set_vf_link_state(struct net_device *net_dev, int vf_i,
+				int link_state)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+
+	if (efx->type->sriov_set_vf_link_state)
+		return efx->type->sriov_set_vf_link_state(efx, vf_i,
+							  link_state);
+	else
+		return -EOPNOTSUPP;
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/sriov.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2014-2015 Solarflare Communications Inc.
+ */
+
+#ifndef EFX_SRIOV_H
+#define EFX_SRIOV_H
+
+#include "net_driver.h"
+
+#ifdef CONFIG_SFC_SRIOV
+
+int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac);
+int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan,
+			  u8 qos, __be16 vlan_proto);
+int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i,
+			      bool spoofchk);
+int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i,
+			    struct ifla_vf_info *ivi);
+int efx_sriov_set_vf_link_state(struct net_device *net_dev, int vf_i,
+				int link_state);
+#endif /* CONFIG_SFC_SRIOV */
+
+#endif /* EFX_SRIOV_H */
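These wrappers match the ndo_set_vf_* hooks in struct net_device_ops one for one, dispatching through the per-NIC-type method table. A minimal wiring sketch; the ops structure name is illustrative and the usual ndo_open/ndo_start_xmit members are elided:

static const struct net_device_ops example_netdev_ops = {
	/* ...ndo_open, ndo_stop, ndo_start_xmit, etc. ... */
#ifdef CONFIG_SFC_SRIOV
	.ndo_set_vf_mac		= efx_sriov_set_vf_mac,
	.ndo_set_vf_vlan	= efx_sriov_set_vf_vlan,
	.ndo_set_vf_spoofchk	= efx_sriov_set_vf_spoofchk,
	.ndo_get_vf_config	= efx_sriov_get_vf_config,
	.ndo_set_vf_link_state	= efx_sriov_set_vf_link_state,
#endif
};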
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/tx.c
@@ -0,0 +1,643 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2005-2013 Solarflare Communications Inc.
+ */
+
+#include <linux/pci.h>
+#include <linux/tcp.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/ipv6.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <linux/if_ether.h>
+#include <linux/highmem.h>
+#include <linux/cache.h>
+#include "net_driver.h"
+#include "efx.h"
+#include "io.h"
+#include "nic.h"
+#include "tx.h"
+#include "tx_common.h"
+#include "workarounds.h"
+#include "ef10_regs.h"
+
+#ifdef EFX_USE_PIO
+
+#define EFX_PIOBUF_SIZE_DEF ALIGN(256, L1_CACHE_BYTES)
+unsigned int efx_piobuf_size __read_mostly = EFX_PIOBUF_SIZE_DEF;
+
+#endif /* EFX_USE_PIO */
+
+static inline u8 *efx_tx_get_copy_buffer(struct efx_tx_queue *tx_queue,
+					 struct efx_tx_buffer *buffer)
+{
+	unsigned int index = efx_tx_queue_get_insert_index(tx_queue);
+	struct efx_buffer *page_buf =
+		&tx_queue->cb_page[index >> (PAGE_SHIFT - EFX_TX_CB_ORDER)];
+	unsigned int offset =
+		((index << EFX_TX_CB_ORDER) + NET_IP_ALIGN) & (PAGE_SIZE - 1);
+
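+	/* The backing page is allocated lazily, the first time any copy
+	 * buffer within it is needed.
+	 */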
+	if (unlikely(!page_buf->addr) &&
+	    efx_nic_alloc_buffer(tx_queue->efx, page_buf, PAGE_SIZE,
+				 GFP_ATOMIC))
+		return NULL;
+	buffer->dma_addr = page_buf->dma_addr + offset;
+	buffer->unmap_len = 0;
+	return (u8 *)page_buf->addr + offset;
+}
+
+u8 *efx_tx_get_copy_buffer_limited(struct efx_tx_queue *tx_queue,
+				   struct efx_tx_buffer *buffer, size_t len)
+{
+	if (len > EFX_TX_CB_SIZE)
+		return NULL;
+	return efx_tx_get_copy_buffer(tx_queue, buffer);
+}
+
+static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
+{
+	/* We need to consider all queues that the net core sees as one */
+	struct efx_nic *efx = txq1->efx;
+	struct efx_tx_queue *txq2;
+	unsigned int fill_level;
+
+	fill_level = efx_channel_tx_old_fill_level(txq1->channel);
+	if (likely(fill_level < efx->txq_stop_thresh))
+		return;
+
+	/* We used the stale old_read_count above, which gives us a
+	 * pessimistic estimate of the fill level (which may even
+	 * validly be >= efx->txq_entries).  Now try again using
+	 * read_count (more likely to be a cache miss).
+	 *
+	 * If we read read_count and then conditionally stop the
+	 * queue, it is possible for the completion path to race with
+	 * us and complete all outstanding descriptors in the middle,
+	 * after which there will be no more completions to wake it.
+	 * Therefore we stop the queue first, then read read_count
+	 * (with a memory barrier to ensure the ordering), then
+	 * restart the queue if the fill level turns out to be low
+	 * enough.
+	 */
+	netif_tx_stop_queue(txq1->core_txq);
+	smp_mb();
+	efx_for_each_channel_tx_queue(txq2, txq1->channel)
+		txq2->old_read_count = READ_ONCE(txq2->read_count);
+
+	fill_level = efx_channel_tx_old_fill_level(txq1->channel);
+	EFX_WARN_ON_ONCE_PARANOID(fill_level >= efx->txq_entries);
+	if (likely(fill_level < efx->txq_stop_thresh)) {
+		smp_mb();
+		if (likely(!efx->loopback_selftest))
+			netif_tx_start_queue(txq1->core_txq);
+	}
+}
+
+static int efx_enqueue_skb_copy(struct efx_tx_queue *tx_queue,
+				struct sk_buff *skb)
+{
+	unsigned int copy_len = skb->len;
+	struct efx_tx_buffer *buffer;
+	u8 *copy_buffer;
+	int rc;
+
+	EFX_WARN_ON_ONCE_PARANOID(copy_len > EFX_TX_CB_SIZE);
+
+	buffer = efx_tx_queue_get_insert_buffer(tx_queue);
+
+	copy_buffer = efx_tx_get_copy_buffer(tx_queue, buffer);
+	if (unlikely(!copy_buffer))
+		return -ENOMEM;
+
+	rc = skb_copy_bits(skb, 0, copy_buffer, copy_len);
+	EFX_WARN_ON_PARANOID(rc);
+	buffer->len = copy_len;
+
+	buffer->skb = skb;
+	buffer->flags = EFX_TX_BUF_SKB;
+
+	++tx_queue->insert_count;
+	return rc;
+}
+
+#ifdef EFX_USE_PIO
+
+struct efx_short_copy_buffer {
+	int used;
+	u8 buf[L1_CACHE_BYTES];
+};
+
+/* Copy to PIO, respecting that writes to PIO buffers must be dword aligned.
+ * Advances piobuf pointer. Leaves additional data in the copy buffer.
+ */
+static void efx_memcpy_toio_aligned(struct efx_nic *efx, u8 __iomem **piobuf,
+				    u8 *data, int len,
+				    struct efx_short_copy_buffer *copy_buf)
+{
+	int block_len = len & ~(sizeof(copy_buf->buf) - 1);
+
+	__iowrite64_copy(*piobuf, data, block_len >> 3);
+	*piobuf += block_len;
+	len -= block_len;
+
+	if (len) {
+		data += block_len;
+		BUG_ON(copy_buf->used);
+		BUG_ON(len > sizeof(copy_buf->buf));
+		memcpy(copy_buf->buf, data, len);
+		copy_buf->used = len;
+	}
+}
+
+/* Copy to PIO, respecting dword alignment, popping data from copy buffer first.
+ * Advances piobuf pointer. Leaves additional data in the copy buffer.
+ */
+static void efx_memcpy_toio_aligned_cb(struct efx_nic *efx, u8 __iomem **piobuf,
+				       u8 *data, int len,
+				       struct efx_short_copy_buffer *copy_buf)
+{
+	if (copy_buf->used) {
+		/* if the copy buffer is partially full, fill it up and write */
+		int copy_to_buf =
+			min_t(int, sizeof(copy_buf->buf) - copy_buf->used, len);
+
+		memcpy(copy_buf->buf + copy_buf->used, data, copy_to_buf);
+		copy_buf->used += copy_to_buf;
+
+		/* if we didn't fill it up then we're done for now */
+		if (copy_buf->used < sizeof(copy_buf->buf))
+			return;
+
+		__iowrite64_copy(*piobuf, copy_buf->buf,
+				 sizeof(copy_buf->buf) >> 3);
+		*piobuf += sizeof(copy_buf->buf);
+		data += copy_to_buf;
+		len -= copy_to_buf;
+		copy_buf->used = 0;
+	}
+
+	efx_memcpy_toio_aligned(efx, piobuf, data, len, copy_buf);
+}
+
+static void efx_flush_copy_buffer(struct efx_nic *efx, u8 __iomem *piobuf,
+				  struct efx_short_copy_buffer *copy_buf)
+{
+	/* if there's anything in it, write the whole buffer, including junk */
+	if (copy_buf->used)
+		__iowrite64_copy(piobuf, copy_buf->buf,
+				 sizeof(copy_buf->buf) >> 3);
+}
+
+/* Traverse skb structure and copy fragments into the PIO buffer.
+ * Advances piobuf pointer.
+ */
+static void efx_skb_copy_bits_to_pio(struct efx_nic *efx, struct sk_buff *skb,
+				     u8 __iomem **piobuf,
+				     struct efx_short_copy_buffer *copy_buf)
+{
+	int i;
+
+	efx_memcpy_toio_aligned(efx, piobuf, skb->data, skb_headlen(skb),
+				copy_buf);
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+		u8 *vaddr;
+
+		vaddr = kmap_atomic(skb_frag_page(f));
+
+		efx_memcpy_toio_aligned_cb(efx, piobuf, vaddr + skb_frag_off(f),
+					   skb_frag_size(f), copy_buf);
+		kunmap_atomic(vaddr);
+	}
+
+	EFX_WARN_ON_ONCE_PARANOID(skb_shinfo(skb)->frag_list);
+}
+
+static int efx_enqueue_skb_pio(struct efx_tx_queue *tx_queue,
+			       struct sk_buff *skb)
+{
+	struct efx_tx_buffer *buffer =
+		efx_tx_queue_get_insert_buffer(tx_queue);
+	u8 __iomem *piobuf = tx_queue->piobuf;
+
+	/* Copy to PIO buffer. Ensure the writes are padded to the end
+	 * of a cache line, as this is required for write-combining to be
+	 * effective on at least x86.
+	 */
+
+	if (skb_shinfo(skb)->nr_frags) {
+		/* The size of the copy buffer will ensure all writes
+		 * are the size of a cache line.
+		 */
+		struct efx_short_copy_buffer copy_buf;
+
+		copy_buf.used = 0;
+
+		efx_skb_copy_bits_to_pio(tx_queue->efx, skb,
+					 &piobuf, &copy_buf);
+		efx_flush_copy_buffer(tx_queue->efx, piobuf, &copy_buf);
+	} else {
+		/* Pad the write to the size of a cache line.
+		 * We can do this because we know the skb_shared_info struct is
+		 * after the source, and the destination buffer is big enough.
+		 */
+		BUILD_BUG_ON(L1_CACHE_BYTES >
+			     SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+		__iowrite64_copy(tx_queue->piobuf, skb->data,
+				 ALIGN(skb->len, L1_CACHE_BYTES) >> 3);
+	}
+
+	buffer->skb = skb;
+	buffer->flags = EFX_TX_BUF_SKB | EFX_TX_BUF_OPTION;
+
+	EFX_POPULATE_QWORD_5(buffer->option,
+			     ESF_DZ_TX_DESC_IS_OPT, 1,
+			     ESF_DZ_TX_OPTION_TYPE, ESE_DZ_TX_OPTION_DESC_PIO,
+			     ESF_DZ_TX_PIO_CONT, 0,
+			     ESF_DZ_TX_PIO_BYTE_CNT, skb->len,
+			     ESF_DZ_TX_PIO_BUF_ADDR,
+			     tx_queue->piobuf_offset);
+	++tx_queue->insert_count;
+	return 0;
+}
+
+/* Decide whether we can use TX PIO, i.e. write packet data directly into
+ * a buffer on the device.  This can reduce latency at the expense of
+ * throughput, so we only do this if both hardware and software TX rings
+ * are empty, including all queues for the channel.  This also ensures that
+ * only one packet at a time can be using the PIO buffer. If the xmit_more
+ * flag is set then we don't use this - there'll be another packet along
+ * shortly and we want to hold off the doorbell.
+ */
+static bool efx_tx_may_pio(struct efx_tx_queue *tx_queue)
+{
+	struct efx_channel *channel = tx_queue->channel;
+
+	if (!tx_queue->piobuf)
+		return false;
+
+	EFX_WARN_ON_ONCE_PARANOID(!channel->efx->type->option_descriptors);
+
+	efx_for_each_channel_tx_queue(tx_queue, channel)
+		if (!efx_nic_tx_is_empty(tx_queue, tx_queue->packet_write_count))
+			return false;
+
+	return true;
+}
+#endif /* EFX_USE_PIO */
+
+/* Send any pending traffic for a channel. xmit_more is shared across all
+ * queues for a channel, so we must check all of them.
+ */
+static void efx_tx_send_pending(struct efx_channel *channel)
+{
+	struct efx_tx_queue *q;
+
+	efx_for_each_channel_tx_queue(q, channel) {
+		if (q->xmit_pending)
+			efx_nic_push_buffers(q);
+	}
+}
+
+/*
+ * Add a socket buffer to a TX queue
+ *
+ * This maps all fragments of a socket buffer for DMA and adds them to
+ * the TX queue.  The queue's insert pointer will be incremented by
+ * the number of fragments in the socket buffer.
+ *
+ * If any DMA mapping fails, any mapped fragments will be unmapped and
+ * the queue's insert pointer will be restored to its original value.
+ *
+ * This function is split out from efx_hard_start_xmit to allow the
+ * loopback test to direct packets via specific TX queues.
+ *
+ * Returns NETDEV_TX_OK.
+ * You must hold netif_tx_lock() to call this function.
+ */
+netdev_tx_t __efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
+{
+	unsigned int old_insert_count = tx_queue->insert_count;
+	bool xmit_more = netdev_xmit_more();
+	bool data_mapped = false;
+	unsigned int segments;
+	unsigned int skb_len;
+	int rc;
+
+	skb_len = skb->len;
+	segments = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 0;
+	if (segments == 1)
+		segments = 0; /* Don't use TSO for a single segment. */
+
+	/* Handle TSO first - it's *possible* (although unlikely) that we might
+	 * be passed a packet to segment that's smaller than the copybreak/PIO
+	 * size limit.
+	 */
+	if (segments) {
+		switch (tx_queue->tso_version) {
+		case 1:
+			rc = efx_enqueue_skb_tso(tx_queue, skb, &data_mapped);
+			break;
+		case 2:
+			rc = efx_ef10_tx_tso_desc(tx_queue, skb, &data_mapped);
+			break;
+		case 0: /* No TSO on this queue, SW fallback needed */
+		default:
+			rc = -EINVAL;
+			break;
+		}
+		if (rc == -EINVAL) {
+			rc = efx_tx_tso_fallback(tx_queue, skb);
+			tx_queue->tso_fallbacks++;
+			if (rc == 0)
+				return 0;
+		}
+		if (rc)
+			goto err;
+#ifdef EFX_USE_PIO
+	} else if (skb_len <= efx_piobuf_size && !xmit_more &&
+		   efx_tx_may_pio(tx_queue)) {
+		/* Use PIO for short packets with an empty queue. */
+		if (efx_enqueue_skb_pio(tx_queue, skb))
+			goto err;
+		tx_queue->pio_packets++;
+		data_mapped = true;
+#endif
+	} else if (skb->data_len && skb_len <= EFX_TX_CB_SIZE) {
+		/* Pad short packets or coalesce short fragmented packets. */
+		if (efx_enqueue_skb_copy(tx_queue, skb))
+			goto err;
+		tx_queue->cb_packets++;
+		data_mapped = true;
+	}
+
+	/* Map for DMA and create descriptors if we haven't done so already. */
+	if (!data_mapped && (efx_tx_map_data(tx_queue, skb, segments)))
+		goto err;
+
+	efx_tx_maybe_stop_queue(tx_queue);
+
+	tx_queue->xmit_pending = true;
+
+	/* Pass off to hardware */
+	if (__netdev_tx_sent_queue(tx_queue->core_txq, skb_len, xmit_more))
+		efx_tx_send_pending(tx_queue->channel);
+
+	if (segments) {
+		tx_queue->tso_bursts++;
+		tx_queue->tso_packets += segments;
+		tx_queue->tx_packets  += segments;
+	} else {
+		tx_queue->tx_packets++;
+	}
+
+	return NETDEV_TX_OK;
+
+err:
+	efx_enqueue_unwind(tx_queue, old_insert_count);
+	dev_kfree_skb_any(skb);
+
+	/* If we're not expecting another transmit and we had something to push
+	 * on this queue or a partner queue then we need to push here to get the
+	 * previous packets out.
+	 */
+	if (!xmit_more)
+		efx_tx_send_pending(tx_queue->channel);
+
+	return NETDEV_TX_OK;
+}
+
+/* Transmit a packet from an XDP buffer
+ *
+ * Returns number of packets sent on success, error code otherwise.
+ * Runs in NAPI context, either in our own NAPI poll (for XDP TX) or in
+ * another NIC's NAPI poll (for XDP redirect).
+ */
+int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs,
+		       bool flush)
+{
+	struct efx_tx_buffer *tx_buffer;
+	struct efx_tx_queue *tx_queue;
+	struct xdp_frame *xdpf;
+	dma_addr_t dma_addr;
+	unsigned int len;
+	int space;
+	int cpu;
+	int i = 0;
+
+	if (unlikely(n && !xdpfs))
+		return -EINVAL;
+	if (unlikely(!n))
+		return 0;
+
+	cpu = raw_smp_processor_id();
+	if (unlikely(cpu >= efx->xdp_tx_queue_count))
+		return -EINVAL;
+
+	tx_queue = efx->xdp_tx_queues[cpu];
+	if (unlikely(!tx_queue))
+		return -EINVAL;
+
+	if (!tx_queue->initialised)
+		return -EINVAL;
+
+	if (efx->xdp_txq_queues_mode != EFX_XDP_TX_QUEUES_DEDICATED)
+		HARD_TX_LOCK(efx->net_dev, tx_queue->core_txq, cpu);
+
+	/* If we're borrowing net stack queues we have to handle stop-restart
+	 * or we might block the queue and it will be considered frozen
+	 */
+	if (efx->xdp_txq_queues_mode == EFX_XDP_TX_QUEUES_BORROWED) {
+		if (netif_tx_queue_stopped(tx_queue->core_txq))
+			goto unlock;
+		efx_tx_maybe_stop_queue(tx_queue);
+	}
+
+	/* Check for available space. We should never need multiple
+	 * descriptors per frame.
+	 */
+	space = efx->txq_entries +
+		tx_queue->read_count - tx_queue->insert_count;
+
+	for (i = 0; i < n; i++) {
+		xdpf = xdpfs[i];
+
+		if (i >= space)
+			break;
+
+		/* We'll want a descriptor for this tx. */
+		prefetchw(__efx_tx_queue_get_insert_buffer(tx_queue));
+
+		len = xdpf->len;
+
+		/* Map for DMA. */
+		dma_addr = dma_map_single(&efx->pci_dev->dev,
+					  xdpf->data, len,
+					  DMA_TO_DEVICE);
+		if (dma_mapping_error(&efx->pci_dev->dev, dma_addr))
+			break;
+
+		/* Create descriptor and set up for unmapping DMA. */
+		tx_buffer = efx_tx_map_chunk(tx_queue, dma_addr, len);
+		tx_buffer->xdpf = xdpf;
+		tx_buffer->flags = EFX_TX_BUF_XDP |
+				   EFX_TX_BUF_MAP_SINGLE;
+		tx_buffer->dma_offset = 0;
+		tx_buffer->unmap_len = len;
+		tx_queue->tx_packets++;
+	}
+
+	/* Pass mapped frames to hardware. */
+	if (flush && i > 0)
+		efx_nic_push_buffers(tx_queue);
+
+unlock:
+	if (efx->xdp_txq_queues_mode != EFX_XDP_TX_QUEUES_DEDICATED)
+		HARD_TX_UNLOCK(efx->net_dev, tx_queue->core_txq);
+
+	return i == 0 ? -EIO : i;
+}
+
+/* Initiate a packet transmission.  We use one channel per CPU
+ * (sharing when we have more CPUs than channels).
+ *
+ * Context: non-blocking.
+ * Should always return NETDEV_TX_OK and consume the skb.
+ */
+netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
+				struct net_device *net_dev)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	struct efx_tx_queue *tx_queue;
+	unsigned index, type;
+
+	EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));
+
+	index = skb_get_queue_mapping(skb);
+	type = efx_tx_csum_type_skb(skb);
+	if (index >= efx->n_tx_channels) {
+		index -= efx->n_tx_channels;
+		type |= EFX_TXQ_TYPE_HIGHPRI;
+	}
+
+	/* PTP "event" packet */
+	if (unlikely(efx_xmit_with_hwtstamp(skb)) &&
+	    ((efx_ptp_use_mac_tx_timestamps(efx) && efx->ptp_data) ||
+	    unlikely(efx_ptp_is_ptp_tx(efx, skb)))) {
+		/* There may be existing transmits on the channel that are
+		 * waiting for this packet to trigger the doorbell write.
+		 * We need to send the packets at this point.
+		 */
+		efx_tx_send_pending(efx_get_tx_channel(efx, index));
+		return efx_ptp_tx(efx, skb);
+	}
+
+	tx_queue = efx_get_tx_queue(efx, index, type);
+	if (WARN_ON_ONCE(!tx_queue)) {
+		/* We don't have a TXQ of the right type.
+		 * This should never happen, as we don't advertise offload
+		 * features unless we can support them.
+		 */
+		dev_kfree_skb_any(skb);
+		/* If we're not expecting another transmit and we had something to push
+		 * on this queue or a partner queue then we need to push here to get the
+		 * previous packets out.
+		 */
+		if (!netdev_xmit_more())
+			efx_tx_send_pending(tx_queue->channel);
+		return NETDEV_TX_OK;
+	}
+
+	return __efx_enqueue_skb(tx_queue, skb);
+}
+
+void efx_xmit_done_single(struct efx_tx_queue *tx_queue)
+{
+	unsigned int pkts_compl = 0, bytes_compl = 0;
+	unsigned int read_ptr;
+	bool finished = false;
+
+	read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
+
+	while (!finished) {
+		struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr];
+
+		if (!efx_tx_buffer_in_use(buffer)) {
+			struct efx_nic *efx = tx_queue->efx;
+
+			netif_err(efx, hw, efx->net_dev,
+				  "TX queue %d spurious single TX completion\n",
+				  tx_queue->queue);
+			efx_schedule_reset(efx, RESET_TYPE_TX_SKIP);
+			return;
+		}
+
+		/* Need to check the flag before dequeueing. */
+		if (buffer->flags & EFX_TX_BUF_SKB)
+			finished = true;
+		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
+
+		++tx_queue->read_count;
+		read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
+	}
+
+	tx_queue->pkts_compl += pkts_compl;
+	tx_queue->bytes_compl += bytes_compl;
+
+	EFX_WARN_ON_PARANOID(pkts_compl != 1);
+
+	efx_xmit_done_check_empty(tx_queue);
+}
+
+void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
+{
+	struct efx_nic *efx = tx_queue->efx;
+
+	/* Must be inverse of queue lookup in efx_hard_start_xmit() */
+	tx_queue->core_txq =
+		netdev_get_tx_queue(efx->net_dev,
+				    tx_queue->channel->channel +
+				    ((tx_queue->type & EFX_TXQ_TYPE_HIGHPRI) ?
+				     efx->n_tx_channels : 0));
+}
+
+int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type,
+		 void *type_data)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	struct tc_mqprio_qopt *mqprio = type_data;
+	unsigned tc, num_tc;
+
+	if (type != TC_SETUP_QDISC_MQPRIO)
+		return -EOPNOTSUPP;
+
+	/* Only Siena supported highpri queues */
+	if (efx_nic_rev(efx) > EFX_REV_SIENA_A0)
+		return -EOPNOTSUPP;
+
+	num_tc = mqprio->num_tc;
+
+	if (num_tc > EFX_MAX_TX_TC)
+		return -EINVAL;
+
+	mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	if (num_tc == net_dev->num_tc)
+		return 0;
+
+	for (tc = 0; tc < num_tc; tc++) {
+		net_dev->tc_to_txq[tc].offset = tc * efx->n_tx_channels;
+		net_dev->tc_to_txq[tc].count = efx->n_tx_channels;
+	}
+
+	net_dev->num_tc = num_tc;
+
+	return netif_set_real_num_tx_queues(net_dev,
+					    max_t(int, num_tc, 1) *
+					    efx->n_tx_channels);
+}
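Worked example of the mapping set up in efx_setup_tc(): with efx->n_tx_channels == 4 and an mqprio request for num_tc == 2, tc 0 covers core TX queues 0..3 and tc 1 covers queues 4..7, so netif_set_real_num_tx_queues() exposes 8 queues to the stack. This is the inverse of the lookup in efx_hard_start_xmit(), where a queue index >= n_tx_channels selects EFX_TXQ_TYPE_HIGHPRI.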
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/tx.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2005-2006 Fen Systems Ltd.
+ * Copyright 2006-2015 Solarflare Communications Inc.
+ */
+
+#ifndef EFX_TX_H
+#define EFX_TX_H
+
+#include <linux/types.h>
+
+/* Driver internal tx-path related declarations. */
+
+unsigned int efx_tx_limit_len(struct efx_tx_queue *tx_queue,
+			      dma_addr_t dma_addr, unsigned int len);
+
+u8 *efx_tx_get_copy_buffer_limited(struct efx_tx_queue *tx_queue,
+				   struct efx_tx_buffer *buffer, size_t len);
+
+/* What TXQ type will satisfy the checksum offloads required for this skb? */
+static inline unsigned int efx_tx_csum_type_skb(struct sk_buff *skb)
+{
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0; /* no checksum offload */
+
+	if (skb->encapsulation &&
+	    skb_checksum_start_offset(skb) == skb_inner_transport_offset(skb)) {
+		/* we only advertise features for IPv4 and IPv6 checksums on
+		 * encapsulated packets, so if the checksum is for the inner
+		 * packet, it must be one of them; no further checking required.
+		 */
+
+		/* Do we also need to offload the outer header checksum? */
+		if (skb_shinfo(skb)->gso_segs > 1 &&
+		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
+			return EFX_TXQ_TYPE_OUTER_CSUM | EFX_TXQ_TYPE_INNER_CSUM;
+		return EFX_TXQ_TYPE_INNER_CSUM;
+	}
+
+	/* similarly, we only advertise features for IPv4 and IPv6 checksums,
+	 * so it must be one of them. No need for further checks.
+	 */
+	return EFX_TXQ_TYPE_OUTER_CSUM;
+}
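+
+/* For example: a plain TCP skb with CHECKSUM_PARTIAL needs only
+ * EFX_TXQ_TYPE_OUTER_CSUM, while a VXLAN-encapsulated TCP skb whose
+ * checksum start is the inner transport header needs
+ * EFX_TXQ_TYPE_INNER_CSUM, plus EFX_TXQ_TYPE_OUTER_CSUM when it is a
+ * multi-segment GSO skb with SKB_GSO_UDP_TUNNEL_CSUM set and
+ * SKB_GSO_PARTIAL clear.
+ */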
+#endif /* EFX_TX_H */
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/tx_common.c
@@ -0,0 +1,449 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2018 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include "net_driver.h"
+#include "efx.h"
+#include "nic_common.h"
+#include "tx_common.h"
+
+static unsigned int efx_tx_cb_page_count(struct efx_tx_queue *tx_queue)
+{
+	return DIV_ROUND_UP(tx_queue->ptr_mask + 1,
+			    PAGE_SIZE >> EFX_TX_CB_ORDER);
+}
+
+int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	unsigned int entries;
+	int rc;
+
+	/* Create the smallest power-of-two sized ring that holds txq_entries */
+	entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE);
+	EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
+	tx_queue->ptr_mask = entries - 1;
+
+	netif_dbg(efx, probe, efx->net_dev,
+		  "creating TX queue %d size %#x mask %#x\n",
+		  tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask);
+
+	/* Allocate software ring */
+	tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer),
+				   GFP_KERNEL);
+	if (!tx_queue->buffer)
+		return -ENOMEM;
+
+	tx_queue->cb_page = kcalloc(efx_tx_cb_page_count(tx_queue),
+				    sizeof(tx_queue->cb_page[0]), GFP_KERNEL);
+	if (!tx_queue->cb_page) {
+		rc = -ENOMEM;
+		goto fail1;
+	}
+
+	/* Allocate hardware ring, determine TXQ type */
+	rc = efx_nic_probe_tx(tx_queue);
+	if (rc)
+		goto fail2;
+
+	tx_queue->channel->tx_queue_by_type[tx_queue->type] = tx_queue;
+	return 0;
+
+fail2:
+	kfree(tx_queue->cb_page);
+	tx_queue->cb_page = NULL;
+fail1:
+	kfree(tx_queue->buffer);
+	tx_queue->buffer = NULL;
+	return rc;
+}
+
+void efx_init_tx_queue(struct efx_tx_queue *tx_queue)
+{
+	struct efx_nic *efx = tx_queue->efx;
+
+	netif_dbg(efx, drv, efx->net_dev,
+		  "initialising TX queue %d\n", tx_queue->queue);
+
+	tx_queue->insert_count = 0;
+	tx_queue->notify_count = 0;
+	tx_queue->write_count = 0;
+	tx_queue->packet_write_count = 0;
+	tx_queue->old_write_count = 0;
+	tx_queue->read_count = 0;
+	tx_queue->old_read_count = 0;
+	tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID;
+	tx_queue->xmit_pending = false;
+	tx_queue->timestamping = (efx_ptp_use_mac_tx_timestamps(efx) &&
+				  tx_queue->channel == efx_ptp_channel(efx));
+	tx_queue->completed_timestamp_major = 0;
+	tx_queue->completed_timestamp_minor = 0;
+
+	tx_queue->xdp_tx = efx_channel_is_xdp_tx(tx_queue->channel);
+	tx_queue->tso_version = 0;
+
+	/* Set up TX descriptor ring */
+	efx_nic_init_tx(tx_queue);
+
+	tx_queue->initialised = true;
+}
+
+void efx_fini_tx_queue(struct efx_tx_queue *tx_queue)
+{
+	struct efx_tx_buffer *buffer;
+
+	netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
+		  "shutting down TX queue %d\n", tx_queue->queue);
+
+	tx_queue->initialised = false;
+
+	if (!tx_queue->buffer)
+		return;
+
+	/* Free any buffers left in the ring */
+	while (tx_queue->read_count != tx_queue->write_count) {
+		unsigned int pkts_compl = 0, bytes_compl = 0;
+
+		buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask];
+		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
+
+		++tx_queue->read_count;
+	}
+	tx_queue->xmit_pending = false;
+	netdev_tx_reset_queue(tx_queue->core_txq);
+}
+
+void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
+{
+	int i;
+
+	if (!tx_queue->buffer)
+		return;
+
+	netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
+		  "destroying TX queue %d\n", tx_queue->queue);
+	efx_nic_remove_tx(tx_queue);
+
+	if (tx_queue->cb_page) {
+		for (i = 0; i < efx_tx_cb_page_count(tx_queue); i++)
+			efx_nic_free_buffer(tx_queue->efx,
+					    &tx_queue->cb_page[i]);
+		kfree(tx_queue->cb_page);
+		tx_queue->cb_page = NULL;
+	}
+
+	kfree(tx_queue->buffer);
+	tx_queue->buffer = NULL;
+	tx_queue->channel->tx_queue_by_type[tx_queue->type] = NULL;
+}
+
+void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
+			struct efx_tx_buffer *buffer,
+			unsigned int *pkts_compl,
+			unsigned int *bytes_compl)
+{
+	if (buffer->unmap_len) {
+		struct device *dma_dev = &tx_queue->efx->pci_dev->dev;
+		dma_addr_t unmap_addr = buffer->dma_addr - buffer->dma_offset;
+
+		if (buffer->flags & EFX_TX_BUF_MAP_SINGLE)
+			dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len,
+					 DMA_TO_DEVICE);
+		else
+			dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len,
+				       DMA_TO_DEVICE);
+		buffer->unmap_len = 0;
+	}
+
+	if (buffer->flags & EFX_TX_BUF_SKB) {
+		struct sk_buff *skb = (struct sk_buff *)buffer->skb;
+
+		EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl);
+		(*pkts_compl)++;
+		(*bytes_compl) += skb->len;
+		if (tx_queue->timestamping &&
+		    (tx_queue->completed_timestamp_major ||
+		     tx_queue->completed_timestamp_minor)) {
+			struct skb_shared_hwtstamps hwtstamp;
+
+			hwtstamp.hwtstamp =
+				efx_ptp_nic_to_kernel_time(tx_queue);
+			skb_tstamp_tx(skb, &hwtstamp);
+
+			tx_queue->completed_timestamp_major = 0;
+			tx_queue->completed_timestamp_minor = 0;
+		}
+		dev_consume_skb_any((struct sk_buff *)buffer->skb);
+		netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev,
+			   "TX queue %d transmission id %x complete\n",
+			   tx_queue->queue, tx_queue->read_count);
+	} else if (buffer->flags & EFX_TX_BUF_XDP) {
+		xdp_return_frame_rx_napi(buffer->xdpf);
+	}
+
+	buffer->len = 0;
+	buffer->flags = 0;
+}
+
+/* Remove packets from the TX queue
+ *
+ * This removes packets from the TX queue, up to and including the
+ * specified index.
+ */
+static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue,
+				unsigned int index,
+				unsigned int *pkts_compl,
+				unsigned int *bytes_compl)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	unsigned int stop_index, read_ptr;
+
+	stop_index = (index + 1) & tx_queue->ptr_mask;
+	read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
+
+	while (read_ptr != stop_index) {
+		struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr];
+
+		if (!efx_tx_buffer_in_use(buffer)) {
+			netif_err(efx, tx_err, efx->net_dev,
+				  "TX queue %d spurious TX completion id %d\n",
+				  tx_queue->queue, read_ptr);
+			efx_schedule_reset(efx, RESET_TYPE_TX_SKIP);
+			return;
+		}
+
+		efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl);
+
+		++tx_queue->read_count;
+		read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
+	}
+}
+
+void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue)
+{
+	if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) {
+		tx_queue->old_write_count = READ_ONCE(tx_queue->write_count);
+		if (tx_queue->read_count == tx_queue->old_write_count) {
+			/* Ensure that read_count is flushed. */
+			smp_mb();
+			tx_queue->empty_read_count =
+				tx_queue->read_count | EFX_EMPTY_COUNT_VALID;
+		}
+	}
+}
+
+void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
+{
+	unsigned int fill_level, pkts_compl = 0, bytes_compl = 0;
+	struct efx_nic *efx = tx_queue->efx;
+
+	EFX_WARN_ON_ONCE_PARANOID(index > tx_queue->ptr_mask);
+
+	efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl);
+	tx_queue->pkts_compl += pkts_compl;
+	tx_queue->bytes_compl += bytes_compl;
+
+	if (pkts_compl > 1)
+		++tx_queue->merge_events;
+
+	/* See if we need to restart the netif queue.  This memory
+	 * barrier ensures that we write read_count (inside
+	 * efx_dequeue_buffers()) before reading the queue status.
+	 */
+	smp_mb();
+	if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) &&
+	    likely(efx->port_enabled) &&
+	    likely(netif_device_present(efx->net_dev))) {
+		fill_level = efx_channel_tx_fill_level(tx_queue->channel);
+		if (fill_level <= efx->txq_wake_thresh)
+			netif_tx_wake_queue(tx_queue->core_txq);
+	}
+
+	efx_xmit_done_check_empty(tx_queue);
+}
+
+/* Remove buffers put into a tx_queue for the current packet.
+ * None of the buffers must have an skb attached.
+ */
+void efx_enqueue_unwind(struct efx_tx_queue *tx_queue,
+			unsigned int insert_count)
+{
+	struct efx_tx_buffer *buffer;
+	unsigned int bytes_compl = 0;
+	unsigned int pkts_compl = 0;
+
+	/* Work backwards until we hit the original insert pointer value */
+	while (tx_queue->insert_count != insert_count) {
+		--tx_queue->insert_count;
+		buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
+		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
+	}
+}
+
+struct efx_tx_buffer *efx_tx_map_chunk(struct efx_tx_queue *tx_queue,
+				       dma_addr_t dma_addr, size_t len)
+{
+	const struct efx_nic_type *nic_type = tx_queue->efx->type;
+	struct efx_tx_buffer *buffer;
+	unsigned int dma_len;
+
+	/* Map the fragment taking account of NIC-dependent DMA limits. */
+	do {
+		buffer = efx_tx_queue_get_insert_buffer(tx_queue);
+
+		if (nic_type->tx_limit_len)
+			dma_len = nic_type->tx_limit_len(tx_queue, dma_addr, len);
+		else
+			dma_len = len;
+
+		buffer->len = dma_len;
+		buffer->dma_addr = dma_addr;
+		buffer->flags = EFX_TX_BUF_CONT;
+		len -= dma_len;
+		dma_addr += dma_len;
+		++tx_queue->insert_count;
+	} while (len);
+
+	return buffer;
+}
+
+int efx_tx_tso_header_length(struct sk_buff *skb)
+{
+	size_t header_len;
+
+	if (skb->encapsulation)
+		header_len = skb_inner_transport_header(skb) -
+				skb->data +
+				(inner_tcp_hdr(skb)->doff << 2u);
+	else
+		header_len = skb_transport_header(skb) - skb->data +
+				(tcp_hdr(skb)->doff << 2u);
+	return header_len;
+}
+
+/* Map all data from an SKB for DMA and create descriptors on the queue. */
+int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
+		    unsigned int segment_count)
+{
+	struct efx_nic *efx = tx_queue->efx;
+	struct device *dma_dev = &efx->pci_dev->dev;
+	unsigned int frag_index, nr_frags;
+	dma_addr_t dma_addr, unmap_addr;
+	unsigned short dma_flags;
+	size_t len, unmap_len;
+
+	nr_frags = skb_shinfo(skb)->nr_frags;
+	frag_index = 0;
+
+	/* Map header data. */
+	len = skb_headlen(skb);
+	dma_addr = dma_map_single(dma_dev, skb->data, len, DMA_TO_DEVICE);
+	dma_flags = EFX_TX_BUF_MAP_SINGLE;
+	unmap_len = len;
+	unmap_addr = dma_addr;
+
+	if (unlikely(dma_mapping_error(dma_dev, dma_addr)))
+		return -EIO;
+
+	if (segment_count) {
+		/* For TSO we need to put the header in to a separate
+		 * descriptor. Map this separately if necessary.
+		 */
+		size_t header_len = efx_tx_tso_header_length(skb);
+
+		if (header_len != len) {
+			tx_queue->tso_long_headers++;
+			efx_tx_map_chunk(tx_queue, dma_addr, header_len);
+			len -= header_len;
+			dma_addr += header_len;
+		}
+	}
+
+	/* Add descriptors for each fragment. */
+	do {
+		struct efx_tx_buffer *buffer;
+		skb_frag_t *fragment;
+
+		buffer = efx_tx_map_chunk(tx_queue, dma_addr, len);
+
+		/* The final descriptor for a fragment is responsible for
+		 * unmapping the whole fragment.
+		 */
+		buffer->flags = EFX_TX_BUF_CONT | dma_flags;
+		buffer->unmap_len = unmap_len;
+		buffer->dma_offset = buffer->dma_addr - unmap_addr;
+
+		if (frag_index >= nr_frags) {
+			/* Store SKB details with the final buffer for
+			 * the completion.
+			 */
+			buffer->skb = skb;
+			buffer->flags = EFX_TX_BUF_SKB | dma_flags;
+			return 0;
+		}
+
+		/* Move on to the next fragment. */
+		fragment = &skb_shinfo(skb)->frags[frag_index++];
+		len = skb_frag_size(fragment);
+		dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len,
+					    DMA_TO_DEVICE);
+		dma_flags = 0;
+		unmap_len = len;
+		unmap_addr = dma_addr;
+
+		if (unlikely(dma_mapping_error(dma_dev, dma_addr)))
+			return -EIO;
+	} while (1);
+}
+
+unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
+{
+	/* Header and payload descriptor for each output segment, plus
+	 * one for every input fragment boundary within a segment
+	 */
+	unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
+
+	/* Possibly one more per segment for option descriptors */
+	if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0)
+		max_descs += EFX_TSO_MAX_SEGS;
+
+	/* Possibly more for PCIe page boundaries within input fragments */
+	if (PAGE_SIZE > EFX_PAGE_SIZE)
+		max_descs += max_t(unsigned int, MAX_SKB_FRAGS,
+				   DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE));
+
+	return max_descs;
+}
+
+/*
+ * Fall back to software TSO.
+ *
+ * This is used if we are unable to send a GSO packet through hardware TSO.
+ * This should only ever happen due to per-queue restrictions - unsupported
+ * packets should first be filtered by the feature flags.
+ *
+ * Returns 0 on success, error code otherwise.
+ */
+int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
+{
+	struct sk_buff *segments, *next;
+
+	segments = skb_gso_segment(skb, 0);
+	if (IS_ERR(segments))
+		return PTR_ERR(segments);
+
+	dev_consume_skb_any(skb);
+
+	skb_list_walk_safe(segments, skb, next) {
+		skb_mark_not_on_list(skb);
+		efx_enqueue_skb(tx_queue, skb);
+	}
+
+	return 0;
+}
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/tx_common.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2018 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#ifndef EFX_TX_COMMON_H
+#define EFX_TX_COMMON_H
+
+int efx_probe_tx_queue(struct efx_tx_queue *tx_queue);
+void efx_init_tx_queue(struct efx_tx_queue *tx_queue);
+void efx_fini_tx_queue(struct efx_tx_queue *tx_queue);
+void efx_remove_tx_queue(struct efx_tx_queue *tx_queue);
+
+void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
+			struct efx_tx_buffer *buffer,
+			unsigned int *pkts_compl,
+			unsigned int *bytes_compl);
+
+static inline bool efx_tx_buffer_in_use(struct efx_tx_buffer *buffer)
+{
+	return buffer->len || (buffer->flags & EFX_TX_BUF_OPTION);
+}
+
+void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue);
+void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
+
+void efx_enqueue_unwind(struct efx_tx_queue *tx_queue,
+			unsigned int insert_count);
+
+struct efx_tx_buffer *efx_tx_map_chunk(struct efx_tx_queue *tx_queue,
+				       dma_addr_t dma_addr, size_t len);
+int efx_tx_tso_header_length(struct sk_buff *skb);
+int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
+		    unsigned int segment_count);
+
+unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
+int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
+
+extern bool efx_separate_tx_channels;
+#endif
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/vfdi.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2010-2012 Solarflare Communications Inc.
+ */
+#ifndef _VFDI_H
+#define _VFDI_H
+
+/**
+ * DOC: Virtual Function Driver Interface
+ *
+ * This file contains software structures used to form a two way
+ * communication channel between the VF driver and the PF driver,
+ * named Virtual Function Driver Interface (VFDI).
+ *
+ * For the purposes of VFDI, a page is a memory region with size and
+ * alignment of 4K.  All addresses are DMA addresses to be used within
+ * the domain of the relevant VF.
+ *
+ * The only hardware-defined channels for a VF driver to communicate
+ * with the PF driver are the event mailboxes (%FR_CZ_USR_EV
+ * registers).  Writing to these registers generates an event with
+ * EV_CODE = EV_CODE_USR_EV, USER_QID set to the index of the mailbox
+ * and USER_EV_REG_VALUE set to the value written.  The PF driver may
+ * direct or disable delivery of these events by setting
+ * %FR_CZ_USR_EV_CFG.
+ *
+ * The PF driver can send arbitrary events to arbitrary event queues.
+ * However, for consistency, VFDI events from the PF are defined to
+ * follow the same form and be sent to the first event queue assigned
+ * to the VF while that queue is enabled by the VF driver.
+ *
+ * The general form of the variable bits of VFDI events is:
+ *
+ *       0             16                       24   31
+ *      | DATA        | TYPE                   | SEQ   |
+ *
+ * SEQ is a sequence number which should be incremented by 1 (modulo
+ * 256) for each event.  The sequence numbers used in each direction
+ * are independent.
+ *
+ * The VF submits requests of type &struct vfdi_req by sending the
+ * address of the request (ADDR) in a series of 4 events:
+ *
+ *       0             16                       24   31
+ *      | ADDR[0:15]  | VFDI_EV_TYPE_REQ_WORD0 | SEQ   |
+ *      | ADDR[16:31] | VFDI_EV_TYPE_REQ_WORD1 | SEQ+1 |
+ *      | ADDR[32:47] | VFDI_EV_TYPE_REQ_WORD2 | SEQ+2 |
+ *      | ADDR[48:63] | VFDI_EV_TYPE_REQ_WORD3 | SEQ+3 |
+ *
+ * The address must be page-aligned.  After receiving such a valid
+ * series of events, the PF driver will attempt to read the request
+ * and write a response to the same address.  In case of an invalid
+ * sequence of events or a DMA error, there will be no response.
+ *
+ * The VF driver may request that the PF driver writes status
+ * information into its domain asynchronously.  After writing the
+ * status, the PF driver will send an event of the form:
+ *
+ *       0             16                       24   31
+ *      | reserved    | VFDI_EV_TYPE_STATUS    | SEQ   |
+ *
+ * In case the VF must be reset for any reason, the PF driver will
+ * send an event of the form:
+ *
+ *       0             16                       24   31
+ *      | reserved    | VFDI_EV_TYPE_RESET     | SEQ   |
+ *
+ * It is then the responsibility of the VF driver to request
+ * reinitialisation of its queues.
+ */
+#define VFDI_EV_SEQ_LBN 24
+#define VFDI_EV_SEQ_WIDTH 8
+#define VFDI_EV_TYPE_LBN 16
+#define VFDI_EV_TYPE_WIDTH 8
+#define VFDI_EV_TYPE_REQ_WORD0 0
+#define VFDI_EV_TYPE_REQ_WORD1 1
+#define VFDI_EV_TYPE_REQ_WORD2 2
+#define VFDI_EV_TYPE_REQ_WORD3 3
+#define VFDI_EV_TYPE_STATUS 4
+#define VFDI_EV_TYPE_RESET 5
+#define VFDI_EV_DATA_LBN 0
+#define VFDI_EV_DATA_WIDTH 16
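+
+/* Illustrative sketch only (not used by the driver): packing one VFDI
+ * event word from the fields above, where the _LBN/_WIDTH pairs give
+ * each field's low bit and width.  Real code would use the efx bitfield
+ * macros rather than open-coded shifts.
+ */
+static inline u32 example_vfdi_ev_word(u8 seq, u8 type, u16 data)
+{
+	return ((u32)seq << VFDI_EV_SEQ_LBN) |
+	       ((u32)type << VFDI_EV_TYPE_LBN) |
+	       ((u32)data << VFDI_EV_DATA_LBN);
+}
+
+/* A request at DMA address addr is then announced as four such words:
+ * REQ_WORD0 carrying addr[0:15] with sequence number seq, REQ_WORD1
+ * carrying addr[16:31] with seq+1, and so on up to REQ_WORD3.
+ */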
+
+struct vfdi_endpoint {
+	u8 mac_addr[ETH_ALEN];
+	__be16 tci;
+};
+
+/**
+ * enum vfdi_op - VFDI operation enumeration
+ * @VFDI_OP_RESPONSE: Indicates a response to the request.
+ * @VFDI_OP_INIT_EVQ: Initialize SRAM entries and initialize an EVQ.
+ * @VFDI_OP_INIT_RXQ: Initialize SRAM entries and initialize an RXQ.
+ * @VFDI_OP_INIT_TXQ: Initialize SRAM entries and initialize a TXQ.
+ * @VFDI_OP_FINI_ALL_QUEUES: Flush all queues, finalize all queues, then
+ *	finalize the SRAM entries.
+ * @VFDI_OP_INSERT_FILTER: Insert a MAC filter targeting the given RXQ.
+ * @VFDI_OP_REMOVE_ALL_FILTERS: Remove all filters.
+ * @VFDI_OP_SET_STATUS_PAGE: Set the DMA page(s) used for status updates
+ *	from PF and write the initial status.
+ * @VFDI_OP_CLEAR_STATUS_PAGE: Clear the DMA page(s) used for status
+ *	updates from PF.
+ */
+enum vfdi_op {
+	VFDI_OP_RESPONSE = 0,
+	VFDI_OP_INIT_EVQ = 1,
+	VFDI_OP_INIT_RXQ = 2,
+	VFDI_OP_INIT_TXQ = 3,
+	VFDI_OP_FINI_ALL_QUEUES = 4,
+	VFDI_OP_INSERT_FILTER = 5,
+	VFDI_OP_REMOVE_ALL_FILTERS = 6,
+	VFDI_OP_SET_STATUS_PAGE = 7,
+	VFDI_OP_CLEAR_STATUS_PAGE = 8,
+	VFDI_OP_LIMIT,
+};
+
+/* Response codes for VFDI operations. Other values may be used in future. */
+#define VFDI_RC_SUCCESS		0
+#define VFDI_RC_ENOMEM		(-12)
+#define VFDI_RC_EINVAL		(-22)
+#define VFDI_RC_EOPNOTSUPP	(-95)
+#define VFDI_RC_ETIMEDOUT	(-110)
+
+/**
+ * struct vfdi_req - Request from VF driver to PF driver
+ * @op: Operation code or response indicator, taken from &enum vfdi_op.
+ * @rc: Response code.  Set to 0 on success or a negative error code on failure.
+ * @u.init_evq.index: Index of event queue to create.
+ * @u.init_evq.buf_count: Number of 4k buffers backing event queue.
+ * @u.init_evq.addr: Array of length %u.init_evq.buf_count containing DMA
+ *	address of each page backing the event queue.
+ * @u.init_rxq.index: Index of receive queue to create.
+ * @u.init_rxq.buf_count: Number of 4k buffers backing receive queue.
+ * @u.init_rxq.evq: Instance of event queue to target receive events at.
+ * @u.init_rxq.label: Label used in receive events.
+ * @u.init_rxq.flags: Unused.
+ * @u.init_rxq.addr: Array of length %u.init_rxq.buf_count containing DMA
+ *	address of each page backing the receive queue.
+ * @u.init_txq.index: Index of transmit queue to create.
+ * @u.init_txq.buf_count: Number of 4k buffers backing transmit queue.
+ * @u.init_txq.evq: Instance of event queue to target transmit completion
+ *	events at.
+ * @u.init_txq.label: Label used in transmit completion events.
+ * @u.init_txq.flags: Checksum offload flags.
+ * @u.init_txq.addr: Array of length %u.init_txq.buf_count containing DMA
+ *	address of each page backing the transmit queue.
+ * @u.mac_filter.rxq: Insert MAC filter at VF local address/VLAN targeting
+ *	all traffic at this receive queue.
+ * @u.mac_filter.flags: MAC filter flags.
+ * @u.set_status_page.dma_addr: Base address for the &struct vfdi_status.
+ *	This address must be page-aligned and the PF may write up to a
+ *	whole page (allowing for extension of the structure).
+ * @u.set_status_page.peer_page_count: Number of additional pages the VF
+ *	has provided into which peer addresses may be DMAd.
+ * @u.set_status_page.peer_page_addr: Array of DMA addresses of pages.
+ *	If the number of peers exceeds 256, then the VF must provide
+ *	additional pages in this array. The PF will then DMA up to
+ *	512 vfdi_endpoint structures into each page.  These addresses
+ *	must be page-aligned.
+ */
+struct vfdi_req {
+	u32 op;
+	u32 reserved1;
+	s32 rc;
+	u32 reserved2;
+	union {
+		struct {
+			u32 index;
+			u32 buf_count;
+			u64 addr[];
+		} init_evq;
+		struct {
+			u32 index;
+			u32 buf_count;
+			u32 evq;
+			u32 label;
+			u32 flags;
+#define VFDI_RXQ_FLAG_SCATTER_EN 1
+			u32 reserved;
+			u64 addr[];
+		} init_rxq;
+		struct {
+			u32 index;
+			u32 buf_count;
+			u32 evq;
+			u32 label;
+			u32 flags;
+#define VFDI_TXQ_FLAG_IP_CSUM_DIS 1
+#define VFDI_TXQ_FLAG_TCPUDP_CSUM_DIS 2
+			u32 reserved;
+			u64 addr[];
+		} init_txq;
+		struct {
+			u32 rxq;
+			u32 flags;
+#define VFDI_MAC_FILTER_FLAG_RSS 1
+#define VFDI_MAC_FILTER_FLAG_SCATTER 2
+		} mac_filter;
+		struct {
+			u64 dma_addr;
+			u64 peer_page_count;
+			u64 peer_page_addr[];
+		} set_status_page;
+	} u;
+};
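+
+/* Illustrative sketch only: how a VF might fill an INIT_EVQ request in
+ * its page-aligned request buffer.  The function and parameter names
+ * are assumptions for illustration, not part of the interface.
+ */
+static inline void example_vfdi_fill_init_evq(struct vfdi_req *req,
+					      u32 evq_index,
+					      const u64 *page_dma_addrs,
+					      u32 buf_count)
+{
+	u32 i;
+
+	req->op = VFDI_OP_INIT_EVQ;
+	req->u.init_evq.index = evq_index;
+	req->u.init_evq.buf_count = buf_count;
+	for (i = 0; i < buf_count; i++)
+		req->u.init_evq.addr[i] = page_dma_addrs[i];
+	/* The PF writes its response, including req->rc, back over this
+	 * structure once the request has been processed.
+	 */
+}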
+
+/**
+ * struct vfdi_status - Status provided by PF driver to VF driver
+ * @generation_start: A generation count DMA'd to VF *before* the
+ *	rest of the structure.
+ * @generation_end: A generation count DMA'd to VF *after* the
+ *	rest of the structure.
+ * @version: Version of this structure; currently set to 1.  Later
+ *	versions must either be layout-compatible or only be sent to VFs
+ *	that specifically request them.
+ * @length: Total length of this structure including embedded tables
+ * @vi_scale: log2 the number of VIs available on this VF. This quantity
+ *	is used by the hardware for register decoding.
+ * @max_tx_channels: The maximum number of transmit queues the VF can use.
+ * @rss_rxq_count: The number of receive queues present in the shared RSS
+ *	indirection table.
+ * @peer_count: Total number of peers in the complete peer list. If larger
+ *	than ARRAY_SIZE(%peers), then the VF must provide sufficient
+ *	additional pages each of which is filled with vfdi_endpoint structures.
+ * @local: The MAC address and outer VLAN tag of *this* VF
+ * @peers: Table of peer addresses.  The @tci fields in these structures
+ *	are currently unused and must be ignored.  Additional peers are
+ *	written into any additional pages provided by the VF.
+ * @timer_quantum_ns: Timer quantum (nominal period between timer ticks)
+ *	for interrupt moderation timers, in nanoseconds. This member is only
+ *	present if @length is sufficiently large.
+ */
+struct vfdi_status {
+	u32 generation_start;
+	u32 generation_end;
+	u32 version;
+	u32 length;
+	u8 vi_scale;
+	u8 max_tx_channels;
+	u8 rss_rxq_count;
+	u8 reserved1;
+	u16 peer_count;
+	u16 reserved2;
+	struct vfdi_endpoint local;
+	struct vfdi_endpoint peers[256];
+
+	/* Members below here extend version 1 of this structure */
+	u32 timer_quantum_ns;
+};
+
+#endif
--- /dev/null
+++ b/drivers/net/ethernet/sfc/siena/workarounds.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for Solarflare network controllers and boards
+ * Copyright 2006-2013 Solarflare Communications Inc.
+ */
+
+#ifndef EFX_WORKAROUNDS_H
+#define EFX_WORKAROUNDS_H
+
+/*
+ * Hardware workarounds.
+ * Bug numbers are from Solarflare's Bugzilla.
+ */
+
+#define EFX_WORKAROUND_SIENA(efx) (efx_nic_rev(efx) == EFX_REV_SIENA_A0)
+#define EFX_WORKAROUND_EF10(efx) (efx_nic_rev(efx) >= EFX_REV_HUNT_A0)
+#define EFX_WORKAROUND_10G(efx) 1
+
+/* Bit-bashed I2C reads cause performance drop */
+#define EFX_WORKAROUND_7884 EFX_WORKAROUND_10G
+/* Legacy interrupt storm when interrupt fifo fills */
+#define EFX_WORKAROUND_17213 EFX_WORKAROUND_SIENA
+
+/* Lockup when writing event block registers at gen2/gen3 */
+#define EFX_EF10_WORKAROUND_35388(efx)					\
+	(((struct efx_ef10_nic_data *)efx->nic_data)->workaround_35388)
+#define EFX_WORKAROUND_35388(efx)					\
+	(efx_nic_rev(efx) == EFX_REV_HUNT_A0 && EFX_EF10_WORKAROUND_35388(efx))
+
+/* Moderation timer access must go through MCDI */
+#define EFX_EF10_WORKAROUND_61265(efx)					\
+	(((struct efx_ef10_nic_data *)efx->nic_data)->workaround_61265)
+
+#endif /* EFX_WORKAROUNDS_H */
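A hedged usage sketch, with an illustrative function name: each workaround macro is a plain predicate evaluated against the NIC revision or capability data, so callers simply branch on it.

static inline bool example_needs_irq_workaround(struct efx_nic *efx)
{
	/* Bug 17213: legacy interrupt storm when the interrupt FIFO
	 * fills; callers use this predicate to enable the corresponding
	 * workaround path.
	 */
	return EFX_WORKAROUND_17213(efx);
}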