From 86784099c356283da3841b7854bfc5a21b6e60a3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Apr 20 2023 11:00:38 +0000 Subject: net: macb: fix a memory corruption in extended buffer descriptor mode (bsc#1012628). --- diff --git a/patches.kernel.org/6.2.12-073-net-macb-fix-a-memory-corruption-in-extended-b.patch b/patches.kernel.org/6.2.12-073-net-macb-fix-a-memory-corruption-in-extended-b.patch new file mode 100644 index 0000000..817e848 --- /dev/null +++ b/patches.kernel.org/6.2.12-073-net-macb-fix-a-memory-corruption-in-extended-b.patch @@ -0,0 +1,152 @@ +From: Roman Gushchin +Date: Wed, 12 Apr 2023 16:21:44 -0700 +Subject: [PATCH] net: macb: fix a memory corruption in extended buffer + descriptor mode +References: bsc#1012628 +Patch-mainline: 6.2.12 +Git-commit: e8b74453555872851bdd7ea43a7c0ec39659834f + +[ Upstream commit e8b74453555872851bdd7ea43a7c0ec39659834f ] + +For quite some time we were chasing a bug which looked like a sudden +permanent failure of networking and mmc on some of our devices. +The bug was very sensitive to any software changes and even more to +any kernel debug options. + +Finally we got a setup where the problem was reproducible with +CONFIG_DMA_API_DEBUG=y and it revealed the issue with the rx dma: + +[ 16.992082] ------------[ cut here ]------------ +[ 16.996779] DMA-API: macb ff0b0000.ethernet: device driver tries to free DMA memory it has not allocated [device address=0x0000000875e3e244] [size=1536 bytes] +[ 17.011049] WARNING: CPU: 0 PID: 85 at kernel/dma/debug.c:1011 check_unmap+0x6a0/0x900 +[ 17.018977] Modules linked in: xxxxx +[ 17.038823] CPU: 0 PID: 85 Comm: irq/55-8000f000 Not tainted 5.4.0 #28 +[ 17.045345] Hardware name: xxxxx +[ 17.049528] pstate: 60000005 (nZCv daif -PAN -UAO) +[ 17.054322] pc : check_unmap+0x6a0/0x900 +[ 17.058243] lr : check_unmap+0x6a0/0x900 +[ 17.062163] sp : ffffffc010003c40 +[ 17.065470] x29: ffffffc010003c40 x28: 000000004000c03c +[ 17.070783] x27: ffffffc010da7048 x26: ffffff8878e38800 +[ 17.076095] x25: ffffff8879d22810 x24: ffffffc010003cc8 +[ 17.081407] x23: 0000000000000000 x22: ffffffc010a08750 +[ 17.086719] x21: ffffff8878e3c7c0 x20: ffffffc010acb000 +[ 17.092032] x19: 0000000875e3e244 x18: 0000000000000010 +[ 17.097343] x17: 0000000000000000 x16: 0000000000000000 +[ 17.102647] x15: ffffff8879e4a988 x14: 0720072007200720 +[ 17.107959] x13: 0720072007200720 x12: 0720072007200720 +[ 17.113261] x11: 0720072007200720 x10: 0720072007200720 +[ 17.118565] x9 : 0720072007200720 x8 : 000000000000022d +[ 17.123869] x7 : 0000000000000015 x6 : 0000000000000098 +[ 17.129173] x5 : 0000000000000000 x4 : 0000000000000000 +[ 17.134475] x3 : 00000000ffffffff x2 : ffffffc010a1d370 +[ 17.139778] x1 : b420c9d75d27bb00 x0 : 0000000000000000 +[ 17.145082] Call trace: +[ 17.147524] check_unmap+0x6a0/0x900 +[ 17.151091] debug_dma_unmap_page+0x88/0x90 +[ 17.155266] gem_rx+0x114/0x2f0 +[ 17.158396] macb_poll+0x58/0x100 +[ 17.161705] net_rx_action+0x118/0x400 +[ 17.165445] __do_softirq+0x138/0x36c +[ 17.169100] irq_exit+0x98/0xc0 +[ 17.172234] __handle_domain_irq+0x64/0xc0 +[ 17.176320] gic_handle_irq+0x5c/0xc0 +[ 17.179974] el1_irq+0xb8/0x140 +[ 17.183109] xiic_process+0x5c/0xe30 +[ 17.186677] irq_thread_fn+0x28/0x90 +[ 17.190244] irq_thread+0x208/0x2a0 +[ 17.193724] kthread+0x130/0x140 +[ 17.196945] ret_from_fork+0x10/0x20 +[ 17.200510] ---[ end trace 7240980785f81d6f ]--- + +[ 237.021490] ------------[ cut here ]------------ +[ 237.026129] DMA-API: exceeded 7 overlapping mappings of cacheline 0x0000000021d79e7b +[ 237.033886] WARNING: CPU: 0 PID: 0 at kernel/dma/debug.c:499 add_dma_entry+0x214/0x240 +[ 237.041802] Modules linked in: xxxxx +[ 237.061637] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 5.4.0 #28 +[ 237.068941] Hardware name: xxxxx +[ 237.073116] pstate: 80000085 (Nzcv daIf -PAN -UAO) +[ 237.077900] pc : add_dma_entry+0x214/0x240 +[ 237.081986] lr : add_dma_entry+0x214/0x240 +[ 237.086072] sp : ffffffc010003c30 +[ 237.089379] x29: ffffffc010003c30 x28: ffffff8878a0be00 +[ 237.094683] x27: 0000000000000180 x26: ffffff8878e387c0 +[ 237.099987] x25: 0000000000000002 x24: 0000000000000000 +[ 237.105290] x23: 000000000000003b x22: ffffffc010a0fa00 +[ 237.110594] x21: 0000000021d79e7b x20: ffffffc010abe600 +[ 237.115897] x19: 00000000ffffffef x18: 0000000000000010 +[ 237.121201] x17: 0000000000000000 x16: 0000000000000000 +[ 237.126504] x15: ffffffc010a0fdc8 x14: 0720072007200720 +[ 237.131807] x13: 0720072007200720 x12: 0720072007200720 +[ 237.137111] x11: 0720072007200720 x10: 0720072007200720 +[ 237.142415] x9 : 0720072007200720 x8 : 0000000000000259 +[ 237.147718] x7 : 0000000000000001 x6 : 0000000000000000 +[ 237.153022] x5 : ffffffc010003a20 x4 : 0000000000000001 +[ 237.158325] x3 : 0000000000000006 x2 : 0000000000000007 +[ 237.163628] x1 : 8ac721b3a7dc1c00 x0 : 0000000000000000 +[ 237.168932] Call trace: +[ 237.171373] add_dma_entry+0x214/0x240 +[ 237.175115] debug_dma_map_page+0xf8/0x120 +[ 237.179203] gem_rx_refill+0x190/0x280 +[ 237.182942] gem_rx+0x224/0x2f0 +[ 237.186075] macb_poll+0x58/0x100 +[ 237.189384] net_rx_action+0x118/0x400 +[ 237.193125] __do_softirq+0x138/0x36c +[ 237.196780] irq_exit+0x98/0xc0 +[ 237.199914] __handle_domain_irq+0x64/0xc0 +[ 237.204000] gic_handle_irq+0x5c/0xc0 +[ 237.207654] el1_irq+0xb8/0x140 +[ 237.210789] arch_cpu_idle+0x40/0x200 +[ 237.214444] default_idle_call+0x18/0x30 +[ 237.218359] do_idle+0x200/0x280 +[ 237.221578] cpu_startup_entry+0x20/0x30 +[ 237.225493] rest_init+0xe4/0xf0 +[ 237.228713] arch_call_rest_init+0xc/0x14 +[ 237.232714] start_kernel+0x47c/0x4a8 +[ 237.236367] ---[ end trace 7240980785f81d70 ]--- + +Lars was fast to find an explanation: according to the datasheet +bit 2 of the rx buffer descriptor entry has a different meaning in the +extended mode: + Address [2] of beginning of buffer, or + in extended buffer descriptor mode (DMA configuration register [28] = 1), + indicates a valid timestamp in the buffer descriptor entry. + +The macb driver didn't mask this bit while getting an address and it +eventually caused a memory corruption and a dma failure. + +The problem is resolved by explicitly clearing the problematic bit +if hw timestamping is used. + +Fixes: 7b4296148066 ("net: macb: Add support for PTP timestamps in DMA descriptors") +Signed-off-by: Roman Gushchin +Co-developed-by: Lars-Peter Clausen +Signed-off-by: Lars-Peter Clausen +Acked-by: Nicolas Ferre +Reviewed-by: Jacob Keller +Link: https://lore.kernel.org/r/20230412232144.770336-1-roman.gushchin@linux.dev +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +Signed-off-by: Jiri Slaby +--- + drivers/net/ethernet/cadence/macb_main.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c +index 6cda3152..5b230831 100644 +--- a/drivers/net/ethernet/cadence/macb_main.c ++++ b/drivers/net/ethernet/cadence/macb_main.c +@@ -1009,6 +1009,10 @@ static dma_addr_t macb_get_addr(struct macb *bp, struct macb_dma_desc *desc) + } + #endif + addr |= MACB_BF(RX_WADDR, MACB_BFEXT(RX_WADDR, desc->addr)); ++#ifdef CONFIG_MACB_USE_HWSTAMP ++ if (bp->hw_dma_cap & HW_DMA_CAP_PTP) ++ addr &= ~GEM_BIT(DMA_RXVALID); ++#endif + return addr; + } + +-- +2.35.3 + diff --git a/series.conf b/series.conf index 807f97f..1b74c13 100644 --- a/series.conf +++ b/series.conf @@ -2300,6 +2300,7 @@ patches.kernel.org/6.2.12-070-selftests-openvswitch-adjust-datapath-NL-messa.patch patches.kernel.org/6.2.12-071-udp6-fix-potential-access-to-stale-information.patch patches.kernel.org/6.2.12-072-selftests-add-the-missing-CONFIG_IP_SCTP-in-ne.patch + patches.kernel.org/6.2.12-073-net-macb-fix-a-memory-corruption-in-extended-b.patch ######################################################## # Build fixes that apply to the vanilla kernel too.