From 769bb1e4b8c07caf9b74bf79a3fe2945f8fd6a4b Mon Sep 17 00:00:00 2001
From: Olaf Hering <ohering@suse.de>
Date: Jun 02 2025 12:31:03 +0000
Subject: hv_netvsc: Preserve contiguous PFN grouping in the page buffer array (bsc#1243737).

---

diff --git a/patches.suse/msft-hv-3240-hv_netvsc-Preserve-contiguous-PFN-grouping-in-the-pa.patch b/patches.suse/msft-hv-3240-hv_netvsc-Preserve-contiguous-PFN-grouping-in-the-pa.patch
new file mode 100644
index 0000000..bfdc587
--- /dev/null
+++ b/patches.suse/msft-hv-3240-hv_netvsc-Preserve-contiguous-PFN-grouping-in-the-pa.patch
@@ -0,0 +1,253 @@
+From: Michael Kelley <mhklinux@outlook.com>
+Date: Mon, 12 May 2025 17:06:02 -0700
+Patch-mainline: v6.15-rc7
+Subject: hv_netvsc: Preserve contiguous PFN grouping in the page buffer array
+Git-commit: 41a6328b2c55276f89ea3812069fd7521e348bbf
+References: bsc#1243737
+
+Starting with commit dca5161f9bd0 ("hv_netvsc: Check status in
+SEND_RNDIS_PKT completion message") in the 6.3 kernel, the Linux
+driver for Hyper-V synthetic networking (netvsc) occasionally reports
+"nvsp_rndis_pkt_complete error status: 2".[1] This error indicates
+that Hyper-V has rejected a network packet transmit request from the
+guest, and the outgoing network packet is dropped. Higher level
+network protocols presumably recover and resend the packet so there is
+no functional error, but performance is slightly impacted. Commit
+dca5161f9bd0 is not the cause of the error -- it only added reporting
+of an error that was already happening without any notice. The error
+has presumably been present since the netvsc driver was originally
+introduced into Linux.
+
+The root cause of the problem is that the netvsc driver in Linux may
+send an incorrectly formatted VMBus message to Hyper-V when
+transmitting the network packet. The incorrect formatting occurs when
+the rndis header of the VMBus message crosses a page boundary due to
+how the Linux skb head memory is aligned. In such a case, two PFNs are
+required to describe the location of the rndis header, even though
+they are contiguous in guest physical address (GPA) space. Hyper-V
+requires that two rndis header PFNs be in a single "GPA range" data
+structure, but current netvsc code puts each PFN in its own GPA range,
+which Hyper-V rejects as an error.
+
+The incorrect formatting occurs only for larger packets that netvsc
+must transmit via a VMBus "GPA Direct" message. There's no problem
+when netvsc transmits a smaller packet by copying it into a pre-
+allocated send buffer slot because the pre-allocated slots don't have
+page crossing issues.
+
+After commit 14ad6ed30a10 ("net: allow small head cache usage with
+large MAX_SKB_FRAGS values") in the 6.14-rc4 kernel, the error occurs
+much more frequently in VMs with 16 or more vCPUs. It may occur every
+few seconds, or even more frequently, in an ssh session that outputs a
+lot of text. Commit 14ad6ed30a10 subtly changes how skb head memory is
+allocated, making it much more likely that the rndis header will cross
+a page boundary when the vCPU count is 16 or more. The changes in
+commit 14ad6ed30a10 are perfectly valid -- they just had the side
+effect of making the netvsc bug more prominent.
+
+Current code in init_page_array() creates a separate page buffer array
+entry for each PFN required to identify the data to be transmitted.
+Contiguous PFNs get separate entries in the page buffer array, and any
+information about contiguity is lost.
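+
+As an illustration (not part of the driver; a minimal userspace
+sketch that assumes the 4 KiB Hyper-V page size and a hypothetical
+header address and length), a 166-byte rndis header that starts 40
+bytes before a page boundary spans two contiguous PFNs, which must be
+described by one GPA range rather than two page buffer entries:
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  #define HV_HYP_PAGE_SHIFT 12
+  #define HV_HYP_PAGE_SIZE  (1UL << HV_HYP_PAGE_SHIFT)
+
+  int main(void)
+  {
+          uint64_t gpa = 0x12345fd8;  /* hypothetical header address */
+          uint32_t len = 166;         /* hypothetical header length */
+          uint64_t first_pfn = gpa >> HV_HYP_PAGE_SHIFT;
+          uint64_t last_pfn = (gpa + len - 1) >> HV_HYP_PAGE_SHIFT;
+
+          /* Old behavior: one page buffer entry per PFN (here: 2) */
+          printf("per-page entries: %llu\n",
+                 (unsigned long long)(last_pfn - first_pfn + 1));
+
+          /* Fixed behavior: one entry for the whole contiguous range */
+          printf("range: pfn=%#llx offset=%llu len=%u\n",
+                 (unsigned long long)first_pfn,
+                 (unsigned long long)(gpa & (HV_HYP_PAGE_SIZE - 1)), len);
+          return 0;
+  }
+
+A single such range is what the fixed init_page_array() below hands
+to netvsc_build_mpb_array() as one page buffer entry.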
+
+Fix the core issue by having init_page_array() construct the page
+buffer array to represent contiguous ranges rather than individual
+pages. When these ranges are subsequently passed to
+netvsc_build_mpb_array(), it can build GPA ranges that contain
+multiple PFNs, as required to avoid the error "nvsp_rndis_pkt_complete
+error status: 2". If instead the network packet is sent by copying
+into a pre-allocated send buffer slot, the copy proceeds using the
+contiguous ranges rather than individual pages, but the result of the
+copying is the same. Also fix rndis_filter_send_request() to construct
+a contiguous range, since it has its own page buffer array.
+
+This change has a side benefit in CoCo VMs in that netvsc_dma_map()
+calls dma_map_single() on each contiguous range instead of on each
+page. This results in fewer calls to dma_map_single() but on larger
+chunks of memory, which should reduce contention on the swiotlb.
+
+Since the page buffer array now contains one entry for each contiguous
+range instead of for each individual page, the number of entries in
+the array can be reduced, saving 208 bytes of stack space in
+netvsc_xmit() when MAX_SKB_FRAGS has the default value of 17.
+
+[1] https://bugzilla.kernel.org/show_bug.cgi?id=217503
+
+Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217503
+Cc: <stable@vger.kernel.org> # 6.1.x
+Signed-off-by: Michael Kelley <mhklinux@outlook.com>
+Link: https://patch.msgid.link/20250513000604.1396-4-mhklinux@outlook.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Acked-by: Olaf Hering <ohering@suse.de>
+---
+ drivers/net/hyperv/hyperv_net.h   | 12 ++
+ drivers/net/hyperv/netvsc_drv.c   | 63 ++-------
+ drivers/net/hyperv/rndis_filter.c | 24 +---
+ 3 files changed, 32 insertions(+), 67 deletions(-)
+
+diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
+--- a/drivers/net/hyperv/hyperv_net.h
++++ b/drivers/net/hyperv/hyperv_net.h
+@@ -893,6 +893,18 @@ struct nvsp_message {
+ 			      sizeof(struct nvsp_message))
+ #define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor)
+ 
++/* Maximum # of contiguous data ranges that can make up a transmitted packet.
++ * Typically it's the max SKB fragments plus 2 for the rndis packet and the
++ * linear portion of the SKB. But if MAX_SKB_FRAGS is large, the value may
++ * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries
++ * in a GPA direct packet sent to netvsp over VMBus.
++ */
++#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT
++#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2)
++#else
++#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT
++#endif
++
+ /* Estimated requestor size:
+  * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size
+  */
+diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
+--- a/drivers/net/hyperv/netvsc_drv.c
++++ b/drivers/net/hyperv/netvsc_drv.c
+@@ -326,43 +326,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
+ 	return txq;
+ }
+ 
+-static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len,
+-		       struct hv_page_buffer *pb)
+-{
+-	int j = 0;
+-
+-	hvpfn += offset >> HV_HYP_PAGE_SHIFT;
+-	offset = offset & ~HV_HYP_PAGE_MASK;
+-
+-	while (len > 0) {
+-		unsigned long bytes;
+-
+-		bytes = HV_HYP_PAGE_SIZE - offset;
+-		if (bytes > len)
+-			bytes = len;
+-		pb[j].pfn = hvpfn;
+-		pb[j].offset = offset;
+-		pb[j].len = bytes;
+-
+-		offset += bytes;
+-		len -= bytes;
+-
+-		if (offset == HV_HYP_PAGE_SIZE && len) {
+-			hvpfn++;
+-			offset = 0;
+-			j++;
+-		}
+-	}
+-
+-	return j + 1;
+-}
+-
+ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
+ 			   struct hv_netvsc_packet *packet,
+ 			   struct hv_page_buffer *pb)
+ {
+-	u32 slots_used = 0;
+-	char *data = skb->data;
+ 	int frags = skb_shinfo(skb)->nr_frags;
+ 	int i;
+ 
+@@ -371,28 +338,28 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
+ 	 * 2. skb linear data
+ 	 * 3. skb fragment data
+ 	 */
+-	slots_used += fill_pg_buf(virt_to_hvpfn(hdr),
+-				  offset_in_hvpage(hdr),
+-				  len,
+-				  &pb[slots_used]);
+ 
++	pb[0].offset = offset_in_hvpage(hdr);
++	pb[0].len = len;
++	pb[0].pfn = virt_to_hvpfn(hdr);
+ 	packet->rmsg_size = len;
+-	packet->rmsg_pgcnt = slots_used;
++	packet->rmsg_pgcnt = 1;
+ 
+-	slots_used += fill_pg_buf(virt_to_hvpfn(data),
+-				  offset_in_hvpage(data),
+-				  skb_headlen(skb),
+-				  &pb[slots_used]);
++	pb[1].offset = offset_in_hvpage(skb->data);
++	pb[1].len = skb_headlen(skb);
++	pb[1].pfn = virt_to_hvpfn(skb->data);
+ 
+ 	for (i = 0; i < frags; i++) {
+ 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
++		struct hv_page_buffer *cur_pb = &pb[i + 2];
++		u64 pfn = page_to_hvpfn(skb_frag_page(frag));
++		u32 offset = skb_frag_off(frag);
+ 
+-		slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)),
+-					  skb_frag_off(frag),
+-					  skb_frag_size(frag),
+-					  &pb[slots_used]);
++		cur_pb->offset = offset_in_hvpage(offset);
++		cur_pb->len = skb_frag_size(frag);
++		cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT);
+ 	}
+-	return slots_used;
++	return frags + 2;
+ }
+ 
+ static int count_skb_frag_slots(struct sk_buff *skb)
+@@ -483,7 +450,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
+ 	struct net_device *vf_netdev;
+ 	u32 rndis_msg_size;
+ 	u32 hash;
+-	struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];
++	struct hv_page_buffer pb[MAX_DATA_RANGES];
+ 
+ 	/* If VF is present and up then redirect packets to it.
+ 	 * Skip the VF if it is marked down or has no carrier.
+diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
+--- a/drivers/net/hyperv/rndis_filter.c
++++ b/drivers/net/hyperv/rndis_filter.c
+@@ -225,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev,
+ 				     struct rndis_request *req)
+ {
+ 	struct hv_netvsc_packet *packet;
+-	struct hv_page_buffer page_buf[2];
+-	struct hv_page_buffer *pb = page_buf;
++	struct hv_page_buffer pb;
+ 	int ret;
+ 
+ 	/* Setup the packet to send it */
+@@ -235,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev,
+ 	packet->total_data_buflen = req->request_msg.msg_len;
+ 	packet->page_buf_cnt = 1;
+ 
+-	pb[0].pfn = virt_to_phys(&req->request_msg) >>
+-		HV_HYP_PAGE_SHIFT;
+-	pb[0].len = req->request_msg.msg_len;
+-	pb[0].offset = offset_in_hvpage(&req->request_msg);
+-
+-	/* Add one page_buf when request_msg crossing page boundary */
+-	if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) {
+-		packet->page_buf_cnt++;
+-		pb[0].len = HV_HYP_PAGE_SIZE -
+-			pb[0].offset;
+-		pb[1].pfn = virt_to_phys((void *)&req->request_msg
+-					 + pb[0].len) >> HV_HYP_PAGE_SHIFT;
+-		pb[1].offset = 0;
+-		pb[1].len = req->request_msg.msg_len -
+-			pb[0].len;
+-	}
++	pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT;
++	pb.len = req->request_msg.msg_len;
++	pb.offset = offset_in_hvpage(&req->request_msg);
+ 
+ 	trace_rndis_send(dev->ndev, 0, &req->request_msg);
+ 
+ 	rcu_read_lock_bh();
+-	ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false);
++	ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false);
+ 	rcu_read_unlock_bh();
+ 
+ 	return ret;

diff --git a/series.conf b/series.conf
index 1aeb4d7..66c431f 100644
--- a/series.conf
+++ b/series.conf
@@ -24051,6 +24051,7 @@
 	patches.suse/ocfs2-fix-the-issue-with-discontiguous-allocation-in.patch
 	patches.suse/msft-hv-3238-Drivers-hv-Allow-vmbus_sendpacket_mpb_desc-to-create.patch
 	patches.suse/msft-hv-3239-hv_netvsc-Use-vmbus_sendpacket_mpb_desc-to-send-VMBu.patch
+	patches.suse/msft-hv-3240-hv_netvsc-Preserve-contiguous-PFN-grouping-in-the-pa.patch
 
 	# out-of-tree patches
 	patches.suse/net-dcb-check-for-detached-device-before-executing-c.patch