From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 20 Nov 2018 15:44:00 +0100
Subject: libceph: use MSG_SENDPAGE_NOTLAST with ceph_tcp_sendpage()
Git-commit: 433b0a12953bc1dfcb52febb186136395a65aad0
Patch-mainline: v5.0-rc1
References: bsc#1122215
Prevent do_tcp_sendpages() from calling tcp_push() (at least) once per
page. Instead, arrange for tcp_push() to be called (at least) once per
data payload. This results in more MSS-sized packets and fewer packets
overall (5-10% reduction in my tests with typical OSD request sizes).
See commits 2f5338442425 ("tcp: allow splice() to build full TSO
packets"), 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push()
once") and ae62ca7b0321 ("tcp: fix MSG_SENDPAGE_NOTLAST logic") for
details.
Here is an example of a packet size histogram for 128K OSD requests
(MSS = 1448, top 5):
Before:
SIZE COUNT
1448 777700
952 127915
1200 39238
1219 9806
21 5675
After:
SIZE COUNT
1448 897280
21 6201
1019 2797
643 2739
376 2479
We could do slightly better by explicitly corking the socket but it's
not clear it's worth it.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Acked-by: Luis Henriques <lhenriques@suse.com>
---
net/ceph/messenger.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -572,12 +572,15 @@ static int ceph_tcp_sendmsg(struct socke
return r;
}
+/*
+ * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ */
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, bool more)
+ int offset, size_t size, int more)
{
ssize_t (*sendpage)(struct socket *sock, struct page *page,
int offset, size_t size, int flags);
- int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0);
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
int ret;
/*
@@ -1571,6 +1574,7 @@ static int write_partial_message_data(st
struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
u32 crc;
dout("%s %p msg %p\n", __func__, con, msg);
@@ -1599,8 +1603,10 @@ static int write_partial_message_data(st
}
page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+ if (length == cursor->total_resid)
+ more = MSG_MORE;
ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
- true);
+ more);
if (ret <= 0) {
if (do_datacrc)
msg->footer.data_crc = cpu_to_le32(crc);
@@ -1630,13 +1636,16 @@ static int write_partial_message_data(st
*/
static int write_partial_skip(struct ceph_connection *con)
{
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
int ret;
dout("%s %p %d left\n", __func__, con, con->out_skip);
while (con->out_skip > 0) {
size_t size = min(con->out_skip, (int) PAGE_SIZE);
- ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
+ if (size == con->out_skip)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
if (ret <= 0)
goto out;
con->out_skip -= ret;