Jan Kara 4b3242
From e386dfc56f837da66d00a078e5314bc8382fab83 Mon Sep 17 00:00:00 2001
Jan Kara 4b3242
From: Linus Torvalds <torvalds@linux-foundation.org>
Jan Kara 4b3242
Date: Fri, 10 Dec 2021 14:00:15 -0800
Jan Kara 4b3242
Subject: [PATCH] fget: clarify and improve __fget_files() implementation
Jan Kara 4b3242
Git-commit: e386dfc56f837da66d00a078e5314bc8382fab83
Jan Kara 4b3242
Patch-mainline: v5.16-rc6
Jan Kara 4b3242
References: bsc#1193727
Jan Kara 4b3242
Jan Kara 4b3242
Commit 054aa8d439b9 ("fget: check that the fd still exists after getting
Jan Kara 4b3242
a ref to it") fixed a race with getting a reference to a file just as it
Jan Kara 4b3242
was being closed.  It was a fairly minimal patch, and I didn't think
Jan Kara 4b3242
re-checking the file pointer lookup would be a measurable overhead,
Jan Kara 4b3242
since it was all right there and cached.
Jan Kara 4b3242
Jan Kara 4b3242
But I was wrong, as pointed out by the kernel test robot.
Jan Kara 4b3242
Jan Kara 4b3242
The 'poll2' case of the will-it-scale.per_thread_ops benchmark regressed
Jan Kara 4b3242
quite noticeably.  Admittedly it seems to be a very artificial test:
Jan Kara 4b3242
doing "poll()" system calls on regular files in a very tight loop in
Jan Kara 4b3242
multiple threads.
Jan Kara 4b3242
Jan Kara 4b3242
That means that basically all the time is spent just looking up file
Jan Kara 4b3242
descriptors without ever doing anything useful with them (not that doing
Jan Kara 4b3242
'poll()' on a regular file is useful to begin with).  And as a result it
Jan Kara 4b3242
shows the extra "re-check fd" cost as a sore thumb.
Jan Kara 4b3242
Jan Kara 4b3242
Happily, the regression is fixable by just writing the code to loook up
Jan Kara 4b3242
the fd to be better and clearer.  There's still a cost to verify the
Jan Kara 4b3242
file pointer, but now it's basically in the noise even for that
Jan Kara 4b3242
benchmark that does nothing else - and the code is more understandable
Jan Kara 4b3242
and has better comments too.
Jan Kara 4b3242
Jan Kara 4b3242
[ Side note: this patch is also a classic case of one that looks very
Jan Kara 4b3242
  messy with the default greedy Myers diff - it's much more legible with
Jan Kara 4b3242
  either the patience of histogram diff algorithm ]
Jan Kara 4b3242
Jan Kara 4b3242
Link: https://lore.kernel.org/lkml/20211210053743.GA36420@xsang-OptiPlex-9020/
Jan Kara 4b3242
Link: https://lore.kernel.org/lkml/20211213083154.GA20853@linux.intel.com/
Jan Kara 4b3242
Reported-by: kernel test robot <oliver.sang@intel.com>
Jan Kara 4b3242
Tested-by: Carel Si <beibei.si@intel.com>
Jan Kara 4b3242
Cc: Jann Horn <jannh@google.com>
Jan Kara 4b3242
Cc: Miklos Szeredi <mszeredi@redhat.com>
Jan Kara 4b3242
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Jan Kara 4b3242
Acked-by: Jan Kara <jack@suse.cz>
Jan Kara 4b3242
Jan Kara 4b3242
---
Jan Kara 4b3242
 fs/file.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
Jan Kara 4b3242
 1 file changed, 55 insertions(+), 15 deletions(-)
Jan Kara 4b3242
Jan Kara 4b3242
--- a/fs/file.c
Jan Kara 4b3242
+++ b/fs/file.c
Jan Kara 4b3242
@@ -706,28 +706,68 @@ void do_close_on_exec(struct files_struc
Jan Kara 4b3242
 	spin_unlock(&files->file_lock);
Jan Kara 4b3242
 }
Jan Kara 4b3242
 
Jan Kara 4b3242
-static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
Jan Kara 4b3242
+static struct file *__fget_rcu(unsigned int fd, fmode_t mask, unsigned int refs)
Jan Kara 4b3242
 {
Jan Kara 4b3242
 	struct files_struct *files = current->files;
Jan Kara 4b3242
-	struct file *file;
Jan Kara 4b3242
 
Jan Kara 4b3242
-	rcu_read_lock();
Jan Kara 4b3242
-loop:
Jan Kara 4b3242
-	file = fcheck_files(files, fd);
Jan Kara 4b3242
-	if (file) {
Jan Kara 4b3242
-		/* File object ref couldn't be taken.
Jan Kara 4b3242
-		 * dup2() atomicity guarantee is the reason
Jan Kara 4b3242
-		 * we loop to catch the new file (or NULL pointer)
Jan Kara 4b3242
+	for (;;) {
Jan Kara 4b3242
+		struct file *file;
Jan Kara 4b3242
+		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
Jan Kara 4b3242
+		struct file __rcu **fdentry;
Jan Kara 4b3242
+
Jan Kara 4b3242
+		if (unlikely(fd >= fdt->max_fds))
Jan Kara 4b3242
+			return NULL;
Jan Kara 4b3242
+
Jan Kara 4b3242
+		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
Jan Kara 4b3242
+		file = rcu_dereference_raw(*fdentry);
Jan Kara 4b3242
+		if (unlikely(!file))
Jan Kara 4b3242
+			return NULL;
Jan Kara 4b3242
+
Jan Kara 4b3242
+		if (unlikely(file->f_mode & mask))
Jan Kara 4b3242
+			return NULL;
Jan Kara 4b3242
+
Jan Kara 4b3242
+		/*
Jan Kara 4b3242
+		 * Ok, we have a file pointer. However, because we do
Jan Kara 4b3242
+		 * this all locklessly under RCU, we may be racing with
Jan Kara 4b3242
+		 * that file being closed.
Jan Kara 4b3242
+		 *
Jan Kara 4b3242
+		 * Such a race can take two forms:
Jan Kara 4b3242
+		 *
Jan Kara 4b3242
+		 *  (a) the file ref already went down to zero,
Jan Kara 4b3242
+		 *      and get_file_rcu_many() fails. Just try
Jan Kara 4b3242
+		 *      again:
Jan Kara 4b3242
+		 */
Jan Kara 4b3242
+		if (unlikely(!get_file_rcu_many(file, refs)))
Jan Kara 4b3242
+			continue;
Jan Kara 4b3242
+
Jan Kara 4b3242
+		/*
Jan Kara 4b3242
+		 *  (b) the file table entry has changed under us.
Jan Kara 4b3242
+		 *       Note that we don't need to re-check the 'fdt->fd'
Jan Kara 4b3242
+		 *       pointer having changed, because it always goes
Jan Kara 4b3242
+		 *       hand-in-hand with 'fdt'.
Jan Kara 4b3242
+		 *
Jan Kara 4b3242
+		 * If so, we need to put our refs and try again.
Jan Kara 4b3242
 		 */
Jan Kara 4b3242
-		if (file->f_mode & mask)
Jan Kara 4b3242
-			file = NULL;
Jan Kara 4b3242
-		else if (!get_file_rcu_many(file, refs))
Jan Kara 4b3242
-			goto loop;
Jan Kara 4b3242
-		else if (__fcheck_files(files, fd) != file) {
Jan Kara 4b3242
+		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
Jan Kara 4b3242
+		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
Jan Kara 4b3242
 			fput_many(file, refs);
Jan Kara 4b3242
-			goto loop;
Jan Kara 4b3242
+			continue;
Jan Kara 4b3242
 		}
Jan Kara 4b3242
+
Jan Kara 4b3242
+		/*
Jan Kara 4b3242
+		 * Ok, we have a ref to the file, and checked that it
Jan Kara 4b3242
+		 * still exists.
Jan Kara 4b3242
+		 */
Jan Kara 4b3242
+		return file;
Jan Kara 4b3242
 	}
Jan Kara 4b3242
+}
Jan Kara 4b3242
+
Jan Kara 4b3242
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
Jan Kara 4b3242
+{
Jan Kara 4b3242
+	struct file *file;
Jan Kara 4b3242
+
Jan Kara 4b3242
+	rcu_read_lock();
Jan Kara 4b3242
+	file = __fget_rcu(fd, mask, refs);
Jan Kara 4b3242
 	rcu_read_unlock();
Jan Kara 4b3242
 
Jan Kara 4b3242
 	return file;