|
Jan Kara |
4b3242 |
From e386dfc56f837da66d00a078e5314bc8382fab83 Mon Sep 17 00:00:00 2001
|
|
Jan Kara |
4b3242 |
From: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Jan Kara |
4b3242 |
Date: Fri, 10 Dec 2021 14:00:15 -0800
|
|
Jan Kara |
4b3242 |
Subject: [PATCH] fget: clarify and improve __fget_files() implementation
|
|
Jan Kara |
4b3242 |
Git-commit: e386dfc56f837da66d00a078e5314bc8382fab83
|
|
Jan Kara |
4b3242 |
Patch-mainline: v5.16-rc6
|
|
Jan Kara |
4b3242 |
References: bsc#1193727
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
Commit 054aa8d439b9 ("fget: check that the fd still exists after getting
|
|
Jan Kara |
4b3242 |
a ref to it") fixed a race with getting a reference to a file just as it
|
|
Jan Kara |
4b3242 |
was being closed. It was a fairly minimal patch, and I didn't think
|
|
Jan Kara |
4b3242 |
re-checking the file pointer lookup would be a measurable overhead,
|
|
Jan Kara |
4b3242 |
since it was all right there and cached.
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
But I was wrong, as pointed out by the kernel test robot.
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
The 'poll2' case of the will-it-scale.per_thread_ops benchmark regressed
|
|
Jan Kara |
4b3242 |
quite noticeably. Admittedly it seems to be a very artificial test:
|
|
Jan Kara |
4b3242 |
doing "poll()" system calls on regular files in a very tight loop in
|
|
Jan Kara |
4b3242 |
multiple threads.
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
That means that basically all the time is spent just looking up file
|
|
Jan Kara |
4b3242 |
descriptors without ever doing anything useful with them (not that doing
|
|
Jan Kara |
4b3242 |
'poll()' on a regular file is useful to begin with). And as a result it
|
|
Jan Kara |
4b3242 |
shows the extra "re-check fd" cost as a sore thumb.
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
Happily, the regression is fixable by just writing the code to loook up
|
|
Jan Kara |
4b3242 |
the fd to be better and clearer. There's still a cost to verify the
|
|
Jan Kara |
4b3242 |
file pointer, but now it's basically in the noise even for that
|
|
Jan Kara |
4b3242 |
benchmark that does nothing else - and the code is more understandable
|
|
Jan Kara |
4b3242 |
and has better comments too.
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
[ Side note: this patch is also a classic case of one that looks very
|
|
Jan Kara |
4b3242 |
messy with the default greedy Myers diff - it's much more legible with
|
|
Jan Kara |
4b3242 |
either the patience of histogram diff algorithm ]
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
Link: https://lore.kernel.org/lkml/20211210053743.GA36420@xsang-OptiPlex-9020/
|
|
Jan Kara |
4b3242 |
Link: https://lore.kernel.org/lkml/20211213083154.GA20853@linux.intel.com/
|
|
Jan Kara |
4b3242 |
Reported-by: kernel test robot <oliver.sang@intel.com>
|
|
Jan Kara |
4b3242 |
Tested-by: Carel Si <beibei.si@intel.com>
|
|
Jan Kara |
4b3242 |
Cc: Jann Horn <jannh@google.com>
|
|
Jan Kara |
4b3242 |
Cc: Miklos Szeredi <mszeredi@redhat.com>
|
|
Jan Kara |
4b3242 |
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Jan Kara |
4b3242 |
Acked-by: Jan Kara <jack@suse.cz>
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
---
|
|
Jan Kara |
4b3242 |
fs/file.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
|
|
Jan Kara |
4b3242 |
1 file changed, 55 insertions(+), 15 deletions(-)
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
--- a/fs/file.c
|
|
Jan Kara |
4b3242 |
+++ b/fs/file.c
|
|
Jan Kara |
4b3242 |
@@ -706,28 +706,68 @@ void do_close_on_exec(struct files_struc
|
|
Jan Kara |
4b3242 |
spin_unlock(&files->file_lock);
|
|
Jan Kara |
4b3242 |
}
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
-static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
|
|
Jan Kara |
4b3242 |
+static struct file *__fget_rcu(unsigned int fd, fmode_t mask, unsigned int refs)
|
|
Jan Kara |
4b3242 |
{
|
|
Jan Kara |
4b3242 |
struct files_struct *files = current->files;
|
|
Jan Kara |
4b3242 |
- struct file *file;
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
- rcu_read_lock();
|
|
Jan Kara |
4b3242 |
-loop:
|
|
Jan Kara |
4b3242 |
- file = fcheck_files(files, fd);
|
|
Jan Kara |
4b3242 |
- if (file) {
|
|
Jan Kara |
4b3242 |
- /* File object ref couldn't be taken.
|
|
Jan Kara |
4b3242 |
- * dup2() atomicity guarantee is the reason
|
|
Jan Kara |
4b3242 |
- * we loop to catch the new file (or NULL pointer)
|
|
Jan Kara |
4b3242 |
+ for (;;) {
|
|
Jan Kara |
4b3242 |
+ struct file *file;
|
|
Jan Kara |
4b3242 |
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
|
|
Jan Kara |
4b3242 |
+ struct file __rcu **fdentry;
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ if (unlikely(fd >= fdt->max_fds))
|
|
Jan Kara |
4b3242 |
+ return NULL;
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
|
|
Jan Kara |
4b3242 |
+ file = rcu_dereference_raw(*fdentry);
|
|
Jan Kara |
4b3242 |
+ if (unlikely(!file))
|
|
Jan Kara |
4b3242 |
+ return NULL;
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ if (unlikely(file->f_mode & mask))
|
|
Jan Kara |
4b3242 |
+ return NULL;
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ /*
|
|
Jan Kara |
4b3242 |
+ * Ok, we have a file pointer. However, because we do
|
|
Jan Kara |
4b3242 |
+ * this all locklessly under RCU, we may be racing with
|
|
Jan Kara |
4b3242 |
+ * that file being closed.
|
|
Jan Kara |
4b3242 |
+ *
|
|
Jan Kara |
4b3242 |
+ * Such a race can take two forms:
|
|
Jan Kara |
4b3242 |
+ *
|
|
Jan Kara |
4b3242 |
+ * (a) the file ref already went down to zero,
|
|
Jan Kara |
4b3242 |
+ * and get_file_rcu_many() fails. Just try
|
|
Jan Kara |
4b3242 |
+ * again:
|
|
Jan Kara |
4b3242 |
+ */
|
|
Jan Kara |
4b3242 |
+ if (unlikely(!get_file_rcu_many(file, refs)))
|
|
Jan Kara |
4b3242 |
+ continue;
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ /*
|
|
Jan Kara |
4b3242 |
+ * (b) the file table entry has changed under us.
|
|
Jan Kara |
4b3242 |
+ * Note that we don't need to re-check the 'fdt->fd'
|
|
Jan Kara |
4b3242 |
+ * pointer having changed, because it always goes
|
|
Jan Kara |
4b3242 |
+ * hand-in-hand with 'fdt'.
|
|
Jan Kara |
4b3242 |
+ *
|
|
Jan Kara |
4b3242 |
+ * If so, we need to put our refs and try again.
|
|
Jan Kara |
4b3242 |
*/
|
|
Jan Kara |
4b3242 |
- if (file->f_mode & mask)
|
|
Jan Kara |
4b3242 |
- file = NULL;
|
|
Jan Kara |
4b3242 |
- else if (!get_file_rcu_many(file, refs))
|
|
Jan Kara |
4b3242 |
- goto loop;
|
|
Jan Kara |
4b3242 |
- else if (__fcheck_files(files, fd) != file) {
|
|
Jan Kara |
4b3242 |
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
|
|
Jan Kara |
4b3242 |
+ unlikely(rcu_dereference_raw(*fdentry) != file)) {
|
|
Jan Kara |
4b3242 |
fput_many(file, refs);
|
|
Jan Kara |
4b3242 |
- goto loop;
|
|
Jan Kara |
4b3242 |
+ continue;
|
|
Jan Kara |
4b3242 |
}
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ /*
|
|
Jan Kara |
4b3242 |
+ * Ok, we have a ref to the file, and checked that it
|
|
Jan Kara |
4b3242 |
+ * still exists.
|
|
Jan Kara |
4b3242 |
+ */
|
|
Jan Kara |
4b3242 |
+ return file;
|
|
Jan Kara |
4b3242 |
}
|
|
Jan Kara |
4b3242 |
+}
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
|
|
Jan Kara |
4b3242 |
+{
|
|
Jan Kara |
4b3242 |
+ struct file *file;
|
|
Jan Kara |
4b3242 |
+
|
|
Jan Kara |
4b3242 |
+ rcu_read_lock();
|
|
Jan Kara |
4b3242 |
+ file = __fget_rcu(fd, mask, refs);
|
|
Jan Kara |
4b3242 |
rcu_read_unlock();
|
|
Jan Kara |
4b3242 |
|
|
Jan Kara |
4b3242 |
return file;
|