Blob Blame History Raw
From 723f449add198ece9539cd1693e36a99fc9ec527 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 26 Jun 2018 13:51:20 -0500
Subject: [PATCH] procfs: add tunable for fd/fdinfo dentry retention
Patch-mainline: Never, should be https://lkml.org/lkml/2018/4/23/1148
References: bsc#1086652


With a test workload of 4096 threads and 16384 files opened, a simple
find /proc can create about 268 million dentry/inode pairs just to
handle the "fd" and "fdinfo" files.  This is due to every thread
in the /proc/tgid/task directory having its own copy of fd and fdinfo.

On smaller systems, we never have 268M dentry/inode pairs in memory
since memory pressure forces most of them to be dropped.  That same
pressure will force writeback and real file system dentries and inodes
to be dropped as well.  On large memory systems with many CPU cores,
the full 268M dentry/inode pairs can fit into memory at once and
when a large multithreaded task exits, we can hit soft lockups or
the system can become otherwise unresponsive for minutes at a time.

The right fix for this is to eliminate all the duplicate entries but
that is a change that needs discussion.  For now, we can provide a
workaround that limits the exposure to the largest contingent of
proc files: fd and fdinfo.

Usually, we'll only drop procfs dentries immediately when the task
has exited.  This patch adds a sysctl, procfs-drop-fd-dentries, that
deletes the dentries for fd and fdinfo directories immediately, which
also releases the inodes.  The result is that the count never climbs
above a few hundred thousand procfs files and large multithreaded
tasks can exit without causing undue load on the system.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 Documentation/sysctl/fs.txt | 36 ++++++++++++++++++++++++++++++++++++
 fs/proc/fd.c                | 13 ++++++++++++-
 include/linux/proc_fs.h     |  2 ++
 kernel/sysctl.c             |  9 +++++++++
 4 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 35e17f748ca7..01274048df7f 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -272,6 +272,42 @@ in a mount namespace.
 
 ==============================================================
 
+procfs-drop-fd-dentries:
+
+* SUSE-specific; This option may be removed in a future release.
+
+This option controls when the proc files representing a task's
+opene files are removed.  It applies to the following directories:
+- /proc/pid/fd
+- /proc/pid/fdinfo
+- /proc/pid/task/*/fd
+- /proc/pid/task/*/fdinfo
+
+By default, dentries belonging to tasks that are still running
+will be retained and those belonging to exited tasks will be
+dropped immediately.
+
+This policy ensures that memory is not wasted, but can run into
+scalability issues on very large systems when a task with thousands
+of threads and many open files exits.  When many tasks exit
+simultaneously, substantial contention on the global inode spinlock
+may result in suboptimal performance of the system until the inodes
+are released.
+
+When set to "0" (default), the policy is to retain dentries for running
+tasks and delete dentries from tasks which have exited immediately.  Once
+the dentry is released, the inode will be freed immediately.
+
+When set to "1", the policy is to delete the dentries immediately after
+the last reference is dropped.  Once the dentry is released, the inode
+will be freed immediately.  This ensures that the thread which created
+the inodes will also clean them up, eliminating much of the lock
+contention.  The tradeoff is that frequent use of fd/fdinfo will be
+slower as these files will need to be recreated each time they
+are accessed.
+
+==============================================================
+
 
 2. /proc/sys/fs/binfmt_misc
 ----------------------------------------------------------
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index c330495c3115..ffc8bdd63a11 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -130,9 +130,20 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 	return 0;
 }
 
+int procfs_drop_fd_dentries = 0;
+
+static int tid_fd_delete_dentry(const struct dentry *dentry)
+{
+	/* Always delete immediately */
+	if (procfs_drop_fd_dentries)
+		return 1;
+
+	return pid_delete_dentry(dentry);
+}
+
 static const struct dentry_operations tid_fd_dentry_operations = {
 	.d_revalidate	= tid_fd_revalidate,
-	.d_delete	= pid_delete_dentry,
+	.d_delete	= tid_fd_delete_dentry,
 };
 
 static int proc_fd_link(struct dentry *dentry, struct path *path)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 2d2bf592d9db..44c3687fff5a 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -88,4 +88,6 @@ struct ns_common;
 int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns));
 
+extern int procfs_drop_fd_dentries;
+
 #endif /* _LINUX_PROC_FS_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1cd2d943b9b5..98caf74882ac 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1864,6 +1864,15 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "procfs-drop-fd-dentries",
+		.data		= &procfs_drop_fd_dentries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };
 
-- 
2.16.4