Blob Blame History Raw
From: Jeff Mahoney <jeffm@suse.com>
Subject: vfs: add super_operations->get_inode_dev
Patch-mainline: Never, upstream submissions have been met with a request to use per-subvolume superblocks, which is unworkable in practice
References: bsc#927455

There are many places where a dev_t:ino_t pair are passed to userspace
to uniquely describe an inode.  Some file systems, like btrfs, have
multiple inode namespace internally and use a separate dev_t to make the
distinction between them.

This patch adds a super_operations->get_inode_dev operation to allow
the file system to export those dev_ts to callers within the kernel
for further export to userspace.

Without this patch, things like audit and some perf and trace events
will not distinguish between subvolumes within a single btrfs filesystem.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/eventpoll.c                   |  2 +-
 fs/iomap/trace.h                 |  8 ++++----
 fs/locks.c                       | 12 +++++++-----
 fs/nfsd/nfs3xdr.c                |  2 +-
 fs/nfsd/nfs4state.c              |  5 +++--
 fs/nfsd/vfs.c                    |  4 ++--
 fs/notify/fdinfo.c               |  4 ++--
 fs/proc/nommu.c                  |  2 +-
 fs/proc/task_mmu.c               |  2 +-
 fs/proc/task_nommu.c             |  2 +-
 fs/proc_namespace.c              |  3 ++-
 fs/stat.c                        |  2 +-
 include/linux/fs.h               |  9 +++++++++
 include/trace/events/filelock.h  |  8 ++++----
 include/trace/events/filemap.h   |  6 +++---
 include/trace/events/writeback.h |  2 +-
 init/do_mounts.c                 |  2 +-
 kernel/audit_fsnotify.c          |  2 +-
 kernel/audit_watch.c             |  6 +++---
 kernel/auditsc.c                 |  6 +++---
 kernel/events/core.c             |  2 +-
 mm/memory-failure.c              |  2 +-
 net/unix/diag.c                  |  2 +-
 security/tomoyo/condition.c      |  2 +-
 security/tomoyo/realpath.c       |  1 +
 25 files changed, 56 insertions(+), 42 deletions(-)

--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -879,7 +879,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f)
 			   epi->ffd.fd, epi->event.events,
 			   (long long)epi->event.data,
 			   (long long)epi->ffd.file->f_pos,
-			   inode->i_ino, inode->i_sb->s_dev);
+			   inode->i_ino, inode_get_dev(inode));
 		if (seq_has_overflowed(m))
 			break;
 	}
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(iomap_readpage_class,
 		__field(int, nr_pages)
 	),
 	TP_fast_assign(
-		__entry->dev = inode->i_sb->s_dev;
+		__entry->dev = inode_get_dev(inode);
 		__entry->ino = inode->i_ino;
 		__entry->nr_pages = nr_pages;
 	),
@@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(iomap_range_class,
 		__field(u64, length)
 	),
 	TP_fast_assign(
-		__entry->dev = inode->i_sb->s_dev;
+		__entry->dev = inode_get_dev(inode);
 		__entry->ino = inode->i_ino;
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
@@ -121,7 +121,7 @@ DECLARE_EVENT_CLASS(iomap_class,
 		__field(dev_t, bdev)
 	),
 	TP_fast_assign(
-		__entry->dev = inode->i_sb->s_dev;
+		__entry->dev = inode_get_dev(inode);
 		__entry->ino = inode->i_ino;
 		__entry->addr = iomap->addr;
 		__entry->offset = iomap->offset;
@@ -164,7 +164,7 @@ TRACE_EVENT(iomap_iter,
 		__field(unsigned long, caller)
 	),
 	TP_fast_assign(
-		__entry->dev = iter->inode->i_sb->s_dev;
+		__entry->dev = inode_get_dev(iter->inode);
 		__entry->ino = iter->inode->i_ino;
 		__entry->pos = iter->pos;
 		__entry->length = iomap_length(iter);
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -216,12 +216,13 @@ static void
 locks_check_ctx_lists(struct inode *inode)
 {
 	struct file_lock_context *ctx = inode->i_flctx;
+	dev_t dev = inode_get_dev(inode);
 
 	if (unlikely(!list_empty(&ctx->flc_flock) ||
 		     !list_empty(&ctx->flc_posix) ||
 		     !list_empty(&ctx->flc_lease))) {
 		pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
-			MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+			MAJOR(dev), MINOR(dev),
 			inode->i_ino);
 		locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
 		locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
@@ -235,13 +236,14 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list,
 {
 	struct file_lock *fl;
 	struct inode *inode = file_inode(filp);
+	dev_t dev = inode_get_dev(inode);
 
 	list_for_each_entry(fl, list, fl_list)
 		if (fl->fl_file == filp)
 			pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
 				" fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
-				list_type, MAJOR(inode->i_sb->s_dev),
-				MINOR(inode->i_sb->s_dev), inode->i_ino,
+				list_type, MAJOR(dev),
+				MINOR(dev), inode->i_ino,
 				fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
 }
 
@@ -2737,8 +2739,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	if (inode) {
 		/* userspace relies on this representation of dev_t */
 		seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
-				MAJOR(inode->i_sb->s_dev),
-				MINOR(inode->i_sb->s_dev), inode->i_ino);
+				MAJOR(inode_get_dev(inode)),
+				MINOR(inode_get_dev(inode)), inode->i_ino);
 	} else {
 		seq_printf(f, "%d <none>:0 ", fl_pid);
 	}
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -372,7 +372,7 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1];
 		break;
 	default:
-		fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev);
+		fsid = (u64)huge_encode_dev(inode_get_dev(d_inode(fhp->fh_dentry)));
 	}
 	p = xdr_encode_hyper(p, fsid);
 
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2579,10 +2579,11 @@ static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f)
 static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
 {
 	struct inode *inode = file_inode(f->nf_file);
+	dev_t dev = inode_get_dev(inode);
 
 	seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
-					MAJOR(inode->i_sb->s_dev),
-					 MINOR(inode->i_sb->s_dev),
+					MAJOR(dev),
+					 MINOR(dev),
 					 inode->i_ino);
 }
 
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1037,7 +1037,7 @@ static int wait_for_concurrent_writes(struct file *file)
 	int err = 0;
 
 	if (atomic_read(&inode->i_writecount) > 1
-	    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+	    || (last_ino == inode->i_ino && last_dev == inode_get_dev(inode))) {
 		dprintk("nfsd: write defer %d\n", task_pid_nr(current));
 		msleep(10);
 		dprintk("nfsd: write resume %d\n", task_pid_nr(current));
@@ -1048,7 +1048,7 @@ static int wait_for_concurrent_writes(struct file *file)
 		err = vfs_fsync(file, 0);
 	}
 	last_ino = inode->i_ino;
-	last_dev = inode->i_sb->s_dev;
+	last_dev = inode_get_dev(inode);
 	return err;
 }
 
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -85,7 +85,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	inode = igrab(fsnotify_conn_inode(mark->connector));
 	if (inode) {
 		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
-			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
+			   inode_mark->wd, inode->i_ino, inode_get_dev(inode),
 			   inotify_mark_user_mask(mark));
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
@@ -112,7 +112,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 		if (!inode)
 			return;
 		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
-			   inode->i_ino, inode->i_sb->s_dev,
+			   inode->i_ino, inode_get_dev(inode),
 			   mflags, mark->mask, mark->ignore_mask);
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -40,7 +40,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		struct inode *inode = file_inode(region->vm_file);
-		dev = inode->i_sb->s_dev;
+		dev = inode_get_dev(inode);
 		ino = inode->i_ino;
 	}
 
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -286,7 +286,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 
 	if (file) {
 		struct inode *inode = file_inode(vma->vm_file);
-		dev = inode->i_sb->s_dev;
+		dev = inode_get_dev(inode);
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -151,7 +151,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 
 	if (file) {
 		struct inode *inode = file_inode(vma->vm_file);
-		dev = inode->i_sb->s_dev;
+		dev = inode_get_dev(inode);
 		ino = inode->i_ino;
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	}
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -137,11 +137,12 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
 	struct super_block *sb = mnt->mnt_sb;
+	dev_t dev = inode_get_dev(d_inode(mnt->mnt_root));
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
 	int err;
 
 	seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
-		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
+		   MAJOR(dev), MINOR(dev));
 	if (sb->s_op->show_path) {
 		err = sb->s_op->show_path(m, mnt->mnt_root);
 		if (err)
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -48,7 +48,7 @@ void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode,
 	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
 	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
 
-	stat->dev = inode->i_sb->s_dev;
+	stat->dev = inode_get_dev(inode);
 	stat->ino = inode->i_ino;
 	stat->mode = inode->i_mode;
 	stat->nlink = inode->i_nlink;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1915,6 +1915,7 @@ struct super_operations {
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
 				    struct shrink_control *);
+	dev_t (*get_inode_dev)(const struct inode *);
 };
 
 /*
@@ -3186,4 +3187,12 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 			   int advice);
 
+static inline dev_t inode_get_dev(const struct inode *inode)
+{
+	if (inode->i_sb->s_op->get_inode_dev)
+		return inode->i_sb->s_op->get_inode_dev(inode);
+
+	return inode->i_sb->s_dev;
+}
+
 #endif /* _LINUX_FS_H */
--- a/include/trace/events/filelock.h
+++ b/include/trace/events/filelock.h
@@ -48,7 +48,7 @@ TRACE_EVENT(locks_get_lock_context,
 	),
 
 	TP_fast_assign(
-		__entry->s_dev = inode->i_sb->s_dev;
+		__entry->s_dev = inode_get_dev(inode);
 		__entry->i_ino = inode->i_ino;
 		__entry->type = type;
 		__entry->ctx = ctx;
@@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(filelock_lock,
 
 	TP_fast_assign(
 		__entry->fl = fl ? fl : NULL;
-		__entry->s_dev = inode->i_sb->s_dev;
+		__entry->s_dev = inode_get_dev(inode);
 		__entry->i_ino = inode->i_ino;
 		__entry->fl_blocker = fl ? fl->fl_blocker : NULL;
 		__entry->fl_owner = fl ? fl->fl_owner : NULL;
@@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(filelock_lease,
 
 	TP_fast_assign(
 		__entry->fl = fl ? fl : NULL;
-		__entry->s_dev = inode->i_sb->s_dev;
+		__entry->s_dev = inode_get_dev(inode);
 		__entry->i_ino = inode->i_ino;
 		__entry->fl_blocker = fl ? fl->fl_blocker : NULL;
 		__entry->fl_owner = fl ? fl->fl_owner : NULL;
@@ -185,7 +185,7 @@ TRACE_EVENT(generic_add_lease,
 	),
 
 	TP_fast_assign(
-		__entry->s_dev = inode->i_sb->s_dev;
+		__entry->s_dev = inode_get_dev(inode);
 		__entry->i_ino = inode->i_ino;
 		__entry->wcount = atomic_read(&inode->i_writecount);
 		__entry->rcount = atomic_read(&inode->i_readcount);
--- a/include/trace/events/filemap.h
+++ b/include/trace/events/filemap.h
@@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,
 		__entry->i_ino = folio->mapping->host->i_ino;
 		__entry->index = folio->index;
 		if (folio->mapping->host->i_sb)
-			__entry->s_dev = folio->mapping->host->i_sb->s_dev;
+			__entry->s_dev = inode_get_dev(folio->mapping->host);
 		else
 			__entry->s_dev = folio->mapping->host->i_rdev;
 		__entry->order = folio_order(folio);
@@ -71,7 +71,7 @@ TRACE_EVENT(filemap_set_wb_err,
 			__entry->i_ino = mapping->host->i_ino;
 			__entry->errseq = eseq;
 			if (mapping->host->i_sb)
-				__entry->s_dev = mapping->host->i_sb->s_dev;
+				__entry->s_dev = inode_get_dev(mapping->host);
 			else
 				__entry->s_dev = mapping->host->i_rdev;
 		),
@@ -99,7 +99,7 @@ TRACE_EVENT(file_check_and_advance_wb_err,
 			__entry->i_ino = file->f_mapping->host->i_ino;
 			if (file->f_mapping->host->i_sb)
 				__entry->s_dev =
-					file->f_mapping->host->i_sb->s_dev;
+					inode_get_dev(file->f_mapping->host);
 			else
 				__entry->s_dev =
 					file->f_mapping->host->i_rdev;
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -809,7 +809,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template,
 	),
 
 	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
+		__entry->dev	= inode_get_dev(inode);
 		__entry->ino	= inode->i_ino;
 		__entry->state	= inode->i_state;
 		__entry->mode	= inode->i_mode;
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -378,7 +378,7 @@ static int __init do_mount_root(const char *name, const char *fs,
 
 	init_chdir("/root");
 	s = current->fs->pwd.dentry->d_sb;
-	ROOT_DEV = s->s_dev;
+	ROOT_DEV = inode_get_dev(d_inode(current->fs->pwd.dentry));
 	printk(KERN_INFO
 	       "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
 	       s->s_type->name,
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -67,7 +67,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_
 static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
 			     const struct inode *inode)
 {
-	audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
+	audit_mark->dev = inode ? inode_get_dev(inode) : AUDIT_DEV_UNSET;
 	audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
 }
 
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -352,7 +352,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 		return PTR_ERR(d);
 	if (d_is_positive(d)) {
 		/* update watch filter fields */
-		watch->dev = d->d_sb->s_dev;
+		watch->dev = inode_get_dev(d_backing_inode(d));
 		watch->ino = d_backing_inode(d)->i_ino;
 	}
 	inode_unlock(d_backing_inode(parent->dentry));
@@ -477,7 +477,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
 		return 0;
 
 	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
-		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+		audit_update_watch(parent, dname, inode_get_dev(inode), inode->i_ino, 0);
 	else if (mask & (FS_DELETE|FS_MOVED_FROM))
 		audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
 	else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
@@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	if (!exe_file)
 		return 0;
 	ino = file_inode(exe_file)->i_ino;
-	dev = file_inode(exe_file)->i_sb->s_dev;
+	dev = inode_get_dev(file_inode(exe_file));
 	fput(exe_file);
 	return audit_mark_compare(mark, ino, dev);
 }
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2273,7 +2273,7 @@ static void audit_copy_inode(struct audit_names *name,
 			     struct inode *inode, unsigned int flags)
 {
 	name->ino   = inode->i_ino;
-	name->dev   = inode->i_sb->s_dev;
+	name->dev   = inode_get_dev(inode);
 	name->mode  = inode->i_mode;
 	name->uid   = inode->i_uid;
 	name->gid   = inode->i_gid;
@@ -2345,7 +2345,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 		if (n->ino) {
 			/* valid inode number, use that for the comparison */
 			if (n->ino != inode->i_ino ||
-			    n->dev != inode->i_sb->s_dev)
+			    n->dev != inode_get_dev(inode))
 				continue;
 		} else if (n->name) {
 			/* inode number has not been set, check the name */
@@ -2450,7 +2450,7 @@ void __audit_inode_child(struct inode *parent,
 		     n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
-		if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+		if (n->ino == parent->i_ino && n->dev == inode_get_dev(parent) &&
 		    !audit_compare_dname_path(dname,
 					      n->name->name, n->name_len)) {
 			if (n->type == AUDIT_TYPE_UNKNOWN)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8666,7 +8666,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 			goto cpy_name;
 		}
 		inode = file_inode(vma->vm_file);
-		dev = inode->i_sb->s_dev;
+		dev = inode_get_dev(inode);
 		ino = inode->i_ino;
 		gen = inode->i_generation;
 		maj = MAJOR(dev);
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -194,7 +194,7 @@ static int hwpoison_filter_dev(struct page *p)
 	if (mapping == NULL || mapping->host == NULL)
 		return -EINVAL;
 
-	dev = mapping->host->i_sb->s_dev;
+	dev = inode_get_dev(mapping->host);
 	if (hwpoison_filter_dev_major != ~0U &&
 	    hwpoison_filter_dev_major != MAJOR(dev))
 		return -EINVAL;
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -31,7 +31,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
 	if (dentry) {
 		struct unix_diag_vfs uv = {
 			.udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
-			.udiag_vfs_dev = dentry->d_sb->s_dev,
+			.udiag_vfs_dev = inode_get_dev(d_backing_inode(dentry)),
 		};
 
 		return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv);
--- a/security/tomoyo/condition.c
+++ b/security/tomoyo/condition.c
@@ -743,7 +743,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
 			stat->gid  = inode->i_gid;
 			stat->ino  = inode->i_ino;
 			stat->mode = inode->i_mode;
-			stat->dev  = inode->i_sb->s_dev;
+			stat->dev  = inode_get_dev(inode);
 			stat->rdev = inode->i_rdev;
 			obj->stat_valid[i] = true;
 		}
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -174,6 +174,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
 		goto prepend_filesystem_name;
 	}
 	/* Use filesystem name for unnamed devices. */
+	/* NOTE: This will fail with the btrfs get_inode_dev patches */
 	if (!MAJOR(sb->s_dev))
 		goto prepend_filesystem_name;
 	{