From: Jeff Mahoney Subject: vfs: add super_operations->get_inode_dev Patch-mainline: Never, upstream submissions have been met with a request to use per-subvolume superblocks, which is unworkable in practice References: bsc#927455 There are many places where a dev_t:ino_t pair is passed to userspace to uniquely describe an inode. Some file systems, like btrfs, have multiple inode namespaces internally and use a separate dev_t to make the distinction between them. This patch adds a super_operations->get_inode_dev operation to allow the file system to export those dev_ts to callers within the kernel for further export to userspace. Without this patch, things like audit and some perf and trace events will not distinguish between subvolumes within a single btrfs filesystem. Signed-off-by: Jeff Mahoney --- fs/eventpoll.c | 2 +- fs/iomap/trace.h | 8 ++++---- fs/locks.c | 12 +++++++----- fs/nfsd/nfs3xdr.c | 2 +- fs/nfsd/nfs4state.c | 5 +++-- fs/nfsd/vfs.c | 4 ++-- fs/notify/fdinfo.c | 4 ++-- fs/proc/nommu.c | 2 +- fs/proc/task_mmu.c | 2 +- fs/proc/task_nommu.c | 2 +- fs/proc_namespace.c | 3 ++- fs/stat.c | 2 +- include/linux/fs.h | 9 +++++++++ include/trace/events/filelock.h | 8 ++++---- include/trace/events/filemap.h | 6 +++--- include/trace/events/writeback.h | 2 +- init/do_mounts.c | 2 +- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 6 +++--- kernel/auditsc.c | 6 +++--- kernel/events/core.c | 2 +- mm/memory-failure.c | 2 +- net/unix/diag.c | 2 +- security/tomoyo/condition.c | 2 +- security/tomoyo/realpath.c | 1 + 25 files changed, 56 insertions(+), 42 deletions(-) --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -879,7 +879,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, - inode->i_ino, inode->i_sb->s_dev); + inode->i_ino, inode_get_dev(inode)); if (seq_has_overflowed(m)) break; } --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -33,7 +33,7 @@ 
DECLARE_EVENT_CLASS(iomap_readpage_class, __field(int, nr_pages) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = inode_get_dev(inode); __entry->ino = inode->i_ino; __entry->nr_pages = nr_pages; ), @@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, __field(u64, length) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = inode_get_dev(inode); __entry->ino = inode->i_ino; __entry->size = i_size_read(inode); __entry->offset = off; @@ -121,7 +121,7 @@ DECLARE_EVENT_CLASS(iomap_class, __field(dev_t, bdev) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = inode_get_dev(inode); __entry->ino = inode->i_ino; __entry->addr = iomap->addr; __entry->offset = iomap->offset; @@ -164,7 +164,7 @@ TRACE_EVENT(iomap_iter, __field(unsigned long, caller) ), TP_fast_assign( - __entry->dev = iter->inode->i_sb->s_dev; + __entry->dev = inode_get_dev(iter->inode); __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); --- a/fs/locks.c +++ b/fs/locks.c @@ -215,12 +215,13 @@ static void locks_check_ctx_lists(struct inode *inode) { struct file_lock_context *ctx = inode->i_flctx; + dev_t dev = inode_get_dev(inode); if (unlikely(!list_empty(&ctx->flc_flock) || !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_lease))) { pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), + MAJOR(dev), MINOR(dev), inode->i_ino); locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); locks_dump_ctx_list(&ctx->flc_posix, "POSIX"); @@ -234,13 +235,14 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, { struct file_lock *fl; struct inode *inode = locks_inode(filp); + dev_t dev = inode_get_dev(inode); list_for_each_entry(fl, list, fl_list) if (fl->fl_file == filp) pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", - list_type, MAJOR(inode->i_sb->s_dev), - 
MINOR(inode->i_sb->s_dev), inode->i_ino, + list_type, MAJOR(dev), + MINOR(dev), inode->i_ino, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); } @@ -2761,8 +2763,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (inode) { /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), inode->i_ino); + MAJOR(inode_get_dev(inode)), + MINOR(inode_get_dev(inode)), inode->i_ino); } else { seq_printf(f, "%d :0 ", fl_pid); } --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -372,7 +372,7 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1]; break; default: - fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev); + fsid = (u64)huge_encode_dev(inode_get_dev(d_inode(fhp->fh_dentry))); } p = xdr_encode_hyper(p, fsid); --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2595,10 +2595,11 @@ static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f) static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { struct inode *inode = file_inode(f->nf_file); + dev_t dev = inode_get_dev(inode); seq_printf(s, "superblock: \"%02x:%02x:%ld\"", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), + MAJOR(dev), + MINOR(dev), inode->i_ino); } --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1033,7 +1033,7 @@ static int wait_for_concurrent_writes(struct file *file) int err = 0; if (atomic_read(&inode->i_writecount) > 1 - || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { + || (last_ino == inode->i_ino && last_dev == inode_get_dev(inode))) { dprintk("nfsd: write defer %d\n", task_pid_nr(current)); msleep(10); dprintk("nfsd: write resume %d\n", task_pid_nr(current)); @@ -1044,7 +1044,7 @@ static int wait_for_concurrent_writes(struct file *file) err = vfs_fsync(file, 0); } last_ino = inode->i_ino; - last_dev = inode->i_sb->s_dev; + last_dev = 
inode_get_dev(inode); return err; } --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -85,7 +85,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", - inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, + inode_mark->wd, inode->i_ino, inode_get_dev(inode), inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); @@ -112,7 +112,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", - inode->i_ino, inode->i_sb->s_dev, + inode->i_ino, inode_get_dev(inode), mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -40,7 +40,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { struct inode *inode = file_inode(region->vm_file); - dev = inode->i_sb->s_dev; + dev = inode_get_dev(inode); ino = inode->i_ino; } --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -286,7 +286,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (file) { struct inode *inode = file_inode(vma->vm_file); - dev = inode->i_sb->s_dev; + dev = inode_get_dev(inode); ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -151,7 +151,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { struct inode *inode = file_inode(vma->vm_file); - dev = inode->i_sb->s_dev; + dev = inode_get_dev(inode); ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -137,11 +137,12 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); 
struct super_block *sb = mnt->mnt_sb; + dev_t dev = inode_get_dev(d_inode(mnt->mnt_root)); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, - MAJOR(sb->s_dev), MINOR(sb->s_dev)); + MAJOR(dev), MINOR(dev)); if (sb->s_op->show_path) { err = sb->s_op->show_path(m, mnt->mnt_root); if (err) --- a/fs/stat.c +++ b/fs/stat.c @@ -47,7 +47,7 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - stat->dev = inode->i_sb->s_dev; + stat->dev = inode_get_dev(inode); stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2250,6 +2250,7 @@ struct super_operations { struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); + dev_t (*get_inode_dev)(const struct inode *); }; /* @@ -3610,4 +3611,12 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +static inline dev_t inode_get_dev(const struct inode *inode) +{ + if (inode->i_sb->s_op->get_inode_dev) + return inode->i_sb->s_op->get_inode_dev(inode); + + return inode->i_sb->s_dev; +} + #endif /* _LINUX_FS_H */ --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -48,7 +48,7 @@ TRACE_EVENT(locks_get_lock_context, ), TP_fast_assign( - __entry->s_dev = inode->i_sb->s_dev; + __entry->s_dev = inode_get_dev(inode); __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; @@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(filelock_lock, TP_fast_assign( __entry->fl = fl ? fl : NULL; - __entry->s_dev = inode->i_sb->s_dev; + __entry->s_dev = inode_get_dev(inode); __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? 
fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; @@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(filelock_lease, TP_fast_assign( __entry->fl = fl ? fl : NULL; - __entry->s_dev = inode->i_sb->s_dev; + __entry->s_dev = inode_get_dev(inode); __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; @@ -185,7 +185,7 @@ TRACE_EVENT(generic_add_lease, ), TP_fast_assign( - __entry->s_dev = inode->i_sb->s_dev; + __entry->s_dev = inode_get_dev(inode); __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); --- a/include/trace/events/filemap.h +++ b/include/trace/events/filemap.h @@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, __entry->i_ino = folio->mapping->host->i_ino; __entry->index = folio->index; if (folio->mapping->host->i_sb) - __entry->s_dev = folio->mapping->host->i_sb->s_dev; + __entry->s_dev = inode_get_dev(folio->mapping->host); else __entry->s_dev = folio->mapping->host->i_rdev; __entry->order = folio_order(folio); @@ -71,7 +71,7 @@ TRACE_EVENT(filemap_set_wb_err, __entry->i_ino = mapping->host->i_ino; __entry->errseq = eseq; if (mapping->host->i_sb) - __entry->s_dev = mapping->host->i_sb->s_dev; + __entry->s_dev = inode_get_dev(mapping->host); else __entry->s_dev = mapping->host->i_rdev; ), @@ -99,7 +99,7 @@ TRACE_EVENT(file_check_and_advance_wb_err, __entry->i_ino = file->f_mapping->host->i_ino; if (file->f_mapping->host->i_sb) __entry->s_dev = - file->f_mapping->host->i_sb->s_dev; + inode_get_dev(file->f_mapping->host); else __entry->s_dev = file->f_mapping->host->i_rdev; --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -809,7 +809,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = inode_get_dev(inode); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = 
inode->i_mode; --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -378,7 +378,7 @@ static int __init do_mount_root(const char *name, const char *fs, init_chdir("/root"); s = current->fs->pwd.dentry->d_sb; - ROOT_DEV = s->s_dev; + ROOT_DEV = inode_get_dev(d_inode(current->fs->pwd.dentry)); printk(KERN_INFO "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", s->s_type->name, --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -67,7 +67,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_ static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, const struct inode *inode) { - audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; + audit_mark->dev = inode ? inode_get_dev(inode) : AUDIT_DEV_UNSET; audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; } --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -352,7 +352,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) return PTR_ERR(d); if (d_is_positive(d)) { /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; + watch->dev = inode_get_dev(d_backing_inode(d)); watch->ino = d_backing_inode(d)->i_ino; } inode_unlock(d_backing_inode(parent->dentry)); @@ -477,7 +477,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, return 0; if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + audit_update_watch(parent, dname, inode_get_dev(inode), inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) @@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) if (!exe_file) return 0; ino = file_inode(exe_file)->i_ino; - dev = file_inode(exe_file)->i_sb->s_dev; + dev = inode_get_dev(file_inode(exe_file)); fput(exe_file); return audit_mark_compare(mark, 
ino, dev); } --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2272,7 +2272,7 @@ static void audit_copy_inode(struct audit_names *name, struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; + name->dev = inode_get_dev(inode); name->mode = inode->i_mode; name->uid = inode->i_uid; name->gid = inode->i_gid; @@ -2344,7 +2344,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, if (n->ino) { /* valid inode number, use that for the comparison */ if (n->ino != inode->i_ino || - n->dev != inode->i_sb->s_dev) + n->dev != inode_get_dev(inode)) continue; } else if (n->name) { /* inode number has not been set, check the name */ @@ -2449,7 +2449,7 @@ void __audit_inode_child(struct inode *parent, n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && + if (n->ino == parent->i_ino && n->dev == inode_get_dev(parent) && !audit_compare_dname_path(dname, n->name->name, n->name_len)) { if (n->type == AUDIT_TYPE_UNKNOWN) --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8654,7 +8654,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto cpy_name; } inode = file_inode(vma->vm_file); - dev = inode->i_sb->s_dev; + dev = inode_get_dev(inode); ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -159,7 +159,7 @@ static int hwpoison_filter_dev(struct page *p) if (mapping == NULL || mapping->host == NULL) return -EINVAL; - dev = mapping->host->i_sb->s_dev; + dev = inode_get_dev(mapping->host); if (hwpoison_filter_dev_major != ~0U && hwpoison_filter_dev_major != MAJOR(dev)) return -EINVAL; --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -31,7 +31,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) if (dentry) { struct unix_diag_vfs uv = { .udiag_vfs_ino = d_backing_inode(dentry)->i_ino, - .udiag_vfs_dev = dentry->d_sb->s_dev, + .udiag_vfs_dev = 
inode_get_dev(d_backing_inode(dentry)), }; return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv); --- a/security/tomoyo/condition.c +++ b/security/tomoyo/condition.c @@ -743,7 +743,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; - stat->dev = inode->i_sb->s_dev; + stat->dev = inode_get_dev(inode); stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -174,6 +174,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ + /* NOTE: This will fail with the btrfs get_inode_dev patches */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; {