Blob Blame History Raw
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:51 -0700
Subject: bpf: btf: Introduce BTF ID
Patch-mainline: v4.18-rc1
Git-commit: 78958fca7ead2f81b60a6827881c4866d1ed0c52
References: bsc#1109837

This patch gives an ID to each loaded BTF.  The ID is allocated by
the idr like the existing prog-id and map-id.

The bpf_put(map->btf) is moved to __bpf_map_put() so that the
userspace can stop seeing the BTF ID ASAP when the last BTF
refcnt is gone.

It also makes BTF accessible from userspace through the
1. new BPF_BTF_GET_FD_BY_ID command.  It is limited to CAP_SYS_ADMIN
   which is inline with the BPF_BTF_LOAD cmd and the existing
   BPF_[MAP|PROG]_GET_FD_BY_ID cmd.
2. new btf_id (and btf_key_id + btf_value_id) in "struct bpf_map_info"

Once the BTF ID handler is accessible from userspace, freeing a BTF
object has to go through a rcu period.  The BPF_BTF_GET_FD_BY_ID cmd
can then be done under a rcu_read_lock() instead of taking
spin_lock.
[Note: A similar rcu usage can be done to the existing
       bpf_prog_get_fd_by_id() in a follow up patch]

When processing the BPF_BTF_GET_FD_BY_ID cmd,
refcount_inc_not_zero() is needed because the BTF object
could be already in the rcu dead row .  btf_get() is
removed since its usage is currently limited to btf.c
alone.  refcount_inc() is used directly instead.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 include/linux/btf.h      |    2 
 include/uapi/linux/bpf.h |    5 ++
 kernel/bpf/btf.c         |  108 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c     |   24 ++++++++++
 4 files changed, 128 insertions(+), 11 deletions(-)

--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -44,5 +44,7 @@ const struct btf_type *btf_type_id_size(
 					u32 *ret_size);
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
+int btf_get_fd_by_id(u32 id);
+u32 btf_id(const struct btf *btf);
 
 #endif
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
+	BPF_BTF_GET_FD_BY_ID,
 };
 
 enum bpf_map_type {
@@ -343,6 +344,7 @@ union bpf_attr {
 			__u32		start_id;
 			__u32		prog_id;
 			__u32		map_id;
+			__u32		btf_id;
 		};
 		__u32		next_id;
 		__u32		open_flags;
@@ -1102,6 +1104,9 @@ struct bpf_map_info {
 	__u32 :32;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 btf_id;
+	__u32 btf_key_id;
+	__u32 btf_value_id;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -11,6 +11,7 @@
 #include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
+#include <linux/idr.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 
@@ -179,6 +180,9 @@
 	     i < btf_type_vlen(struct_type);				\
 	     i++, member++)
 
+static DEFINE_IDR(btf_idr);
+static DEFINE_SPINLOCK(btf_idr_lock);
+
 struct btf {
 	union {
 		struct btf_header *hdr;
@@ -193,6 +197,8 @@ struct btf {
 	u32 types_size;
 	u32 data_size;
 	refcount_t refcnt;
+	u32 id;
+	struct rcu_head rcu;
 };
 
 enum verifier_phase {
@@ -598,6 +604,42 @@ static int btf_add_type(struct btf_verif
 	return 0;
 }
 
+static int btf_alloc_id(struct btf *btf)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_bh(&btf_idr_lock);
+	id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		btf->id = id;
+	spin_unlock_bh(&btf_idr_lock);
+	idr_preload_end();
+
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+	unsigned long flags;
+
+	/*
+	 * In map-in-map, calling map_delete_elem() on outer
+	 * map will call bpf_map_put on the inner map.
+	 * It will then eventually call btf_free_id()
+	 * on the inner map.  Some of the map_delete_elem()
+	 * implementation may have irq disabled, so
+	 * we need to use the _irqsave() version instead
+	 * of the _bh() version.
+	 */
+	spin_lock_irqsave(&btf_idr_lock, flags);
+	idr_remove(&btf_idr, btf->id);
+	spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
 static void btf_free(struct btf *btf)
 {
 	kvfree(btf->types);
@@ -607,15 +649,19 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
-static void btf_get(struct btf *btf)
+static void btf_free_rcu(struct rcu_head *rcu)
 {
-	refcount_inc(&btf->refcnt);
+	struct btf *btf = container_of(rcu, struct btf, rcu);
+
+	btf_free(btf);
 }
 
 void btf_put(struct btf *btf)
 {
-	if (btf && refcount_dec_and_test(&btf->refcnt))
-		btf_free(btf);
+	if (btf && refcount_dec_and_test(&btf->refcnt)) {
+		btf_free_id(btf);
+		call_rcu(&btf->rcu, btf_free_rcu);
+	}
 }
 
 static int env_resolve_init(struct btf_verifier_env *env)
@@ -2006,10 +2052,15 @@ const struct file_operations btf_fops =
 	.release	= btf_release,
 };
 
+static int __btf_new_fd(struct btf *btf)
+{
+	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
 int btf_new_fd(const union bpf_attr *attr)
 {
 	struct btf *btf;
-	int fd;
+	int ret;
 
 	btf = btf_parse(u64_to_user_ptr(attr->btf),
 			attr->btf_size, attr->btf_log_level,
@@ -2018,12 +2069,23 @@ int btf_new_fd(const union bpf_attr *att
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
-	fd = anon_inode_getfd("btf", &btf_fops, btf,
-			      O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
+	ret = btf_alloc_id(btf);
+	if (ret) {
+		btf_free(btf);
+		return ret;
+	}
+
+	/*
+	 * The BTF ID is published to the userspace.
+	 * All BTF free must go through call_rcu() from
+	 * now on (i.e. free by calling btf_put()).
+	 */
+
+	ret = __btf_new_fd(btf);
+	if (ret < 0)
 		btf_put(btf);
 
-	return fd;
+	return ret;
 }
 
 struct btf *btf_get_by_fd(int fd)
@@ -2042,7 +2104,7 @@ struct btf *btf_get_by_fd(int fd)
 	}
 
 	btf = f.file->private_data;
-	btf_get(btf);
+	refcount_inc(&btf->refcnt);
 	fdput(f);
 
 	return btf;
@@ -2062,3 +2124,29 @@ int btf_get_info_by_fd(const struct btf
 
 	return 0;
 }
+
+int btf_get_fd_by_id(u32 id)
+{
+	struct btf *btf;
+	int fd;
+
+	rcu_read_lock();
+	btf = idr_find(&btf_idr, id);
+	if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+		btf = ERR_PTR(-ENOENT);
+	rcu_read_unlock();
+
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = __btf_new_fd(btf);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+u32 btf_id(const struct btf *btf)
+{
+	return btf->id;
+}
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -254,7 +254,6 @@ static void bpf_map_free_deferred(struct
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
 	bpf_map_uncharge_memlock(map);
-	btf_put(map->btf);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -275,6 +274,7 @@ static void __bpf_map_put(struct bpf_map
 	if (atomic_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map, do_idr_lock);
+		btf_put(map->btf);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
@@ -1985,6 +1985,12 @@ static int bpf_map_get_info_by_fd(struct
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (map->btf) {
+		info.btf_id = btf_id(map->btf);
+		info.btf_key_id = map->btf_key_id;
+		info.btf_value_id = map->btf_value_id;
+	}
+
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_info_fill(&info, map);
 		if (err)
@@ -2042,6 +2048,19 @@ static int bpf_btf_load(const union bpf_
 	return btf_new_fd(attr);
 }
 
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_get_fd_by_id(attr->btf_id);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2121,6 +2140,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf
 	case BPF_BTF_LOAD:
 		err = bpf_btf_load(&attr);
 		break;
+	case BPF_BTF_GET_FD_BY_ID:
+		err = bpf_btf_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;