From: YiFei Zhu <zhuyifei@google.com>
Date: Thu, 16 Dec 2021 02:04:26 +0000
Subject: bpf: Move getsockopt retval to struct bpf_cg_run_ctx
Patch-mainline: v5.18-rc1
Git-commit: c4dcfdd406aa2167396ac215e351e5e4dfd7efe3
References: jsc#PED-1377
The retval value is moved to struct bpf_cg_run_ctx for ease of access
in different prog types with different context structs layouts. The
helper implementation (to be added in a later patch in the series) can
simply perform a container_of from current->bpf_ctx to retrieve
bpf_cg_run_ctx.
Unfortunately, there is no easy way to access the current task_struct
via the verifier BPF bytecode rewrite, aside from possibly calling a
helper, so a pointer to current task is added to struct bpf_sockopt_kern
so that the rewritten BPF bytecode can access struct bpf_cg_run_ctx with
an indirection.
For backward compatibility, if a getsockopt program rejects a syscall
by returning 0, an -EPERM will be generated, by having the
BPF_PROG_RUN_ARRAY_CG family macros automatically set the retval to
-EPERM. Unlike prior to this patch, this -EPERM will be visible to
ctx->retval for any other hooks down the line in the prog array.
Additionally, the restriction that getsockopt filters can only set
the retval to 0 is removed, considering that certain getsockopt
implementations may return optlen. Filters are now able to set the
value arbitrarily.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/73b0325f5c29912ccea7ea57ec1ed4d388fc1d37.1639619851.git.zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
---
include/linux/bpf.h | 20 ++++++-----
include/linux/filter.h | 5 ++
kernel/bpf/cgroup.c | 82 ++++++++++++++++++++++++++++---------------------
3 files changed, 63 insertions(+), 44 deletions(-)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1250,6 +1250,7 @@ struct bpf_run_ctx {};
struct bpf_cg_run_ctx {
struct bpf_run_ctx run_ctx;
const struct bpf_prog_array_item *prog_item;
+ int retval;
};
struct bpf_trace_run_ctx {
@@ -1285,16 +1286,16 @@ typedef u32 (*bpf_prog_run_fn)(const str
static __always_inline int
BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
const void *ctx, bpf_prog_run_fn run_prog,
- u32 *ret_flags)
+ int retval, u32 *ret_flags)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;
- int ret = 0;
u32 func_ret;
+ run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
array = rcu_dereference(array_rcu);
@@ -1304,27 +1305,28 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct
run_ctx.prog_item = item;
func_ret = run_prog(prog, ctx);
if (!(func_ret & 1))
- ret = -EPERM;
+ run_ctx.retval = -EPERM;
*(ret_flags) |= (func_ret >> 1);
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
- return ret;
+ return run_ctx.retval;
}
static __always_inline int
BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
- const void *ctx, bpf_prog_run_fn run_prog)
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;
- int ret = 0;
+ run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
array = rcu_dereference(array_rcu);
@@ -1333,13 +1335,13 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_p
while ((prog = READ_ONCE(item->prog))) {
run_ctx.prog_item = item;
if (!run_prog(prog, ctx))
- ret = -EPERM;
+ run_ctx.retval = -EPERM;
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
- return ret;
+ return run_ctx.retval;
}
static __always_inline u32
@@ -1399,7 +1401,7 @@ out:
u32 _flags = 0; \
bool _cn; \
u32 _ret; \
- _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \
+ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \
_cn = _flags & BPF_RET_SET_CN; \
if (!_ret) \
_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1356,7 +1356,10 @@ struct bpf_sockopt_kern {
s32 level;
s32 optname;
s32 optlen;
- s32 retval;
+ /* for retval in struct bpf_cg_run_ctx */
+ struct task_struct *current_task;
+ /* Temporary "register" for indirect stores to ppos. */
+ u64 tmp_reg;
};
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_skb(struct s
cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
} else {
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
- __bpf_prog_run_save_cb);
+ __bpf_prog_run_save_cb, 0);
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1072,7 +1072,7 @@ int __cgroup_bpf_run_filter_sk(struct so
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
- bpf_prog_run);
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1118,7 +1118,7 @@ int __cgroup_bpf_run_filter_sock_addr(st
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, flags);
+ bpf_prog_run, 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1145,7 +1145,7 @@ int __cgroup_bpf_run_filter_sock_ops(str
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
- bpf_prog_run);
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1163,7 +1163,7 @@ int __cgroup_bpf_check_dev_permission(sh
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run);
+ bpf_prog_run, 0);
rcu_read_unlock();
return ret;
@@ -1294,7 +1294,8 @@ int __cgroup_bpf_run_filter_sysctl(struc
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1409,7 +1410,7 @@ int __cgroup_bpf_run_filter_setsockopt(s
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, 0);
release_sock(sk);
if (ret)
@@ -1473,7 +1474,7 @@ int __cgroup_bpf_run_filter_getsockopt(s
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
int ret;
@@ -1517,10 +1518,10 @@ int __cgroup_bpf_run_filter_getsockopt(s
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, retval);
release_sock(sk);
- if (ret)
+ if (ret < 0)
goto out;
if (ctx.optlen > max_optlen || ctx.optlen < 0) {
@@ -1528,14 +1529,6 @@ int __cgroup_bpf_run_filter_getsockopt(s
goto out;
}
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
- ret = -EFAULT;
- goto out;
- }
-
if (ctx.optlen != 0) {
if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
put_user(ctx.optlen, optlen)) {
@@ -1544,8 +1537,6 @@ int __cgroup_bpf_run_filter_getsockopt(s
}
}
- ret = ctx.retval;
-
out:
sockopt_free_buf(&ctx, &buf);
return ret;
@@ -1560,10 +1551,10 @@ int __cgroup_bpf_run_filter_getsockopt_k
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
+ .current_task = current,
};
int ret;
@@ -1576,25 +1567,19 @@ int __cgroup_bpf_run_filter_getsockopt_k
*/
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
- if (ret)
+ &ctx, bpf_prog_run, retval);
+ if (ret < 0)
return ret;
if (ctx.optlen > *optlen)
return -EFAULT;
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval)
- return -EFAULT;
-
/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;
- return ctx.retval;
+ return ret;
}
#endif
@@ -2010,10 +1995,39 @@ static u32 cg_sockopt_convert_ctx_access
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);