Blob Blame History Raw
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 18 Oct 2023 13:56:54 +0200
Subject: perf: Disallow mis-matched inherited group reads
Git-commit: 32671e3799ca2e4590773fd0e63aaa4229e50c06
Patch-mainline: v6.6-rc7
References: bsc#1216584 CVE-2023-5717
X-Info: adjust for context (perf_event.h) no bd27568117664b8b3e259721393df420ed51f57b
X-Info: adjust for context (core.c) no lost samples (119a784c81270eb88e573174ed2209225d646656), no PERF_EV_CAP_SIBLING (9f0c4fa111dc909ca545c45ea20ec84da555ce16), no aux_output fix (f733c6b508bcaa3441ba1eacf16efb9abd47489f)
X-Info: adjust for context (core.c) no Remove perf_event::group_entry (8343aae66167df6708128a778e750d48dbe31302),   this should be safe

Because group consistency is non-atomic between parent (filedesc) and children
(inherited) events, it is possible for PERF_FORMAT_GROUP read() to try and sum
non-matching counter groups -- with non-sensical results.

Add group_generation to distinguish the case where a parent group removes and
adds an event and thus has the same number, but a different configuration of
events as inherited groups.

This became a problem when commit fa8c269353d5 ("perf/core: Invert
perf_read_group() loops") flipped the order of child_list and sibling_list.
Previously it would iterate the group (sibling_list) first, and for each
sibling traverse the child_list. In this order, only the group composition of
the parent is relevant. By flipping the order the group composition of the
child (inherited) events becomes an issue and the mis-match in group
composition becomes evident.

That said; even prior to this commit, while reading of a group that is not
equally inherited was not broken, it still made no sense.

(Ab)use ECHILD as error return to indicate issues with child process group
composition.

Fixes: fa8c269353d5 ("perf/core: Invert perf_read_group() loops")
Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231018115654.GK33217@noisy.programming.kicks-ass.net

Signed-off-by: Tony Jones <tonyj@suse.de>
---
 include/linux/perf_event.h |    1 +
 kernel/events/core.c       |   39 +++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -591,6 +591,7 @@ struct perf_event {
 	/* The cumulative AND of all event_caps for events in this group. */
 	int				group_caps;
 
+	unsigned int			group_generation;
 	struct perf_event		*group_leader;
 	struct pmu			*pmu;
 	void				*pmu_private;
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1813,6 +1813,7 @@ static void perf_group_attach(struct per
 
 	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
+	group_leader->group_generation++;
 
 	perf_event__header_size(group_leader);
 
@@ -1883,6 +1884,7 @@ static void perf_group_detach(struct per
 	if (event->group_leader != event) {
 		list_del_init(&event->sibling_list);
 		event->group_leader->nr_siblings--;
+		event->group_leader->group_generation++;
 		goto out;
 	}
 
@@ -4658,7 +4660,7 @@ static int __perf_read_group_add(struct
 					u64 read_format, u64 *values)
 {
 	struct perf_event_context *ctx = leader->ctx;
-	struct perf_event *sub;
+	struct perf_event *sub, *parent;
 	unsigned long flags;
 	int n = 1; /* skip @nr */
 	int ret;
@@ -4668,6 +4670,33 @@ static int __perf_read_group_add(struct
 		return ret;
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
+	/*
+	 * Verify the grouping between the parent and child (inherited)
+	 * events is still in tact.
+	 *
+	 * Specifically:
+	 *  - leader->ctx->lock pins leader->sibling_list
+	 *  - parent->child_mutex pins parent->child_list
+	 *  - parent->ctx->mutex pins parent->sibling_list
+	 *
+	 * Because parent->ctx != leader->ctx (and child_list nests inside
+	 * ctx->mutex), group destruction is not atomic between children, also
+	 * see perf_event_release_kernel(). Additionally, parent can grow the
+	 * group.
+	 *
+	 * Therefore it is possible to have parent and child groups in a
+	 * different configuration and summing over such a beast makes no sense
+	 * what so ever.
+	 *
+	 * Reject this.
+	 */
+	parent = leader->parent;
+	if (parent &&
+	    (parent->group_generation != leader->group_generation ||
+	     parent->nr_siblings != leader->nr_siblings)) {
+		ret = -ECHILD;
+		goto unlock;
+	}
 
 	/*
 	 * Since we co-schedule groups, {enabled,running} times of siblings
@@ -4697,8 +4726,9 @@ static int __perf_read_group_add(struct
 			values[n++] = primary_event_id(sub);
 	}
 
+unlock:
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	return 0;
+	return ret;
 }
 
 static int perf_read_group(struct perf_event *event,
@@ -4717,10 +4747,6 @@ static int perf_read_group(struct perf_e
 
 	values[0] = 1 + leader->nr_siblings;
 
-	/*
-	 * By locking the child_mutex of the leader we effectively
-	 * lock the child list of all siblings.. XXX explain how.
-	 */
 	mutex_lock(&leader->child_mutex);
 
 	ret = __perf_read_group_add(leader, read_format, values);
@@ -11362,6 +11388,7 @@ static int inherit_group(struct perf_eve
 		if (IS_ERR(child_ctr))
 			return PTR_ERR(child_ctr);
 	}
+	leader->group_generation = parent_event->group_generation;
 	return 0;
 }