|
Filipe Manana |
9d5674 |
From: Filipe Manana <fdmanana@suse.com>
|
|
Filipe Manana |
9d5674 |
Date: Wed, 13 Oct 2021 10:12:49 +0100
|
|
Filipe Manana |
9d5674 |
Git-commit: 2bb2e00ed9787e52580bb651264b8d6a2b7a9dd2
|
|
Filipe Manana |
9d5674 |
Patch-mainline: v5.16-rc1
|
|
Filipe Manana |
9d5674 |
References: bsc#1192896
|
|
Filipe Manana |
9d5674 |
Subject: [PATCH] btrfs: fix deadlock between chunk allocation and chunk btree
|
|
Filipe Manana |
9d5674 |
modifications
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
When a task is doing some modification to the chunk btree and it is not in
|
|
Filipe Manana |
9d5674 |
the context of a chunk allocation or a chunk removal, it can deadlock with
|
|
Filipe Manana |
9d5674 |
another task that is currently allocating a new data or metadata chunk.
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
These contexts are the following:
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
* When relocating a system chunk, when we need to COW the extent buffers
|
|
Filipe Manana |
9d5674 |
that belong to the chunk btree;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
* When adding a new device (ioctl), where we need to add a new device item
|
|
Filipe Manana |
9d5674 |
to the chunk btree;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
* When removing a device (ioctl), where we need to remove a device item
|
|
Filipe Manana |
9d5674 |
from the chunk btree;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
* When resizing a device (ioctl), where we need to update a device item in
|
|
Filipe Manana |
9d5674 |
the chunk btree and may need to relocate a system chunk that lies beyond
|
|
Filipe Manana |
9d5674 |
the new device size when shrinking a device.
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
The problem happens due to a sequence of steps like the following:
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
1) Task A starts a data or metadata chunk allocation and it locks the
|
|
Filipe Manana |
9d5674 |
chunk mutex;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
2) Task B is relocating a system chunk, and when it needs to COW an extent
|
|
Filipe Manana |
9d5674 |
buffer of the chunk btree, it has locked both that extent buffer as
|
|
Filipe Manana |
9d5674 |
well as its parent extent buffer;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
3) Since there is not enough available system space, either because none
|
|
Filipe Manana |
9d5674 |
of the existing system block groups have enough free space or because
|
|
Filipe Manana |
9d5674 |
the only one with enough free space is in RO mode due to the relocation,
|
|
Filipe Manana |
9d5674 |
task B triggers a new system chunk allocation. It blocks when trying to
|
|
Filipe Manana |
9d5674 |
acquire the chunk mutex, currently held by task A;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
4) Task A enters btrfs_chunk_alloc_add_chunk_item(), in order to insert
|
|
Filipe Manana |
9d5674 |
the new chunk item into the chunk btree and update the existing device
|
|
Filipe Manana |
9d5674 |
items there. But in order to do that, it has to lock the extent buffer
|
|
Filipe Manana |
9d5674 |
that task B locked at step 2, or its parent extent buffer, but task B
|
|
Filipe Manana |
9d5674 |
is waiting on the chunk mutex, which is currently locked by task A,
|
|
Filipe Manana |
9d5674 |
therefore resulting in a deadlock.
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
One example report when the deadlock happens with system chunk relocation:
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
INFO: task kworker/u9:5:546 blocked for more than 143 seconds.
|
|
Filipe Manana |
9d5674 |
Not tainted 5.15.0-rc3+ #1
|
|
Filipe Manana |
9d5674 |
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
|
|
Filipe Manana |
9d5674 |
task:kworker/u9:5 state:D stack:25936 pid: 546 ppid: 2 flags:0x00004000
|
|
Filipe Manana |
9d5674 |
Workqueue: events_unbound btrfs_async_reclaim_metadata_space
|
|
Filipe Manana |
9d5674 |
Call Trace:
|
|
Filipe Manana |
9d5674 |
context_switch kernel/sched/core.c:4940 [inline]
|
|
Filipe Manana |
9d5674 |
__schedule+0xcd9/0x2530 kernel/sched/core.c:6287
|
|
Filipe Manana |
9d5674 |
schedule+0xd3/0x270 kernel/sched/core.c:6366
|
|
Filipe Manana |
9d5674 |
rwsem_down_read_slowpath+0x4ee/0x9d0 kernel/locking/rwsem.c:993
|
|
Filipe Manana |
9d5674 |
__down_read_common kernel/locking/rwsem.c:1214 [inline]
|
|
Filipe Manana |
9d5674 |
__down_read kernel/locking/rwsem.c:1223 [inline]
|
|
Filipe Manana |
9d5674 |
down_read_nested+0xe6/0x440 kernel/locking/rwsem.c:1590
|
|
Filipe Manana |
9d5674 |
__btrfs_tree_read_lock+0x31/0x350 fs/btrfs/locking.c:47
|
|
Filipe Manana |
9d5674 |
btrfs_tree_read_lock fs/btrfs/locking.c:54 [inline]
|
|
Filipe Manana |
9d5674 |
btrfs_read_lock_root_node+0x8a/0x320 fs/btrfs/locking.c:191
|
|
Filipe Manana |
9d5674 |
btrfs_search_slot_get_root fs/btrfs/ctree.c:1623 [inline]
|
|
Filipe Manana |
9d5674 |
btrfs_search_slot+0x13b4/0x2140 fs/btrfs/ctree.c:1728
|
|
Filipe Manana |
9d5674 |
btrfs_update_device+0x11f/0x500 fs/btrfs/volumes.c:2794
|
|
Filipe Manana |
9d5674 |
btrfs_chunk_alloc_add_chunk_item+0x34d/0xea0 fs/btrfs/volumes.c:5504
|
|
Filipe Manana |
9d5674 |
do_chunk_alloc fs/btrfs/block-group.c:3408 [inline]
|
|
Filipe Manana |
9d5674 |
btrfs_chunk_alloc+0x84d/0xf50 fs/btrfs/block-group.c:3653
|
|
Filipe Manana |
9d5674 |
flush_space+0x54e/0xd80 fs/btrfs/space-info.c:670
|
|
Filipe Manana |
9d5674 |
btrfs_async_reclaim_metadata_space+0x396/0xa90 fs/btrfs/space-info.c:953
|
|
Filipe Manana |
9d5674 |
process_one_work+0x9df/0x16d0 kernel/workqueue.c:2297
|
|
Filipe Manana |
9d5674 |
worker_thread+0x90/0xed0 kernel/workqueue.c:2444
|
|
Filipe Manana |
9d5674 |
kthread+0x3e5/0x4d0 kernel/kthread.c:319
|
|
Filipe Manana |
9d5674 |
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
|
|
Filipe Manana |
9d5674 |
INFO: task syz-executor:9107 blocked for more than 143 seconds.
|
|
Filipe Manana |
9d5674 |
Not tainted 5.15.0-rc3+ #1
|
|
Filipe Manana |
9d5674 |
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
|
|
Filipe Manana |
9d5674 |
task:syz-executor state:D stack:23200 pid: 9107 ppid: 7792 flags:0x00004004
|
|
Filipe Manana |
9d5674 |
Call Trace:
|
|
Filipe Manana |
9d5674 |
context_switch kernel/sched/core.c:4940 [inline]
|
|
Filipe Manana |
9d5674 |
__schedule+0xcd9/0x2530 kernel/sched/core.c:6287
|
|
Filipe Manana |
9d5674 |
schedule+0xd3/0x270 kernel/sched/core.c:6366
|
|
Filipe Manana |
9d5674 |
schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425
|
|
Filipe Manana |
9d5674 |
__mutex_lock_common kernel/locking/mutex.c:669 [inline]
|
|
Filipe Manana |
9d5674 |
__mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729
|
|
Filipe Manana |
9d5674 |
btrfs_chunk_alloc+0x31a/0xf50 fs/btrfs/block-group.c:3631
|
|
Filipe Manana |
9d5674 |
find_free_extent_update_loop fs/btrfs/extent-tree.c:3986 [inline]
|
|
Filipe Manana |
9d5674 |
find_free_extent+0x25cb/0x3a30 fs/btrfs/extent-tree.c:4335
|
|
Filipe Manana |
9d5674 |
btrfs_reserve_extent+0x1f1/0x500 fs/btrfs/extent-tree.c:4415
|
|
Filipe Manana |
9d5674 |
btrfs_alloc_tree_block+0x203/0x1120 fs/btrfs/extent-tree.c:4813
|
|
Filipe Manana |
9d5674 |
__btrfs_cow_block+0x412/0x1620 fs/btrfs/ctree.c:415
|
|
Filipe Manana |
9d5674 |
btrfs_cow_block+0x2f6/0x8c0 fs/btrfs/ctree.c:570
|
|
Filipe Manana |
9d5674 |
btrfs_search_slot+0x1094/0x2140 fs/btrfs/ctree.c:1768
|
|
Filipe Manana |
9d5674 |
relocate_tree_block fs/btrfs/relocation.c:2694 [inline]
|
|
Filipe Manana |
9d5674 |
relocate_tree_blocks+0xf73/0x1770 fs/btrfs/relocation.c:2757
|
|
Filipe Manana |
9d5674 |
relocate_block_group+0x47e/0xc70 fs/btrfs/relocation.c:3673
|
|
Filipe Manana |
9d5674 |
btrfs_relocate_block_group+0x48a/0xc60 fs/btrfs/relocation.c:4070
|
|
Filipe Manana |
9d5674 |
btrfs_relocate_chunk+0x96/0x280 fs/btrfs/volumes.c:3181
|
|
Filipe Manana |
9d5674 |
__btrfs_balance fs/btrfs/volumes.c:3911 [inline]
|
|
Filipe Manana |
9d5674 |
btrfs_balance+0x1f03/0x3cd0 fs/btrfs/volumes.c:4301
|
|
Filipe Manana |
9d5674 |
btrfs_ioctl_balance+0x61e/0x800 fs/btrfs/ioctl.c:4137
|
|
Filipe Manana |
9d5674 |
btrfs_ioctl+0x39ea/0x7b70 fs/btrfs/ioctl.c:4949
|
|
Filipe Manana |
9d5674 |
vfs_ioctl fs/ioctl.c:51 [inline]
|
|
Filipe Manana |
9d5674 |
__do_sys_ioctl fs/ioctl.c:874 [inline]
|
|
Filipe Manana |
9d5674 |
__se_sys_ioctl fs/ioctl.c:860 [inline]
|
|
Filipe Manana |
9d5674 |
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
|
|
Filipe Manana |
9d5674 |
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
|
|
Filipe Manana |
9d5674 |
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
|
|
Filipe Manana |
9d5674 |
entry_SYSCALL_64_after_hwframe+0x44/0xae
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
So fix this by making sure that whenever we try to modify the chunk btree
|
|
Filipe Manana |
9d5674 |
and we are neither in a chunk allocation context nor in a chunk remove
|
|
Filipe Manana |
9d5674 |
context, we reserve system space before modifying the chunk btree.
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
Reported-by: Hao Sun <sunhao.th@gmail.com>
|
|
Filipe Manana |
9d5674 |
Link: https://lore.kernel.org/linux-btrfs/CACkBjsax51i4mu6C0C3vJqQN3NR_iVuucoeG3U1HXjrgzn5FFQ@mail.gmail.com/
|
|
Filipe Manana |
9d5674 |
Fixes: 79bd37120b1495 ("btrfs: rework chunk allocation to avoid exhaustion of the system chunk array")
|
|
Filipe Manana |
9d5674 |
CC: stable@vger.kernel.org # 5.14+
|
|
Filipe Manana |
9d5674 |
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
|
|
Filipe Manana |
9d5674 |
Signed-off-by: Filipe Manana <fdmanana@suse.com>
|
|
Filipe Manana |
9d5674 |
Signed-off-by: David Sterba <dsterba@suse.com>
|
|
Filipe Manana |
9d5674 |
---
|
|
Filipe Manana |
9d5674 |
fs/btrfs/block-group.c | 146 +++++++++++++++++++++++++----------------
|
|
Filipe Manana |
9d5674 |
fs/btrfs/block-group.h | 2 +
|
|
Filipe Manana |
9d5674 |
fs/btrfs/relocation.c | 4 ++
|
|
Filipe Manana |
9d5674 |
fs/btrfs/volumes.c | 15 ++++-
|
|
Filipe Manana |
9d5674 |
4 files changed, 111 insertions(+), 56 deletions(-)
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
|
|
Filipe Manana |
9d5674 |
index c9dee3189..bb2c34743 100644
|
|
Filipe Manana |
9d5674 |
--- a/fs/btrfs/block-group.c
|
|
Filipe Manana |
9d5674 |
+++ b/fs/btrfs/block-group.c
|
|
Filipe Manana |
9d5674 |
@@ -2992,25 +2992,6 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
|
|
Filipe Manana |
9d5674 |
goto out;
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
- /*
|
|
Filipe Manana |
9d5674 |
- * If this is a system chunk allocation then stop right here and do not
|
|
Filipe Manana |
9d5674 |
- * add the chunk item to the chunk btree. This is to prevent a deadlock
|
|
Filipe Manana |
9d5674 |
- * because this system chunk allocation can be triggered while COWing
|
|
Filipe Manana |
9d5674 |
- * some extent buffer of the chunk btree and while holding a lock on a
|
|
Filipe Manana |
9d5674 |
- * parent extent buffer, in which case attempting to insert the chunk
|
|
Filipe Manana |
9d5674 |
- * item (or update the device item) would result in a deadlock on that
|
|
Filipe Manana |
9d5674 |
- * parent extent buffer. In this case defer the chunk btree updates to
|
|
Filipe Manana |
9d5674 |
- * the second phase of chunk allocation and keep our reservation until
|
|
Filipe Manana |
9d5674 |
- * the second phase completes.
|
|
Filipe Manana |
9d5674 |
- *
|
|
Filipe Manana |
9d5674 |
- * This is a rare case and can only be triggered by the very few cases
|
|
Filipe Manana |
9d5674 |
- * we have where we need to touch the chunk btree outside chunk allocation
|
|
Filipe Manana |
9d5674 |
- * and chunk removal. These cases are basically adding a device, removing
|
|
Filipe Manana |
9d5674 |
- * a device or resizing a device.
|
|
Filipe Manana |
9d5674 |
- */
|
|
Filipe Manana |
9d5674 |
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
|
|
Filipe Manana |
9d5674 |
- return 0;
|
|
Filipe Manana |
9d5674 |
-
|
|
Filipe Manana |
9d5674 |
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
|
|
Filipe Manana |
9d5674 |
/*
|
|
Filipe Manana |
9d5674 |
* Normally we are not expected to fail with -ENOSPC here, since we have
|
|
Filipe Manana |
9d5674 |
@@ -3143,14 +3124,14 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
|
|
Filipe Manana |
9d5674 |
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
|
|
Filipe Manana |
9d5674 |
* the system chunk array due to concurrent allocations") provides more details.
|
|
Filipe Manana |
9d5674 |
*
|
|
Filipe Manana |
9d5674 |
- * For allocation of system chunks, we defer the updates and insertions into the
|
|
Filipe Manana |
9d5674 |
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
|
|
Filipe Manana |
9d5674 |
- * if the chunk allocation is triggered while COWing an extent buffer of the
|
|
Filipe Manana |
9d5674 |
- * chunk btree, we are holding a lock on the parent of that extent buffer and
|
|
Filipe Manana |
9d5674 |
- * doing the chunk btree updates and insertions can require locking that parent.
|
|
Filipe Manana |
9d5674 |
- * This is for the very few and rare cases where we update the chunk btree that
|
|
Filipe Manana |
9d5674 |
- * are not chunk allocation or chunk removal: adding a device, removing a device
|
|
Filipe Manana |
9d5674 |
- * or resizing a device.
|
|
Filipe Manana |
9d5674 |
+ * Allocation of system chunks does not happen through this function. A task that
|
|
Filipe Manana |
9d5674 |
+ * needs to update the chunk btree (the only btree that uses system chunks), must
|
|
Filipe Manana |
9d5674 |
+ * preallocate chunk space by calling either check_system_chunk() or
|
|
Filipe Manana |
9d5674 |
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
|
|
Filipe Manana |
9d5674 |
+ * metadata chunk or when removing a chunk, while the latter is used before doing
|
|
Filipe Manana |
9d5674 |
+ * a modification to the chunk btree - use cases for the latter are adding,
|
|
Filipe Manana |
9d5674 |
+ * removing and resizing a device as well as relocation of a system chunk.
|
|
Filipe Manana |
9d5674 |
+ * See the comment below for more details.
|
|
Filipe Manana |
9d5674 |
*
|
|
Filipe Manana |
9d5674 |
* The reservation of system space, done through check_system_chunk(), as well
|
|
Filipe Manana |
9d5674 |
* as all the updates and insertions into the chunk btree must be done while
|
|
Filipe Manana |
9d5674 |
@@ -3187,11 +3168,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
|
|
Filipe Manana |
9d5674 |
if (trans->allocating_chunk)
|
|
Filipe Manana |
9d5674 |
return -ENOSPC;
|
|
Filipe Manana |
9d5674 |
/*
|
|
Filipe Manana |
9d5674 |
- * If we are removing a chunk, don't re-enter or we would deadlock.
|
|
Filipe Manana |
9d5674 |
- * System space reservation and system chunk allocation is done by the
|
|
Filipe Manana |
9d5674 |
- * chunk remove operation (btrfs_remove_chunk()).
|
|
Filipe Manana |
9d5674 |
+ * Allocation of system chunks can not happen through this path, as we
|
|
Filipe Manana |
9d5674 |
+ * could end up in a deadlock if we are allocating a data or metadata
|
|
Filipe Manana |
9d5674 |
+ * chunk and there is another task modifying the chunk btree.
|
|
Filipe Manana |
9d5674 |
+ *
|
|
Filipe Manana |
9d5674 |
+ * This is because while we are holding the chunk mutex, we will attempt
|
|
Filipe Manana |
9d5674 |
+ * to add the new chunk item to the chunk btree or update an existing
|
|
Filipe Manana |
9d5674 |
+ * device item in the chunk btree, while the other task that is modifying
|
|
Filipe Manana |
9d5674 |
+ * the chunk btree is attempting to COW an extent buffer while holding a
|
|
Filipe Manana |
9d5674 |
+ * lock on it and on its parent - if the COW operation triggers a system
|
|
Filipe Manana |
9d5674 |
+ * chunk allocation, then we can deadlock because we are holding the
|
|
Filipe Manana |
9d5674 |
+ * chunk mutex and we may need to access that extent buffer or its parent
|
|
Filipe Manana |
9d5674 |
+ * in order to add the chunk item or update a device item.
|
|
Filipe Manana |
9d5674 |
+ *
|
|
Filipe Manana |
9d5674 |
+ * Tasks that want to modify the chunk tree should reserve system space
|
|
Filipe Manana |
9d5674 |
+ * before updating the chunk btree, by calling either
|
|
Filipe Manana |
9d5674 |
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
|
|
Filipe Manana |
9d5674 |
+ * It's possible that after a task reserves the space, it still ends up
|
|
Filipe Manana |
9d5674 |
+ * here - this happens in the cases described above at do_chunk_alloc().
|
|
Filipe Manana |
9d5674 |
+ * The task will have to either retry or fail.
|
|
Filipe Manana |
9d5674 |
*/
|
|
Filipe Manana |
9d5674 |
- if (trans->removing_chunk)
|
|
Filipe Manana |
9d5674 |
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
|
|
Filipe Manana |
9d5674 |
return -ENOSPC;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
space_info = btrfs_find_space_info(fs_info, flags);
|
|
Filipe Manana |
9d5674 |
@@ -3290,17 +3287,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
|
|
Filipe Manana |
9d5674 |
return num_dev;
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
-/*
|
|
Filipe Manana |
9d5674 |
- * Reserve space in the system space for allocating or removing a chunk
|
|
Filipe Manana |
9d5674 |
- */
|
|
Filipe Manana |
9d5674 |
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
|
Filipe Manana |
9d5674 |
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
+ u64 bytes,
|
|
Filipe Manana |
9d5674 |
+ u64 type)
|
|
Filipe Manana |
9d5674 |
{
|
|
Filipe Manana |
9d5674 |
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
Filipe Manana |
9d5674 |
struct btrfs_space_info *info;
|
|
Filipe Manana |
9d5674 |
u64 left;
|
|
Filipe Manana |
9d5674 |
- u64 thresh;
|
|
Filipe Manana |
9d5674 |
int ret = 0;
|
|
Filipe Manana |
9d5674 |
- u64 num_devs;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
/*
|
|
Filipe Manana |
9d5674 |
* Needed because we can end up allocating a system chunk and for an
|
|
Filipe Manana |
9d5674 |
@@ -3313,19 +3307,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
|
Filipe Manana |
9d5674 |
left = info->total_bytes - btrfs_space_info_used(info, true);
|
|
Filipe Manana |
9d5674 |
spin_unlock(&info->lock);
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
- num_devs = get_profile_num_devs(fs_info, type);
|
|
Filipe Manana |
9d5674 |
-
|
|
Filipe Manana |
9d5674 |
- /* num_devs device items to update and 1 chunk item to add or remove */
|
|
Filipe Manana |
9d5674 |
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
|
|
Filipe Manana |
9d5674 |
- btrfs_calc_insert_metadata_size(fs_info, 1);
|
|
Filipe Manana |
9d5674 |
-
|
|
Filipe Manana |
9d5674 |
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
|
|
Filipe Manana |
9d5674 |
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
|
|
Filipe Manana |
9d5674 |
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
|
|
Filipe Manana |
9d5674 |
- left, thresh, type);
|
|
Filipe Manana |
9d5674 |
+ left, bytes, type);
|
|
Filipe Manana |
9d5674 |
btrfs_dump_space_info(fs_info, info, 0, 0);
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
- if (left < thresh) {
|
|
Filipe Manana |
9d5674 |
+ if (left < bytes) {
|
|
Filipe Manana |
9d5674 |
u64 flags = btrfs_system_alloc_profile(fs_info);
|
|
Filipe Manana |
9d5674 |
struct btrfs_block_group *bg;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
@@ -3334,21 +3322,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
|
Filipe Manana |
9d5674 |
* needing it, as we might not need to COW all nodes/leafs from
|
|
Filipe Manana |
9d5674 |
* the paths we visit in the chunk tree (they were already COWed
|
|
Filipe Manana |
9d5674 |
* or created in the current transaction for example).
|
|
Filipe Manana |
9d5674 |
- *
|
|
Filipe Manana |
9d5674 |
- * Also, if our caller is allocating a system chunk, do not
|
|
Filipe Manana |
9d5674 |
- * attempt to insert the chunk item in the chunk btree, as we
|
|
Filipe Manana |
9d5674 |
- * could deadlock on an extent buffer since our caller may be
|
|
Filipe Manana |
9d5674 |
- * COWing an extent buffer from the chunk btree.
|
|
Filipe Manana |
9d5674 |
*/
|
|
Filipe Manana |
9d5674 |
bg = btrfs_alloc_chunk(trans, flags);
|
|
Filipe Manana |
9d5674 |
if (IS_ERR(bg)) {
|
|
Filipe Manana |
9d5674 |
ret = PTR_ERR(bg);
|
|
Filipe Manana |
9d5674 |
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
|
|
Filipe Manana |
9d5674 |
+ } else {
|
|
Filipe Manana |
9d5674 |
/*
|
|
Filipe Manana |
9d5674 |
* If we fail to add the chunk item here, we end up
|
|
Filipe Manana |
9d5674 |
* trying again at phase 2 of chunk allocation, at
|
|
Filipe Manana |
9d5674 |
* btrfs_create_pending_block_groups(). So ignore
|
|
Filipe Manana |
9d5674 |
- * any error here.
|
|
Filipe Manana |
9d5674 |
+ * any error here. An ENOSPC here could happen, due to
|
|
Filipe Manana |
9d5674 |
+ * the cases described at do_chunk_alloc() - the system
|
|
Filipe Manana |
9d5674 |
+ * block group we just created was just turned into RO
|
|
Filipe Manana |
9d5674 |
+ * mode by a scrub for example, or a running discard
|
|
Filipe Manana |
9d5674 |
+ * temporarily removed its free space entries, etc.
|
|
Filipe Manana |
9d5674 |
*/
|
|
Filipe Manana |
9d5674 |
btrfs_chunk_alloc_add_chunk_item(trans, bg);
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
@@ -3357,12 +3344,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
|
Filipe Manana |
9d5674 |
if (!ret) {
|
|
Filipe Manana |
9d5674 |
ret = btrfs_block_rsv_add(fs_info->chunk_root,
|
|
Filipe Manana |
9d5674 |
&fs_info->chunk_block_rsv,
|
|
Filipe Manana |
9d5674 |
- thresh, BTRFS_RESERVE_NO_FLUSH);
|
|
Filipe Manana |
9d5674 |
+ bytes, BTRFS_RESERVE_NO_FLUSH);
|
|
Filipe Manana |
9d5674 |
if (!ret)
|
|
Filipe Manana |
9d5674 |
- trans->chunk_bytes_reserved += thresh;
|
|
Filipe Manana |
9d5674 |
+ trans->chunk_bytes_reserved += bytes;
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
+/*
|
|
Filipe Manana |
9d5674 |
+ * Reserve space in the system space for allocating or removing a chunk.
|
|
Filipe Manana |
9d5674 |
+ * The caller must be holding fs_info->chunk_mutex.
|
|
Filipe Manana |
9d5674 |
+ */
|
|
Filipe Manana |
9d5674 |
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
|
Filipe Manana |
9d5674 |
+{
|
|
Filipe Manana |
9d5674 |
+ struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
Filipe Manana |
9d5674 |
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
|
|
Filipe Manana |
9d5674 |
+ u64 bytes;
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
+ /* num_devs device items to update and 1 chunk item to add or remove. */
|
|
Filipe Manana |
9d5674 |
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
|
|
Filipe Manana |
9d5674 |
+ btrfs_calc_insert_metadata_size(fs_info, 1);
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
+ reserve_chunk_space(trans, bytes, type);
|
|
Filipe Manana |
9d5674 |
+}
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
+/*
|
|
Filipe Manana |
9d5674 |
+ * Reserve space in the system space, if needed, for doing a modification to the
|
|
Filipe Manana |
9d5674 |
+ * chunk btree.
|
|
Filipe Manana |
9d5674 |
+ *
|
|
Filipe Manana |
9d5674 |
+ * @trans: A transaction handle.
|
|
Filipe Manana |
9d5674 |
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
|
|
Filipe Manana |
9d5674 |
+ * in the chunk btree or if it's for the deletion or update
|
|
Filipe Manana |
9d5674 |
+ * of an existing item.
|
|
Filipe Manana |
9d5674 |
+ *
|
|
Filipe Manana |
9d5674 |
+ * This is used in a context where we need to update the chunk btree outside
|
|
Filipe Manana |
9d5674 |
+ * block group allocation and removal, to avoid a deadlock with a concurrent
|
|
Filipe Manana |
9d5674 |
+ * task that is allocating a metadata or data block group and therefore needs to
|
|
Filipe Manana |
9d5674 |
+ * update the chunk btree while holding the chunk mutex. After the update to the
|
|
Filipe Manana |
9d5674 |
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
|
|
Filipe Manana |
9d5674 |
+ *
|
|
Filipe Manana |
9d5674 |
+ */
|
|
Filipe Manana |
9d5674 |
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
+ bool is_item_insertion)
|
|
Filipe Manana |
9d5674 |
+{
|
|
Filipe Manana |
9d5674 |
+ struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
Filipe Manana |
9d5674 |
+ u64 bytes;
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
+ if (is_item_insertion)
|
|
Filipe Manana |
9d5674 |
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
|
|
Filipe Manana |
9d5674 |
+ else
|
|
Filipe Manana |
9d5674 |
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
+ mutex_lock(&fs_info->chunk_mutex);
|
|
Filipe Manana |
9d5674 |
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
|
|
Filipe Manana |
9d5674 |
+ mutex_unlock(&fs_info->chunk_mutex);
|
|
Filipe Manana |
9d5674 |
+}
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
|
|
Filipe Manana |
9d5674 |
{
|
|
Filipe Manana |
9d5674 |
struct btrfs_block_group *block_group;
|
|
Filipe Manana |
9d5674 |
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
|
|
Filipe Manana |
9d5674 |
index fb10eea11..970c79577 100644
|
|
Filipe Manana |
9d5674 |
--- a/fs/btrfs/block-group.h
|
|
Filipe Manana |
9d5674 |
+++ b/fs/btrfs/block-group.h
|
|
Filipe Manana |
9d5674 |
@@ -237,6 +237,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
|
|
Filipe Manana |
9d5674 |
enum btrfs_chunk_alloc_enum force);
|
|
Filipe Manana |
9d5674 |
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
|
|
Filipe Manana |
9d5674 |
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
|
|
Filipe Manana |
9d5674 |
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
+ bool is_item_insertion);
|
|
Filipe Manana |
9d5674 |
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
|
|
Filipe Manana |
9d5674 |
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
|
|
Filipe Manana |
9d5674 |
int btrfs_free_block_groups(struct btrfs_fs_info *info);
|
|
Filipe Manana |
9d5674 |
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
|
|
Filipe Manana |
9d5674 |
index c16181c76..1a50b8071 100644
|
|
Filipe Manana |
9d5674 |
--- a/fs/btrfs/relocation.c
|
|
Filipe Manana |
9d5674 |
+++ b/fs/btrfs/relocation.c
|
|
Filipe Manana |
9d5674 |
@@ -3199,8 +3199,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
list_add_tail(&node->list, &rc->backref_cache.changed);
|
|
Filipe Manana |
9d5674 |
} else {
|
|
Filipe Manana |
9d5674 |
path->lowest_level = node->level;
|
|
Filipe Manana |
9d5674 |
+ if (root == root->fs_info->chunk_root)
|
|
Filipe Manana |
9d5674 |
+ btrfs_reserve_chunk_metadata(trans, false);
|
|
Filipe Manana |
9d5674 |
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
|
|
Filipe Manana |
9d5674 |
btrfs_release_path(path);
|
|
Filipe Manana |
9d5674 |
+ if (root == root->fs_info->chunk_root)
|
|
Filipe Manana |
9d5674 |
+ btrfs_trans_release_chunk_metadata(trans);
|
|
Filipe Manana |
9d5674 |
if (ret > 0)
|
|
Filipe Manana |
9d5674 |
ret = 0;
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
Filipe Manana |
9d5674 |
index 71e993d94..c5849db6e 100644
|
|
Filipe Manana |
9d5674 |
--- a/fs/btrfs/volumes.c
|
|
Filipe Manana |
9d5674 |
+++ b/fs/btrfs/volumes.c
|
|
Filipe Manana |
9d5674 |
@@ -1992,8 +1992,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
key.type = BTRFS_DEV_ITEM_KEY;
|
|
Filipe Manana |
9d5674 |
key.offset = device->devid;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
+ btrfs_reserve_chunk_metadata(trans, true);
|
|
Filipe Manana |
9d5674 |
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
|
|
Filipe Manana |
9d5674 |
&key, sizeof(*dev_item));
|
|
Filipe Manana |
9d5674 |
+ btrfs_trans_release_chunk_metadata(trans);
|
|
Filipe Manana |
9d5674 |
if (ret)
|
|
Filipe Manana |
9d5674 |
goto out;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
@@ -2064,7 +2066,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
|
|
Filipe Manana |
9d5674 |
key.type = BTRFS_DEV_ITEM_KEY;
|
|
Filipe Manana |
9d5674 |
key.offset = device->devid;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
+ btrfs_reserve_chunk_metadata(trans, false);
|
|
Filipe Manana |
9d5674 |
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
|
Filipe Manana |
9d5674 |
+ btrfs_trans_release_chunk_metadata(trans);
|
|
Filipe Manana |
9d5674 |
if (ret) {
|
|
Filipe Manana |
9d5674 |
if (ret > 0)
|
|
Filipe Manana |
9d5674 |
ret = -ENOENT;
|
|
Filipe Manana |
9d5674 |
@@ -2590,7 +2594,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
|
|
Filipe Manana |
9d5674 |
key.type = BTRFS_DEV_ITEM_KEY;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
while (1) {
|
|
Filipe Manana |
9d5674 |
+ btrfs_reserve_chunk_metadata(trans, false);
|
|
Filipe Manana |
9d5674 |
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
|
|
Filipe Manana |
9d5674 |
+ btrfs_trans_release_chunk_metadata(trans);
|
|
Filipe Manana |
9d5674 |
if (ret < 0)
|
|
Filipe Manana |
9d5674 |
goto error;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
@@ -2918,6 +2924,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
struct btrfs_super_block *super_copy = fs_info->super_copy;
|
|
Filipe Manana |
9d5674 |
u64 old_total;
|
|
Filipe Manana |
9d5674 |
u64 diff;
|
|
Filipe Manana |
9d5674 |
+ int ret;
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
|
|
Filipe Manana |
9d5674 |
return -EACCES;
|
|
Filipe Manana |
9d5674 |
@@ -2946,7 +2953,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
|
|
Filipe Manana |
9d5674 |
&trans->transaction->dev_update_list);
|
|
Filipe Manana |
9d5674 |
mutex_unlock(&fs_info->chunk_mutex);
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
- return btrfs_update_device(trans, device);
|
|
Filipe Manana |
9d5674 |
+ btrfs_reserve_chunk_metadata(trans, false);
|
|
Filipe Manana |
9d5674 |
+ ret = btrfs_update_device(trans, device);
|
|
Filipe Manana |
9d5674 |
+ btrfs_trans_release_chunk_metadata(trans);
|
|
Filipe Manana |
9d5674 |
+
|
|
Filipe Manana |
9d5674 |
+ return ret;
|
|
Filipe Manana |
9d5674 |
}
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
|
|
Filipe Manana |
9d5674 |
@@ -5032,8 +5043,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
|
|
Filipe Manana |
9d5674 |
round_down(old_total - diff, fs_info->sectorsize));
|
|
Filipe Manana |
9d5674 |
mutex_unlock(&fs_info->chunk_mutex);
|
|
Filipe Manana |
9d5674 |
|
|
Filipe Manana |
9d5674 |
+ btrfs_reserve_chunk_metadata(trans, false);
|
|
Filipe Manana |
9d5674 |
/* Now btrfs_update_device() will change the on-disk size. */
|
|
Filipe Manana |
9d5674 |
ret = btrfs_update_device(trans, device);
|
|
Filipe Manana |
9d5674 |
+ btrfs_trans_release_chunk_metadata(trans);
|
|
Filipe Manana |
9d5674 |
if (ret < 0) {
|
|
Filipe Manana |
9d5674 |
btrfs_abort_transaction(trans, ret);
|
|
Filipe Manana |
9d5674 |
btrfs_end_transaction(trans);
|
|
Filipe Manana |
9d5674 |
--
|
|
Filipe Manana |
9d5674 |
2.26.2
|
|
Filipe Manana |
9d5674 |
|