|
Filipe Manana |
9be975 |
From: Filipe Manana <fdmanana@suse.com>
|
|
Filipe Manana |
9be975 |
Date: Mon, 5 Apr 2021 12:32:16 +0100
|
|
Filipe Manana |
9be975 |
Git-commit: 061dde8245356d8864d29e25207aa4daa0be4d3c
|
|
Filipe Manana |
9be975 |
Patch-mainline: v5.13-rc1
|
|
Filipe Manana |
9be975 |
Subject: [PATCH] btrfs: fix race between transaction aborts and fsyncs leading
|
|
Filipe Manana |
9be975 |
to use-after-free
|
|
Filipe Manana |
9be975 |
References: bsc#1186441
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
There is a race between a task aborting a transaction during a commit,
|
|
Filipe Manana |
9be975 |
a task doing an fsync and the transaction kthread, which leads to an
|
|
Filipe Manana |
9be975 |
use-after-free of the log root tree. When this happens, it results in a
|
|
Filipe Manana |
9be975 |
stack trace like the following:
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
BTRFS info (device dm-0): forced readonly
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): Skipping commit of aborted transaction.
|
|
Filipe Manana |
9be975 |
BTRFS: error (device dm-0) in cleanup_transaction:1958: errno=-5 IO failure
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): lost page write due to IO error on /dev/mapper/error-test (-5)
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): Skipping commit of aborted transaction.
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): direct IO failed ino 261 rw 0,0 sector 0xa4e8 len 4096 err no 10
|
|
Filipe Manana |
9be975 |
BTRFS error (device dm-0): error writing primary super block to device 1
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): direct IO failed ino 261 rw 0,0 sector 0x12e000 len 4096 err no 10
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): direct IO failed ino 261 rw 0,0 sector 0x12e008 len 4096 err no 10
|
|
Filipe Manana |
9be975 |
BTRFS warning (device dm-0): direct IO failed ino 261 rw 0,0 sector 0x12e010 len 4096 err no 10
|
|
Filipe Manana |
9be975 |
BTRFS: error (device dm-0) in write_all_supers:4110: errno=-5 IO failure (1 errors while writing supers)
|
|
Filipe Manana |
9be975 |
BTRFS: error (device dm-0) in btrfs_sync_log:3308: errno=-5 IO failure
|
|
Filipe Manana |
9be975 |
general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6b68: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
|
|
Filipe Manana |
9be975 |
CPU: 2 PID: 2458471 Comm: fsstress Not tainted 5.12.0-rc5-btrfs-next-84 #1
|
|
Filipe Manana |
9be975 |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
|
|
Filipe Manana |
9be975 |
RIP: 0010:__mutex_lock+0x139/0xa40
|
|
Filipe Manana |
9be975 |
Code: c0 74 19 (...)
|
|
Filipe Manana |
9be975 |
RSP: 0018:ffff9f18830d7b00 EFLAGS: 00010202
|
|
Filipe Manana |
9be975 |
RAX: 6b6b6b6b6b6b6b68 RBX: 0000000000000001 RCX: 0000000000000002
|
|
Filipe Manana |
9be975 |
RDX: ffffffffb9c54d13 RSI: 0000000000000000 RDI: 0000000000000000
|
|
Filipe Manana |
9be975 |
RBP: ffff9f18830d7bc0 R08: 0000000000000000 R09: 0000000000000000
|
|
Filipe Manana |
9be975 |
R10: ffff9f18830d7be0 R11: 0000000000000001 R12: ffff8c6cd199c040
|
|
Filipe Manana |
9be975 |
R13: ffff8c6c95821358 R14: 00000000fffffffb R15: ffff8c6cbcf01358
|
|
Filipe Manana |
9be975 |
FS: 00007fa9140c2b80(0000) GS:ffff8c6fac600000(0000) knlGS:0000000000000000
|
|
Filipe Manana |
9be975 |
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|
|
Filipe Manana |
9be975 |
CR2: 00007fa913d52000 CR3: 000000013d2b4003 CR4: 0000000000370ee0
|
|
Filipe Manana |
9be975 |
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
|
|
Filipe Manana |
9be975 |
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
|
|
Filipe Manana |
9be975 |
Call Trace:
|
|
Filipe Manana |
9be975 |
? __btrfs_handle_fs_error+0xde/0x146 [btrfs]
|
|
Filipe Manana |
9be975 |
? btrfs_sync_log+0x7c1/0xf20 [btrfs]
|
|
Filipe Manana |
9be975 |
? btrfs_sync_log+0x7c1/0xf20 [btrfs]
|
|
Filipe Manana |
9be975 |
btrfs_sync_log+0x7c1/0xf20 [btrfs]
|
|
Filipe Manana |
9be975 |
btrfs_sync_file+0x40c/0x580 [btrfs]
|
|
Filipe Manana |
9be975 |
do_fsync+0x38/0x70
|
|
Filipe Manana |
9be975 |
__x64_sys_fsync+0x10/0x20
|
|
Filipe Manana |
9be975 |
do_syscall_64+0x33/0x80
|
|
Filipe Manana |
9be975 |
entry_SYSCALL_64_after_hwframe+0x44/0xae
|
|
Filipe Manana |
9be975 |
RIP: 0033:0x7fa9142a55c3
|
|
Filipe Manana |
9be975 |
Code: 8b 15 09 (...)
|
|
Filipe Manana |
9be975 |
RSP: 002b:00007fff26278d48 EFLAGS: 00000246 ORIG_RAX: 000000000000004a
|
|
Filipe Manana |
9be975 |
RAX: ffffffffffffffda RBX: 0000563c83cb4560 RCX: 00007fa9142a55c3
|
|
Filipe Manana |
9be975 |
RDX: 00007fff26278cb0 RSI: 00007fff26278cb0 RDI: 0000000000000005
|
|
Filipe Manana |
9be975 |
RBP: 0000000000000005 R08: 0000000000000001 R09: 00007fff26278d5c
|
|
Filipe Manana |
9be975 |
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000340
|
|
Filipe Manana |
9be975 |
R13: 00007fff26278de0 R14: 00007fff26278d96 R15: 0000563c83ca57c0
|
|
Filipe Manana |
9be975 |
Modules linked in: btrfs dm_zero dm_snapshot dm_thin_pool (...)
|
|
Filipe Manana |
9be975 |
---[ end trace ee2f1b19327d791d ]---
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
The steps that lead to this crash are the following:
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
1) We are at transaction N;
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
2) We have two tasks with a transaction handle attached to transaction N.
|
|
Filipe Manana |
9be975 |
Task A and Task B. Task B is doing an fsync;
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
3) Task B is at btrfs_sync_log(), and has saved fs_info->log_root_tree
|
|
Filipe Manana |
9be975 |
into a local variable named 'log_root_tree' at the top of
|
|
Filipe Manana |
9be975 |
btrfs_sync_log(). Task B is about to call write_all_supers(), but
|
|
Filipe Manana |
9be975 |
before that...
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
4) Task A calls btrfs_commit_transaction(), and after it sets the
|
|
Filipe Manana |
9be975 |
transaction state to TRANS_STATE_COMMIT_START, an error happens before
|
|
Filipe Manana |
9be975 |
it waits for the transaction's 'num_writers' counter to reach a value
|
|
Filipe Manana |
9be975 |
of 1 (no one else attached to the transaction), so it jumps to the
|
|
Filipe Manana |
9be975 |
label "cleanup_transaction";
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
5) Task A then calls cleanup_transaction(), where it aborts the
|
|
Filipe Manana |
9be975 |
transaction, setting BTRFS_FS_STATE_TRANS_ABORTED on fs_info->fs_state,
|
|
Filipe Manana |
9be975 |
setting the ->aborted field of the transaction and the handle to an
|
|
Filipe Manana |
9be975 |
errno value and also setting BTRFS_FS_STATE_ERROR on fs_info->fs_state.
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
After that, at cleanup_transaction(), it deletes the transaction from
|
|
Filipe Manana |
9be975 |
the list of transactions (fs_info->trans_list), sets the transaction
|
|
Filipe Manana |
9be975 |
to the state TRANS_STATE_COMMIT_DOING and then waits for the number
|
|
Filipe Manana |
9be975 |
of writers to go down to 1, as it's currently 2 (1 for task A and 1
|
|
Filipe Manana |
9be975 |
for task B);
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
6) The transaction kthread is running and sees that BTRFS_FS_STATE_ERROR
|
|
Filipe Manana |
9be975 |
is set in fs_info->fs_state, so it calls btrfs_cleanup_transaction().
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
There it sees the list fs_info->trans_list is empty, and then proceeds
|
|
Filipe Manana |
9be975 |
into calling btrfs_drop_all_logs(), which frees the log root tree with
|
|
Filipe Manana |
9be975 |
a call to btrfs_free_log_root_tree();
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
7) Task B calls write_all_supers() and, shortly after, under the label
|
|
Filipe Manana |
9be975 |
'out_wake_log_root', it deferences the pointer stored in
|
|
Filipe Manana |
9be975 |
'log_root_tree', which was already freed in the previous step by the
|
|
Filipe Manana |
9be975 |
transaction kthread. This results in a use-after-free leading to a
|
|
Filipe Manana |
9be975 |
crash.
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
Fix this by deleting the transaction from the list of transactions at
|
|
Filipe Manana |
9be975 |
cleanup_transaction() only after setting the transaction state to
|
|
Filipe Manana |
9be975 |
TRANS_STATE_COMMIT_DOING and waiting for all existing tasks that are
|
|
Filipe Manana |
9be975 |
attached to the transaction to release their transaction handles.
|
|
Filipe Manana |
9be975 |
This makes the transaction kthread wait for all the tasks attached to
|
|
Filipe Manana |
9be975 |
the transaction to be done with the transaction before dropping the
|
|
Filipe Manana |
9be975 |
log roots and doing other cleanups.
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
Fixes: ef67963dac255b ("btrfs: drop logs when we've aborted a transaction")
|
|
Filipe Manana |
9be975 |
CC: stable@vger.kernel.org # 5.10+
|
|
Filipe Manana |
9be975 |
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
|
|
Filipe Manana |
9be975 |
Signed-off-by: Filipe Manana <fdmanana@suse.com>
|
|
Filipe Manana |
9be975 |
Signed-off-by: David Sterba <dsterba@suse.com>
|
|
Filipe Manana |
9be975 |
---
|
|
Filipe Manana |
9be975 |
fs/btrfs/transaction.c | 12 +++++++++++-
|
|
Filipe Manana |
9be975 |
1 file changed, 11 insertions(+), 1 deletion(-)
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
|
|
Filipe Manana |
9be975 |
index 62f5195d0076..f75de9f6c0ad 100644
|
|
Filipe Manana |
9be975 |
--- a/fs/btrfs/transaction.c
|
|
Filipe Manana |
9be975 |
+++ b/fs/btrfs/transaction.c
|
|
Filipe Manana |
9be975 |
@@ -1986,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
|
|
Filipe Manana |
9be975 |
*/
|
|
Filipe Manana |
9be975 |
BUG_ON(list_empty(&cur_trans->list));
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
- list_del_init(&cur_trans->list);
|
|
Filipe Manana |
9be975 |
if (cur_trans == fs_info->running_transaction) {
|
|
Filipe Manana |
9be975 |
cur_trans->state = TRANS_STATE_COMMIT_DOING;
|
|
Filipe Manana |
9be975 |
spin_unlock(&fs_info->trans_lock);
|
|
Filipe Manana |
9be975 |
@@ -1995,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
spin_lock(&fs_info->trans_lock);
|
|
Filipe Manana |
9be975 |
}
|
|
Filipe Manana |
9be975 |
+
|
|
Filipe Manana |
9be975 |
+ /*
|
|
Filipe Manana |
9be975 |
+ * Now that we know no one else is still using the transaction we can
|
|
Filipe Manana |
9be975 |
+ * remove the transaction from the list of transactions. This avoids
|
|
Filipe Manana |
9be975 |
+ * the transaction kthread from cleaning up the transaction while some
|
|
Filipe Manana |
9be975 |
+ * other task is still using it, which could result in a use-after-free
|
|
Filipe Manana |
9be975 |
+ * on things like log trees, as it forces the transaction kthread to
|
|
Filipe Manana |
9be975 |
+ * wait for this transaction to be cleaned up by us.
|
|
Filipe Manana |
9be975 |
+ */
|
|
Filipe Manana |
9be975 |
+ list_del_init(&cur_trans->list);
|
|
Filipe Manana |
9be975 |
+
|
|
Filipe Manana |
9be975 |
spin_unlock(&fs_info->trans_lock);
|
|
Filipe Manana |
9be975 |
|
|
Filipe Manana |
9be975 |
btrfs_cleanup_one_transaction(trans->transaction, fs_info);
|
|
Filipe Manana |
9be975 |
--
|
|
Filipe Manana |
9be975 |
2.26.2
|
|
Filipe Manana |
9be975 |
|