From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 13 Mar 2020 15:58:05 -0400
Subject: btrfs: Improve global reserve stealing logic
Patch-mainline: Submitted, https://lore.kernel.org/linux-btrfs/20200313195809.141753-1-josef@toxicpanda.com/
References: bsc#1165949

For unlink transactions and block group removal,
btrfs_start_transaction_fallback_global_rsv will first try to start
an ordinary transaction and, if that fails, fall back to reserving
the required amount by stealing from the global reserve. This is
problematic for all the same reasons we had with previous iterations
of the ENOSPC handling: the thundering herd.  We get a bunch of
failures all at once, everybody tries to allocate from the global
reserve, some win and some lose, and we get an ENOSPC.

Fix this behavior by introducing BTRFS_RESERVE_FLUSH_ALL_STEAL, which
is used to mark unlink reservations, and integrate the stealing logic
into the normal ENOSPC infrastructure.  We still go through all of
the normal flushing work, and at the moment we begin to fail all the
tickets, we try to satisfy any tickets that are allowed to steal by
stealing from the global reserve.  If this works we start the flushing
system over again just like we would with a normal ticket satisfaction.
This serializes our global reserve stealing, so we don't have the
thundering herd problem.
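
To illustrate the cutoff steal_from_global_rsv() uses below, here is a
minimal stand-alone sketch (user-space C, not kernel code; the toy_rsv
and toy_steal names are made up for illustration, and div_factor(num, f)
mirrors the kernel helper's num * f / 10 semantics):

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_rsv {
		unsigned long long size;	/* total size of the reserve */
		unsigned long long reserved;	/* bytes currently reserved */
	};

	/* div_factor(num, 5) is num * 5 / 10, i.e. 50% of the reserve size */
	static unsigned long long div_factor(unsigned long long num, int factor)
	{
		return num * factor / 10;
	}

	/* a ticket is satisfied only if the steal keeps min_bytes reserved */
	static bool toy_steal(struct toy_rsv *rsv, unsigned long long ticket_bytes)
	{
		unsigned long long min_bytes = div_factor(rsv->size, 5);

		if (rsv->reserved < min_bytes + ticket_bytes)
			return false;
		rsv->reserved -= ticket_bytes;
		return true;
	}

	int main(void)
	{
		struct toy_rsv rsv = { .size = 512ULL << 20, .reserved = 400ULL << 20 };

		/* 400M covers 256M + 64M, so this steal is allowed */
		printf("steal 64M:  %d\n", toy_steal(&rsv, 64ULL << 20));
		/* 336M no longer covers 256M + 128M, so stealing is refused */
		printf("steal 128M: %d\n", toy_steal(&rsv, 128ULL << 20));
		return 0;
	}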

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Acked-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/block-group.c |    2 +-
 fs/btrfs/ctree.h       |    1 +
 fs/btrfs/inode.c       |    2 +-
 fs/btrfs/space-info.c  |   36 +++++++++++++++++++++++++++++++++++-
 fs/btrfs/space-info.h  |    1 +
 fs/btrfs/transaction.c |   42 +++++-------------------------------------
 fs/btrfs/transaction.h |    3 +--
 7 files changed, 45 insertions(+), 42 deletions(-)

--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1152,7 +1152,7 @@ struct btrfs_trans_handle *btrfs_start_t
 	free_extent_map(em);

 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
-							   num_items, 1);
+							   num_items);
 }

 /*
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2575,6 +2575,7 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_LIMIT,
 	BTRFS_RESERVE_FLUSH_EVICT,
 	BTRFS_RESERVE_FLUSH_ALL,
+	BTRFS_RESERVE_FLUSH_ALL_STEAL,
 };

 enum btrfs_flush_state {
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4085,7 +4085,7 @@ static struct btrfs_trans_handle *__unli
 	 * 1 for the inode ref
 	 * 1 for the inode
 	 */
-	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+	return btrfs_start_transaction_fallback_global_rsv(root, 5);
 }

 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -696,6 +696,34 @@ static inline int need_do_async_reclaim(
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }

+static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
+				  struct btrfs_space_info *space_info,
+				  struct reserve_ticket *ticket)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 min_bytes;
+
+	if (global_rsv->space_info != space_info)
+		return false;
+
+	spin_lock(&global_rsv->lock);
+	min_bytes = div_factor(global_rsv->size, 5);
+	if (global_rsv->reserved < min_bytes + ticket->bytes) {
+		spin_unlock(&global_rsv->lock);
+		return false;
+	}
+	global_rsv->reserved -= ticket->bytes;
+	ticket->bytes = 0;
+	list_del_init(&ticket->list);
+	wake_up(&ticket->wait);
+	space_info->tickets_id++;
+	if (global_rsv->reserved < global_rsv->size)
+		global_rsv->full = 0;
+	spin_unlock(&global_rsv->lock);
+
+	return true;
+}
+
 /*
  * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
  * @fs_info - fs_info for this fs
@@ -728,6 +756,9 @@ static bool maybe_fail_all_tickets(struc
 		ticket = list_first_entry(&space_info->tickets,
 					  struct reserve_ticket, list);

+		if (ticket->steal &&
+		    steal_from_global_rsv(fs_info, space_info, ticket))
+			return true;
 		/*
 		 * may_commit_transaction will avoid committing the transaction
 		 * if it doesn't feel like the space reclaimed by the commit
@@ -951,6 +982,7 @@ static int handle_reserve_ticket(struct

 	switch (flush) {
 	case BTRFS_RESERVE_FLUSH_ALL:
+	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
 		wait_reserve_ticket(fs_info, space_info, ticket);
 		break;
 	case BTRFS_RESERVE_FLUSH_LIMIT:
@@ -1049,7 +1081,9 @@ static int __reserve_metadata_bytes(stru
 		ticket.bytes = orig_bytes;
 		ticket.error = 0;
 		init_waitqueue_head(&ticket.wait);
-		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+		ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
+		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
 			list_add_tail(&ticket.list, &space_info->tickets);
 			if (!space_info->flush) {
 				space_info->flush = 1;
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -72,6 +72,7 @@ struct btrfs_space_info {
 struct reserve_ticket {
 	u64 bytes;
 	int error;
+	bool steal;
 	struct list_head list;
 	wait_queue_head_t wait;
 };
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -544,7 +544,8 @@ start_transaction(struct btrfs_root *roo
 		 * refill that amount for whatever is missing in the reserve.
 		 */
 		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
-		if (delayed_refs_rsv->full == 0) {
+		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+		    delayed_refs_rsv->full == 0) {
 			delayed_refs_bytes = num_bytes;
 			num_bytes <<= 1;
 		}
@@ -670,43 +671,10 @@ struct btrfs_trans_handle *btrfs_start_t

 struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
 					struct btrfs_root *root,
-					unsigned int num_items,
-					int min_factor)
+					unsigned int num_items)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_trans_handle *trans;
-	u64 num_bytes;
-	int ret;
-
-	/*
-	 * We have two callers: unlink and block group removal.  The
-	 * former should succeed even if we will temporarily exceed
-	 * quota and the latter operates on the extent root so
-	 * qgroup enforcement is ignored anyway.
-	 */
-	trans = start_transaction(root, num_items, TRANS_START,
-				  BTRFS_RESERVE_FLUSH_ALL, false);
-	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
-		return trans;
-
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans))
-		return trans;
-
-	num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
-	ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
-				       num_bytes, min_factor);
-	if (ret) {
-		btrfs_end_transaction(trans);
-		return ERR_PTR(ret);
-	}
-
-	trans->block_rsv = &fs_info->trans_block_rsv;
-	trans->bytes_reserved = num_bytes;
-	trace_btrfs_space_reservation(fs_info, "transaction",
-				      trans->transid, num_bytes, 1);
-
-	return trans;
+	return start_transaction(root, num_items, TRANS_START,
+				 BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
 }

 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -185,8 +185,7 @@ struct btrfs_trans_handle *btrfs_start_t
 						   unsigned int num_items);
 struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
 					struct btrfs_root *root,
-					unsigned int num_items,
-					int min_factor);
+					unsigned int num_items);
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
 					struct btrfs_root *root,
 					unsigned int num_items);