From 0dce1ddefc673a3f39b4964d6b669e2a11aaf903 Mon Sep 17 00:00:00 2001
From: Quentin Bouget
Date: Fri, 4 May 2018 14:53:12 +0200
Subject: [PATCH] LU-8324 hsm: prioritize one RESTORE once in a while

Currently, HSM requests are run in the order they are submitted to the
coordinator. This has the undesirable side effect that the more
interactive requests (RESTORE / REMOVE) can take quite a while to run
when a huge batch of ARCHIVE requests is already queued.

This patch is not a clean fix for LU-8324; it is merely an attempt at
making things bearable while a proper solution is being developed.

Test-Parameters: trivial testlist=sanity-hsm
Signed-off-by: Quentin Bouget
Change-Id: I8697ffae19b28f31901d9e61cce55b40f848fb51
Reviewed-on: https://review.whamcloud.com/31723
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: John L. Hammond
Reviewed-by: Oleg Drokin
---
 lustre/mdt/mdt_coordinator.c |  88 ++++++++++++++-----
 lustre/tests/sanity-hsm.sh   | 196 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 261 insertions(+), 23 deletions(-)

diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c
index 9cb5f1e..cc5910d 100644
--- a/lustre/mdt/mdt_coordinator.c
+++ b/lustre/mdt/mdt_coordinator.c
@@ -142,6 +142,7 @@ struct hsm_scan_data {
 	 * for new work? */
 	bool			 hsd_housekeeping;
+	bool			 hsd_one_restore;
 	int			 hsd_action_count;
 	int			 hsd_request_len; /* array alloc len */
 	int			 hsd_request_count; /* array used count */
@@ -164,19 +165,16 @@ static int mdt_cdt_waiting_cb(const struct lu_env *env,
 	/* Are agents full? */
 	if (hsd->hsd_action_count + atomic_read(&cdt->cdt_request_count) >=
 	    cdt->cdt_max_requests) {
-		if (hsd->hsd_housekeeping) {
-			/* Unknown request and no more room for a new
-			 * request. Continue to scan to find other
-			 * entries for already existing requests. */
-			RETURN(0);
-		} else {
-			/* We cannot send and more requests, stop
-			 * here. There might be more known requests
-			 * that could be merged, but this avoid
-			 * analyzing too many llogs for minor
-			 * gains. */
-			RETURN(LLOG_PROC_BREAK);
-		}
+		/* We cannot send any more requests.
+		 *
+		 * *** SPECIAL CASE ***
+		 *
+		 * Restore requests are too important not to schedule at least
+		 * one every time we can.
+		 */
+		if (larr->arr_hai.hai_action != HSMA_RESTORE ||
+		    hsd->hsd_one_restore)
+			RETURN(hsd->hsd_housekeeping ? 0 : LLOG_PROC_BREAK);
 	}
 
 	hai_size = cfs_size_round(larr->arr_hai.hai_len);
@@ -193,17 +191,53 @@ static int mdt_cdt_waiting_cb(const struct lu_env *env,
 		}
 	}
 
-	if (!request) {
-		struct hsm_action_list *hal;
+	/* Are we trying to force-schedule a request? */
+	if (hsd->hsd_action_count + atomic_read(&cdt->cdt_request_count) >=
+	    cdt->cdt_max_requests) {
+		/* Is there really no compatible hsm_scan_request? */
+		if (!request) {
+			for (i -= 1; i >= 0; i--) {
+				if (hsd->hsd_request[i].hal->hal_archive_id ==
+				    archive_id) {
+					request = &hsd->hsd_request[i];
+					break;
+				}
+			}
+		}
+
+		/* Make room for the hai */
+		if (request) {
+			/* Discard the last hai until there is enough space */
+			do {
+				request->hal->hal_count--;
+
+				hai = hai_first(request->hal);
+				for (i = 0; i < request->hal->hal_count; i++)
+					hai = hai_next(hai);
+
+				request->hal_used_sz -=
+					cfs_size_round(hai->hai_len);
+				hsd->hsd_action_count--;
+			} while (request->hal_used_sz + hai_size >
+				 LDLM_MAXREQSIZE);
+		} else if (hsd->hsd_housekeeping) {
+			struct hsm_scan_request *tmp;
+
+			/* Discard the (whole) last hal */
+			hsd->hsd_request_count--;
+			tmp = &hsd->hsd_request[hsd->hsd_request_count];
+			hsd->hsd_action_count -= tmp->hal->hal_count;
+			OBD_FREE(tmp->hal, tmp->hal_sz);
+		} else {
+			/* Bailing out, this code path is too hot */
+			RETURN(LLOG_PROC_BREAK);
 
-		if (hsd->hsd_request_count == hsd->hsd_request_len) {
-			/* Logic as above. */
-			if (hsd->hsd_housekeeping)
-				RETURN(0);
-			else
-				RETURN(LLOG_PROC_BREAK);
 		}
+	}
+
+	if (!request) {
+		struct hsm_action_list *hal;
+
 		LASSERT(hsd->hsd_request_count < hsd->hsd_request_len);
 		request = &hsd->hsd_request[hsd->hsd_request_count];
 		/* allocates hai vector size just needs to be large
@@ -246,15 +280,22 @@ static int mdt_cdt_waiting_cb(const struct lu_env *env,
 
 	memcpy(hai, &larr->arr_hai, larr->arr_hai.hai_len);
 
-	request->hal_used_sz += cfs_size_round(hai->hai_len);
+	request->hal_used_sz += hai_size;
 	request->hal->hal_count++;
 
 	hsd->hsd_action_count++;
 
-	if (hai->hai_action != HSMA_CANCEL)
+	switch (hai->hai_action) {
+	case HSMA_CANCEL:
+		break;
+	case HSMA_RESTORE:
+		hsd->hsd_one_restore = true;
+		/* Intentional fallthrough */
+	default:
 		cdt_agent_record_hash_add(cdt, hai->hai_cookie,
 					  llh->lgh_hdr->llh_cat_idx,
 					  larr->arr_hdr.lrh_index);
+	}
 
 	RETURN(0);
 }
@@ -646,6 +687,7 @@ static int mdt_coordinator(void *data)
 
 		hsd.hsd_action_count = 0;
 		hsd.hsd_request_count = 0;
+		hsd.hsd_one_restore = false;
 
 		rc = cdt_llog_process(mti->mti_env, mdt, mdt_coordinator_cb,
 				      &hsd, 0, 0, WRITE);
diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh
index c178c08..7816583 100755
--- a/lustre/tests/sanity-hsm.sh
+++ b/lustre/tests/sanity-hsm.sh
@@ -4595,6 +4595,202 @@ test_254b()
 }
 run_test 254b "Request counters are correctly incremented and decremented"
 
+# Tests 260[a-c] rely on parsing the copytool's log file; they might
+# break in the future because of that.
+test_260a()
+{
+	local -a files=("$DIR/$tdir/$tfile".{0..15})
+	local file
+
+	for file in "${files[@]}"; do
+		create_small_file "$file"
+	done
+
+	# Set a few hsm parameters
+	stack_trap \
+		"set_hsm_param loop_period $(get_hsm_param loop_period)" EXIT
+	set_hsm_param loop_period 1
+	stack_trap \
+		"set_hsm_param max_requests $(get_hsm_param max_requests)" EXIT
+	set_hsm_param max_requests 3
+
+	# Release one file
+	copytool setup
+	"$LFS" hsm_archive "${files[0]}"
+	wait_request_state "$(path2fid "${files[0]}")" ARCHIVE SUCCEED
+	"$LFS" hsm_release "${files[0]}"
+
+	# Stop the copytool
+	kill_copytools
+	wait_copytools || error "copytools failed to stop"
+
+	# Send several archive requests
+	for file in "${files[@]:1}"; do
+		"$LFS" hsm_archive "$file"
+	done
+
+	# Send one restore request
+	"$LFS" hsm_restore "${files[0]}"
+
+	# Launch a copytool
+	copytool setup
+
+	# Wait for all the requests to complete
+	wait_request_state "$(path2fid "${files[0]}")" RESTORE SUCCEED
+	for file in "${files[@]:1}"; do
+		wait_request_state "$(path2fid "$file")" ARCHIVE SUCCEED
+	done
+
+	# Collect the actions in the order in which the copytool processed them
+	local -a actions=(
+		$(do_facet "$SINGLEAGT" grep -o '\"RESTORE\\|ARCHIVE\"' \
+			"$(copytool_logfile "$SINGLEAGT")")
+		)
+
+	printf '%s\n' "${actions[@]}"
+
+	local action
+	for action in "${actions[@]:0:3}"; do
+		[ "$action" == RESTORE ] && return
+	done
+
+	error "Too many ARCHIVE requests were run before the RESTORE request"
+}
+run_test 260a "Restore requests have priority over other requests"
+
+# This test is closely tied to the implementation of the current prioritisation
+# mechanism in the coordinator. It might not make sense to keep it in the future.
+test_260b()
+{
+	local -a files=("$DIR/$tdir/$tfile".{0..15})
+	local file
+
+	for file in "${files[@]}"; do
+		create_small_file "$file"
+	done
+
+	# Set a few hsm parameters
+	stack_trap \
+		"set_hsm_param loop_period $(get_hsm_param loop_period)" EXIT
+	set_hsm_param loop_period 1
+	stack_trap \
+		"set_hsm_param max_requests $(get_hsm_param max_requests)" EXIT
+	set_hsm_param max_requests 3
+
+	# Release one file
+	copytool setup --archive-id 2
+	"$LFS" hsm_archive --archive 2 "${files[0]}"
+	wait_request_state "$(path2fid "${files[0]}")" ARCHIVE SUCCEED
+	"$LFS" hsm_release "${files[0]}"
+
+	# Stop the copytool
+	kill_copytools
+	wait_copytools || error "copytools failed to stop"
+
+	# Send several archive requests
+	for file in "${files[@]:1}"; do
+		"$LFS" hsm_archive "$file"
+	done
+
+	# Send one restore request
+	"$LFS" hsm_restore "${files[0]}"
+
+	# Launch copytools
+	copytool setup
+	copytool setup --archive-id 2
+
+	# Wait for all the requests to complete
+	wait_request_state "$(path2fid "${files[0]}")" RESTORE SUCCEED
+	for file in "${files[@]:1}"; do
+		wait_request_state "$(path2fid "$file")" ARCHIVE SUCCEED
+	done
+
+	# Collect the actions in the order in which the copytool processed them
+	local -a actions=(
+		$(do_facet "$SINGLEAGT" grep -o '\"RESTORE\\|ARCHIVE\"' \
+			"$(copytool_logfile "$SINGLEAGT")")
+		)
+
+	printf '%s\n' "${actions[@]}"
+
+	local action
+	for action in "${actions[@]:0:3}"; do
+		[ "$action" == RESTORE ] && return
+	done
+
+	error "Too many ARCHIVE requests were run before the RESTORE request"
+}
+run_test 260b "Restore requests have priority over other requests"
+
+# This test is closely tied to the implementation of the current prioritisation
+# mechanism in the coordinator. It might not make sense to keep it in the future.
+test_260c()
+{
+	local -a files=("$DIR/$tdir/$tfile".{0..15})
+	local file
+
+	for file in "${files[@]}"; do
+		create_small_file "$file"
+	done
+
+	# Set a few hsm parameters
+	stack_trap \
+		"set_hsm_param loop_period $(get_hsm_param loop_period)" EXIT
+	set_hsm_param loop_period 10
+	stack_trap \
+		"set_hsm_param max_requests $(get_hsm_param max_requests)" EXIT
+	set_hsm_param max_requests 3
+
+	# Release one file
+	copytool setup --archive-id 2
+	"$LFS" hsm_archive --archive 2 "${files[0]}"
+	wait_request_state "$(path2fid "${files[0]}")" ARCHIVE SUCCEED
+	"$LFS" hsm_release "${files[0]}"
+
+	# Stop the copytool
+	kill_copytools
+	wait_copytools || error "copytools failed to stop"
+
+	"$LFS" hsm_archive "${files[1]}"
+
+	# Launch copytools
+	copytool setup
+	copytool setup --archive-id 2
+
+	wait_request_state "$(path2fid "${files[1]}")" ARCHIVE SUCCEED
+
+	# Send several archive requests
+	for file in "${files[@]:2}"; do
+		"$LFS" hsm_archive "$file"
+	done
+
+	# Send one restore request
+	"$LFS" hsm_restore "${files[0]}"
+
+	# Wait for all the requests to complete
+	wait_request_state "$(path2fid "${files[0]}")" RESTORE SUCCEED
+	for file in "${files[@]:2}"; do
+		wait_request_state "$(path2fid "$file")" ARCHIVE SUCCEED
+	done
+
+	# Collect the actions in the order in which the copytool processed them
+	local -a actions=(
+		$(do_facet "$SINGLEAGT" grep -o '\"RESTORE\\|ARCHIVE\"' \
+			"$(copytool_logfile "$SINGLEAGT")")
+		)
+
+	printf '%s\n' "${actions[@]}"
+
+	local action
+	for action in "${actions[@]:0:3}"; do
+		[ "$action" == RESTORE ] &&
+			error "Restore requests should not be prioritised" \
+				"unless the coordinator is doing housekeeping"
+	done
+	return 0
+}
+run_test 260c "Restore requests are not prioritised without housekeeping"
+
 test_300()
 {
 	[ "$CLIENTONLY" ] && skip "CLIENTONLY mode" && return
-- 
1.8.3.1
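
For readers who want a feel for the scheduling rule without a Lustre setup, below is a minimal user-space sketch. It is not part of the patch: the names (toy_scan, TOY_RESTORE, ...) are hypothetical and do not exist in the Lustre tree, and it models only the "let one RESTORE through per scan" idea tracked by hsd_one_restore. The real coordinator additionally makes room by discarding a queued hai or a whole hal; the sketch simply over-commits the slot limit by one instead.

#include <stdbool.h>
#include <stdio.h>

enum toy_action { TOY_ARCHIVE, TOY_RESTORE };

/*
 * One scan over the pending queue: schedule actions until max_slots are
 * used, but still let a single RESTORE through once the slots are full.
 */
static void toy_scan(const enum toy_action *queue, int nr, int max_slots)
{
	bool one_restore = false;	/* mirrors hsd_one_restore */
	int used = 0;
	int i;

	for (i = 0; i < nr; i++) {
		bool full = used >= max_slots;

		/* Slots full and this is not the one RESTORE we reserve. */
		if (full && (queue[i] != TOY_RESTORE || one_restore))
			continue;

		if (queue[i] == TOY_RESTORE)
			one_restore = true;

		printf("scheduled action %2d: %s%s\n", i,
		       queue[i] == TOY_RESTORE ? "RESTORE" : "ARCHIVE",
		       full ? " (force-scheduled)" : "");
		used++;
	}
}

int main(void)
{
	/* Fifteen ARCHIVEs queued ahead of one RESTORE, three slots. */
	enum toy_action queue[16];
	int i;

	for (i = 0; i < 15; i++)
		queue[i] = TOY_ARCHIVE;
	queue[15] = TOY_RESTORE;

	toy_scan(queue, 16, 3);
	return 0;
}

Built with any C compiler, the sketch schedules the first three ARCHIVEs, skips the remaining ones for this scan, yet still force-schedules the trailing RESTORE in the same pass, which is the ordering that tests 260a and 260b look for in the copytool log.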