From 9f1ef86ac3518dca6e567364e9a3b47fef3fada5 Mon Sep 17 00:00:00 2001 From: Kirill Malkin Date: Sat, 16 May 2020 20:17:43 -0700 Subject: [PATCH] LU-13651 hsm: call hsm_find_compatible_cb() only for cancel The HSM action queue is scanned linearly in hsm_find_compatible_cb() for existing requests on the same file so that duplicate or conflicting requests are not added and cancel requests are assigned the correct cookie, but this can cause a large delay in adding new requests when the action queue is very large, as access to it is locked for the duration of the search. Scanning the queue does not guarantee that duplicate or conflicting requests are not added as scanning (in hsm_find_compatible_cb()) and adding requests (in mdt_agent_record_add()) are distinct operations that are not serialized by a lock and so a race window exists between these two function calls within which duplicate or conflicting requests can be added. This is hopefully not a big problem though, as the CDT thread will not send duplicate archive requests to a copytool serving a different HSM backend (and it could probably be prevented from sending duplicate archive requests to a copytool serving the same backend with a small change in mdt_hsm_is_action_compat()) and duplicate restore requests are serialized by taking the layout lock on the file before being added to the action queue, which effectively serializes them (although this blocks the caller, e.g. lfs, so it might not be ideal). Since calling hsm_compatible_cb() does not protect completely against this issue and can cause large delays in adding new requests, we skip calling it for all requests apart from cancel requests that don't specify a cookie (which should be all cancel requests in current code), hopefully safely. Test-Parameters: testlist=sanity-hsm Signed-off-by: Kirill Malkin Signed-off-by: Nikitas Angelinas Signed-off-by: Sergey Cheremencev Cray-bug-id: LUS-8717 Change-Id: Id82b2a0720e46a9c12c4d9df323ce5a7bd7aff37 Reviewed-on: https://review.whamcloud.com/38867 Reviewed-by: Ben Evans Tested-by: jenkins Tested-by: Maloo Reviewed-by: Nathan Rutman Reviewed-by: Oleg Drokin --- lustre/mdt/mdt_hsm_cdt_client.c | 28 +++++++++++++--------------- lustre/tests/sanity-hsm.sh | 6 +++++- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/lustre/mdt/mdt_hsm_cdt_client.c b/lustre/mdt/mdt_hsm_cdt_client.c index 705e98eb..5cc2c35 100644 --- a/lustre/mdt/mdt_hsm_cdt_client.c +++ b/lustre/mdt/mdt_hsm_cdt_client.c @@ -111,32 +111,30 @@ static int hsm_find_compatible(const struct lu_env *env, struct mdt_device *mdt, struct hsm_action_list *hal) { struct hsm_action_item *hai; - int rc, i, ok_cnt; + int rc = 0, i; + bool check = false; ENTRY; - ok_cnt = 0; hai = hai_first(hal); for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) { /* We only support ARCHIVE, RESTORE, REMOVE and CANCEL here. */ if (hai->hai_action == HSMA_NONE) RETURN(-EINVAL); - /* in a cancel request hai_cookie may be set by caller to - * show the request to be canceled - * if not we need to search by FID + /* In a cancel request hai_cookie may be set by caller to show + * the request to be canceled. If there is at least one cancel + * request that does not have a cookie set we need to search by + * FID; we can skip checking in all other cases */ - if (hai->hai_action == HSMA_CANCEL && hai->hai_cookie != 0) - ok_cnt++; - else - hai->hai_cookie = 0; + if (hai->hai_action == HSMA_CANCEL && hai->hai_cookie == 0) { + check = true; + break; + } } - /* if all requests are cancel with cookie, no need to find compatible */ - if (ok_cnt == hal->hal_count) - RETURN(0); - - rc = cdt_llog_process(env, mdt, hsm_find_compatible_cb, hal, 0, 0, - READ); + if (check) + rc = cdt_llog_process(env, mdt, hsm_find_compatible_cb, hal, 0, + 0, READ); RETURN(rc); } diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 740e6f6..f6e0ed4 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -5006,6 +5006,7 @@ test_407() { md5sum $f2 & sleep 2 + do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions" # after umount hsm_actions->O/x/x log shouldn't have # double RESTORE records like below #[0x200000401:0x1:0x0]...0x58d03a0d/0x58d03a0c action=RESTORE...WAITING @@ -5013,9 +5014,12 @@ test_407() { sleep 30 && do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions"& fail $SINGLEMDS + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + + do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions" copytool_continue - wait_request_state $fid RESTORE SUCCEED + wait_all_done 100 $fid } run_test 407 "Check for double RESTORE records in llog" -- 1.8.3.1