Whamcloud - gitweb
LU-16188 mdt: fix incompatible HSM request handling 58/48658/6
author: Aurelien Degremont <degremoa@amazon.com>
Mon, 26 Sep 2022 12:27:37 +0000 (12:27 +0000)
committer: Oleg Drokin <green@whamcloud.com>
Wed, 8 Feb 2023 06:27:07 +0000 (06:27 +0000)
When the coordinator tries to send multiple HSM actions in
a single request and one of the requests fails the compatibility
checks, all the requests are marked as STARTED but none of them
is sent to the agent.

Return -EAGAIN from mdt_hsm_agent_send() so that the coordinator does
not mark the requests as STARTED and retries them later.

Add a sanity-hsm test.

Test-Parameters: trivial testlist=sanity-hsm
Change-Id: Id4fb858021be6dc6b0cbcf140c3f2051efce57ad
Signed-off-by: Jeya Ganesh Babu Jegatheesan <jeyaga@amazon.com>
Signed-off-by: Aurelien Degremont <degremoa@amazon.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48658
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-by: Nikitas Angelinas <nikitas.angelinas@hpe.com>
Reviewed-by: Etienne AUJAMES <eaujames@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/mdt/mdt_hsm_cdt_agent.c
lustre/tests/sanity-hsm.sh

index 3e51788..e72e3da 100644 (file)
@@ -534,7 +534,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
         * So no need the rebuild a full valid HAL now
         */
        if (fail_request)
-               GOTO(out_buf, rc = 0);
+               GOTO(out_buf, rc = -EAGAIN);
 
        /* Cancel memory registration is useless for purge
         * non registration avoid a deadlock :
index b8cc1af..974cafa 100755 (executable)
@@ -3909,6 +3909,33 @@ test_113() {
 }
 run_test 113 "wrong stat after restore"
 
+test_114() {
+       mkdir_on_mdt0 $DIR/$tdir
+
+       local f1=$DIR/$tdir/${tfile}1
+       local f2=$DIR/$tdir/${tfile}2
+       local fid1=$(create_empty_file "$f1")
+       local fid2=$(create_empty_file "$f2")
+
+       copytool setup
+
+       # Prevent archive from completing
+       cdt_disable
+
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f1 $f2
+       # wait archive to register at CDT
+       wait_request_state "$fid1" ARCHIVE WAITING
+
+       # Set f2 in an incompatible state
+       $LFS hsm_set --noarchive $f2
+
+       cdt_enable
+
+       wait_request_state "$fid1" ARCHIVE SUCCEED
+       wait_request_state "$fid2" ARCHIVE FAILED
+}
+run_test 114 "Incompatible request does not set other requests as STARTED"
+
 test_200() {
        mkdir_on_mdt0 $DIR/$tdir