Whamcloud - gitweb
LU-16235 hsm: get a valid cookie for RAoLU request 50/51850/11
authorEtienne AUJAMES <etienne.aujames@cea.fr>
Wed, 2 Aug 2023 09:27:41 +0000 (11:27 +0200)
committerOleg Drokin <green@whamcloud.com>
Mon, 2 Dec 2024 05:41:32 +0000 (05:41 +0000)
Add a way to get a valid cookie when nobody initializes
cdt_last_cookie.

The RAoLU policy is allowed to queue a remove request while the
coordinator is stopped. In that case cdt_last_cookie may not be
initialized yet, and the remove request can be queued with a
conflicting cookie.

This patch adds cdt_update_last_cookie(), which processes the HSM
llog in reverse and stops at the first non-cancel action to determine
the last cookie.

Add the regression test sanity-hsm 26e.

Test-Parameters: testlist=sanity-hsm
Test-Parameters: testlist=sanity-hsm
Test-Parameters: testlist=sanity-hsm
Test-Parameters: testlist=sanity-hsm env=ONLY=26e,ONLY_REPEAT=30
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Signed-off-by: Nikitas Angelinas <nikitas.angelinas@hpe.com>
Change-Id: I6468a24b95fcb8768e12f40edfcea3ce8407281f
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51850
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/mdt/mdt_coordinator.c
lustre/mdt/mdt_hsm_cdt_actions.c
lustre/tests/sanity-hsm.sh

index 5794e95..2dce801 100644 (file)
@@ -1000,9 +1000,9 @@ static int hsm_restore_cb(const struct lu_env *env,
 
        larr = (struct llog_agent_req_rec *)hdr;
        hai = &larr->arr_hai;
-       if (hai->hai_cookie >= cdt->cdt_last_cookie) {
+       if (hai->hai_cookie > cdt->cdt_last_cookie) {
                /* update the cookie to avoid collision */
-               cdt->cdt_last_cookie = hai->hai_cookie + 1;
+               cdt->cdt_last_cookie = hai->hai_cookie;
        }
 
        if (hai->hai_action != HSMA_RESTORE ||
@@ -1039,8 +1039,9 @@ out:
  */
 static int mdt_hsm_pending_restore(struct mdt_thread_info *mti)
 {
+       struct coordinator *cdt = &mti->mti_mdt->mdt_coordinator;
        struct hsm_restore_data  hrd;
-       int                      rc;
+       int rc;
        ENTRY;
 
        hrd.hrd_mti = mti;
@@ -1048,7 +1049,14 @@ static int mdt_hsm_pending_restore(struct mdt_thread_info *mti)
        rc = cdt_llog_process(mti->mti_env, mti->mti_mdt, hsm_restore_cb, &hrd,
                              0, 0, WRITE);
 
-       RETURN(rc);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* no pending request found -> start a new session */
+       if (!cdt->cdt_last_cookie)
+               cdt->cdt_last_cookie = ktime_get_real_seconds();
+
+       RETURN(0);
 }
 
 int hsm_init_ucred(struct lu_ucred *uc)
@@ -1228,9 +1236,6 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt)
        BUILD_BUG_ON(BIT(CDT_POLICY_SHIFT_COUNT - 1) != CDT_POLICY_LAST);
        cdt->cdt_policy = CDT_DEFAULT_POLICY;
 
-       /* just need to be larger than previous one */
-       /* cdt_last_cookie is protected by cdt_llog_lock */
-       cdt->cdt_last_cookie = ktime_get_real_seconds();
        atomic_set(&cdt->cdt_request_count, 0);
        atomic_set(&cdt->cdt_archive_count, 0);
        atomic_set(&cdt->cdt_restore_count, 0);
index 44f6cb7..d1f3758 100644 (file)
@@ -254,6 +254,70 @@ int cdt_llog_process(const struct lu_env *env, struct mdt_device *mdt,
 }
 
 /**
+ *  llog_cat_process() callback, used to find last used cookie.
+ *  The processing ends at the first non-cancel record.
+ * \param env [IN] environment
+ * \param llh [IN] llog handle
+ * \param hdr [IN] llog record
+ * \param data [IN/OUT] cb data = coordinator
+ * \retval 0 success
+ * \retval -ve failure
+ */
+static int hsm_last_cookie_cb(const struct lu_env *env, struct llog_handle *llh,
+                             struct llog_rec_hdr *hdr, void *data)
+{
+       struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr;
+       struct hsm_action_item *hai = &larr->arr_hai;
+       struct coordinator *cdt = data;
+
+       /* do not stop on cancel, it takes cookie from other request */
+       if (hai->hai_action == HSMA_CANCEL)
+               RETURN(0);
+
+       if (hai->hai_cookie > cdt->cdt_last_cookie)
+               cdt->cdt_last_cookie = hai->hai_cookie;
+
+       RETURN(LLOG_PROC_BREAK);
+}
+
+/**
+ * Update the last cookie used by a request.
+ *
+ * Processes the agent llog catalog in reverse with hsm_last_cookie_cb()
+ * so that cdt_last_cookie is raised to the cookie of the most recent
+ * non-cancel record. If cdt_last_cookie is still zero afterwards, it is
+ * seeded with the current real time in seconds.
+ *
+ * The caller must hold cdt->cdt_llog_lock.
+ *
+ * \param env [IN] environment
+ * \param cdt [IN/OUT] coordinator whose cdt_last_cookie is updated
+ * \retval 0 success
+ * \retval -ENOENT agent llog context or its catalog handle is missing
+ * \retval -ve llog processing failure
+ */
+static int cdt_update_last_cookie(const struct lu_env *env,
+                                 struct coordinator *cdt)
+__must_hold(&cdt->cdt_llog_lock)
+{
+       struct mdt_device *mdt;
+       struct obd_device *obd;
+       struct llog_ctxt *lctxt;
+       int rc;
+
+       mdt = container_of(cdt, typeof(*mdt), mdt_coordinator);
+       obd = mdt2obd_dev(mdt);
+       lctxt = llog_get_context(obd, LLOG_AGENT_ORIG_CTXT);
+       if (!lctxt)
+               RETURN(-ENOENT);
+
+       if (!lctxt->loc_handle) {
+               /* drop the reference taken by llog_get_context() */
+               llog_ctxt_put(lctxt);
+               RETURN(-ENOENT);
+       }
+
+       rc = llog_cat_reverse_process(env, lctxt->loc_handle,
+                                     hsm_last_cookie_cb, cdt);
+
+       llog_ctxt_put(lctxt);
+
+       if (rc < 0) {
+               CERROR("%s: failed to process HSM_ACTIONS llog: rc = %d\n",
+                      mdt_obd_name(mdt), rc);
+               RETURN(rc);
+       }
+
+       /* no pending request found -> start a new session */
+       if (!cdt->cdt_last_cookie)
+               cdt->cdt_last_cookie = ktime_get_real_seconds();
+
+       RETURN(0);
+}
+
+/**
  * add an entry in agent llog
  * \param env [IN] environment
  * \param mdt [IN] PDT device
@@ -293,18 +357,28 @@ int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt,
 
        down_write(&cdt->cdt_llog_lock);
 
+       /* If cdt_last_cookie is not set, try to initialize it.
+        * This is used by RAoLU with non-started coordinator.
+        */
+       if (unlikely(!cdt->cdt_last_cookie)) {
+               rc = cdt_update_last_cookie(env, cdt);
+               if (rc < 0)
+                       GOTO(unlock, rc);
+       }
+
        /* in case of cancel request, the cookie is already set to the
         * value of the request cookie to be cancelled
         * so we do not change it */
        if (hai->hai_action == HSMA_CANCEL)
                larr->arr_hai.hai_cookie = hai->hai_cookie;
        else
-               larr->arr_hai.hai_cookie = cdt->cdt_last_cookie++;
+               larr->arr_hai.hai_cookie = ++cdt->cdt_last_cookie;
 
        rc = llog_cat_add(env, lctxt->loc_handle, &larr->arr_hdr, NULL);
        if (rc > 0)
                rc = 0;
 
+unlock:
        up_write(&cdt->cdt_llog_lock);
        llog_ctxt_put(lctxt);
 
index 7ca0986..f4bf3d2 100755 (executable)
@@ -394,6 +394,14 @@ get_request_count() {
                "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'"
 }
 
+# Print a cookie-bearing field of the HSM action entries matching a FID and
+# request type, read from the $HSM_PARAM.actions parameter on $SINGLEMDS.
+# The printed value is the third '/'-separated component of the 6th
+# whitespace-separated field of each matching line (presumably the cookie of
+# a "compound/cookie=<compound>/<cookie>" field - confirm against the
+# actions output format).
+# get_request_cookie FID REQUEST_TYPE
+get_request_cookie() {
+       local fid=$1
+       local request=$2
+
+       do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.actions |"\
+               "awk '/'$fid'.*action='$request'/ {print \\\$6}' | cut -f3 -d/"
+}
+
 # Ensure the number of HSM request for a given FID is correct
 # assert_request_count FID REQUEST_TYPE COUNT [ERROR_MSG]
 assert_request_count() {
@@ -2372,6 +2380,53 @@ test_26d() {
 }
 run_test 26d "RAoLU when Client eviction"
 
+# Regression test: a REMOVE request queued by remove_archive_on_last_unlink
+# while the coordinator is stopped must get the cookie that directly follows
+# the cookie of the last request already present in the HSM actions log.
+test_26e() {
+       # test needs a running copytool
+       copytool setup
+       mkdir_on_mdt0 $DIR/$tdir
+
+       local f=$DIR/$tdir/$tfile
+       local fid=$(create_small_file $f)
+       local f2=$DIR/$tdir/$tfile-2
+       local fid2=$(create_small_file $f2)
+
+       # first file is fully archived so that its unlink can trigger a REMOVE
+       $LFS hsm_archive $f || error "could not archive file"
+       wait_request_state $fid ARCHIVE SUCCEED
+
+       # with the copytool stopped, the second archive request stays WAITING
+       kill_copytools
+       wait_copytools || error "copytool failed to stop"
+
+       $LFS hsm_archive $f2 || error "could not archive file"
+       wait_request_state $fid2 ARCHIVE WAITING
+
+       # cookie of the pending ARCHIVE request, evaluated as an integer
+       local last_cookie=$(( $(get_request_cookie $fid2 ARCHIVE) ))
+
+       # restart mds1 with the coordinator set to shutdown so it stays stopped
+       stack_trap "cdt_set_mount_state enabled"
+       cdt_set_mount_state shutdown
+
+       fail mds1
+       cdt_check_state stopped
+
+       stack_trap "set_hsm_param remove_archive_on_last_unlink 0"
+       set_hsm_param remove_archive_on_last_unlink 1
+
+       # unlinking the archived file queues a REMOVE request (RAoLU)
+       rm -f $f
+
+       wait_request_state $fid REMOVE WAITING
+
+       # the new request must use the cookie right after the last one
+       local new_cookie=$(( $(get_request_cookie $fid REMOVE) ))
+       echo "Check cookie from RAoLU request (last: $last_cookie, remove: $new_cookie)"
+       (( new_cookie == last_cookie + 1 )) ||
+               error "RAoLU fail to setup a valid cookie ($new_cookie != $last_cookie + 1)"
+
+       # once coordinator and copytool are back, both requests must complete
+       cdt_enable
+       copytool setup
+
+       wait_request_state $fid2 ARCHIVE SUCCEED
+       wait_request_state $fid REMOVE SUCCEED
+}
+run_test 26e "RAoLU with a non-started coordinator"
+
 test_27a() {
        # test needs a running copytool
        copytool setup