Whamcloud - gitweb
LU-15145 hsm: unlock the restore layout lock for a cancel 41/45341/5
authorEtienne AUJAMES <etienne.aujames@cea.fr>
Fri, 22 Oct 2021 18:18:29 +0000 (20:18 +0200)
committerOleg Drokin <green@whamcloud.com>
Tue, 18 Jan 2022 09:08:54 +0000 (09:08 +0000)
The HSM restore EX layout lock is not unlock by a HSM cancel action
or by "hsm_control=purge" parameter.

This patch call cdt_restore_handle_del() in mdt_cancel_all_cb() and
mdt_agent_record_update_cb() for restore action (when updating action
status to ARS_CANCELED).
The test "sanity-hsm test_103a" checks the "purge actions" with
blocking restore.

Test-Parameters: testlist=sanity-hsm,sanity-hsm
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: Id891e06aacd2a2c5950048a2d2a5d1398eedfdd7
Reviewed-on: https://review.whamcloud.com/45341
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/mdt/mdt_coordinator.c
lustre/mdt/mdt_hsm_cdt_actions.c
lustre/mdt/mdt_hsm_cdt_agent.c
lustre/mdt/mdt_internal.h
lustre/tests/sanity-hsm.sh

index a1fc6b0..87296cb 100644 (file)
@@ -741,8 +741,7 @@ static int mdt_coordinator(void *data)
                }
 
                if (update_idx) {
                }
 
                if (update_idx) {
-                       rc = mdt_agent_record_update(mti->mti_env, mdt,
-                                                    updates, update_idx);
+                       rc = mdt_agent_record_update(mti, updates, update_idx);
                        if (rc)
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "rc=%d, cannot update records "
                        if (rc)
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "rc=%d, cannot update records "
@@ -1257,8 +1256,7 @@ int mdt_hsm_add_hal(struct mdt_thread_info *mti,
                                .status = ARS_CANCELED,
                        };
 
                                .status = ARS_CANCELED,
                        };
 
-                       rc = mdt_agent_record_update(mti->mti_env, mti->mti_mdt,
-                                                    &update, 1);
+                       rc = mdt_agent_record_update(mti, &update, 1);
                        if (rc) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "rc=%d, cannot update status to %s "
                        if (rc) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "rc=%d, cannot update status to %s "
@@ -1682,8 +1680,7 @@ int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
                update.cookie = pgs->hpk_cookie;
                update.status = status;
 
                update.cookie = pgs->hpk_cookie;
                update.status = status;
 
-               rc1 = mdt_agent_record_update(mti->mti_env, mdt,
-                                             &update, 1);
+               rc1 = mdt_agent_record_update(mti, &update, 1);
                if (rc1)
                        CERROR("%s: mdt_agent_record_update() failed,"
                               " rc=%d, cannot update status to %s"
                if (rc1)
                        CERROR("%s: mdt_agent_record_update() failed,"
                               " rc=%d, cannot update status to %s"
@@ -1717,20 +1714,12 @@ out:
 
 
 /**
 
 
 /**
- * data passed to llog_cat_process() callback
- * to cancel requests
- */
-struct hsm_cancel_all_data {
-       struct mdt_device       *mdt;
-};
-
-/**
  *  llog_cat_process() callback, used to:
  *  - purge all requests
  * \param env [IN] environment
  * \param llh [IN] llog handle
  * \param hdr [IN] llog record
  *  llog_cat_process() callback, used to:
  *  - purge all requests
  * \param env [IN] environment
  * \param llh [IN] llog handle
  * \param hdr [IN] llog record
- * \param data [IN] cb data = struct hsm_cancel_all_data
+ * \param data [IN] cb data = struct mdt_thread_info
  * \retval 0 success
  * \retval -ve failure
  */
  * \retval 0 success
  * \retval -ve failure
  */
@@ -1738,18 +1727,28 @@ static int mdt_cancel_all_cb(const struct lu_env *env,
                             struct llog_handle *llh,
                             struct llog_rec_hdr *hdr, void *data)
 {
                             struct llog_handle *llh,
                             struct llog_rec_hdr *hdr, void *data)
 {
-       struct llog_agent_req_rec       *larr;
-       struct hsm_cancel_all_data      *hcad;
-       int                              rc = 0;
+       struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr;
+       struct hsm_action_item *hai = &larr->arr_hai;
+       struct mdt_thread_info  *mti = data;
+       struct coordinator *cdt = &mti->mti_mdt->mdt_coordinator;
+       int rc;
        ENTRY;
 
        ENTRY;
 
-       larr = (struct llog_agent_req_rec *)hdr;
-       hcad = data;
-       if (larr->arr_status == ARS_WAITING ||
-           larr->arr_status == ARS_STARTED) {
-               larr->arr_status = ARS_CANCELED;
-               larr->arr_req_change = ktime_get_real_seconds();
-               rc = llog_write(env, llh, hdr, hdr->lrh_index);
+       if (larr->arr_status != ARS_WAITING &&
+           larr->arr_status != ARS_STARTED)
+               RETURN(0);
+
+       /* Unlock the EX layout lock */
+       if (hai->hai_action == HSMA_RESTORE)
+               cdt_restore_handle_del(mti, cdt, &hai->hai_fid);
+
+       larr->arr_status = ARS_CANCELED;
+       larr->arr_req_change = ktime_get_real_seconds();
+       rc = llog_write(env, llh, hdr, hdr->lrh_index);
+       if (rc < 0) {
+               CERROR("%s: cannot update agent log: rc = %d\n",
+                      mdt_obd_name(mti->mti_mdt), rc);
+               rc = LLOG_DEL_RECORD;
        }
 
        RETURN(rc);
        }
 
        RETURN(rc);
@@ -1768,7 +1767,6 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
        struct cdt_agent_req            *car;
        struct hsm_action_list          *hal = NULL;
        struct hsm_action_item          *hai;
        struct cdt_agent_req            *car;
        struct hsm_action_list          *hal = NULL;
        struct hsm_action_item          *hai;
-       struct hsm_cancel_all_data       hcad;
        int                              hal_sz = 0, hal_len, rc;
        enum cdt_states                  old_state;
        ENTRY;
        int                              hal_sz = 0, hal_len, rc;
        enum cdt_states                  old_state;
        ENTRY;
@@ -1866,10 +1864,8 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
                OBD_FREE(hal, hal_sz);
 
        /* cancel all on-disk records */
                OBD_FREE(hal, hal_sz);
 
        /* cancel all on-disk records */
-       hcad.mdt = mdt;
-
        rc = cdt_llog_process(mti->mti_env, mti->mti_mdt, mdt_cancel_all_cb,
        rc = cdt_llog_process(mti->mti_env, mti->mti_mdt, mdt_cancel_all_cb,
-                             &hcad, 0, 0, WRITE);
+                             (void *)mti, 0, 0, WRITE);
 out_cdt_state:
        /* Enable coordinator, unless the coordinator was stopping. */
        set_cdt_state_locked(cdt, old_state);
 out_cdt_state:
        /* Enable coordinator, unless the coordinator was stopping. */
        set_cdt_state_locked(cdt, old_state);
index 7ae3762..57d1f24 100644 (file)
@@ -312,11 +312,11 @@ free:
  * to find requests
  */
 struct data_update_cb {
  * to find requests
  */
 struct data_update_cb {
-       struct mdt_device       *mdt;
+       struct mdt_thread_info *mti;
        struct hsm_record_update *updates;
        struct hsm_record_update *updates;
-       unsigned int             updates_count;
-       unsigned int             updates_done;
-       time64_t                 change_time;
+       unsigned int updates_count;
+       unsigned int updates_done;
+       time64_t change_time;
 };
 
 /**
 };
 
 /**
@@ -333,14 +333,15 @@ static int mdt_agent_record_update_cb(const struct lu_env *env,
                                      struct llog_rec_hdr *hdr,
                                      void *data)
 {
                                      struct llog_rec_hdr *hdr,
                                      void *data)
 {
-       struct llog_agent_req_rec       *larr;
-       struct data_update_cb           *ducb;
-       int                              rc, i;
+       struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr;
+       struct hsm_action_item *hai = &larr->arr_hai;
+       struct data_update_cb *ducb = data;
+       struct mdt_thread_info *mti = ducb->mti;
+       struct mdt_device *mdt = ducb->mti->mti_mdt;
+       struct coordinator *cdt = &mdt->mdt_coordinator;
+       int rc, i;
        ENTRY;
 
        ENTRY;
 
-       larr = (struct llog_agent_req_rec *)hdr;
-       ducb = data;
-
        /* check if all done */
        if (ducb->updates_count == ducb->updates_done)
                RETURN(LLOG_PROC_BREAK);
        /* check if all done */
        if (ducb->updates_count == ducb->updates_done)
                RETURN(LLOG_PROC_BREAK);
@@ -354,9 +355,9 @@ static int mdt_agent_record_update_cb(const struct lu_env *env,
                struct hsm_record_update *update = &ducb->updates[i];
 
                CDEBUG(D_HSM, "%s: search %#llx, found %#llx\n",
                struct hsm_record_update *update = &ducb->updates[i];
 
                CDEBUG(D_HSM, "%s: search %#llx, found %#llx\n",
-                      mdt_obd_name(ducb->mdt), update->cookie,
-                      larr->arr_hai.hai_cookie);
-               if (larr->arr_hai.hai_cookie == update->cookie) {
+                      mdt_obd_name(mdt), update->cookie,
+                      hai->hai_cookie);
+               if (hai->hai_cookie == update->cookie) {
 
                        /* If record is a cancel request, it cannot be
                         * canceled. This is to manage the following
 
                        /* If record is a cancel request, it cannot be
                         * canceled. This is to manage the following
@@ -366,21 +367,30 @@ static int mdt_agent_record_update_cb(const struct lu_env *env,
                         * has to be set to ARS_CANCELED and the 2nd
                         * to ARS_SUCCEED
                         */
                         * has to be set to ARS_CANCELED and the 2nd
                         * to ARS_SUCCEED
                         */
-                       if (larr->arr_hai.hai_action == HSMA_CANCEL &&
+                       if (hai->hai_action == HSMA_CANCEL &&
                            update->status == ARS_CANCELED)
                                RETURN(0);
 
                        larr->arr_status = update->status;
                        larr->arr_req_change = ducb->change_time;
                        rc = llog_write(env, llh, hdr, hdr->lrh_index);
                            update->status == ARS_CANCELED)
                                RETURN(0);
 
                        larr->arr_status = update->status;
                        larr->arr_req_change = ducb->change_time;
                        rc = llog_write(env, llh, hdr, hdr->lrh_index);
+                       if (rc < 0)
+                               break;
+
                        ducb->updates_done++;
                        ducb->updates_done++;
+
+                       /* Unlock the EX layout lock */
+                       if (hai->hai_action == HSMA_RESTORE &&
+                           update->status == ARS_CANCELED)
+                               cdt_restore_handle_del(mti, cdt, &hai->hai_fid);
+
                        break;
                }
        }
 
        if (rc < 0)
                CERROR("%s: mdt_agent_llog_update_rec() failed, rc = %d\n",
                        break;
                }
        }
 
        if (rc < 0)
                CERROR("%s: mdt_agent_llog_update_rec() failed, rc = %d\n",
-                      mdt_obd_name(ducb->mdt), rc);
+                      mdt_obd_name(mdt), rc);
 
        RETURN(rc);
 }
 
        RETURN(rc);
 }
@@ -396,10 +406,12 @@ static int mdt_agent_record_update_cb(const struct lu_env *env,
  * \retval 0 on success
  * \retval negative on failure
  */
  * \retval 0 on success
  * \retval negative on failure
  */
-int mdt_agent_record_update(const struct lu_env *env, struct mdt_device *mdt,
+int mdt_agent_record_update(struct mdt_thread_info *mti,
                            struct hsm_record_update *updates,
                            unsigned int updates_count)
 {
                            struct hsm_record_update *updates,
                            unsigned int updates_count)
 {
+       const struct lu_env *env = mti->mti_env;
+       struct mdt_device *mdt = mti->mti_mdt;
        struct data_update_cb    ducb;
        u32 start_cat_idx = -1;
        u32 start_rec_idx = -1;
        struct data_update_cb    ducb;
        u32 start_cat_idx = -1;
        u32 start_rec_idx = -1;
@@ -433,7 +445,7 @@ int mdt_agent_record_update(const struct lu_env *env, struct mdt_device *mdt,
        if (start_rec_idx != 0)
                start_rec_idx -= 1;
 
        if (start_rec_idx != 0)
                start_rec_idx -= 1;
 
-       ducb.mdt = mdt;
+       ducb.mti = mti;
        ducb.updates = updates;
        ducb.updates_count = updates_count;
        ducb.updates_done = 0;
        ducb.updates = updates;
        ducb.updates_count = updates_count;
        ducb.updates_done = 0;
index 1767e11..3e51788 100644 (file)
@@ -426,8 +426,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
 
                        update.cookie = hai->hai_cookie;
                        update.status = ARS_SUCCEED;
 
                        update.cookie = hai->hai_cookie;
                        update.status = ARS_SUCCEED;
-                       rc2 = mdt_agent_record_update(mti->mti_env, mdt,
-                                                     &update, 1);
+                       rc2 = mdt_agent_record_update(mti, &update, 1);
                        if (rc2) {
                                CERROR("%s: mdt_agent_record_update() "
                                      "failed, cannot update "
                        if (rc2) {
                                CERROR("%s: mdt_agent_record_update() "
                                      "failed, cannot update "
@@ -483,8 +482,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
                                continue;
 
                        fail_request = true;
                                continue;
 
                        fail_request = true;
-                       rc = mdt_agent_record_update(mti->mti_env, mdt,
-                                                    &update, 1);
+                       rc = mdt_agent_record_update(mti, &update, 1);
                        if (rc < 0) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "cannot update status to %s for cookie "
                        if (rc < 0) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "cannot update status to %s for cookie "
@@ -512,8 +510,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
                         * make the same HAL with valid only
                         * records */
                        fail_request = true;
                         * make the same HAL with valid only
                         * records */
                        fail_request = true;
-                       rc = mdt_agent_record_update(mti->mti_env, mdt,
-                                                    &update, 1);
+                       rc = mdt_agent_record_update(mti, &update, 1);
                        if (rc) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "cannot update status to %s for cookie "
                        if (rc) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "cannot update status to %s for cookie "
index 2726f86..f3cbbb0 100644 (file)
@@ -1038,7 +1038,7 @@ int cdt_llog_process(const struct lu_env *env, struct mdt_device *mdt,
 int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt,
                         __u32 archive_id, __u64 flags,
                         struct hsm_action_item *hai);
 int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt,
                         __u32 archive_id, __u64 flags,
                         struct hsm_action_item *hai);
-int mdt_agent_record_update(const struct lu_env *env, struct mdt_device *mdt,
+int mdt_agent_record_update(struct mdt_thread_info *mti,
                            struct hsm_record_update *updates,
                            unsigned int updates_count);
 void cdt_agent_record_hash_add(struct coordinator *cdt, u64 cookie, u32 cat_idt,
                            struct hsm_record_update *updates,
                            unsigned int updates_count);
 void cdt_agent_record_hash_add(struct coordinator *cdt, u64 cookie, u32 cat_idt,
index d739235..d30bb2c 100755 (executable)
@@ -3516,6 +3516,60 @@ test_103() {
 }
 run_test 103 "Purge all requests"
 
 }
 run_test 103 "Purge all requests"
 
+test_103a() {
+       cdt_clear_non_blocking_restore
+
+       # test needs a running copytool
+       copytool setup
+
+       local -a fids=()
+       local i
+
+       mkdir_on_mdt0 $DIR/$tdir
+       for i in {0..9}; do
+               fids+=( $(copy_file /etc/passwd $DIR/$tdir/${tfile}_$i) )
+       done
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tdir/*
+
+       local time=0
+       local cnt=0
+       local grep_regex="($(tr ' ' '|' <<< "${fids[*]}")).*action=ARCHIVE.*status=SUCCEED"
+       echo $grep_regex
+       while [[ $time -lt 5 ]] && [[ $cnt -ne ${#fids[@]} ]]; do
+               cnt=$(do_facet mds1 "$LCTL get_param -n $HSM_PARAM.actions |
+                       grep -c -E '$grep_regex'")
+               sleep 1
+               ((++time))
+       done
+       [[ $cnt -eq ${#fids[@]} ]] || error "Fail to archive files $cnt/${#fids[@]}"
+
+       $LFS hsm_release $DIR/$tdir/*
+
+       kill_copytools
+       wait_copytools || error "Copytool failed to stop"
+
+       local -a pids=()
+       for i in "${fids[@]}"; do
+               cat $DIR/.lustre/fid/$i > /dev/null & pids+=($!)
+       done
+
+       cdt_purge
+       grep_regex="($(tr ' ' '|' <<< "${fids[*]}")).*action=RESTORE.*status=CANCELED"
+       cnt=$(do_facet mds1 "$LCTL get_param -n $HSM_PARAM.actions |
+               grep -cE '$grep_regex'")
+
+       [[ "$cnt" -eq ${#fids[@]} ]] ||
+               error "Some request have not been canceled ($cnt/${#fids[@]} canceled)"
+
+       # cat cmds should not hang and should fail
+       for i in "${!pids[@]}"; do
+               wait ${pids[$i]} &&
+                       error "Restore for ${tfile}_$i (${pids[$i]}) should fail" ||
+                       true
+       done
+}
+run_test 103a "Purge pending restore requests"
+
 DATA=CEA
 DATAHEX='[434541]'
 test_104() {
 DATA=CEA
 DATAHEX='[434541]'
 test_104() {