From: Etienne AUJAMES Date: Fri, 22 Oct 2021 18:18:29 +0000 (+0200) Subject: LU-15145 hsm: unlock the restore layout lock for a cancel X-Git-Tag: 2.14.57~17 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=6d4019281b392bcb6993d1cfca3d47d7fa5f7c56 LU-15145 hsm: unlock the restore layout lock for a cancel The HSM restore EX layout lock is not unlock by a HSM cancel action or by "hsm_control=purge" parameter. This patch call cdt_restore_handle_del() in mdt_cancel_all_cb() and mdt_agent_record_update_cb() for restore action (when updating action status to ARS_CANCELED). The test "sanity-hsm test_103a" checks the "purge actions" with blocking restore. Test-Parameters: testlist=sanity-hsm,sanity-hsm Signed-off-by: Etienne AUJAMES Change-Id: Id891e06aacd2a2c5950048a2d2a5d1398eedfdd7 Reviewed-on: https://review.whamcloud.com/45341 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Yingjin Qian Reviewed-by: Oleg Drokin --- diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index a1fc6b0..87296cb 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -741,8 +741,7 @@ static int mdt_coordinator(void *data) } if (update_idx) { - rc = mdt_agent_record_update(mti->mti_env, mdt, - updates, update_idx); + rc = mdt_agent_record_update(mti, updates, update_idx); if (rc) CERROR("%s: mdt_agent_record_update() failed, " "rc=%d, cannot update records " @@ -1257,8 +1256,7 @@ int mdt_hsm_add_hal(struct mdt_thread_info *mti, .status = ARS_CANCELED, }; - rc = mdt_agent_record_update(mti->mti_env, mti->mti_mdt, - &update, 1); + rc = mdt_agent_record_update(mti, &update, 1); if (rc) { CERROR("%s: mdt_agent_record_update() failed, " "rc=%d, cannot update status to %s " @@ -1682,8 +1680,7 @@ int mdt_hsm_update_request_state(struct mdt_thread_info *mti, update.cookie = pgs->hpk_cookie; update.status = status; - rc1 = mdt_agent_record_update(mti->mti_env, mdt, - &update, 1); + rc1 = mdt_agent_record_update(mti, &update, 1); if (rc1) CERROR("%s: mdt_agent_record_update() failed," " rc=%d, cannot update status to %s" @@ -1717,20 +1714,12 @@ out: /** - * data passed to llog_cat_process() callback - * to cancel requests - */ -struct hsm_cancel_all_data { - struct mdt_device *mdt; -}; - -/** * llog_cat_process() callback, used to: * - purge all requests * \param env [IN] environment * \param llh [IN] llog handle * \param hdr [IN] llog record - * \param data [IN] cb data = struct hsm_cancel_all_data + * \param data [IN] cb data = struct mdt_thread_info * \retval 0 success * \retval -ve failure */ @@ -1738,18 +1727,28 @@ static int mdt_cancel_all_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *hdr, void *data) { - struct llog_agent_req_rec *larr; - struct hsm_cancel_all_data *hcad; - int rc = 0; + struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr; + struct hsm_action_item *hai = &larr->arr_hai; + struct mdt_thread_info *mti = data; + struct coordinator *cdt = &mti->mti_mdt->mdt_coordinator; + int rc; ENTRY; - larr = (struct llog_agent_req_rec *)hdr; - hcad = data; - if (larr->arr_status == ARS_WAITING || - larr->arr_status == ARS_STARTED) { - larr->arr_status = ARS_CANCELED; - larr->arr_req_change = ktime_get_real_seconds(); - rc = llog_write(env, llh, hdr, hdr->lrh_index); + if (larr->arr_status != ARS_WAITING && + larr->arr_status != ARS_STARTED) + RETURN(0); + + /* Unlock the EX layout lock */ + if (hai->hai_action == HSMA_RESTORE) + cdt_restore_handle_del(mti, cdt, &hai->hai_fid); + + larr->arr_status = ARS_CANCELED; + larr->arr_req_change = ktime_get_real_seconds(); + rc = llog_write(env, llh, hdr, hdr->lrh_index); + if (rc < 0) { + CERROR("%s: cannot update agent log: rc = %d\n", + mdt_obd_name(mti->mti_mdt), rc); + rc = LLOG_DEL_RECORD; } RETURN(rc); @@ -1768,7 +1767,6 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt) struct cdt_agent_req *car; struct hsm_action_list *hal = NULL; struct hsm_action_item *hai; - struct hsm_cancel_all_data hcad; int hal_sz = 0, hal_len, rc; enum cdt_states old_state; ENTRY; @@ -1866,10 +1864,8 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt) OBD_FREE(hal, hal_sz); /* cancel all on-disk records */ - hcad.mdt = mdt; - rc = cdt_llog_process(mti->mti_env, mti->mti_mdt, mdt_cancel_all_cb, - &hcad, 0, 0, WRITE); + (void *)mti, 0, 0, WRITE); out_cdt_state: /* Enable coordinator, unless the coordinator was stopping. */ set_cdt_state_locked(cdt, old_state); diff --git a/lustre/mdt/mdt_hsm_cdt_actions.c b/lustre/mdt/mdt_hsm_cdt_actions.c index 7ae3762..57d1f24 100644 --- a/lustre/mdt/mdt_hsm_cdt_actions.c +++ b/lustre/mdt/mdt_hsm_cdt_actions.c @@ -312,11 +312,11 @@ free: * to find requests */ struct data_update_cb { - struct mdt_device *mdt; + struct mdt_thread_info *mti; struct hsm_record_update *updates; - unsigned int updates_count; - unsigned int updates_done; - time64_t change_time; + unsigned int updates_count; + unsigned int updates_done; + time64_t change_time; }; /** @@ -333,14 +333,15 @@ static int mdt_agent_record_update_cb(const struct lu_env *env, struct llog_rec_hdr *hdr, void *data) { - struct llog_agent_req_rec *larr; - struct data_update_cb *ducb; - int rc, i; + struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr; + struct hsm_action_item *hai = &larr->arr_hai; + struct data_update_cb *ducb = data; + struct mdt_thread_info *mti = ducb->mti; + struct mdt_device *mdt = ducb->mti->mti_mdt; + struct coordinator *cdt = &mdt->mdt_coordinator; + int rc, i; ENTRY; - larr = (struct llog_agent_req_rec *)hdr; - ducb = data; - /* check if all done */ if (ducb->updates_count == ducb->updates_done) RETURN(LLOG_PROC_BREAK); @@ -354,9 +355,9 @@ static int mdt_agent_record_update_cb(const struct lu_env *env, struct hsm_record_update *update = &ducb->updates[i]; CDEBUG(D_HSM, "%s: search %#llx, found %#llx\n", - mdt_obd_name(ducb->mdt), update->cookie, - larr->arr_hai.hai_cookie); - if (larr->arr_hai.hai_cookie == update->cookie) { + mdt_obd_name(mdt), update->cookie, + hai->hai_cookie); + if (hai->hai_cookie == update->cookie) { /* If record is a cancel request, it cannot be * canceled. This is to manage the following @@ -366,21 +367,30 @@ static int mdt_agent_record_update_cb(const struct lu_env *env, * has to be set to ARS_CANCELED and the 2nd * to ARS_SUCCEED */ - if (larr->arr_hai.hai_action == HSMA_CANCEL && + if (hai->hai_action == HSMA_CANCEL && update->status == ARS_CANCELED) RETURN(0); larr->arr_status = update->status; larr->arr_req_change = ducb->change_time; rc = llog_write(env, llh, hdr, hdr->lrh_index); + if (rc < 0) + break; + ducb->updates_done++; + + /* Unlock the EX layout lock */ + if (hai->hai_action == HSMA_RESTORE && + update->status == ARS_CANCELED) + cdt_restore_handle_del(mti, cdt, &hai->hai_fid); + break; } } if (rc < 0) CERROR("%s: mdt_agent_llog_update_rec() failed, rc = %d\n", - mdt_obd_name(ducb->mdt), rc); + mdt_obd_name(mdt), rc); RETURN(rc); } @@ -396,10 +406,12 @@ static int mdt_agent_record_update_cb(const struct lu_env *env, * \retval 0 on success * \retval negative on failure */ -int mdt_agent_record_update(const struct lu_env *env, struct mdt_device *mdt, +int mdt_agent_record_update(struct mdt_thread_info *mti, struct hsm_record_update *updates, unsigned int updates_count) { + const struct lu_env *env = mti->mti_env; + struct mdt_device *mdt = mti->mti_mdt; struct data_update_cb ducb; u32 start_cat_idx = -1; u32 start_rec_idx = -1; @@ -433,7 +445,7 @@ int mdt_agent_record_update(const struct lu_env *env, struct mdt_device *mdt, if (start_rec_idx != 0) start_rec_idx -= 1; - ducb.mdt = mdt; + ducb.mti = mti; ducb.updates = updates; ducb.updates_count = updates_count; ducb.updates_done = 0; diff --git a/lustre/mdt/mdt_hsm_cdt_agent.c b/lustre/mdt/mdt_hsm_cdt_agent.c index 1767e11..3e51788 100644 --- a/lustre/mdt/mdt_hsm_cdt_agent.c +++ b/lustre/mdt/mdt_hsm_cdt_agent.c @@ -426,8 +426,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti, update.cookie = hai->hai_cookie; update.status = ARS_SUCCEED; - rc2 = mdt_agent_record_update(mti->mti_env, mdt, - &update, 1); + rc2 = mdt_agent_record_update(mti, &update, 1); if (rc2) { CERROR("%s: mdt_agent_record_update() " "failed, cannot update " @@ -483,8 +482,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti, continue; fail_request = true; - rc = mdt_agent_record_update(mti->mti_env, mdt, - &update, 1); + rc = mdt_agent_record_update(mti, &update, 1); if (rc < 0) { CERROR("%s: mdt_agent_record_update() failed, " "cannot update status to %s for cookie " @@ -512,8 +510,7 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti, * make the same HAL with valid only * records */ fail_request = true; - rc = mdt_agent_record_update(mti->mti_env, mdt, - &update, 1); + rc = mdt_agent_record_update(mti, &update, 1); if (rc) { CERROR("%s: mdt_agent_record_update() failed, " "cannot update status to %s for cookie " diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 2726f86..f3cbbb0 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -1038,7 +1038,7 @@ int cdt_llog_process(const struct lu_env *env, struct mdt_device *mdt, int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt, __u32 archive_id, __u64 flags, struct hsm_action_item *hai); -int mdt_agent_record_update(const struct lu_env *env, struct mdt_device *mdt, +int mdt_agent_record_update(struct mdt_thread_info *mti, struct hsm_record_update *updates, unsigned int updates_count); void cdt_agent_record_hash_add(struct coordinator *cdt, u64 cookie, u32 cat_idt, diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index d739235..d30bb2c 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -3516,6 +3516,60 @@ test_103() { } run_test 103 "Purge all requests" +test_103a() { + cdt_clear_non_blocking_restore + + # test needs a running copytool + copytool setup + + local -a fids=() + local i + + mkdir_on_mdt0 $DIR/$tdir + for i in {0..9}; do + fids+=( $(copy_file /etc/passwd $DIR/$tdir/${tfile}_$i) ) + done + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tdir/* + + local time=0 + local cnt=0 + local grep_regex="($(tr ' ' '|' <<< "${fids[*]}")).*action=ARCHIVE.*status=SUCCEED" + echo $grep_regex + while [[ $time -lt 5 ]] && [[ $cnt -ne ${#fids[@]} ]]; do + cnt=$(do_facet mds1 "$LCTL get_param -n $HSM_PARAM.actions | + grep -c -E '$grep_regex'") + sleep 1 + ((++time)) + done + [[ $cnt -eq ${#fids[@]} ]] || error "Fail to archive files $cnt/${#fids[@]}" + + $LFS hsm_release $DIR/$tdir/* + + kill_copytools + wait_copytools || error "Copytool failed to stop" + + local -a pids=() + for i in "${fids[@]}"; do + cat $DIR/.lustre/fid/$i > /dev/null & pids+=($!) + done + + cdt_purge + grep_regex="($(tr ' ' '|' <<< "${fids[*]}")).*action=RESTORE.*status=CANCELED" + cnt=$(do_facet mds1 "$LCTL get_param -n $HSM_PARAM.actions | + grep -cE '$grep_regex'") + + [[ "$cnt" -eq ${#fids[@]} ]] || + error "Some request have not been canceled ($cnt/${#fids[@]} canceled)" + + # cat cmds should not hang and should fail + for i in "${!pids[@]}"; do + wait ${pids[$i]} && + error "Restore for ${tfile}_$i (${pids[$i]}) should fail" || + true + done +} +run_test 103a "Purge pending restore requests" + DATA=CEA DATAHEX='[434541]' test_104() {