From fc84852e990dd3174b63ab8e16e6bc3c0b1b092a Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Mon, 27 Jun 2016 11:25:20 +0200 Subject: [PATCH] LU-6449 mdt: broadcast orphan hsm_remove requests If a hsm_remove request is received for an unlinked file with no/0 archive_id specified and no Agent/CT has registered to serve all archive_ids, broadcast the request once to each registered archive_id. Also created specific sanity-hsm/test_29d sub-test using "lfs hsm_remove " capability introduced by LU-6494. Test-Parameters: clientcount=4 Signed-off-by: Bruno Faccini Change-Id: Ice5acaa5116dc036d5a98d76368eba2023a29f49 Reviewed-on: http://review.whamcloud.com/20991 Reviewed-by: Frank Zago Tested-by: Jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Oleg Drokin --- lustre/mdt/mdt_hsm_cdt_actions.c | 8 +-- lustre/mdt/mdt_hsm_cdt_agent.c | 108 +++++++++++++++++++++++++++++++++++++++ lustre/tests/sanity-hsm.sh | 76 +++++++++++++++++++++++++++ 3 files changed, 189 insertions(+), 3 deletions(-) diff --git a/lustre/mdt/mdt_hsm_cdt_actions.c b/lustre/mdt/mdt_hsm_cdt_actions.c index 137cb4a..619103c 100644 --- a/lustre/mdt/mdt_hsm_cdt_actions.c +++ b/lustre/mdt/mdt_hsm_cdt_actions.c @@ -157,11 +157,13 @@ int mdt_agent_record_add(const struct lu_env *env, /* in case of cancel request, the cookie is already set to the * value of the request cookie to be cancelled * so we do not change it */ - if (hai->hai_action != HSMA_CANCEL) { + if (hai->hai_action == HSMA_CANCEL) { + larr->arr_hai.hai_cookie = hai->hai_cookie; + } else { cdt->cdt_last_cookie++; - hai->hai_cookie = cdt->cdt_last_cookie; + larr->arr_hai.hai_cookie = cdt->cdt_last_cookie; } - larr->arr_hai.hai_cookie = hai->hai_cookie; + rc = llog_cat_add(env, lctxt->loc_handle, &larr->arr_hdr, NULL); if (rc > 0) rc = 0; diff --git a/lustre/mdt/mdt_hsm_cdt_agent.c b/lustre/mdt/mdt_hsm_cdt_agent.c index c97be71..38bc739 100644 --- a/lustre/mdt/mdt_hsm_cdt_agent.c +++ b/lustre/mdt/mdt_hsm_cdt_agent.c @@ -306,6 
+306,79 @@ int mdt_hsm_find_best_agent(struct coordinator *cdt, __u32 archive,
 	RETURN(rc);
 }
 
+/**
+ * add one request record per registered archive_id for an action
+ *
+ * Walk the coordinator's agent list, holding cdt_agent_lock for read, and
+ * call mdt_agent_record_add() once for each distinct archive_id found,
+ * each time with a new compound id taken from cdt_compound_id.
+ *
+ * \param mti [IN] context
+ * \param hai [IN] action item to record for each archive_id
+ * \retval 0 a record was added for every archive_id found
+ * \retval -EAGAIN no archive_id was found in the agent list
+ * \retval other error returned by mdt_agent_record_add()
+ */
+int mdt_hsm_send_action_to_each_archive(struct mdt_thread_info *mti,
+				    struct hsm_action_item *hai)
+{
+	struct coordinator *cdt = &mti->mti_mdt->mdt_coordinator;
+	struct hsm_agent *ha;
+	__u64 compound_id;
+	/* one bit per archive_id already handled; 64 bits wide so that
+	 * shifting by any archive_id below 64 is well defined */
+	__u64 archive_mask = 0;
+	int i;
+	/* return error by default in case all archive_ids have unregistered */
+	int rc = -EAGAIN;
+	ENTRY;
+
+	/* send action to all registered archive_ids */
+	down_read(&cdt->cdt_agent_lock);
+	list_for_each_entry(ha, &cdt->cdt_agents, ha_list) {
+		for (i = 0; i < ha->ha_archive_cnt; i++) {
+			/* only send once for each archive_id; an id too
+			 * large for the mask cannot be tracked and is
+			 * never skipped */
+			if (ha->ha_archive_id[i] < 8 * sizeof(archive_mask)) {
+				__u64 bit = 1ULL << ha->ha_archive_id[i];
+
+				if (archive_mask & bit)
+					continue;
+				archive_mask |= bit;
+			}
+
+			/* XXX: instead of creating one request record per
+			 * new action, it could make sense to gather
+			 * all for the same archive_id as one compound
+			 * request/id, like in mdt_hsm_add_actions() ?? */
+			compound_id = atomic_inc_return(&cdt->cdt_compound_id);
+			rc = mdt_agent_record_add(mti->mti_env, mti->mti_mdt,
+						  compound_id,
+						  ha->ha_archive_id[i], 0,
+						  hai);
+			if (rc) {
+				CERROR("%s: unable to add HSM remove request "
+				       "for "DFID": rc=%d\n",
+				       mdt_obd_name(mti->mti_mdt),
+				       PFID(&hai->hai_fid), rc);
+				/* stop at the first failure */
+				goto out_unlock;
+			}
+
+			CDEBUG(D_HSM, "%s: added HSM remove request "
+			       "for "DFID", archive_id=%d\n",
+			       mdt_obd_name(mti->mti_mdt),
+			       PFID(&hai->hai_fid),
+			       ha->ha_archive_id[i]);
+		}
+	}
+out_unlock:
+	up_read(&cdt->cdt_agent_lock);
+
+	RETURN(rc);
+}
+
 /**
  * send a compound request to the agent
  * \param mti [IN] context
@@ -334,6 +407,61 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
 	ENTRY;
 
 	rc = mdt_hsm_find_best_agent(cdt, hal->hal_archive_id, &uuid);
+	if (rc && hal->hal_archive_id == 0) {
+		uint notrmcount = 0;
+		int rc2 = 0;
+
+		/* special case of remove requests with no archive_id specified,
+		 * and no agent registered to serve all archives, then create a
+		 * set of new requests, each to be sent to each registered
+		 * archives.
+		 * To do so, find all HSMA_REMOVE entries, and then :
+		 * _ set completed status as SUCCESS (or FAIL?)
+		 * _ create a new LLOG record for each archive_id
+		 *   presently being served by any CT
+		 */
+		hai = hai_first(hal);
+		for (i = 0; i < hal->hal_count; i++,
+		     hai = hai_next(hai)) {
+			/* only removes are concerned */
+			if (hai->hai_action != HSMA_REMOVE) {
+				/* count if other actions than HSMA_REMOVE,
+				 * to return original error/rc */
+				notrmcount++;
+				continue;
+			}
+
+			/* send remove request to all registered archive_ids */
+			rc2 = mdt_hsm_send_action_to_each_archive(mti, hai);
+			if (rc2)
+				break;
+
+			/* only update original request as SUCCEED if it has
+			 * been successfully broadcasted to all available
+			 * archive_ids
+			 * XXX: this should only cause duplicates to be sent,
+			 * unless a method to record already successfully
+			 * reached archive_ids is implemented */
+			rc2 = mdt_agent_record_update(mti->mti_env, mdt,
+						      &hai->hai_cookie,
+						      1, ARS_SUCCEED);
+			if (rc2) {
+				CERROR("%s: mdt_agent_record_update() "
+				      "failed, cannot update "
+				      "status to %s for cookie "
+				      LPX64": rc = %d\n",
+				      mdt_obd_name(mdt),
+				      agent_req_status2name(ARS_SUCCEED),
+				      hai->hai_cookie, rc2);
+				break;
+			}
+		}
+		/* only remove requests with archive_id=0 */
+		if (notrmcount == 0)
+			RETURN(rc2);
+
+	}
+
 	if (rc) {
 		CERROR("%s: Cannot find agent for archive %d: rc = %d\n",
 		       mdt_obd_name(mdt),
hal->hal_archive_id, rc);
diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh
index b6b29e6b..4bb0aeb 100755
--- a/lustre/tests/sanity-hsm.sh
+++ b/lustre/tests/sanity-hsm.sh
@@ -752,6 +752,11 @@ wait_for_grace_delay() {
 	sleep $val
 }
 
+wait_for_loop_period() { # sleep for the value of the HSM loop_period param
+	local val=$(get_hsm_param loop_period)
+	sleep $val
+}
+
 parse_json_event() {
 	local raw_event=$1
 
@@ -2424,6 +2429,77 @@ test_29c() {
 }
 run_test 29c "Archive/delete/remove by FID, using a file list."
 
+test_29d() { # hsm_remove -a 0 of an unlinked file must reach every CT
+	# test needs more than one CT; skip unless needclients 3 succeeds
+	needclients 3 || return 0
+
+	local n
+	local file
+	local fid
+
+	copytool_cleanup $(comma_list $(agts_nodes))
+
+	# start one copytool per agent; agt$n is set up with archive_id $n
+	for n in $(seq $AGTCOUNT); do
+		copytool_setup agt$n $MOUNT2 $n
+	done
+
+	trap "copytool_cleanup $(comma_list $(agts_nodes))" EXIT
+	# create a small file and archive it
+	mkdir -p $DIR/$tdir
+	file=$DIR/$tdir/$tfile
+	fid=$(make_small $file)
+
+	$LFS hsm_archive $file
+	wait_request_state $fid ARCHIVE SUCCEED
+	check_hsm_flags $file "0x00000009"
+
+	rm -f $file
+
+	$LFS hsm_remove -a 0 $fid
+
+	# give the CDT time to handle the remove request and broadcast it
+	sleep 2
+
+	# has the remove request been broadcast?
+	local cnt=$(get_request_count $fid REMOVE)
+	# expect one broadcast request per CT plus the original request
+	[[ $cnt -eq $((AGTCOUNT + 1)) ]] ||
+		error "remove not broadcasted to all CTs"
+
+	# give the CDT and the CTs time to handle the broadcast requests
+	wait_for_loop_period
+
+	# each agent serves a different archive_id, so the broadcast
+	# hsm_remove should succeed on exactly one CT and fail on all others
+	local res
+	local scnt=0
+	local fcnt=0
+	for n in $(seq $AGTCOUNT); do
+		res=$(do_facet $SINGLEMDS "$LCTL get_param -n \
+			       $HSM_PARAM.actions | awk \
+			       '/'$fid'.*action=REMOVE archive#='$n'/ \
+			       {print \\\$13}' | cut -f2 -d=")
+		if [[ "$res" == "SUCCEED" ]]; then
+			scnt=$((scnt + 1))
+		elif [[ "$res" == "FAILED" ]]; then
+			fcnt=$((fcnt + 1))
+		fi
+	done
+
+	[[ $scnt -ne 1 ]] &&
+		error "one and only CT should have removed successfully"
+
+	[[ $AGTCOUNT -ne $((scnt + fcnt)) ]] &&
+		error "all but one CT should have failed to remove"
+
+	trap - EXIT
+	copytool_cleanup $(comma_list $(agts_nodes))
+
+}
+run_test 29d "hsm_remove by FID with archive_id 0 for unlinked file cause "\
+	"request to be sent once for each registered archive_id"
+
 test_30a() {
 	# restore at exec cannot work on agent node (because of Linux kernel
 	# protection of executables)
-- 
1.8.3.1