Whamcloud - gitweb
LU-6449 mdt: broadcast orphan hsm_remove requests 91/20991/6
authorBruno Faccini <bruno.faccini@intel.com>
Mon, 27 Jun 2016 09:25:20 +0000 (11:25 +0200)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 11 Aug 2016 05:51:17 +0000 (05:51 +0000)
If an hsm_remove request is received for an unlinked file with no
archive_id (or archive_id 0) specified, and no Agent/CT has registered
to serve all archive_ids, broadcast the request once to each registered
archive_id.

Also add a dedicated sanity-hsm/test_29d sub-test that uses the
"lfs hsm_remove <fid>" capability introduced by LU-6494.

Test-Parameters: clientcount=4
Signed-off-by: Bruno Faccini <bruno.faccini@intel.com>
Change-Id: Ice5acaa5116dc036d5a98d76368eba2023a29f49
Reviewed-on: http://review.whamcloud.com/20991
Reviewed-by: Frank Zago <fzago@cray.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/mdt/mdt_hsm_cdt_actions.c
lustre/mdt/mdt_hsm_cdt_agent.c
lustre/tests/sanity-hsm.sh

index 137cb4a..619103c 100644 (file)
@@ -157,11 +157,13 @@ int mdt_agent_record_add(const struct lu_env *env,
        /* in case of cancel request, the cookie is already set to the
         * value of the request cookie to be cancelled
         * so we do not change it */
-       if (hai->hai_action != HSMA_CANCEL) {
+       if (hai->hai_action == HSMA_CANCEL) {
+               larr->arr_hai.hai_cookie = hai->hai_cookie;
+       } else {
                cdt->cdt_last_cookie++;
-               hai->hai_cookie = cdt->cdt_last_cookie;
+               larr->arr_hai.hai_cookie = cdt->cdt_last_cookie;
        }
-       larr->arr_hai.hai_cookie = hai->hai_cookie;
+
        rc = llog_cat_add(env, lctxt->loc_handle, &larr->arr_hdr, NULL);
        if (rc > 0)
                rc = 0;
index c97be71..38bc739 100644 (file)
@@ -306,6 +306,59 @@ int mdt_hsm_find_best_agent(struct coordinator *cdt, __u32 archive,
        RETURN(rc);
 }
 
+/**
+ * broadcast one action item to every registered archive_id
+ *
+ * Walks the coordinator agent list with cdt_agent_lock held for read and
+ * adds one request record per distinct archive_id, each record getting its
+ * own compound id. The walk stops at the first record that cannot be added.
+ *
+ * \param mti [IN] context
+ * \param hai [IN] action item to send
+ * \retval 0 success
+ * \retval -EAGAIN no archive_id is currently registered
+ * \retval other error returned by mdt_agent_record_add()
+ */
+int mdt_hsm_send_action_to_each_archive(struct mdt_thread_info *mti,
+                                   struct hsm_action_item *hai)
+{
+       __u64 compound_id;
+       struct hsm_agent *ha;
+       /* one bit per archive_id already served; kept unsigned and 64-bit
+        * wide so that the shifts below are always well defined */
+       __u64 archive_mask = 0;
+       struct coordinator *cdt = &mti->mti_mdt->mdt_coordinator;
+       int i;
+       /* return error by default in case all archive_ids have unregistered */
+       int rc = -EAGAIN;
+       ENTRY;
+
+       /* send action to all registered archive_ids */
+       down_read(&cdt->cdt_agent_lock);
+       list_for_each_entry(ha, &cdt->cdt_agents, ha_list) {
+               for (i = 0; i < ha->ha_archive_cnt; i++) {
+                       __u32 archive_id = ha->ha_archive_id[i];
+                       __u64 archive_bit = 0;
+
+                       /* only send once for each archive_id.
+                        * An id too large for the mask keeps a zero bit:
+                        * it is never shifted out of range, it is simply
+                        * not de-duplicated */
+                       if (archive_id < sizeof(archive_mask) * 8)
+                               archive_bit = 1ULL << archive_id;
+                       if (archive_bit & archive_mask)
+                               continue;
+                       archive_mask |= archive_bit;
+
+                       /* XXX: instead of creating one request record per
+                        * new action, it could make sense to gather
+                        * all for the same archive_id as one compound
+                        * request/id, like in mdt_hsm_add_actions() ?? */
+                       compound_id = atomic_inc_return(&cdt->cdt_compound_id);
+                       rc = mdt_agent_record_add(mti->mti_env, mti->mti_mdt,
+                                                 compound_id,
+                                                 archive_id, 0,
+                                                 hai);
+                       if (rc) {
+                               CERROR("%s: unable to add HSM remove request "
+                                      "for "DFID": rc=%d\n",
+                                      mdt_obd_name(mti->mti_mdt),
+                                      PFID(&hai->hai_fid), rc);
+                               break;
+                       }
+
+                       CDEBUG(D_HSM, "%s: added HSM remove request "
+                              "for "DFID", archive_id=%d\n",
+                              mdt_obd_name(mti->mti_mdt),
+                              PFID(&hai->hai_fid),
+                              archive_id);
+               }
+               /* early exit from loop due to error? */
+               if (i != ha->ha_archive_cnt)
+                       break;
+       }
+       up_read(&cdt->cdt_agent_lock);
+
+       RETURN(rc);
+}
+
 /**
  * send a compound request to the agent
  * \param mti [IN] context
@@ -334,6 +387,61 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
        ENTRY;
 
        rc = mdt_hsm_find_best_agent(cdt, hal->hal_archive_id, &uuid);
+       if (rc && hal->hal_archive_id == 0) {
+               uint notrmcount = 0;
+               int rc2 = 0;
+
+               /* special case of remove requests with no archive_id specified,
+                * and no agent registered to serve all archives: create a
+                * set of new requests, one to be sent to each registered
+                * archive.
+                * To do so, find all HSMA_REMOVE entries, and then :
+                *     _ set completed status as SUCCESS (or FAIL?)
+                *     _ create a new LLOG record for each archive_id
+                *       presently being served by any CT
+                */
+               hai = hai_first(hal);
+               for (i = 0; i < hal->hal_count; i++,
+                    hai = hai_next(hai)) {
+                       /* only removes are concerned */
+                       if (hai->hai_action != HSMA_REMOVE) {
+                               /* count if other actions than HSMA_REMOVE,
+                                * to return original error/rc */
+                               notrmcount++;
+                               continue;
+                       }
+
+                       /* send remove request to all registered archive_ids */
+                       rc2 = mdt_hsm_send_action_to_each_archive(mti, hai);
+                       if (rc2)
+                               break;
+
+                       /* only update original request as SUCCEED if it has
+                        * been successfully broadcasted to all available
+                        * archive_ids
+                        * XXX: this should only cause duplicates to be sent,
+                        * unless a method to record already successfully
+                        * reached archive_ids is implemented */
+                       rc2 = mdt_agent_record_update(mti->mti_env, mdt,
+                                                    &hai->hai_cookie,
+                                                    1, ARS_SUCCEED);
+                       if (rc2) {
+                               CERROR("%s: mdt_agent_record_update() "
+                                     "failed, cannot update "
+                                     "status to %s for cookie "
+                                     LPX64": rc = %d\n",
+                                     mdt_obd_name(mdt),
+                                     agent_req_status2name(ARS_SUCCEED),
+                                     hai->hai_cookie, rc2);
+                               break;
+                       }
+               }
+               /* only remove requests with archive_id=0 */
+               if (notrmcount == 0)
+                       RETURN(rc2);
+
+       }
+
        if (rc) {
                CERROR("%s: Cannot find agent for archive %d: rc = %d\n",
                       mdt_obd_name(mdt), hal->hal_archive_id, rc);
index b6b29e6..4bb0aeb 100755 (executable)
@@ -752,6 +752,11 @@ wait_for_grace_delay() {
        sleep $val
 }
 
+wait_for_loop_period() {
+       local val=$(get_hsm_param loop_period)
+       sleep $val
+}
+
 parse_json_event() {
        local raw_event=$1
 
@@ -2424,6 +2429,77 @@ test_29c() {
 }
 run_test 29c "Archive/delete/remove by FID, using a file list."
 
+test_29d() {
+       # test needs more than one CT, so return early without enough clients
+       needclients 3 || return 0
+
+       local n
+       local file
+       local fid
+
+       copytool_cleanup $(comma_list $(agts_nodes))
+
+       # start one copytool per agent (last arg presumably its archive_id)
+       for n in $(seq $AGTCOUNT); do
+               copytool_setup agt$n $MOUNT2 $n
+       done
+
+       trap "copytool_cleanup $(comma_list $(agts_nodes))" EXIT
+       # create a small file, archive it, then unlink it before hsm_remove
+       mkdir -p $DIR/$tdir
+       file=$DIR/$tdir/$tfile
+       fid=$(make_small $file)
+
+       $LFS hsm_archive $file
+       wait_request_state $fid ARCHIVE SUCCEED
+       check_hsm_flags $file "0x00000009"
+
+       rm -f $file
+
+       $LFS hsm_remove -a 0 $fid
+
+       # give CDT time to handle the remove request and create broadcast ones
+       sleep 2
+
+       # has the remove request been broadcast?
+       local cnt=$(get_request_count $fid REMOVE)
+       # one broadcast request per agent + the original request
+       [[ $cnt -eq $((AGTCOUNT + 1)) ]] ||
+               error "remove not broadcasted to all CTs"
+
+       # give CDT and CTs time to handle the broadcast requests
+       wait_for_loop_period
+
+       # each agent serves a different archive_id, so the broadcast
+       # hsm_remove should succeed on exactly one and fail on all others
+       local res
+       local scnt=0
+       local fcnt=0
+       for n in $(seq $AGTCOUNT); do
+               res=$(do_facet $SINGLEMDS "$LCTL get_param -n \
+                              $HSM_PARAM.actions | awk \
+                              '/'$fid'.*action=REMOVE archive#='$n'/ \
+                              {print \\\$13}' | cut -f2 -d=")
+               if [[ "$res" == "SUCCEED" ]]; then
+                       scnt=$((scnt + 1))
+               elif [[ "$res" == "FAILED" ]]; then
+                       fcnt=$((fcnt + 1))
+               fi
+       done
+
+       [[ $scnt -ne 1 ]] &&
+               error "one and only CT should have removed successfully"
+
+       [[ $AGTCOUNT -ne $((scnt + fcnt)) ]] &&
+               error "all but one CT should have failed to remove"
+
+       trap - EXIT
+       copytool_cleanup $(comma_list $(agts_nodes))
+
+}
+run_test 29d "hsm_remove by FID with archive_id 0 for unlinked file cause "\
+            "request to be sent once for each registered archive_id"
+
 test_30a() {
        # restore at exec cannot work on agent node (because of Linux kernel
        # protection of executables)