Whamcloud - gitweb
LU-10383 hsm: flatten mdt_cdt_started_cb() 61/30561/4
authorJohn L. Hammond <john.hammond@intel.com>
Fri, 15 Dec 2017 20:19:46 +0000 (14:19 -0600)
committerOleg Drokin <oleg.drokin@intel.com>
Sun, 14 Jan 2018 02:38:00 +0000 (02:38 +0000)
Rewrite mdt_cdt_started_cb() to avoid creating a fake progress kernel
for mdt_hsm_update_request_state() and handle the cleanup from the
timedout action directly. Cancel cancel actions that have timedout
rather than leaving them in the log indefinitely. The code is improved
in several places to clean up all resources associated with the action
rather than having the clean up depend on unnecessary assumptions.

Since mdt_hsm_coordinator_update() in then only called from the
MDS_HSM_PROGRESS handler, the update_record parameter can be removed
aw well as the now useless wrapper function
mdt_hsm_coordinator_update().

Test-Parameters: trivial testlist=sanity-hsm
Signed-off-by: John L. Hammond <john.hammond@intel.com>
Change-Id: Ic6663b29b2a87de0da59085ccbe297b50abd049d
Reviewed-on: https://review.whamcloud.com/30561
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Faccini Bruno <bruno.faccini@intel.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/mdt/mdt_coordinator.c
lustre/mdt/mdt_hsm.c
lustre/mdt/mdt_hsm_cdt_agent.c
lustre/mdt/mdt_internal.h

index 9dba7c1..fd792dc 100644 (file)
@@ -274,10 +274,11 @@ static int mdt_cdt_started_cb(const struct lu_env *env,
                              struct hsm_scan_data *hsd)
 {
        struct coordinator *cdt = &mdt->mdt_coordinator;
-       struct hsm_progress_kernel pgs;
+       struct hsm_action_item *hai = &larr->arr_hai;
        struct cdt_agent_req *car;
        time64_t now = ktime_get_real_seconds();
        time64_t last;
+       int cl_flags;
        int rc;
 
        if (!hsd->housekeeping)
@@ -287,64 +288,75 @@ static int mdt_cdt_started_cb(const struct lu_env *env,
         * error may happen if coordinator crashes or stopped
         * with running request
         */
-       car = mdt_cdt_find_request(cdt, larr->arr_hai.hai_cookie);
+       car = mdt_cdt_find_request(cdt, hai->hai_cookie);
        if (car == NULL) {
                last = larr->arr_req_change;
        } else {
                last = car->car_req_update;
-               mdt_cdt_put_request(car);
        }
 
        /* test if request too long, if yes cancel it
         * the same way the copy tool acknowledge a cancel request */
        if (now <= last + cdt->cdt_active_req_timeout)
-               RETURN(0);
+               GOTO(out_car, rc = 0);
 
        dump_llog_agent_req_rec("request timed out, start cleaning", larr);
-       /* a too old cancel request just needs to be removed
-        * this can happen, if copy tool does not support
-        * cancel for other requests, we have to remove the
-        * running request and notify the copytool */
-       pgs.hpk_fid = larr->arr_hai.hai_fid;
-       pgs.hpk_cookie = larr->arr_hai.hai_cookie;
-       pgs.hpk_extent = larr->arr_hai.hai_extent;
-       pgs.hpk_flags = HP_FLAG_COMPLETED;
-       pgs.hpk_errval = ENOSYS;
-       pgs.hpk_data_version = 0;
-
-       /* update request state, but do not record in llog, to
-        * avoid deadlock on cdt_llog_lock */
-       rc = mdt_hsm_update_request_state(hsd->mti, &pgs, 0);
-       if (rc)
-               CERROR("%s: cannot cleanup timed out request: "
-                      DFID" for cookie %#llx action=%s\n",
-                      mdt_obd_name(mdt),
-                      PFID(&pgs.hpk_fid), pgs.hpk_cookie,
-                      hsm_copytool_action2name(larr->arr_hai.hai_action));
-
-       if (rc == -ENOENT) {
-               /* The request no longer exists, forget
-                * about it, and do not send a cancel request
-                * to the client, for which an error will be
-                * sent back, leading to an endless cycle of
-                * cancellation. */
-               cdt_agent_record_hash_del(cdt, larr->arr_hai.hai_cookie);
-               RETURN(LLOG_DEL_RECORD);
+
+       if (car != NULL) {
+               car->car_req_update = now;
+               mdt_hsm_agent_update_statistics(cdt, 0, 1, 0, &car->car_uuid);
+               /* Remove car from memory list (LU-9075) */
+               mdt_cdt_remove_request(cdt, hai->hai_cookie);
        }
 
-       /* XXX A cancel request cannot be cancelled. */
-       if (larr->arr_hai.hai_action == HSMA_CANCEL)
-               RETURN(0);
+       /* Emit a changelog record for the failed action.*/
+       cl_flags = 0;
+       hsm_set_cl_error(&cl_flags, ECANCELED);
+
+       switch (hai->hai_action) {
+       case HSMA_ARCHIVE:
+               hsm_set_cl_event(&cl_flags, HE_ARCHIVE);
+               break;
+       case HSMA_RESTORE:
+               hsm_set_cl_event(&cl_flags, HE_RESTORE);
+               break;
+       case HSMA_REMOVE:
+               hsm_set_cl_event(&cl_flags, HE_REMOVE);
+               break;
+       case HSMA_CANCEL:
+               hsm_set_cl_event(&cl_flags, HE_CANCEL);
+               break;
+       default:
+               /* Unknown record type, skip changelog. */
+               cl_flags = 0;
+               break;
+       }
+
+       if (cl_flags != 0)
+               mo_changelog(env, CL_HSM, cl_flags, mdt->mdt_child,
+                            &hai->hai_fid);
+
+       if (hai->hai_action == HSMA_RESTORE)
+               cdt_restore_handle_del(hsd->mti, cdt, &hai->hai_fid);
 
        larr->arr_status = ARS_CANCELED;
        larr->arr_req_change = now;
        rc = llog_write(hsd->mti->mti_env, llh, &larr->arr_hdr,
                        larr->arr_hdr.lrh_index);
-       if (rc < 0)
+       if (rc < 0) {
                CERROR("%s: cannot update agent log: rc = %d\n",
                       mdt_obd_name(mdt), rc);
+               rc = LLOG_DEL_RECORD;
+       }
 
-       RETURN(0);
+       /* ct has completed a request, so a slot is available,
+        * signal the coordinator to find new work */
+       mdt_hsm_cdt_event(cdt);
+out_car:
+       if (car != NULL)
+               mdt_cdt_put_request(car);
+
+       RETURN(rc);
 }
 
 /**
@@ -1332,7 +1344,6 @@ out:
  * update status of a completed request
  * \param mti [IN] context
  * \param pgs [IN] progress of the copy tool
- * \param update_record [IN] update llog record
  * \retval 0 success
  * \retval -ve failure
  */
@@ -1541,13 +1552,11 @@ out:
  * update status of a request
  * \param mti [IN] context
  * \param pgs [IN] progress of the copy tool
- * \param update_record [IN] update llog record
  * \retval 0 success
  * \retval -ve failure
  */
 int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
-                                struct hsm_progress_kernel *pgs,
-                                const int update_record)
+                                struct hsm_progress_kernel *pgs)
 {
        struct mdt_device       *mdt = mti->mti_mdt;
        struct coordinator      *cdt = &mdt->mdt_coordinator;
@@ -1616,36 +1625,32 @@ int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
        hsm_init_ucred(mdt_ucred(mti));
 
        if (pgs->hpk_flags & HP_FLAG_COMPLETED) {
-               enum agent_req_status    status;
+               enum agent_req_status status;
+               struct hsm_record_update update;
+               int rc1;
 
                rc = hsm_cdt_request_completed(mti, pgs, car, &status);
 
-               CDEBUG(D_HSM, "%s record: fid="DFID" cookie=%#llx action=%s "
+               CDEBUG(D_HSM, "updating record: fid="DFID" cookie=%#llx action=%s "
                              "status=%s\n",
-                      update_record ? "Updating" : "Not updating",
                       PFID(&pgs->hpk_fid), pgs->hpk_cookie,
                       hsm_copytool_action2name(car->car_hai->hai_action),
                       agent_req_status2name(status));
 
                /* update record first (LU-9075) */
-               if (update_record) {
-                       int rc1;
-                       struct hsm_record_update update = {
-                               .cookie = pgs->hpk_cookie,
-                               .status = status,
-                       };
-
-                       rc1 = mdt_agent_record_update(mti->mti_env, mdt,
-                                                     &update, 1);
-                       if (rc1)
-                               CERROR("%s: mdt_agent_record_update() failed,"
-                                      " rc=%d, cannot update status to %s"
-                                      " for cookie %#llx\n",
-                                      mdt_obd_name(mdt), rc1,
-                                      agent_req_status2name(status),
-                                      pgs->hpk_cookie);
-                       rc = (rc != 0 ? rc : rc1);
-               }
+               update.cookie = pgs->hpk_cookie;
+               update.status = status;
+
+               rc1 = mdt_agent_record_update(mti->mti_env, mdt,
+                                             &update, 1);
+               if (rc1)
+                       CERROR("%s: mdt_agent_record_update() failed,"
+                              " rc=%d, cannot update status to %s"
+                              " for cookie %#llx\n",
+                              mdt_obd_name(mdt), rc1,
+                              agent_req_status2name(status),
+                              pgs->hpk_cookie);
+               rc = (rc != 0 ? rc : rc1);
 
                /* then remove request from memory list (LU-9075) */
                mdt_cdt_remove_request(cdt, pgs->hpk_cookie);
index 9041d5f..e870268 100644 (file)
@@ -138,7 +138,7 @@ int mdt_hsm_progress(struct tgt_session_info *tsi)
        if (!mdt_hsm_is_admin(info))
                GOTO(out, rc = -EPERM);
 
-       rc = mdt_hsm_coordinator_update(info, hpk);
+       rc = mdt_hsm_update_request_state(info, hpk);
 out:
        mdt_thread_info_fini(info);
        RETURN(rc);
index 92c0eb6..b7c8ce0 100644 (file)
@@ -610,25 +610,6 @@ out_buf:
 }
 
 /**
- * update status of a request
- * \param mti [IN]
- * \param pgs [IN] progress of the copy tool
- * \retval 0 success
- * \retval -ve failure
- */
-int mdt_hsm_coordinator_update(struct mdt_thread_info *mti,
-                              struct hsm_progress_kernel *pgs)
-{
-       int      rc;
-
-       ENTRY;
-       /* ask to coordinator to update request state and
-        * to record on disk the result */
-       rc = mdt_hsm_update_request_state(mti, pgs, 1);
-       RETURN(rc);
-}
-
-/**
  * seq_file method called to start access to /proc file
  */
 static void *mdt_hsm_agent_proc_start(struct seq_file *s, loff_t *off)
index 0249708..9cd3811 100644 (file)
@@ -928,8 +928,6 @@ int mdt_hsm_find_best_agent(struct coordinator *cdt, __u32 archive,
                            struct obd_uuid *uuid);
 int mdt_hsm_agent_send(struct mdt_thread_info *mti, struct hsm_action_list *hal,
                       bool purge);
-int mdt_hsm_coordinator_update(struct mdt_thread_info *mti,
-                              struct hsm_progress_kernel *pgs);
 /* mdt/mdt_hsm_cdt_client.c */
 int mdt_hsm_add_actions(struct mdt_thread_info *info,
                        struct hsm_action_list *hal);
@@ -997,8 +995,7 @@ bool mdt_hsm_is_action_compat(const struct hsm_action_item *hai,
                              u32 archive_id, u64 rq_flags,
                              const struct md_hsm *hsm);
 int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
-                                struct hsm_progress_kernel *pgs,
-                                const int update_record);
+                                struct hsm_progress_kernel *pgs);
 
 int mdt_close_swap_layouts(struct mdt_thread_info *info,
                           struct mdt_object *o, struct md_attr *ma);