Whamcloud - gitweb
LU-9312 hsm: add a cookie indexed request hash
[fs/lustre-release.git] / lustre / mdt / mdt_coordinator.c
index 9074a90..5637ffb 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2011, 2012 Commissariat a l'energie atomique et aux energies
  *                          alternatives
  *
- * Copyright (c) 2013, 2014, Intel Corporation.
+ * Copyright (c) 2013, 2016, Intel Corporation.
  * Use is subject to license terms.
  */
 /*
@@ -282,9 +282,9 @@ static int mdt_coordinator_cb(const struct lu_env *env,
                 * error may happen if coordinator crashes or stopped
                 * with running request
                 */
-               car = mdt_cdt_find_request(cdt, larr->arr_hai.hai_cookie, NULL);
+               car = mdt_cdt_find_request(cdt, larr->arr_hai.hai_cookie);
                if (car == NULL) {
-                       last = larr->arr_req_create;
+                       last = larr->arr_req_change;
                } else {
                        last = car->car_req_update;
                        mdt_cdt_put_request(car);
@@ -417,8 +417,8 @@ static int mdt_coordinator(void *data)
        int                      request_sz;
        ENTRY;
 
-       cdt->cdt_thread.t_flags = SVC_RUNNING;
-       wake_up(&cdt->cdt_thread.t_ctl_waitq);
+       cdt->cdt_flags = SVC_RUNNING;
+       wake_up(&cdt->cdt_waitq);
 
        CDEBUG(D_HSM, "%s: coordinator thread starting, pid=%d\n",
               mdt_obd_name(mdt), current_pid());
@@ -442,23 +442,22 @@ static int mdt_coordinator(void *data)
 
                lwi = LWI_TIMEOUT(cfs_time_seconds(cdt->cdt_loop_period),
                                  NULL, NULL);
-               l_wait_event(cdt->cdt_thread.t_ctl_waitq,
-                            (cdt->cdt_thread.t_flags &
-                             (SVC_STOPPING|SVC_EVENT)),
+               l_wait_event(cdt->cdt_waitq,
+                            cdt->cdt_flags & (SVC_STOPPING|SVC_EVENT),
                             &lwi);
 
                CDEBUG(D_HSM, "coordinator resumes\n");
 
-               if (cdt->cdt_thread.t_flags & SVC_STOPPING ||
+               if (cdt->cdt_flags & SVC_STOPPING ||
                    cdt->cdt_state == CDT_STOPPING) {
-                       cdt->cdt_thread.t_flags &= ~SVC_STOPPING;
+                       cdt->cdt_flags &= ~SVC_STOPPING;
                        rc = 0;
                        break;
                }
 
                /* wake up before timeout, new work arrives */
-               if (cdt->cdt_thread.t_flags & SVC_EVENT)
-                       cdt->cdt_thread.t_flags &= ~SVC_EVENT;
+               if (cdt->cdt_flags & SVC_EVENT)
+                       cdt->cdt_flags &= ~SVC_EVENT;
 
                /* if coordinator is suspended continue to wait */
                if (cdt->cdt_state == CDT_DISABLE) {
@@ -567,8 +566,8 @@ out:
                 * by mdt_stop_coordinator(), we have to ack
                 * and cdt cleaning will be done by event sender
                 */
-               cdt->cdt_thread.t_flags = SVC_STOPPED;
-               wake_up(&cdt->cdt_thread.t_ctl_waitq);
+               cdt->cdt_flags = SVC_STOPPED;
+               wake_up(&cdt->cdt_waitq);
        }
 
        if (rc != 0)
@@ -750,8 +749,8 @@ int mdt_hsm_cdt_wakeup(struct mdt_device *mdt)
                RETURN(-ESRCH);
 
        /* wake up coordinator */
-       cdt->cdt_thread.t_flags = SVC_EVENT;
-       wake_up(&cdt->cdt_thread.t_ctl_waitq);
+       cdt->cdt_flags = SVC_EVENT;
+       wake_up(&cdt->cdt_waitq);
 
        RETURN(0);
 }
@@ -771,29 +770,39 @@ int mdt_hsm_cdt_init(struct mdt_device *mdt)
 
        cdt->cdt_state = CDT_STOPPED;
 
-       init_waitqueue_head(&cdt->cdt_thread.t_ctl_waitq);
+       init_waitqueue_head(&cdt->cdt_waitq);
        mutex_init(&cdt->cdt_llog_lock);
        init_rwsem(&cdt->cdt_agent_lock);
        init_rwsem(&cdt->cdt_request_lock);
        mutex_init(&cdt->cdt_restore_lock);
 
-       INIT_LIST_HEAD(&cdt->cdt_requests);
+       INIT_LIST_HEAD(&cdt->cdt_request_list);
        INIT_LIST_HEAD(&cdt->cdt_agents);
        INIT_LIST_HEAD(&cdt->cdt_restore_hdl);
 
+       cdt->cdt_request_cookie_hash = cfs_hash_create("REQUEST_COOKIE_HASH",
+                                                      CFS_HASH_BITS_MIN,
+                                                      CFS_HASH_BITS_MAX,
+                                                      CFS_HASH_BKT_BITS,
+                                                      0 /* extra bytes */,
+                                                      CFS_HASH_MIN_THETA,
+                                                      CFS_HASH_MAX_THETA,
+                                               &cdt_request_cookie_hash_ops,
+                                                      CFS_HASH_DEFAULT);
+       if (cdt->cdt_request_cookie_hash == NULL)
+               RETURN(-ENOMEM);
+
        rc = lu_env_init(&cdt->cdt_env, LCT_MD_THREAD);
        if (rc < 0)
-               RETURN(rc);
+               GOTO(out_request_cookie_hash, rc);
 
        /* for mdt_ucred(), lu_ucred stored in lu_ucred_key */
        rc = lu_context_init(&cdt->cdt_session, LCT_SERVER_SESSION);
-       if (rc == 0) {
-               lu_context_enter(&cdt->cdt_session);
-               cdt->cdt_env.le_ses = &cdt->cdt_session;
-       } else {
-               lu_env_fini(&cdt->cdt_env);
-               RETURN(rc);
-       }
+       if (rc < 0)
+               GOTO(out_env, rc);
+
+       lu_context_enter(&cdt->cdt_session);
+       cdt->cdt_env.le_ses = &cdt->cdt_session;
 
        cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
        LASSERT(cdt_mti != NULL);
@@ -813,6 +822,14 @@ int mdt_hsm_cdt_init(struct mdt_device *mdt)
        cdt->cdt_active_req_timeout = 3600;
 
        RETURN(0);
+
+out_env:
+       lu_env_fini(&cdt->cdt_env);
+out_request_cookie_hash:
+       cfs_hash_putref(cdt->cdt_request_cookie_hash);
+       cdt->cdt_request_cookie_hash = NULL;
+
+       return rc;
 }
 
 /**
@@ -829,6 +846,9 @@ int  mdt_hsm_cdt_fini(struct mdt_device *mdt)
 
        lu_env_fini(&cdt->cdt_env);
 
+       cfs_hash_putref(cdt->cdt_request_cookie_hash);
+       cdt->cdt_request_cookie_hash = NULL;
+
        RETURN(0);
 }
 
@@ -838,7 +858,7 @@ int  mdt_hsm_cdt_fini(struct mdt_device *mdt)
  * \retval 0 success
  * \retval -ve failure
  */
-int mdt_hsm_cdt_start(struct mdt_device *mdt)
+static int mdt_hsm_cdt_start(struct mdt_device *mdt)
 {
        struct coordinator      *cdt = &mdt->mdt_coordinator;
        int                      rc;
@@ -883,6 +903,9 @@ int mdt_hsm_cdt_start(struct mdt_device *mdt)
                       " for registered restore: %d\n",
                       mdt_obd_name(mdt), rc);
 
+       if (mdt->mdt_bottom->dd_rdonly)
+               RETURN(0);
+
        task = kthread_run(mdt_coordinator, cdt_mti, "hsm_cdtr");
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
@@ -896,8 +919,8 @@ int mdt_hsm_cdt_start(struct mdt_device *mdt)
                rc = 0;
        }
 
-       wait_event(cdt->cdt_thread.t_ctl_waitq,
-                      (cdt->cdt_thread.t_flags & SVC_RUNNING));
+       wait_event(cdt->cdt_waitq,
+                      (cdt->cdt_flags & SVC_RUNNING));
 
        cdt->cdt_state = CDT_RUNNING;
        mdt->mdt_opts.mo_coordinator = 1;
@@ -925,19 +948,22 @@ int mdt_hsm_cdt_stop(struct mdt_device *mdt)
 
        if (cdt->cdt_state != CDT_STOPPING) {
                /* stop coordinator thread before cleaning */
-               cdt->cdt_thread.t_flags = SVC_STOPPING;
-               wake_up(&cdt->cdt_thread.t_ctl_waitq);
-               wait_event(cdt->cdt_thread.t_ctl_waitq,
-                          cdt->cdt_thread.t_flags & SVC_STOPPED);
+               cdt->cdt_flags = SVC_STOPPING;
+               wake_up(&cdt->cdt_waitq);
+               wait_event(cdt->cdt_waitq,
+                          cdt->cdt_flags & SVC_STOPPED);
        }
        cdt->cdt_state = CDT_STOPPED;
 
        /* start cleaning */
        down_write(&cdt->cdt_request_lock);
-       list_for_each_entry_safe(car, tmp1, &cdt->cdt_requests,
+       list_for_each_entry_safe(car, tmp1, &cdt->cdt_request_list,
                                 car_request_list) {
+               cfs_hash_del(cdt->cdt_request_cookie_hash,
+                            &car->car_hai->hai_cookie,
+                            &car->car_cookie_hash);
                list_del(&car->car_request_list);
-               mdt_cdt_free_request(car);
+               mdt_cdt_put_request(car);
        }
        up_write(&cdt->cdt_request_lock);
 
@@ -1012,7 +1038,7 @@ int mdt_hsm_add_hal(struct mdt_thread_info *mti,
                        }
 
                        /* find the running request to set it canceled */
-                       car = mdt_cdt_find_request(cdt, hai->hai_cookie, NULL);
+                       car = mdt_cdt_find_request(cdt, hai->hai_cookie);
                        if (car != NULL) {
                                car->car_canceled = 1;
                                /* uuid has to be changed to the one running the
@@ -1062,39 +1088,37 @@ out:
 /**
  * swap layouts between 2 fids
  * \param mti [IN] context
- * \param fid1 [IN]
- * \param fid2 [IN]
+ * \param obj [IN] object to restore, layout lock already held
+ * \param dfid [IN] FID of the volatile data file to swap with
  * \param mh_common [IN] MD HSM
  */
 static int hsm_swap_layouts(struct mdt_thread_info *mti,
-                           const lustre_fid *fid, const lustre_fid *dfid,
+                           struct mdt_object *obj, const struct lu_fid *dfid,
                            struct md_hsm *mh_common)
 {
-       struct mdt_device       *mdt = mti->mti_mdt;
-       struct mdt_object       *child1, *child2;
-       struct mdt_lock_handle  *lh2;
+       struct mdt_object       *dobj;
+       struct mdt_lock_handle  *dlh;
        int                      rc;
        ENTRY;
 
-       child1 = mdt_object_find(mti->mti_env, mdt, fid);
-       if (IS_ERR(child1))
-               GOTO(out, rc = PTR_ERR(child1));
+       if (!mdt_object_exists(obj))
+               GOTO(out, rc = -ENOENT);
 
-       /* we already have layout lock on FID so take only
+       /* we already have layout lock on obj so take only
         * on dfid */
-       lh2 = &mti->mti_lh[MDT_LH_OLD];
-       mdt_lock_reg_init(lh2, LCK_EX);
-       child2 = mdt_object_find_lock(mti, dfid, lh2, MDS_INODELOCK_LAYOUT);
-       if (IS_ERR(child2))
-               GOTO(out_child1, rc = PTR_ERR(child2));
+       dlh = &mti->mti_lh[MDT_LH_OLD];
+       mdt_lock_reg_init(dlh, LCK_EX);
+       dobj = mdt_object_find_lock(mti, dfid, dlh, MDS_INODELOCK_LAYOUT);
+       if (IS_ERR(dobj))
+               GOTO(out, rc = PTR_ERR(dobj));
 
        /* if copy tool closes the volatile before sending the final
         * progress through llapi_hsm_copy_end(), all the objects
         * are removed and mdd_swap_layout LBUG */
-       if (!mdt_object_exists(child2)) {
+       if (!mdt_object_exists(dobj)) {
                CERROR("%s: Copytool has closed volatile file "DFID"\n",
                       mdt_obd_name(mti->mti_mdt), PFID(dfid));
-               GOTO(out_child2, rc = -ENOENT);
+               GOTO(out_dobj, rc = -ENOENT);
        }
        /* Since we only handle restores here, unconditionally use
         * SWAP_LAYOUTS_MDS_HSM flag to ensure original layout will
@@ -1105,17 +1129,15 @@ static int hsm_swap_layouts(struct mdt_thread_info *mti,
         * only need to clear RELEASED and DIRTY.
         */
        mh_common->mh_flags &= ~(HS_RELEASED | HS_DIRTY);
-       rc = mdt_hsm_attr_set(mti, child2, mh_common);
+       rc = mdt_hsm_attr_set(mti, dobj, mh_common);
        if (rc == 0)
                rc = mo_swap_layouts(mti->mti_env,
-                                    mdt_object_child(child1),
-                                    mdt_object_child(child2),
+                                    mdt_object_child(obj),
+                                    mdt_object_child(dobj),
                                     SWAP_LAYOUTS_MDS_HSM);
 
-out_child2:
-       mdt_object_unlock_put(mti, child2, lh2, 1);
-out_child1:
-       mdt_object_put(mti->mti_env, child1);
+out_dobj:
+       mdt_object_unlock_put(mti, dobj, dlh, 1);
 out:
        RETURN(rc);
 }
@@ -1140,20 +1162,17 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
        int                      cl_flags = 0, rc = 0;
        struct md_hsm            mh;
        bool                     is_mh_changed;
+       bool                     need_changelog = true;
        ENTRY;
 
        /* default is to retry */
        *status = ARS_WAITING;
 
-       /* find object by FID */
+       /* find object by FID; mdt_hsm_get_md_hsm() returns obj or an error.
+        * On error/removal, continue anyway so that reporting is still done */
        obj = mdt_hsm_get_md_hsm(mti, &car->car_hai->hai_fid, &mh);
        /* we will update MD HSM only if needed */
        is_mh_changed = false;
-       if (IS_ERR(obj)) {
-               /* object removed */
-               *status = ARS_SUCCEED;
-               goto unlock;
-       }
 
        /* no need to change mh->mh_arch_id
         * mdt_hsm_get_md_hsm() got it from disk and it is still valid
@@ -1181,9 +1200,11 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                        *status = ARS_SUCCEED;
                        break;
                default:
+                       /* retry only if the policy allows it, a retry was
+                        * requested, and the object is not in error/removed */
                        *status = (cdt->cdt_policy & CDT_NORETRY_ACTION ||
-                                  !(pgs->hpk_flags & HP_FLAG_RETRY) ?
-                                  ARS_FAILED : ARS_WAITING);
+                                  !(pgs->hpk_flags & HP_FLAG_RETRY) ||
+                                  IS_ERR(obj)) ? ARS_FAILED : ARS_WAITING;
                        break;
                }
 
@@ -1282,23 +1303,20 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                                 mh.mh_flags & HS_DIRTY ? CLF_HSM_DIRTY : 0);
 
        /* unlock is done later, after layout lock management */
-       if (is_mh_changed)
+       if (is_mh_changed && !IS_ERR(obj))
                rc = mdt_hsm_attr_set(mti, obj, &mh);
 
-unlock:
        /* we give back layout lock only if restore was successful or
-        * if restore was canceled or if policy is to not retry
+        * if no retry will be attempted and if object is still alive,
         * in other cases we just unlock the object */
-       if (car->car_hai->hai_action == HSMA_RESTORE &&
-           (pgs->hpk_errval == 0 || pgs->hpk_errval == ECANCELED ||
-            cdt->cdt_policy & CDT_NORETRY_ACTION)) {
+       if (car->car_hai->hai_action == HSMA_RESTORE) {
                struct cdt_restore_handle       *crh;
 
                /* restore in data FID done, we swap the layouts
                 * only if restore is successful */
-               if (pgs->hpk_errval == 0) {
-                       rc = hsm_swap_layouts(mti, &car->car_hai->hai_fid,
-                                             &car->car_hai->hai_dfid, &mh);
+               if (pgs->hpk_errval == 0 && !IS_ERR(obj)) {
+                       rc = hsm_swap_layouts(mti, obj, &car->car_hai->hai_dfid,
+                                             &mh);
                        if (rc) {
                                if (cdt->cdt_policy & CDT_NORETRY_ACTION)
                                        *status = ARS_FAILED;
@@ -1309,17 +1327,25 @@ unlock:
                if (*status == ARS_WAITING)
                        GOTO(out, rc);
 
+               /* restore special case: the ChangeLog record must be created
+                * before giving back the layout lock, so that a concurrent
+                * file updater cannot post an out-of-order ChangeLog record */
+               mo_changelog(env, CL_HSM, cl_flags, mdt->mdt_child,
+                            &car->car_hai->hai_fid);
+               need_changelog = false;
+
                /* give back layout lock */
                mutex_lock(&cdt->cdt_restore_lock);
                crh = mdt_hsm_restore_hdl_find(cdt, &car->car_hai->hai_fid);
                if (crh != NULL)
                        list_del(&crh->crh_list);
                mutex_unlock(&cdt->cdt_restore_lock);
-               /* just give back layout lock, we keep
-                * the reference which is given back
-                * later with the lock for HSM flags */
-               if (!IS_ERR(obj) && crh != NULL)
-                       mdt_object_unlock(mti, obj, &crh->crh_lh, 1);
+               /* Just give back layout lock, we keep the reference
+                * which is given back later with the lock for HSM
+                * flags.
+                * XXX obj may be invalid so we do not pass it. */
+               if (crh != NULL)
+                       mdt_object_unlock(mti, NULL, &crh->crh_lh, 1);
 
                if (crh != NULL)
                        OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
@@ -1328,11 +1354,13 @@ unlock:
        GOTO(out, rc);
 
 out:
-       if (obj != NULL && !IS_ERR(obj)) {
-               mo_changelog(env, CL_HSM, cl_flags,
-                            mdt_object_child(obj));
+       /* always add a ChangeLog record */
+       if (need_changelog)
+               mo_changelog(env, CL_HSM, cl_flags, mdt->mdt_child,
+                            &car->car_hai->hai_fid);
+
+       if (!IS_ERR(obj))
                mdt_object_put(mti->mti_env, obj);
-       }
 
        RETURN(rc);
 }
@@ -1420,15 +1448,14 @@ int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
 
                rc = hsm_cdt_request_completed(mti, pgs, car, &status);
 
-               /* remove request from memory list */
-               mdt_cdt_remove_request(cdt, pgs->hpk_cookie);
-
-               CDEBUG(D_HSM, "Updating record: fid="DFID" cookie=%#llx"
-                             " action=%s status=%s\n", PFID(&pgs->hpk_fid),
-                      pgs->hpk_cookie,
+               CDEBUG(D_HSM, "%s record: fid="DFID" cookie=%#llx action=%s "
+                             "status=%s\n",
+                      update_record ? "Updating" : "Not updating",
+                      PFID(&pgs->hpk_fid), pgs->hpk_cookie,
                       hsm_copytool_action2name(car->car_hai->hai_action),
                       agent_req_status2name(status));
 
+               /* update record first (LU-9075) */
                if (update_record) {
                        int rc1;
 
@@ -1444,6 +1471,10 @@ int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
                                       pgs->hpk_cookie);
                        rc = (rc != 0 ? rc : rc1);
                }
+
+               /* then remove request from memory list (LU-9075) */
+               mdt_cdt_remove_request(cdt, pgs->hpk_cookie);
+
                /* ct has completed a request, so a slot is available, wakeup
                 * cdt to find new work */
                mdt_hsm_cdt_wakeup(mdt);
@@ -1509,6 +1540,8 @@ static int mdt_cancel_all_cb(const struct lu_env *env,
  */
 static int hsm_cancel_all_actions(struct mdt_device *mdt)
 {
+       struct lu_env                    env;
+       struct lu_context                session;
        struct mdt_thread_info          *mti;
        struct coordinator              *cdt = &mdt->mdt_coordinator;
        struct cdt_agent_req            *car;
@@ -1519,8 +1552,25 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
        enum cdt_states                  save_state;
        ENTRY;
 
-       /* retrieve coordinator context */
-       mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
+       rc = lu_env_init(&env, LCT_MD_THREAD);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* for mdt_ucred(), lu_ucred stored in lu_ucred_key */
+       rc = lu_context_init(&session, LCT_SERVER_SESSION);
+       if (rc < 0)
+               GOTO(out_env, rc);
+
+       lu_context_enter(&session);
+       env.le_ses = &session;
+
+       mti = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
+       LASSERT(mti != NULL);
+
+       mti->mti_env = &env;
+       mti->mti_mdt = mdt;
+
+       hsm_init_ucred(mdt_ucred(mti));
 
        /* disable coordinator */
        save_state = cdt->cdt_state;
@@ -1528,7 +1578,7 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
 
        /* send cancel to all running requests */
        down_read(&cdt->cdt_request_lock);
-       list_for_each_entry(car, &cdt->cdt_requests, car_request_list) {
+       list_for_each_entry(car, &cdt->cdt_request_list, car_request_list) {
                mdt_cdt_get_request(car);
                /* request is not yet removed from list, it will be done
                 * when copytool will return progress
@@ -1556,7 +1606,7 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
                        if (hal == NULL) {
                                mdt_cdt_put_request(car);
                                up_read(&cdt->cdt_request_lock);
-                               GOTO(out, rc = -ENOMEM);
+                               GOTO(out_cdt_state, rc = -ENOMEM);
                        }
                }
 
@@ -1596,9 +1646,13 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
 
        rc = cdt_llog_process(mti->mti_env, mti->mti_mdt,
                              mdt_cancel_all_cb, &hcad);
-out:
+out_cdt_state:
        /* enable coordinator */
        cdt->cdt_state = save_state;
+       lu_context_exit(&session);
+       lu_context_fini(&session);
+out_env:
+       lu_env_fini(&env);
 
        RETURN(rc);
 }
@@ -1712,7 +1766,7 @@ static void hsm_policy_bit2str(struct seq_file *m, const __u64 mask,
        }
        /* remove last ' ' */
        m->count--;
-       seq_putc(m, '\0');
+       seq_putc(m, '\n');
 }
 
 /* methods to read/write HSM policy flags */
@@ -1900,6 +1954,7 @@ mdt_hsm_cdt_control_seq_write(struct file *file, const char __user *buffer,
                        rc = -EALREADY;
                } else {
                        cdt->cdt_state = CDT_STOPPING;
+                       mdt_hsm_cdt_wakeup(mdt);
                }
        } else if (strcmp(kernbuf, CDT_DISABLE_CMD) == 0) {
                if ((cdt->cdt_state == CDT_STOPPING) ||