Whamcloud - gitweb
LU-8010 mdt: fix orphan layout_lock cases for restore 10/19710/7
author Bruno Faccini <bruno.faccini@intel.com>
Thu, 21 Apr 2016 13:56:28 +0000 (15:56 +0200)
committer Oleg Drokin <oleg.drokin@intel.com>
Mon, 16 May 2016 16:47:41 +0000 (16:47 +0000)
Prior to this patch, the layout lock was not released when a restore
failed before being sent to the CT, leading to a situation where
other requestors hang and an orphan restore_handle is
kept on the CDT's list of registered restore actions.
The only way to clear the situation was then to stop the CDT.
This situation could occur, at least, when a restore was canceled
but the CT does not handle the cancel operation: the restore is
allowed to complete, while new restore requests are registered
in the meantime and then fail due to the
incompatible (no longer released) file state.

Also fix similar deadlock cases where the layout lock was taken for
previously started restore requests upon CDT restart, by forcing
their replay/restart. This should also strengthen the overall HSM
recovery process.

Signed-off-by: Bruno Faccini <bruno.faccini@intel.com>
Change-Id: Ib1ba9156793a230d256ff80d74372813f10b0321
Reviewed-on: http://review.whamcloud.com/19710
Tested-by: Jenkins
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: jacques-Charles Lafoucriere <jacques-charles.lafoucriere@cea.fr>
Reviewed-by: Aurelien Degremont <aurelien.degremont@cea.fr>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/mdt/mdt_coordinator.c
lustre/mdt/mdt_hsm_cdt_agent.c
lustre/mdt/mdt_internal.h

index 970f6b8..c02f6a1 100644 (file)
@@ -621,7 +621,7 @@ out:
  * \retval cdt_restore_handle found
  * \retval NULL not found
  */
-static struct cdt_restore_handle *hsm_restore_hdl_find(struct coordinator *cdt,
+struct cdt_restore_handle *mdt_hsm_restore_hdl_find(struct coordinator *cdt,
                                                       const struct lu_fid *fid)
 {
        struct cdt_restore_handle       *crh;
@@ -682,6 +682,17 @@ static int hsm_restore_cb(const struct lu_env *env,
 
        /* restore request not in a final state */
 
+       /* force replay of restore requests left in started state from previous
+        * CDT context, to be canceled later if finally found to be incompatible
+        * when being re-started */
+       if (larr->arr_status == ARS_STARTED) {
+               larr->arr_status = ARS_WAITING;
+               larr->arr_req_change = cfs_time_current_sec();
+               rc = llog_write(env, llh, hdr, hdr->lrh_index);
+               if (rc != 0)
+                       GOTO(out, rc);
+       }
+
        OBD_SLAB_ALLOC_PTR(crh, mdt_hsm_cdt_kmem);
        if (crh == NULL)
                RETURN(-ENOMEM);
@@ -1335,7 +1346,7 @@ unlock:
 
                /* give back layout lock */
                mutex_lock(&cdt->cdt_restore_lock);
-               crh = hsm_restore_hdl_find(cdt, &car->car_hai->hai_fid);
+               crh = mdt_hsm_restore_hdl_find(cdt, &car->car_hai->hai_fid);
                if (crh != NULL)
                        list_del(&crh->crh_list);
                mutex_unlock(&cdt->cdt_restore_lock);
index 895feb7..c97be71 100644 (file)
@@ -406,6 +406,33 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
                                              hai->hai_cookie, rc);
                                        GOTO(out_buf, rc);
                                }
+
+                               /* if restore and record status updated, give
+                                * back granted layout lock */
+                               if (hai->hai_action == HSMA_RESTORE) {
+                                       struct cdt_restore_handle *crh = NULL;
+                                       struct mdt_object *obj = NULL;
+
+                                       mutex_lock(&cdt->cdt_restore_lock);
+                                       crh = mdt_hsm_restore_hdl_find(cdt,
+                                                               &hai->hai_fid);
+                                       if (crh != NULL)
+                                               list_del(&crh->crh_list);
+                                       mutex_unlock(&cdt->cdt_restore_lock);
+                                       obj = mdt_object_find(mti->mti_env,
+                                                             mti->mti_mdt,
+                                                             &hai->hai_fid);
+                                       if (!IS_ERR(obj) && crh != NULL)
+                                               mdt_object_unlock(mti, obj,
+                                                                 &crh->crh_lh,
+                                                                 1);
+                                       if (crh != NULL)
+                                               OBD_SLAB_FREE_PTR(crh,
+                                                       mdt_hsm_cdt_kmem);
+                                       if (!IS_ERR(obj))
+                                               mdt_object_put(mti->mti_env,
+                                                              obj);
+                               }
                        }
                }
        }
index 8829027..dc5d990 100644 (file)
@@ -856,6 +856,8 @@ int mdt_cdt_remove_request(struct coordinator *cdt, __u64 cookie);
 /* mdt/mdt_coordinator.c */
 void mdt_hsm_dump_hal(int level, const char *prefix,
                      struct hsm_action_list *hal);
+struct cdt_restore_handle *mdt_hsm_restore_hdl_find(struct coordinator *cdt,
+                                               const struct lu_fid *fid);
 /* coordinator management */
 int mdt_hsm_cdt_init(struct mdt_device *mdt);
 int mdt_hsm_cdt_start(struct mdt_device *mdt);