From c13ddec8e1cd3a63c16e08f28749771200b92f1b Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Thu, 21 Apr 2016 15:56:28 +0200 Subject: [PATCH] LU-8010 mdt: fix orphan layout_lock cases for restore Prior to this patch, the layout was not unlocked when a restore failed before being sent to the CT, leading to a situation where other requestors hang and where an orphan restore_handle is kept on the CDT's list of registered restore actions. The only way to clear the situation was then to stop the CDT. This situation could at least occur if a restore was canceled but the CT does not handle the cancel operation, allowing the restore to complete while new restore requests are registered in the meantime and then fail due to an incompatible (no longer released) file state. Also fix similar deadlock cases where the layout lock was taken for previously started restore requests upon CDT restart, by forcing their replay/restart. This should also strengthen the overall HSM recovery process. Signed-off-by: Bruno Faccini Change-Id: Ib1ba9156793a230d256ff80d74372813f10b0321 Reviewed-on: http://review.whamcloud.com/19710 Tested-by: Jenkins Reviewed-by: John L. 
Hammond Tested-by: Maloo Reviewed-by: jacques-Charles Lafoucriere Reviewed-by: Aurelien Degremont Reviewed-by: Oleg Drokin --- lustre/mdt/mdt_coordinator.c | 15 +++++++++++++-- lustre/mdt/mdt_hsm_cdt_agent.c | 27 +++++++++++++++++++++++++++ lustre/mdt/mdt_internal.h | 2 ++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index 970f6b8..c02f6a1 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -621,7 +621,7 @@ out: * \retval cdt_restore_handle found * \retval NULL not found */ -static struct cdt_restore_handle *hsm_restore_hdl_find(struct coordinator *cdt, +struct cdt_restore_handle *mdt_hsm_restore_hdl_find(struct coordinator *cdt, const struct lu_fid *fid) { struct cdt_restore_handle *crh; @@ -682,6 +682,17 @@ static int hsm_restore_cb(const struct lu_env *env, /* restore request not in a final state */ + /* force replay of restore requests left in started state from previous + * CDT context, to be canceled later if finally found to be incompatible + * when being re-started */ + if (larr->arr_status == ARS_STARTED) { + larr->arr_status = ARS_WAITING; + larr->arr_req_change = cfs_time_current_sec(); + rc = llog_write(env, llh, hdr, hdr->lrh_index); + if (rc != 0) + GOTO(out, rc); + } + OBD_SLAB_ALLOC_PTR(crh, mdt_hsm_cdt_kmem); if (crh == NULL) RETURN(-ENOMEM); @@ -1335,7 +1346,7 @@ unlock: /* give back layout lock */ mutex_lock(&cdt->cdt_restore_lock); - crh = hsm_restore_hdl_find(cdt, &car->car_hai->hai_fid); + crh = mdt_hsm_restore_hdl_find(cdt, &car->car_hai->hai_fid); if (crh != NULL) list_del(&crh->crh_list); mutex_unlock(&cdt->cdt_restore_lock); diff --git a/lustre/mdt/mdt_hsm_cdt_agent.c b/lustre/mdt/mdt_hsm_cdt_agent.c index 895feb7..c97be71 100644 --- a/lustre/mdt/mdt_hsm_cdt_agent.c +++ b/lustre/mdt/mdt_hsm_cdt_agent.c @@ -406,6 +406,33 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti, hai->hai_cookie, rc); GOTO(out_buf, rc); } + + /* if restore 
and record status updated, give + * back granted layout lock */ + if (hai->hai_action == HSMA_RESTORE) { + struct cdt_restore_handle *crh = NULL; + struct mdt_object *obj = NULL; + + mutex_lock(&cdt->cdt_restore_lock); + crh = mdt_hsm_restore_hdl_find(cdt, + &hai->hai_fid); + if (crh != NULL) + list_del(&crh->crh_list); + mutex_unlock(&cdt->cdt_restore_lock); + obj = mdt_object_find(mti->mti_env, + mti->mti_mdt, + &hai->hai_fid); + if (!IS_ERR(obj) && crh != NULL) + mdt_object_unlock(mti, obj, + &crh->crh_lh, + 1); + if (crh != NULL) + OBD_SLAB_FREE_PTR(crh, + mdt_hsm_cdt_kmem); + if (!IS_ERR(obj)) + mdt_object_put(mti->mti_env, + obj); + } } } } diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 8829027..dc5d990 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -856,6 +856,8 @@ int mdt_cdt_remove_request(struct coordinator *cdt, __u64 cookie); /* mdt/mdt_coordinator.c */ void mdt_hsm_dump_hal(int level, const char *prefix, struct hsm_action_list *hal); +struct cdt_restore_handle *mdt_hsm_restore_hdl_find(struct coordinator *cdt, + const struct lu_fid *fid); /* coordinator management */ int mdt_hsm_cdt_init(struct mdt_device *mdt); int mdt_hsm_cdt_start(struct mdt_device *mdt); -- 1.8.3.1