From fe5706e0c19f96e4f821790004f05ab265002e9d Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Tue, 12 Jul 2022 20:15:36 +0300 Subject: [PATCH] LU-16235 hsm: check CDT state before adding actions llog Don't allow HSM requests to be added to the actions llog when cdt_state is in CDT_STOPPED/CDT_STOPPING as the CDT is unavailable, or in CDT_INIT as any HSM requests in the llog may not have been fully processed and so cdt_last_cookie may not have been set appropriately, otherwise a colliding cookie value can be reused in mdt_agent_record_add() and the assertions in cdt_agent_record_hash_add() can be triggered: "ASSERTION( carl0->carl_cat_idx == carl1->carl_cat_idx ) failed" "ASSERTION( carl0->carl_rec_idx == carl1->carl_rec_idx ) failed" Requests needed to implement the Remove Archive on Last Unlink (RAoLU) policy are allowed when the CDT is shutdown, as those are safe operations. They are also allowed during CDT initialization, even though this can lead to the assertions being triggered, as doing so maintains administrator expectations regarding file archives always being removed when the RAoLU policy is enabled. This could possibly be improved by e.g. failing when mdt_handle_last_unlink() is not able to add an HSM remove request, or saving the requests in an llog so they can be sent if the CDT is available later. For the same reason, the llog needs to be processed before setting cdt_state to CDT_RUNNING in the coordinator thread. Change-Id: I4b5f5ee22f74827b31d8ed5917a8fc16e35d1f16 Signed-off-by: Nikitas Angelinas HPE-bug-id: LUS-8231, LUS-11064 Fixes: e26d7cc3 ("LU-14399 hsm: process hsm_actions in coordinator") Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48842 Tested-by: jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Oleg Drokin Reviewed-by: Etienne AUJAMES --- lustre/include/uapi/linux/lustre/lustre_user.h | 5 +++++ lustre/mdt/mdt_coordinator.c | 9 +-------- lustre/mdt/mdt_hsm.c | 3 ++- lustre/mdt/mdt_hsm_cdt_actions.c | 15 ++++++++++++++- lustre/mdt/mdt_lib.c | 4 ++-- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index 9881854..8243370 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -2523,6 +2523,11 @@ static inline char *hai_dump_data_field(const struct hsm_action_item *hai, return buffer; } +enum hal_flags { + /* Register even when the CDT is shutdown or being initialized */ + HAL_CDT_FORCE = 1 << 1, +}; + /* Copytool action list */ #define HAL_VERSION 1 #define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index d49e65c..439e0cc 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -603,11 +603,8 @@ static int mdt_coordinator(void *data) obd_uuid2fsname(hsd.hsd_fsname, mdt_obd_name(mdt), sizeof(hsd.hsd_fsname)); - set_cdt_state(cdt, CDT_RUNNING); - - /* Inform mdt_hsm_cdt_start(). */ - wake_up(&cdt->cdt_waitq); cdt_start_pending_restore(mdt, cdt); + set_cdt_state(cdt, CDT_RUNNING); while (1) { int i; @@ -1214,10 +1211,6 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt) mdt_obd_name(mdt), rc); } else { cdt->cdt_task = task; - wait_event(cdt->cdt_waitq, - cdt->cdt_state != CDT_INIT); - CDEBUG(D_HSM, "%s: coordinator thread started\n", - mdt_obd_name(mdt)); rc = 0; } diff --git a/lustre/mdt/mdt_hsm.c b/lustre/mdt/mdt_hsm.c index 63b7c30..98f9e5d 100644 --- a/lustre/mdt/mdt_hsm.c +++ b/lustre/mdt/mdt_hsm.c @@ -572,7 +572,8 @@ int mdt_hsm_request(struct tgt_session_info *tsi) hal->hal_version = HAL_VERSION; hal->hal_archive_id = hr->hr_archive_id; - hal->hal_flags = hr->hr_flags; + hal->hal_flags = hr->hr_flags & ~HAL_CDT_FORCE; + obd_uuid2fsname(hal->hal_fsname, mdt_obd_name(info->mti_mdt), MTI_NAME_MAXLEN); diff --git a/lustre/mdt/mdt_hsm_cdt_actions.c b/lustre/mdt/mdt_hsm_cdt_actions.c index 57d1f24..f74f79e 100644 --- a/lustre/mdt/mdt_hsm_cdt_actions.c +++ b/lustre/mdt/mdt_hsm_cdt_actions.c @@ -284,8 +284,20 @@ int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt, if (lctxt == NULL || lctxt->loc_handle == NULL) GOTO(free, rc = -ENOENT); + /* Preserve lock order wrt hsm_cancel_all_actions() */ + mutex_lock(&cdt->cdt_state_lock); down_write(&cdt->cdt_llog_lock); + /* Need cdt_last_cookie to be set during CDT startup; allow RAoLU + * requests, even though this can trigger the assertions in + * cdt_agent_record_hash_add(). This could be improved e.g. by failing + * the unlink during CDT_INIT, or adding RAoLU requests in an llog and + * issuing them if the CDT is available later + */ + if ((cdt->cdt_state == CDT_STOPPED || cdt->cdt_state == CDT_STOPPING || + cdt->cdt_state == CDT_INIT) && !(flags & HAL_CDT_FORCE)) + GOTO(unavail, rc = -EAGAIN); + /* in case of cancel request, the cookie is already set to the * value of the request cookie to be cancelled * so we do not change it */ @@ -297,8 +309,9 @@ int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt, rc = llog_cat_add(env, lctxt->loc_handle, &larr->arr_hdr, NULL); if (rc > 0) rc = 0; - +unavail: up_write(&cdt->cdt_llog_lock); + mutex_unlock(&cdt->cdt_state_lock); llog_ctxt_put(lctxt); EXIT; diff --git a/lustre/mdt/mdt_lib.c b/lustre/mdt/mdt_lib.c index 47a338e..6c9932f 100644 --- a/lustre/mdt/mdt_lib.c +++ b/lustre/mdt/mdt_lib.c @@ -984,8 +984,8 @@ int mdt_handle_last_unlink(struct mdt_thread_info *info, struct mdt_object *mo, hai.hai_fid = *mdt_object_fid(mo); - rc = mdt_agent_record_add(info->mti_env, info->mti_mdt, archive_id, 0, - &hai); + rc = mdt_agent_record_add(info->mti_env, info->mti_mdt, archive_id, + HAL_CDT_FORCE, &hai); if (rc) CERROR("%s: unable to add HSM remove request for "DFID ": rc=%d\n", mdt_obd_name(info->mti_mdt), -- 1.8.3.1