From ffedcbae21f7aefe5c2258a94b36fe286f46182c Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Sat, 20 Apr 2024 18:02:54 -0400 Subject: [PATCH] LU-17809 osp: make disconnect asynchronous An MDT can have many OSP devices. During umount, the disconnect requests can time out one after another (cascading timeouts), which can lead to an unpredictably long umount time. This patch adds the ability to disconnect OSP devices in parallel. During LCFG_PRE_CLEANUP osp_disconnect() sends the disconnect requests, and osp_shutdown() waits for them to complete, so the cascading timeouts are reduced to a single request wait. Don't drop the obd_force flag set by upper layers. Add replay-single test 201, which simulates delays of OSP disconnects. With serialized disconnects these delays lead to a high cumulative umount time. HPE-bug-id: LUS-12251 Signed-off-by: Alexander Boyko Change-Id: Id788b22c494147bdc7f0d36968629e7b7f660e01 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54995 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexey Lyashkov Reviewed-by: Oleg Drokin Reviewed-by: Alex Zhuravlev --- lustre/include/lustre_net.h | 2 + lustre/osp/osp_dev.c | 33 ++++++--- lustre/osp/osp_internal.h | 5 +- lustre/ptlrpc/import.c | 167 +++++++++++++++++++++++++++++++++--------- lustre/tests/replay-single.sh | 29 ++++++++ 5 files changed, 191 insertions(+), 45 deletions(-) diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index a0b8b7d..f97d0d9 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -2251,6 +2251,8 @@ void ptlrpc_watchdog_delete(struct delayed_work *work); int ptlrpc_connect_import(struct obd_import *imp); int ptlrpc_connect_import_locked(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import_async(struct obd_import *imp, int noclose, + struct completion *a, int *r); int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); int ptlrpc_disconnect_and_idle_import(struct obd_import *imp); int ptlrpc_import_recovery_state_machine(struct 
obd_import *imp); diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index 702e030..2906366 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -467,13 +467,17 @@ static int osp_disconnect(struct osp_device *d) struct obd_device *obd = d->opd_obd; struct obd_import *imp; int rc = 0; + ENTRY; imp = obd->u.cli.cl_import; + CDEBUG(D_INFO, "%s: disconnecting import %px\n", obd->obd_name, + imp); /* Mark import deactivated now, so we don't try to reconnect if any * of the cleanup RPCs fails (e.g. ldlm cancel, etc). We don't * fully deactivate the import, or that would drop all requests. */ LASSERT(imp != NULL); + spin_lock(&imp->imp_lock); imp->imp_deactive = 1; spin_unlock(&imp->imp_lock); @@ -487,15 +491,13 @@ static int osp_disconnect(struct osp_device *d) /* Send disconnect on healthy import, do force disconnect otherwise */ spin_lock(&imp->imp_lock); - imp->imp_obd->obd_force = imp->imp_state != LUSTRE_IMP_FULL; + imp->imp_obd->obd_force |= imp->imp_state != LUSTRE_IMP_FULL; spin_unlock(&imp->imp_lock); - rc = ptlrpc_disconnect_import(imp, 0); - if (rc != 0) - CERROR("%s: can't disconnect: rc = %d\n", obd->obd_name, rc); - - ptlrpc_invalidate_import(imp); - + init_completion(&d->opd_disconnect_cmplt); + d->opd_disconnecting = 1; + rc = ptlrpc_disconnect_import_async(imp, 0, &d->opd_disconnect_cmplt, + &d->opd_disconnect_res); RETURN(rc); } @@ -615,13 +617,26 @@ static void osp_update_fini(const struct lu_env *env, struct osp_device *osp) */ static int osp_shutdown(const struct lu_env *env, struct osp_device *d) { - int rc = 0; + struct obd_device *obd = d->opd_obd; + struct obd_import *imp = obd->u.cli.cl_import; + int rc = 0; ENTRY; LASSERT(env); - rc = osp_disconnect(d); + /* Shutdown could be called during fail initialization, LCFG_CLEANUP + * without LCFG_PRE_CLEANUP phase, like + * lod_add_device()->obd_connect() failure. 
+ */ + if (d->opd_disconnecting) { + wait_for_completion(&d->opd_disconnect_cmplt); + rc = d->opd_disconnect_res; + if (rc != 0) + CERROR("%s: can't disconnect: rc = %d\n", + obd->obd_name, rc); + } + ptlrpc_invalidate_import(imp); osp_statfs_fini(d); if (!d->opd_connect_mdt) { diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index cb8a143..4ed3ac1 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -175,6 +175,8 @@ struct osp_device { struct obd_device *opd_obd; struct obd_export *opd_exp; struct obd_connect_data *opd_connect_data; + struct completion opd_disconnect_cmplt; + int opd_disconnect_res; /* connection status. */ unsigned int opd_new_connection:1, @@ -182,7 +184,8 @@ struct osp_device { opd_imp_connected:1, opd_imp_active:1, opd_imp_seen_connected:1, - opd_connect_mdt:1; + opd_connect_mdt:1, + opd_disconnecting:1; /* whether local recovery is completed: * reported via ->ldo_recovery_complete() */ diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 02a8adc..1526c81 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -1737,22 +1737,146 @@ static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) RETURN(req); } -int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +struct disconnect_async_arg { + struct completion *daa_completion; + int *daa_result; + int daa_noclose; +}; + +/* + * Unlock import. 
+ */ +static void ptlrpc_disconnect_import_end(struct obd_import *imp, int noclose) +{ + assert_spin_locked(&imp->imp_lock); + + if (noclose) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + else + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + if (!noclose) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); +} + +static int ptlrpc_disconnect_interpet(const struct lu_env *env, + struct ptlrpc_request *req, void *args, + int rc) +{ + struct obd_import *imp = req->rq_import; + struct disconnect_async_arg *daa = args; + + spin_lock(&imp->imp_lock); + ptlrpc_disconnect_import_end(imp, daa->daa_noclose); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + + if (daa->daa_result) + *daa->daa_result = rc; + + complete(daa->daa_completion); + + return 0; +} + +/** + * Sends disconnect request and set import state DISCONNECT/CLOSED. + * Produces events IMP_EVENT_DISCON[IMP_EVENT_INACTIVE]. + * Signals when it is complete. 
+ * + * \param[in] imp import + * \param[in] noclose final close import + * \param[in] cmpl completion to signal disconnect is finished + * \param[out] out_res result of disconnection + * + * \retval 0 on success + * \retval negative negated errno on error + **/ +int ptlrpc_disconnect_import_async(struct obd_import *imp, int noclose, + struct completion *cmpl, int *out_res) { struct ptlrpc_request *req; int rc = 0; - + struct disconnect_async_arg *daa; ENTRY; - if (imp->imp_obd->obd_force) - GOTO(set_state, rc); + spin_lock(&imp->imp_lock); + /* probably the import has been disconnected already being idle */ + if (imp->imp_state != LUSTRE_IMP_FULL || imp->imp_obd->obd_force) { + + ptlrpc_disconnect_import_end(imp, noclose); + + if (out_res) + *out_res = 0; + complete(cmpl); + + RETURN(0); + } + spin_unlock(&imp->imp_lock); + + req = ptlrpc_disconnect_prep_req(imp); + + spin_lock(&imp->imp_lock); + + if (IS_ERR(req) || imp->imp_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_force) { + + if (!IS_ERR(req)) + ptlrpc_req_put_with_imp_lock(req); + + ptlrpc_disconnect_import_end(imp, noclose); + rc = IS_ERR(req) ? PTR_ERR(req) : 0; + + if (out_res) + *out_res = rc; + complete(cmpl); + + RETURN(rc); + } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + spin_unlock(&imp->imp_lock); + + req->rq_interpret_reply = ptlrpc_disconnect_interpet; + daa = ptlrpc_req_async_args(daa, req); + daa->daa_completion = cmpl; + daa->daa_result = out_res; + daa->daa_noclose = noclose; + + ptlrpcd_add_req(req); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_disconnect_import_async); + +/** + * Sends disconnect request and set import state DISCONNECT/CLOSED. + * Produces events IMP_EVENT_DISCON[IMP_EVENT_INACTIVE]. 
+ * + * \param[in] imp import + * \param[in] noclose final close import + * + * \retval 0 on success + * \retval negative negated errno on error + **/ +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + DECLARE_COMPLETION_ONSTACK(cmpl); + int rc; + ENTRY; /* probably the import has been disconnected already being idle */ spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_IDLE) - GOTO(out, rc); + if (imp->imp_state == LUSTRE_IMP_IDLE || imp->imp_obd->obd_force) { + ptlrpc_disconnect_import_end(imp, noclose); + RETURN(0); + } spin_unlock(&imp->imp_lock); + if (ptlrpc_import_in_recovery(imp)) { long timeout_jiffies; time64_t timeout; @@ -1781,37 +1905,10 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) rc = -EINTR; } - req = ptlrpc_disconnect_prep_req(imp); - if (IS_ERR(req)) - GOTO(set_state, rc = PTR_ERR(req)); + rc = ptlrpc_disconnect_import_async(imp, noclose, &cmpl, &rc); - spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_FULL) { - ptlrpc_req_put_with_imp_lock(req); - GOTO(out, rc); - } - import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); - spin_unlock(&imp->imp_lock); - - rc = ptlrpc_queue_wait(req); - ptlrpc_req_put(req); + wait_for_completion(&cmpl); -set_state: - spin_lock(&imp->imp_lock); -out: - if (noclose) - import_set_state_nolock(imp, LUSTRE_IMP_DISCON); - else - import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); - memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); - spin_unlock(&imp->imp_lock); - - obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); - if (!noclose) - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); - - if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) - rc = 0; RETURN(rc); } EXPORT_SYMBOL(ptlrpc_disconnect_import); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index a9f4200..5fefc33 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -5309,6 +5309,35 @@ test_200() { } run_test 
200 "Dropping one OBD_PING should not cause disconnect" +test_201() { + (( MDS1_VERSION >= $(version_code 2.15.63) )) || + skip "MDS < 2.15.63 doesn't support parallel disconnect" + (( MDSCOUNT >= 2 )) || skip_env "needs >= 2 MDTs" + (( OSTCOUNT >= 2 )) || skip_env "needs >= 2 OSTs" + + # delay DISCONNECT for 8 seconds, on all OSTs and MDTs +#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 + do_nodes $(comma_list $(mdts_nodes)) "$LCTL set_param \ + fail_loc=0x245 fail_val=8" + do_nodes $(comma_list $(osts_nodes)) "$LCTL set_param \ + fail_loc=0x245 fail_val=8" + + local start_time=$SECONDS + + stop mds2 + + local duration=$((SECONDS - start_time)) + + start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || + error "mount mds2 failed" + echo "Umount took $duration seconds" + + #Valid timeout is 8 for MDTs + 8 for OSTs + 4 some for other umount + (( duration < 20 )) || error "Cascading timeouts on disconnect" +} +run_test 201 "MDT umount cascading disconnects timeouts" + + complete_test $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1