From d1b5146eda4fdaa77dd44bc2195435bda0f83a94 Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Fri, 19 Apr 2019 12:33:12 +0300 Subject: [PATCH] LU-12206 mdt: mdt_init0 failure handling When mdt_init0 fails, it has to wait until the zombie workqueue has destroyed all disconnected exports before mdt_device_alloc frees the mdt_device. Otherwise, the zombie workqueue refers to the freed mdt_device via: general protection fault: 0000 [#1] SMP .. Workqueue: obd_zombid obd_zombie_exp_cull [obdclass] .. [] tgt_client_free+0x1e5/0x3c0 [ptlrpc] [] mdt_destroy_export+0x57/0x200 [mdt] [] class_export_destroy+0xee/0x490 [obdclass] [] obd_zombie_exp_cull+0x15/0x20 [obdclass] [] process_one_work+0x17f/0x440 - mdt_init0: the call to target_recovery_fini is moved so that it is called on every failure after a successful tgt_init. obd_zombie_barrier is to be called after target_recovery_fini->class_disconnect_exports. obd->obd_fail is set so that mdt_export_cleanup->tgt_client_del does not clear the client's slot in last_rcvd in case of server start failure - mdt_quota_init: class_manual_cleanup does class_detach, so a goto is added to avoid a repeated call to class_detach - qmt_device_init0: start the qmt rebalance thread with the SVC_STARTING flag so that qmt_start_reba_thread waits until the thread has started. 
Otherwise, qmt_device may get freed before qmt rebalance thread is stopped Tests for failures during mdt_init0 are added - conf-sanity.sh:test_5i leads to general protection fault - conf-sanity.sh:test_5h causes rmmod: ERROR: Module mdt is in use Cray-bug-id: LUS-2403 Signed-off-by: Vladimir Saveliev Test-Parameters: trivial testlist=conf-sanity envdefinitions=ONLY=5 Change-Id: Ic9dc9e167f6c2e47a5f97e59b5bd26c5231c23ce Reviewed-on: https://review.whamcloud.com/34724 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andrew Perepechko Reviewed-by: Sergey Cheremencev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/mdt/mdt_handler.c | 16 ++++++++++++---- lustre/quota/qmt_dev.c | 5 +++-- lustre/quota/qmt_lock.c | 5 ++++- lustre/tests/conf-sanity.sh | 26 ++++++++++++++++++++++++++ 5 files changed, 46 insertions(+), 7 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index d1d825a..be2b3bb 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -511,6 +511,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_QUOTA_EDQUOT 0xA02 #define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 #define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 +#define OBD_FAIL_QUOTA_INIT 0xA05 #define OBD_FAIL_LPROC_REMOVE 0xB00 diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 6055b0a..33d9847 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -4787,9 +4787,12 @@ static int mdt_quota_init(const struct lu_env *env, struct mdt_device *mdt, mdt->mdt_qmt_dev = obd->obd_lu_dev; /* configure local quota objects */ - rc = mdt->mdt_qmt_dev->ld_ops->ldo_prepare(env, - &mdt->mdt_lu_dev, - mdt->mdt_qmt_dev); + if (OBD_FAIL_CHECK(OBD_FAIL_QUOTA_INIT)) + rc = -EBADF; + else + rc = mdt->mdt_qmt_dev->ld_ops->ldo_prepare(env, + &mdt->mdt_lu_dev, + mdt->mdt_qmt_dev); if (rc) GOTO(class_cleanup, rc); @@ -4809,6 +4812,7 @@ class_cleanup: if (rc) { class_manual_cleanup(obd); mdt->mdt_qmt_dev = NULL; + 
GOTO(lcfg_cleanup, rc); } class_detach: if (rc) @@ -5303,7 +5307,6 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, err_procfs: mdt_procfs_fini(m); err_recovery: - target_recovery_fini(obd); upcall_cache_cleanup(m->mdt_identity_cache); m->mdt_identity_cache = NULL; err_free_hsm: @@ -5314,6 +5317,11 @@ err_los_fini: err_fs_cleanup: mdt_fs_cleanup(env, m); err_tgt: + /* keep recoverable clients */ + obd->obd_fail = 1; + target_recovery_fini(obd); + obd_exports_barrier(obd); + obd_zombie_barrier(); tgt_fini(env, &m->mdt_lut); err_free_ns: ldlm_namespace_free(m->mdt_namespace, NULL, 0); diff --git a/lustre/quota/qmt_dev.c b/lustre/quota/qmt_dev.c index 25684fd..fe65065 100644 --- a/lustre/quota/qmt_dev.c +++ b/lustre/quota/qmt_dev.c @@ -107,7 +107,8 @@ static struct lu_device *qmt_device_fini(const struct lu_env *env, } /* stop rebalance thread */ - qmt_stop_reba_thread(qmt); + if (!qmt->qmt_child->dd_rdonly) + qmt_stop_reba_thread(qmt); /* disconnect from OSD */ if (qmt->qmt_child_exp != NULL) { @@ -240,7 +241,7 @@ static int qmt_device_init0(const struct lu_env *env, struct qmt_device *qmt, GOTO(out, rc); /* set up and start rebalance thread */ - thread_set_flags(&qmt->qmt_reba_thread, SVC_STOPPED); + thread_set_flags(&qmt->qmt_reba_thread, SVC_STARTING); init_waitqueue_head(&qmt->qmt_reba_thread.t_ctl_waitq); INIT_LIST_HEAD(&qmt->qmt_reba_list); spin_lock_init(&qmt->qmt_reba_lock); diff --git a/lustre/quota/qmt_lock.c b/lustre/quota/qmt_lock.c index 9f68426..cbb4a82 100644 --- a/lustre/quota/qmt_lock.c +++ b/lustre/quota/qmt_lock.c @@ -804,12 +804,15 @@ static int qmt_reba_thread(void *arg) ENTRY; OBD_ALLOC_PTR(env); - if (env == NULL) + if (env == NULL) { + thread_set_flags(thread, SVC_STOPPED); RETURN(-ENOMEM); + } rc = lu_env_init(env, LCT_MD_THREAD); if (rc) { CERROR("%s: failed to init env.", qmt->qmt_svname); + thread_set_flags(thread, SVC_STOPPED); OBD_FREE_PTR(env); RETURN(rc); } diff --git a/lustre/tests/conf-sanity.sh 
b/lustre/tests/conf-sanity.sh index 613c9e1..7219536 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -519,6 +519,32 @@ test_5g() { } run_test 5g "handle missing debugfs" +test_5h() { + setup + + stop mds1 + #define OBD_FAIL_MDS_FS_SETUP 0x135 + do_facet mds1 "$LCTL set_param fail_loc=0x80000135" + start_mdt 1 && error "start mdt should fail" + start_mdt 1 || error "start mdt failed" + client_up || error "client_up failed" + cleanup +} +run_test 5h "start mdt failure at mdt_fs_setup()" + +test_5i() { + setup + + stop mds1 + #define OBD_FAIL_QUOTA_INIT 0xA05 + do_facet mds1 "$LCTL set_param fail_loc=0x80000A05" + start_mdt 1 && error "start mdt should fail" + start_mdt 1 || error "start mdt failed" + client_up || error "client_up failed" + cleanup +} +run_test 5i "start mdt failure at mdt_quota_init()" + test_6() { setup manual_umount_client -- 1.8.3.1