Whamcloud - gitweb
LU-12206 mdt: mdt_init0 failure handling 24/34724/3
author Vladimir Saveliev <c17830@cray.com>
Fri, 19 Apr 2019 09:33:12 +0000 (12:33 +0300)
committer Oleg Drokin <green@whamcloud.com>
Sat, 25 May 2019 04:57:54 +0000 (04:57 +0000)
When mdt_init0 fails, it has to wait until the zombie workqueue has
destroyed all disconnected exports before mdt_device_alloc frees the
mdt_device. Otherwise, the zombie workqueue refers to the freed
mdt_device via:
  general protection fault: 0000 [#1] SMP
  ..
  Workqueue: obd_zombid obd_zombie_exp_cull [obdclass]
  ..
  [<ffffffffc08829c5>] tgt_client_free+0x1e5/0x3c0 [ptlrpc]
  [<ffffffffc0ec2327>] mdt_destroy_export+0x57/0x200 [mdt]
  [<ffffffffc05bf20e>] class_export_destroy+0xee/0x490 [obdclass]
  [<ffffffffc05bf5c5>] obd_zombie_exp_cull+0x15/0x20 [obdclass]
  [<ffffffff93ab1d2f>] process_one_work+0x17f/0x440

- mdt_init0
  call to target_recovery_fini is moved so that it is called on every
  failure after successful tgt_init.

  obd_zombie_barrier is to be called after
  target_recovery_fini->class_disconnect_exports

  obd->obd_fail is set so that mdt_export_cleanup->tgt_client_del does
  not clear the client's slot in last_rcvd if the server fails to start

- mdt_quota_init
  class_manual_cleanup already does class_detach, so a goto is added to
  avoid calling class_detach a second time

- qmt_device_init0
  start the qmt rebalance thread with the SVC_STARTING flag so that
  qmt_start_reba_thread waits until the thread has started.
  Otherwise, the qmt_device may get freed before the qmt rebalance
  thread is stopped

Tests for failures during mdt_init0 are added. Without this fix:
- conf-sanity.sh:test_5i leads to general protection fault
- conf-sanity.sh:test_5h causes
  rmmod: ERROR: Module mdt is in use

Cray-bug-id: LUS-2403
Signed-off-by: Vladimir Saveliev <c17830@cray.com>
Test-Parameters: trivial testlist=conf-sanity envdefinitions=ONLY=5
Change-Id: Ic9dc9e167f6c2e47a5f97e59b5bd26c5231c23ce
Reviewed-on: https://review.whamcloud.com/34724
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andrew Perepechko <c17827@cray.com>
Reviewed-by: Sergey Cheremencev <c17829@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/mdt/mdt_handler.c
lustre/quota/qmt_dev.c
lustre/quota/qmt_lock.c
lustre/tests/conf-sanity.sh

index d1d825a..be2b3bb 100644 (file)
@@ -511,6 +511,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_QUOTA_EDQUOT            0xA02
 #define OBD_FAIL_QUOTA_DELAY_REINT       0xA03
 #define OBD_FAIL_QUOTA_RECOVERABLE_ERR   0xA04
+#define OBD_FAIL_QUOTA_INIT              0xA05
 
 #define OBD_FAIL_LPROC_REMOVE            0xB00
 
index 6055b0a..33d9847 100644 (file)
@@ -4787,9 +4787,12 @@ static int mdt_quota_init(const struct lu_env *env, struct mdt_device *mdt,
        mdt->mdt_qmt_dev = obd->obd_lu_dev;
 
        /* configure local quota objects */
-       rc = mdt->mdt_qmt_dev->ld_ops->ldo_prepare(env,
-                                                  &mdt->mdt_lu_dev,
-                                                  mdt->mdt_qmt_dev);
+       if (OBD_FAIL_CHECK(OBD_FAIL_QUOTA_INIT))
+               rc = -EBADF;
+       else
+               rc = mdt->mdt_qmt_dev->ld_ops->ldo_prepare(env,
+                                                          &mdt->mdt_lu_dev,
+                                                          mdt->mdt_qmt_dev);
        if (rc)
                GOTO(class_cleanup, rc);
 
@@ -4809,6 +4812,7 @@ class_cleanup:
        if (rc) {
                class_manual_cleanup(obd);
                mdt->mdt_qmt_dev = NULL;
+               GOTO(lcfg_cleanup, rc);
        }
 class_detach:
        if (rc)
@@ -5303,7 +5307,6 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
 err_procfs:
        mdt_procfs_fini(m);
 err_recovery:
-       target_recovery_fini(obd);
        upcall_cache_cleanup(m->mdt_identity_cache);
        m->mdt_identity_cache = NULL;
 err_free_hsm:
@@ -5314,6 +5317,11 @@ err_los_fini:
 err_fs_cleanup:
        mdt_fs_cleanup(env, m);
 err_tgt:
+       /* keep recoverable clients */
+       obd->obd_fail = 1;
+       target_recovery_fini(obd);
+       obd_exports_barrier(obd);
+       obd_zombie_barrier();
        tgt_fini(env, &m->mdt_lut);
 err_free_ns:
        ldlm_namespace_free(m->mdt_namespace, NULL, 0);
index 25684fd..fe65065 100644 (file)
@@ -107,7 +107,8 @@ static struct lu_device *qmt_device_fini(const struct lu_env *env,
        }
 
        /* stop rebalance thread */
-       qmt_stop_reba_thread(qmt);
+       if (!qmt->qmt_child->dd_rdonly)
+               qmt_stop_reba_thread(qmt);
 
        /* disconnect from OSD */
        if (qmt->qmt_child_exp != NULL) {
@@ -240,7 +241,7 @@ static int qmt_device_init0(const struct lu_env *env, struct qmt_device *qmt,
                GOTO(out, rc);
 
        /* set up and start rebalance thread */
-       thread_set_flags(&qmt->qmt_reba_thread, SVC_STOPPED);
+       thread_set_flags(&qmt->qmt_reba_thread, SVC_STARTING);
        init_waitqueue_head(&qmt->qmt_reba_thread.t_ctl_waitq);
        INIT_LIST_HEAD(&qmt->qmt_reba_list);
        spin_lock_init(&qmt->qmt_reba_lock);
index 9f68426..cbb4a82 100644 (file)
@@ -804,12 +804,15 @@ static int qmt_reba_thread(void *arg)
        ENTRY;
 
        OBD_ALLOC_PTR(env);
-       if (env == NULL)
+       if (env == NULL) {
+               thread_set_flags(thread, SVC_STOPPED);
                RETURN(-ENOMEM);
+       }
 
        rc = lu_env_init(env, LCT_MD_THREAD);
        if (rc) {
                CERROR("%s: failed to init env.", qmt->qmt_svname);
+               thread_set_flags(thread, SVC_STOPPED);
                OBD_FREE_PTR(env);
                RETURN(rc);
        }
index 613c9e1..7219536 100644 (file)
@@ -519,6 +519,32 @@ test_5g() {
 }
 run_test 5g "handle missing debugfs"
 
+test_5h() {
+       setup
+
+       stop mds1
+       #define OBD_FAIL_MDS_FS_SETUP            0x135
+       do_facet mds1 "$LCTL set_param fail_loc=0x80000135"
+       start_mdt 1 && error "start mdt should fail"
+       start_mdt 1 || error "start mdt failed"
+       client_up || error "client_up failed"
+       cleanup
+}
+run_test 5h "start mdt failure at mdt_fs_setup()"
+
+test_5i() {
+       setup
+
+       stop mds1
+       #define OBD_FAIL_QUOTA_INIT              0xA05
+       do_facet mds1 "$LCTL set_param fail_loc=0x80000A05"
+       start_mdt 1 && error "start mdt should fail"
+       start_mdt 1 || error "start mdt failed"
+       client_up || error "client_up failed"
+       cleanup
+}
+run_test 5i "start mdt failure at mdt_quota_init()"
+
 test_6() {
        setup
        manual_umount_client