Whamcloud - gitweb
LU-11243 lod: fix assertion and hang upon lod_add_device failure 94/32994/7
author Wang Shilong <wshilong@ddn.com>
Mon, 10 Dec 2018 05:45:33 +0000 (13:45 +0800)
committer Oleg Drokin <green@whamcloud.com>
Fri, 8 Mar 2019 21:48:30 +0000 (21:48 +0000)
There are two problems:

See the following assertion:

    lod_add_device() lustre-OSTe42a-osc-MDT0000:
                     can't set up pool, failed with -12
    osp_disconnect() ASSERTION( imp != ((void *)0) ) failed:
    osp_disconnect() LBUG
    CPU: 1 PID: 10059 Comm: llog_process_th

The problem is that obd_disconnect() will clean up @imp and set it to NULL:
 ->osp_obd_disconnect
    ->class_manual_cleanup
       ->class_process_config
          ->class_cleanup
             ->obd_precleanup
                ->osp_device_fini
                   ->client_obd_cleanup

Afterwards, ldo_process_config() tries to access @imp again:
 ->ldo_process_config
    ->osp_shutdown
       ->osp_disconnect
          ->LASSERT(imp != NULL)

Another problem is that if we fail before obd_connect(),
the mount will hang:
 ->ldo_process_config
    ->osp_shutdown
       ->osp_disconnect
          ->ptlrpc_disconnect_import
             ->rc = l_wait_event(imp->imp_recovery_waitq,
                                 !ptlrpc_import_in_recovery(imp), &lwi);

Since connect was never called, the import state stays LUSTRE_IMP_NEW.
Fix this by checking properly whether we are in recovery: only consider
the import to be in recovery if it is in one of the following states:

 LUSTRE_IMP_CONNECTING = 4,
 LUSTRE_IMP_REPLAY     = 5,
 LUSTRE_IMP_REPLAY_LOCKS = 6,
 LUSTRE_IMP_REPLAY_WAIT  = 7,
 LUSTRE_IMP_RECOVER    = 8,

Change-Id: I2113b95a421bae7117f3057d5f0fdf78db95caa3
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Reviewed-on: https://review.whamcloud.com/32994
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Gu Zheng <gzheng@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lod/lod_lov.c
lustre/ptlrpc/recover.c

index c29e9ab..7434374 100644 (file)
@@ -220,6 +220,7 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod,
        struct obd_uuid         obd_uuid;
        bool                    for_ost;
        bool lock = false;
        struct obd_uuid         obd_uuid;
        bool                    for_ost;
        bool lock = false;
+       bool connected = false;
        ENTRY;
 
        CDEBUG(D_CONFIG, "osp:%s idx:%d gen:%d\n", osp, index, gen);
        ENTRY;
 
        CDEBUG(D_CONFIG, "osp:%s idx:%d gen:%d\n", osp, index, gen);
@@ -302,11 +303,12 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod,
                       obd->obd_name, osp, rc);
                GOTO(out_cleanup, rc);
        }
                       obd->obd_name, osp, rc);
                GOTO(out_cleanup, rc);
        }
+       connected = true;
 
        /* Allocate ost descriptor and fill it */
        OBD_ALLOC_PTR(tgt_desc);
        if (!tgt_desc)
 
        /* Allocate ost descriptor and fill it */
        OBD_ALLOC_PTR(tgt_desc);
        if (!tgt_desc)
-               GOTO(out_conn, rc = -ENOMEM);
+               GOTO(out_cleanup, rc = -ENOMEM);
 
        tgt_desc->ltd_tgt    = dt_dev;
        tgt_desc->ltd_exp    = exp;
 
        tgt_desc->ltd_tgt    = dt_dev;
        tgt_desc->ltd_exp    = exp;
@@ -426,8 +428,6 @@ out_mutex:
        }
 out_desc:
        OBD_FREE_PTR(tgt_desc);
        }
 out_desc:
        OBD_FREE_PTR(tgt_desc);
-out_conn:
-       obd_disconnect(exp);
 out_cleanup:
        /* XXX OSP needs us to send down LCFG_CLEANUP because it uses
         * objects from the MDT stack. See LU-7184. */
 out_cleanup:
        /* XXX OSP needs us to send down LCFG_CLEANUP because it uses
         * objects from the MDT stack. See LU-7184. */
@@ -437,6 +437,9 @@ out_cleanup:
        lcfg->lcfg_command = LCFG_CLEANUP;
        lu_dev->ld_ops->ldo_process_config(env, lu_dev, lcfg);
 
        lcfg->lcfg_command = LCFG_CLEANUP;
        lu_dev->ld_ops->ldo_process_config(env, lu_dev, lcfg);
 
+       if (connected)
+               obd_disconnect(exp);
+
        return rc;
 }
 
        return rc;
 }
 
index ab93c45..4d5f28b 100644 (file)
@@ -377,9 +377,8 @@ int ptlrpc_import_in_recovery(struct obd_import *imp)
        int in_recovery = 1;
 
        spin_lock(&imp->imp_lock);
        int in_recovery = 1;
 
        spin_lock(&imp->imp_lock);
-       if (imp->imp_state == LUSTRE_IMP_FULL ||
-           imp->imp_state == LUSTRE_IMP_CLOSED ||
-           imp->imp_state == LUSTRE_IMP_DISCON ||
+       if (imp->imp_state <= LUSTRE_IMP_DISCON ||
+           imp->imp_state >= LUSTRE_IMP_FULL ||
            imp->imp_obd->obd_no_recov)
                in_recovery = 0;
        spin_unlock(&imp->imp_lock);
            imp->imp_obd->obd_no_recov)
                in_recovery = 0;
        spin_unlock(&imp->imp_lock);