Whamcloud - gitweb
LU-11814 obd: Crashed while mount in parallel 74/37974/10
author Yang Sheng <ys@whamcloud.com>
Tue, 10 Mar 2020 13:53:26 +0000 (21:53 +0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 14 Apr 2020 08:09:45 +0000 (08:09 +0000)
Ensure that obd_type initialization has finished before obd_setup starts.
Also include a test case that reproduces the failure.

 (osd_handler.c:7132:osd_device_init0()) ASSERTION( info ) failed:
 (osd_handler.c:7132:osd_device_init0()) LBUG
 Pid: 2457, comm: mount.lustre 3.10.0-862.9.1.el7_lustre.ddn1.x86_64
Call Trace:
 [<ffffffffc042f7cc>] libcfs_call_trace+0x8c/0xc0 [libcfs]
 [<ffffffffc042f87c>] lbug_with_loc+0x4c/0xa0 [libcfs]
 [<ffffffffc10cc6d5>] osd_device_alloc+0x615/0x770 [osd_ldiskfs]
 [<ffffffffc099ae2a>] obd_setup+0x11a/0x2b0 [obdclass]
 [<ffffffffc099cc58>] class_setup+0x2a8/0x840 [obdclass]
 [<ffffffffc09a08ed>] class_process_config+0x196d/0x2420 [obdclass]
 [<ffffffffc09a45f8>] do_lcfg+0x258/0x500 [obdclass]
 [<ffffffffc09a8e68>] lustre_start_simple+0x88/0x210 [obdclass]
 [<ffffffffc09d59b4>] server_fill_super+0xf34/0x185a [obdclass]
 [<ffffffffc09abfe8>] lustre_fill_super+0x328/0x950 [obdclass]
 [<ffffffff9781f3bf>] mount_nodev+0x4f/0xb0
 [<ffffffffc09a4008>] lustre_mount+0x38/0x60 [obdclass]
 [<ffffffff9781ff3e>] mount_fs+0x3e/0x1b0
 [<ffffffff9783d4c7>] vfs_kern_mount+0x67/0x110
 [<ffffffff9783faef>] do_mount+0x1ef/0xce0
 [<ffffffff97840923>] SyS_mount+0x83/0xd0
 [<ffffffff97d20795>] system_call_fastpath+0x1c/0x21

Signed-off-by: Yang Sheng <ys@whamcloud.com>
Change-Id: If3129df5fd11226636fb84a9275f481cefb749f7
Reviewed-on: https://review.whamcloud.com/37974
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
libcfs/libcfs/module.c
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/obdclass/genops.c
lustre/obdclass/lu_object.c
lustre/tests/conf-sanity.sh

index 81c1f6f..fd6f34b 100644 (file)
@@ -387,8 +387,10 @@ static int proc_fail_loc(struct ctl_table *table, int write,
        long old_fail_loc = cfs_fail_loc;
 
        rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
-       if (old_fail_loc != cfs_fail_loc)
+       if (old_fail_loc != cfs_fail_loc) {
+               cfs_race_state = 1;
                wake_up(&cfs_race_waitq);
+       }
        return rc;
 }
 
index be44801..1c8bfa3 100644 (file)
@@ -111,6 +111,7 @@ struct obd_type {
        struct kobject           typ_kobj;
 };
 #define typ_name typ_kobj.name
+#define OBD_LU_TYPE_SETUP ((void *)0x01UL)
 
 struct brw_page {
        u64              off;
index a077f8a..a9222b4 100644 (file)
@@ -531,42 +531,46 @@ static inline int obd_set_info_async(const struct lu_env *env,
 static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
 {
         int rc;
-       struct lu_device_type *ldt = obd->obd_type->typ_lu;
-       struct lu_device *d;
+       struct obd_type *type = obd->obd_type;
+       struct lu_device_type *ldt;
 
-        ENTRY;
+       ENTRY;
 
-        if (ldt != NULL) {
-                struct lu_context  session_ctx;
-                struct lu_env env;
-                lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION);
-                session_ctx.lc_thread = NULL;
-                lu_context_enter(&session_ctx);
+       wait_var_event(&type->typ_lu,
+                      smp_load_acquire(&type->typ_lu) != OBD_LU_TYPE_SETUP);
+       ldt = type->typ_lu;
+       if (ldt != NULL) {
+               struct lu_context session_ctx;
+               struct lu_env env;
 
-                rc = lu_env_init(&env, ldt->ldt_ctx_tags);
-                if (rc == 0) {
-                        env.le_ses = &session_ctx;
-                        d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
-                        lu_env_fini(&env);
-                        if (!IS_ERR(d)) {
-                                obd->obd_lu_dev = d;
-                                d->ld_obd = obd;
-                                rc = 0;
-                        } else
-                                rc = PTR_ERR(d);
-                }
-                lu_context_exit(&session_ctx);
-                lu_context_fini(&session_ctx);
+               lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION);
+               session_ctx.lc_thread = NULL;
+               lu_context_enter(&session_ctx);
 
-        } else {
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       struct lu_device *dev;
+                       env.le_ses = &session_ctx;
+                       dev = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+                       lu_env_fini(&env);
+                       if (!IS_ERR(dev)) {
+                               obd->obd_lu_dev = dev;
+                               dev->ld_obd = obd;
+                               rc = 0;
+                       } else
+                               rc = PTR_ERR(dev);
+               }
+               lu_context_exit(&session_ctx);
+               lu_context_fini(&session_ctx);
+       } else {
                if (!obd->obd_type->typ_dt_ops->o_setup) {
                        CERROR("%s: no %s operation\n", obd->obd_name,
                               __func__);
                        RETURN(-EOPNOTSUPP);
                }
-                rc = OBP(obd, setup)(obd, cfg);
-        }
-        RETURN(rc);
+               rc = OBP(obd, setup)(obd, cfg);
+       }
+       RETURN(rc);
 }
 
 static inline int obd_precleanup(struct obd_device *obd)
index baf90c3..d1bb8db 100644 (file)
@@ -462,6 +462,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OBDCLASS_MODULE_LOAD   0x60a
 #define OBD_FAIL_OBD_ZERO_NLINK_RACE    0x60b
 #define OBD_FAIL_OBD_STOP_MDS_RACE      0x60c
+#define OBD_FAIL_OBD_SETUP              0x60d
 
 #define OBD_FAIL_TGT_REPLY_NET           0x700
 #define OBD_FAIL_TGT_CONN_RACE           0x701
index 75388ed..97672f9 100644 (file)
@@ -251,6 +251,7 @@ int class_register_type(const struct obd_ops *dt_ops,
         if (type == NULL)
                RETURN(-ENOMEM);
 
+       type->typ_lu = ldt ? OBD_LU_TYPE_SETUP : NULL;
        type->typ_kobj.kset = lustre_kset;
        kobject_init(&type->typ_kobj, &class_ktype);
 #ifdef HAVE_SERVER_SUPPORT
@@ -289,8 +290,9 @@ dir_exist:
 setup_ldt:
 #endif
        if (ldt) {
-               type->typ_lu = ldt;
                rc = lu_device_type_init(ldt);
+               smp_store_release(&type->typ_lu, rc ? NULL : ldt);
+               wake_up_var(&type->typ_lu);
                if (rc)
                        GOTO(failed, rc);
        }
index b67bed1..916ad94 100644 (file)
@@ -1486,6 +1486,10 @@ int lu_context_key_register(struct lu_context_key *key)
                if (lu_keys[i])
                        continue;
                key->lct_index = i;
+
+               if (strncmp("osd_", module_name(key->lct_owner), 4) == 0)
+                       CFS_RACE_WAIT(OBD_FAIL_OBD_SETUP);
+
                if (cmpxchg(&lu_keys[i], NULL, key) != NULL)
                        continue;
 
index 5c86ba9..d86fe47 100644 (file)
@@ -8906,6 +8906,24 @@ test_125()
 }
 run_test 125 "check l_tunedisk only tunes OSTs and their slave devices"
 
+test_126() {
+       [[ "$MDS1_VERSION" -ge $(version_code 2.13.52) ]] ||
+               skip "Need MDS version at least 2.13.52"
+
+       cleanup
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) load_module ../libcfs/libcfs/libcfs
+       #define OBD_FAIL_OBD_SETUP 0x60d
+       do_facet mds1 $LCTL set_param fail_loc=0x60d
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) load_modules &
+       for i in {1..40}; do
+               do_facet mds1 lsmod | grep -q osd_$mds1_FSTYPE && break
+               sleep 1
+       done
+       clear_failloc $SINGLEMDS 20 &
+       start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
+}
+run_test 126 "mount in parallel shouldn't cause a crash"
+
 if ! combined_mgs_mds ; then
        stop mgs
 fi