Whamcloud - gitweb
LU-12616 obclass: fix MDS start/stop race 52/35652/5
authorAlexander Boyko <c17825@cray.com>
Tue, 30 Jul 2019 11:33:15 +0000 (07:33 -0400)
committerOleg Drokin <green@whamcloud.com>
Tue, 3 Sep 2019 05:09:36 +0000 (05:09 +0000)
The MDS unload happen when type of MDT has no reference.
The MDT drop it during obd_cleanup. So race window located
between obd_cleanup and server_stop_servers.
Lustre can lost MDS obd_device during server_start_targets
between MDS checking and taking the type reference, if another MDT
stops.

The patch takes one more reference for a MDT type at
server_start_targets, and put it at server_stop_servers.

This patch adds sanity test 278. It reproduces the next race
   started cleanup of MDT01
   started cleanup of MDT00
   finished cleanup of MDT00
   started MDT00 mount, checked MDS exist
   finished cleanup of MDT01, and cleanup of MDS also
   asserted during MDT00 initialization

Cray-bug-id: LUS-7275
Signed-off-by: Alexander Boyko <c17825@cray.com>
Change-Id: I9ae3bc2ec1d23c8d436f143d12e26209fdb6b083
Reviewed-on: https://review.whamcloud.com/35652
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Reviewed-by: Alexander Zarochentsev <c17826@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_disk.h
lustre/include/obd_support.h
lustre/obdclass/obd_mount_server.c
lustre/tests/sanity.sh

index 47ad428..b4d509f 100644 (file)
@@ -133,7 +133,8 @@ struct lustre_sb_info {
                                                  own backing_dev_info */
        struct list_head          lsi_lwp_list;
        spinlock_t                lsi_lwp_lock;
                                                  own backing_dev_info */
        struct list_head          lsi_lwp_list;
        spinlock_t                lsi_lwp_lock;
-       unsigned long             lsi_lwp_started:1;
+       unsigned long             lsi_lwp_started:1,
+                                 lsi_server_started:1;
 };
 
 #define LSI_UMOUNT_FAILOVER              0x00200000
 };
 
 #define LSI_UMOUNT_FAILOVER              0x00200000
index 9ac5de0..ddc11c1 100644 (file)
@@ -462,6 +462,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OBD_NO_LRU             0x609
 #define OBD_FAIL_OBDCLASS_MODULE_LOAD   0x60a
 #define OBD_FAIL_OBD_ZERO_NLINK_RACE    0x60b
 #define OBD_FAIL_OBD_NO_LRU             0x609
 #define OBD_FAIL_OBDCLASS_MODULE_LOAD   0x60a
 #define OBD_FAIL_OBD_ZERO_NLINK_RACE    0x60b
+#define OBD_FAIL_OBD_STOP_MDS_RACE      0x60c
 
 #define OBD_FAIL_TGT_REPLY_NET           0x700
 #define OBD_FAIL_TGT_CONN_RACE           0x701
 
 #define OBD_FAIL_TGT_REPLY_NET           0x700
 #define OBD_FAIL_TGT_CONN_RACE           0x701
index 574e487..aae2e35 100644 (file)
@@ -48,6 +48,7 @@
 #endif
 #include <linux/statfs.h>
 #include <linux/version.h>
 #endif
 #include <linux/statfs.h>
 #include <linux/version.h>
+#include <linux/delay.h>
 
 #include <llog_swab.h>
 #include <lustre_disk.h>
 
 #include <llog_swab.h>
 #include <lustre_disk.h>
@@ -1068,6 +1069,7 @@ static int server_stop_servers(int lsiflags)
        struct obd_device *obd = NULL;
        struct obd_type *type = NULL;
        int rc = 0;
        struct obd_device *obd = NULL;
        struct obd_type *type = NULL;
        int rc = 0;
+       bool type_last;
        ENTRY;
 
        mutex_lock(&server_start_lock);
        ENTRY;
 
        mutex_lock(&server_start_lock);
@@ -1076,18 +1078,24 @@ static int server_stop_servers(int lsiflags)
        /* if this was an MDT, and there are no more MDT's, clean up the MDS */
        if (lsiflags & LDD_F_SV_TYPE_MDT) {
                obd = class_name2obd(LUSTRE_MDS_OBDNAME);
        /* if this was an MDT, and there are no more MDT's, clean up the MDS */
        if (lsiflags & LDD_F_SV_TYPE_MDT) {
                obd = class_name2obd(LUSTRE_MDS_OBDNAME);
-               if (obd != NULL)
-                       type = class_search_type(LUSTRE_MDT_NAME);
-       }
-
+               type = class_search_type(LUSTRE_MDT_NAME);
+       } else if (lsiflags & LDD_F_SV_TYPE_OST) {
        /* if this was an OST, and there are no more OST's, clean up the OSS */
        /* if this was an OST, and there are no more OST's, clean up the OSS */
-       if (lsiflags & LDD_F_SV_TYPE_OST) {
                obd = class_name2obd(LUSTRE_OSS_OBDNAME);
                obd = class_name2obd(LUSTRE_OSS_OBDNAME);
-               if (obd != NULL)
-                       type = class_search_type(LUSTRE_OST_NAME);
+               type = class_search_type(LUSTRE_OST_NAME);
        }
 
        }
 
-       if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) {
+       /* server_stop_servers is a pair of server_start_targets
+        * Here we put type which was taken at server_start_targets.
+        * If type is NULL then there is a wrong logic around type or
+        * type reference. */
+       LASSERTF(type, "Server flags %d, obd %s\n", lsiflags,
+                obd ? obd->obd_name : "NULL");
+
+       type_last = (type->typ_refcnt == 1);
+
+       class_put_type(type);
+       if (obd != NULL && type_last) {
                obd->obd_force = 1;
                /* obd_fail doesn't mean much on a server obd */
                rc = class_manual_cleanup(obd);
                obd->obd_force = 1;
                /* obd_fail doesn't mean much on a server obd */
                rc = class_manual_cleanup(obd);
@@ -1311,46 +1319,48 @@ static int server_start_targets(struct super_block *sb)
        struct config_llog_instance cfg;
        struct lu_env mgc_env;
        struct lu_device *dev;
        struct config_llog_instance cfg;
        struct lu_env mgc_env;
        struct lu_device *dev;
+       char *name_service, *obd_name_service = NULL;
+       struct obd_type *type = NULL;
        int rc;
        ENTRY;
 
        CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname);
 
        int rc;
        ENTRY;
 
        CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname);
 
+       LASSERTF(IS_MDT(lsi) || IS_OST(lsi), "designed for MDT or OST only\n");
+
        if (IS_MDT(lsi)) {
        if (IS_MDT(lsi)) {
-               /* make sure the MDS is started */
-               mutex_lock(&server_start_lock);
-               obd = class_name2obd(LUSTRE_MDS_OBDNAME);
-               if (!obd) {
-                       rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
-                                                LUSTRE_MDS_NAME,
-                                                LUSTRE_MDS_OBDNAME"_uuid",
-                                                NULL, NULL, NULL, NULL);
-                       if (rc) {
-                               mutex_unlock(&server_start_lock);
-                               CERROR("failed to start MDS: %d\n", rc);
-                               RETURN(rc);
-                       }
-               }
-               mutex_unlock(&server_start_lock);
+               obd_name_service = LUSTRE_MDS_OBDNAME;
+               name_service = LUSTRE_MDS_NAME;
+       } else {
+               obd_name_service = LUSTRE_OSS_OBDNAME;
+               name_service = LUSTRE_OSS_NAME;
        }
 
        }
 
-       /* If we're an OST, make sure the global OSS is running */
-       if (IS_OST(lsi)) {
-               /* make sure OSS is started */
-               mutex_lock(&server_start_lock);
-               obd = class_name2obd(LUSTRE_OSS_OBDNAME);
-               if (!obd) {
-                       rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
-                                                LUSTRE_OSS_NAME,
-                                                LUSTRE_OSS_OBDNAME"_uuid",
-                                                NULL, NULL, NULL, NULL);
-                       if (rc) {
-                               mutex_unlock(&server_start_lock);
-                               CERROR("failed to start OSS: %d\n", rc);
-                               RETURN(rc);
-                       }
+       /* make sure MDS/OSS is started */
+       mutex_lock(&server_start_lock);
+       obd = class_name2obd(obd_name_service);
+       if (!obd) {
+               rc = lustre_start_simple(obd_name_service, name_service,
+                                        (IS_MDT(lsi) ?
+                                         LUSTRE_MDS_OBDNAME"_uuid" :
+                                         LUSTRE_OSS_OBDNAME"_uuid"),
+                                        NULL, NULL, NULL, NULL);
+               if (rc) {
+                       mutex_unlock(&server_start_lock);
+                       CERROR("failed to start %s: %d\n",
+                              obd_name_service, rc);
+                       RETURN(rc);
                }
                }
-               mutex_unlock(&server_start_lock);
+       }
+       /* hold a type reference and put it at server_stop_servers */
+       type = class_get_type(IS_MDT(lsi) ?
+                             LUSTRE_MDT_NAME : LUSTRE_OST_NAME);
+       lsi->lsi_server_started = 1;
+       mutex_unlock(&server_start_lock);
+       if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE) &&
+           IS_MDT(lsi)) {
+               OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE);
+               msleep(2 * MSEC_PER_SEC);
        }
 
        rc = lu_env_init(&mgc_env, LCT_MG_THREAD);
        }
 
        rc = lu_env_init(&mgc_env, LCT_MG_THREAD);
@@ -1444,8 +1454,9 @@ out_mgc:
 out_env:
        lu_env_fini(&mgc_env);
 out_stop_service:
 out_env:
        lu_env_fini(&mgc_env);
 out_stop_service:
-       if (rc != 0)
-               server_stop_servers(lsi->lsi_flags);
+       /* in case of error upper function call
+        * server_put_super->server_stop_servers()
+        */
 
        RETURN(rc);
 }
 
        RETURN(rc);
 }
@@ -1531,6 +1542,7 @@ static void server_put_super(struct super_block *sb)
        char *tmpname, *extraname = NULL;
        int tmpname_sz;
        int lsiflags = lsi->lsi_flags;
        char *tmpname, *extraname = NULL;
        int tmpname_sz;
        int lsiflags = lsi->lsi_flags;
+       bool stop_servers = lsi->lsi_server_started;
        ENTRY;
 
        LASSERT(IS_SERVER(lsi));
        ENTRY;
 
        LASSERT(IS_SERVER(lsi));
@@ -1583,6 +1595,13 @@ static void server_put_super(struct super_block *sb)
                         * to .put_super, so we better make sure we clean up! */
                        obd->obd_force = 1;
                        class_manual_cleanup(obd);
                         * to .put_super, so we better make sure we clean up! */
                        obd->obd_force = 1;
                        class_manual_cleanup(obd);
+                       if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE)) {
+                               int idx;
+                               server_name2index(lsi->lsi_svname, &idx, NULL);
+                               /* sleeping for MDT0001 */
+                               if (idx == 1)
+                                       OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE);
+                       }
                } else {
                        CERROR("no obd %s\n", lsi->lsi_svname);
                        server_deregister_mount(lsi->lsi_svname);
                } else {
                        CERROR("no obd %s\n", lsi->lsi_svname);
                        server_deregister_mount(lsi->lsi_svname);
@@ -1613,7 +1632,8 @@ static void server_put_super(struct super_block *sb)
        /* Stop the servers (MDS, OSS) if no longer needed.  We must wait
           until the target is really gone so that our type refcount check
           is right. */
        /* Stop the servers (MDS, OSS) if no longer needed.  We must wait
           until the target is really gone so that our type refcount check
           is right. */
-       server_stop_servers(lsiflags);
+       if (stop_servers)
+               server_stop_servers(lsiflags);
 
        /* In case of startup or cleanup err, stop related obds */
        if (extraname) {
 
        /* In case of startup or cleanup err, stop related obds */
        if (extraname) {
index 1f346b1..42bb2fe 100644 (file)
@@ -18493,6 +18493,33 @@ test_277() {
 }
 run_test 277 "Direct IO shall drop page cache"
 
 }
 run_test 277 "Direct IO shall drop page cache"
 
+test_278() {
+       [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
+       [[ "$(facet_host mds1)" != "$(facet_host mds2)" ]] &&
+               skip "needs the same host for mdt1 mdt2" && return
+
+       local pid1
+       local pid2
+
+#define OBD_FAIL_OBD_STOP_MDS_RACE     0x60b
+       do_facet mds2 $LCTL set_param fail_loc=0x8000060c
+       stop mds2 &
+       pid2=$!
+
+       stop mds1
+
+       echo "Starting MDTs"
+       start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS
+       wait $pid2
+#For the error assertion will happen. lu_env_get_key(..., &mdt_thread_key)
+#will return NULL
+       do_facet mds2 $LCTL set_param fail_loc=0
+
+       start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS
+}
+run_test 278 "Race starting MDS between MDTs stop/start"
+
 cleanup_test_300() {
        trap 0
        umask $SAVE_UMASK
 cleanup_test_300() {
        trap 0
        umask $SAVE_UMASK