From: Alexander Boyko Date: Tue, 30 Jul 2019 11:33:15 +0000 (-0400) Subject: LU-12616 obclass: fix MDS start/stop race X-Git-Tag: 2.12.58~35 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=3cce65712d94cffe8f1626545845b95b88aef672 LU-12616 obclass: fix MDS start/stop race The MDS unload happen when type of MDT has no reference. The MDT drop it during obd_cleanup. So race window located between obd_cleanup and server_stop_servers. Lustre can lost MDS obd_device during server_start_targets between MDS checking and taking the type reference, if another MDT stops. The patch takes one more reference for a MDT type at server_start_targets, and put it at server_stop_servers. This patch adds sanity test 278. It reproduces the next race started cleanup of MDT01 started cleanup of MDT00 finished cleanup of MDT00 started MDT00 mount, checked MDS exist finished cleanup of MDT01, and cleanup of MDS also asserted during MDT00 initialization Cray-bug-id: LUS-7275 Signed-off-by: Alexander Boyko Change-Id: I9ae3bc2ec1d23c8d436f143d12e26209fdb6b083 Reviewed-on: https://review.whamcloud.com/35652 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Neil Brown Reviewed-by: Alexander Zarochentsev Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 47ad428..b4d509f 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -133,7 +133,8 @@ struct lustre_sb_info { own backing_dev_info */ struct list_head lsi_lwp_list; spinlock_t lsi_lwp_lock; - unsigned long lsi_lwp_started:1; + unsigned long lsi_lwp_started:1, + lsi_server_started:1; }; #define LSI_UMOUNT_FAILOVER 0x00200000 diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 9ac5de0..ddc11c1 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -462,6 +462,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OBD_NO_LRU 0x609 #define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a #define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b +#define OBD_FAIL_OBD_STOP_MDS_RACE 0x60c #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 diff --git a/lustre/obdclass/obd_mount_server.c b/lustre/obdclass/obd_mount_server.c index 574e487..aae2e35 100644 --- a/lustre/obdclass/obd_mount_server.c +++ b/lustre/obdclass/obd_mount_server.c @@ -48,6 +48,7 @@ #endif #include #include +#include #include #include @@ -1068,6 +1069,7 @@ static int server_stop_servers(int lsiflags) struct obd_device *obd = NULL; struct obd_type *type = NULL; int rc = 0; + bool type_last; ENTRY; mutex_lock(&server_start_lock); @@ -1076,18 +1078,24 @@ static int server_stop_servers(int lsiflags) /* if this was an MDT, and there are no more MDT's, clean up the MDS */ if (lsiflags & LDD_F_SV_TYPE_MDT) { obd = class_name2obd(LUSTRE_MDS_OBDNAME); - if (obd != NULL) - type = class_search_type(LUSTRE_MDT_NAME); - } - + type = class_search_type(LUSTRE_MDT_NAME); + } else if (lsiflags & LDD_F_SV_TYPE_OST) { /* if this was an OST, and there are no more OST's, clean up the OSS */ - if (lsiflags & LDD_F_SV_TYPE_OST) { obd = class_name2obd(LUSTRE_OSS_OBDNAME); - if (obd != NULL) - type = class_search_type(LUSTRE_OST_NAME); + type = class_search_type(LUSTRE_OST_NAME); } - if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) { + /* server_stop_servers is a pair of server_start_targets + * Here we put type which was taken at server_start_targets. + * If type is NULL then there is a wrong logic around type or + * type reference. */ + LASSERTF(type, "Server flags %d, obd %s\n", lsiflags, + obd ? obd->obd_name : "NULL"); + + type_last = (type->typ_refcnt == 1); + + class_put_type(type); + if (obd != NULL && type_last) { obd->obd_force = 1; /* obd_fail doesn't mean much on a server obd */ rc = class_manual_cleanup(obd); @@ -1311,46 +1319,48 @@ static int server_start_targets(struct super_block *sb) struct config_llog_instance cfg; struct lu_env mgc_env; struct lu_device *dev; + char *name_service, *obd_name_service = NULL; + struct obd_type *type = NULL; int rc; ENTRY; CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname); + LASSERTF(IS_MDT(lsi) || IS_OST(lsi), "designed for MDT or OST only\n"); + if (IS_MDT(lsi)) { - /* make sure the MDS is started */ - mutex_lock(&server_start_lock); - obd = class_name2obd(LUSTRE_MDS_OBDNAME); - if (!obd) { - rc = lustre_start_simple(LUSTRE_MDS_OBDNAME, - LUSTRE_MDS_NAME, - LUSTRE_MDS_OBDNAME"_uuid", - NULL, NULL, NULL, NULL); - if (rc) { - mutex_unlock(&server_start_lock); - CERROR("failed to start MDS: %d\n", rc); - RETURN(rc); - } - } - mutex_unlock(&server_start_lock); + obd_name_service = LUSTRE_MDS_OBDNAME; + name_service = LUSTRE_MDS_NAME; + } else { + obd_name_service = LUSTRE_OSS_OBDNAME; + name_service = LUSTRE_OSS_NAME; } - /* If we're an OST, make sure the global OSS is running */ - if (IS_OST(lsi)) { - /* make sure OSS is started */ - mutex_lock(&server_start_lock); - obd = class_name2obd(LUSTRE_OSS_OBDNAME); - if (!obd) { - rc = lustre_start_simple(LUSTRE_OSS_OBDNAME, - LUSTRE_OSS_NAME, - LUSTRE_OSS_OBDNAME"_uuid", - NULL, NULL, NULL, NULL); - if (rc) { - mutex_unlock(&server_start_lock); - CERROR("failed to start OSS: %d\n", rc); - RETURN(rc); - } + /* make sure MDS/OSS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(obd_name_service); + if (!obd) { + rc = lustre_start_simple(obd_name_service, name_service, + (IS_MDT(lsi) ? + LUSTRE_MDS_OBDNAME"_uuid" : + LUSTRE_OSS_OBDNAME"_uuid"), + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start %s: %d\n", + obd_name_service, rc); + RETURN(rc); } - mutex_unlock(&server_start_lock); + } + /* hold a type reference and put it at server_stop_servers */ + type = class_get_type(IS_MDT(lsi) ? + LUSTRE_MDT_NAME : LUSTRE_OST_NAME); + lsi->lsi_server_started = 1; + mutex_unlock(&server_start_lock); + if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE) && + IS_MDT(lsi)) { + OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE); + msleep(2 * MSEC_PER_SEC); } rc = lu_env_init(&mgc_env, LCT_MG_THREAD); @@ -1444,8 +1454,9 @@ out_mgc: out_env: lu_env_fini(&mgc_env); out_stop_service: - if (rc != 0) - server_stop_servers(lsi->lsi_flags); + /* in case of error upper function call + * server_put_super->server_stop_servers() + */ RETURN(rc); } @@ -1531,6 +1542,7 @@ static void server_put_super(struct super_block *sb) char *tmpname, *extraname = NULL; int tmpname_sz; int lsiflags = lsi->lsi_flags; + bool stop_servers = lsi->lsi_server_started; ENTRY; LASSERT(IS_SERVER(lsi)); @@ -1583,6 +1595,13 @@ static void server_put_super(struct super_block *sb) * to .put_super, so we better make sure we clean up! */ obd->obd_force = 1; class_manual_cleanup(obd); + if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE)) { + int idx; + server_name2index(lsi->lsi_svname, &idx, NULL); + /* sleeping for MDT0001 */ + if (idx == 1) + OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE); + } } else { CERROR("no obd %s\n", lsi->lsi_svname); server_deregister_mount(lsi->lsi_svname); @@ -1613,7 +1632,8 @@ static void server_put_super(struct super_block *sb) /* Stop the servers (MDS, OSS) if no longer needed. We must wait until the target is really gone so that our type refcount check is right. */ - server_stop_servers(lsiflags); + if (stop_servers) + server_stop_servers(lsiflags); /* In case of startup or cleanup err, stop related obds */ if (extraname) { diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 1f346b1..42bb2fe 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -18493,6 +18493,33 @@ test_277() { } run_test 277 "Direct IO shall drop page cache" +test_278() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" && return + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + [[ "$(facet_host mds1)" != "$(facet_host mds2)" ]] && + skip "needs the same host for mdt1 mdt2" && return + + local pid1 + local pid2 + +#define OBD_FAIL_OBD_STOP_MDS_RACE 0x60b + do_facet mds2 $LCTL set_param fail_loc=0x8000060c + stop mds2 & + pid2=$! + + stop mds1 + + echo "Starting MDTs" + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS + wait $pid2 +#For the error assertion will happen. lu_env_get_key(..., &mdt_thread_key) +#will return NULL + do_facet mds2 $LCTL set_param fail_loc=0 + + start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS +} +run_test 278 "Race starting MDS between MDTs stop/start" + cleanup_test_300() { trap 0 umask $SAVE_UMASK