From cae32fb38f0754ab2f8eae89ffe9151c6c194ef8 Mon Sep 17 00:00:00 2001 From: Andriy Skulysh Date: Thu, 15 May 2014 15:29:12 +0300 Subject: [PATCH] LU-4913 mgc: mgc import reconnect race The mgc import can be reconnected by the pinger or by ptlrpc_reconnect_import(). ptlrpc_invalidate_import() isn't protected against alteration of the imp_invalid state. The import can be reconnected by the pinger, which makes imp_invalid equal to false. Thus LASSERT(imp->imp_invalid) fails in ptlrpc_invalidate_import(). It is safe to call ptlrpc_invalidate_import() when the import is deactivated, but ptlrpc_reconnect_import() doesn't deactivate it. Let's use only the pinger, when available, to reconnect the import. Xyratex-bug-id: MRP-1746 Change-Id: I2feb45c5f3e96da30dd5639d5824068f8a126c7d Signed-off-by: Andriy Skulysh Reviewed-on: http://review.whamcloud.com/9967 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/mgs/mgs_handler.c | 1 + lustre/obdclass/obd_mount.c | 13 ++----------- lustre/ptlrpc/import.c | 41 +++++++++++++++++++++++++++++++++-------- lustre/ptlrpc/pinger.c | 5 +++++ lustre/tests/conf-sanity.sh | 16 ++++++++++++++++ 5 files changed, 57 insertions(+), 19 deletions(-) diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index d535bf9..6378aba 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -60,6 +60,7 @@ static int mgs_connect(struct tgt_session_info *tsi) ENTRY; + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, cfs_fail_val); rc = tgt_connect(tsi); if (rc) RETURN(rc); diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index f8ddf0b..5cd96bf 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -226,7 +226,6 @@ int lustre_start_mgc(struct super_block *sb) lnet_nid_t nid; char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; char *ptr; - int recov_bk; int rc = 0, i = 0, j, len; ENTRY; @@ -277,6 +276,8 @@ int lustre_start_mgc(struct super_block *sb) obd = 
class_name2obd(mgcname); if (obd && !obd->obd_stopping) { + int recov_bk; + rc = obd_set_info_async(NULL, obd->obd_self_export, strlen(KEY_MGSSEC), KEY_MGSSEC, strlen(mgssec), mgssec, NULL); @@ -449,16 +450,6 @@ int lustre_start_mgc(struct super_block *sb) so we know when we can get rid of the mgc. */ atomic_set(&obd->u.cli.cl_mgc_refcount, 1); - /* Try all connections, but only once. */ - recov_bk = 1; - rc = obd_set_info_async(NULL, obd->obd_self_export, - sizeof(KEY_INIT_RECOV_BACKUP), - KEY_INIT_RECOV_BACKUP, - sizeof(recov_bk), &recov_bk, NULL); - if (rc) - /* nonfatal */ - CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); - /* We connect to the MGS at setup, and don't disconnect until cleanup */ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 81cf710..c18855d 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -281,6 +281,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) ptlrpc_deactivate_import(imp); + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); LASSERT(imp->imp_invalid); /* Wait forever until inflight == 0. 
We really can't do it another @@ -396,6 +397,19 @@ void ptlrpc_activate_import(struct obd_import *imp) } EXPORT_SYMBOL(ptlrpc_activate_import); +static void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} + void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) { ENTRY; @@ -412,14 +426,7 @@ void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) ptlrpc_deactivate_import(imp); } - CDEBUG(D_HA, "%s: waking up pinger\n", - obd2cli_tgt(imp->imp_obd)); - - spin_lock(&imp->imp_lock); - imp->imp_force_verify = 1; - spin_unlock(&imp->imp_lock); - - ptlrpc_pinger_wake_up(); + ptlrpc_pinger_force(imp); } EXIT; } @@ -427,6 +434,23 @@ EXPORT_SYMBOL(ptlrpc_fail_import); int ptlrpc_reconnect_import(struct obd_import *imp) { +#ifdef ENABLE_PINGER + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else ptlrpc_set_import_discon(imp, 0); /* Force a new connect attempt */ ptlrpc_invalidate_import(imp); @@ -452,6 +476,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp) /* Attempt a new connect */ ptlrpc_recover_import(imp, NULL, 0); return 0; +#endif } EXPORT_SYMBOL(ptlrpc_reconnect_import); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index daa8aa2..aa8f722 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -233,6 +233,11 @@ static void 
ptlrpc_pinger_process_import(struct obd_import *imp, "or recovery disabled: %s)\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(level)); + if (force) { + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } } else if ((imp->imp_pingable && !suppress) || force_next || force) { ptlrpc_ping(imp); } diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index e58f466..8b34529 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -4682,6 +4682,22 @@ test_78() { } run_test 78 "run resize2fs on MDT and OST filesystems" +test_80() { + start_mds + start_ost + uuid=$(do_facet ost1 lctl get_param -n mgc.*.uuid) +#define OBD_FAIL_MGS_CONNECT_NET 0x906 + do_facet ost1 "lctl set_param fail_val=10 fail_loc=0x906" + do_facet mgs "lctl set_param fail_val=10 fail_loc=0x906" + do_facet mgs "lctl set_param -n mgs/MGS/evict_client $uuid" + sleep 30 + start_ost2 + + do_facet ost1 "lctl set_param fail_loc=0" + stopall +} +run_test 80 "mgc import reconnect race" + if ! combined_mgs_mds ; then stop mgs fi -- 1.8.3.1