Whamcloud - gitweb
LU-4913 mgc: mgc import reconnect race 67/9967/6
author Andriy Skulysh <Andriy_Skulysh@xyratex.com>
Thu, 15 May 2014 12:29:12 +0000 (15:29 +0300)
committer Oleg Drokin <oleg.drokin@intel.com>
Fri, 23 May 2014 05:26:35 +0000 (05:26 +0000)
The MGC import can be reconnected either by the pinger or by
ptlrpc_reconnect_import().
ptlrpc_invalidate_import() is not protected against concurrent
changes to the imp_invalid state: the pinger can reconnect the
import, which resets imp_invalid to false. As a result,
LASSERT(imp->imp_invalid) fails in ptlrpc_invalidate_import().

It is safe to call ptlrpc_invalidate_import() when the
import is deactivated, but ptlrpc_reconnect_import() does not
deactivate it.
So, when the pinger is available, use only the pinger to
reconnect the import.

Xyratex-bug-id: MRP-1746
Change-Id: I2feb45c5f3e96da30dd5639d5824068f8a126c7d
Signed-off-by: Andriy Skulysh <Andriy_Skulysh@xyratex.com>
Reviewed-on: http://review.whamcloud.com/9967
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Mike Pershin <mike.pershin@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/mgs/mgs_handler.c
lustre/obdclass/obd_mount.c
lustre/ptlrpc/import.c
lustre/ptlrpc/pinger.c
lustre/tests/conf-sanity.sh

index d535bf9..6378aba 100644 (file)
@@ -60,6 +60,7 @@ static int mgs_connect(struct tgt_session_info *tsi)
 
        ENTRY;
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, cfs_fail_val);
        rc = tgt_connect(tsi);
        if (rc)
                RETURN(rc);
index f8ddf0b..5cd96bf 100644 (file)
@@ -226,7 +226,6 @@ int lustre_start_mgc(struct super_block *sb)
         lnet_nid_t nid;
         char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
         char *ptr;
-        int recov_bk;
         int rc = 0, i = 0, j, len;
         ENTRY;
 
@@ -277,6 +276,8 @@ int lustre_start_mgc(struct super_block *sb)
 
         obd = class_name2obd(mgcname);
         if (obd && !obd->obd_stopping) {
+               int recov_bk;
+
                 rc = obd_set_info_async(NULL, obd->obd_self_export,
                                         strlen(KEY_MGSSEC), KEY_MGSSEC,
                                         strlen(mgssec), mgssec, NULL);
@@ -449,16 +450,6 @@ int lustre_start_mgc(struct super_block *sb)
            so we know when we can get rid of the mgc. */
        atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
 
-        /* Try all connections, but only once. */
-        recov_bk = 1;
-        rc = obd_set_info_async(NULL, obd->obd_self_export,
-                                sizeof(KEY_INIT_RECOV_BACKUP),
-                                KEY_INIT_RECOV_BACKUP,
-                                sizeof(recov_bk), &recov_bk, NULL);
-        if (rc)
-                /* nonfatal */
-                CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
-
        /* We connect to the MGS at setup, and don't disconnect until cleanup */
        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
                                  OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
index 81cf710..c18855d 100644 (file)
@@ -281,6 +281,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
        if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
                ptlrpc_deactivate_import(imp);
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
        LASSERT(imp->imp_invalid);
 
         /* Wait forever until inflight == 0. We really can't do it another
@@ -396,6 +397,19 @@ void ptlrpc_activate_import(struct obd_import *imp)
 }
 EXPORT_SYMBOL(ptlrpc_activate_import);
 
+static void ptlrpc_pinger_force(struct obd_import *imp)
+{
+       CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
+              ptlrpc_import_state_name(imp->imp_state));
+
+       spin_lock(&imp->imp_lock);
+       imp->imp_force_verify = 1;
+       spin_unlock(&imp->imp_lock);
+
+       if (imp->imp_state != LUSTRE_IMP_CONNECTING)
+               ptlrpc_pinger_wake_up();
+}
+
 void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
 {
         ENTRY;
@@ -412,14 +426,7 @@ void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
                         ptlrpc_deactivate_import(imp);
                 }
 
-                CDEBUG(D_HA, "%s: waking up pinger\n",
-                       obd2cli_tgt(imp->imp_obd));
-
-               spin_lock(&imp->imp_lock);
-               imp->imp_force_verify = 1;
-               spin_unlock(&imp->imp_lock);
-
-               ptlrpc_pinger_wake_up();
+               ptlrpc_pinger_force(imp);
        }
        EXIT;
 }
@@ -427,6 +434,23 @@ EXPORT_SYMBOL(ptlrpc_fail_import);
 
 int ptlrpc_reconnect_import(struct obd_import *imp)
 {
+#ifdef ENABLE_PINGER
+       struct l_wait_info lwi;
+       int secs = cfs_time_seconds(obd_timeout);
+       int rc;
+
+       ptlrpc_pinger_force(imp);
+
+       CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+              obd2cli_tgt(imp->imp_obd), secs);
+
+       lwi = LWI_TIMEOUT(secs, NULL, NULL);
+       rc = l_wait_event(imp->imp_recovery_waitq,
+                         !ptlrpc_import_in_recovery(imp), &lwi);
+       CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd),
+              ptlrpc_import_state_name(imp->imp_state));
+       return rc;
+#else
        ptlrpc_set_import_discon(imp, 0);
        /* Force a new connect attempt */
        ptlrpc_invalidate_import(imp);
@@ -452,6 +476,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp)
        /* Attempt a new connect */
        ptlrpc_recover_import(imp, NULL, 0);
        return 0;
+#endif
 }
 EXPORT_SYMBOL(ptlrpc_reconnect_import);
 
index daa8aa2..aa8f722 100644 (file)
@@ -233,6 +233,11 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp,
                       "or recovery disabled: %s)\n",
                       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
                       ptlrpc_import_state_name(level));
+               if (force) {
+                       spin_lock(&imp->imp_lock);
+                       imp->imp_force_verify = 1;
+                       spin_unlock(&imp->imp_lock);
+               }
        } else if ((imp->imp_pingable && !suppress) || force_next || force) {
                ptlrpc_ping(imp);
        }
index e58f466..8b34529 100644 (file)
@@ -4682,6 +4682,22 @@ test_78() {
 }
 run_test 78 "run resize2fs on MDT and OST filesystems"
 
+test_80() {
+       start_mds
+       start_ost
+       uuid=$(do_facet ost1 lctl get_param -n mgc.*.uuid)
+#define OBD_FAIL_MGS_CONNECT_NET            0x906
+       do_facet ost1 "lctl set_param fail_val=10 fail_loc=0x906"
+       do_facet mgs "lctl set_param fail_val=10 fail_loc=0x906"
+       do_facet mgs "lctl set_param -n mgs/MGS/evict_client $uuid"
+       sleep 30
+       start_ost2
+
+       do_facet ost1 "lctl set_param fail_loc=0"
+       stopall
+}
+run_test 80 "mgc import reconnect race"
+
 if ! combined_mgs_mds ; then
        stop mgs
 fi