Whamcloud - gitweb
LU-10419 lfsck: signal master engine when stop 27/31627/5
author Fan Yong <fan.yong@intel.com>
Fri, 20 Apr 2018 21:53:50 +0000 (05:53 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 14 Jun 2018 03:53:34 +0000 (03:53 +0000)
During LFSCK scanning, some server (MDT or OST) may be offline.
If the LFSCK needs to talk to such an offline server, the related
RPC will trigger a reconnect to it, and the LFSCK engine has to
wait until the offline server comes back online or someone
deactivates the server by force. To avoid lfsck_stop() being
blocked in such a case, the stop logic sends a SIGINT signal to
the LFSCK engines. However, that was only done for the LFSCK
assistant engines, not for the LFSCK master engine. This patch
fixes that.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I5d51ab49524e8ae54f0853e93b94e78913f65e8a
Reviewed-on: https://review.whamcloud.com/31627
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/obd_support.h
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_layout.c
lustre/lfsck/lfsck_lib.c
lustre/lfsck/lfsck_striped_dir.c
lustre/tests/sanity-lfsck.sh

index a0dcdb9..24933d2 100644 (file)
@@ -593,7 +593,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV  0x162a
 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV   0x162b
 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME  0x162c
-#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT        0x162d
+#define OBD_FAIL_LFSCK_ENGINE_DELAY    0x162d
 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2    0x162e
 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE   0x162f
 #define OBD_FAIL_LFSCK_NO_AGENTOBJ     0x1630
index 31b7efa..f49fda2 100644 (file)
@@ -1016,6 +1016,10 @@ int lfsck_master_engine(void *args)
        int                       rc;
        ENTRY;
 
+       spin_lock(&lfsck->li_lock);
+       lfsck->li_task = current;
+       spin_unlock(&lfsck->li_lock);
+
        /* There will be some objects verification during the LFSCK start,
         * such as the subsequent lfsck_verify_lpf(). Trigger low layer OI
         * OI scrub before that to handle the potential inconsistence. */
@@ -1112,6 +1116,7 @@ fini_oit:
 fini_args:
        spin_lock(&lfsck->li_lock);
        thread_set_flags(thread, SVC_STOPPED);
+       lfsck->li_task = NULL;
        spin_unlock(&lfsck->li_lock);
        wake_up_all(&thread->t_ctl_waitq);
        lfsck_thread_args_fini(lta);
index 1edd518..ac94f8f 100644 (file)
@@ -675,6 +675,7 @@ struct lfsck_instance {
        atomic_t                  li_ref;
        atomic_t                  li_double_scan_count;
        struct ptlrpc_thread      li_thread;
+       struct task_struct       *li_task;
 
        /* The time for last checkpoint, seconds */
        time64_t                  li_time_last_checkpoint;
index 712b299..18e0d47 100644 (file)
@@ -4099,7 +4099,7 @@ static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
        if (lso->lso_dead)
                RETURN(0);
 
-       CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ASSISTANT_DIRECT, cfs_fail_val);
+       CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
 
        rc = dt_attr_get(env, child, cla);
        if (rc == -ENOENT) {
@@ -5304,16 +5304,13 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env,
                        goto next;
                }
 
-               if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_ASSISTANT_DIRECT)) {
-                       rc = dt_declare_attr_get(env, cobj);
-                       if (rc != 0)
-                               goto next;
+               rc = dt_declare_attr_get(env, cobj);
+               if (rc)
+                       goto next;
 
-                       rc = dt_declare_xattr_get(env, cobj, &buf,
-                                                 XATTR_NAME_FID);
-                       if (rc != 0)
-                               goto next;
-               }
+               rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
+               if (rc)
+                       goto next;
 
                if (lso == NULL) {
                        struct lu_attr *attr = &info->lti_la;
index 3de52cd..d7bde61 100644 (file)
@@ -3396,6 +3396,9 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key,
 
        thread_set_flags(thread, SVC_STOPPING);
 
+       LASSERT(lfsck->li_task != NULL);
+       force_sig(SIGINT, lfsck->li_task);
+
        if (lfsck->li_master) {
                struct lfsck_component *com;
                struct lfsck_assistant_data *lad;
index 5cffac7..ba47994 100644 (file)
@@ -1718,6 +1718,8 @@ int lfsck_namespace_verify_stripe_slave(const struct lu_env *env,
                GOTO(out, rc);
        }
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
+
        parent = lfsck_object_find_bottom(env, lfsck, pfid);
        if (IS_ERR(parent)) {
                rc = lfsck_namespace_trace_update(env, com, cfid,
index faf8193..b39a59a 100644 (file)
@@ -5032,29 +5032,70 @@ test_31h() {
 }
 run_test 31h "Repair the corrupted shard's name entry"
 
-test_32()
+test_32a()
 {
        lfsck_prep 5 5
        umount_client $MOUNT
 
-       #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
+       #define OBD_FAIL_LFSCK_ENGINE_DELAY     0x162d
        do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
-       $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
+       $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
 
        local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
        [ "$STATUS" == "scanning-phase1" ] ||
-               error "(3) Expect 'scanning-phase1', but got '$STATUS'"
+               error "(2) Expect 'scanning-phase1', but got '$STATUS'"
 
        echo "stop ost1"
-       stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
+       stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
 
        do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
-       sleep 1
+       sleep 4
 
        echo "stop LFSCK"
-       $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
+       $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
+
+       start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
+               error "(5) Fail to start ost1"
+}
+run_test 32a "stop LFSCK when some OST failed"
+
+test_32b()
+{
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
+
+       lfsck_prep 5 5
+       $LFS mkdir -i 1 $DIR/$tdir/dp ||
+               error "(1) Fail to create $DIR/$tdir/dp"
+       $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
+               error "(2) Fail to create $DIR/$tdir/dp/dc1"
+       $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
+               error "(3) Fail to create $DIR/$tdir/dp/dc2"
+       umount_client $MOUNT
+
+       #define OBD_FAIL_LFSCK_ENGINE_DELAY     0x162d
+       do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
+       $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
+
+       wait_update_facet $SINGLEMDS "$LCTL get_param -n \
+               mdd.${MDT_DEV}.lfsck_namespace |
+               awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
+               $SHOW_NAMESPACE
+               error "(5) unexpected status"
+       }
+
+       echo "stop mds2"
+       stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
+
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
+       sleep 4
+
+       echo "stop LFSCK"
+       $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
+
+       start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
+               error "(8) Fail to start MDT2"
 }
-run_test 32 "stop LFSCK when some OST failed"
+run_test 32b "stop LFSCK when some MDT failed"
 
 test_33()
 {