Whamcloud - gitweb
LU-7169 tests: check disk corruption during failover 64/16664/9
author Fan Yong <fan.yong@intel.com>
Thu, 24 Sep 2015 09:04:41 +0000 (17:04 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 17 Nov 2015 15:56:29 +0000 (15:56 +0000)
This is a debug patch for conf-sanity test_84. Disk corruption is
suspected to occur during the MDT0 failover.

Test-Parameters: mdsfilesystemtype=ldiskfs mdtfilesystemtype=ldiskfs ostfilesystemtype=ldiskfs testlist=conf-sanity,conf-sanity,conf-sanity
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I7e20f26e1ecee483474ace44c8284b5776f3c602
Reviewed-on: http://review.whamcloud.com/16664
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Jian Yu <jian.yu@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
lustre/osd-ldiskfs/osd_scrub.c
lustre/tests/conf-sanity.sh
lustre/tests/test-framework.sh

index 61ed4cc..31b879b 100644 (file)
@@ -1735,7 +1735,19 @@ osd_ios_lookup_one_len(const char *name, struct dentry *parent, int namelen)
        struct dentry *dentry;
 
        dentry = ll_lookup_one_len(name, parent, namelen);
        struct dentry *dentry;
 
        dentry = ll_lookup_one_len(name, parent, namelen);
-       if (!IS_ERR(dentry) && dentry->d_inode == NULL) {
+       if (IS_ERR(dentry)) {
+               int rc = PTR_ERR(dentry);
+
+               if (rc != -ENOENT)
+                       CERROR("Fail to find %.*s in %.*s (%lu/%u): rc = %d\n",
+                              namelen, name, parent->d_name.len,
+                              parent->d_name.name, parent->d_inode->i_ino,
+                              parent->d_inode->i_generation, rc);
+
+               return dentry;
+       }
+
+       if (dentry->d_inode == NULL) {
                dput(dentry);
                return ERR_PTR(-ENOENT);
        }
                dput(dentry);
                return ERR_PTR(-ENOENT);
        }
@@ -2488,8 +2500,29 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
                GOTO(cleanup_inode, rc);
        } else {
                if (memcmp(sf->sf_uuid, es->s_uuid, 16) != 0) {
                GOTO(cleanup_inode, rc);
        } else {
                if (memcmp(sf->sf_uuid, es->s_uuid, 16) != 0) {
+                       struct obd_uuid *old_uuid;
+                       struct obd_uuid *new_uuid;
+
+                       OBD_ALLOC_PTR(old_uuid);
+                       OBD_ALLOC_PTR(new_uuid);
+                       if (old_uuid == NULL || new_uuid == NULL) {
+                               CERROR("%.16s: UUID has been changed, but"
+                                      "failed to allocate RAM for report\n",
+                                      LDISKFS_SB(sb)->s_es->s_volume_name);
+                       } else {
+                               class_uuid_unparse(sf->sf_uuid, old_uuid);
+                               class_uuid_unparse(es->s_uuid, new_uuid);
+                               CERROR("%.16s: UUID has been changed from "
+                                      "%s to %s\n",
+                                      LDISKFS_SB(sb)->s_es->s_volume_name,
+                                      old_uuid->uuid, new_uuid->uuid);
+                       }
                        osd_scrub_file_reset(scrub, es->s_uuid,SF_INCONSISTENT);
                        dirty = 1;
                        osd_scrub_file_reset(scrub, es->s_uuid,SF_INCONSISTENT);
                        dirty = 1;
+                       if (old_uuid != NULL)
+                               OBD_FREE_PTR(old_uuid);
+                       if (new_uuid != NULL)
+                               OBD_FREE_PTR(new_uuid);
                } else if (sf->sf_status == SS_SCANNING) {
                        sf->sf_status = SS_CRASHED;
                        dirty = 1;
                } else if (sf->sf_status == SS_SCANNING) {
                        sf->sf_status = SS_CRASHED;
                        dirty = 1;
index fb97eeb..488948c 100644 (file)
@@ -5591,7 +5591,7 @@ test_84() {
        #define OBD_FAIL_TGT_REPLAY_DELAY  0x709 | FAIL_SKIP
        do_facet $SINGLEMDS "lctl set_param fail_loc=0x20000709 fail_val=5"
 
        #define OBD_FAIL_TGT_REPLAY_DELAY  0x709 | FAIL_SKIP
        do_facet $SINGLEMDS "lctl set_param fail_loc=0x20000709 fail_val=5"
 
-       facet_failover $SINGLEMDS || error "failover: $?"
+       facet_failover --fsck $SINGLEMDS || error "failover: $?"
        client_up
 
        echo "recovery status"
        client_up
 
        echo "recovery status"
index 15feee6..2547035 100755 (executable)
@@ -2460,6 +2460,13 @@ affected_facets () {
 }
 
 facet_failover() {
 }
 
 facet_failover() {
+       local E2FSCK_ON_MDT0=false
+       if [ "$1" == "--fsck" ]; then
+               shift
+               [ $(facet_fstype $SINGLEMDS) == ldiskfs ] &&
+                       E2FSCK_ON_MDT0=true
+       fi
+
        local facets=$1
        local sleep_time=$2
        local -a affecteds
        local facets=$1
        local sleep_time=$2
        local -a affecteds
@@ -2493,6 +2500,9 @@ facet_failover() {
                shutdown_facet $facet
        done
 
                shutdown_facet $facet
        done
 
+       $E2FSCK_ON_MDT0 && (run_e2fsck $(facet_active_host $SINGLEMDS) \
+               $(mdsdevname 1) "-n" || error "Running e2fsck")
+
        for ((index=0; index<$total; index++)); do
                facet=$(echo ${affecteds[index]} | tr -s " " | cut -d"," -f 1)
                echo reboot facets: ${affecteds[index]}
        for ((index=0; index<$total; index++)); do
                facet=$(echo ${affecteds[index]} | tr -s " " | cut -d"," -f 1)
                echo reboot facets: ${affecteds[index]}