+++ /dev/null
-#!/bin/bash
-# test e2fsck and lfsck to detect and fix filesystem corruption
-#
-#set -vx
-set -e
-
-[ "$1" = "-v" ] && shift && VERBOSE=echo || VERBOSE=:
-
-LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
-. $LUSTRE/tests/test-framework.sh
-init_test_env $@
-. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
-
-NUMFILES=${NUMFILES:-10}
-NUMDIRS=${NUMDIRS:-4}
-OSTIDX=${OSTIDX:-0} # the OST index in LOV
-OBJGRP=${OBJGRP:-0} # the OST object group
-
-[ ! -d "$SHARED_DIRECTORY" ] &&
- skip_env "SHARED_DIRECTORY should be accessible on all nodes" &&
- exit 0
-[[ $(facet_fstype $SINGLEMDS) != ldiskfs ]] &&
- skip "Only applicable to ldiskfs-based MDTs" && exit 0
-[[ $(facet_fstype OST) != ldiskfs ]] &&
- skip "Only applicable to ldiskfs-based OST" && exit 0
-
-which getfattr &>/dev/null || { skip_env "could not find getfattr" && exit 0; }
-which setfattr &>/dev/null || { skip_env "could not find setfattr" && exit 0; }
-
-MOUNT_2=""
-check_and_setup_lustre
-
-assert_DIR
-
-SAMPLE_FILE=$TMP/$TESTSUITE.junk
-dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=1
-
-# Create some dirs and files on the filesystem.
-create_files_sub() {
- local test_dir=$1
- local num_dirs=$2
- local file_name=$3
- local first_num=$4
- local last_num=$5
- local d e f
-
- echo "creating files in $test_dir/d[$first_num..$last_num]:"
- for d in $(seq -f $test_dir/d%g $first_num $last_num); do
- mkdir -p $d || error "mkdir $d failed"
- $VERBOSE "created $d $(lfs path2fid $d)"
- for e in $(seq -f $d/d%g $num_dirs); do
- mkdir -p $e || error "mkdir $$e failed"
- $VERBOSE "created $e $(lfs path2fid $e)"
- for f in $(seq -f $e/test%g $num_dirs); do
- cp $file_name $f ||
- error "cp $file_name $f failed"
- $VERBOSE "created $f $(lfs path2fid $f)"
- done
- done
- done
-}
-
-create_files() {
- local test_dir=$1
- local num_dirs=$2
- local num_files=$3
- local f
-
- # create some files on the filesystem
- local first_num=1
- local last_num=$num_dirs
- create_files_sub $test_dir $num_dirs /etc/fstab $first_num $last_num
-
- # create files to be modified
- echo "creating files $test_dir/testfile.[0..$((num_files * 3))]:"
- for f in $(seq -f $test_dir/testfile.%g $((num_files * 3))); do
- cp $SAMPLE_FILE $f || error "cp $SAMPLE_FILE $f failed"
- $VERBOSE "created $f $(lfs path2fid $f)"
- done
-
- # create some more files
- first_num=$((num_dirs * 2 + 1))
- last_num=$((num_dirs * 2 + 3))
- create_files_sub $test_dir $num_dirs /etc/hosts $first_num $last_num
-
- # these should NOT be taken as duplicates
- echo "linking files in $test_dir/d[$first_num..$last_num]:"
- for f in $(seq -f $test_dir/d$last_num/linkfile.%g $num_files); do
- cp /etc/hosts $f || error "cp /etc/hosts $f failed"
- ln $f $f.link || error "ln $f $f.link failed"
- $VERBOSE "linked $f to $f.link $(lfs path2fid $f)"
- done
-}
-
-# Get the objids for files on the OST (given the OST index and object group).
-get_objects() {
- local obdidx=$1
- shift
- local seq=$1
- shift
- local ost_files="$@"
- local ost_objids
- local objids
-
- for F in $ostfiles; do
- objid=$($GETSTRIPE $F |
- awk "{ if (\$1 == $obdidx && \$4 == $seq) print \$2 }")
- $VERBOSE $GETSTRIPE -v $F | grep -v "lmm_seq|lmm_object_id" 1>&2
- ost_objids="$ost_objids $objid"
- done
-
- echo $ost_objids
-}
-
-# Get the OST target device (given the OST facet name and OST index).
-get_ost_dev() {
- local node=$1
- local obdidx=$2
- local ost_name
- local ost_dev
-
- ost_name=$(ostname_from_index $obdidx)
- ost_dev=$(get_osd_param $node $ost_name mntdev)
- if [ $? -ne 0 ]; then
- printf "unable to find OST%04x on $facet\n" $obdidx
- return 1
- fi
-
- if [[ $ost_dev = *loop* ]]; then
- ost_dev=$(do_node $node "losetup $ost_dev" |
- sed -e "s/.*(//" -e "s/).*//")
- fi
-
- echo $ost_dev
-}
-
-# Get the file names to be duplicated or removed on the MDS.
-get_files() {
- local flavor=$1
- local test_dir=$2
- local num_files=$3
- local first last
- local test_file
-
- case $flavor in
- dup)
- first=$((num_files + 1))
- last=$((num_files * 2))
- ;;
- remove)
- first=$((num_files * 2 + 1))
- last=$((num_files * 3))
- ;;
- *) echo "get_files(): invalid flavor" && return 1 ;;
- esac
-
- local files=""
- local f
- for f in $(seq -f testfile.%g $first $last); do
- test_file=$test_dir/$f
- $GETSTRIPE -v $test_file |
- egrep -v "lmm_stripe|lmm_layout|lmm_magic" 1>&2
- files="$files $test_file"
- done
- files=$(echo $files | sed "s#$DIR/##g")
- echo $files
-}
-
-# Remove objects associated with files.
-remove_objects() {
- do_rpc_nodes $(facet_host $1) remove_ost_objects $@
-}
-
-# Remove files from MDS.
-remove_files() {
- do_rpc_nodes $(facet_host $1) remove_mdt_files $@
-}
-
-# Create EAs on files so objects are referenced from different files.
-duplicate_files() {
- do_rpc_nodes $(facet_host $1) duplicate_mdt_files $@
-}
-
-#********************************* Main Flow **********************************#
-
-init_logging
-
-# get the server target devices
-get_svr_devs
-
-TESTDIR=$DIR/d0.$TESTSUITE
-if is_empty_fs $MOUNT; then
- # create test directory
- mkdir -p $TESTDIR || error "mkdir $TESTDIR failed"
-
- # create some dirs and files on the filesystem
- create_files $TESTDIR $NUMDIRS $NUMFILES
-
- # get objids for files in group $OBJGRP on the OST with index $OSTIDX
- echo "objects to be removed, leaving dangling references:"
- OST_REMOVE=$(get_objects $OSTIDX $OBJGRP \
- $(seq -f $TESTDIR/testfile.%g $NUMFILES))
-
- # get the node name and target device for the OST with index $OSTIDX
- OSTNODE=$(facet_active_host ost$((OSTIDX + 1)))
- OSTDEV=$(get_ost_dev $OSTNODE $OSTIDX) ||
- error "get_ost_dev $OSTNODE $OSTIDX failed"
-
- # get the file names to be duplicated on the MDS
- echo "files to be duplicated, leaving double-referenced objects:"
- MDS_DUPE=$(get_files dup $TESTDIR $NUMFILES) || error "$MDS_DUPE"
- # get the file names to be removed from the MDS
- echo "files to be removed, leaving orphan objects:"
- MDS_REMOVE=$(get_files remove $TESTDIR $NUMFILES) || error "$MDS_REMOVE"
-
- stopall -f || error "cleanupall failed"
-
- # remove objects associated with files in group $OBJGRP
- # on the OST with index $OSTIDX
- remove_objects ost$((OSTIDX + 1)) $OSTDEV $OBJGRP $OST_REMOVE ||
- error "removing objects failed"
-
- # remove files from MDS
- remove_files $SINGLEMDS $MDTDEV $MDS_REMOVE ||
- error "removing files failed"
-
- # create EAs on files so objects are referenced from different files
- duplicate_files $SINGLEMDS $MDTDEV $MDS_DUPE ||
- error "duplicating files failed"
- FSCK_MAX_ERR=1 # file system errors corrected
-else # is_empty_fs $MOUNT
- FSCK_MAX_ERR=4 # file system errors left uncorrected
- sync; sync; sleep 3 # make sure all data flush back
-fi
-
-# Test 1a - check and repair the filesystem
-# lfsck will return 1 if the filesystem had errors fixed
-# run e2fsck to generate databases used for lfsck
-generate_db
-
-# remount filesystem
-ORIG_REFORMAT=$REFORMAT
-REFORMAT=""
-check_and_setup_lustre
-REFORMAT=$ORIG_REFORMAT
-
-# run lfsck
-rc=0
-run_lfsck || rc=$?
-if [ $rc -eq 0 ]; then
- echo "clean after the first check"
-else
- # remove the files in lost+found created by the first lfsck
- # run, they could confuse the second run of lfsck.
- rm -fr $DIR/lost+found/*
- sync; sync; sleep 3
-
- # run e2fsck again to generate databases used for lfsck
- generate_db
-
- # run lfsck again
- rc=0
- run_lfsck || rc=$?
- if [ $rc -eq 0 ]; then
- echo "clean after the second check"
- else
- # FIXME: If the first run of lfsck fixed some errors,
- # the second run of lfsck will always return 1 (some
- # errors fixed) but not 0 (fs clean), the reason of
- # this unexpected behaviour is unkown yet.
- #
- # Actually, this issue exists from day one but was
- # not detected before, because run_lfsck() always return
- # 0 before. Let's supress this error and make the lfsck
- # test pass for now, once we figure out the problem,
- # following 'echo' should be replaced with 'error'.
- # See LU-3180.
- echo "lfsck test 2 - finished with rc=$rc"
- fi
-fi
-
-complete $SECONDS
-# The test directory contains some files referencing to some object
-# which could cause error when removing the directory.
-RMCNT=0
-while [ -d $TESTDIR ]; do
- RMCNT=$((RMCNT + 1))
- rm -fr $TESTDIR || echo "$RMCNT round: rm $TESTDIR failed"
- [ $RMCNT -ge 10 ] && error "cleanup $TESTDIR failed $RMCNT times"
- remount_client $MOUNT
-done
-check_and_cleanup_lustre
-exit_status
test_17m() {
local short_sym="0123456789"
local WDIR=$DIR/${tdir}m
- local mds_index
- local devname
- local cmd
local i
local rc=0
remote_mds_nodsh && skip "remote MDS with nodsh" && return
[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.2.0) ] &&
echo "recreate the 512 symlink files with a shorter string"
for ((i = 0; i < 512; ++i)); do
# rewrite the symlink file with a shorter string
- ln -sf ${long_sym} $WDIR/long-$i
- ln -sf ${short_sym} $WDIR/short-$i
+ ln -sf ${long_sym} $WDIR/long-$i || error "long_sym failed"
+ ln -sf ${short_sym} $WDIR/short-$i || error "short_sym failed"
done
- mds_index=$($LFS getstripe -M $WDIR)
- mds_index=$((mds_index+1))
- devname=$(mdsdevname $mds_index)
- cmd="$E2FSCK -fnvd $devname"
+ local mds_index=$(($($LFS getstripe -M $WDIR) + 1))
+ local devname=$(mdsdevname $mds_index)
- echo "stop and checking mds${mds_index}: $cmd"
+ echo "stop and checking mds${mds_index}:"
# e2fsck should not return error
stop mds${mds_index}
- do_facet mds${mds_index} $cmd || rc=$?
+ run_e2fsck $(facet_active_host mds${mds_index}) $devname -n
+ rc=$?
start mds${mds_index} $devname $MDS_MOUNT_OPTS || error "start failed"
df $MOUNT > /dev/null 2>&1
- [ $rc -ne 0 ] && error "e2fsck should not report error upon "\
- "short/long symlink MDT: rc=$rc"
- return $rc
+ [ $rc -eq 0 ] ||
+ error "e2fsck detected error for short/long symlink: rc=$rc"
}
run_test 17m "run e2fsck against MDT which contains short/long symlink"
check_fs_consistency_17n() {
local mdt_index
- local devname
- local cmd
local rc=0
# create/unlink in 17n only change 2 MDTs(MDT1/MDT2),
# so it only check MDT1/MDT2 instead of all of MDTs.
- for mdt_index in $(seq 1 2); do
- devname=$(mdsdevname $mdt_index)
- cmd="$E2FSCK -fnvd $devname"
-
- echo "stop and checking mds${mdt_index}: $cmd"
+ for mdt_index in 1 2; do
+ local devname=$(mdsdevname $mdt_index)
# e2fsck should not return error
stop mds${mdt_index}
- do_facet mds${mdt_index} $cmd || rc=$?
+ run_e2fsck $(facet_active_host mds$mdt_index) $devname -n ||
+ rc=$((rc + $?))
start mds${mdt_index} $devname $MDS_MOUNT_OPTS ||
- error "mount mds${mdt_index} failed"
+ error "mount mds$mdt_index failed"
df $MOUNT > /dev/null 2>&1
- [ $rc -ne 0 ] && break
done
return $rc
}
print_summary () {
trap 0
[ -z "$DEFAULT_SUITES"] && return 0
- [ "$TESTSUITE" == "lfsck" ] && return 0
[ -n "$ONLY" ] && echo "WARNING: ONLY is set to $(echo $ONLY)"
local details
local form="%-13s %-17s %-9s %s %s\n"
fi
fi
- export LFSCK_BIN=${LFSCK_BIN:-lfsck}
- export LFSCK_ALWAYS=${LFSCK_ALWAYS:-"no"} # check fs after each test suite
- export FSCK_MAX_ERR=4 # File system errors left uncorrected
+ export LFSCK_ALWAYS=${LFSCK_ALWAYS:-"no"} # check fs after test suite
+ export FSCK_MAX_ERR=4 # File system errors left uncorrected
export ZFS=${ZFS:-zfs}
export ZPOOL=${ZPOOL:-zpool}
}
wait_update_facet() {
+ local verbose=
+ [ "$1" = "--verbose" ] && verbose="$1" && shift
+
local facet=$1
shift
- wait_update $(facet_active_host $facet) "$@"
+ wait_update $verbose $(facet_active_host $facet) "$@"
}
sync_all_data() {
return 0
}
-# Run e2fsck on MDT and OST(s) to generate databases used for lfsck.
-generate_db() {
- local i
- local ostidx
- local dev
- local node
-
- [[ $(lustre_version_code $SINGLEMDS) -ne $(version_code 2.2.0) ]] ||
- { skip "Lustre 2.2.0 lacks the patch for LU-1255"; exit 0; }
-
- check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
- export MDSDB=$SHARED_DIRECTORY/mdsdb
- export OSTDB=$SHARED_DIRECTORY/ostdb
-
- # DNE is not supported, so when running e2fsck on a DNE filesystem,
- # we only pass master MDS parameters.
- run_e2fsck $MDTNODE $MDTDEV "-n --mdsdb $MDSDB"
-
- i=0
- ostidx=0
- OSTDB_LIST=""
- for node in $(osts_nodes); do
- for dev in ${OSTDEVS[i]}; do
- run_e2fsck $node $dev "-n --mdsdb $MDSDB --ostdb $OSTDB-$ostidx"
- OSTDB_LIST="$OSTDB_LIST $OSTDB-$ostidx"
- ostidx=$((ostidx + 1))
- done
- i=$((i + 1))
- done
-}
-
-# Run lfsck on server node if lfsck can't be found on client (LU-2571)
-run_lfsck_remote() {
- local cmd="$LFSCK_BIN -c -l --mdsdb $MDSDB --ostdb $OSTDB_LIST $MOUNT"
- local client=$1
- local mounted=true
- local rc=0
-
- #Check if lustre is already mounted
- do_rpc_nodes $client is_mounted $MOUNT || mounted=false
- if ! $mounted; then
- zconf_mount $client $MOUNT ||
- error "failed to mount Lustre on $client"
- fi
- #Run lfsck
- echo $cmd
- do_node $client $cmd || rc=$?
- #Umount if necessary
- if ! $mounted; then
- zconf_umount $client $MOUNT ||
- error "failed to unmount Lustre on $client"
- fi
-
- [ $rc -le $FSCK_MAX_ERR ] ||
- error "$cmd returned $rc, should be <= $FSCK_MAX_ERR"
- echo "lfsck finished with rc=$rc"
-
- return $rc
-}
-
run_lfsck() {
- local facets="client $SINGLEMDS"
- local found=false
- local facet
- local node
- local rc=0
-
- for facet in $facets; do
- node=$(facet_active_host $facet)
- if check_progs_installed $node $LFSCK_BIN; then
- found=true
- break
- fi
+ # Start LFSCK from $SINGLEMDS ("lfsck_start -M $FSNAME-MDT0000 -r -A
+ # -t all"), wait until every MDS reports status "completed" for both
+ # lfsck_layout and lfsck_namespace, then sum the "repaired" counters
+ # read from the MDT and OST nodes and call error() if the sum is
+ # non-zero.
+ local k # MDS index for the wait loop; kept local so callers' k survives
+
+ do_nodes $(comma_list $(mdts_nodes) $(osts_nodes)) \
+ $LCTL set_param printk=+lfsck
+ do_facet $SINGLEMDS "$LCTL lfsck_start -M $FSNAME-MDT0000 -r -A -t all"
+
+ for k in $(seq $MDSCOUNT); do
+ # wait up to 10+1 minutes for LFSCK to complete
+ wait_update_facet --verbose mds${k} "$LCTL get_param -n \
+ mdd.$(facet_svc mds${k}).lfsck_layout |
+ awk '/^status/ { print \\\$2 }'" "completed" 600 ||
+ error "MDS${k} layout isn't the expected 'completed'"
+ wait_update_facet --verbose mds${k} "$LCTL get_param -n \
+ mdd.$(facet_svc mds${k}).lfsck_namespace |
+ awk '/^status/ { print \\\$2 }'" "completed" 60 ||
+ error "MDS${k} namespace isn't the expected 'completed'"
done
- ! $found && error "None of \"$facets\" supports lfsck"
-
- run_lfsck_remote $node || rc=$?
-
- rm -rvf $MDSDB* $OSTDB* || true
- return $rc
+ local rep_mdt=$(do_nodes $(comma_list $(mdts_nodes)) \
+ $LCTL get_param -n mdd.$FSNAME-*.lfsck_* |
+ awk '/repaired/ { print $2 }' | calc_sum)
+ local rep_ost=$(do_nodes $(comma_list $(osts_nodes)) \
+ $LCTL get_param -n obdfilter.$FSNAME-*.lfsck_* |
+ awk '/repaired/ { print $2 }' | calc_sum)
+ local repaired=$((rep_mdt + rep_ost))
+ [ $repaired -eq 0 ] ||
+ error "lfsck repaired $rep_mdt MDT and $rep_ost OST errors"
}
dump_file_contents() {
}
check_and_cleanup_lustre() {
- if [ "$LFSCK_ALWAYS" = "yes" -a "$TESTSUITE" != "lfsck" ]; then
- get_svr_devs
- generate_db
- run_lfsck
- fi
+ if [ "$LFSCK_ALWAYS" = "yes" -a "$TESTSUITE" != "sanity-lfsck" -a \
+ "$TESTSUITE" != "sanity-scrub" ]; then
+ run_lfsck
+ fi
if is_mounted $MOUNT; then
[ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]* ||