print_summary () {
trap 0
[ -z "$DEFAULT_SUITES"] && return 0
- [ "$TESTSUITE" == "lfsck" ] && return 0
[ -n "$ONLY" ] && echo "WARNING: ONLY is set to $(echo $ONLY)"
local details
local form="%-13s %-17s %-9s %s %s\n"
fi
fi
- export LFSCK_BIN=${LFSCK_BIN:-lfsck}
- export LFSCK_ALWAYS=${LFSCK_ALWAYS:-"no"} # check fs after each test suite
- export FSCK_MAX_ERR=4 # File system errors left uncorrected
+ export LFSCK_ALWAYS=${LFSCK_ALWAYS:-"no"} # check fs after test suite
+ export FSCK_MAX_ERR=4 # File system errors left uncorrected
export ZFS=${ZFS:-zfs}
export ZPOOL=${ZPOOL:-zpool}
local name=$3
do_nodes $nodes "$LCTL get_param -n obdfilter.$device.$name \
- osd-*.$device.$name 2>&1" | grep -v 'Found no match'
+ osd-*.$device.$name 2>&1" | grep -v 'error:'
}
set_osd_param() {
local value=$4
do_nodes $nodes "$LCTL set_param -n obdfilter.$device.$name=$value \
- osd-*.$device.$name=$value 2>&1" | grep -v 'Found no match'
+ osd-*.$device.$name=$value 2>&1" | grep -v 'error:'
}
set_debug_size () {
local mntpt=$(facet_mntpt $facet)
local opts="${!opt} $@"
+ module_loaded lustre || load_modules
+
if [ $(facet_fstype $facet) == ldiskfs ] &&
! do_facet $facet test -b ${!dev}; then
opts=$(csa_add "$opts" -o loop)
# start facet device options
start() {
- local facet=$1
- shift
- local device=$1
- shift
- eval export ${facet}_dev=${device}
- eval export ${facet}_opt=\"$@\"
+ local facet=$1
+ shift
+ local device=$1
+ shift
+ eval export ${facet}_dev=${device}
+ eval export ${facet}_opt=\"$@\"
- local varname=${facet}failover_dev
- if [ -n "${!varname}" ] ; then
- eval export ${facet}failover_dev=${!varname}
- else
- eval export ${facet}failover_dev=$device
- fi
+ local varname=${facet}failover_dev
+ if [ -n "${!varname}" ] ; then
+ eval export ${facet}failover_dev=${!varname}
+ else
+ eval export ${facet}failover_dev=$device
+ fi
local mntpt=$(facet_mntpt $facet)
do_facet ${facet} mkdir -p $mntpt
return $rc
}
-# XXX This function is kept for interoperability with old server (< 2.3.50),
-# it should be removed whenever we drop the interoperability for such
-# server.
-restore_quota_old() {
- local mntpt=${1:-$MOUNT}
- local quota_type=$(quota_type $FSNAME | grep MDT | cut -d "=" -f2)
- if [ ! "$old_QUOTA_TYPE" ] ||
- [ "$quota_type" = "$old_QUOTA_TYPE" ]; then
- return
- fi
- quota_save_version $old_QUOTA_TYPE
-}
-
-# XXX This function is kept for interoperability with old server (< 2.3.50),
-# it should be removed whenever we drop the interoperability for such
-# server.
-setup_quota_old(){
- local mntpt=$1
-
- # no quota enforcement for now and accounting works out of the box
- return
-
- # We need save the original quota_type params, and restore them after testing
-
- # Suppose that quota type the same on mds and ost
- local quota_type=$(quota_type | grep MDT | cut -d "=" -f2)
- [ ${PIPESTATUS[0]} -eq 0 ] || error "quota_type failed!"
- echo "[HOST:$HOSTNAME] [old_quota_type:$quota_type] [new_quota_type:$QUOTA_TYPE]"
- if [ "$quota_type" != "$QUOTA_TYPE" ]; then
- export old_QUOTA_TYPE=$quota_type
- quota_save_version $QUOTA_TYPE
- else
- qtype=$(tr -c -d "ug" <<< $QUOTA_TYPE)
- $LFS quotacheck -$qtype $mntpt || error "quotacheck has failed for $type"
- fi
-
- local quota_usrs=$QUOTA_USERS
-
- # get_filesystem_size
- local disksz=$(lfs_df $mntpt | grep "summary" | awk '{print $2}')
- local blk_soft=$((disksz + 1024))
- local blk_hard=$((blk_soft + blk_soft / 20)) # Go 5% over
-
- local Inodes=$(lfs_df -i $mntpt | grep "summary" | awk '{print $2}')
- local i_soft=$Inodes
- local i_hard=$((i_soft + i_soft / 20))
-
- echo "Total disk size: $disksz block-softlimit: $blk_soft block-hardlimit:
- $blk_hard inode-softlimit: $i_soft inode-hardlimit: $i_hard"
-
- local cmd
- for usr in $quota_usrs; do
- echo "Setting up quota on $HOSTNAME:$mntpt for $usr..."
- for type in u g; do
- cmd="$LFS setquota -$type $usr -b $blk_soft -B $blk_hard -i $i_soft -I $i_hard $mntpt"
- echo "+ $cmd"
- eval $cmd || error "$cmd FAILED!"
- done
- # display the quota status
- echo "Quota settings for $usr : "
- $LFS quota -v -u $usr $mntpt || true
- done
-}
-
# get mdt quota type
mdt_quota_type() {
local varsvc=${SINGLEMDS}_svc
# restore old quota type settings
restore_quota() {
- if [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.50) ]; then
- restore_quota_old
- return
- fi
-
if [ "$old_MDT_QUOTA_TYPE" ]; then
do_facet mgs $LCTL conf_param \
$FSNAME.quota.mdt=$old_MDT_QUOTA_TYPE
}
setup_quota(){
- if [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.50) ]; then
- setup_quota_old $1
- return
- fi
-
local mntpt=$1
# save old quota type & set new quota type
}
wait_update_facet() {
+ local verbose=
+ [ "$1" = "--verbose" ] && verbose="$1" && shift
+
local facet=$1
shift
- wait_update $(facet_active_host $facet) "$@"
+ wait_update $verbose $(facet_active_host $facet) "$@"
}
sync_all_data() {
do_nodes $(comma_list $(mdts_nodes)) \
- "lctl set_param -n osd*.*MDT*.force_sync 1"
+ "lctl set_param -n osd*.*MDT*.force_sync=1"
do_nodes $(comma_list $(osts_nodes)) \
- "lctl set_param -n osd*.*OS*.force_sync 1" 2>&1 |
+ "lctl set_param -n osd*.*OS*.force_sync=1" 2>&1 |
grep -v 'Found no match'
}
return 0
}
-# Run e2fsck on MDT and OST(s) to generate databases used for lfsck.
-generate_db() {
- local i
- local ostidx
- local dev
- local node
-
- [[ $(lustre_version_code $SINGLEMDS) -ne $(version_code 2.2.0) ]] ||
- { skip "Lustre 2.2.0 lacks the patch for LU-1255"; exit 0; }
-
- check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
- export MDSDB=$SHARED_DIRECTORY/mdsdb
- export OSTDB=$SHARED_DIRECTORY/ostdb
-
- # DNE is not supported, so when running e2fsck on a DNE filesystem,
- # we only pass master MDS parameters.
- run_e2fsck $MDTNODE $MDTDEV "-n --mdsdb $MDSDB"
-
- i=0
- ostidx=0
- OSTDB_LIST=""
- for node in $(osts_nodes); do
- for dev in ${OSTDEVS[i]}; do
- run_e2fsck $node $dev "-n --mdsdb $MDSDB --ostdb $OSTDB-$ostidx"
- OSTDB_LIST="$OSTDB_LIST $OSTDB-$ostidx"
- ostidx=$((ostidx + 1))
- done
- i=$((i + 1))
- done
-}
-
-# Run lfsck on server node if lfsck can't be found on client (LU-2571)
-run_lfsck_remote() {
- local cmd="$LFSCK_BIN -c -l --mdsdb $MDSDB --ostdb $OSTDB_LIST $MOUNT"
- local client=$1
- local mounted=true
- local rc=0
-
- #Check if lustre is already mounted
- do_rpc_nodes $client is_mounted $MOUNT || mounted=false
- if ! $mounted; then
- zconf_mount $client $MOUNT ||
- error "failed to mount Lustre on $client"
- fi
- #Run lfsck
- echo $cmd
- do_node $client $cmd || rc=$?
- #Umount if necessary
- if ! $mounted; then
- zconf_umount $client $MOUNT ||
- error "failed to unmount Lustre on $client"
- fi
-
- [ $rc -le $FSCK_MAX_ERR ] ||
- error "$cmd returned $rc, should be <= $FSCK_MAX_ERR"
- echo "lfsck finished with rc=$rc"
-
- return $rc
-}
-
run_lfsck() {
- local facets="client $SINGLEMDS"
- local found=false
- local facet
- local node
- local rc=0
-
- for facet in $facets; do
- node=$(facet_active_host $facet)
- if check_progs_installed $node $LFSCK_BIN; then
- found=true
- break
- fi
+ do_nodes $(comma_list $(mdts_nodes) $(osts_nodes)) \
+ $LCTL set_param printk=+lfsck
+ do_facet $SINGLEMDS "$LCTL lfsck_start -M $FSNAME-MDT0000 -r -A -t all"
+
+ for k in $(seq $MDSCOUNT); do
+ # wait up to 10+1 minutes for LFSCK to complete
+ wait_update_facet --verbose mds${k} "$LCTL get_param -n \
+ mdd.$(facet_svc mds${k}).lfsck_layout |
+ awk '/^status/ { print \\\$2 }'" "completed" 600 ||
+ error "MDS${k} layout isn't the expected 'completed'"
+ wait_update_facet --verbose mds${k} "$LCTL get_param -n \
+ mdd.$(facet_svc mds${k}).lfsck_namespace |
+ awk '/^status/ { print \\\$2 }'" "completed" 60 ||
+ error "MDS${k} namespace isn't the expected 'completed'"
done
- ! $found && error "None of \"$facets\" supports lfsck"
-
- run_lfsck_remote $node || rc=$?
-
- rm -rvf $MDSDB* $OSTDB* || true
- return $rc
+ local rep_mdt=$(do_nodes $(comma_list $(mdts_nodes)) \
+ $LCTL get_param -n mdd.$FSNAME-*.lfsck_* |
+ awk '/repaired/ { print $2 }' | calc_sum)
+ local rep_ost=$(do_nodes $(comma_list $(osts_nodes)) \
+ $LCTL get_param -n obdfilter.$FSNAME-*.lfsck_* |
+ awk '/repaired/ { print $2 }' | calc_sum)
+ local repaired=$((rep_mdt + rep_ost))
+ [ $repaired -eq 0 ] ||
+ error "lfsck repaired $rep_mdt MDT and $rep_ost OST errors"
}
dump_file_contents() {
}
check_and_cleanup_lustre() {
- if [ "$LFSCK_ALWAYS" = "yes" -a "$TESTSUITE" != "lfsck" ]; then
- get_svr_devs
- generate_db
- run_lfsck
- fi
+ if [ "$LFSCK_ALWAYS" = "yes" -a "$TESTSUITE" != "sanity-lfsck" -a \
+ "$TESTSUITE" != "sanity-scrub" ]; then
+ run_lfsck
+ fi
if is_mounted $MOUNT; then
[ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]* ||
}
cancel_lru_locks() {
- $LCTL mark "cancel_lru_locks $1 start"
-
- if [ $1 != "MGC" ]; then
- for d in $(lctl get_param -N ldlm.namespaces.*.lru_size |
- egrep -i $1); do
- $LCTL set_param -n $d=clear
- done
- $LCTL get_param ldlm.namespaces.*.lock_unused_count | egrep -i $1 |
- grep -v '=0'
- else
- for d in $(find \
- /{proc,sys}/fs/lustre/ldlm/namespaces/*$1*/lru_size \
- 2> /dev/null); do
- echo "clear" > $d
- done
-
- for d in $(find \
- /{proc,sys}/fs/lustre/ldlm/namespaces/*$1*/lock_unused_count \
- 2> /dev/null); do
- if [ $(cat $d) != 0 ]; then
- echo "ldlm.namespaces.$(echo "$d" |
- cut -f 7 -d'/').lock_unused_count=$(cat $d)"
- fi
- done
- fi
-
- $LCTL mark "cancel_lru_locks $1 stop"
+ #$LCTL mark "cancel_lru_locks $1 start"
+ $LCTL set_param -n ldlm.namespaces.*$1*.lru_size=clear
+ $LCTL get_param ldlm.namespaces.*$1*.lock_unused_count | grep -v '=0'
+ #$LCTL mark "cancel_lru_locks $1 stop"
}
default_lru_size()
echo "$@" > $LOGDIR/err
fi
fi
+
+ # cleanup the env for failed tests
+ reset_fail_loc
}
exit_status () {
log "$msg== $(date +"%H:%M:%S (%s)")"
}
+check_dmesg_for_errors() {
+ local res
+ local errors="VFS: Busy inodes after unmount of\|\
+ldiskfs_check_descriptors: Checksum for group 0 failed\|\
+group descriptors corrupted"
+
+ res=$(do_nodes $(comma_list $(nodes_list)) "dmesg" | grep "$errors")
+ [ -z "$res" ] && return 0
+ echo "Kernel error detected: $res"
+ return 1
+}
+
#
# Run a single test function and cleanup after it.
#
reset_fail_loc
check_grant ${testnum} || error "check_grant $testnum failed with $?"
check_catastrophe || error "LBUG/LASSERT detected"
+ check_dmesg_for_errors || error "Error in dmesg detected"
if [ "$PARALLEL" != "yes" ]; then
ps auxww | grep -v grep | grep -q multiop &&
error "multiop still running"
}
add_pool_to_list () {
- local fsname=${1%%.*}
- local poolname=${1##$fsname.}
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
- local listvar=${fsname}_CREATED_POOLS
- eval export ${listvar}=$(expand_list ${!listvar} $poolname)
+ local listvar=${fsname}_CREATED_POOLS
+ local temp=${listvar}=$(expand_list ${!listvar} $poolname)
+ eval export $temp
}
remove_pool_from_list () {
- local fsname=${1%%.*}
- local poolname=${1##$fsname.}
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
- local listvar=${fsname}_CREATED_POOLS
- eval export ${listvar}=$(exclude_items_from_list ${!listvar} $poolname)
+ local listvar=${fsname}_CREATED_POOLS
+ local temp=${listvar}=$(exclude_items_from_list ${!listvar} $poolname)
+ eval export $temp
}
destroy_pool_int() {
echo -n $service_time
}
+recovery_time_min() {
+ local connection_switch_min=5
+ local connection_switch_inc=5
+ local connection_switch_max
+ local reconnect_delay_max
+ local initial_connect_timeout
+ local max
+ local timout_20
+
+ #connection_switch_max=min(50, max($connection_switch_min,$TIMEOUT)
+ (($connection_switch_min > $TIMEOUT)) &&
+ max=$connection_switch_min || max=$TIMEOUT
+ (($max < 50)) && connection_switch_max=$max || connection_switch_max=50
+
+ #initial_connect_timeout = max(connection_switch_min, obd_timeout/20)
+ timeout_20=$((TIMEOUT/20))
+ (($connection_switch_min > $timeout_20)) &&
+ initial_connect_timeout=$connection_switch_min ||
+ initial_connect_timeout=$timeout_20
+
+ reconnect_delay_max=$((connection_switch_max + connection_switch_inc + \
+ initial_connect_timeout))
+ echo $((2 * reconnect_delay_max))
+}
+
get_clients_mount_count () {
local clients=${CLIENTS:-`hostname`}