# specify environment variable containing batch job name for server statistics
export JOBID_VAR=${JOBID_VAR:-"procname_uid"} # or "existing" or "disable"
-# LOAD_LLOOP: LU-409: only load llite_lloop module if kernel < 2.6.32 or
-# LOAD_LLOOP is true. LOAD_LLOOP is false by default.
-export LOAD_LLOOP=${LOAD_LLOOP:-false}
-
#export PDSH="pdsh -S -Rssh -w"
export MOUNT_CMD=${MOUNT_CMD:-"mount -t lustre"}
export UMOUNT=${UMOUNT:-"umount -d"}
fi
export LL_DECODE_FILTER_FID=${LL_DECODE_FILTER_FID:-"$LUSTRE/utils/ll_decode_filter_fid"}
[ ! -f "$LL_DECODE_FILTER_FID" ] && export LL_DECODE_FILTER_FID="ll_decode_filter_fid"
+ export LL_DECODE_LINKEA=${LL_DECODE_LINKEA:-"$LUSTRE/utils/ll_decode_linkea"}
+ [ ! -f "$LL_DECODE_LINKEA" ] && export LL_DECODE_LINKEA="ll_decode_linkea"
export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"}
[ ! -f "$MKFS" ] && export MKFS="mkfs.lustre"
export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"}
fi
}
-llite_lloop_enabled() {
- local n1=$(uname -r | cut -d. -f1)
- local n2=$(uname -r | cut -d. -f2)
- local n3=$(uname -r | cut -d- -f1 | cut -d. -f3)
-
- # load the llite_lloop module for < 2.6.32 kernels
- if [[ $n1 -lt 2 ]] || [[ $n1 -eq 2 && $n2 -lt 6 ]] || \
- [[ $n1 -eq 2 && $n2 -eq 6 && $n3 -lt 32 ]] || \
- $LOAD_LLOOP; then
- return 0
- fi
- return 1
-}
-
load_modules_local() {
if [ -n "$MODPROBE" ]; then
# use modprobe
fi
load_module ../libcfs/libcfs/libcfs
+	# Prevent local MODOPTS_LIBCFS from being passed as part of environment
+ # variable to remote nodes
+ unset MODOPTS_LIBCFS
[ "$PTLDEBUG" ] && lctl set_param debug="$PTLDEBUG"
[ "$SUBSYSTEM" ] && lctl set_param subsystem_debug="${SUBSYSTEM# }"
fi
load_module llite/lustre
- llite_lloop_enabled && load_module llite/llite_lloop
[ -d /r ] && OGDB=${OGDB:-"/r/tmp"}
OGDB=${OGDB:-$TMP}
rm -f $OGDB/ogdb-$HOSTNAME
virt=$(dmidecode -s system-product-name | awk '{print $1}')
case $virt in
- VMware|KVM|VirtualBox|Parallels) echo ${virt,,} ;;
+ VMware|KVM|VirtualBox|Parallels)
+ echo $virt | tr '[A-Z]' '[a-z]' ;;
*) ;;
esac
}
shift 3
local opts=${@:-"-o cachefile=none"}
- do_facet $facet "$ZPOOL list -H $poolname >/dev/null 2>&1 ||
+ do_facet $facet "modprobe zfs;
+ $ZPOOL list -H $poolname >/dev/null 2>&1 ||
$ZPOOL create -f $opts $poolname $vdev"
}
if [[ -n "$poolname" ]]; then
opts+=" -d $(dirname $(facet_vdevice $facet))"
- do_facet $facet "$ZPOOL list -H $poolname >/dev/null 2>&1 ||
+ do_facet $facet "modprobe zfs;
+ $ZPOOL list -H $poolname >/dev/null 2>&1 ||
$ZPOOL import -f $opts $poolname"
fi
}
# commit the device label change to disk
if [[ $devicelabel =~ (:[a-zA-Z]{3}[0-9]{4}) ]]; then
+ echo "Commit the device label on ${!dev}"
do_facet $facet "sync; sync; sync"
+ sleep 5
fi
# only for remote client
check_client_load () {
- local client=$1
- local var=$(node_var_name $client)_load
- local TESTLOAD=run_${!var}.sh
-
- ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-
- # bug 18914: try to connect several times not only when
- # check ps, but while check_catastrophe also
- local tries=3
- local RC=254
- while [ $RC = 254 -a $tries -gt 0 ]; do
- let tries=$tries-1
- # assume success
- RC=0
- if ! check_catastrophe $client; then
- RC=${PIPESTATUS[0]}
- if [ $RC -eq 254 ]; then
- # FIXME: not sure how long we shuold sleep here
- sleep 10
- continue
- fi
- echo "check catastrophe failed: RC=$RC "
- return $RC
- fi
- done
- # We can continue try to connect if RC=254
- # Just print the warning about this
- if [ $RC = 254 ]; then
- echo "got a return status of $RC from do_node while checking catastrophe on $client"
- fi
-
- # see if the load is still on the client
- tries=3
- RC=254
- while [ $RC = 254 -a $tries -gt 0 ]; do
- let tries=$tries-1
- # assume success
- RC=0
- if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then
- RC=${PIPESTATUS[0]}
- sleep 30
- fi
- done
- if [ $RC = 254 ]; then
- echo "got a return status of $RC from do_node while checking (catastrophe and 'ps') the client load on $client"
- # see if we can diagnose a bit why this is
- fi
+ local client=$1
+ local var=$(node_var_name $client)_load
+ local testload=run_${!var}.sh
+
+ ps auxww | grep -v grep | grep $client | grep -q $testload || return 1
+
+ # bug 18914: try to connect several times not only when
+ # check ps, but while check_node_health also
+
+ local tries=3
+ local RC=254
+ while [ $RC = 254 -a $tries -gt 0 ]; do
+ let tries=$tries-1
+ # assume success
+ RC=0
+ if ! check_node_health $client; then
+ RC=${PIPESTATUS[0]}
+ if [ $RC -eq 254 ]; then
+			# FIXME: not sure how long we should sleep here
+ sleep 10
+ continue
+ fi
+ echo "check node health failed: RC=$RC "
+ return $RC
+ fi
+ done
+	# We can continue trying to connect if RC=254
+ # Just print the warning about this
+ if [ $RC = 254 ]; then
+ echo "got a return status of $RC from do_node while checking " \
+ "node health on $client"
+ fi
+
+ # see if the load is still on the client
+ tries=3
+ RC=254
+ while [ $RC = 254 -a $tries -gt 0 ]; do
+ let tries=$tries-1
+ # assume success
+ RC=0
+ if ! do_node $client \
+ "ps auxwww | grep -v grep | grep -q $testload"; then
+ RC=${PIPESTATUS[0]}
+ sleep 30
+ fi
+ done
+ if [ $RC = 254 ]; then
+ echo "got a return status of $RC from do_node while checking " \
+ "(node health and 'ps') the client load on $client"
+ # see if we can diagnose a bit why this is
+ fi
- return $RC
+ return $RC
}
check_client_loads () {
local clients=${1//,/ }
grep -v 'Found no match'
}
+wait_zfs_commit() {
+ # the occupied disk space will be released
+ # only after DMUs are committed
+ if [[ $(facet_fstype $1) == zfs ]]; then
+ echo "sleep $2 for ZFS OSD"
+ sleep $2
+ fi
+}
+
wait_delete_completed_mds() {
local MAX_WAIT=${1:-20}
# for ZFS, waiting more time for DMUs to be committed
mds2sync="$mds2sync $node"
done
if [ -z "$mds2sync" ]; then
+ wait_zfs_commit $SINGLEMDS $ZFS_WAIT
return
fi
mds2sync=$(comma_list $mds2sync)
"$LCTL get_param -n osc.*MDT*.sync_*" | calc_sum)
#echo "$node: $changes changes on all"
if [[ $changes -eq 0 ]]; then
- etime=$(date +%s)
- #echo "delete took $((etime - stime)) seconds"
-
- # the occupied disk space will be released
- # only after DMUs are committed
- if [[ $(facet_fstype $SINGLEMDS) == zfs ]]; then
- echo "sleep $ZFS_WAIT for ZFS OSD"
- sleep $ZFS_WAIT
- fi
-
+ wait_zfs_commit $SINGLEMDS $ZFS_WAIT
return
fi
sleep 1
return $rc
}
+lfs_df_check() {
+ local clients=${1:-$CLIENTS}
+
+ if [ -z "$clients" ]; then
+ $LFS df $MOUNT
+ else
+ $PDSH $clients "$LFS df $MOUNT" > /dev/null
+ fi
+}
+
+
clients_up() {
- # not every config has many clients
- sleep 1
- if [ ! -z "$CLIENTS" ]; then
- $PDSH $CLIENTS "stat -f $MOUNT" > /dev/null
- else
- stat -f $MOUNT > /dev/null
- fi
+ # not every config has many clients
+ sleep 1
+ lfs_df_check
}
client_up() {
- local client=$1
- # usually checked on particular client or locally
- sleep 1
- if [ ! -z "$client" ]; then
- $PDSH $client "stat -f $MOUNT" > /dev/null
- else
- stat -f $MOUNT > /dev/null
- fi
+ # usually checked on particular client or locally
+ sleep 1
+ lfs_df_check $1
}
client_evicted() {
cd $SAVE_PWD
reset_fail_loc
check_grant ${testnum} || error "check_grant $testnum failed with $?"
- check_catastrophe || error "LBUG/LASSERT detected"
+ check_node_health
check_dmesg_for_errors || error "Error in dmesg detected"
if [ "$PARALLEL" != "yes" ]; then
ps auxww | grep -v grep | grep -q multiop &&
done
}
-check_catastrophe() {
+check_node_health() {
local nodes=${1:-$(comma_list $(nodes_list))}
- do_nodes $nodes "rc=0;
-val=\\\$($LCTL get_param -n catastrophe 2>&1);
-if [[ \\\$? -eq 0 && \\\$val -ne 0 ]]; then
- echo \\\$(hostname -s): \\\$val;
- rc=\\\$val;
-fi;
-exit \\\$rc"
+ for node in ${nodes//,/ }; do
+ check_network "$node" 5
+ if [ $? -eq 0 ]; then
+ do_node $node "rc=0;
+ val=\\\$($LCTL get_param -n catastrophe 2>&1);
+ if [[ \\\$? -eq 0 && \\\$val -ne 0 ]]; then
+ echo \\\$(hostname -s): \\\$val;
+ rc=\\\$val;
+ fi;
+ exit \\\$rc" || error "$node:LBUG/LASSERT detected"
+ fi
+ done
}
mdsrate_cleanup () {
return $OSCFULL
}
-pool_list () {
- do_facet mgs lctl pool_list $1
+list_pool() {
+ echo -e "$(do_facet $SINGLEMDS $LCTL pool_list $1 | sed '1d')"
+}
+
+check_pool_not_exist() {
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
+ [[ $# -ne 1 ]] && return 0
+ [[ x$poolname = x ]] && return 0
+ list_pool $fsname | grep -w $1 && return 1
+ return 0
}
create_pool() {
- local fsname=${1%%.*}
- local poolname=${1##$fsname.}
-
- do_facet mgs lctl pool_new $1
- local RC=$?
- # get param should return err unless pool is created
- [[ $RC -ne 0 ]] && return $RC
-
- wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
- 2>/dev/null || echo foo" "" || RC=1
- if [[ $RC -eq 0 ]]; then
- add_pool_to_list $1
- else
- error "pool_new failed $1"
- fi
- return $RC
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
+
+ do_facet mgs lctl pool_new $1
+ local RC=$?
+ # get param should return err unless pool is created
+ [[ $RC -ne 0 ]] && return $RC
+
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$fsname-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$poolname \
+ 2>/dev/null || echo foo" "" ||
+ error "mds$mds_id: pool_new failed $1"
+ done
+ wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
+ 2>/dev/null || echo foo" "" || error "pool_new failed $1"
+
+ add_pool_to_list $1
+ return $RC
}
add_pool_to_list () {
}
destroy_pool_int() {
- local ost
- local OSTS=$(do_facet $SINGLEMDS lctl pool_list $1 | \
- awk '$1 !~ /^Pool:/ {print $1}')
- for ost in $OSTS; do
- do_facet mgs lctl pool_remove $1 $ost
- done
- do_facet mgs lctl pool_destroy $1
+ local ost
+ local OSTS=$(list_pool $1)
+ for ost in $OSTS; do
+ do_facet mgs lctl pool_remove $1 $ost
+ done
+ do_facet mgs lctl pool_destroy $1
}
# <fsname>.<poolname> or <poolname>
destroy_pool() {
- local fsname=${1%%.*}
- local poolname=${1##$fsname.}
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
- [[ x$fsname = x$poolname ]] && fsname=$FSNAME
+ [[ x$fsname = x$poolname ]] && fsname=$FSNAME
- local RC
+ local RC
- pool_list $fsname.$poolname || return $?
+ check_pool_not_exist $fsname.$poolname
+ [[ $? -eq 0 ]] && return 0
- destroy_pool_int $fsname.$poolname
- RC=$?
- [[ $RC -ne 0 ]] && return $RC
+ destroy_pool_int $fsname.$poolname
+ RC=$?
+ [[ $RC -ne 0 ]] && return $RC
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$fsname-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$poolname \
+ 2>/dev/null || echo foo" "foo" ||
+ error "mds$mds_id: destroy pool failed $1"
+ done
+ wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
+ 2>/dev/null || echo foo" "foo" || error "destroy pool failed $1"
- wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
- 2>/dev/null || echo foo" "foo" || RC=1
+ remove_pool_from_list $fsname.$poolname
- if [[ $RC -eq 0 ]]; then
- remove_pool_from_list $fsname.$poolname
- else
- error "destroy pool failed $1"
- fi
- return $RC
+ return $RC
}
destroy_pools () {
local poolname
local listvar=${fsname}_CREATED_POOLS
- pool_list $fsname
-
[ x${!listvar} = x ] && return 0
echo destroy the created pools: ${!listvar}
local t=$(for i in $list; do printf "$FSNAME-OST%04x_UUID " $i; done)
do_facet mgs $LCTL pool_add \
$FSNAME.$pool $FSNAME-OST[$first-$last/$step]
+
+ # wait for OSTs to be added to the pool
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$pool |
+ sort -u | tr '\n' ' ' " "$t" || {
+ error_noexit "mds$mds_id: Add to pool failed"
+ return 3
+ }
+ done
wait_update $HOSTNAME "lctl get_param -n lov.$FSNAME-*.pools.$pool \
| sort -u | tr '\n' ' ' " "$t" || {
error_noexit "Add to pool failed"
local pname="lov.$FSNAME-*.pools.$pool"
local t=$($LCTL get_param -n $pname | head -1)
do_facet mgs $LCTL pool_remove $FSNAME.$pool $t
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$pool |
+ grep $t" "" || {
+ error_noexit "mds$mds_id: $t not removed from" \
+ "$FSNAME.$pool"
+ return 2
+ }
+ done
wait_update $HOSTNAME "lctl get_param -n $pname | grep $t" "" || {
error_noexit "$t not removed from $FSNAME.$pool"
return 1
do
do_facet mgs $LCTL pool_remove $FSNAME.$pool $t
done
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id "lctl get_param -n \
+ lod.$lodname.pools.$pool" "" || {
+ error_noexit "mds$mds_id: Pool $pool not drained"
+ return 4
+ }
+ done
wait_update $HOSTNAME "lctl get_param -n $pname" "" || {
error_noexit "Pool $FSNAME.$pool cannot be drained"
return 1