X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Ftest-framework.sh;h=b1ac4ed2400777bd6fa44d53adcfcc089432b482;hp=c2300ff9e0fb4e2c9780c9f27ded14890c63b75f;hb=27dfc1a14059edb84cfa8f67ac3765dac99317dc;hpb=818b881ea0dbcb0b0e7a022d2cef4e6e72f32d23 diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index c2300ff..b1ac4ed 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -145,6 +145,7 @@ init_test_env() { fi export LST=${LST:-"$LUSTRE/../lnet/utils/lst"} [ ! -f "$LST" ] && export LST=$(which lst) + export SGPDDSURVEY=${SGPDDSURVEY:-$(which sgpdd-survey)} export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mpi/mdsrate"} [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate 2> /dev/null) if ! echo $PATH | grep -q $LUSTRE/tests/racer; then @@ -216,6 +217,7 @@ init_test_env() { export RPWD=${RPWD:-$PWD} export I_MOUNTED=${I_MOUNTED:-"no"} if [ ! -f /lib/modules/$(uname -r)/kernel/fs/lustre/mds.ko -a \ + ! -f /lib/modules/$(uname -r)/updates/kernel/fs/lustre/mds.ko -a \ ! -f `dirname $0`/../mds/mds.ko ]; then export CLIENTMODSONLY=yes fi @@ -329,7 +331,7 @@ load_modules_local() { load_module obdclass/obdclass load_module ptlrpc/ptlrpc load_module ptlrpc/gss/ptlrpc_gss - [ "$USE_QUOTA" = "yes" -a "$LQUOTA" != "no" ] && load_module quota/lquota + [ "$USE_QUOTA" = "yes" -a "$LQUOTA" != "no" ] && load_module quota/lquota $LQUOTAOPTS load_module fld/fld load_module fid/fid load_module lmv/lmv @@ -859,12 +861,18 @@ fi fi" } -shudown_node_hard () { +shutdown_node () { + local node=$1 + echo + $POWER_DOWN $node + $POWER_DOWN $node +} + +shutdown_node_hard () { local host=$1 local attempts=3 for i in $(seq $attempts) ; do - $POWER_DOWN $host + shutdown_node $host sleep 1 ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0 echo "waiting for $host to fail attempts=$attempts" @@ -879,21 +887,44 @@ shutdown_client() { local attempts=3 if [ "$FAILURE_MODE" = HARD ]; then - shudown_node_hard $client + shutdown_node_hard $client else zconf_umount_clients $client $mnt -f fi } +facets_on_host () { + local host=$1 + local facets="$(get_facets OST),$(get_facets MDS)" + local affected + + combined_mgs_mds || facets="$facets,mgs" + + for facet in ${facets//,/ }; do + if [ $(facet_active_host $facet) == $host ]; then + affected="$affected $facet" + fi + done + + echo $(comma_list $affected) +} + shutdown_facet() { local facet=$1 + if [ "$FAILURE_MODE" = HARD ]; then - shudown_node_hard $(facet_active_host $facet) - elif [ "$FAILURE_MODE" = SOFT ]; then + shutdown_node_hard $(facet_active_host $facet) + else stop $facet fi } +reboot_node() { + local node=$1 + echo + $POWER_UP $node + $POWER_UP $node +} + remount_facet() { local facet=$1 @@ -902,9 +933,9 @@ remount_facet() { } reboot_facet() { - facet=$1 + local facet=$1 if [ "$FAILURE_MODE" = HARD ]; then - $POWER_UP `facet_active_host $facet` + reboot_node $(facet_active_host $facet) else sleep 10 fi @@ -913,7 +944,7 @@ reboot_facet() { boot_node() { local node=$1 if [ "$FAILURE_MODE" = HARD ]; then - $POWER_UP $node + reboot_node $node wait_for_host $node fi } @@ -1137,39 +1168,68 @@ wait_delete_completed () { } wait_for_host() { - local host=$1 - check_network "$host" 900 - while ! do_node $host hostname > /dev/null; do sleep 5; done + local hostlist=$1 + + # we can use "for" here because we are waiting the slowest + for host in ${hostlist//,/ }; do + check_network "$host" 900 + done + while ! 
do_nodes $hostlist hostname > /dev/null; do sleep 5; done } -wait_for() { - local facet=$1 - local host=`facet_active_host $facet` - wait_for_host $host +wait_for_facet() { + local facetlist=$1 + local hostlist + + for facet in ${facetlist//,/ }; do + hostlist=$(expand_list $hostlist $(facet_active_host $facet)) + done + wait_for_host $hostlist } -wait_recovery_complete () { - local facet=$1 +_wait_recovery_complete () { + local param=$1 # Use default policy if $2 is not passed by caller. local MAX=${2:-$(max_recovery_time)} - local var_svc=${facet}_svc - local procfile="*.${!var_svc}.recovery_status" local WAIT=0 local STATUS= while [ $WAIT -lt $MAX ]; do - STATUS=$(do_facet $facet lctl get_param -n $procfile | grep status) - [[ $STATUS = "status: COMPLETE" ]] && return 0 + STATUS=$(lctl get_param -n $param | grep status) + echo $param $STATUS + [[ $STATUS = "status: COMPLETE" || $STATUS = "status: INACTIVE" ]] && return 0 sleep 5 WAIT=$((WAIT + 5)) - echo "Waiting $((MAX - WAIT)) secs for $facet recovery done. $STATUS" + echo "Waiting $((MAX - WAIT)) secs for $param recovery done. $STATUS" done - echo "$facet recovery not done in $MAX sec. $STATUS" + echo "$param recovery not done in $MAX sec. $STATUS" return 1 } +wait_recovery_complete () { + local facet=$1 + + # with an assumption that at_max is the same on all nodes + local MAX=${2:-$(max_recovery_time)} + + local facets=$facet + if [ "$FAILURE_MODE" = HARD ]; then + facets=$(facets_on_host $(facet_active_host $facet)) + fi + echo affected facets: $facets + + # we can use "for" here because we are waiting the slowest + for facet in ${facets//,/ }; do + local var_svc=${facet}_svc + local param="*.${!var_svc}.recovery_status" + + local host=$(facet_active_host $facet) + do_rpc_nodes $host _wait_recovery_complete $param $MAX + done +} + wait_mds_ost_sync () { # just because recovery is done doesn't mean we've finished # orphan cleanup. Wait for llogs to get synchronized. @@ -1316,15 +1376,36 @@ client_reconnect() { facet_failover() { local facet=$1 local sleep_time=$2 - echo "Failing $facet on node `facet_active_host $facet`" + local host=$(facet_active_host $facet) + + echo "Failing $facet on node $host" + + local affected=$facet + + if [ "$FAILURE_MODE" = HARD ]; then + affected=$(facets_on_host $host) + fi + shutdown_facet $facet + + echo affected facets: $affected + [ -n "$sleep_time" ] && sleep $sleep_time + reboot_facet $facet - change_active $facet - local TO=`facet_active_host $facet` - echo "Failover $facet to $TO" - wait_for $facet - mount_facet $facet || error "Restart of $facet failed" + + change_active $affected + + wait_for_facet $affected + # start mgs first if it is affected + if ! combined_mgs_mds && list_member $affected mgs; then + mount_facet mgs || error "Restart of mgs failed" + fi + # FIXME; has to be changed to mount all facets concurrently + affected=$(exclude_items_from_list $affected mgs) + for facet in ${affected//,/ }; do + mount_facet $facet || error "Restart of $facet on node $host failed!" 
+ done } obd_name() { @@ -1432,11 +1513,6 @@ h2elan() { } declare -fx h2elan -h2openib() { - h2name_or_ip "$1" "openib" -} -declare -fx h2openib - h2o2ib() { h2name_or_ip "$1" "o2ib" } @@ -1482,10 +1558,16 @@ facet_active_host() { } change_active() { - local facet=$1 + local facetlist=$1 + local facet + + facetlist=$(exclude_items_from_list $facetlist mgs) + + for facet in ${facetlist//,/ }; do local failover=${facet}failover - host=`facet_host $failover` + local host=`facet_host $failover` [ -z "$host" ] && return + local curactive=`facet_active $facet` if [ -z "${curactive}" -o "$curactive" == "$failover" ] ; then eval export ${facet}active=$facet @@ -1495,6 +1577,9 @@ change_active() { # save the active host for this facet local activevar=${facet}active echo "$activevar=${!activevar}" > $TMP/$activevar + local TO=`facet_active_host $facet` + echo "Failover $facet to $TO" + done } do_node() { @@ -1697,20 +1782,60 @@ cleanupall() { cleanup_gss } -mdsmkfsopts() -{ - local nr=$1 - test $nr = 1 && echo -n $MDS_MKFS_OPTS || echo -n $MDSn_MKFS_OPTS -} - combined_mgs_mds () { [[ $MDSDEV1 = $MGSDEV ]] && [[ $mds1_HOST = $mgs_HOST ]] } +mkfs_opts () { + local facet=$1 + + local tgt=$(echo $facet | tr -d [:digit:] | tr "[:lower:]" "[:upper:]") + local optvar=${tgt}_MKFS_OPTS + local opt=${!optvar} + + # FIXME: ! combo mgs/mds + mgsfailover is not supported yet + [[ $facet = mgs ]] && echo $opt && return + + # 1. + # --failnode options + local var=${facet}failover_HOST + if [ x"${!var}" != x ] && [ x"${!var}" != x$(facet_host $facet) ] ; then + local failnode=$(h2$NETTYPE ${!var}) + failnode="--failnode=$failnode" + # options does not contain + # or contains wrong --failnode= + if [[ $opt != *${failnode}* ]]; then + opt=$(echo $opt | sed 's/--failnode=.* / /') + opt="$opt $failnode" + fi + fi + + # 2. + # --mgsnode options + # no additional mkfs mds "--mgsnode" option for this configuration + if [[ $facet = mds ]] && combined_mgs_mds; then + echo $opt + return + fi + + # additional mkfs "--mgsnode" + local mgsnode="--mgsnode=$MGSNID" + opt=${opt//$mgsnode } + for nid in ${MGSNID//:/ }; do + local mgsnode="--mgsnode=$nid" + # options does not contain + # --mgsnode=$nid + if [[ $opt != *${mgsnode}" "* ]]; then + opt="$opt --mgsnode=$nid" + fi + done + + echo $opt +} + formatall() { if [ "$IAMDIR" == "yes" ]; then MDS_MKFS_OPTS="$MDS_MKFS_OPTS --iam-dir" - MDSn_MKFS_OPTS="$MDSn_MKFS_OPTS --iam-dir" fi [ "$FSTYPE" ] && FSTYPE_OPT="--backfstype $FSTYPE" @@ -1721,24 +1846,26 @@ formatall() { [ "$CLIENTONLY" ] && return echo Formatting mgs, mds, osts if ! 
combined_mgs_mds ; then - add mgs $mgs_MKFS_OPTS $FSTYPE_OPT --reformat $MGSDEV || exit 10 + add mgs $(mkfs_opts mgs) $FSTYPE_OPT --reformat $MGSDEV || exit 10 fi for num in `seq $MDSCOUNT`; do echo "Format mds$num: $(mdsdevname $num)" if $VERBOSE; then - add mds$num `mdsmkfsopts $num` $FSTYPE_OPT --reformat `mdsdevname $num` || exit 9 + add mds$num $(mkfs_opts mds) $FSTYPE_OPT --reformat $(mdsdevname $num) || exit 10 else - add mds$num `mdsmkfsopts $num` $FSTYPE_OPT --reformat `mdsdevname $num` > /dev/null || exit 9 + add mds$num $(mkfs_opts mds) $FSTYPE_OPT --reformat $(mdsdevname $num) > /dev/null || exit 10 fi done + # the ost-s could have different OST_MKFS_OPTS + # because of different failnode-s for num in `seq $OSTCOUNT`; do echo "Format ost$num: $(ostdevname $num)" if $VERBOSE; then - add ost$num $OST_MKFS_OPTS --reformat `ostdevname $num` || exit 10 + add ost$num $(mkfs_opts ost${num}) $FSTYPE_OPT --reformat `ostdevname $num` || exit 10 else - add ost$num $OST_MKFS_OPTS --reformat `ostdevname $num` > /dev/null || exit 10 + add ost$num $(mkfs_opts ost${num}) $FSTYPE_OPT --reformat `ostdevname $num` > /dev/null || exit 10 fi done } @@ -1820,7 +1947,7 @@ setupall() { echo $WRITECONF | grep -q "writeconf" && \ writeconf_all if ! combined_mgs_mds ; then - start mgs $MGSDEV $mgs_MOUNT_OPTS + start mgs $MGSDEV $MGS_MOUNT_OPTS fi for num in `seq $MDSCOUNT`; do @@ -1929,6 +2056,8 @@ init_facets_vars () { done fi + combined_mgs_mds || init_facet_vars mgs $MGSDEV $MGS_MOUNT_OPTS + remote_ost_nodsh && return for num in `seq $OSTCOUNT`; do @@ -2227,8 +2356,8 @@ run_e2fsck() { df > /dev/null # update statfs data on disk local cmd="$E2FSCK -d -v -f -n $MDSDB_OPT $ostdb_opt $target_dev" echo $cmd - do_node $node $cmd - local rc=${PIPESTATUS[0]} + local rc=0 + do_node $node $cmd || rc=$? [ $rc -le $FSCK_MAX_ERR ] || \ error "$cmd returned $rc, should be <= $FSCK_MAX_ERR" return 0 @@ -2272,15 +2401,14 @@ generate_db() { run_lfsck() { local cmd="$LFSCK_BIN -c -l --mdsdb $MDSDB --ostdb $OSTDB_LIST $MOUNT" echo $cmd - eval $cmd - local rc=${PIPESTATUS[0]} + local rc=0 + eval $cmd || rc=$? [ $rc -le $FSCK_MAX_ERR ] || \ error "$cmd returned $rc, should be <= $FSCK_MAX_ERR" echo "lfsck finished with rc=$rc" rm -rvf $MDSDB* $OSTDB* || true - - return $rc + return 0 } check_and_cleanup_lustre() { @@ -2288,8 +2416,7 @@ check_and_cleanup_lustre() { get_svr_devs generate_db if [ "$SKIP_LFSCK" == "no" ]; then - local rc=0 - run_lfsck || rc=$? 
+ run_lfsck else echo "skip lfsck" fi @@ -2352,6 +2479,12 @@ comma_list() { echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g' } +list_member () { + local list=$1 + local item=$2 + echo $list | grep -qw $item +} + # list, excluded are the comma separated lists exclude_items_from_list () { local list=$1 @@ -2362,7 +2495,7 @@ exclude_items_from_list () { for item in ${excluded//,/ }; do list=$(echo " $list " | sed -re "s/\s+$item\s+/ /g") done - echo $(comma_list $list) + echo $(comma_list $list) } # list, expand are the comma separated lists @@ -2400,13 +2533,23 @@ absolute_path() { } get_facets () { - local name=$(echo $1 | tr "[:upper:]" "[:lower:]") - local type=$(echo $1 | tr "[:lower:]" "[:upper:]") + local types=${1:-"OST MDS MGS"} local list="" - local count=${type}COUNT - for ((i=1; i<=${!count}; i++)) do - list="$list ${name}$i" + + for entry in $types; do + local name=$(echo $entry | tr "[:upper:]" "[:lower:]") + local type=$(echo $entry | tr "[:lower:]" "[:upper:]") + + case $type in + MGS ) list="$list $name";; + MDS|OST ) local count=${type}COUNT + for ((i=1; i<=${!count}; i++)) do + list="$list ${name}$i" + done;; + * ) error "Invalid facet type" + exit 1;; + esac done echo $(comma_list $list) } @@ -2950,6 +3093,11 @@ osc_to_ost() echo $ost } +ostuuid_from_index() +{ + $LFS osts $2 | awk '/^'$1'/ { print $2 }' +} + remote_node () { local node=$1 [ "$node" != "$(hostname)" ] @@ -3027,19 +3175,30 @@ remote_servers () { remote_ost && remote_mds } -osts_nodes () { - local OSTNODES=$(facet_host ost1) +facets_nodes () { + local facets=$1 + local nodes local NODES_sort - for num in `seq $OSTCOUNT`; do - local myOST=$(facet_host ost$num) - OSTNODES="$OSTNODES $myOST" + for facet in ${facets//,/ }; do + if [ "$FAILURE_MODE" = HARD ]; then + nodes="$nodes $(facet_active_host $facet)" + else + nodes="$nodes $(facet_host $facet)" + fi done - NODES_sort=$(for i in $OSTNODES; do echo $i; done | sort -u) + NODES_sort=$(for i in $nodes; do echo $i; done | sort -u) echo $NODES_sort } +osts_nodes () { + local facets=$(get_facets OST) + local nodes=$(facets_nodes $facets) + + echo $nodes +} + nodes_list () { # FIXME. 
We need a list of clients local myNODES=$HOSTNAME @@ -3049,7 +3208,7 @@ nodes_list () { [ -n "$CLIENTS" ] && myNODES=${CLIENTS//,/ } if [ "$PDSH" -a "$PDSH" != "no_dsh" ]; then - myNODES="$myNODES $(osts_nodes) $(mdts_nodes)" + myNODES="$myNODES $(facets_nodes $(get_facets))" fi myNODES_sort=$(for i in $myNODES; do echo $i; done | sort -u) @@ -3409,7 +3568,7 @@ convert_facet2label() { get_clientosc_proc_path() { local ost=$1 - echo "{$1}-osc-*" + echo "${1}-osc-*" } get_lustre_version () { @@ -3476,33 +3635,37 @@ get_osc_import_name() { return 0 } -wait_import_state () { +_wait_import_state () { local expected=$1 local CONN_PROC=$2 + local maxtime=${3:-max_recovery_time} local CONN_STATE local i=0 CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2) while [ "${CONN_STATE}" != "${expected}" ]; do - if [ "${expected}" == "DISCONN" ]; then - # for disconn we can check after proc entry is removed - [ "x${CONN_STATE}" == "x" ] && return 0 - # with AT we can have connect request timeout ~ reconnect timeout - # and test can't see real disconnect - [ "${CONN_STATE}" == "CONNECTING" ] && return 0 - fi - # disconnect rpc should be wait not more obd_timeout - [ $i -ge $(($TIMEOUT * 3 / 2)) ] && \ - error "can't put import for $CONN_PROC into ${expected} state" && return 1 + [ $i -ge $maxtime ] && \ + error "can't put import for $CONN_PROC into ${expected} state after $i sec, have ${CONN_STATE}" && \ + return 1 sleep 1 CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2) i=$(($i + 1)) done - log "$CONN_PROC now in ${CONN_STATE} state" + log "$CONN_PROC in ${CONN_STATE} state after $i sec" return 0 } +wait_import_state() { + local state=$1 + local params=$2 + local maxtime=${3:-max_recovery_time} + local param + + for param in ${params//,/ }; do + _wait_import_state $state $param $maxtime || return + done +} wait_osc_import_state() { local facet=$1 local ost_facet=$2 @@ -3550,8 +3713,14 @@ wait_clients_import_state () { local list=$1 local facet=$2 local expected=$3 - shift + local facets=$facet + + if [ "$FAILURE_MODE" = HARD ]; then + facets=$(facets_on_host $(facet_active_host $facet)) + fi + + for facet in ${facets//,/ }; do local label=$(convert_facet2label $facet) local proc_path case $facet in @@ -3559,8 +3728,10 @@ wait_clients_import_state () { mds* ) proc_path="mdc.$(get_clientmdc_proc_path $label).mds_server_uuid" ;; *) error "unknown facet!" ;; esac + local params=$(expand_list $params $proc_path) + done - if ! do_rpc_nodes $list wait_import_state $expected $proc_path; then + if ! do_rpc_nodes $list wait_import_state $expected $params; then error "import is not in ${expected} state" return 1 fi @@ -3683,11 +3854,6 @@ gather_logs () { local list=$1 local ts=$(date +%s) - - # bug 20237, comment 11 - # It would also be useful to provide the option - # of writing the file to an NFS directory so it doesn't need to be copied. - local tmp=$TMP local docp=true [ -f $LOGDIR/shared ] && docp=false @@ -3993,16 +4159,11 @@ wait_flavor() local res=0 for ((i=0;i<20;i++)); do - echo -n "checking..." + echo -n "checking $dir..." 
res=$(do_check_flavor $dir $flavor) - if [ $res -eq $expect ]; then - echo "found $res $flavor connections of $dir, OK" - return 0 - else - echo "found $res $flavor connections of $dir, not ready ($expect)" - return 0 - sleep 4 - fi + echo "found $res/$expect $flavor connections" + [ $res -eq $expect ] && return 0 + sleep 4 done echo "Error checking $flavor of $dir: expect $expect, actual $res" @@ -4023,7 +4184,7 @@ restore_to_default_flavor() for rule in `do_facet mgs lctl get_param -n $proc 2>/dev/null | grep ".srpc.flavor."`; do echo "remove rule: $rule" spec=`echo $rule | awk -F = '{print $1}'` - do_facet mgs "$LCTL conf_param $spec=" + do_facet mgs "$LCTL conf_param -d $spec" done fi @@ -4188,3 +4349,18 @@ duplicate_mdt_files() { done do_umount } + +run_sgpdd () { + local devs=${1//,/ } + shift + local params=$@ + local rslt=$TMP/sgpdd_survey + + # sgpdd-survey cleanups ${rslt}.* files + + local cmd="rslt=$rslt $params scsidevs=\"$devs\" $SGPDDSURVEY" + echo + $cmd + eval $cmd + cat ${rslt}.detail +} +
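
Usage sketch for the helpers introduced above (get_facets with a type
argument, list_member, facets_on_host, and the run_sgpdd wrapper). This is
illustrative only: the sourcing boilerplate follows the usual lustre/tests
script convention, and the device names and sgpdd-survey settings are
assumptions, not values taken from this change.

    #!/bin/bash
    # assumed boilerplate, as in the other lustre/tests scripts
    LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
    . $LUSTRE/tests/test-framework.sh
    init_test_env $@
    . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}

    # get_facets now takes an optional type list (default "OST MDS MGS")
    facets=$(get_facets "OST MDS")
    echo "configured facets: $facets"

    # list_member: membership test on a comma-separated list
    list_member $facets ost1 && echo "ost1 is configured"

    # facets_on_host: every facet whose active host is the given node
    host=$(facet_active_host ost1)
    echo "facets active on $host: $(facets_on_host $host)"

    # run_sgpdd: drive sgpdd-survey over a comma-separated device list;
    # the remaining arguments are passed through as environment settings.
    # /dev/sdb,/dev/sdc and the size/crghi/thrhi values are placeholders.
    run_sgpdd /dev/sdb,/dev/sdc size=1024 crghi=2 thrhi=16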