From 8d8f071648d64f8a16591481064e13f590629caa Mon Sep 17 00:00:00 2001 From: Jian Yu Date: Thu, 29 Aug 2013 23:23:56 +0800 Subject: [PATCH] LU-3561 tests: improve sanity-hsm.sh to support remote agent In the current sanity-hsm.sh, only one HSM copytool is started on the local Lustre client node. In order to test multiple remote HSM agents/copytools, this patch introduces a new facet type named "AGT" and the following associated variables: - AGTCOUNT: number of agents - AGTDEV{N}: target HSM mount point (root path of the backend) - agt{N}_HOST: hostname of the agent agt{N} - SINGLEAGT: facet of the single agent By default, only a single copytool is started on a remote client node. If there is no remote client, then the copytool will be started on the local client. Signed-off-by: Jian Yu Change-Id: I6e85d6948da894bd50148b0504347159ae23a79b Reviewed-on: http://review.whamcloud.com/7110 Tested-by: Hudson Reviewed-by: jacques-Charles Lafoucriere Tested-by: Maloo Reviewed-by: Jinshan Xiong Reviewed-by: Oleg Drokin --- lustre/tests/cfg/local.sh | 13 ++ lustre/tests/sanity-hsm.sh | 302 +++++++++++++++++++++++++---------------- lustre/tests/test-framework.sh | 63 ++++++--- 3 files changed, 243 insertions(+), 135 deletions(-) diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index 438c99f..13f06a3 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -125,3 +125,16 @@ MACHINEFILE_OPTION=${MACHINEFILE_OPTION:-"-machinefile"} # running the tests, or in some cases between the servers (e.g. lfsck.sh). # It needs to be a non-lustre filesystem that is available on all the nodes. 
SHARED_DIRECTORY=${SHARED_DIRECTORY:-$TMP} # bug 17839 comment 65 + +# +# In order to test multiple remote HSM agents, a new facet type named "AGT" and +# the following associated variables are added: +# +# AGTCOUNT: number of agents +# AGTDEV{N}: target HSM mount point (root path of the backend) +# agt{N}_HOST: hostname of the agent agt{N} +# SINGLEAGT: facet of the single agent +# +# Please refer to init_agt_vars() in sanity-hsm.sh for the default values of +# these variables. +# diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 62164fb..7807177 100644 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -30,7 +30,7 @@ MCREATE=${MCREATE:-mcreate} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false -if [ $MDSCOUNT -ge 2 ]; then +if [[ $MDSCOUNT -ge 2 ]]; then skip_env "Only run with single MDT for now" && exit fi @@ -41,77 +41,138 @@ if [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.53) ]]; then fi # $RUNAS_ID may get set incorrectly somewhere else -[ $UID -eq 0 -a $RUNAS_ID -eq 0 ] && - error "\$RUNAS_ID set to 0, but \$UID is also 0!" - +if [[ $UID -eq 0 && $RUNAS_ID -eq 0 ]]; then + skip_env "\$RUNAS_ID set to 0, but \$UID is also 0!" && exit +fi check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS build_test_filter -# the standard state when starting a test is -# - no copytool -# - MOUNT2 done -# as some test changes the default, we need to re-make it -cleanup() { - copytool_cleanup - if ! is_mounted $MOUNT2 - then - mount_client $MOUNT2 +# +# In order to test multiple remote HSM agents, a new facet type named "AGT" and +# the following associated variables are added: +# +# AGTCOUNT: number of agents +# AGTDEV{N}: target HSM mount point (root path of the backend) +# agt{N}_HOST: hostname of the agent agt{N} +# SINGLEAGT: facet of the single agent +# +# The number of agents is initialized as the number of remote client nodes. +# By default, only single copytool is started on a remote client/agent. 
If there +# was no remote client, then the copytool will be started on the local client. +# +init_agt_vars() { + local n + local agent + + export AGTCOUNT=${AGTCOUNT:-$((CLIENTCOUNT - 1))} + [[ $AGTCOUNT -gt 0 ]] || AGTCOUNT=1 + + export SHARED_DIRECTORY=${SHARED_DIRECTORY:-$TMP} + if [[ $CLIENTCOUNT -gt 1 ]] && + ! check_shared_dir $SHARED_DIRECTORY $CLIENTS; then + skip_env "SHARED_DIRECTORY should be accessible"\ + "on all client nodes" + exit 0 fi - changelog_cleanup + + for n in $(seq $AGTCOUNT); do + eval export AGTDEV$n=\$\{AGTDEV$n:-"$SHARED_DIRECTORY/arc$n"\} + agent=CLIENT$((n + 1)) + if [[ -z "${!agent}" ]]; then + [[ $CLIENTCOUNT -eq 1 ]] && agent=CLIENT1 || + agent=CLIENT2 + fi + eval export agt${n}_HOST=\$\{agt${n}_HOST:-${!agent}\} + done + + export SINGLEAGT=${SINGLEAGT:-agt1} + + export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"} + export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""} + export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ") + HSM_ARCHIVE=$(copytool_device $SINGLEAGT) + HSM_ARCHIVE_NUMBER=2 + + MDT_PARAM="mdt.$FSNAME-MDT0000" + HSM_PARAM="$MDT_PARAM.hsm" + + # archive is purged at copytool setup + HSM_ARCHIVE_PURGE=true } -export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"} -export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""} -export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ") -HSM_ARCHIVE=${HSM_ARCHIVE:-$TMP/arc} -HSM_ARCHIVE_NUMBER=2 +# Get the backend root path for the given agent facet. +copytool_device() { + local facet=$1 + local dev=AGTDEV$(facet_number $facet) -MDT_PARAM="mdt.$FSNAME-MDT0000" -HSM_PARAM="$MDT_PARAM.hsm" + echo -n ${!dev} +} -# archive is purged at copytool setup -HSM_ARCHIVE_PURGE=true +# Stop copytool and unregister an existing changelog user. 
+cleanup() { + copytool_cleanup + changelog_cleanup +} search_and_kill_copytool() { - echo "Killing existing copy tools" - killall -q $HSMTOOL_BASE || true + local agents=${1:-$(facet_active_host $SINGLEAGT)} + + echo "Killing existing copytools on $agents" + do_nodesv $agents "killall -q $HSMTOOL_BASE" || true } copytool_setup() { - if pkill -CONT -x $HSMTOOL_BASE; then - echo "Wakeup copytool" - return + local facet=${1:-$SINGLEAGT} + local lustre_mntpnt=${2:-$MOUNT} + local arc_id=$3 + local hsm_root=$(copytool_device $facet) + local agent=$(facet_active_host $facet) + + if [[ -z "$arc_id" ]] && + do_facet $facet "pkill -CONT -x $HSMTOOL_BASE"; then + echo "Wakeup copytool $facet on $agent" + return 0 fi if $HSM_ARCHIVE_PURGE; then - echo "Purging archive" - rm -rf $HSM_ARCHIVE/* + echo "Purging archive on $agent" + do_facet $facet "rm -rf $hsm_root/*" fi - echo "Starting copytool" - mkdir -p $HSM_ARCHIVE + echo "Starting copytool $facet on $agent" + do_facet $facet "mkdir -p $hsm_root" || error "mkdir '$hsm_root' failed" # bandwidth is limited to 1MB/s so the copy time is known and # independent of hardware - local CMD="$HSMTOOL $HSMTOOL_VERBOSE --hsm-root $HSM_ARCHIVE" - CMD=$CMD" --daemon --bandwidth 1 $MOUNT" - [[ -z "$1" ]] || CMD+=" --archive $1" - - echo "$CMD" - $CMD & + local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root $hsm_root" + [[ -z "$arc_id" ]] || cmd+=" --archive $arc_id" + cmd+=" --bandwidth 1 $lustre_mntpnt" + + # Redirect the standard output and error to a log file which + # can be uploaded to Maloo. 
+ local prefix=$TESTLOG_PREFIX + [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME + local copytool_log=$prefix.copytool${arc_id}_log.$agent.log + + do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" || + error "start copytool $facet on $agent failed" trap cleanup EXIT } copytool_cleanup() { trap - EXIT - pkill -INT -x $HSMTOOL_BASE || return 0 + local agents=${1:-$(facet_active_host $SINGLEAGT)} + + do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0 sleep 1 - echo "Copytool is stopped" + echo "Copytool is stopped on $agents" } copytool_suspend() { - pkill -STOP -x $HSMTOOL_BASE || return 0 - echo "Copytool is suspended" + local agents=${1:-$(facet_active_host $SINGLEAGT)} + + do_nodesv $agents "pkill -STOP -x $HSMTOOL_BASE" || return 0 + echo "Copytool is suspended on $agents" } copytool_remove_backend() { @@ -122,8 +183,10 @@ copytool_remove_backend() { } import_file() { - $HSMTOOL --archive $HSM_ARCHIVE_NUMBER --hsm-root $HSM_ARCHIVE \ - --import $1 $2 $MOUNT || error "import of $1 to $2 failed" + do_facet $SINGLEAGT \ + "$HSMTOOL --archive $HSM_ARCHIVE_NUMBER --hsm-root $HSM_ARCHIVE\ + --import $1 $2 $MOUNT" || + error "import of $1 to $2 failed" } make_archive() { @@ -368,55 +431,55 @@ wait_result() { wait_update --verbose $(facet_active_host $facet) "$@" } -wait_request_state() -{ +wait_request_state() { local fid=$1 local request=$2 local state=$3 - wait_result $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |\ - grep $fid | grep action=$request |\ - cut -f 13 -d ' ' | cut -f 2 -d =" $state 100 || + + local cmd="$LCTL get_param -n $HSM_PARAM.agent_actions" + cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" + + wait_result $SINGLEMDS "$cmd" $state 100 || error "request on $fid is not $state" } -get_request_state() -{ +get_request_state() { local fid=$1 local request=$2 - do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |\ - grep $fid | grep action=$request |\ - cut -f 13 -d ' ' | cut -f 2 -d =" + + 
do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |"\ + "awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" } -get_request_count() -{ +get_request_count() { local fid=$1 local request=$2 - do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |\ - grep $fid | grep action=$request | wc -l" + + do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |"\ + "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'" } -wait_all_done() -{ +wait_all_done() { local timeout=$1 - wait_result $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |\ - egrep 'WAITING|STARTED' " "" $timeout || - error "requests did not complete" + + local cmd="$LCTL get_param -n $HSM_PARAM.agent_actions" + cmd+=" | egrep 'WAITING|STARTED'" + + wait_result $SINGLEMDS "$cmd" "" $timeout || + error "requests did not complete" } -wait_for_grace_delay() -{ +wait_for_grace_delay() { local val=$(get_hsm_param grace_delay) sleep $val } -my_uuid() { - $LCTL get_param -n llite.$FSNAME-*.uuid -} - MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | awk '{gsub(/_UUID/,""); print $1}' | head -1) +# initiate variables +init_agt_vars + # cleanup from previous bad setup search_and_kill_copytool @@ -583,9 +646,9 @@ test_9() { local f=$DIR/$tdir/$tfile local fid=$(copy_file /etc/passwd $f) # we do not use the default one to be sure - local new_an=$((HSM_ARCHIVE_NUMBER+ 1)) + local new_an=$((HSM_ARCHIVE_NUMBER + 1)) copytool_cleanup - copytool_setup $new_an + copytool_setup $SINGLEAGT $MOUNT $new_an $LFS hsm_archive --archive $new_an $f wait_request_state $fid ARCHIVE SUCCEED @@ -595,6 +658,38 @@ test_9() { } run_test 9 "Use of explict archive number, with dedicated copytool" +test_9a() { + [[ $CLIENTCOUNT -ge 3 ]] || + { skip "Need three or more clients"; return 0; } + + local n + local file + local fid + + copytool_cleanup $(comma_list $(agts_nodes)) + + # start all of the copytools + for n in $(seq $AGTCOUNT); do + copytool_setup agt$n + done + + trap 
"copytool_cleanup $(comma_list $(agts_nodes))" EXIT + # archive files + mkdir -p $DIR/$tdir + for n in $(seq $AGTCOUNT); do + file=$DIR/$tdir/$tfile.$n + fid=$(make_small $file) + + $LFS hsm_archive $file || error "could not archive file $file" + wait_request_state $fid ARCHIVE SUCCEED + check_hsm_flags $file "0x00000001" + done + + trap - EXIT + copytool_cleanup $(comma_list $(agts_nodes)) +} +run_test 9a "Multiple remote agents" + test_10a() { # test needs a running copytool copytool_setup @@ -976,9 +1071,10 @@ test_14() { # rebind the archive to the newly created file echo "rebind $fid to $fid2" - $HSMTOOL --archive $HSM_ARCHIVE_NUMBER --hsm-root="$HSM_ARCHIVE"\ - --rebind $fid $fid2 $DIR || - error "could not rebind file" + + do_facet $SINGLEAGT \ + "$HSMTOOL --archive $HSM_ARCHIVE_NUMBER --hsm-root $HSM_ARCHIVE\ + --rebind $fid $fid2 $DIR" || error "could not rebind file" # restore file and compare md5sum local sum2=$(md5sum $f | awk '{print $1}') @@ -997,7 +1093,7 @@ test_15() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local count=5 - local tmpfile=$TMP/tmp.$$ + local tmpfile=$SHARED_DIRECTORY/tmp.$$ local fids=() local sums=() @@ -1026,9 +1122,9 @@ test_15() { [[ $nl == $count ]] || error "$nl files in list, $count expected" echo "rebind list of files" - $HSMTOOL --archive $HSM_ARCHIVE_NUMBER --hsm-root="$HSM_ARCHIVE"\ - --rebind $tmpfile $DIR || - error "could not rebind file list" + do_facet $SINGLEAGT \ + "$HSMTOOL --archive $HSM_ARCHIVE_NUMBER --hsm-root $HSM_ARCHIVE\ + --rebind $tmpfile $DIR" || error "could not rebind file list" # restore files and compare md5sum for i in $(seq 1 $count); do @@ -1425,8 +1521,7 @@ test_30b() { } run_test 30b "Restore at exec (release case)" -restore_and_check_size() -{ +restore_and_check_size() { local f=$1 local fid=$2 local s=$(stat -c "%s" $f) @@ -1709,9 +1804,6 @@ test_52() { # test needs a running copytool copytool_setup - # Test behave badly if 2 mount points are present - umount_client $MOUNT2 - mkdir -p 
$DIR/$tdir local f=$DIR/$tdir/$tfile local fid=$(copy_file /etc/motd $f 1) @@ -1731,9 +1823,6 @@ test_52() { check_hsm_flags $f "0x0000000b" - # Restore test environment - mount_client $MOUNT2 - copytool_cleanup } run_test 52 "Opened for write file on an evicted client should be set dirty" @@ -1742,9 +1831,6 @@ test_53() { # test needs a running copytool copytool_setup - # Checks are wrong with 2 mount points - umount_client $MOUNT2 - mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid=$(copy_file /etc/motd $f 1) @@ -1765,8 +1851,6 @@ test_53() { check_hsm_flags $f "0x00000009" - mount_client $MOUNT2 - copytool_cleanup } run_test 53 "Opened for read file on an evicted client should not be set dirty" @@ -2098,13 +2182,11 @@ test_105() { run_test 105 "Restart of coordinator" test_106() { - # Test behave badly if 2 mount points are present - umount_client $MOUNT2 - # test needs a running copytool copytool_setup - local uuid=$(my_uuid) + local uuid=$(do_rpc_nodes $(facet_active_host $SINGLEAGT) \ + get_client_uuid | cut -d' ' -f2) local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | grep $uuid) copytool_cleanup @@ -2121,9 +2203,6 @@ test_106() { [[ ! 
-z "$agent" ]] || error "My uuid $uuid not found in agent list after"\ " copytool restart" - - # Restore test environment - mount_client $MOUNT2 } run_test 106 "Copytool register/unregister" @@ -2392,8 +2471,7 @@ test_221() { local target=0x7d [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 221 "Changelog for archive canceled" @@ -2417,8 +2495,7 @@ test_222a() { local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 222a "Changelog for explicit restore" @@ -2444,8 +2521,7 @@ test_222b() { local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 222b "Changelog for implicit restore" @@ -2475,8 +2551,7 @@ test_223a() { [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 223a "Changelog for restore canceled (import case)" @@ -2505,8 +2580,7 @@ test_223b() { [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 223b "Changelog for restore canceled (release case)" @@ -2532,8 +2606,7 @@ test_224() { [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 224 "Changelog for remove" @@ -2571,8 +2644,7 @@ test_225() { [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 225 "Changelog for remove canceled" @@ -2612,8 +2684,7 @@ test_226() { [[ $flags == $target ]] || error "Changelog flag is $flags not $target" - changelog_cleanup - copytool_cleanup + cleanup } run_test 226 "changelog for last rm/mv with exiting archive" @@ -2670,8 +2741,7 @@ test_227() { wait_request_state $fid ARCHIVE SUCCEED 
check_flags_changes $f $fid lost 3 1 - changelog_cleanup - copytool_cleanup + cleanup } run_test 227 "changelog when explicit setting of HSM flags" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index a643244..f1d54fb 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2334,14 +2334,32 @@ replay_barrier_nosync() { $LCTL mark "local REPLAY BARRIER on ${!svc}" } +# +# Get Lustre client uuid for a given Lustre mount point. +# +get_client_uuid() { + local mntpnt=${1:-$MOUNT} + + local name=$($LFS getname $mntpnt | cut -d' ' -f1) + local uuid=$($LCTL get_param -n llite.$name.uuid) + + echo -n $uuid +} + mds_evict_client() { - UUID=`lctl get_param -n mdc.${mds1_svc}-mdc-*.uuid` - do_facet mds1 "lctl set_param -n mdt.${mds1_svc}.evict_client $UUID" + local mntpnt=${1:-$MOUNT} + local uuid=$(get_client_uuid $mntpnt) + + do_facet $SINGLEMDS \ + "$LCTL set_param -n mdt.${mds1_svc}.evict_client $uuid" } ost_evict_client() { - UUID=`lctl get_param -n devices| grep ${ost1_svc}-osc- | egrep -v 'MDT' | awk '{print $5}'` - do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.evict_client $UUID" + local mntpnt=${1:-$MOUNT} + local uuid=$(get_client_uuid $mntpnt) + + do_facet ost1 \ + "$LCTL set_param -n obdfilter.${ost1_svc}.evict_client $uuid" } fail() { @@ -3726,10 +3744,11 @@ run_e2fsck() { # verify a directory is shared among nodes. 
check_shared_dir() { local dir=$1 + local list=${2:-$(comma_list $(nodes_list))} [ -z "$dir" ] && return 1 - do_rpc_nodes "$(comma_list $(nodes_list))" check_logdir $dir - check_write_access $dir || return 1 + do_rpc_nodes "$list" check_logdir $dir + check_write_access $dir "$list" || return 1 return 0 } @@ -3964,7 +3983,7 @@ get_facets () { case $type in MGS ) list="$list $name";; - MDS|OST ) local count=${type}COUNT + MDS|OST|AGT ) local count=${type}COUNT for ((i=1; i<=${!count}; i++)) do list="$list ${name}$i" done;; @@ -4755,6 +4774,11 @@ osts_nodes () { echo -n $(facets_nodes $(get_facets OST)) } +# Get all of the active AGT (HSM agent) nodes. +agts_nodes () { + echo -n $(facets_nodes $(get_facets AGT)) +} + # Get all of the client nodes and active server nodes. nodes_list () { local nodes=$HOSTNAME @@ -6003,19 +6027,20 @@ check_logdir() { } check_write_access() { - local dir=$1 - local node - local file + local dir=$1 + local list=${2:-$(comma_list $(nodes_list))} + local node + local file - for node in $(nodes_list); do - file=$dir/check_file.$(short_hostname $node) - if [[ ! -f "$file" ]]; then - # Logdir not accessible/writable from this node. - return 1 - fi - rm -f $file || return 1 - done - return 0 + for node in ${list//,/ }; do + file=$dir/check_file.$(short_hostname $node) + if [[ ! -f "$file" ]]; then + # Logdir not accessible/writable from this node. + return 1 + fi + rm -f $file || return 1 + done + return 0 } init_logging() { -- 1.8.3.1