From: grev Date: Sat, 4 Apr 2009 02:05:17 +0000 (+0000) Subject: b=18914 X-Git-Tag: v1_8_0_110~127 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=52828321b9100778e982bbe4b967732b9951d879;p=fs%2Flustre-release.git b=18914 i=Brian check_client_load: try to connect several times to avoid ENFILE --- diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 84050a4..e9d8965 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -699,12 +699,36 @@ check_client_load () { local TESTLOAD=run_${!var}.sh ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1 + + # bug 18914: try to connect several times not only when + # check ps, but while check_catastrophe also + local tries=3 + local RC=254 + while [ $RC = 254 -a $tries -gt 0 ]; do + let tries=$tries-1 + # assume success + RC=0 + if ! check_catastrophe $client; then + RC=${PIPESTATUS[0]} + if [ $RC -eq 254 ]; then + # FIXME: not sure how long we should sleep here + sleep 10 + continue + fi + echo "check catastrophe failed: RC=$RC " + return $RC + fi + done - check_catastrophe $client || return 2 + # We can continue try to connect if RC=254 + # Just print the warning about this + if [ $RC = 254 ]; then + echo "got a return status of $RC from do_node while checking catastrophe on $client" + fi # see if the load is still on the client - local tries=3 - local RC=254 + tries=3 + RC=254 while [ $RC = 254 -a $tries -gt 0 ]; do let tries=$tries-1 # assume success @@ -715,7 +739,7 @@ check_client_load () { fi done if [ $RC = 254 ]; then - echo "got a return status of $RC from do_node while checking (i.e. with 'ps') the client load on the remote system" + echo "got a return status of $RC from do_node while checking (catastrophe and 'ps') the client load on $client" # see if we can diagnose a bit why this is fi