From 52828321b9100778e982bbe4b967732b9951d879 Mon Sep 17 00:00:00 2001 From: grev Date: Sat, 4 Apr 2009 02:05:17 +0000 Subject: [PATCH] b=18914 i=Brian check_client_load: try to connect several times to avoid ENFILE --- lustre/tests/test-framework.sh | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 84050a4..e9d8965 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -699,12 +699,36 @@ check_client_load () { local TESTLOAD=run_${!var}.sh ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1 + + # bug 18914: try to connect several times not only when + # check ps, but while check_catastrophe also + local tries=3 + local RC=254 + while [ $RC = 254 -a $tries -gt 0 ]; do + let tries=$tries-1 + # assume success + RC=0 + if ! check_catastrophe $client; then + RC=${PIPESTATUS[0]} + if [ $RC -eq 254 ]; then + # FIXME: not sure how long we should sleep here + sleep 10 + continue + fi + echo "check catastrophe failed: RC=$RC " + return $RC + fi + done - check_catastrophe $client || return 2 + # We can continue trying to connect if RC=254 + # Just print the warning about this + if [ $RC = 254 ]; then + echo "got a return status of $RC from do_node while checking catastrophe on $client" + fi # see if the load is still on the client - local tries=3 - local RC=254 + tries=3 + RC=254 while [ $RC = 254 -a $tries -gt 0 ]; do let tries=$tries-1 # assume success @@ -715,7 +739,7 @@ check_client_load () { fi done if [ $RC = 254 ]; then - echo "got a return status of $RC from do_node while checking (i.e. with 'ps') the client load on the remote system" + echo "got a return status of $RC from do_node while checking (catastrophe and 'ps') the client load on $client" # see if we can diagnose a bit why this is fi -- 1.8.3.1