From 143063e4576ce4c3a17564337bcdfc76f872cf34 Mon Sep 17 00:00:00 2001
From: Elena Gryaznova
Date: Mon, 1 Nov 2010 19:00:12 +0300
Subject: [PATCH] b=23869 HARD failure mode fixes

i=Brian.Murrel
i=Wei.G.Li

* facet_failover() has to restart only those affected facets
  which were UP before the node failure
* replay-single tests which use shutdown_facet() && reboot_facet() instead of
  facet_failover() have to take care about the affected facets
---
 Review note (not part of the commit message): compared with the diff as
 originally posted, facet_up() now matches the whole device name
 (grep -x -F, quoted label), and facets_up_on_host() declares its loop
 variable local and tests facet_up's exit status directly instead of
 executing a command substitution. Added/removed line counts are unchanged.
 The line structure of this patch was restored from the hunk counts; the
 4-space indentation inside the hunks is an assumption - confirm against
 the tree before applying (or apply with whitespace-insensitive matching).

 lustre/tests/replay-single.sh  | 17 +++++++++-----
 lustre/tests/test-framework.sh | 53 +++++++++++++++++++++++++++++++++++------
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 5ae42c3..b6f8d2d 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -2210,18 +2210,23 @@ test_88() { #bug 17485
     next_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
     echo "before recovery: last_id = $last_id2, next_id = $next_id2"
 
+    # if test uses shutdown_facet && reboot_facet instead of facet_failover ()
+    # it has to take care about the affected facets, bug20407
+    local affected_mds1=$(affected_facets mds1)
+    local affected_ost1=$(affected_facets ost1)
+
     shutdown_facet $SINGLEMDS
     shutdown_facet ost1
 
     reboot_facet $SINGLEMDS
-    change_active $SINGLEMDS
-    wait_for_facet $SINGLEMDS
-    mount_facet $SINGLEMDS || error "Restart of mds failed"
+    change_active $affected_mds1
+    wait_for_facet $affected_mds1
+    mount_facets $affected_mds1 || error "Restart of mds failed"
 
     reboot_facet ost1
-    change_active ost1
-    wait_for_facet ost1
-    mount_facet ost1 || error "Restart of ost1 failed"
+    change_active $affected_ost1
+    wait_for_facet $affected_ost1
+    mount_facets $affected_ost1 || error "Restart of ost1 failed"
 
     clients_up
 
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index c069043..e56b8a0 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -537,6 +537,15 @@ ostdevlabel() {
 }
 
 # Facet functions
+mount_facets () { # mount each facet of comma list $1 (default: get_facets)
+    local facets=${1:-$(get_facets)}
+    local facet
+
+    for facet in ${facets//,/ }; do
+        mount_facet $facet || error "Restart of $facet failed!"
+    done
+}
+
 mount_facet() {
     local facet=$1
     shift
@@ -914,6 +923,28 @@ facets_on_host () {
     echo $(comma_list $affected)
 }
 
+facet_up () { # status 0 iff column 4 of "lctl dl" on $host is $facet's label
+    local facet=$1
+    local host=${2:-$(facet_host $facet)}
+    local label=$(convert_facet2label $facet)
+    # -x -F: a label that is only a substring of another name must not match
+    do_node $host lctl dl | awk '{print $4}' | grep -qxF -- "$label"
+}
+
+facets_up_on_host () { # comma list of $host's facets that facet_up reports up
+    local host=$1
+    local facets=$(facets_on_host $host)
+    local affected_up facet
+
+    for facet in ${facets//,/ }; do
+        if facet_up $facet $host; then
+            affected_up="$affected_up $facet"
+        fi
+    done
+
+    echo $(comma_list $affected_up)
+}
+
 shutdown_facet() {
     local facet=$1
 
@@ -1378,6 +1409,18 @@ client_reconnect() {
     rm $MOUNT/recon
 }
 
+affected_facets () { # $facet, or in HARD mode every facet up on its active host
+    local facet=$1
+
+    local host=$(facet_active_host $facet)
+    local affected=$facet
+
+    if [ "$FAILURE_MODE" = HARD ]; then
+        affected=$(facets_up_on_host $host)
+    fi
+    echo $affected
+}
+
 facet_failover() {
     local facet=$1
     local sleep_time=$2
@@ -1385,11 +1428,7 @@ facet_failover() {
 
     echo "Failing $facet on node $host"
 
-    local affected=$facet
-
-    if [ "$FAILURE_MODE" = HARD ]; then
-        affected=$(facets_on_host $host)
-    fi
+    local affected=$(affected_facets $facet)
 
     shutdown_facet $facet
 
@@ -1408,9 +1447,7 @@ facet_failover() {
     fi
     # FIXME; has to be changed to mount all facets concurrently
     affected=$(exclude_items_from_list $affected mgs)
-    for facet in ${affected//,/ }; do
-        mount_facet $facet || error "Restart of $facet on node $host failed!"
-    done
+    mount_facets $affected
 }
 
 obd_name() {
-- 
1.8.3.1