From a3d55ba0f1e921a1e5dae3489bd4c390acfab434 Mon Sep 17 00:00:00 2001 From: Elena Gryaznova Date: Fri, 26 Nov 2010 20:43:09 +0300 Subject: [PATCH] b=23869 HARD failure mode fixes i=Brian.Murrel i=Wei.G.Li facet_failover() has to restart only those affected facets which were UP before the node failure replay-single tests which use shutdown_facet() && reboot_facet() instead of facet_failover() have to take care about the affected facets --- lustre/tests/replay-single.sh | 17 ++++++++----- lustre/tests/test-framework.sh | 55 +++++++++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 2e53fc5..1638c4c 100644 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -2212,18 +2212,23 @@ test_87() { #bug 17485 next_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) echo "before recovery: last_id = $last_id2, next_id = $next_id2" + # if test uses shutdown_facet && reboot_facet instead of facet_failover () + # it has to take care about the affected facets, bug20407 + local affected_mds=$(affected_facets mds) + local affected_ost1=$(affected_facets ost1) + shutdown_facet mds shutdown_facet ost1 reboot_facet mds - change_active mds - wait_for_facet mds - mount_facet mds || error "Restart of mds failed" + change_active $affected_mds + wait_for_facet $affected_mds + mount_facets $affected_mds || error "Restart of mds failed" reboot_facet ost1 - change_active ost1 - wait_for_facet ost1 - mount_facet ost1 || error "Restart of ost1 failed" + change_active $affected_ost1 + wait_for_facet $affected_ost1 + mount_facets $affected_ost1 || error "Restart of ost1 failed" clients_up diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index e56e637..96557f3 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -367,6 +367,15 @@ unload_modules() { } # Facet functions +mount_facets () { + local facets=${1:-$(get_facets)} + local facet + + for facet in ${facets//,/ }; do + mount_facet $facet || error "Restart of $facet failed!" + done +} + mount_facet() { local facet=$1 shift @@ -760,6 +769,28 @@ facets_on_host () { echo $(comma_list $affected) } +facet_up () { + local facet=$1 + local host=${2:-$(facet_host $facet)} + + local label=$(convert_facet2label $facet) + do_node $host lctl dl | awk '{print $4}' | grep -q $label +} + +facets_up_on_host () { + local host=$1 + local facets=$(facets_on_host $host) + local affected_up + + for facet in ${facets//,/ }; do + if $(facet_up $facet $host); then + affected_up="$affected_up $facet" + fi + done + + echo $(comma_list $affected_up) +} + shutdown_facet() { local facet=$1 @@ -1225,6 +1256,18 @@ client_reconnect() { rm $MOUNT/recon } +affected_facets () { + local facet=$1 + + local host=$(facet_active_host $facet) + local affected=$facet + + if [ "$FAILURE_MODE" = HARD ]; then + affected=$(facets_up_on_host $host) + fi + echo $affected +} + facet_failover() { local facet=$1 local sleep_time=$2 @@ -1232,11 +1275,7 @@ facet_failover() { echo "Failing $facet on node $host" - local affected=$facet - - if [ "$FAILURE_MODE" = HARD ]; then - affected=$(facets_on_host $host) - fi + local affected=$(affected_facets $facet) shutdown_facet $facet @@ -1257,11 +1296,9 @@ facet_failover() { if ! combined_mgs_mds && list_member $affected mgs; then mount_facet mgs || error "Restart of mgs failed" fi - # FIXME; has to be changed to mount all facets concurrently + # FIXME; has to be changed to mount all facets concurrently affected=$(exclude_items_from_list $affected mgs) - for facet in ${affected//,/ }; do - mount_facet $facet || error "Restart of $facet on node $host failed!" - done + mount_facets $affected } obd_name() { -- 1.8.3.1