Whamcloud - gitweb
A script for Lustre cluster network verification.
authoryujian <yujian>
Wed, 8 Feb 2006 10:29:10 +0000 (10:29 +0000)
committeryujian <yujian>
Wed, 8 Feb 2006 10:29:10 +0000 (10:29 +0000)
lustre/utils/verify_cluster_net.sh [new file with mode: 0755]

diff --git a/lustre/utils/verify_cluster_net.sh b/lustre/utils/verify_cluster_net.sh
new file mode 100755 (executable)
index 0000000..5a7927c
--- /dev/null
@@ -0,0 +1,296 @@
+#!/bin/bash
+#
+# verify_cluster_net.sh - script for Lustre cluster network verification
+#
+###############################################################################
+
+# Usage
+usage() {
+       cat >&2 <<EOF
+
+Usage: `basename $0` [-v] <csv file>
+
+       -v              verbose mode
+       csv file        a spreadsheet that contains configuration parameters 
+                       (separated by commas) for each target in a Lustre cl-
+                       uster, the first field of each line is the host name 
+                       of the cluster node
+
+EOF
+       exit 1
+}
+
+# Get and check the positional parameters
+while getopts "v" OPTION; do
+       case $OPTION in
+       v) 
+               VERBOSE_OPT=$"yes"
+               ;;
+        ?) 
+               usage 
+       esac
+done
+
+# Toss out the parameters we've already processed
+shift  `expr $OPTIND - 1`
+
+# Here we expect the csv file
+if [ $# -eq 0 ]; then
+       echo >&2 $"`basename $0`: Lack csv file!"
+       usage
+fi
+
+# Global variables
+CSV_FILE=$1
+declare -a HOST_NAMES
+declare -a HOST_IPADDRS
+
+# Output verbose informations
+verbose_output() {
+       if [ "${VERBOSE_OPT}" = "yes" ]; then
+               echo "`basename $0`: $*"
+       fi
+       return 0
+}
+
+# Check the csv file
+check_file() {
+       if [ ! -s ${CSV_FILE} ]; then
+                echo >&2 $"`basename $0`: check_file() error: ${CSV_FILE}" \
+                         "does not exist or is empty!"
+                return 1
+        fi
+
+        return 0
+}
+
+# Get the host names from the csv file
+get_hostnames() {
+       local NAME CHECK_STR
+       declare -i i
+
+       # Initialize the HOST_NAMES array
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               HOST_NAMES[i]=$""
+       done
+
+       CHECK_STR=`egrep -v "([[:space:]]|^)#" ${CSV_FILE} | awk -F, \
+                 '/[[:alnum:]]/{if ($1 !~/[[:alnum:]]/) print $0}'`
+       if [ -n "${CHECK_STR}" ]; then
+                echo >&2 $"`basename $0`: get_hostnames() error: Lack hostname"\
+                         "field in the line - ${CHECK_STR}"
+               return 1
+       fi
+
+       i=0
+       for NAME in `egrep -v "([[:space:]]|^)#" ${CSV_FILE}\
+                   | awk -F, '/[[:alnum:]]/{print $1}'`
+       do
+               HOST_NAMES[i]=${NAME}
+               i=$i+1
+       done
+
+       return 0
+}
+
+# Check whether the host name matches the name in the local /etc/hosts table
+# and whether the IP address according to the host name is correct
+local_check() {
+       # Check argument
+        if [ $# -ne 2 ]; then
+                echo >&2 $"`basename $0`: local_check() error: Lack argument" \
+                         "for function local_check()!"
+                return 1
+        fi
+
+       local RET_STR REAL_NAME
+
+       # Get the IP address according to the host name from /etc/hosts table
+       # of the current host 
+       HOST_IPADDRS[$2]=`egrep "[[:space:]]$1([[:space:]]|$)" /etc/hosts \
+                    | awk '{print $1}'`
+       if [ -z "${HOST_IPADDRS[$2]}" ]; then
+               echo >&2 "`basename $0`: local_cleck() error: $1 does not" \
+                        "exist in the local /etc/hosts table!"
+               return 1
+       fi
+
+       if [ ${#HOST_IPADDRS[$2]} -gt 15 ]; then
+               echo >&2 "`basename $0`: local_cleck() error: More than one" \
+                        "IP address line according to $1 in the local" \
+                        "/etc/hosts table!"
+               return 1
+       fi
+
+       # Execute pdsh command to get the real host name
+       RET_STR=`pdsh -w ${HOST_IPADDRS[$2]} hostname 2>&1`
+       if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]; then
+               echo >&2 "`basename $0`: local_cleck() error: pdsh error:" \
+                        "${RET_STR}"
+               return 1
+       fi
+
+       if [ -z "${RET_STR}" ]; then
+               echo >&2 "`basename $0`: local_cleck() error: pdsh error:" \
+                        "Nothing get from pdsh! Check the network connectivity"\
+                        "between the local host and ${HOST_IPADDRS[$2]}" \
+                        "or check the two hosts' rcmd module!"
+               return 1
+       fi
+
+       REAL_NAME=`echo ${RET_STR} | awk '{print $2}'`
+       if [ "$1" != "${REAL_NAME}" ]; then
+               echo >&2 "`basename $0`: local_cleck() error: The real hostname"\
+                        "according to ${HOST_IPADDRS[$2]} is ${REAL_NAME}," \
+                        "not $1! Check the local /etc/hosts table!"
+               return 1
+       fi
+
+       return 0
+}
+
+# Check whether the correct host name and IP address pair matches 
+# the one in the remote /etc/hosts tables
+remote_check() {
+       # Check argument
+        if [ $# -ne 2 ]; then
+                echo >&2 $"`basename $0`: remote_check() error: Lack argument"\
+                         "for function remote_check()!"
+                return 1
+        fi
+
+       declare -i i
+       local RET_STR COMMAND IP_ADDR
+
+       COMMAND=$"egrep \"[[:space:]]$1([[:space:]]|$)\" /etc/hosts"
+
+       # Execute pdsh command to check remote /etc/hosts tables
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               RET_STR=`pdsh -w ${HOST_NAMES[i]} ${COMMAND} 2>&1`
+               if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]
+               then
+                       echo >&2 "`basename $0`: remote_check() error:" \
+                                "pdsh error: ${RET_STR}"
+                       return 1
+               fi
+
+               IP_ADDR=`echo ${RET_STR} | awk '{print $2}'`
+               if [ -z "${IP_ADDR}" ]; then
+                       echo >&2 "`basename $0`: remote_check() error:" \
+                                "$1 does not exist in the ${HOST_NAMES[i]}'s"\
+                                "/etc/hosts table!"
+                       return 1
+               fi
+
+               if [ "${IP_ADDR}" != "${HOST_IPADDRS[$2]}" ]; then
+                       echo >&2 "`basename $0`: remote_check() error:" \
+                                "IP address ${IP_ADDR} of $1 in the" \
+                                "${HOST_NAMES[i]}'s /etc/hosts is incorrect!"
+                       return 1
+               fi
+               
+       done
+
+       return 0
+}
+
+# Verify forward and reverse network connectivity of the Lustre cluster
+network_check () {
+       # Check argument
+        if [ $# -eq 0 ]; then
+                echo >&2 $"`basename $0`: network_check() error: Lack argument" \
+                         "for function network_check()!"
+                return 1
+        fi
+
+       declare -i i
+       local RET_STR COMMAND REAL_NAME
+
+       # Execute pdsh command to check network connectivity
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               COMMAND=$"pdsh -w ${HOST_NAMES[i]} hostname"
+               RET_STR=`pdsh -w $1 ${COMMAND} 2>&1`
+               if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]
+               then
+                       echo >&2 "`basename $0`: network_check() error:" \
+                                "pdsh error: ${RET_STR}"
+                       return 1
+               fi
+
+               if [ -z "${RET_STR}" ]; then
+                       echo >&2 "`basename $0`: network_check() error:" \
+                                "pdsh error: Nothing get from pdsh! Check" \
+                                "the network connectivity between $1 and" \
+                                "${HOST_NAMES[i]} or the two hosts' rcmd module!"
+                       return 1
+               fi
+
+               REAL_NAME=`echo ${RET_STR} | awk '{print $3}'`
+               if [ "${HOST_NAMES[i]}" != "${REAL_NAME}" ]; then
+                       echo >&2 "`basename $0`: network_check() error:" \
+                                "${RET_STR}"
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# Verify forward and reverse network connectivity of the Lustre cluster,
+# and that hostnames match the names in the /etc/hosts tables.
+network_verify() {
+       declare -i i
+
+       # Initialize the HOST_IPADDRS array
+       for ((i = 0; i < ${#HOST_IPADDRS[@]}; i++)); do
+               HOST_IPADDRS[i]=$""
+       done
+
+       # Get all the host names from the csv file
+       if ! get_hostnames; then
+               return 1
+       fi
+
+       # Check whether all the host names match the names in 
+       # all the /etc/hosts tables of the Lustre cluster
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               verbose_output "Verifying IP address of host" \
+                              "${HOST_NAMES[i]} in the local /etc/hosts..."
+               if ! local_check ${HOST_NAMES[i]} $i; then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               verbose_output "Verifying IP address of host" \
+                              "${HOST_NAMES[i]} in the remote /etc/hosts..."
+               if ! remote_check ${HOST_NAMES[i]} $i; then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       # Verify network connectivity of the Lustre cluster
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               verbose_output "Verifying network connectivity of host" \
+                              "${HOST_NAMES[i]} to other hosts..."
+               if ! network_check ${HOST_NAMES[i]}; then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       return 0
+}
+
+# Main flow
+if ! check_file; then
+       exit 1  
+fi
+
+if ! network_verify; then
+       exit 1  
+fi
+
+exit 0