Whamcloud - gitweb
b=10647
author yujian <yujian>
Thu, 20 Jul 2006 06:24:42 +0000 (06:24 +0000)
committer yujian <yujian>
Thu, 20 Jul 2006 06:24:42 +0000 (06:24 +0000)
Do not require the use of /etc/hosts on nodes

lustre/scripts/lc_net.sh.in

index e6608b4..e4f150c 100644 (file)
@@ -81,160 +81,104 @@ get_hostnames() {
        return 0
 }
 
-# Check whether the host name matches the name in the local /etc/hosts table
-# and whether the IP address corresponding to the host name is correct
-local_check() {
-       # Check argument
-        if [ $# -ne 2 ]; then
-                echo >&2 $"`basename $0`: local_check() error: Missing"\
-                         "argument for function local_check()!"
-                return 1
-        fi
-
-       local RET_STR REAL_NAME
-
-       # Get the corresponding IP address of the host name from /etc/hosts table
-       # of the current host 
-       HOST_IPADDRS[$2]=`egrep "[[:space:]]$1([[:space:]]|$)" /etc/hosts \
-                    | awk '{print $1}'`
-       if [ -z "${HOST_IPADDRS[$2]}" ]; then
-               echo >&2 "`basename $0`: local_check() error: $1 does not" \
-                        "exist in the local /etc/hosts table!"
+# ping_host host_name
+# Check whether host $host_name is reachable. 
+# If it is, then return the IP address of this host.
+ping_host() {
+       local host_name=$1
+       local ip_addr=
+       local ret_str
+
+       if [ -z "${host_name}" ]; then
+               echo "`basename $0`: ping_host() error: Missing hostname!"
                return 1
        fi
 
-       if [ ${#HOST_IPADDRS[$2]} -gt 15 ]; then
-               echo >&2 "`basename $0`: local_check() error: More than one" \
-                        "IP address line corresponding to $1 in the local" \
-                        "/etc/hosts table!"
+       # Run ping command
+       ret_str=`ping -c1 ${host_name} 2>&1`
+       if [ $? -ne 0 ]; then
+               if [ -n "${ret_str}" ]; then
+                       echo "`basename $0`: ping_host() error: ${ret_str}!"
+               else
+                       echo "`basename $0`: ping_host() error:"\
+                       "Host ${host_name} does not respond to ping!"
+               fi
                return 1
        fi
 
-       # Execute remote command to get the real host name
-       RET_STR=`${REMOTE} ${HOST_IPADDRS[$2]} hostname 2>&1`
-       if [ $? -ne 0 -a -n "${RET_STR}" ]; then
-               echo >&2 "`basename $0`: local_check() error: remote error:" \
-                        "${RET_STR}"
-               return 1
-       fi
+       # Get the IP address
+       ip_addr=`echo "${ret_str}" | head -1 | awk '{print $3}' | \
+               sed -e 's/^(//' -e 's/)$//'`
 
-       if [ -z "${RET_STR}" ]; then
-               echo >&2 "`basename $0`: local_check() error: remote error: No"\
-                        "results from remote! Check the network connectivity"\
-                        "between the local host and ${HOST_IPADDRS[$2]}!"
-               return 1
-       fi
+       echo "${ip_addr}"
+       return 0
+}
 
-       if is_pdsh; then
-               REAL_NAME=`echo ${RET_STR} | awk '{print $2}'`
-       else
-               REAL_NAME=`echo ${RET_STR} | awk '{print $1}'`
-       fi
+# local_check index
+# Check the network connectivity between local host and ${HOST_NAMES[index]}.
+local_check() {
+       declare -i i=$1
 
-       if [ "$1" != "${REAL_NAME}" ]; then
-               echo >&2 "`basename $0`: local_check() error: The real hostname"\
-                        "of ${HOST_IPADDRS[$2]} is \"${REAL_NAME}\","\
-                        "not \"$1\"! Check the local /etc/hosts table!"
+       # Check whether ${HOST_NAMES[i]} is reachable
+       # and get the IP address of this host from ping
+       HOST_IPADDRS[i]=$(ping_host ${HOST_NAMES[i]})
+       if [ $? -ne 0 ]; then
+               echo >&2 "${HOST_IPADDRS[i]}"
                return 1
        fi
 
        return 0
 }
 
-# Check whether the correct host name and IP address pair matches 
-# the one in the remote /etc/hosts tables
+# remote_check index
+# Check whether ${HOST_NAMES[index]} can resolve its own name and whether
+# this host agrees with the local host about what its name is resolved to.
 remote_check() {
-       # Check argument
-        if [ $# -ne 2 ]; then
-                echo >&2 $"`basename $0`: remote_check() error: Missing"\
-                         "argument for function remote_check()!"
-                return 1
-        fi
-
-       declare -i i
-       local RET_STR COMMAND IP_ADDR
-
-       COMMAND=$"egrep \"[[:space:]]$1([[:space:]]|$)\" /etc/hosts"
+       declare -i i=$1
+       local cmd ret_str
+       local ip_addr=          # the IP address got from remote ping
+
+       # Execute remote command to check whether ${HOST_NAMES[i]}
+       # can resolve its own name
+       cmd="ping -c1 ${HOST_NAMES[i]} 2>&1"
+       ret_str=`${REMOTE} ${HOST_NAMES[i]} "${cmd}" 2>&1`
+       if [ $? -ne 0 -a -n "${ret_str}" ]; then
+               echo >&2 "`basename $0`: remote_check() error:"\
+               "remote to ${HOST_NAMES[i]} error: ${ret_str}!"
+               return 1
+       fi
 
-       # Execute remote command to check remote /etc/hosts tables
-       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
-               RET_STR=`${REMOTE} ${HOST_NAMES[i]} ${COMMAND} 2>&1`
-               if [ $? -ne 0 -a -n "${RET_STR}" ]; then
-                       echo >&2 "`basename $0`: remote_check() error:"\
-                                "remote ${HOST_NAMES[i]} error: ${RET_STR}"
-                       return 1
-               fi
+       if [ -z "${ret_str}" ]; then
+               echo >&2 "`basename $0`: remote_check() error:"\
+               "No results from ${HOST_NAMES[i]}! Check the network"\
+               "connectivity between local host and ${HOST_NAMES[i]}!"
+               return 1
+       fi
 
-               if is_pdsh; then
-                       IP_ADDR=`echo ${RET_STR} | awk '{print $2}'`
-               else
-                       IP_ADDR=`echo ${RET_STR} | awk '{print $1}'`
-               fi
-               if [ -z "${IP_ADDR}" ]; then
-                       echo >&2 "`basename $0`: remote_check() error:" \
-                                "$1 does not exist in the ${HOST_NAMES[i]}'s"\
-                                "/etc/hosts table!"
-                       return 1
-               fi
+       # Get the IP address of ${HOST_NAMES[i]} from its own ping
+       if is_pdsh; then
+               ip_addr=`echo "${ret_str}" | head -1 | awk '{print $4}'`
+       else
+               ip_addr=`echo "${ret_str}" | head -1 | awk '{print $3}'`
+       fi
+       ip_addr=`echo "${ip_addr}" | sed -e 's/^(//' -e 's/)$//'`
+
+       # Compare IP addresses
+       # Check whether ${HOST_NAMES[i]} agrees with the local host
+       # about what its name is resolved to.
+       if [ "${ip_addr}" != "${HOST_IPADDRS[i]}" ]; then
+               echo >&2 "`basename $0`: remote_check() error:"\
+               "Local host resolves ${HOST_NAMES[i]} to IP address"\
+               "\"${HOST_IPADDRS[i]}\", while its own resolution is"\
+               "\"${ip_addr}\". They are not the same!"
+               return 1
+       fi
        
-               if [ "${IP_ADDR}" != "${HOST_IPADDRS[$2]}" ]; then
-                       echo >&2 "`basename $0`: remote_check() error:" \
-                                "IP address ${IP_ADDR} of $1 in the" \
-                                "${HOST_NAMES[i]}'s /etc/hosts is incorrect!"
-                       return 1
-               fi
-       done
-
-       return 0
-}
-
-# Verify forward and reverse network connectivity of the Lustre cluster
-network_check() {
-       # Check argument
-        if [ $# -eq 0 ]; then
-                echo >&2 $"`basename $0`: network_check() error: Missing"\
-                         "argument for function network_check()!"
-                return 1
-        fi
-
-       declare -i i
-       local RET_STR COMMAND REAL_NAME
-
-       # Execute remote command to check network connectivity
-       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
-               COMMAND=$"${REMOTE} ${HOST_NAMES[i]} hostname"
-               RET_STR=`${REMOTE} $1 ${COMMAND} 2>&1`
-               if [ $? -ne 0 -a -n "${RET_STR}" ]; then
-                       echo >&2 "`basename $0`: network_check() error:" \
-                                "remote error: ${RET_STR}"
-                       return 1
-               fi
-
-               if [ -z "${RET_STR}" ]; then
-                       echo >&2 "`basename $0`: network_check() error:" \
-                                "No results from remote! Check the network" \
-                                "connectivity between \"$1\" and" \
-                                "\"${HOST_NAMES[i]}\"!"
-                       return 1
-               fi
-
-               if is_pdsh; then
-                       REAL_NAME=`echo ${RET_STR} | awk '{print $3}'`
-               else
-                       REAL_NAME=`echo ${RET_STR} | awk '{print $1}'`
-               fi
-               if [ "${HOST_NAMES[i]}" != "${REAL_NAME}" ]; then
-                       echo >&2 "`basename $0`: network_check() error:" \
-                                "${RET_STR}"
-                       return 1
-               fi
-       done
-
        return 0
 }
 
-# Verify forward and reverse network connectivity of the Lustre cluster,
-# and that hostnames match the names in the /etc/hosts tables.
+# network_verify
+# Verify name resolution and network connectivity of the Lustre cluster
 network_verify() {
        declare -i i
 
@@ -242,39 +186,17 @@ network_verify() {
        unset HOST_IPADDRS
 
        # Get all the host names from the csv file
-       if ! get_hostnames; then
-               return 1
-       fi
-
-       # Check whether all the host names match the names in 
-       # all the /etc/hosts tables of the Lustre cluster
-       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
-               verbose_output "Verifying IP address of host" \
-                              "\"${HOST_NAMES[i]}\" in the local /etc/hosts..."
-               if ! local_check ${HOST_NAMES[i]} $i; then
-                       return 1
-               fi
-               verbose_output "OK"
-       done
+       ! get_hostnames && return 1
 
+       # Check the network connectivity between local host 
+       # and other cluster nodes
        for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
                [ "${HOST_NAMES[i]}" = "`hostname`" ] && continue
-               verbose_output "Verifying IP address of host" \
-                              "\"${HOST_NAMES[i]}\" in the remote /etc/hosts..."
-               if ! remote_check ${HOST_NAMES[i]} $i; then
-                       return 1
-               fi
-               verbose_output "OK"
-       done
 
-       # Verify network connectivity of the Lustre cluster
-       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
-               [ "${HOST_NAMES[i]}" = "`hostname`" ] && continue
-               verbose_output "Verifying network connectivity of host" \
-                              "\"${HOST_NAMES[i]}\" to other hosts..."
-               if ! network_check ${HOST_NAMES[i]}; then
-                       return 1
-               fi
+               verbose_output "Verifying network connectivity between"\
+                              "\"`hostname`\" and \"${HOST_NAMES[i]}\"..."
+               ! local_check $i && return 1
+               ! remote_check $i && return 1
                verbose_output "OK"
        done
 
@@ -286,6 +208,7 @@ if ! check_file ${CSV_FILE}; then
        exit 1  
 fi
 
+# Cluster network verification
 if ! network_verify; then
        exit 1  
 fi