return 0
}
-# Check whether the host name matches the name in the local /etc/hosts table
-# and whether the IP address corresponding to the host name is correct
-local_check() {
- # Check argument
- if [ $# -ne 2 ]; then
- echo >&2 $"`basename $0`: local_check() error: Missing"\
- "argument for function local_check()!"
- return 1
- fi
-
- local RET_STR REAL_NAME
-
- # Get the corresponding IP address of the host name from /etc/hosts table
- # of the current host
- HOST_IPADDRS[$2]=`egrep "[[:space:]]$1([[:space:]]|$)" /etc/hosts \
- | awk '{print $1}'`
- if [ -z "${HOST_IPADDRS[$2]}" ]; then
- echo >&2 "`basename $0`: local_check() error: $1 does not" \
- "exist in the local /etc/hosts table!"
+# ping_host host_name
+# Check whether host $host_name is reachable (one ping). Print its IP address
+# to stdout and return 0 if it is; print an error message and return 1 if not.
+ping_host() {
+ local host_name=$1
+ local ip_addr=
+ local ret_str
+
+ if [ -z "${host_name}" ]; then
+ echo "`basename $0`: ping_host() error: Missing hostname!"
return 1
fi
- if [ ${#HOST_IPADDRS[$2]} -gt 15 ]; then
- echo >&2 "`basename $0`: local_check() error: More than one" \
- "IP address line corresponding to $1 in the local" \
- "/etc/hosts table!"
+ # Run ping command
+ ret_str=`ping -c1 ${host_name} 2>&1`
+ if [ $? -ne 0 ]; then
+ if [ -n "${ret_str}" ]; then
+ echo "`basename $0`: ping_host() error: ${ret_str}!"
+ else
+ echo "`basename $0`: ping_host() error:"\
+ "Host ${host_name} does not respond to ping!"
+ fi
return 1
fi
- # Execute remote command to get the real host name
- RET_STR=`${REMOTE} ${HOST_IPADDRS[$2]} hostname 2>&1`
- if [ $? -ne 0 -a -n "${RET_STR}" ]; then
- echo >&2 "`basename $0`: local_check() error: remote error:" \
- "${RET_STR}"
- return 1
- fi
+ # Get the IP address
+ ip_addr=`echo "${ret_str}" | head -1 | awk '{print $3}' | \
+ sed -e 's/^(//' -e 's/)$//'`
- if [ -z "${RET_STR}" ]; then
- echo >&2 "`basename $0`: local_check() error: remote error: No"\
- "results from remote! Check the network connectivity"\
- "between the local host and ${HOST_IPADDRS[$2]}!"
- return 1
- fi
+ echo "${ip_addr}"
+ return 0
+}
- if is_pdsh; then
- REAL_NAME=`echo ${RET_STR} | awk '{print $2}'`
- else
- REAL_NAME=`echo ${RET_STR} | awk '{print $1}'`
- fi
+# local_check index
+# Check the network connectivity between local host and ${HOST_NAMES[index]},
+# and on success record that host's IP address in HOST_IPADDRS[index].
+local_check() {
+ declare -i i=$1
- if [ "$1" != "${REAL_NAME}" ]; then
- echo >&2 "`basename $0`: local_check() error: The real hostname"\
- "of ${HOST_IPADDRS[$2]} is \"${REAL_NAME}\","\
- "not \"$1\"! Check the local /etc/hosts table!"
+ # Check whether ${HOST_NAMES[i]} is reachable
+ # and get the IP address of this host from ping
+ HOST_IPADDRS[i]=$(ping_host ${HOST_NAMES[i]})
+ if [ $? -ne 0 ]; then
+ echo >&2 "${HOST_IPADDRS[i]}"
return 1
fi
return 0
}
-# Check whether the correct host name and IP address pair matches
-# the one in the remote /etc/hosts tables
+# remote_check index
+# Check whether ${HOST_NAMES[index]} can resolve its own name and whether
+# it and the local host agree on the IP address that name resolves to.
remote_check() {
- # Check argument
- if [ $# -ne 2 ]; then
- echo >&2 $"`basename $0`: remote_check() error: Missing"\
- "argument for function remote_check()!"
- return 1
- fi
-
- declare -i i
- local RET_STR COMMAND IP_ADDR
-
- COMMAND=$"egrep \"[[:space:]]$1([[:space:]]|$)\" /etc/hosts"
+ declare -i i=$1
+ local cmd ret_str
+ local ip_addr= # the IP address got from remote ping
+
+ # Execute remote command to check whether ${HOST_NAMES[i]}
+ # can resolve its own name
+ cmd="ping -c1 ${HOST_NAMES[i]} 2>&1"
+ ret_str=`${REMOTE} ${HOST_NAMES[i]} "${cmd}" 2>&1`
+ if [ $? -ne 0 -a -n "${ret_str}" ]; then
+ echo >&2 "`basename $0`: remote_check() error:"\
+ "remote to ${HOST_NAMES[i]} error: ${ret_str}!"
+ return 1
+ fi
- # Execute remote command to check remote /etc/hosts tables
- for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
- RET_STR=`${REMOTE} ${HOST_NAMES[i]} ${COMMAND} 2>&1`
- if [ $? -ne 0 -a -n "${RET_STR}" ]; then
- echo >&2 "`basename $0`: remote_check() error:"\
- "remote ${HOST_NAMES[i]} error: ${RET_STR}"
- return 1
- fi
+ if [ -z "${ret_str}" ]; then
+ echo >&2 "`basename $0`: remote_check() error:"\
+ "No results from ${HOST_NAMES[i]}! Check the network"\
+ "connectivity between local host and ${HOST_NAMES[i]}!"
+ return 1
+ fi
- if is_pdsh; then
- IP_ADDR=`echo ${RET_STR} | awk '{print $2}'`
- else
- IP_ADDR=`echo ${RET_STR} | awk '{print $1}'`
- fi
- if [ -z "${IP_ADDR}" ]; then
- echo >&2 "`basename $0`: remote_check() error:" \
- "$1 does not exist in the ${HOST_NAMES[i]}'s"\
- "/etc/hosts table!"
- return 1
- fi
+ # Get the IP address of ${HOST_NAMES[i]} from its own ping
+ if is_pdsh; then
+ ip_addr=`echo "${ret_str}" | head -1 | awk '{print $4}'`
+ else
+ ip_addr=`echo "${ret_str}" | head -1 | awk '{print $3}'`
+ fi
+ ip_addr=`echo "${ip_addr}" | sed -e 's/^(//' -e 's/)$//'`
+
+ # Compare IP addresses
+ # Check whether ${HOST_NAMES[i]} agrees with the local host
+ # about what its name is resolved to.
+ if [ "${ip_addr}" != "${HOST_IPADDRS[i]}" ]; then
+ echo >&2 "`basename $0`: remote_check() error:"\
+ "Local host resolves ${HOST_NAMES[i]} to IP address"\
+ "\"${HOST_IPADDRS[i]}\", while its own resolution is"\
+ "\"${ip_addr}\". They are not the same!"
+ return 1
+ fi
- if [ "${IP_ADDR}" != "${HOST_IPADDRS[$2]}" ]; then
- echo >&2 "`basename $0`: remote_check() error:" \
- "IP address ${IP_ADDR} of $1 in the" \
- "${HOST_NAMES[i]}'s /etc/hosts is incorrect!"
- return 1
- fi
- done
-
- return 0
-}
-
-# Verify forward and reverse network connectivity of the Lustre cluster
-network_check() {
- # Check argument
- if [ $# -eq 0 ]; then
- echo >&2 $"`basename $0`: network_check() error: Missing"\
- "argument for function network_check()!"
- return 1
- fi
-
- declare -i i
- local RET_STR COMMAND REAL_NAME
-
- # Execute remote command to check network connectivity
- for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
- COMMAND=$"${REMOTE} ${HOST_NAMES[i]} hostname"
- RET_STR=`${REMOTE} $1 ${COMMAND} 2>&1`
- if [ $? -ne 0 -a -n "${RET_STR}" ]; then
- echo >&2 "`basename $0`: network_check() error:" \
- "remote error: ${RET_STR}"
- return 1
- fi
-
- if [ -z "${RET_STR}" ]; then
- echo >&2 "`basename $0`: network_check() error:" \
- "No results from remote! Check the network" \
- "connectivity between \"$1\" and" \
- "\"${HOST_NAMES[i]}\"!"
- return 1
- fi
-
- if is_pdsh; then
- REAL_NAME=`echo ${RET_STR} | awk '{print $3}'`
- else
- REAL_NAME=`echo ${RET_STR} | awk '{print $1}'`
- fi
- if [ "${HOST_NAMES[i]}" != "${REAL_NAME}" ]; then
- echo >&2 "`basename $0`: network_check() error:" \
- "${RET_STR}"
- return 1
- fi
- done
-
return 0
}
-# Verify forward and reverse network connectivity of the Lustre cluster,
-# and that hostnames match the names in the /etc/hosts tables.
+# network_verify
+# Verify name resolution and network connectivity of the Lustre cluster
network_verify() {
declare -i i
unset HOST_IPADDRS
# Get all the host names from the csv file
- if ! get_hostnames; then
- return 1
- fi
-
- # Check whether all the host names match the names in
- # all the /etc/hosts tables of the Lustre cluster
- for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
- verbose_output "Verifying IP address of host" \
- "\"${HOST_NAMES[i]}\" in the local /etc/hosts..."
- if ! local_check ${HOST_NAMES[i]} $i; then
- return 1
- fi
- verbose_output "OK"
- done
+ ! get_hostnames && return 1
+ # Check the network connectivity between local host
+ # and other cluster nodes
for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
[ "${HOST_NAMES[i]}" = "`hostname`" ] && continue
- verbose_output "Verifying IP address of host" \
- "\"${HOST_NAMES[i]}\" in the remote /etc/hosts..."
- if ! remote_check ${HOST_NAMES[i]} $i; then
- return 1
- fi
- verbose_output "OK"
- done
- # Verify network connectivity of the Lustre cluster
- for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
- [ "${HOST_NAMES[i]}" = "`hostname`" ] && continue
- verbose_output "Verifying network connectivity of host" \
- "\"${HOST_NAMES[i]}\" to other hosts..."
- if ! network_check ${HOST_NAMES[i]}; then
- return 1
- fi
+ verbose_output "Verifying network connectivity between"\
+ "\"`hostname`\" and \"${HOST_NAMES[i]}\"..."
+ ! local_check $i && return 1
+ ! remote_check $i && return 1
verbose_output "OK"
done
exit 1
fi
+# Cluster network verification
if ! network_verify; then
exit 1
fi