Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / scripts / lc_hb.in
diff --git a/lustre/scripts/lc_hb.in b/lustre/scripts/lc_hb.in
new file mode 100644 (file)
index 0000000..0fa1fb0
--- /dev/null
@@ -0,0 +1,644 @@
+#!/bin/bash
+#
+# lc_hb - script for generating the Heartbeat HA software's
+#         configuration files
+#
+###############################################################################
+
# usage
#
# Print the command-line usage summary to stderr and exit with status 1.
# Called on any argument-validation failure.
usage() {
	# Note: "delimeter" typos in the original help text fixed here.
	cat >&2 <<EOF

Usage: $(basename "$0")   <-r HBver> <-n hostnames> [-v]
                       <-d target device> [-d target device...]

	-r HBver		the version of Heartbeat software
				The Heartbeat software versions which are curr-
				ently supported are: hbv1 (Heartbeat version 1)
				and hbv2 (Heartbeat version 2).
	-n hostnames		the nodenames of the primary node and its fail-
				overs
				Multiple nodenames are separated by colon (:)
				delimiter. The first one is the nodename of the
				primary node, the others are failover nodenames.
	-v			verbose mode
	-d target device	the target device name and mount point
				The device name and mount point are separated by
				colon (:) delimiter.

EOF
	exit 1
}
+
+# Get the library of functions
+. @scriptlibdir@/lc_common
+
+#****************************** Global variables ******************************#
+# Heartbeat tools
+HB_TOOLS_PATH=${HB_TOOLS_PATH:-"/usr/lib64/heartbeat"} # Heartbeat tools path
+CIB_GEN_SCRIPT=${HB_TOOLS_PATH}/haresources2cib.py
+CL_STATUS=${CL_STATUS:-"/usr/bin/cl_status"}
+
+# Service directories and names
+HARES_DIR=${HARES_DIR:-"${HA_DIR}/resource.d"}         # Heartbeat resources
+LUSTRE_SRV=${LUSTRE_SRV:-"Filesystem"} # Service script provided by Heartbeat
+
+TMP_DIR=${HB_TMP_DIR}                  # Temporary directory
+HACF_TEMP=${TMP_DIR}/ha.cf.temp
+AUTHKEYS_TEMP=${TMP_DIR}/authkeys${FILE_SUFFIX}
+
+declare -a NODE_NAMES                  # Node names in the failover group
+
+# Lustre target device names, service names and mount points
+declare -a TARGET_DEVNAMES TARGET_SRVNAMES TARGET_MNTPNTS
+declare -i TARGET_NUM=0                        # Number of targets
+
+
# Get and check the positional parameters
VERBOSE_OUTPUT=false
while getopts "r:n:vd:" OPTION; do
	case $OPTION in
	r)
		# -r: Heartbeat version tag; must match one of the two
		# supported values (HBVER_HBV1/HBVER_HBV2 from lc_common)
		HBVER_OPT=$OPTARG
		if [ "${HBVER_OPT}" != "${HBVER_HBV1}" ] \
		&& [ "${HBVER_OPT}" != "${HBVER_HBV2}" ]; then
			echo >&2 $"`basename $0`: Invalid Heartbeat software" \
				  "version - ${HBVER_OPT}!"
			usage
		fi
		;;
	n)
		# -n: colon-separated nodenames; the first is the primary
		# node, the rest are its failover nodes
		HOSTNAME_OPT=$OPTARG
		PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
		if [ -z "${PRIM_NODENAME}" ]; then
			echo >&2 $"`basename $0`: Missing primary nodename!"
			usage
		fi
		HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'`
		if [ ${HOSTNAME_NUM} -lt 2 ]; then
			echo >&2 $"`basename $0`: Missing failover nodenames!"
			usage
		fi
		# Heartbeat v1 only supports two-node failover groups
		# (relies on -r having been parsed before -n)
		if [ "${HBVER_OPT}" = "${HBVER_HBV1}" -a ${HOSTNAME_NUM} -gt 2 ]
		then
			echo >&2 $"`basename $0`: Heartbeat version 1 can" \
				  "only support 2 nodes!"
			usage
		fi
		;;
	v)
		VERBOSE_OUTPUT=true
		;;
	d)
		# -d: "<device>:<mountpoint>" pair; may be repeated,
		# each occurrence appends one target
		DEVICE_OPT=$OPTARG
		TARGET_DEVNAMES[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $1}'`
		TARGET_MNTPNTS[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $2}'`
		if [ -z "${TARGET_DEVNAMES[TARGET_NUM]}" ]; then
			echo >&2 $"`basename $0`: Missing target device name!"
			usage
		fi
		if [ -z "${TARGET_MNTPNTS[TARGET_NUM]}" ]; then
			echo >&2 $"`basename $0`: Missing mount point for target"\
				  "${TARGET_DEVNAMES[TARGET_NUM]}!"
			usage
		fi
		TARGET_NUM=$(( TARGET_NUM + 1 ))
		;;
	?)
		usage
	esac
done

# Check the required parameters
if [ -z "${HBVER_OPT}" ]; then
	echo >&2 $"`basename $0`: Missing -r option!"
	usage
fi

if [ -z "${HOSTNAME_OPT}" ]; then
	echo >&2 $"`basename $0`: Missing -n option!"
	usage
fi

if [ -z "${DEVICE_OPT}" ]; then
	echo >&2 $"`basename $0`: Missing -d option!"
	usage
fi
+
# get_nodenames
#
# Split the colon-separated -n argument (global HOSTNAME_OPT) into the
# global NODE_NAMES array.  NODE_NAMES[0] is the primary node; the rest
# are its failover nodes.  Always returns 0.
#
# The previous implementation piped HOSTNAME_OPT through awk and
# iterated the split results with "for (i in a)", whose iteration order
# is unspecified by awk — the primary node was not guaranteed to stay
# first.  IFS-based splitting preserves the command-line order.
get_nodenames() {
	IFS=: read -r -a NODE_NAMES <<< "${HOSTNAME_OPT}"

	return 0
}
+
# check_remote_file host_name file
#
# Use the ${REMOTE} transport to verify that @file exists on @host_name.
# Returns 0 when the file exists, 1 on a missing argument or when the
# remote test fails.
check_remote_file() {
	local target_host=$1
	local target_file=$2

	if [ -z "${target_host}" ]; then
		echo >&2 "`basename $0`: check_remote_file() error:"\
			 "Missing hostname!"
		return 1
	fi

	if [ -z "${target_file}" ]; then
		echo >&2 "`basename $0`: check_remote_file() error:"\
			 "Missing file name!"
		return 1
	fi

	# Probe the file with a remote test(1) invocation
	if ! ${REMOTE} ${target_host} "[ -e ${target_file} ]"; then
		echo >&2 "`basename $0`: check_remote_file() error:"\
		"${target_file} does not exist in host ${target_host}!"
		return 1
	fi

	return 0
}
+
# hb_running host_name
#
# Query the Heartbeat daemon status on @host_name with cl_status.
# Returns 0 if heartbeat is running, 1 if cl_status reports it stopped
# (output contains "stop"), 2 when the remote command fails for any
# other reason.
hb_running() {
	local node=$1
	local status_out

	status_out=`${REMOTE} ${node} "${CL_STATUS} hbstatus" 2>&1`
	[ $? -eq 0 ] && return 0

	case "${status_out}" in
	*stop*)
		# Daemon is simply not running
		return 1
		;;
	*)
		# Remote invocation itself failed
		echo >&2 "`basename $0`: hb_running() error:"\
		"remote command to ${node} error: ${status_out}!"
		return 2
		;;
	esac
}
+
# stop_heartbeat host_name
#
# Stop the Heartbeat service on @host_name through its init script.
# Returns 0 on success (announcing the stop on stdout), 1 when the
# remote command fails.
stop_heartbeat() {
	local node=$1
	local cmd_out

	if ! cmd_out=`${REMOTE} ${node} "/sbin/service heartbeat stop" 2>&1`
	then
		echo >&2 "`basename $0`: stop_heartbeat() error:"\
		"remote command to ${node} error: ${cmd_out}!"
		return 1
	fi

	echo "`basename $0`: Heartbeat service is stopped on node ${node}."
	return 0
}
+
# check_heartbeat
#
# Verify every node in the failover group before generating new
# configuration: the Heartbeat (and, per version, mon or crm)
# configuration directories must exist remotely, and any running
# heartbeat service is stopped after operator confirmation.
# Returns 0 on success, 1 on error, 2 if the operator declines.
check_heartbeat() {
	declare -i idx
	local OK

	for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
		# Check Heartbeat configuration directory
		if ! check_remote_file ${NODE_NAMES[idx]} ${HA_DIR}; then
			echo >&2 "`basename $0`: check_heartbeat() error:"\
			"Is Heartbeat package installed?"
			return 1
		fi

		if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
			# Check mon configuration directory
			# (v1 uses the external mon package for monitoring)
			if ! check_remote_file ${NODE_NAMES[idx]} ${MON_DIR}; then
				echo >&2 "`basename $0`: check_heartbeat()"\
				"error: Is mon package installed?"
				return 1
			fi
		fi

		if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
			# Check crm directory (v2 keeps cib.xml there)
			if ! check_remote_file ${NODE_NAMES[idx]} ${CIB_DIR}; then
				echo >&2 "`basename $0`: check_heartbeat()"\
				"error: Is Heartbeat v2 package installed?"
				return 1
			fi
		fi

		# Check heartbeat service status:
		# 0 - running, 1 - stopped, 2 - remote command error
		hb_running ${NODE_NAMES[idx]}
		rc=$?	# NOTE(review): rc is not declared local here
		if [ "$rc" -eq "2" ]; then
			return 1
		elif [ "$rc" -eq "1" ]; then
			verbose_output "Heartbeat service is stopped on"\
			"node ${NODE_NAMES[idx]}."
		elif [ "$rc" -eq "0" ]; then
			# Service is live: ask before stopping it
			OK=
			echo -n "`basename $0`: Heartbeat service is running on"\
			"${NODE_NAMES[idx]}, go ahead to stop the service and"\
			"generate new configurations? [y/n]:"
			read OK
			if [ "${OK}" = "n" ]; then
				echo "`basename $0`: New Heartbeat configurations"\
				"are not generated."
				return 2
			fi

			# Stop heartbeat service
			stop_heartbeat ${NODE_NAMES[idx]}
		fi
	done

	return 0
}
+
# get_srvname hostname target_devname
#
# Print to stdout the Lustre service name of @target_devname as
# reported by tunefs.lustre on @hostname.  On failure the error message
# is printed to stdout instead (the caller captures stdout and relays
# it to stderr) and 1 is returned.
get_srvname() {
	local node=$1
	local dev=$2
	local srv_name=
	local tunefs_out

	# Ask the remote tunefs.lustre for the "Target:" line
	tunefs_out=`${REMOTE} ${node} \
		"${TUNEFS} --print --verbose ${dev} | grep Target:" 2>&1`
	if [ $? -ne 0 ]; then
		echo "`basename $0`: get_srvname() error:" \
		     "from host ${node} - ${tunefs_out}"
		return 1
	fi

	# Strip everything through "Target: " and keep the first word
	case "${tunefs_out}" in
	*"Target: "*)
		srv_name=`echo ${tunefs_out#*Target: } | awk '{print $1}'`
		;;
	esac

	if [ -z "${srv_name}" ]; then
		echo "`basename $0`: get_srvname() error: Cannot get the"\
		     "server name of target ${dev} in ${node}!"
		return 1
	fi

	echo ${srv_name}
	return 0
}
+
# get_srvnames
#
# Populate the global TARGET_SRVNAMES array with the service name of
# every device in TARGET_DEVNAMES, querying the primary node.  Returns
# 1 on the first lookup failure after relaying the captured error text
# to stderr.
get_srvnames() {
	declare -i n

	# Start from a clean array
	unset TARGET_SRVNAMES

	for ((n = 0; n < ${#TARGET_DEVNAMES[@]}; n++)); do
		if ! TARGET_SRVNAMES[n]=$(get_srvname ${PRIM_NODENAME} \
					  ${TARGET_DEVNAMES[n]}); then
			# On failure the captured text is the error message
			echo >&2 "${TARGET_SRVNAMES[n]}"
			return 1
		fi
	done

	return 0
}
+
# create_template
#
# Write the version-specific ha.cf template and, if not already
# present, the shared authkeys template into the temporary directory.
# Always returns 0.
create_template() {
	/bin/mkdir -p ${TMP_DIR}

	# The ha.cf template differs between Heartbeat v1 and v2
	case "${HBVER_OPT}" in
	"${HBVER_HBV1}")
		cat >${HACF_TEMP} <<EOF
debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility     local0
keepalive 2
deadtime 30
initdead 120

auto_failback off

EOF
		;;
	"${HBVER_HBV2}")
		cat >${HACF_TEMP} <<EOF
use_logd        yes
keepalive 1
deadtime 10
initdead 60

crm yes

EOF
		;;
	esac

	# The authkeys template is version-independent; create it once
	if [ ! -s ${AUTHKEYS_TEMP} ]; then
		cat >${AUTHKEYS_TEMP} <<EOF
auth 1
1 sha1 HelloLustre!
EOF
	fi

	return 0
}
+
# create_hacf
#
# Build the ha.cf file (template plus one "node" line per member of the
# failover group) and scp it into ${HA_DIR} on every node.  Skipped
# entirely when a per-primary-node copy already exists in the
# temporary directory.  Returns 0 on success, 1 on an scp failure.
create_hacf() {
	HACF_PRIMNODE=${TMP_DIR}$"/ha.cf."${PRIM_NODENAME}
	HACF_LUSTRE=${TMP_DIR}$"/ha.cf"${FILE_SUFFIX}

	declare -i i

	if [ -e ${HACF_PRIMNODE} ]; then
		# Already generated for this primary node
		verbose_output "${HACF_PRIMNODE} already exists."
		return 0
	fi

	/bin/cp -f ${HACF_TEMP} ${HACF_LUSTRE}

	# Append the membership list to the template
	for ((i = 0; i < ${#NODE_NAMES[@]}; i++)); do
		echo "node    ${NODE_NAMES[i]}" >> ${HACF_LUSTRE}
	done

	# Distribute the finished ha.cf to every node, leaving a marker
	# copy in the temporary directory
	for ((i = 0; i < ${#NODE_NAMES[@]}; i++)); do
		touch ${TMP_DIR}$"/ha.cf."${NODE_NAMES[i]}
		if ! scp ${HACF_LUSTRE} ${NODE_NAMES[i]}:${HA_DIR}/; then
			echo >&2 "`basename $0`: Failed to scp ha.cf file"\
				 "to node ${NODE_NAMES[i]}!"
			return 1
		fi
	done

	return 0
}
+
# create_haresources
#
# Build the haresources file (a single resource-group line: primary
# node followed by one Filesystem resource per target, plus a
# "<service>-mon" resource per target on Heartbeat v1) and scp it to
# ${HA_DIR} on every node.  For Heartbeat v2 the file is additionally
# converted to cib.xml and copied to ${CIB_DIR}.
# Returns 0 on success or skip, 1 on a generation/scp failure.
create_haresources() {
	HARES_PRIMNODE=${TMP_DIR}$"/haresources."${PRIM_NODENAME}
	HARES_LUSTRE=${TMP_DIR}$"/haresources"${FILE_SUFFIX}
	declare -i idx
	local res_line

	if [ -s ${HARES_PRIMNODE} ]; then
		# The haresources file for the primary node has already existed;
		# skip regeneration if it already mentions the first target
		if [ -n "`/bin/grep ${TARGET_DEVNAMES[0]} ${HARES_PRIMNODE}`" ]; then
			verbose_output "${HARES_PRIMNODE} already exists."
			return 0
		fi
	fi

	# Add the resource group line into the haresources file
	# (NOTE(review): ">>" appends, so lines from a previous run that
	# left the same temporary file behind are kept — confirm intended)
	res_line=${PRIM_NODENAME}
	for ((idx = 0; idx < ${#TARGET_DEVNAMES[@]}; idx++)); do
		res_line=${res_line}" "${LUSTRE_SRV}::${TARGET_DEVNAMES[idx]}::${TARGET_MNTPNTS[idx]}::${FS_TYPE}

		if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
			# v1 adds a per-target mon resource
			res_line=${res_line}" "${TARGET_SRVNAMES[idx]}"-mon"
		fi
	done
	echo "${res_line}" >> ${HARES_LUSTRE}

	# Generate the cib.xml file
	if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
		# Add group haclient and user hacluster
		# (presumably required by the Heartbeat v2 tooling — needs
		# root privileges; verify)
		[ -z "`grep haclient /etc/group`" ] && groupadd haclient
		[ -z "`grep hacluster /etc/passwd`" ] && useradd -g haclient hacluster

		CIB_LUSTRE=${TMP_DIR}$"/cib.xml"${FILE_SUFFIX}
		python ${CIB_GEN_SCRIPT} --stdout \
		${HARES_LUSTRE} > ${CIB_LUSTRE}
		if [ $? -ne 0 ]; then
			echo >&2 "`basename $0`: Failed to generate cib.xml file"\
				 "for node ${PRIM_NODENAME}!"
			return 1
		fi
	fi

	# scp the haresources file or cib.xml file
	for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
		/bin/cp -f ${HARES_LUSTRE} ${TMP_DIR}$"/haresources."${NODE_NAMES[idx]}
		scp ${HARES_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
		if [ $? -ne 0 ]; then
			echo >&2 "`basename $0`: Failed to scp haresources file"\
				 "to node ${NODE_NAMES[idx]}!"
			return 1
		fi

		if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
			scp ${CIB_LUSTRE} ${NODE_NAMES[idx]}:${CIB_DIR}/
			if [ $? -ne 0 ]; then
				echo >&2 "`basename $0`: Failed to scp cib.xml"\
					 "file to node ${NODE_NAMES[idx]}!"
				return 1
			fi
		fi
	done

	return 0
}
+
# create_authkeys
#
# Distribute the shared authkeys template to ${HA_DIR} on every node,
# preserving its mode with "scp -p".  Skipped when a per-primary-node
# copy already exists in the temporary directory.  Returns 0 on success
# or skip, 1 on an scp failure.
create_authkeys() {
	AUTHKEYS_PRIMNODE=${TMP_DIR}$"/authkeys."${PRIM_NODENAME}
	declare -i i

	if [ -e ${AUTHKEYS_PRIMNODE} ]; then
		verbose_output "${AUTHKEYS_PRIMNODE} already exists."
		return 0
	fi

	# Keep the authkeys file private (mode 600) before copying
	chmod 600 ${AUTHKEYS_TEMP}
	for ((i = 0; i < ${#NODE_NAMES[@]}; i++)); do
		touch ${TMP_DIR}$"/authkeys."${NODE_NAMES[i]}
		if ! scp -p ${AUTHKEYS_TEMP} ${NODE_NAMES[i]}:${HA_DIR}/; then
			echo >&2 "`basename $0`: Failed to scp authkeys file"\
				 "to node ${NODE_NAMES[i]}!"
			return 1
		fi
	done

	return 0
}
+
# create_moncf
#
# Generate the mon.cf file via ${SCRIPT_GEN_MONCF} (Heartbeat v1 only)
# and scp it to ${MON_DIR} on every node.  Service names found in a
# previous mon.cf of the primary node are carried over with -o so the
# new file still monitors them.
# Returns 0 on success or skip, 1 on a generation/scp failure.
create_moncf() {
	MONCF_PRIMNODE=${TMP_DIR}$"/mon.cf."${PRIM_NODENAME}
	MONCF_LUSTRE=${TMP_DIR}$"/mon.cf"${FILE_SUFFIX}
	local srv_name params=
	declare -i idx
	declare -a OLD_TARGET_SRVNAMES		# targets in other nodes
						# in this failover group
	# Initialize the OLD_TARGET_SRVNAMES array
	unset OLD_TARGET_SRVNAMES

	if [ -s ${MONCF_PRIMNODE} ]; then
		if [ -n "`/bin/grep ${TARGET_SRVNAMES[0]} ${MONCF_PRIMNODE}`" ]
		then
			verbose_output "${MONCF_PRIMNODE} already exists."
			return 0
		else
			# Get the Lustre target service names
			# from the previous mon.cf file
			# ("hostgroup <name>-mon" lines; strip the -mon suffix)
			idx=0
			for srv_name in `grep hostgroup ${MONCF_PRIMNODE}\
					|awk '$2 ~ /-mon/ {print $2}'|xargs`
			do
				OLD_TARGET_SRVNAMES[idx]=`echo ${srv_name}\
							  |sed 's/-mon//g'`
				idx=$(( idx + 1 ))
			done
		fi
	fi

	# Construct the parameters to mon.cf generation script:
	# -n <node> per group member, -o <service> per (old and new) target
	for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
		params=${params}" -n "${NODE_NAMES[idx]}
	done

	for ((idx = 0; idx < ${#OLD_TARGET_SRVNAMES[@]}; idx++)); do
		params=${params}" -o "${OLD_TARGET_SRVNAMES[idx]}
	done

	for ((idx = 0; idx < ${#TARGET_SRVNAMES[@]}; idx++)); do
		params=${params}" -o "${TARGET_SRVNAMES[idx]}
	done

	${SCRIPT_GEN_MONCF} ${params}
	if [ $? -ne 0 ]; then
		echo >&2 "`basename $0`: Failed to generate mon.cf file"\
			 "by using ${SCRIPT_GEN_MONCF}!"
		return 1
	fi

	# NOTE(review): assumes the generator leaves exactly one *-mon.cfg
	# file in the current working directory — confirm
	/bin/mv *-mon.cfg ${MONCF_LUSTRE}

	# scp the mon.cf file to all the nodes
	for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
		/bin/cp -f ${MONCF_LUSTRE} ${TMP_DIR}$"/mon.cf."${NODE_NAMES[idx]}

		scp ${MONCF_LUSTRE} ${NODE_NAMES[idx]}:${MON_DIR}/
		if [ $? -ne 0 ]; then
			echo >&2 "`basename $0`: Failed to scp mon.cf file"\
				 "to node ${NODE_NAMES[idx]}!"
			return 1
		fi
	done

	return 0
}
+
# generate_config
#
# Drive generation and distribution of all Heartbeat configuration
# files: ha.cf, haresources (plus cib.xml on v2), authkeys and, for
# Heartbeat v1, mon.cf.  Returns 1 as soon as any step fails.
generate_config() {
	# Heartbeat v1 monitoring needs the Lustre service names up front
	if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
		get_srvnames || return 1
	fi

	create_template || return 1

	verbose_output "Creating and remote copying ha.cf${FILE_SUFFIX} file to"\
		       "${PRIM_NODENAME} failover group hosts..."
	create_hacf || return 1
	verbose_output "OK"

	verbose_output "Creating and remote copying haresources${FILE_SUFFIX} file"\
		       "to ${PRIM_NODENAME} failover group hosts..."
	create_haresources || return 1
	verbose_output "OK"

	verbose_output "Creating and remote copying authkeys${FILE_SUFFIX} file to" \
		       "${PRIM_NODENAME} failover group hosts..."
	create_authkeys || return 1
	verbose_output "OK"

	if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
		verbose_output "Creating and remote copying mon.cf${FILE_SUFFIX} file to" \
				"${PRIM_NODENAME} failover group hosts..."
		create_moncf || return 1
		verbose_output "OK"
	fi

	return 0
}
+
+# Main flow
+# Get all the node names
+if ! get_nodenames; then
+       exit 1
+fi
+
+# Check heartbeat services
+verbose_output "Checking heartbeat service in the ${PRIM_NODENAME}"\
+              "failover group hosts..."
+check_heartbeat
+rc=$?
+if [ "$rc" -eq "2" ]; then
+       verbose_output "OK"
+       exit 0
+elif [ "$rc" -eq "1" ]; then
+       exit 1
+fi
+verbose_output "OK"
+
+# Generate configuration files
+if ! generate_config; then
+       exit 1
+fi
+
+exit 0