From 3f9b7595730a58f8633e1238fbb9c664660033dd Mon Sep 17 00:00:00 2001 From: yujian Date: Mon, 29 May 2006 13:56:09 +0000 Subject: [PATCH] b=9853 1) use ssh instead of pdsh 2) modify to support monitoring multiple services in one failover group 3) create service symlinks from /etc/init.d/lustre for Lustre targets --- .../utils/cluster_scripts/gen_clumanager_config.sh | 331 ++++++++++++++------- 1 file changed, 231 insertions(+), 100 deletions(-) diff --git a/lustre/utils/cluster_scripts/gen_clumanager_config.sh b/lustre/utils/cluster_scripts/gen_clumanager_config.sh index 8469f7d..0cce008 100755 --- a/lustre/utils/cluster_scripts/gen_clumanager_config.sh +++ b/lustre/utils/cluster_scripts/gen_clumanager_config.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# gen_clumanager_config.sh - script for generating the Red Hat's Cluster Manager +# gen_clumanager_config.sh - script for generating the Red Hat Cluster Manager # HA software's configuration files # ################################################################################ @@ -9,70 +9,80 @@ usage() { cat >&2 < <-d target device> <-s service addresses> - [-c heartbeat channels] [-o heartbeat options] [-v] +Usage: `basename $0` <-n hostnames> <-s service addresses> + [-c heartbeat channel] [-o heartbeat options] [-v] + <-d target device> [-d target device...] -n hostnames the nodenames of the primary node and its fail- overs Multiple nodenames are separated by colon (:) delimeter. The first one is the nodename of the primary node, the others are failover nodenames. - -d target device the target device name and type - The name and type are separated by colon (:) - delimeter. The type values are: mgs, mdt, ost or - mgs_mdt. -s service addresses the IP addresses to failover Multiple addresses are separated by colon (:) delimeter. - -c heartbeat channels the methods to send/rcv heartbeats on + -c heartbeat channel the method to send/rcv heartbeats on The default method is multicast, and multicast_ ipaddress is "225.0.0.11". -o heartbeat options a "catchall" for other heartbeat configuration options + Multiple options are separated by colon (:) + delimeter. -v verbose mode + -d target device the target device name and mount point + The device name and mount point are separated by + colon (:) delimeter. EOF exit 1 } -# Global variables -SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"./"} -SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}$"verify_serviceIP.sh" +#****************************** Global variables ******************************# +# Scripts to be called +SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"."} +SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}/verify_serviceIP.sh + +# Remote command +REMOTE=${REMOTE:-"ssh -x -q"} + +# Lustre utilities path +CMD_PATH=${CMD_PATH:-"/usr/sbin"} +TUNEFS=${TUNEFS:-"$CMD_PATH/tunefs.lustre"} + +# CluManager tools +CLUMAN_TOOLS_PATH=${CLUMAN_TOOLS_PATH:-"/usr/sbin"} +CONFIG_CMD=${CONFIG_CMD:-"${CLUMAN_TOOLS_PATH}/redhat-config-cluster-cmd"} -LUSTRE_SRV_SCRIPT=$"/etc/rc.d/init.d/lustre" # service script for lustre +# Configuration directory +CLUMAN_DIR="/etc" # CluManager configuration directory -TMP_DIR=$"/tmp/clumanager/" # temporary directory -CLUMGR_DIR=$"/etc/" # CluManager configuration directory +# Service directory and name +INIT_DIR=${INIT_DIR:-"/etc/init.d"} +LUSTRE_SRV=${LUSTRE_SRV:-"${INIT_DIR}/lustre"} # service script for lustre -CONFIG_CMD=$"redhat-config-cluster-cmd" +TMP_DIR="/tmp/clumanager" # temporary directory declare -a NODE_NAMES # node names in the failover group declare -a SRV_IPADDRS # service IP addresses +# Lustre target device names, service names and mount points +declare -a TARGET_DEVNAMES TARGET_SRVNAMES TARGET_MNTPNTS +declare -i TARGET_NUM=0 # number of targets + # Get and check the positional parameters -while getopts "n:d:s:c:o:v" OPTION; do +VERBOSE_OUTPUT=false +while getopts "n:s:c:o:vd:" OPTION; do case $OPTION in n) HOSTNAME_OPT=$OPTARG - HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'` - if [ ${HOSTNAME_NUM} -lt 2 ]; then - echo >&2 $"`basename $0`: Lack failover nodenames!" - usage - fi - ;; - d) - DEVICE_OPT=$OPTARG - TARGET_DEV=`echo ${DEVICE_OPT} | awk -F":" '{print $1}'` - TARGET_TYPE=`echo ${DEVICE_OPT} | awk -F":" '{print $2}'` - if [ -z "${TARGET_TYPE}" ]; then - echo >&2 $"`basename $0`: Lack target device type!" + PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'` + if [ -z "${PRIM_NODENAME}" ]; then + echo >&2 $"`basename $0`: Missing primary nodename!" usage fi - if [ "${TARGET_TYPE}" != "mgs" ]&&[ "${TARGET_TYPE}" != "mdt" ]\ - &&[ "${TARGET_TYPE}" != "ost" ]&&[ "${TARGET_TYPE}" != "mgs_mdt" ] - then - echo >&2 $"`basename $0`: Invalid target device type" \ - "- ${TARGET_TYPE}!" + HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'` + if [ ${HOSTNAME_NUM} -lt 2 ]; then + echo >&2 $"`basename $0`: Missing failover nodenames!" usage fi ;; @@ -96,8 +106,24 @@ while getopts "n:d:s:c:o:v" OPTION; do HBOPT_OPT=`echo "${HBOPT_OPT}" | sed 's/^"//' | sed 's/"$//'` ;; v) - VERBOSE_OPT=$"yes" + VERBOSE_OUTPUT=true + ;; + d) + DEVICE_OPT=$OPTARG + TARGET_DEVNAMES[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $1}'` + TARGET_MNTPNTS[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $2}'` + if [ -z "${TARGET_DEVNAMES[TARGET_NUM]}" ]; then + echo >&2 $"`basename $0`: Missing target device name!" + usage + fi + if [ -z "${TARGET_MNTPNTS[TARGET_NUM]}" ]; then + echo >&2 $"`basename $0`: Missing mount point for target"\ + "${TARGET_DEVNAMES[TARGET_NUM]}!" + usage + fi + TARGET_NUM=$(( TARGET_NUM + 1 )) ;; + ?) usage esac @@ -105,23 +131,23 @@ done # Check the required parameters if [ -z "${HOSTNAME_OPT}" ]; then - echo >&2 $"`basename $0`: Lack -n option!" + echo >&2 $"`basename $0`: Missing -n option!" usage fi -if [ -z "${DEVICE_OPT}" ]; then - echo >&2 $"`basename $0`: Lack -d option!" +if [ -z "${SRVADDR_OPT}" ]; then + echo >&2 $"`basename $0`: Missing -s option!" usage fi -if [ -z "${SRVADDR_OPT}" ]; then - echo >&2 $"`basename $0`: Lack -s option!" +if [ -z "${DEVICE_OPT}" ]; then + echo >&2 $"`basename $0`: Missing -d option!" usage fi # Output verbose informations verbose_output() { - if [ "${VERBOSE_OPT}" = "yes" ]; then + if ${VERBOSE_OUTPUT}; then echo "`basename $0`: $*" fi return 0 @@ -131,8 +157,6 @@ verbose_output() { # # Get all the node names in this failover group get_nodenames() { - PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'` - declare -i idx local nodename_str nodename @@ -184,22 +208,93 @@ get_check_srvIPaddrs() { # stop_clumanager # -# Run pdsh command to stop each node's clumanager service +# Run remote command to stop each node's clumanager service stop_clumanager() { declare -i idx - local nodename_str=${PRIM_NODENAME} + local ret_str - for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do - nodename_str=${nodename_str}$","${NODE_NAMES[idx]} + for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do + ret_str=`${REMOTE} ${NODE_NAMES[idx]} \ + "/sbin/service clumanager stop" 2>&1` + if [ $? -ne 0 ]; then + echo >&2 "`basename $0`: stop_clumanager() error:"\ + "from host ${NODE_NAMES[idx]} - $ret_str!" + fi done - ${PDSH} -w ${nodename_str} /sbin/service clumanager stop + return 0 +} + +# get_srvname hostname target_devname +# +# Get the lustre target server name from the node @hostname +get_srvname() { + local host_name=$1 + local target_devname=$2 + local target_srvname= + local ret_str + + # Execute remote command to get the target server name + ret_str=`${REMOTE} ${host_name} \ + "${TUNEFS} --print ${target_devname} | grep Target:" 2>&1` if [ $? -ne 0 ]; then - echo >&2 "`basename $0`: stop_clumanager() error:"\ - "Fail to execute pdsh command!" + echo "`basename $0`: get_srvname() error:" \ + "from host ${host_name} - ${ret_str}" + return 1 + fi + + if [ "${ret_str}" != "${ret_str#*Target: }" ]; then + ret_str=${ret_str#*Target: } + target_srvname=`echo ${ret_str} | awk '{print $1}'` + fi + + if [ -z "${target_srvname}" ]; then + echo "`basename $0`: get_srvname() error: Cannot get the"\ + "server name of target ${target_devname} in ${host_name}!" return 1 fi + echo ${target_srvname} + return 0 +} + +# create_service +# +# Create service symlinks from /etc/init.d/lustre for Lustre targets +create_service() { + declare -i i + local srv_dir + local command ret_str + + # Initialize the TARGET_SRVNAMES array + unset TARGET_SRVNAMES + + # Get Lustre target service names + for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do + TARGET_SRVNAMES[i]=$(get_srvname ${PRIM_NODENAME} \ + ${TARGET_DEVNAMES[i]}) + if [ $? -ne 0 ]; then + echo >&2 "${TARGET_SRVNAMES[i]}" + return 1 + fi + done + + # Construct remote command + command=":" + for ((i = 0; i < ${#TARGET_SRVNAMES[@]}; i++)); do + command=${command}";ln -s -f ${LUSTRE_SRV} ${INIT_DIR}/${TARGET_SRVNAMES[i]}" + done + + # Execute remote command to create symlinks + for ((i = 0; i < ${#NODE_NAMES[@]}; i++)); do + ret_str=`${REMOTE} ${NODE_NAMES[i]} "${command}" 2>&1` + if [ $? -ne 0 ]; then + echo >&2 "`basename $0`: create_service() error:" \ + "from host ${NODE_NAMES[i]} - ${ret_str}" + return 1 + fi + done + return 0 } @@ -208,25 +303,69 @@ stop_clumanager() { # Check the return value of redhat-config-cluster-cmd check_retval() { if [ $1 -ne 0 ]; then - echo >&2 "`basename $0`: Fail to run ${CONFIG_CMD}!" + echo >&2 "`basename $0`: Failed to run ${CONFIG_CMD}!" return 1 fi return 0 } +# add_services +# +# Add service tags into the cluster.xml file +add_services() { + declare -i idx + declare -i i + + # Add service tag + for ((i = 0; i < ${#TARGET_SRVNAMES[@]}; i++)); do + ${CONFIG_CMD} --add_service --name=${TARGET_SRVNAMES[i]} + if ! check_retval $?; then + return 1 + fi + + ${CONFIG_CMD} --service=${TARGET_SRVNAMES[i]} \ + --userscript=${INIT_DIR}/${TARGET_SRVNAMES[i]} + if ! check_retval $?; then + return 1 + fi + + for ((idx = 0; idx < ${#SRV_IPADDRS[@]}; idx++)); do + ${CONFIG_CMD} --service=${TARGET_SRVNAMES[i]} \ + --add_service_ipaddress --ipaddress=${SRV_IPADDRS[idx]} + if ! check_retval $?; then + return 1 + fi + done + + ${CONFIG_CMD} --service=${TARGET_SRVNAMES[i]} \ + --device=${TARGET_DEVNAMES[i]} \ + --mount \ + --mountpoint=${TARGET_MNTPNTS[i]} \ + --fstype=lustre + if ! check_retval $?; then + return 1 + fi + done + + return 0 +} + # gen_cluster_xml # # Run redhat-config-cluster-cmd to create the cluster.xml file gen_cluster_xml() { declare -i idx + declare -i i local mcast_IPaddr + local node_names local hbopt_str hbopt # Run redhat-config-cluster-cmd to generate cluster.xml # Add clumembd tag if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*broadcast*}" ]; then ${CONFIG_CMD} --clumembd --broadcast=yes + ${CONFIG_CMD} --clumembd --multicast=no if ! check_retval $?; then return 1 fi @@ -242,59 +381,29 @@ gen_cluster_xml() { fi # Add cluster tag - ${CONFIG_CMD} --cluster --name='${TARGET_TYPE} failover group' - if ! check_retval $?; then - return 1 - fi - - # Add member tag + node_names= for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do - ${CONFIG_CMD} --add_member --name=${NODE_NAMES[idx]} - if ! check_retval $?; then - return 1 - fi + node_names=${node_names}"${NODE_NAMES[idx]} " done - # Add failoverdomain tag - ${CONFIG_CMD} --add_failoverdomain --name=${TARGET_TYPE}-domain + ${CONFIG_CMD} --cluster --name="${node_names}failover group" if ! check_retval $?; then return 1 fi + # Add member tag for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do - ${CONFIG_CMD} --failoverdomain=${TARGET_TYPE}-domain\ - --add_failoverdomainnode --name=${NODE_NAMES[idx]} + ${CONFIG_CMD} --add_member --name=${NODE_NAMES[idx]} if ! check_retval $?; then return 1 fi done # Add service tag - ${CONFIG_CMD} --add_service --name=${TARGET_TYPE}-service - if ! check_retval $?; then - return 1 - fi - - ${CONFIG_CMD} --service=${TARGET_TYPE}-service \ - --userscript=${LUSTRE_SRV_SCRIPT} - if ! check_retval $?; then - return 1 - fi - - ${CONFIG_CMD} --service=${TARGET_TYPE}-service \ - --failoverdomain=${TARGET_TYPE}-domain - if ! check_retval $?; then + if ! add_services; then return 1 fi - for ((idx = 0; idx < ${#SRV_IPADDRS[@]}; idx++)); do - ${CONFIG_CMD} --service=mgs-service \ - --add_service_ipaddress --ipaddress=${SRV_IPADDRS[idx]} - if ! check_retval $?; then - return 1 - fi - done - # Add other tags if [ -n "${HBOPT_OPT}"]; then hbopt_str=`echo ${HBOPT_OPT}|awk '{split($HBOPT_OPT, a, ":")}\ @@ -317,32 +426,54 @@ gen_cluster_xml() { # # Create the cluster.xml file and scp it to the each node's /etc/ create_config() { - CONFIG_PRIMNODE=${TMP_DIR}$"cluster.xml."${PRIM_NODENAME} + CONFIG_PRIMNODE=${TMP_DIR}$"/cluster.xml."${PRIM_NODENAME} declare -i idx - if [ -e ${CONFIG_PRIMNODE} ]; then - verbose_output "${CONFIG_PRIMNODE} already exists." - return 0 - fi - - # Run redhat-config-cluster-cmd to generate cluster.xml - verbose_output "Creating cluster.xml file for" \ - "${PRIM_NODENAME} failover group hosts..." - if ! gen_cluster_xml; then + # Create symlinks for Lustre services + verbose_output "Creating symlinks for lustre target services in"\ + "${PRIM_NODENAME} failover group hosts..." + if ! create_service; then return 1 fi verbose_output "OK" - /bin/cp -f ${CLUMGR_DIR}cluster.xml ${CONFIG_PRIMNODE} + if [ -s ${CONFIG_PRIMNODE} ]; then + if [ -n "`/bin/grep ${TARGET_SRVNAMES[0]} ${CONFIG_PRIMNODE}`" ] + then + verbose_output "${CONFIG_PRIMNODE} already exists." + return 0 + else + /bin/cp -f ${CONFIG_PRIMNODE} ${CLUMAN_DIR}/cluster.xml + + # Add services into the cluster.xml file + if ! add_services; then + return 1 + fi + fi + else + # Run redhat-config-cluster-cmd to generate cluster.xml + verbose_output "Creating cluster.xml file for" \ + "${PRIM_NODENAME} failover group hosts..." + if ! gen_cluster_xml; then + return 1 + fi + verbose_output "OK" + fi + + /bin/cp -f ${CLUMAN_DIR}/cluster.xml ${CONFIG_PRIMNODE} # scp the cluster.xml file to all the nodes verbose_output "Remote copying cluster.xml file to" \ "${PRIM_NODENAME} failover group hosts..." for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do - touch ${TMP_DIR}$"cluster.xml."${NODE_NAMES[idx]} - scp ${CONFIG_PRIMNODE} ${NODE_NAMES[idx]}:${CLUMGR_DIR}cluster.xml + if [ "${PRIM_NODENAME}" != "${NODE_NAMES[idx]}" ]; then + /bin/cp -f ${CONFIG_PRIMNODE} \ + ${TMP_DIR}$"/cluster.xml."${NODE_NAMES[idx]} + fi + + scp ${CONFIG_PRIMNODE} ${NODE_NAMES[idx]}:${CLUMAN_DIR}/cluster.xml if [ $? -ne 0 ]; then - echo >&2 "`basename $0`: Fail to scp cluster.xml file"\ + echo >&2 "`basename $0`: Failed to scp cluster.xml file"\ "to node ${NODE_NAMES[idx]}!" return 1 fi -- 1.8.3.1