X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fscripts%2Flc_common;h=386b3806228b04fb94f1f0002a424d584e62c167;hb=780bc7a0e9965ae76750daf15356a976afca806f;hp=06a547f6516df935b50074cb2af72868b47f1038;hpb=073e67f1647008c721d452ee3862c3f643f6c248;p=fs%2Flustre-release.git diff --git a/lustre/scripts/lc_common b/lustre/scripts/lc_common index 06a547f..386b380 100644 --- a/lustre/scripts/lc_common +++ b/lustre/scripts/lc_common @@ -1,38 +1,37 @@ -# +#!/bin/bash + # vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: + # -# lc_common - This file contains functions to be used by most or all +# lc_common - This file contains common variables and functions to be used by # Lustre cluster config scripts. # ################################################################################ -# Remote command -REMOTE=${REMOTE:-"ssh -x -q"} -#REMOTE=${REMOTE:-"pdsh -S -R ssh -w"} -export REMOTE +#****************************** Common Variables ******************************# +export PATH=$PATH:/sbin:/usr/sbin -# Lustre utilities -CMD_PATH=${CMD_PATH:-"/usr/sbin"} -MKFS=${MKFS:-"$CMD_PATH/mkfs.lustre"} -TUNEFS=${TUNEFS:-"$CMD_PATH/tunefs.lustre"} -LCTL=${LCTL:-"$CMD_PATH/lctl"} +# Remote command +export REMOTE=${REMOTE:-"ssh -x -q"} +#export REMOTE=${REMOTE:-"pdsh -S -R ssh -w"} -EXPORT_PATH=${EXPORT_PATH:-"PATH=\$PATH:/sbin:/usr/sbin;"} +# Lustre utilities +export MKFS=${MKFS:-"mkfs.lustre"} +export TUNEFS=${TUNEFS:-"tunefs.lustre"} +export LCTL=${LCTL:-"lctl"} -# Raid command path -RAID_CMD_PATH=${RAID_CMD_PATH:-"/sbin"} -MDADM=${MDADM:-"$RAID_CMD_PATH/mdadm"} +# Software RAID command +export MDADM=${MDADM:-"mdadm"} # Some scripts to be called -SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"$(cd `dirname $0`; echo $PWD)"} -MODULE_CONFIG=${SCRIPTS_PATH}/lc_modprobe -VERIFY_CLUSTER_NET=${SCRIPTS_PATH}/lc_net -GEN_HB_CONFIG=${SCRIPTS_PATH}/lc_hb -GEN_CLUMGR_CONFIG=${SCRIPTS_PATH}/lc_cluman -SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}/lc_servip -SCRIPT_GEN_MONCF=${SCRIPTS_PATH}/lc_mon -SCRIPT_CONFIG_MD=${SCRIPTS_PATH}/lc_md -SCRIPT_CONFIG_LVM=${SCRIPTS_PATH}/lc_lvm +export MODULE_CONFIG=${MODULE_CONFIG:-"lc_modprobe"} +export VERIFY_CLUSTER_NET=${VERIFY_CLUSTER_NET:-"lc_net"} +export GEN_HB_CONFIG=${GEN_HB_CONFIG:-"lc_hb"} +export GEN_CLUMGR_CONFIG=${GEN_CLUMGR_CONFIG:-"lc_cluman"} +export SCRIPT_VERIFY_SRVIP=${SCRIPT_VERIFY_SRVIP:-"lc_servip"} +export SCRIPT_GEN_MONCF=${SCRIPT_GEN_MONCF:-"lc_mon"} +export SCRIPT_CONFIG_MD=${SCRIPT_CONFIG_MD:-"lc_md"} +export SCRIPT_CONFIG_LVM=${SCRIPT_CONFIG_LVM:-"lc_lvm"} # Variables of HA software HBVER_HBV1="hbv1" # Heartbeat version 1 @@ -62,24 +61,43 @@ FS_TYPE=${FS_TYPE:-"lustre"} # Lustre filesystem type FILE_SUFFIX=${FILE_SUFFIX:-".lustre"} # Suffix of the generated config files # Marker of the MD device line -MD_MARKER=${MD_MARKER:-"MD"} +export MD_MARKER=${MD_MARKER:-"MD"} # Marker of the LVM device line -PV_MARKER=${PV_MARKER:-"PV"} -VG_MARKER=${VG_MARKER:-"VG"} -LV_MARKER=${LV_MARKER:-"LV"} +export PV_MARKER=${PV_MARKER:-"PV"} +export VG_MARKER=${VG_MARKER:-"VG"} +export LV_MARKER=${LV_MARKER:-"LV"} -declare -a CONFIG_ITEM # Items in each line of the csv file +declare -a CONFIG_ITEM # Items in each line of the CSV file declare -a NODE_NAME # Hostnames of nodes have been configured -# Nodelist variables -USE_ALLNODES=false # default is not to operate on all the nodes -SPECIFIED_NODELIST="" # specified list of nodes to be operated on -EXCLUDED_NODELIST="" # list of nodes to be excluded +declare -a MGS_NODENAME # Node names of the MGS servers +declare -a MGS_IDX # Indexes of MGSs in the global arrays +declare -i MGS_NUM # Number of MGS servers in the cluster +declare -i INIT_IDX + +# All of the Lustre target items in the CSV file +declare -a HOST_NAME MODULE_OPTS DEVICE_NAME MOUNT_POINT DEVICE_TYPE FS_NAME +declare -a MGS_NIDS INDEX FORMAT_OPTIONS MKFS_OPTIONS MOUNT_OPTIONS FAILOVERS + +# Heartbeat software requires that node names in the configuration directive +# must (normally) match the "uname -n" of that machine. Since the value of the +# "failover nids" field in the CSV file is the NID(s) of failover partner node, +# we have to figure out the corresponding hostname of that node. +declare -a FAILOVERS_NAMES -export PATH=$PATH:$CMD_PATH:$SCRIPTS_PATH:$CLUMAN_TOOLS_PATH:$RAID_CMD_PATH:/sbin:/usr/sbin +export VERIFY_CONNECT=true # Verify network connectivity by default +export USE_ALLNODES=false # Not operating on all the nodes by default +export SPECIFIED_NODELIST="" # Specified list of nodes to be operated on +export EXCLUDED_NODELIST="" # Specified list of nodes to be excluded +export NODES_TO_USE="" # Defacto list of nodes to be operated on +export NODELIST_OPT="" +export VERBOSE_OUTPUT=false +export VERBOSE_OPT="" +#****************************** Common Functions ******************************# + # verbose_output string # Output verbose information $string verbose_output() { @@ -89,6 +107,24 @@ verbose_output() { return 0 } +# error_output string +# Output error string to stderr, prefixing with ERROR +# for easy error parsing from the rest of the output. +error_output() { + echo >&2 "$(basename $0): ERROR: $*" + return 0 +} + +# error_exit rc string +# Output error to stderr via error_output and exit with rc. +error_exit() { + local rc=$1 + shift + + error_output $* + exit $rc +} + # Check whether the reomte command is pdsh is_pdsh() { if [ "${REMOTE}" = "${REMOTE#*pdsh}" ]; then @@ -103,13 +139,13 @@ is_pdsh() { check_file() { # Check argument if [ $# -eq 0 ]; then - echo >&2 "`basename $0`: check_file() error: Missing csv file!" + error_output "check_file(): Missing CSV file!" return 1 fi - CSV_FILE=$1 + local CSV_FILE=$1 if [ ! -s ${CSV_FILE} ]; then - echo >&2 "`basename $0`: check_file() error: ${CSV_FILE}"\ + error_output "check_file(): ${CSV_FILE}"\ "does not exist or is empty!" return 1 fi @@ -118,21 +154,21 @@ check_file() { } # parse_line line -# Parse a line in the csv file +# Parse a line in the CSV file parse_line() { # Check argument if [ $# -eq 0 ]; then - echo >&2 "`basename $0`: parse_line() error: Missing argument!" + error_output "parse_line(): Missing argument!" return 1 fi declare -i i=0 # Index of the CONFIG_ITEM array - declare -i length=0 + declare -i length=0 declare -i idx=0 - declare -i s_quote_flag=0 # Flag of the single quote character + declare -i s_quote_flag=0 # Flag of the single quote character declare -i d_quote_flag=0 # Flag of the double quotes character local TMP_LETTER LINE - + LINE="$*" # Initialize the CONFIG_ITEM array @@ -239,12 +275,12 @@ remote_error() { ret_str=$* if [ "${ret_str}" != "${ret_str#*connect:*}" ]; then - echo >&2 "`basename $0`: ${fn_name}() error: ${ret_str}" + error_output "${fn_name}(): ${ret_str}" return 0 fi if [ -z "${ret_str}" ]; then - echo >&2 "`basename $0`: ${fn_name}() error:" \ + error_output "${fn_name}():" \ "No results from remote!" \ "Check network connectivity between the local host and ${host_addr}!" return 0 @@ -267,21 +303,17 @@ nid2hostname() { echo "`basename $0`: nid2hostname() error: Invalid nid - \"${nid}\"!" return 1 fi - + case "${nettype}" in lo*) host_name=`hostname`;; elan*) # QsNet # FIXME: Parse the /etc/elanhosts configuration file to # convert ElanID to hostname ;; - gm*) # Myrinet - # FIXME: Use /usr/sbin/gmlndnid to find the hostname of - # the specified GM Global node ID - ;; ptl*) # Portals # FIXME: Convert portal ID to hostname ;; - *) # tcp, o2ib, cib, openib, iib, vib, ra + *) # tcp, o2ib, ra ip_addr=${addr} # Is it IP address or hostname? if [ -n "`echo ${ip_addr} | sed -e 's/\([0-9]\{1,3\}\.\)\{3,3\}[0-9]\{1,3\}//'`" ] @@ -292,7 +324,7 @@ nid2hostname() { fi # Execute remote command to get the host name - ret_str=$(${REMOTE} ${ip_addr} "hostname" 2>&1) + ret_str=$(${REMOTE} ${ip_addr} "hostname" 2>&1 &1 && return 1 all_nodelist=$(egrep -v "([[:space:]]|^)#" ${csv_file} | cut -d, -f 1) @@ -477,7 +509,7 @@ get_csv_nodelist() { get_nodelist() { local ALL_NODELIST - # Get the list of all the nodes in the csv file + # Get the list of all the nodes in the CSV file ALL_NODELIST=$(get_csv_nodelist ${CSV_FILE}) [ ${PIPESTATUS[0]} -ne 0 ] && echo "${ALL_NODELIST}" && return 1 @@ -513,12 +545,518 @@ check_nodelist() { local nodes_to_use=$1 if [ -z "${nodes_to_use}" ]; then - echo "`basename $0`: There are no hosts to be operated on."\ + error_output "There are no nodes to be operated on."\ "Check the node selection options (-a, -w or -x)." - usage + usage 1>&2 + return 1 else verbose_output "Operating on the following nodes: ${nodes_to_use}" fi return 0 } + +# nid_in_nidlist nid nidlist +# Given a nid, and a list of nids in one node (delimited by comma ','), +# return true if the nid appears in the list of nids, or false otherwise. +nid_in_nidlist() { + local nid="$1" + local nidlist="$2" + local my_nid + + [ -z "${nid}" -o -z "${nidlist}" ] && false && return + + if [[ "${nid}" != *@* || "${nid#*@}" == tcp* ]]; then + # network type is tcp + for my_nid in ${nidlist//,/ }; do + [ "${nid%@*}" = "${my_nid%@*}" ] && true && return + done + else + # network type is not tcp + [[ ,${nidlist}, == *,${nid},* ]] && true && return + fi + + false && return +} + +# get_mgs_nids mgs_hostname mgs_nids +# Get the corresponding NID(s) of the MGS node ${mgs_hostname} from the +# "mgs nids" field of one lustre target in the CSV file +get_mgs_nids() { + local mgs_node="$1" + local all_mgs_nids="$2" + local mgs_nids + local ret_str + + # Check whether the hostname of the mgs node is in + # the mgs nids string + for mgs_nids in ${all_mgs_nids//:/ }; do + if nid_in_nidlist ${mgs_node} ${mgs_nids}; then + echo ${mgs_nids} + return 0 + fi + done + + # Let's use lctl to get the real nids from the mgs node + ret_str=$($REMOTE $mgs_node "PATH=\$PATH:/sbin:/usr/sbin +$LCTL list_nids" 2>&1 /dev/null" + if [ ${PIPESTATUS[0]} -eq 0 ]; then + # This node can contact the MGS node + verbose_output "${HOST_NAME[i]} can contact the MGS" \ + "node $mgs_node by using nid \"$mgs_nid\"!" + ping_mgs=true + break + fi + done + done + + if ! ${ping_mgs}; then + error_output "check_lnet_connect():" \ + "${HOST_NAME[i]} cannot contact the MGS node ${mgs_node}"\ + "with nids - \"${nids_str}\"! Check ${LCTL} command!" + return 1 + fi + + return 0 +} + +# Start lnet network in the cluster node and check that +# this node can contact the MGS node +check_lnet() { + if ! $VERIFY_CONNECT; then + return 0 + fi + + # Check argument + if [ $# -eq 0 ]; then + error_output "check_lnet(): Missing argument!" + return 1 + fi + + declare -i i=$1 + declare -i j + local ret_str + + # Execute remote command to start lnet network + verbose_output "Starting lnet network on ${HOST_NAME[i]}" + ret_str=$($REMOTE ${HOST_NAME[i]} "PATH=\$PATH:/sbin:/usr/sbin +modprobe lnet && $LCTL network up" 2>&1) + if [ ${PIPESTATUS[0]} -ne 0 ]; then + error_output "check_lnet(): start lnet network on" \ + "${HOST_NAME[i]} error: $ret_str" + return 1 + fi + + if is_mgs_node ${HOST_NAME[i]}; then + return 0 + fi + + # Execute remote command to check that + # this node can contact the MGS node + for ((j = 0; j < ${MGS_NUM}; j++)); do + if ! check_lnet_connect $i ${MGS_NODENAME[j]}; then + return 1 + fi + done + + return 0 +} + +# Start lnet network in the MGS node +start_mgs_lnet() { + declare -i i + declare -i idx + + if [ -z "${MGS_NODENAME[0]}" -a -z "${MGS_NODENAME[1]}" ]; then + if ${USE_ALLNODES}; then + verbose_output "There is no MGS target in the ${CSV_FILE} file." + else + verbose_output "There is no MGS target in the node list \"${NODES_TO_USE}\"." + fi + return 0 + fi + + for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do + # Execute remote command to add lnet options lines to + # the MGS node's modprobe.conf/modules.conf + idx=${MGS_IDX[i]} + add_module_options $idx ${MGS_NODENAME[i]} || return ${PIPESTATUS[0]} + + # Start lnet network in the MGS node + check_lnet $idx || return ${PIPESTATUS[0]} + done + + return 0 +} + +# Get all the Lustre target items in the CSV file and do some checks. +get_lustre_items() { + # Check argument + if [ $# -eq 0 ]; then + error_output "get_lustre_items(): Missing argument"\ + "for function get_lustre_items()!" + return 1 + fi + + local CSV_FILE=$1 + local LINE + local marker + local hostname + declare -i line_num=0 + declare -i idx=0 + + exec 9< ${CSV_FILE} + while read -u 9 -r LINE; do + line_num=${line_num}+1 + # verbose_output "Parsing line ${line_num}: $LINE" + + # Get rid of the empty line + [ -z "`echo ${LINE} | awk '/[[:alnum:]]/ {print $0}'`" ] && continue + + # Get rid of the comment line + [ -z "`echo \"${LINE}\" | egrep -v \"([[:space:]]|^)#\"`" ] && continue + + # Skip the Linux MD/LVM line + marker=$(echo ${LINE} | cut -d, -f 2) + if [ "${marker}" = "${MD_MARKER}" -o "${marker}" = "${PV_MARKER}" ] \ + || [ "${marker}" = "${VG_MARKER}" -o "${marker}" = "${LV_MARKER}" ]; then + continue + fi + + # Skip the host which is not specified in the host list + if ! ${USE_ALLNODES}; then + hostname=$(echo ${LINE} | cut -d, -f 1) + ! host_in_hostlist ${hostname} ${NODES_TO_USE} && continue + fi + + # Parse the config line into CONFIG_ITEM + if ! parse_line "$LINE"; then + error_output "parse_line(): Occurred"\ + "on line ${line_num} in ${CSV_FILE}: $LINE" + return 1 + fi + + HOST_NAME[idx]=${CONFIG_ITEM[0]} + MODULE_OPTS[idx]=${CONFIG_ITEM[1]} + DEVICE_NAME[idx]=${CONFIG_ITEM[2]} + MOUNT_POINT[idx]=${CONFIG_ITEM[3]} + DEVICE_TYPE[idx]=${CONFIG_ITEM[4]} + FS_NAME[idx]=${CONFIG_ITEM[5]} + MGS_NIDS[idx]=${CONFIG_ITEM[6]} + INDEX[idx]=${CONFIG_ITEM[7]} + FORMAT_OPTIONS[idx]=${CONFIG_ITEM[8]} + MKFS_OPTIONS[idx]=${CONFIG_ITEM[9]} + MOUNT_OPTIONS[idx]=${CONFIG_ITEM[10]} + FAILOVERS[idx]=${CONFIG_ITEM[11]} + + MODULE_OPTS[idx]=`echo "${MODULE_OPTS[idx]}" | sed 's/"/\\\"/g'` + + # Convert IP addresses in NIDs to hostnames + FAILOVERS_NAMES[idx]=$(ip2hostname_multi_node ${FAILOVERS[idx]}) + if [ ${PIPESTATUS[0]} -ne 0 ]; then + error_output "${FAILOVERS_NAMES[idx]}" + return 1 + fi + + # Check some required items for formatting target + if ! check_lustre_item $idx; then + error_output "check_lustre_item():"\ + "Occurred on line ${line_num} in ${CSV_FILE}." + return 1 + fi + + idx=${idx}+1 + done + + return 0 +}