From: Gabriele Paciucci Date: Thu, 1 Sep 2016 15:54:55 +0000 (+0100) Subject: LU-8457 pacemaker: Pacemaker script to monitor LNet X-Git-Tag: 2.9.53~19 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=9018f11cd5a1ab82353e79271163ef51db081e95 LU-8457 pacemaker: Pacemaker script to monitor LNet A new script to be used in Pacemaker to monitor LNet, compatible with ZFS and LDISKFS based Lustre server installations. This RA is able to monitor a single LNet device using Pacemaker's clone technology. pcs resource create [Resource Name] ocf:lustre:healthLNET dampen=[seconds 5s] multiplier=[number 1000] lctl=[true|false] device=[device name ib0] host_list=[list of NIDs, space separated] --clone where: * dampen The time to wait (dampening) for further changes to occur * multiplier The number by which to multiply the number of connected ping nodes * attempts Number of ping attempts, per host, before declaring it dead * timeout How long, in seconds, to wait before declaring a ping lost * lctl Option to enable lctl ping instead of the normal ping. The default is true * device Device used for the LNET network. We assume the same device across the cluster This script should be located in /usr/lib/ocf/resource.d/lustre/ on both Lustre servers with permission 755. Test-Parameters: trivial Signed-off-by: Gabriele Paciucci Change-Id: I6292ce36dde0083fa95cb1d047fe582bd7d53116 Reviewed-on: https://review.whamcloud.com/22266 Tested-by: Jenkins Reviewed-by: Nathaniel Clark Reviewed-by: Christopher J. 
Morrone Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/contrib/scripts/pacemaker/healthLNET b/contrib/scripts/pacemaker/healthLNET new file mode 100644 index 0000000..fc958c9 --- /dev/null +++ b/contrib/scripts/pacemaker/healthLNET @@ -0,0 +1,410 @@ +#!/bin/sh +# +# +# LNet OCF RA +# + +# License: GNU General Public License (GPL)v2 +# Description: Monitors LNet connectivity on Lustre servers (ZFS or LDISKFS based) +# Written by: Gabriele Paciucci +# Release Date: 01 September 2016 +# Release Version: 0.99 + +# Copyright (c) 2009 Andrew Beekhof +# Copyright (c) 2016, Intel Corporation + +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs} +. 
${OCF_FUNCTIONS} +: ${__OCF_ACTION=$1} + +####################################################################### + +meta_data() { + cat < + + +0.99 + + +Every time the monitor action is run, this resource agent records (in the CIB) +the current number of lctl ping nodes the host can connect to. + +LNet connectivity + + + + +PID file +PID file + + + + + +The time to wait (dampening) further changes occur + +Dampening interval + + + + + +The name of the attributes to set. This is the name to be used in the constraints. + +Attribute name + + + + + +The number by which to multiply the number of connected ping nodes by + +Value multiplier + + + + + +The list of ping nodes to count. + +Host list + + + + + +Number of ping attempts, per host, before declaring it dead + +no. of ping attempts + + + + + +How long, in seconds, to wait before declaring a ping lost + +ping timeout in seconds + + + + + +Option to enable lctl ping. The default is true + +Extra Options + + + + + +Device used for the LNET network. We assume the same device accross the cluster + +LNET device + + + + + + +A catch all for any other options that need to be passed to ping. + +Extra Options + + + + + +Resource is failed if the score is less than failure_score. +Default never fails. + +failure_score + + + + + +Enables to use default attrd_updater verbose logging on every call. + +Verbose logging + + + + + + + + + + + + + + +END +} + +####################################################################### + +ping_conditional_log() { + level=$1; shift + if [ ${OCF_RESKEY_debug} = "true" ]; then + ocf_log $level "$*" + fi +} + +ping_usage() { + cat <&1`; rc=$? 
+ # debug + # ocf_log info "$lctl_exe $host $OCF_RESKEY_timeout" + + case $rc in + 0) active=`expr $active + 1`;; + 1) ping_conditional_log warn "$host is inactive: $lctl_out";; + *) ocf_log err "Unexpected result for '$lctl_exe $host $OCF_RESKEY_timeout' $rc: $p_out";; + esac + done + return $active + + +} + + + +ping_check() { + active=0 + for host in $OCF_RESKEY_host_list; do + p_exe=ping + + case `uname` in + Linux) p_args="-n -q -W $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts";; + Darwin) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";; + *) ocf_log err "Unknown host type: `uname`"; exit $OCF_ERR_INSTALLED;; + esac + + case $host in + *:*) p_exe=ping6 + esac + + p_out=`$p_exe $p_args $OCF_RESKEY_options $host 2>&1`; rc=$? + + case $rc in + 0) active=`expr $active + 1`;; + 1) ping_conditional_log warn "$host is inactive: $p_out";; + *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out";; + esac + done + return $active +} + +ping_update() { + # first I'm testing if I have the physical link up. + # If not I give up without any additional tests. + # but first we need to find which is the device we are using on the localhost. + + CARRIER=/sys/class/net/$OCF_RESKEY_device/carrier + OPERSTATE=/sys/class/net/$OCF_RESKEY_device/operstate + + CAR_STAT=$(cat $CARRIER) + OPER_STAT=$(cat $OPERSTATE) + + # debug + # ocf_log info "$CAR_STAT - $OPER_STAT" + + if [ "$CAR_STAT" == "1" ] && [ "$OPER_STAT" == "up" ]; then + if [ ${OCF_RESKEY_lctl} = "true" ]; then + lctl_check + active=$? + else + ping_check + active=$? + fi + else + active=0 + fi + + # debug + # ocf_log info "$active" + + score=`expr $active \* $OCF_RESKEY_multiplier` + attrd_updater -n $OCF_RESKEY_name -v $score -d $OCF_RESKEY_dampen $attrd_options + rc=$? 
+ case $rc in + 0) ping_conditional_log debug "Updated $OCF_RESKEY_name = $score" ;; + *) ocf_log warn "Could not update $OCF_RESKEY_name = $score: rc=$rc";; + esac + if [ $rc -ne 0 ]; then + return $rc + fi + + if [ -n "$OCF_RESKEY_failure_score" -a "$score" -lt "$OCF_RESKEY_failure_score" ]; then + ocf_log warn "$OCF_RESKEY_name is less than failure_score($OCF_RESKEY_failure_score)" + return 1 + fi + return 0 +} + +: ${OCF_RESKEY_name:="pingd"} +: ${OCF_RESKEY_dampen:="5s"} +: ${OCF_RESKEY_attempts:="3"} +: ${OCF_RESKEY_multiplier:="1"} +: ${OCF_RESKEY_debug:="false"} +: ${OCF_RESKEY_lctl:="true"} +#: ${OCF_RESKEY_device:="eth1"} +: ${OCF_RESKEY_failure_score:="0"} + +: ${OCF_RESKEY_CRM_meta_timeout:="20000"} +: ${OCF_RESKEY_CRM_meta_globally_unique:="true"} + +integer=`echo ${OCF_RESKEY_timeout} | egrep -o '[0-9]*'` +case ${OCF_RESKEY_timeout} in + *[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=`expr $integer / 1000`;; + *[0-9]m|*[0-9]min) OCF_RESKEY_timeout=`expr $integer \* 60`;; + *[0-9]h|*[0-9]hr) OCF_RESKEY_timeout=`expr $integer \* 60 \* 60`;; + *) OCF_RESKEY_timeout=$integer;; +esac + +if [ -z ${OCF_RESKEY_timeout} ]; then + if [ x"$OCF_RESKEY_host_list" != x ]; then + host_count=`echo $OCF_RESKEY_host_list | awk '{print NF}'` + OCF_RESKEY_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / $host_count / $OCF_RESKEY_attempts` + OCF_RESKEY_timeout=`expr $OCF_RESKEY_timeout / 1100` # Convert to seconds and finish 10% early + else + OCF_RESKEY_timeout=5 + fi +fi + +if [ ${OCF_RESKEY_timeout} -lt 1 ]; then + OCF_RESKEY_timeout=5 +elif [ ${OCF_RESKEY_timeout} -gt 1000 ]; then + # ping actually complains if this value is too high, 5 minutes is plenty + OCF_RESKEY_timeout=300 +fi + +if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then + : ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESKEY_name}"} +else + : ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}"} +fi + +attrd_options='-q' +if ocf_is_true ${OCF_RESKEY_debug} ; then + attrd_options='' +fi + +# 
Check the debug option +case "${OCF_RESKEY_debug}" in + true|True|TRUE|1) OCF_RESKEY_debug=true;; + false|False|FALSE|0) OCF_RESKEY_debug=false;; + *) + ocf_log warn "Value for 'debug' is incorrect. Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}" + OCF_RESKEY_debug=false + ;; +esac + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) ping_start;; +stop) ping_stop;; +monitor) ping_monitor;; +reload) ping_start;; +validate-all) ping_validate;; +usage|help) ping_usage + exit $OCF_SUCCESS + ;; +*) ping_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac