8 # License: GNU General Public License (GPL)v2
9 # Description: Manages ZFS and Lustre on a shared storage
10 # Written by: Gabriele Paciucci
11 # Release Date: 01 November 2016
12 # Release Version: 0.99.4
14 # Copyright (c) 2009 Andrew Beekhof
15 # Copyright (c) 2016, Intel Corporation
19 # This program is free software; you can redistribute it and/or modify
20 # it under the terms of version 2 of the GNU General Public License as
21 # published by the Free Software Foundation.
23 # This program is distributed in the hope that it would be useful, but
24 # WITHOUT ANY WARRANTY; without even the implied warranty of
25 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
27 # Further, this software is distributed without any warranty that it is
28 # free of the rightful claim of any third person regarding infringement
29 # or the like. Any license provided herein, whether implied or
30 # otherwise, applies only to this software file. Patent licenses, if
31 # any, provided herein do not apply to combinations of this program with
32 # other software, or any other product whatsoever.
34 # You should have received a copy of the GNU General Public License
35 # along with this program. If not, see <http://www.gnu.org/licenses/>.
38 #######################################################################
41 : ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
45 #######################################################################
50 <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
51 <resource-agent name="healthLNET">
52 <version>0.99.4</version>
55 Every time the monitor action is run, this resource agent records (in the CIB)
56 the current number of lctl ping nodes the host can connect to.
58 <shortdesc lang="en">LNet connectivity</shortdesc>
62 <parameter name="pidfile" unique="0">
63 <longdesc lang="en">PID file</longdesc>
64 <shortdesc lang="en">PID file</shortdesc>
65 <content type="string" default="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}" />
68 <parameter name="dampen" unique="0">
70 The time to wait (dampening) for further changes to occur
72 <shortdesc lang="en">Dampening interval</shortdesc>
73 <content type="integer" default="5s"/>
76 <parameter name="name" unique="0">
78 The name of the attributes to set. This is the name to be used in the constraints.
80 <shortdesc lang="en">Attribute name</shortdesc>
81 <content type="string" default="pingd"/>
84 <parameter name="multiplier" unique="0">
86 The number by which to multiply the number of connected ping nodes
88 <shortdesc lang="en">Value multiplier</shortdesc>
89 <content type="integer" default="1"/>
92 <parameter name="host_list" unique="0" required="1">
94 The list of ping nodes to count.
96 <shortdesc lang="en">Host list</shortdesc>
97 <content type="string" default=""/>
100 <parameter name="attempts" unique="0">
102 Number of ping attempts, per host, before declaring it dead
104 <shortdesc lang="en">no. of ping attempts</shortdesc>
105 <content type="integer" default="3"/>
108 <parameter name="timeout" unique="0">
110 How long, in seconds, to wait before declaring a ping lost
112 <shortdesc lang="en">ping timeout in seconds</shortdesc>
113 <content type="integer" default="2"/>
116 <parameter name="lctl" unique="0">
118 Option to enable lctl ping. The default is true
120 <shortdesc lang="en">Enable lctl ping</shortdesc>
121 <content type="string" default="true"/>
124 <parameter name="device" unique="0">
126 Device used for the LNET network. We assume the same device across the cluster
128 <shortdesc lang="en">LNET device</shortdesc>
129 <content type="string" default=""/>
133 <parameter name="options" unique="0">
135 A catch all for any other options that need to be passed to ping.
137 <shortdesc lang="en">Extra Options</shortdesc>
138 <content type="string" default=""/>
141 <parameter name="failure_score" unique="0">
143 Resource is failed if the score is less than failure_score.
146 <shortdesc lang="en">failure_score</shortdesc>
147 <content type="integer" default=""/>
150 <parameter name="debug" unique="0">
152 Enables the default verbose attrd_updater logging on every call.
154 <shortdesc lang="en">Verbose logging</shortdesc>
155 <content type="string" default="false"/>
161 <action name="start" timeout="300s" />
162 <action name="stop" timeout="300s" />
163 <action name="reload" timeout="300s" />
164 <action name="monitor" depth="0" timeout="300s" interval="20s"/>
165 <action name="meta-data" timeout="5" />
166 <action name="validate-all" timeout="30" />
172 #######################################################################
# Conditional logger: callers pass a log level followed by a message
# (e.g. "warn" for inactive hosts, "debug" for attribute updates).
# The guarded branch runs only when OCF_RESKEY_debug is exactly "true".
# NOTE(review): the rest of the body is not visible here; presumably it
# forwards the arguments to ocf_log - confirm.
174 ping_conditional_log() {
176 if [ ${OCF_RESKEY_debug} = "true" ]; then
183 usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate-all|meta-data}
185 Expects to have a fully populated OCF RA-compliant environment set.
192 if [ $rc -ne 0 ]; then
193 return $OCF_ERR_INSTALLED
196 if [ $? = $OCF_SUCCESS ]; then
199 touch ${OCF_RESKEY_pidfile}
204 rm -f ${OCF_RESKEY_pidfile}
205 attrd_updater -D -n $OCF_RESKEY_name -d $OCF_RESKEY_dampen $attrd_options
210 if [ -f ${OCF_RESKEY_pidfile} ]; then
212 if [ $? -eq 0 ]; then
215 return $OCF_ERR_GENERIC
217 return $OCF_NOT_RUNNING
# Sanity checks on the pidfile location and on the host list.
221 # Is the state directory writable?
222 state_dir=`dirname "$OCF_RESKEY_pidfile"`
# Writability is probed by creating a scratch file named after this shell's PID ($$).
223 touch "$state_dir/$$"
225 ocf_log err "Invalid location for 'state': $state_dir is not writable"
230 # The pidfile should be an absolute path; a relative one only triggers a warning
231 case $OCF_RESKEY_pidfile in
233 *) ocf_log warn "You should use an absolute path for pidfile not: $OCF_RESKEY_pidfile" ;;
236 # Check the host list: an empty list is a configuration error and ends the agent
237 if [ "x" = "x$OCF_RESKEY_host_list" ]; then
238 ocf_log err "Empty host_list. Please specify some nodes to ping"
239 exit $OCF_ERR_CONFIGURED
# Probe every host in OCF_RESKEY_host_list with the $lctl_exe command,
# passing the host and the timeout, and count the hosts that answer.
249 for host in $OCF_RESKEY_host_list; do
# Both stdout and stderr of the probe are captured so they can be logged.
252 lctl_out=`$lctl_exe $host $OCF_RESKEY_timeout 2>&1`; rc=$?
254 # ocf_log info "$lctl_exe $host $OCF_RESKEY_timeout"
# rc 0 counts the host as active; rc 1 is logged through ping_conditional_log;
# any other rc is logged as an error together with the captured probe output.
257 0) active=`expr $active + 1`;;
258 1) ping_conditional_log warn "$host is inactive: $lctl_out";;
259 *) ocf_log err "Unexpected result for '$lctl_exe $host $OCF_RESKEY_timeout' $rc: $lctl_out";;
# Probe every host in OCF_RESKEY_host_list with the $p_exe command and count
# the hosts that answer.
271 for host in $OCF_RESKEY_host_list; do
# The timeout and attempt count are passed with platform-specific flags
# (-W on Linux, -t on Darwin); any other platform ends the agent with
# OCF_ERR_INSTALLED.
275 Linux) p_args="-n -q -W $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts";;
276 Darwin) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";;
277 *) ocf_log err "Unknown host type: `uname`"; exit $OCF_ERR_INSTALLED;;
# Both stdout and stderr of the probe are captured so they can be logged.
# NOTE(review): $OCF_RESKEY_options is unquoted, presumably so it can carry
# several extra arguments - confirm before quoting it.
284 p_out=`$p_exe $p_args $OCF_RESKEY_options $host 2>&1`; rc=$?
# rc 0 counts the host as active; rc 1 is logged through ping_conditional_log;
# any other rc is logged as an error together with the captured probe output.
287 0) active=`expr $active + 1`;;
288 1) ping_conditional_log warn "$host is inactive: $p_out";;
289 *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out";;
296 # First check that the physical link is up;
297 # if it is not, give up without running any further tests.
298 # The link state is read from sysfs for the device named by OCF_RESKEY_device.
# NOTE(review): no default is set for OCF_RESKEY_device (the eth1 default is
# commented out), so an unset device yields the paths /sys/class/net//carrier
# and /sys/class/net//operstate - confirm that the parameter is always supplied.
300 CARRIER=/sys/class/net/$OCF_RESKEY_device/carrier
301 OPERSTATE=/sys/class/net/$OCF_RESKEY_device/operstate
303 CAR_STAT=$(cat $CARRIER)
304 OPER_STAT=$(cat $OPERSTATE)
307 # ocf_log info "$CAR_STAT - $OPER_STAT"
# Hosts are probed only when carrier reads "1" and operstate reads "up".
# NOTE(review): "==" inside [ ] is not POSIX; confirm the interpreter accepts it.
310 if [ "$CAR_STAT" == "1" ] && [ "$OPER_STAT" == "up" ]; then
# OCF_RESKEY_lctl must be exactly "true" for this branch to be taken.
311 if [ ${OCF_RESKEY_lctl} = "true" ]; then
323 # ocf_log info "$active"
# score = number of active hosts * multiplier; it is published with
# attrd_updater under the attribute name OCF_RESKEY_name, using the
# configured dampening interval.
325 score=`expr $active \* $OCF_RESKEY_multiplier`
326 attrd_updater -n $OCF_RESKEY_name -v $score -d $OCF_RESKEY_dampen $attrd_options
# A successful update is logged through ping_conditional_log; a failed one
# is always logged as a warning.
329 0) ping_conditional_log debug "Updated $OCF_RESKEY_name = $score" ;;
330 *) ocf_log warn "Could not update $OCF_RESKEY_name = $score: rc=$rc";;
332 if [ $rc -ne 0 ]; then
# A score of zero is reported as an LNet failure.
335 if [ $score -eq 0 ]; then
336 ocf_log err "LNet connection failed please check"
# When failure_score is non-empty, a score below it triggers a warning.
338 if [ -n "$OCF_RESKEY_failure_score" -a "$score" -lt "$OCF_RESKEY_failure_score" ]; then
339 ocf_log warn "$OCF_RESKEY_name is less than failure_score($OCF_RESKEY_failure_score)"
# Defaults applied to every parameter that is unset or empty
# (the ":=" form assigns only in that case).
345 : ${OCF_RESKEY_name:="pingd"}
346 : ${OCF_RESKEY_dampen:="5s"}
347 : ${OCF_RESKEY_attempts:="3"}
348 : ${OCF_RESKEY_multiplier:="1"}
349 : ${OCF_RESKEY_debug:="false"}
350 : ${OCF_RESKEY_lctl:="true"}
# The device default is deliberately disabled, so OCF_RESKEY_device has no fallback.
351 #: ${OCF_RESKEY_device:="eth1"}
352 : ${OCF_RESKEY_failure_score:="0"}
# Fallbacks for the CRM meta attributes read further down.
354 : ${OCF_RESKEY_CRM_meta_timeout:="20000"}
355 : ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
# Reduce OCF_RESKEY_timeout to whole seconds: extract the digits, then scale
# by the unit suffix (ms/msec, m/min, h/hr); any other form is taken as seconds.
# NOTE(review): expr performs integer division, so a value below 1000ms becomes 0 here.
357 integer=`echo ${OCF_RESKEY_timeout} | egrep -o '[0-9]*'`
358 case ${OCF_RESKEY_timeout} in
359 *[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=`expr $integer / 1000`;;
360 *[0-9]m|*[0-9]min) OCF_RESKEY_timeout=`expr $integer \* 60`;;
361 *[0-9]h|*[0-9]hr) OCF_RESKEY_timeout=`expr $integer \* 60 \* 60`;;
362 *) OCF_RESKEY_timeout=$integer;;
# No timeout configured: derive one from OCF_RESKEY_CRM_meta_timeout, divided
# across the number of hosts (counted as awk fields) and the attempts per host.
365 if [ -z ${OCF_RESKEY_timeout} ]; then
366 if [ x"$OCF_RESKEY_host_list" != x ]; then
367 host_count=`echo $OCF_RESKEY_host_list | awk '{print NF}'`
368 OCF_RESKEY_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / $host_count / $OCF_RESKEY_attempts`
369 OCF_RESKEY_timeout=`expr $OCF_RESKEY_timeout / 1100` # Convert to seconds and finish 10% early
# Range check on the resulting number of seconds.
# NOTE(review): the upper check triggers above 1000 but then assigns 300, so
# values from 301 to 1000 pass through unchanged - confirm this is intended.
375 if [ ${OCF_RESKEY_timeout} -lt 1 ]; then
377 elif [ ${OCF_RESKEY_timeout} -gt 1000 ]; then
378 # ping actually complains if this value is too high, 5 minutes is plenty
379 OCF_RESKEY_timeout=300
# Default pidfile location under $HA_VARRUN, used only when no pidfile was
# configured. Two names are visible: one keyed by the attribute name, selected
# when globally_unique is "false", and one keyed by the resource instance.
382 if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
383 : ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESKEY_name}"}
385 : ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}"}
389 if ocf_is_true ${OCF_RESKEY_debug} ; then
393 # Check the debug option
# Accepted spellings are folded to the literal strings "true" / "false";
# any other value produces a warning and is treated as false.
394 case "${OCF_RESKEY_debug}" in
395 true|True|TRUE|1) OCF_RESKEY_debug=true;;
396 false|False|FALSE|0) OCF_RESKEY_debug=false;;
398 ocf_log warn "Value for 'debug' is incorrect. Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}"
399 OCF_RESKEY_debug=false
# Dispatch on the action name held in $__OCF_ACTION.
403 case $__OCF_ACTION in
409 monitor) ping_monitor;;
411 validate-all) ping_validate;;
412 usage|help) ping_usage
# NOTE(review): presumably the fallback arm for unrecognized actions - the
# arm's pattern is not visible here; confirm.
416 exit $OCF_ERR_UNIMPLEMENTED