From fe3ddff77475dd487873e2f873835455aab7aa38 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Fri, 1 Aug 2014 12:14:58 -0400 Subject: [PATCH] LU-5030 utils: fix hard-coded /proc/fs/lustre in scripts In the upstream Linux kernel, the files under /proc/fs/lustre and lnet will be moved in the future to use sysfs. Lustre handles this by providing access to this data with the tool lctl which is independent of where the data is located. Many scripts directly access the proc file system instead of using lctl so this patch migrates those scripts to do the proper thing. Signed-off-by: Chao Wang Change-Id: I1d96ccd27fee2b0eb0bf173a4e37adacb628f83c Reviewed-on: http://review.whamcloud.com/10534 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Minh Diep Reviewed-by: Oleg Drokin --- lnet/utils/lbstats | 8 +- lustre-iokit/ior-survey/ior-survey | 3 +- .../obdfilter-survey/README.obdfilter-survey | 5 +- lustre-iokit/ost-survey/ost-survey | 48 +- lustre-iokit/stats-collect/iokit-lstats | 96 +- lustre/contrib/lustre_server.sh | 66 +- lustre/doc/l_getgroups.8 | 6 +- lustre/doc/llobdstat.8 | 4 +- lustre/doc/llstat.8 | 23 +- lustre/scripts/Makefile.am | 2 +- lustre/scripts/lnet | 28 +- lustre/scripts/lustre | 62 +- lustre/scripts/lustre_createcsv.in | 2106 -------------------- lustre/scripts/lustre_req_history | 22 +- lustre/tests/qos.sh | 14 +- lustre/tests/runiozone | 2 +- lustre/tests/sanity.sh | 57 +- lustre/tests/sanityn.sh | 23 +- lustre/tests/test-framework.sh | 52 +- lustre/utils/llobdstat | 12 +- lustre/utils/llstat | 9 +- 21 files changed, 310 insertions(+), 2338 deletions(-) delete mode 100644 lustre/scripts/lustre_createcsv.in diff --git a/lnet/utils/lbstats b/lnet/utils/lbstats index a8f0857..6cf4a52 100755 --- a/lnet/utils/lbstats +++ b/lnet/utils/lbstats @@ -1,11 +1,13 @@ #!/bin/bash echo "=== Router Buffers =======" -test -e /proc/sys/lnet/buffers && cat /proc/sys/lnet/buffers +lctl get_param -n buffers 2> /dev/null echo + echo "=== NIs ============================================" -test -e /proc/sys/lnet/nis && cat /proc/sys/lnet/nis +lctl get_param -n nis 2> /dev/null echo + echo "=== Peers =============================================================" -test -e /proc/sys/lnet/peers && cat /proc/sys/lnet/peers +lctl get_param -n peers 2> /dev/null echo diff --git a/lustre-iokit/ior-survey/ior-survey b/lustre-iokit/ior-survey/ior-survey index a2d6724..c632d31 100644 --- a/lustre-iokit/ior-survey/ior-survey +++ b/lustre-iokit/ior-survey/ior-survey @@ -100,8 +100,7 @@ dump_cache() { # we are assuming mpi uses will also have pdsh local clients=$1;shift local tmpfile=$1;shift - clear_cache='for LRU in /proc/fs/lustre/ldlm/namespaces/*/lru_size; - do sudo /bin/bash -c "echo clear > $LRU"; done' + clear_cache='lctl set_param ldlm.namespaces.*.lru_size=clear' echo "=> $clear_cache" >> $tmpfile $pdsh $pdsh_args "$test_clients" "$clear_cache" >> $tmpfile 2>&1 status=$? diff --git a/lustre-iokit/obdfilter-survey/README.obdfilter-survey b/lustre-iokit/obdfilter-survey/README.obdfilter-survey index a05a3d6..8dc63c2 100644 --- a/lustre-iokit/obdfilter-survey/README.obdfilter-survey +++ b/lustre-iokit/obdfilter-survey/README.obdfilter-survey @@ -113,9 +113,8 @@ is to be done. e.g. $ nobjhi=2 thrhi=2 size=1024 targets="" \ case=network sh obdfilter-survey -On server side you can see the stats at: - /proc/fs/lustre/obdecho//stats -where, 'echo_srv' is the obdecho server created through script. +On server side you can see the stats with the following command: + lctl get_param obdecho.*.stats NOTE: In network test only automated run is supported. diff --git a/lustre-iokit/ost-survey/ost-survey b/lustre-iokit/ost-survey/ost-survey index fc93117..a18f311 100755 --- a/lustre-iokit/ost-survey/ost-survey +++ b/lustre-iokit/ost-survey/ost-survey @@ -30,27 +30,30 @@ sub usage () { # ost_count subroutine ets globle variable $OST with Number of OST's # Also fills 1 for active OST indexes in ACTIVEOST_INX array. sub ost_count () { - # numobd gives number of ost's and activeobd gives number of active ost's - my $tempfile = glob ("/proc/fs/lustre/lov/*-clilov-*/activeobd"); - open(PTR, $tempfile) || die "Cannot open $tempfile: $!\n"; - $OSTS = ; - close PTR; + $OSTS = `lctl get_param -n lov.*-clilov-*.activeobd`; + if ( $? ) { + die "Read lov.*-clilov-*.activeobd error: $?\n"; + } print "Number of Active OST devices : $OSTS"; - my $tempfile = glob ("/proc/fs/lustre/lov/*-clilov-*/numobd"); - open(PTR, $tempfile) || die "Cannot open $tempfile: $!\n"; - $numost = ; - close PTR; + + $numost = `lctl get_param -n lov.*-clilov-*.numobd`; + if ( $? ) { + die "Read lov.*-clilov-*.numobd error: $?\n"; + } + if ( $numost != $OSTS ) { printf "Number of non active ots(s): %d\n", ( $numost - $OSTS ); $OSTS = $numost; } - my $tempfile = glob ("/proc/fs/lustre/lov/*-clilov-*/target_obd"); - open(PTR, $tempfile) || die "Cannot open $tempfile: $!\n"; + + $targets = `lctl get_param -n lov.*-clilov-*.target_obd`; + if ( $? ) { + die "Read lov.*-clilov-*.target_obd error: $?\n"; + } + my $count = 0; - my $temp; - while () { - chop; - my ($ost_num, $ost_name, $ost_status) = split(/\s+/, $_); + foreach $line (split /\n/, $targets) { + my ($ost_num, $ost_name, $ost_status) = split(/\s+/, $line); if ( $ost_status eq "ACTIVE" ) { $ACTIVEOST_INX[$count] = 1; } @@ -59,15 +62,18 @@ sub ost_count () { } sub cache_off () { - $CACHEFILE = glob ("/proc/fs/lustre/llite/*/max_cached_mb"); - open(PTR, $CACHEFILE) || die "Cannot open $tempfile: $!\n"; - $CACHESZ = 0 + ; - close PTR; - system("echo 0 >> $CACHEFILE"); + $CACHESZ = `lctl get_param -n llite.*.max_cached_mb`; + if ( $? ) { + die "Read llite.*.max_cached_mb error: $?\n"; + } + + $CACHESZ = `echo "$CACHESZ" | grep max_cached_mb | awk '{print \$2}'`; + + system("lctl set_param -n llite.*.max_cached_mb=0"); } sub cache_on () { - system("echo $CACHESZ >> $CACHEFILE"); + system("lctl set_param -n llite.*.max_cached_mb=$CACHESZ"); } # make_dummy subroutine creates a dummy file that will be used for read operation. diff --git a/lustre-iokit/stats-collect/iokit-lstats b/lustre-iokit/stats-collect/iokit-lstats index d30e4c9..86f1214 100755 --- a/lustre-iokit/stats-collect/iokit-lstats +++ b/lustre-iokit/stats-collect/iokit-lstats @@ -173,17 +173,15 @@ function brw_collector() echo "brw_* for $filter " $(date) # clear old stats - for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do - echo 0 >$i - done + lctl set_param -n obdfilter.${filter}.brw_*=0 if let "BRW_INTERVAL==0"; then - cat /proc/fs/lustre/obdfilter/${filter}/brw_* + lctl get_param -n obdfilter.${filter}.brw_* idle_collector - cat /proc/fs/lustre/obdfilter/${filter}/brw_* + lctl get_param -n obdfilter.${filter}.brw_* elif let "BRW_INTERVAL>0"; then while [ "$stop_collector" != "1" ]; do - cat /proc/fs/lustre/obdfilter/${filter}/brw_* + lctl get_param -n obdfilter.${filter}.brw_* sleep $BRW_INTERVAL done else @@ -199,8 +197,8 @@ function brw_start() fi # find all obdfilters - for i in /proc/fs/lustre/obdfilter/*; do - local filter=$(basename $i) + for i in $(lctl list_param obdfilter.*); do + filter=$(echo "$i" | awk -F"." '{print $2}') if [ "$filter" == "num_refs" ]; then continue; fi @@ -224,15 +222,15 @@ function service_collector() echo "service stats for ${target}/${srv} " $(date) # clear old stats - echo 0 >$file + lctl set_param -n $file=0 if let "SERVICE_INTERVAL==0"; then - grep -v "^[^ ]*[^0-9]*0 samples" $file + lctl get_param -n $file | grep -v "^[^ ]*[^0-9]*0 samples" idle_collector - grep -v "^[^ ]*[^0-9]*0 samples" $file + lctl get_param -n $file | grep -v "^[^ ]*[^0-9]*0 samples" elif let "SERVICE_INTERVAL>0"; then while [ "$stop_collector" != "1" ]; do - grep -v "^[^ ]*[^0-9]*0 samples" $file + lctl get_param -n $file | grep -v "^[^ ]*[^0-9]*0 samples" sleep $SERVICE_INTERVAL done else @@ -248,25 +246,25 @@ function service_start() fi # find all OSTs and MDTs - for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do - target=$(basename $i) + for i in $(lctl list_param ost.* mdt.*); do + target=$(echo "$i" | awk -F"." '{print $2}') if [ "$target" == "num_refs" ]; then continue; fi - for j in ${i}/*; do - srv=$(basename $j) + for j in $(lctl list_param ${i}.*); do + srv=$(echo "$j" | awk -F"." '{print $3}') if [ "$srv" == "uuid" ]; then continue; fi run_collector "service-${srv}" service_collector \ - ${j}/stats $target $srv & + ${j}.stats $target $srv & done done # find all LDLM services - for i in /proc/fs/lustre/ldlm/services/*; do - srv=$(basename $i) - run_collector "service" service_collector ${i}/stats "ldlm" $srv & + for i in $(lctl list_param ldlm.services.*); do + srv=$(echo "$i" | awk -F"." '{print $3}') + run_collector "service" service_collector ${i}.stats "ldlm" $srv & done } @@ -311,11 +309,12 @@ function client_start() fi # find all osc - for i in /proc/fs/lustre/osc/* ; do - local target=$(basename $i) + for i in $(lctl list_param osc.*); do + target=$(echo "$i" | awk -F"." '{print $2}') if [ "$target" == "num_refs" ]; then continue; fi + i=$(echo "$i" |awk '{gsub(/\./,"/");print}') for j in ${i}/*; do local stats=$(basename $j) if [ "$stats" == "stats" -o "$stats" == "rpc_stats" ]; then @@ -325,8 +324,9 @@ function client_start() done done # find all llite stats - for i in /proc/fs/lustre/llite/* ; do - target=$(basename $i) + for i in $(lctl list_param llite.*); do + target=$(echo "$i" | awk -F"." '{print $2}') + i=$(echo "$i" |awk '{gsub(/\./,"/");print}') for j in ${i}/*; do stats=$(basename $j) if [ "$stats" == "stats" -o "$stats" == "vfs_ops_stats" ]; then @@ -347,8 +347,8 @@ function client_start() function sdio_collector() { local obd=$1 - local uuid=$(cat $obd/uuid) - local tmp=$(cat $obd/mntdev) + local uuid=$(lctl get_param -n obd.uuid 2>&1) + local tmp=$(lctl get_param -n obd.mntdev 2>&1) local disk=$(basename $tmp) local file="/proc/scsi/sd_iostats/${disk}" @@ -379,15 +379,15 @@ function sdio_start() fi # find all obdfilters and MDSs - for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do - local obd=$(basename $i) + for i in $(lctl list_param obdfilter.* mds.*); do + obd=$(echo "$i" | awk -F"." '{print $2}') if [ "$obd" == "num_refs" ]; then continue; fi - if [ ! -f ${i}/mntdev ]; then + tmp=$(lctl get_param -n ${i}.mntdev 2>&1) + if [ $? != 0 ]; then continue; fi - local tmp=$(cat ${i}/mntdev) local disk=$(basename $tmp) if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then continue; @@ -406,8 +406,8 @@ function sdio_start() function mballoc_collector() { local obd=$1 - local uuid=$(cat $obd/uuid) - local tmp=$(cat $obd/mntdev) + local uuid=$(lctl get_param -n obd.uuid 2>&1) + local tmp=$(lctl get_param -n obd.mntdev 2>&1) local disk=$(basename $tmp) local file="/proc/fs/ldiskfs*/${disk}/mb_history" @@ -437,15 +437,15 @@ function mballoc_start() fi # find all obdfilters and MDSs - for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do - obd=$(basename $i) + for i in $(lctl list_param obdfilter.* mds.*); do + obd=$(echo "$i" | awk -F"." '{print $2}') if [ "$obd" == "num_refs" ]; then continue; fi - if [ ! -f ${i}/mntdev ]; then + tmp=$(lctl get_param -n ${i}.mntdev 2>&1) + if [ $? != 0 ]; then continue; fi - tmp=$(cat ${i}/mntdev) disk=$(basename $tmp) if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then continue; @@ -464,8 +464,8 @@ function mballoc_start() function io_collector() { local obd=$1 - local uuid=$(cat $obd/uuid) - local tmp=$(cat $obd/mntdev) + local uuid=$(lctl get_param -n obd.uuid 2>&1) + local tmp=$(lctl get_param -n obd.mntdev 2>&1) local disk=$(basename $tmp) local file="/sys/block/${disk}/stat" @@ -493,15 +493,15 @@ function io_start() fi # find all obdfilters and MDSs - for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do - local obd=$(basename $i) + for i in $(lctl list_param obdfilter.* mds.*); do + obd=$(echo "$i" | awk -F"." '{print $2}') if [ "$obd" == "num_refs" ]; then continue; fi - if [ ! -f ${i}/mntdev ]; then + local tmp=$(lctl get_param -n ${i}.mntdev 2>&1) + if [ $? != 0 ]; then continue; fi - local tmp=$(cat ${i}/mntdev) local disk=$(basename $tmp) if [ ! -f /sys/block/${disk}/stat ]; then continue; @@ -520,8 +520,8 @@ function io_start() function jbd_collector() { local obd=$1 - local uuid=$(cat $obd/uuid) - local tmp=$(cat $obd/mntdev) + local uuid=$(lctl get_param -n obd.uuid 2>&1) + local tmp=$(lctl get_param -n obd.mntdev 2>&1) local disk=$(basename $tmp) local file="/proc/fs/jbd/${disk}/history" @@ -546,15 +546,15 @@ function jbd_start() fi # find all obdfilters and MDSs - for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do - local obd=$(basename $i) + for i in $(lctl list_param obdfilter.* mds.*); do + obd=$(echo "$i" | awk -F"." '{print $2}') if [ "$obd" == "num_refs" ]; then continue; fi - if [ ! -f ${i}/mntdev ]; then + local tmp=$(lctl get_param -n ${i}.mntdev 2>&1) + if [ $? != 0 ]; then continue; fi - local tmp=$(cat ${i}/mntdev) local disk=$(basename $tmp) if [ ! -f /proc/fs/jbd/${disk}/history ]; then continue; diff --git a/lustre/contrib/lustre_server.sh b/lustre/contrib/lustre_server.sh index 4ee244f..7b4e3a7 100644 --- a/lustre/contrib/lustre_server.sh +++ b/lustre/contrib/lustre_server.sh @@ -181,17 +181,16 @@ list_mounts() { lustre_health_check() { - proc="/proc/fs/lustre/health_check" + check=$(lctl get_param -n health_check 2>&1) # on first check the lustre modules are not loaded yet - if [ ! -e $proc ]; then + if [ $? != 0 ]; then return 0 fi - check=`cat $proc` if [ "$check" = "healthy" ]; then return 0 else - ocf_log err "$proc is $check" + ocf_log err "health_check is $check" return 1 fi } @@ -298,19 +297,20 @@ lustre_server_mounted() # check in all mntdevs if really not mounted # lustre bug 21359 (https://bugzilla.lustre.org/show_bug.cgi?id=21359) if [ $rc -eq $OCF_NOT_RUNNING ]; then - local list="/proc/fs/lustre/mds/* /proc/fs/lustre/obdfilter/*" - for i in $list ; do - if [ -f ${i}/mntdev ]; then - MNTDEVS="$MNTDEVS ${i}/mntdev" - fi - done - local mgsdev=/proc/fs/lustre/mgs/MGS/mntdev - if [ -f $mgsdev ]; then - MNTDEVS="$MNTDEVS $mgsdev" - fi - for i in $MNTDEVS; do - local dev=`cat $i` - if [ "$dev" = "$DEVICE" ]; then + dev=$(lctl get_param -n mds.*.mntdev 2>&1) + if [ $? = 0 ]; then + MNTDEVS=$dev + fi + dev=$(lctl get_param -n obdfilter.*.mntdev 2>&1) + if [ $? = 0 ]; then + MNTDEVS="$MNTDEVS $dev" + fi + dev=$(lctl get_param -n mgs.MGS.mntdev 2>&1) + if [ $? = 0 ]; then + MNTDEVS="$MNTDEVS $dev" + fi + for i in $MNTDEVS; do + if [ "$i" = "$DEVICE" ]; then ocf_log err "Bug21359, /proc/mounts claims device is not mounted, but $i proves this is wrong" rc=$OCF_ERR_GENERIC fi @@ -357,19 +357,25 @@ lustre_server_status() # lustre_server_validate_all() { - proc="/proc/fs/lustre" - if [ ! -d $proc ]; then - modprobe lustre - count=0 - while [ ! -d $proc -o $count -gt 10 ]; do - sleep 1 - done - - if [ ! -d $proc ]; then - ocf_log err "Failed to load the lustre module" - return $OCF_ERR_GENERIC - fi - fi + var=$(lctl get_param -n version 2>&1) + if [ $? != 0 ]; then + modprobe lustre + + for i in `seq 1 10`; do + var=$(lctl get_param -n version 2>&1) + if [ $? != 0 ]; then + sleep 1 + else + break + fi + done + + var=$(lctl get_param -n version 2>&1) + if [ $? != 0 ]; then + ocf_log err "Failed to load the lustre module" + return $OCF_ERR_GENERIC + fi + fi return $OCF_SUCCESS } diff --git a/lustre/doc/l_getgroups.8 b/lustre/doc/l_getgroups.8 index 9219971..bbd7810 100644 --- a/lustre/doc/l_getgroups.8 +++ b/lustre/doc/l_getgroups.8 @@ -8,13 +8,11 @@ l_getgroups \- Handle Lustre user/group cache upcall .SH DESCRIPTION The group upcall file contains the path to an executable that, when properly installed, is invoked to resolve a numeric UID to a group -membership list. This utility should complete the mds_grp_downcall_data -data structure (see Data structures) and write it to the -/proc/fs/lustre/mds/mds-service/group_info pseudo-file. +membership list. .LP .B l_getgroups is the reference implementation of the user/group cache upcall .SH FILES -/proc/fs/lustre/mds/mds-service/group_upcall +/{proc,sys}/fs/lustre/mds/mds-service/group_upcall .SH SEE ALSO Lustre Programming Interfaces section of Lustre Operations Manual. diff --git a/lustre/doc/llobdstat.8 b/lustre/doc/llobdstat.8 index 604cf8a..7b71045 100644 --- a/lustre/doc/llobdstat.8 +++ b/lustre/doc/llobdstat.8 @@ -15,7 +15,7 @@ Type control-C to stop statistics printing. .SH EXAMPLE .nf # llobdstat liane-OST0002 1 -/usr/bin/llobdstat on /proc/fs/lustre/obdfilter/liane-OST0002/stats +/usr/bin/llobdstat on liane-OST0002 Processor counters run at 2800.189 MHz Read: 1.21431e+07, Write: 9.93363e+08, create/destroy: 24/1499, stat: 34, punch: 18 [NOTE: cx: create, dx: destroy, st: statfs, pu: punch ] @@ -31,5 +31,3 @@ Timestamp Read-delta ReadRate Write-delta WriteRate 1217026059 0.00MB 0.00MB/s 0.00MB 0.00MB/s st:1 ... .fi -.SH FILES -/proc/fs/lustre/obdfilter//stats. diff --git a/lustre/doc/llstat.8 b/lustre/doc/llstat.8 index cbe96e0..db2bf89 100644 --- a/lustre/doc/llstat.8 +++ b/lustre/doc/llstat.8 @@ -26,20 +26,19 @@ Display help information. Either the full path to a stats file, or the shorthand: \fImds\fR or \fIost\fR. .SH EXAMPLE -To monitor /proc/fs/lustre/ost/OSS/ost/stats every second: +To monitor the OST RPC stats every second: .IP llstat -i 1 ost .SH FILES .nf -/proc/fs/lustre/mdt/MDS/*/stats -/proc/fs/lustre/mds/*/exports/*/stats -/proc/fs/lustre/mdc/*/stats -/proc/fs/lustre/ldlm/services/*/stats -/proc/fs/lustre/ldlm/namespaces/*/pool/stats -/proc/fs/lustre/mgs/MGS/exports/*/stats -/proc/fs/lustre/ost/OSS/*/stats -/proc/fs/lustre/osc/*/stats -/proc/fs/lustre/obdfilter/*/exports/*/stats -/proc/fs/lustre/obdfilter/*/stats -/proc/fs/lustre/llite/*/stats +llite.*.stats +lwp.*.stats +mdc.*.stats +obdfilter.*.stats +osc.*.stats +osd-ldiskfs.*.stats +osp.*.stats +ldlm.services.*.stats +mds.MDS.*.stats +ost.OSS.*.stats .fi diff --git a/lustre/scripts/Makefile.am b/lustre/scripts/Makefile.am index 1235ca9..f2b60a8a7 100644 --- a/lustre/scripts/Makefile.am +++ b/lustre/scripts/Makefile.am @@ -38,7 +38,7 @@ sbinscripts = lc_servip lustre_up14 lustre_rmmod lhbadm ldev sbinscripts += lustre_routes_config lustre_routes_conversion # These are scripts that are generated from .in files -genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman lustre_createcsv \ +genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman \ lc_md lc_lvm lustre_start if INIT_SCRIPTS diff --git a/lustre/scripts/lnet b/lustre/scripts/lnet index ca3eaca..af08125 100644 --- a/lustre/scripts/lnet +++ b/lustre/scripts/lnet @@ -145,23 +145,29 @@ status () egrep -q "lnet" /proc/modules && STATE="loaded" # check for any routes - on a portals router this is the only thing - [ "`cat /proc/sys/lnet/routes 2> /dev/null`" ] && STATE="running" && RETVAL=0 + VAR=$(lctl get_param -n routes 2>&1) + if [ $? = 0 ] ; then + STATE="running" + RETVAL=0 + fi # check if this is a router - if [ -d /proc/sys/lnet ]; then - ROUTER="`cat /proc/sys/lnet/routes | head -1 | grep -i -c \"Routing enabled\"`" - if [[ ! -z ${ROUTER} && ${ROUTER} -ge 1 ]]; then - STATE="running" - RETVAL=0 - fi + if [[ "$(lctl get_param -n routes)" =~ "Routing enabled" ]]; then + STATE="running" + RETVAL=0 fi # check for error in health_check - HEALTH="/proc/fs/lustre/health_check" - [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=1 + local health_check=$(lctl get_param -n health_check) + if [[ "$health_check" =~ "NOT HEALTHY" ]]; then + STATE="unhealthy" + RETVAL=1 + fi - # check for LBUG - [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=152 + if [[ "$health_check" =~ "LBUG" ]]; then + STATE="LBUG" + RETVAL=152 + fi echo $STATE eval $old_nullglob diff --git a/lustre/scripts/lustre b/lustre/scripts/lustre index eeb5941..baa0430 100644 --- a/lustre/scripts/lustre +++ b/lustre/scripts/lustre @@ -608,38 +608,54 @@ health_check () egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded" # check for any configured devices (may indicate partial startup) - if [ -d /proc/fs/lustre ]; then - if [ -n "`cat /proc/fs/lustre/devices 2> /dev/null`" ] ; then + VAR=$(lctl get_param version 2>&1) + if [ $? = 0 ] ; then + VAR=$(lctl get_param -n devices 2>&1) + if [ $? = 0 ] ; then STATE="partial" RETVAL=150 fi # check for either a server or a client filesystem - MDT="`ls /proc/fs/lustre/mdt/*/recovery_status 2> /dev/null`" - OST="`ls /proc/fs/lustre/obdfilter/*/recovery_status \ - 2> /dev/null`" - LLITE="`ls /proc/fs/lustre/llite/fs* 2> /dev/null`" - if [ "$MDT" -o "$OST" -o "$LLITE" ]; then - STATE="running" - RETVAL=0 + MDT="" + OST="" + LLITE="" + + VAR=$(lctl get_param -n mdt.*.recovery_status 2>&1) + if [ $? = 0 ] ; then + MDT="YES" fi + + VAR=$(lctl get_param -n obdfilter.*.recovery_status 2>&1) + if [ $? = 0 ] ; then + OST="YES" + fi + + VAR=$(lctl get_param -n llite.fs* 2>&1) + if [ $? = 0 ] ; then + LLITE="YES" + fi + + if [ "$MDT" -o "$OST" -o "$LLITE" ]; then + STATE="running" + RETVAL=0 + fi else # check if this is a router - if [ -d /proc/sys/lnet ]; then - ROUTER="`cat /proc/sys/lnet/routes | head -1 | - grep -i -c \"Routing enabled\"`" - if [[ ! -z ${ROUTER} && ${ROUTER} -ge 1 ]]; then - STATE="running" - RETVAL=0 - fi + if [[ "$(lctl get_param -n routes)" =~ "Routing enabled" ]]; then + STATE="running" + RETVAL=0 fi fi # check for server disconnections - DISCON="`grep -v FULL /proc/fs/lustre/*c/*/*server_uuid 2> /dev/null`" - if [ -n "$DISCON" ] ; then - STATE="disconnected" - RETVAL=0 + VAR=$(lctl get_param -n *c.*.*server_uuid 2>&1) + if [ $? = 0 ] ; then + DISCON="$(echo $VAR | grep -v FULL)" + if [ -n "$DISCON" ] ; then + STATE="disconnected" + RETVAL=0 + fi fi # check for servers in recovery @@ -649,14 +665,14 @@ health_check () fi # check for error in health_check - HEALTH="/proc/fs/lustre/health_check" - if [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH ; then + local health_check=$(lctl get_param -n health_check) + if [[ "$health_check" =~ "NOT HEALTHY" ]]; then STATE="unhealthy" RETVAL=1 fi # check for LBUG - if [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH ; then + if [[ "$health_check" =~ "LBUG" ]]; then STATE="LBUG" RETVAL=152 fi diff --git a/lustre/scripts/lustre_createcsv.in b/lustre/scripts/lustre_createcsv.in deleted file mode 100644 index 48cb432..0000000 --- a/lustre/scripts/lustre_createcsv.in +++ /dev/null @@ -1,2106 +0,0 @@ -#!/bin/bash - -# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: - -# -# lustre_createcsv - generate a csv file from a running lustre cluster -# -# This script is used to collect lustre target informations, linux MD/LVM device -# informations and HA software configurations in a lustre cluster to generate a -# csv file. In reverse, the csv file could be parsed by lustre_config to -# configure multiple lustre servers in parallel. -# -# This script should be run on the MGS node. -# -################################################################################ - -# Usage -usage() { - cat <&2 - exit 1 - fi - ;; - d) GET_MDLVM_INFO=true;; - h) usage && exit 0;; - v) VERBOSE_OUTPUT=true;; - f) LUSTRE_CSV_FILE=$OPTARG;; - ?) usage 1>&2 && exit 1;; - esac -done - -# Verify the local host is the MGS node -mgs_node() { - if [ ! -e ${LUSTRE_PROC_DEVICES} ]; then - error_output "${LUSTRE_PROC_DEVICES} does" \ - "not exist. Lustre kernel modules may not be loaded!" - return 1 - fi - - if [ -z "`cat ${LUSTRE_PROC_DEVICES}`" ]; then - error_output "${LUSTRE_PROC_DEVICES} is" \ - "empty. Lustre services may not be started!" - return 1 - fi - - if [ -z "`grep ${MGS_TYPE} ${LUSTRE_PROC_DEVICES}`" ]; then - error_output "This node is not a MGS node." \ - "The script should be run on the MGS node!" - return 1 - fi - - return 0 -} - -# get_hostnames -# Get lustre cluster node names -get_hostnames() { - declare -a HOST_NIDS - declare -i idx # Index of HOST_NIDS array - declare -i i # Index of HOST_NAMES array - - if ! mgs_node; then - return 1 - fi - - if [ ! -e ${LNET_PROC_PEERS} ]; then - error_output "${LNET_PROC_PEERS} does not" \ - "exist. LNET kernel modules may not be loaded" \ - "or LNET network may not be up!" - return 1 - fi - - HOST_NAMES[0]=${MGS_HOSTNAME} # MGS node - HOST_NIDS[0]=${HOST_NAMES[0]} - - # Get the nids of the nodes which have contacted MGS - idx=1 - for nid in `cat ${LNET_PROC_PEERS} | awk '{print $1}'`; do - if [ "${nid}" = "nid" ]; then - continue - fi - - HOST_NIDS[idx]=${nid} - let "idx += 1" - done - - if [ ${idx} -eq 1 ]; then - verbose_output "Only one node running in the lustre cluster." \ - "It's ${HOST_NAMES[0]}." - return 0 - fi - - # Get the hostnames of the nodes - for ((idx = 1, i = 1; idx < ${#HOST_NIDS[@]}; idx++, i++)); do - if [ -z "${HOST_NIDS[idx]}" ]; then - error_output "get_hostnames():" \ - "Invalid nid - \"${HOST_NIDS[idx]}\"!" - return 1 - fi - - HOST_NAMES[i]=$(nid2hostname ${HOST_NIDS[idx]}) - if [ $? -ne 0 ]; then - error_output "${HOST_NAMES[i]}" - return 1 - fi - - if [ "${HOST_NAMES[i]}" = "${HOST_NAMES[0]}" ]; then - unset HOST_NAMES[i] - let "i -= 1" - fi - done - - return 0 -} - -#********************** Linux MD/LVM device informations **********************# -# get_md_configs hostname -# Get all the active MD device informations from the node @hostname -get_md_configs() { - declare -i i=0 - declare -i j=0 - local host_name=$1 - local ret_line line first_item - - # Initialize the arrays - unset MD_NAME - unset MD_LEVEL - unset MD_DEVS - - # Execute remote command to the node ${host_name} and get all the - # active MD device informations. - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - first_item=`echo "${line}" | awk '{print $1}'` - - # Get the MD device name and raid level - if [ "${first_item}" = "ARRAY" ]; then - MD_NAME[i]=`echo "${line}" | awk '{print $2}'` - MD_LEVEL[i]=`echo "${line}" | awk '{print $3}' | sed -e 's/level=//'` - let "j = i" - let "i += 1" - fi - - # Get the MD component devices - if [ "${first_item}" != "${first_item#devices=}" ]; then - MD_DEVS[j]=`echo "${line}" | sed -e 's/devices=//' -e 's/,/ /g'` - fi - done < <(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin - ${MDADM} --detail --scan --verbose") - - if [ $i -eq 0 ]; then - verbose_output "There are no active MD devices" \ - "in the host ${host_name}!" - fi - - return 0 -} - -# get_pv_configs hostname -# Get all the LVM PV informations from the node @hostname -get_pv_configs() { - PV_NAMES= - local host_name=$1 - local cmd ret_str - - # Execute remote command to get all the PV informations. - cmd="PATH=\$PATH:/sbin:/usr/sbin \ -pvdisplay -c | awk -F: '{print \$1}' | xargs" - ret_str=`${REMOTE} ${host_name} "${cmd}" 2>&1` - if [ $? -ne 0 ]; then - if [ -n "${ret_str}" ]; then - error_output "get_pv_configs():" \ - "remote command to ${host_name} error: ${ret_str}" - else - remote_error "get_pv_configs" ${host_name} - fi - return 1 - fi - - PV_NAMES=`echo "${ret_str}" | sed -e 's/^'${host_name}':[[:space:]]//'` - if [ -z "${PV_NAMES}" ]; then - verbose_output "There are no PVs in the host ${host_name}!" - return 0 - fi - - return 0 -} - -# get_vg_pvnames hostname vgname -# Get the PVs contained in @vgname from the node @hostname -get_vg_pvnames() { - local host_name=$1 - local vg_name=$2 - local pv_names= - local cmd ret_str - - # Execute remote command to get the PV names. - cmd="PATH=\$PATH:/sbin:/usr/sbin vgdisplay -v ${vg_name} 2>/dev/null\ - | grep \"PV Name\" | awk '{print \$3}' | xargs" - ret_str=`${REMOTE} ${host_name} "${cmd}" 2>&1` - if [ $? -ne 0 ]; then - if [ -n "${ret_str}" ]; then - echo "`basename $0`: get_vg_pvnames() error:" \ - "remote command to ${host_name} error: ${ret_str}" - else - remote_error "get_vg_pvnames" ${host_name} - fi - return 1 - fi - - pv_names=`echo "${ret_str}" | sed -e 's/^'${host_name}':[[:space:]]//'` - if [ -z "${pv_names}" ]; then - echo "`basename $0`: get_vg_pvnames() error:" \ - "There are no PVs in VG ${vg_name} in the host ${host_name}!"\ - "Or VG ${vg_name} does not exist." - return 1 - fi - - echo "${pv_names}" - return 0 -} - -# get_vg_configs hostname -# Get all the LVM VG informations from the node @hostname -get_vg_configs() { - declare -i i=0 - local host_name=$1 - local cmd ret_str - local vg_name - - # Initialize the arrays - unset VG_NAME - unset VG_PVNAMES - - # Execute remote command to get all the VG names. - cmd="PATH=\$PATH:/sbin:/usr/sbin vgdisplay \ - | grep \"VG Name\" | awk '{print \$3}' | xargs" - ret_str=`${REMOTE} ${host_name} "${cmd}" 2>&1` - if [ $? -ne 0 ]; then - if [ -n "${ret_str}" ]; then - error_output "get_vg_configs():" \ - "remote command to ${host_name} error: ${ret_str}" - else - remote_error "get_vg_configs" ${host_name} - fi - return 1 - fi - - if [ -z "${ret_str}" ] \ - || [ "${ret_str}" != "${ret_str#*No volume groups found*}" ]; then - verbose_output "There are no VGs in the host ${host_name}!" - return 0 - fi - - # Get all the VG informations - for vg_name in `echo "${ret_str}" | sed -e 's/^'${host_name}'://'`; do - VG_NAME[i]=${vg_name} - VG_PVNAMES[i]=$(get_vg_pvnames ${host_name} ${VG_NAME[i]}) - if [ $? -ne 0 ]; then - error_output "${VG_PVNAMES[i]}" - return 1 - fi - let "i += 1" - done - - return 0 -} - -# get_lv_configs hostname -# Get all the LVM LV informations from the node @hostname -get_lv_configs() { - declare -i i=0 - local host_name=$1 - local ret_line line - - # Initialize the arrays - unset LV_NAME - unset LV_SIZE - unset LV_VGNAME - - # Execute remote command to get all the LV informations. - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - [ "${line}" != "${line#*volume group*}" ] && break - - LV_NAME[i]=`echo "${line}" | awk -F: '{print $1}' | sed -e 's/.*\///g'` - LV_VGNAME[i]=`echo "${line}" | awk -F: '{print $2}'` - LV_SIZE[i]=`echo "${line}" | awk -F: '{print $7}' | sed -e 's/.*/&K/'` - - let "i += 1" - done < <(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin lvdisplay -c") - - if [ $i -eq 0 ]; then - verbose_output "There are no LVs in the host ${host_name}" - fi - - return 0 -} - -#*************************** Network module options ***************************# -# last_is_backslash line -# Check whether the last effective letter of @line is a backslash -last_is_backslash() { - local line="$*" - declare -i i - declare -i length - local letter last_letter - - length=${#line} - for ((i = ${length}-1; i >= 0; i--)); do - letter=${line:${i}:1} - [ "x${letter}" != "x " -a "x${letter}" != "x " -a -n "${letter}" ]\ - && last_letter=${letter} && break - done - - [ "x${last_letter}" = "x\\" ] && return 0 - - return 1 -} - -# get_module_opts hostname -# Get the network module options from the node @hostname -get_module_opts() { - local host_name=$1 - local ret_str - local MODULE_CONF KERNEL_VER - local ret_line line find_options - local continue_flag - - MODULE_OPTS=${DEFAULT_MOD_OPTS} - - # Execute remote command to get the kernel version - ret_str=`${REMOTE} ${host_name} "uname -r" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - error_output "get_module_opts():" \ - "remote command error: ${ret_str}" - return 1 - fi - remote_error "get_module_opts" ${host_name} "${ret_str}" && return 1 - - if is_pdsh; then - KERNEL_VER=`echo ${ret_str} | awk '{print $2}'` - else - KERNEL_VER=`echo ${ret_str} | awk '{print $1}'` - fi - - # Get the module configuration file name - if [ "${KERNEL_VER:0:3}" = "2.4" ]; then - MODULE_CONF=/etc/modules.conf - else - MODULE_CONF=/etc/modprobe.conf - fi - - # Execute remote command to get the lustre network module options - continue_flag=false - find_options=false - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - # Get rid of the comment line - [ -z "`echo \"${line}\"|egrep -v \"^#\"`" ] && continue - - if [ "${line}" != "${line#*options lnet*}" ]; then - if ! ${find_options}; then - find_options=true - MODULE_OPTS=${line} - else - MODULE_OPTS=${MODULE_OPTS}$" \n "${line} - fi - - last_is_backslash "${line}" && continue_flag=true \ - || continue_flag=false - continue - fi - - if ${continue_flag}; then - MODULE_OPTS=${MODULE_OPTS}$" \n "${line} - ! last_is_backslash "${line}" && continue_flag=false - - fi - done < <(${REMOTE} ${host_name} "cat ${MODULE_CONF}") - - if [ -z "${MODULE_OPTS}" ]; then - MODULE_OPTS=${DEFAULT_MOD_OPTS} - fi - - return 0 -} - -#************************ HA software configurations ************************# -# is_ha_target hostname target_devname -# Check whether the target @target_devname was made to be high-available -is_ha_target() { - local host_name=$1 - local target_svname=$2 - local res_file - local ret_str - - case "${HATYPE_OPT}" in - "${HBVER_HBV1}") res_file=${HA_RES};; - "${HBVER_HBV2}") res_file=${HA_CIB};; - "${HATYPE_CLUMGR}") res_file=${CLUMAN_CONFIG};; - esac - - # Execute remote command to check the resource file - ret_str=`${REMOTE} ${host_name} \ - "grep ${target_svname} ${res_file}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - error_output "is_ha_target():" \ - "remote command error: ${ret_str}" - return 1 - fi - - [ "${ret_str}" = "${ret_str#*${target_svname}*}" ] && return 1 - - return 0 -} - -# get_hb_configs hostname -# Get the Heartbeat configurations from the node @hostname -get_hb_configs() { - local host_name=$1 - local ret_line line - declare -i i - - unset HA_CONFIGS - HB_CHANNELS= - SRV_IPADDRS= - HB_OPTIONS= - - # Execute remote command to get the configs of Heartbeat channels, etc - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - # Get rid of the comment line - [ -z "`echo \"${line}\"|egrep -v \"^#\"`" ] && continue - - if [ "${line}" != "${line#*serial*}" ] \ - || [ "${line}" != "${line#*cast*}" ]; then - if [ -z "${HB_CHANNELS}" ]; then - HB_CHANNELS=${line} - else - HB_CHANNELS=${HB_CHANNELS}:${line} - fi - fi - - if [ "${line}" != "${line#*stonith*}" ] \ - || [ "${line}" != "${line#*ping*}" ] \ - || [ "${line}" != "${line#*respawn*}" ] \ - || [ "${line}" != "${line#*apiauth*}" ] \ - || [ "${line}" != "${line#*compression*}" ]; then - if [ -z "${HB_OPTIONS}" ]; then - HB_OPTIONS=${line} - else - HB_OPTIONS=${HB_OPTIONS}:${line} - fi - fi - done < <(${REMOTE} ${host_name} "cat ${HA_CF}") - - if [ -z "${HB_CHANNELS}" ]; then - error_output "get_hb_configs():" \ - "There are no heartbeat channel configs in ${HA_CF}" \ - "of host ${host_name} or ${HA_CF} does not exist!" - return 0 - fi - - # Execute remote command to get Heartbeat service address - if [ "${HATYPE_OPT}" = "${HBVER_HBV1}" ]; then - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - # Get rid of the empty line - [ -z "`echo ${line}|awk '/[[:alnum:]]/ {print $0}'`" ]\ - && continue - - # Get rid of the comment line - [ -z "`echo \"${line}\"|egrep -v \"^#\"`" ] && continue - - SRV_IPADDRS=`echo ${line} | awk '{print $2}'` - [ -n "${SRV_IPADDRS}" ] \ - && [ "`echo ${line} | awk '{print $1}'`" = "${host_name}" ] && break - done < <(${REMOTE} ${host_name} "cat ${HA_RES}") - - if [ -z "${SRV_IPADDRS}" ]; then - error_output "get_hb_configs(): There"\ - "are no service address in ${HA_RES} of host"\ - "${host_name} or ${HA_RES} does not exist!" - return 0 - fi - fi - - # Construct HA configuration items - for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do - [ -z "${TARGET_DEVNAMES[i]}" ] && continue - - # Execute remote command to check whether this target service - # was made to be high-available - if is_ha_target ${host_name} ${TARGET_DEVNAMES[i]}; then - HA_CONFIGS[i]=${HB_CHANNELS},${SRV_IPADDRS},${HB_OPTIONS} - fi - done - - return 0 -} - -# get_cluman_channel hostname -# Get the Heartbeat channel of CluManager from the node @hostname -get_cluman_channel() { - local host_name=$1 - local ret_line line - local cluman_channel= - local mcast_ipaddr - - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - if [ "${line}" != "${line#*broadcast*}" ] \ - && [ "`echo ${line}|awk '{print $3}'`" = "yes" ]; then - cluman_channel="broadcast" - break - fi - - if [ "${line}" != "${line#*multicast_ipaddress*}" ]; then - mcast_ipaddr=`echo ${line}|awk '{print $3}'` - if [ "${mcast_ipaddr}" != "225.0.0.11" ]; then - cluman_channel="multicast ${mcast_ipaddr}" - break - fi - fi - done < <(${REMOTE} ${host_name} "${CONFIG_CMD} --clumembd") - - echo ${cluman_channel} - return 0 -} - -# get_cluman_srvaddr hostname target_svname -# Get the service IP addresses of @target_svname from the node @hostname -get_cluman_srvaddr() { - local host_name=$1 - local target_svname=$2 - local ret_line line - local srvaddr cluman_srvaddr= - - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - if [ "${line}" != "${line#*ipaddress = *}" ]; then - srvaddr=`echo ${line}|awk '{print $3}'` - if [ -z "${cluman_srvaddr}" ]; then - cluman_srvaddr=${srvaddr} - else - cluman_srvaddr=${cluman_srvaddr}:${srvaddr} - fi - fi - done < <(${REMOTE} ${host_name} "${CONFIG_CMD} \ - --service=${target_svname} --service_ipaddresses") - - if [ -z "${cluman_srvaddr}" ]; then - echo "`basename $0`: get_cluman_srvaddr() error: Cannot" \ - "get the service IP addresses of ${target_svname} in" \ - "${host_name}! Check ${CONFIG_CMD} command!" - return 1 - fi - - echo ${cluman_srvaddr} - return 0 -} - -# get_cluman_configs hostname -# Get the CluManager configurations from the node @hostname -get_cluman_configs() { - local host_name=$1 - local ret_str - declare -i i - - unset HA_CONFIGS - - # Execute remote command to get the configs of CluManager - for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do - HB_CHANNELS= - SRV_IPADDRS= - HB_OPTIONS= - [ -z "${TARGET_DEVNAMES[i]}" ] && continue - - # Execute remote command to check whether this target service - # was made to be high-available - ! is_ha_target ${host_name} ${TARGET_DEVNAMES[i]} && continue - - # Execute remote command to get Heartbeat channel - HB_CHANNELS=$(get_cluman_channel ${host_name}) - if [ $? -ne 0 ]; then - error_output "${HB_CHANNELS}" - fi - - # Execute remote command to get service IP address - SRV_IPADDRS=$(get_cluman_srvaddr ${host_name} \ - ${TARGET_SVNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${SRV_IPADDRS}" - return 0 - fi - - HA_CONFIGS[i]=${HB_CHANNELS},${SRV_IPADDRS},${HB_OPTIONS} - done - - return 0 -} - -# get_ha_configs hostname -# Get the HA software configurations from the node @hostname -get_ha_configs() { - local host_name=$1 - - unset HA_CONFIGS - - if [ -z "${HATYPE_OPT}" ]; then - return 0 - fi - - verbose_output "Collecting HA software configurations from host $1..." - - case "${HATYPE_OPT}" in - "${HBVER_HBV1}" | "${HBVER_HBV2}") # Heartbeat - if ! get_hb_configs ${host_name}; then - return 1 - fi - ;; - "${HATYPE_CLUMGR}") # CluManager - if ! get_cluman_configs ${host_name}; then - return 1 - fi - ;; - esac - - return 0 -} - -#*********************** Lustre targets configurations ***********************# - -# is_failover_service target_svname -# Check whether a target service @target_svname is a failover service. -is_failover_service() { - local target_svname=$1 - declare -i i - - for ((i = 0; i < ${#ALL_TARGET_SVNAMES[@]}; i++)); do - [ "${target_svname}" = "${ALL_TARGET_SVNAMES[i]}" ] && return 0 - done - - return 1 -} - -# get_svnames hostname -# Get the lustre target server obd names from the node @hostname -get_svnames(){ - declare -i i - declare -i j - local host_name=$1 - local ret_line line - - # Initialize the TARGET_SVNAMES array - unset TARGET_SVNAMES - unset FAILOVER_FMTOPTS - - # Execute remote command to the node @hostname and figure out what - # lustre services are running. - i=0 - j=${#ALL_TARGET_SVNAMES[@]} - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - if [ -z "`echo ${line} | grep ${MGS_TYPE}`" ] \ - && [ -z "`echo ${line} | grep ${MDT_TYPE}`" ] \ - && [ -z "`echo ${line} | grep ${OST_TYPE}`" ]; then - continue - fi - - # Get target server name - TARGET_SVNAMES[i]=`echo ${line} | awk '{print $4}'` - if [ -n "${TARGET_SVNAMES[i]}" ]; then - if is_failover_service ${TARGET_SVNAMES[i]}; then - FAILOVER_FMTOPTS[i]="--noformat" - fi - ALL_TARGET_SVNAMES[j]=${TARGET_SVNAMES[i]} - let "i += 1" - let "j += 1" - else - error_output "get_svnames(): Invalid"\ - "line in ${host_name}'s ${LUSTRE_PROC_DEVICES}"\ - "- \"${line}\"!" - return 1 - fi - done < <(${REMOTE} ${host_name} "cat ${LUSTRE_PROC_DEVICES}") - - if [ $i -eq 0 ]; then - verbose_output "There are no lustre services running" \ - "on the node ${host_name}!" - fi - - return 0 -} - -# is_loopdev devname -# Check whether a device @devname is a loop device or not -is_loopdev() { - local devname=$1 - - if [ -z "${devname}" ] || \ - [ -z "`echo ${devname}|awk '/\/dev\/loop[[:digit:]]/ {print $0}'`" ] - then - return 1 - fi - - return 0 -} - -# get_devname hostname svname -# Get the device name of lustre target @svname from node @hostname -get_devname() { - local host_name=$1 - local target_svname=$2 - local target_devname= - local ret_str - local target_type target_obdtype mntdev_file - - if [ "${target_svname}" = "${MGS_SVNAME}" ]; then - # Execute remote command to get the device name of mgs target - ret_str=`${REMOTE} ${host_name} \ - "PATH=\$PATH:/sbin:/usr/sbin findfs LABEL=${target_svname}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - if [ "${ret_str}" = "${ret_str#*Unable to resolve*}" ] - then - echo "`basename $0`: get_devname() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - fi - - if [ "${ret_str}" = "${ret_str#*Unable to resolve*}" ]; then - if is_pdsh; then - target_devname=`echo ${ret_str} | awk '{print $2}'` - else - target_devname=`echo ${ret_str} | awk '{print $1}'` - fi - fi - else # Execute remote command to get the device name of mdt/ost target - target_type=`echo ${target_svname} | cut -d - -f 2` - target_obdtype=${target_type:0:3}_TYPE - - mntdev_file=${LUSTRE_PROC}/${!target_obdtype}/${target_svname}/mntdev - - ret_str=`${REMOTE} ${host_name} "cat ${mntdev_file}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_devname() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - if [ "${ret_str}" != "${ret_str#*No such file*}" ]; then - echo "`basename $0`: get_devname() error:"\ - "${mntdev_file} does not exist in ${host_name}!" - return 1 - else - if is_pdsh; then - target_devname=`echo ${ret_str} | awk '{print $2}'` - else - target_devname=`echo ${ret_str} | awk '{print $1}'` - fi - fi - fi - - echo ${target_devname} - return 0 -} - -# get_devsize hostname target_devname -# Get the device size (KB) of @target_devname from node @hostname -get_devsize() { - local host_name=$1 - local target_devname=$2 - local target_devsize= - local ret_str - - # Execute remote command to get the device size - ret_str=`${REMOTE} ${host_name} \ - "PATH=\$PATH:/sbin:/usr/sbin blockdev --getsize ${target_devname}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_devsize() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - if is_pdsh; then - target_devsize=`echo ${ret_str} | awk '{print $2}'` - else - target_devsize=`echo ${ret_str} | awk '{print $1}'` - fi - - if [ -z "`echo ${target_devsize}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: get_devsize() error: can't" \ - "get device size of ${target_devname} in ${host_name}!" - return 1 - fi - - let " target_devsize /= 2" - - echo ${target_devsize} - return 0 -} - -# get_realdevname hostname loop_dev -# Get the real device name of loop device @loop_dev from node @hostname -get_realdevname() { - local host_name=$1 - local loop_dev=$2 - local target_devname= - local ret_str - - # Execute remote command to get the real device name - ret_str=`${REMOTE} ${host_name} \ - "PATH=\$PATH:/sbin:/usr/sbin losetup ${loop_dev}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_realdevname() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - if is_pdsh; then - target_devname=`echo ${ret_str} | awk '{print $4}' \ - | sed 's/^(//' | sed 's/)$//'` - else - target_devname=`echo ${ret_str} | awk '{print $3}' \ - | sed 's/^(//' | sed 's/)$//'` - fi - - if [ "${ret_str}" != "${ret_str#*No such*}" ] \ - || [ -z "${target_devname}" ]; then - echo "`basename $0`: get_realdevname() error: can't" \ - "get info on device ${loop_dev} in ${host_name}!" - return 1 - fi - - echo ${target_devname} - return 0 -} - -# get_mntpnt hostname target_devname -# Get the lustre target mount point from the node @hostname -get_mntpnt(){ - local host_name=$1 - local target_devname=$2 - local mnt_point= - local ret_str - - # Execute remote command to get the mount point - ret_str=`${REMOTE} ${host_name} \ - "cat /etc/mtab | grep ${target_devname}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_mntpnt() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - if is_pdsh; then - mnt_point=`echo ${ret_str} | awk '{print $3}'` - else - mnt_point=`echo ${ret_str} | awk '{print $2}'` - fi - - if [ -z "${mnt_point}" ]; then - echo "`basename $0`: get_mntpnt() error: can't" \ - "get the mount point of ${target_devname} in ${host_name}!" - return 1 - fi - - echo ${mnt_point} - return 0 -} - -# get_devnames hostname -# Get the lustre target device names, mount points -# and loop device sizes from the node @hostname -get_devnames(){ - declare -i i - local host_name=$1 - local ret_line line - - # Initialize the arrays - unset TARGET_DEVNAMES - unset TARGET_DEVSIZES - unset TARGET_MNTPNTS - - for ((i = 0; i < ${#TARGET_SVNAMES[@]}; i++)); do - TARGET_DEVNAMES[i]=$(get_devname ${host_name} \ - ${TARGET_SVNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${TARGET_DEVNAMES[i]}" - return 1 - fi - - if [ -z "${TARGET_DEVNAMES[i]}" ]; then - if [ "${TARGET_SVNAMES[i]}" = "${MGS_SVNAME}" ]; then - verbose_output "There exists combo mgs/mdt"\ - "target in ${host_name}." - continue - else - error_output "get_devname():"\ - "No device corresponding to target" \ - "${TARGET_SVNAMES[i]} in ${host_name}!" - return 1 - fi - fi - - # Get the mount point of the target - TARGET_MNTPNTS[i]=$(get_mntpnt ${host_name} \ - ${TARGET_DEVNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${TARGET_MNTPNTS[i]}" - return 1 - fi - - # The target device is a loop device? - if [ -n "${TARGET_DEVNAMES[i]}" ] \ - && is_loopdev ${TARGET_DEVNAMES[i]}; then - # Get the device size - TARGET_DEVSIZES[i]=$(get_devsize ${host_name} \ - ${TARGET_DEVNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${TARGET_DEVSIZES[i]}" - return 1 - fi - - # Get the real device name - TARGET_DEVNAMES[i]=$(get_realdevname ${host_name} \ - ${TARGET_DEVNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${TARGET_DEVNAMES[i]}" - return 1 - fi - fi - done - - return 0 -} - -# is_target target_svtype ldd_flags -# Check the service type of a lustre target -is_target() { - case "$1" in - "mdt") let "ret = $2 & LDD_F_SV_TYPE_MDT";; - "ost") let "ret = $2 & LDD_F_SV_TYPE_OST";; - "mgs") let "ret = $2 & LDD_F_SV_TYPE_MGS";; - "*") - error_output "is_target(): Invalid" \ - "target service type - \"$1\"!" - return 1 - ;; - esac - - if [ ${ret} -eq 0 ]; then - return 1 - fi - - return 0 -} - -# get_devtype ldd_flags -# Get the service type of a lustre target from @ldd_flags -get_devtype() { - local target_devtype= - - if [ -z "${flags}" ]; then - echo "`basename $0`: get_devtype() error: Invalid" \ - "ldd_flags - it's value is null!" - return 1 - fi - - if is_target "mgs" $1; then - if is_target "mdt" $1; then - target_devtype="mgs|mdt" - else - target_devtype="mgs" - fi - elif is_target "mdt" $1; then - target_devtype="mdt" - elif is_target "ost" $1; then - target_devtype="ost" - else - echo "`basename $0`: get_devtype() error: Invalid" \ - "ldd_flags - \"$1\"!" - return 1 - fi - - echo ${target_devtype} - return 0 -} - -# get_mntopts ldd_mount_opts -# Get the user-specified lustre target mount options from @ldd_mount_opts -get_mntopts() { - local mount_opts= - local ldd_mount_opts=$1 - - mount_opts="${ldd_mount_opts#${ALWAYS_MNTOPTS}}" - mount_opts="${mount_opts#${MDT_MGS_ALWAYS_MNTOPTS}}" - mount_opts="${mount_opts#${OST_ALWAYS_MNTOPTS}}" - mount_opts="${mount_opts#${OST_DEFAULT_MNTOPTS}}" - mount_opts="`echo \"${mount_opts}\" | sed 's/^,//'`" - - [ "${mount_opts}" != "${mount_opts#*,*}" ] && echo "\""${mount_opts}"\"" \ - || echo ${mount_opts} - - return 0 -} - -# get_mgsnids ldd_params -# Get the mgs nids of lustre target from @ldd_params -get_mgsnids() { - local mgs_nids= # mgs nids in one mgs node - local all_mgs_nids= # mgs nids in all mgs failover nodes - local param= - local ldd_params="$*" - - for param in ${ldd_params}; do - if [ -n "`echo ${param}|awk '/mgsnode=/ {print $0}'`" ]; then - mgs_nids=`echo ${param#${PARAM_MGSNODE}}` - - if [ -n "${all_mgs_nids}" ]; then - all_mgs_nids=${all_mgs_nids}:${mgs_nids} - else - all_mgs_nids=${mgs_nids} - fi - fi - done - - [ "${all_mgs_nids}" != "${all_mgs_nids#*,*}" ] \ - && echo "\""${all_mgs_nids}"\"" || echo ${all_mgs_nids} - - return 0 -} - -# get_failnids ldd_params -# Get the failover nids of lustre target from @ldd_params -get_failnids() { - local fail_nids= # failover nids in one failover node - local all_fail_nids= # failover nids in all failover nodes - # of this target - local param= - local ldd_params="$*" - - for param in ${ldd_params}; do - if [ -n "`echo ${param}|awk '/failover.node=/ {print $0}'`" ]; then - fail_nids=`echo ${param#${PARAM_FAILNODE}}` - - if [ -n "${all_fail_nids}" ]; then - all_fail_nids=${all_fail_nids}:${fail_nids} - else - all_fail_nids=${fail_nids} - fi - fi - done - - [ "${all_fail_nids}" != "${all_fail_nids#*,*}" ] \ - && echo "\""${all_fail_nids}"\"" || echo ${all_fail_nids} - - return 0 -} - -# get_fmtopts target_devname hostname ldd_params -# Get other format options of the lustre target @target_devname from @ldd_params -get_fmtopts() { - local target_devname=$1 - local host_name=$2 - shift - shift - local ldd_params="$*" - local param= - local fmt_opts= - - for param in ${ldd_params}; do - [ -n "`echo ${param}|awk '/mgsnode=/ {print $0}'`" ] && continue - [ -n "`echo ${param}|awk '/failover.node=/ {print $0}'`" ] && continue - - if [ -n "${param}" ]; then - if [ -n "${fmt_opts}" ]; then - fmt_opts=${fmt_opts}" --param=\""${param}"\"" - else - fmt_opts="--param=\""${param}"\"" - fi - fi - done - - echo ${fmt_opts} - return 0 -} - -# get_stripecount host_name target_fsname -# Get the stripe count for @target_fsname -get_stripecount() { - local host_name=$1 - local target_fsname=$2 - local stripe_count= - local stripecount_file - local ret_str - - # Get the stripe count - stripecount_file=${LUSTRE_PROC}/lov/${target_fsname}-mdtlov/stripecount - ret_str=`${REMOTE} ${host_name} "cat ${stripecount_file}" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_stripecount() error:" \ - "remote command to ${host_name} error: ${ret_str}" - return 1 - fi - - if is_pdsh; then - stripe_count=`echo ${ret_str} | awk '{print $2}'` - else - stripe_count=`echo ${ret_str} | awk '{print $1}'` - fi - - if [ "$stripe_count" != "-1" ] && \ - [ -z "`echo ${stripe_count}|awk '/^[[:digit:]]/ {print $0}'`" ]; then - echo "`basename $0`: get_stripecount() error: can't" \ - "get stripe count of ${target_fsname} in ${host_name}!" - return 1 - fi - - echo ${stripe_count} - return 0 -} - -# get_stripecount_opt host_name target_fsname -# Get the stripe count option for lustre mdt target -get_stripecount_opt() { - local host_name=$1 - local target_fsname=$2 - local stripe_count= - local stripecount_opt= - - # Get the stripe count - [ -z "${target_fsname}" ] && target_fsname="lustre" - stripe_count=$(get_stripecount ${host_name} ${target_fsname}) - if [ $? -ne 0 ]; then - echo "${stripe_count}" - return 1 - fi - - if [ "${stripe_count}" != "1" ]; then - stripecount_opt=${OPTSTR_STRIPE_COUNT}${stripe_count} - fi - - echo ${stripecount_opt} - return 0 -} - -# get_ldds hostname -# Get the lustre target disk data from the node @hostname -get_ldds(){ - declare -i i - local host_name=$1 - local ret_line line - local flags mnt_opts params - local stripecount_opt - - # Initialize the arrays - unset TARGET_DEVTYPES TARGET_FSNAMES TARGET_MGSNIDS TARGET_INDEXES - unset TARGET_FMTOPTS TARGET_MNTOPTS TARGET_FAILNIDS - - # Get lustre target device type, fsname, index, etc. - # from MOUNT_DATA_FILE. Using tunefs.lustre to read it. - for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do - flags= - mnt_opts= - params= - stripecount_opt= - [ -z "${TARGET_DEVNAMES[i]}" ] && continue - - # Execute remote command to read MOUNT_DATA_FILE - while read -r ret_line; do - if is_pdsh; then - set -- ${ret_line} - shift - line="$*" - else - line="${ret_line}" - fi - - if [ -n "`echo ${line}|awk '/Index:/ {print $0}'`" ]; then - TARGET_INDEXES[i]=`echo ${line}|awk '{print $2}'` - continue - fi - - if [ -n "`echo ${line}|awk '/Lustre FS:/ {print $0}'`" ]; then - TARGET_FSNAMES[i]=`echo ${line}|awk '{print $3}'` - continue - fi - - if [ -n "`echo ${line}|awk '/Flags:/ {print $0}'`" ]; then - flags=`echo ${line}|awk '{print $2}'` - continue - fi - - if [ -n "`echo ${line}|awk '/Persistent mount opts:/ {print $0}'`" ]; then - mnt_opts=`echo ${line}|awk '{print $0}'` - mnt_opts=`echo ${mnt_opts#Persistent mount opts: }` - continue - fi - - if [ -n "`echo ${line}|awk '/Parameters:/ {print $0}'`" ]; then - params=`echo ${line}|awk '{print $0}'` - params=`echo ${params#Parameters:}` - break - fi - done < <(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin - ${TUNEFS} --print --verbose ${TARGET_DEVNAMES[i]} 2>/dev/null") - - if [ -z "${flags}" ]; then - error_output "get_ldds(): Invalid" \ - "ldd_flags of target ${TARGET_DEVNAMES[i]}" \ - "in host ${host_name} - it's value is null!"\ - "Check ${TUNEFS} command!" - return 1 - fi - - if [ "${TARGET_INDEXES[i]}" = "unassigned" ] \ - || is_target "mgs" ${flags}; then - TARGET_INDEXES[i]= - fi - - [ "${TARGET_FSNAMES[i]}" = "lustre" ] && TARGET_FSNAMES[i]= - - # Get the lustre target service type - TARGET_DEVTYPES[i]=$(get_devtype ${flags}) - if [ $? -ne 0 ]; then - error_output "${TARGET_DEVTYPES[i]} From device" \ - "${TARGET_DEVNAMES[i]} in host ${host_name}!" - return 1 - fi - - # Get the lustre target mount options - TARGET_MNTOPTS[i]=$(get_mntopts "${mnt_opts}") - - # Get mgs nids of the lustre target - TARGET_MGSNIDS[i]=$(get_mgsnids "${params}") - - # Get failover nids of the lustre target - TARGET_FAILNIDS[i]=$(get_failnids "${params}") - if [ $? -ne 0 ]; then - error_output "${TARGET_FAILNIDS[i]} From device" \ - "${TARGET_DEVNAMES[i]} in host ${host_name}!" - return 1 - fi - - # Get other format options of the lustre target - TARGET_FMTOPTS[i]=$(get_fmtopts ${TARGET_DEVNAMES[i]} ${host_name} "${params}") - if [ $? -ne 0 ]; then - error_output "${TARGET_FMTOPTS[i]}" - return 1 - fi - - if [ -n "${TARGET_DEVSIZES[i]}" ]; then - if [ -n "${TARGET_FMTOPTS[i]}" ]; then - TARGET_FMTOPTS[i]="--device-size=${TARGET_DEVSIZES[i]} ""${TARGET_FMTOPTS[i]}" - else - TARGET_FMTOPTS[i]="--device-size=${TARGET_DEVSIZES[i]}" - fi - fi - - if [ -n "${FAILOVER_FMTOPTS[i]}" ]; then - if [ -n "${TARGET_FMTOPTS[i]}" ]; then - TARGET_FMTOPTS[i]=${TARGET_FMTOPTS[i]}" "${FAILOVER_FMTOPTS[i]} - else - TARGET_FMTOPTS[i]=${FAILOVER_FMTOPTS[i]} - fi - fi - - if is_target "mdt" ${flags}; then - # Get the stripe count option - stripecount_opt=$(get_stripecount_opt ${host_name} ${TARGET_FSNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${stripecount_opt}" - return 1 - fi - - if [ -n "${stripecount_opt}" ]; then - if [ -n "${TARGET_FMTOPTS[i]}" ]; then - TARGET_FMTOPTS[i]=${TARGET_FMTOPTS[i]}" "${stripecount_opt} - else - TARGET_FMTOPTS[i]=${stripecount_opt} - fi - fi - fi - - if [ "${TARGET_FMTOPTS[i]}" != "${TARGET_FMTOPTS[i]#*,*}" ]; then - TARGET_FMTOPTS[i]="\""${TARGET_FMTOPTS[i]}"\"" - fi - done - - return 0 -} - -# get_journalsize target_devname hostname -# Get the journal size of lustre target @target_devname from @hostname -get_journalsize() { - local target_devname=$1 - local host_name=$2 - local journal_inode= - local journal_size= - local ret_str - - # Execute remote command to get the journal inode number - ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \ -debugfs -R 'stats -h' ${target_devname} | grep 'Journal inode:'" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_journalsize() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - ret_str=${ret_str#${ret_str%Journal inode:*}} - journal_inode=`echo ${ret_str} | awk '{print $3}'` - if [ -z "`echo ${journal_inode}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: get_journalsize() error: can't" \ - "get journal inode of ${target_devname} in ${host_name}!" - return 1 - fi - - # Execute remote command to get the journal size - ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \ -debugfs -R 'stat <${journal_inode}>' ${target_devname}|grep '^User:'" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_journalsize() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - ret_str=${ret_str#${ret_str%User:*}} - journal_size=`echo ${ret_str} | awk '{print $6}'` - if [ -z "`echo ${journal_size}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: get_journalsize() error: can't" \ - "get journal size of ${target_devname} in ${host_name}!" - return 1 - fi - - let "journal_size /= 1024*1024" # MB - - echo ${journal_size} - return 0 -} - -# get_defaultjournalsize target_devsize -# Calculate the default journal size from target device size @target_devsize -get_defaultjournalsize() { - declare -i target_devsize=$1 - declare -i journal_size=0 - declare -i max_size base_size - - let "base_size = 1024*1024" - if [ ${target_devsize} -gt ${base_size} ]; then # 1GB - let "journal_size = target_devsize / 102400" - let "journal_size *= 4" - fi - - let "max_size = 102400 * L_BLOCK_SIZE" - let "max_size >>= 20" # 400MB - - if [ ${journal_size} -gt ${max_size} ]; then - let "journal_size = max_size" - fi - - echo ${journal_size} - return 0 -} - -# figure_journal_size target_devname hostname -# Find a reasonable journal file size given the number of blocks -# in the filesystem. This algorithm is derived from figure_journal_size() -# function in util.c of e2fsprogs-1.38.cfs2-1.src.rpm. -figure_journal_size() { - local target_devname=$1 - local host_name=$2 - local ret_str - declare -i block_count - declare -i journal_blocks - declare -i journal_size - - # Execute remote command to get the block count - ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \ -debugfs -R 'stats -h' ${target_devname} | grep 'Block count:'" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: figure_journal_size() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - ret_str=${ret_str#${ret_str%Block count:*}} - block_count=`echo ${ret_str} | awk '{print $3}'` - if [ -z "`echo ${block_count}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: figure_journal_size() error: can't" \ - "get block count of ${target_devname} in ${host_name}!" - return 1 - fi - - if ((block_count < 32768)); then - let "journal_blocks = 1024" - elif ((block_count < 256*1024)); then - let "journal_blocks = 4096" - elif ((block_count < 512*1024)); then - let "journal_blocks = 8192" - elif ((block_count < 1024*1024)); then - let "journal_blocks = 16384" - else - let "journal_blocks = 32768" - fi - - let "journal_size = journal_blocks * L_BLOCK_SIZE / 1048576" - - echo ${journal_size} - return 0 -} - -# get_J_opt hostname target_devname target_devsize -# Get the mkfs -J option of lustre target @target_devname -# from the node @hostname -get_J_opt() { - local host_name=$1 - local target_devname=$2 - local target_devsize=$3 - local journal_size= - local default_journal_size= - local journal_opt= - - # Get the real journal size of lustre target - journal_size=$(get_journalsize ${target_devname} ${host_name}) - if [ $? -ne 0 ]; then - echo "${journal_size}" - return 1 - fi - - # Get the default journal size of lustre target - default_journal_size=$(get_defaultjournalsize ${target_devsize}) - if [ "${default_journal_size}" = "0" ]; then - default_journal_size=$(figure_journal_size ${target_devname} \ - ${host_name}) - if [ $? -ne 0 ]; then - echo "${default_journal_size}" - return 1 - fi - fi - - if [ "${journal_size}" != "${default_journal_size}" ]; then - journal_opt="-J size=${journal_size}" - fi - - echo ${journal_opt} - return 0 -} - -# get_ratio target_devname hostname -# Get the bytes/inode ratio of lustre target @target_devname from @hostname -get_ratio() { - local target_devname=$1 - local host_name=$2 - local inode_count= - local block_count= - local ratio= - local ret_str - - # Execute remote command to get the inode count - ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \ -debugfs -R 'stats -h' ${target_devname} | grep 'Inode count:'" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_ratio() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - ret_str=${ret_str#${ret_str%Inode count:*}} - inode_count=`echo ${ret_str} | awk '{print $3}'` - if [ -z "`echo ${inode_count}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: get_ratio() error: can't" \ - "get inode count of ${target_devname} in ${host_name}!" - return 1 - fi - - # Execute remote command to get the block count - ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \ -debugfs -R 'stats -h' ${target_devname} | grep 'Block count:'" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_ratio() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - ret_str=${ret_str#${ret_str%Block count:*}} - block_count=`echo ${ret_str} | awk '{print $3}'` - if [ -z "`echo ${block_count}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: get_ratio() error: can't" \ - "get block count of ${target_devname} in ${host_name}!" - return 1 - fi - - let "ratio = block_count*L_BLOCK_SIZE/inode_count" - - echo ${ratio} - return 0 -} - -# get_default_ratio target_devtype target_devsize -# Calculate the default bytes/inode ratio from target type @target_devtype -get_default_ratio() { - local target_devtype=$1 - declare -i target_devsize=$2 - local ratio= - - case "${target_devtype}" in - "mdt" | "mgs|mdt" | "mdt|mgs") - ratio=4096;; - "ost") - [ ${target_devsize} -gt 1000000 ] && ratio=16384;; - esac - - [ -z "${ratio}" ] && ratio=${L_BLOCK_SIZE} - - echo ${ratio} - return 0 -} - -# get_i_opt hostname target_devname target_devtype target_devsize -# Get the mkfs -i option of lustre target @target_devname -# from the node @hostname -get_i_opt() { - local host_name=$1 - local target_devname=$2 - local target_devtype=$3 - local target_devsize=$4 - local ratio= - local default_ratio= - local ratio_opt= - - # Get the real bytes/inode ratio of lustre target - ratio=$(get_ratio ${target_devname} ${host_name}) - if [ $? -ne 0 ]; then - echo "${ratio}" - return 1 - fi - - # Get the default bytes/inode ratio of lustre target - default_ratio=$(get_default_ratio ${target_devtype} ${target_devsize}) - - if [ "${ratio}" != "${default_ratio}" ]; then - ratio_opt="-i ${ratio}" - fi - - echo ${ratio_opt} - return 0 -} - -# get_isize target_devname hostname -# Get the inode size of lustre target @target_devname from @hostname -get_isize() { - local target_devname=$1 - local host_name=$2 - local inode_size= - local ret_str - - # Execute remote command to get the inode size - ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \ -debugfs -R 'stats -h' ${target_devname} | grep 'Inode size:'" 2>&1` - if [ $? -ne 0 -a -n "${ret_str}" ]; then - echo "`basename $0`: get_isize() error:" \ - "remote command error: ${ret_str}" - return 1 - fi - - ret_str=${ret_str#${ret_str%Inode size:*}} - inode_size=`echo ${ret_str} | awk '{print $3}'` - if [ -z "`echo ${inode_size}|awk '/^[[:digit:]]/ {print $0}'`" ] - then - echo "`basename $0`: get_isize() error: can't" \ - "get inode size of ${target_devname} in ${host_name}!" - return 1 - fi - - echo ${inode_size} - return 0 -} - -# get_mdt_default_isize host_name target_fsname -# Calculate the default inode size of lustre mdt target -get_mdt_default_isize() { - local host_name=$1 - local target_fsname=$2 - declare -i stripe_count - local inode_size= - - # Get the stripe count - stripe_count=$(get_stripecount ${host_name} ${target_fsname}) - if [ $? -ne 0 ]; then - echo "${stripe_count}" - return 1 - fi - - if ((stripe_count > 77)); then - inode_size=512 - elif ((stripe_count > 34)); then - inode_size=2048 - elif ((stripe_count > 13)); then - inode_size=1024 - else - inode_size=512 - fi - - echo ${inode_size} - return 0 -} - -# get_default_isize host_name target_devtype target_fsname -# Calculate the default inode size of lustre target type @target_devtype -get_default_isize() { - local host_name=$1 - local target_devtype=$2 - local target_fsname=$3 - local inode_size= - - case "${target_devtype}" in - "mdt" | "mgs|mdt" | "mdt|mgs") - inode_size=$(get_mdt_default_isize ${host_name} ${target_fsname}) - if [ $? -ne 0 ]; then - echo "${inode_size}" - return 1 - fi - ;; - "ost") - inode_size=256;; - esac - - [ -z "${inode_size}" ] && inode_size=128 - - echo ${inode_size} - return 0 -} - -# get_I_opt hostname target_devname target_devtype target_fsname -# Get the mkfs -I option of lustre target @target_devname -# from the node @hostname -get_I_opt() { - local host_name=$1 - local target_devname=$2 - local target_devtype=$3 - local target_fsname=$4 - local isize= - local default_isize= - local isize_opt= - - # Get the real inode size of lustre target - isize=$(get_isize ${target_devname} ${host_name}) - if [ $? -ne 0 ]; then - echo "${isize}" - return 1 - fi - - # Get the default inode size of lustre target - [ -z "${target_fsname}" ] && target_fsname="lustre" - default_isize=$(get_default_isize ${host_name} ${target_devtype} \ - ${target_fsname}) - if [ $? -ne 0 ]; then - echo "${default_isize}" - return 1 - fi - - if [ "${isize}" != "${default_isize}" ]; then - isize_opt="-I ${isize}" - fi - - echo ${isize_opt} - return 0 -} - -# get_mkfsopts hostname -# Get the mkfs options of lustre targets from the node @hostname -get_mkfsopts(){ - declare -i i - local host_name=$1 - local journal_opt - local ratio_opt - local inode_size_opt - - # Initialize the arrays - unset TARGET_MKFSOPTS - - # FIXME: Get other mkfs options of ext3/ldiskfs besides -J, -i and -I - for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do - journal_opt= - ratio_opt= - inode_size_opt= - - [ -z "${TARGET_DEVNAMES[i]}" ] && continue - - if [ -z "${TARGET_DEVSIZES[i]}" ]; then - # Get the device size - TARGET_DEVSIZES[i]=$(get_devsize ${host_name} \ - ${TARGET_DEVNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${TARGET_DEVSIZES[i]}" - return 1 - fi - fi - - # Get the journal option - journal_opt=$(get_J_opt ${host_name} ${TARGET_DEVNAMES[i]} \ - ${TARGET_DEVSIZES[i]}) - if [ $? -ne 0 ]; then - error_output "${journal_opt}" - return 1 - fi - - if [ -n "${journal_opt}" ]; then - if [ -z "${TARGET_MKFSOPTS[i]}" ]; then - TARGET_MKFSOPTS[i]="${journal_opt}" - else - TARGET_MKFSOPTS[i]=${TARGET_MKFSOPTS[i]}" ${journal_opt}" - fi - fi - - # Get the bytes-per-inode ratio option - ratio_opt=$(get_i_opt ${host_name} ${TARGET_DEVNAMES[i]} \ - ${TARGET_DEVTYPES[i]} ${TARGET_DEVSIZES[i]}) - if [ $? -ne 0 ]; then - error_output "${ratio_opt}" - return 1 - fi - - if [ -n "${ratio_opt}" ]; then - if [ -z "${TARGET_MKFSOPTS[i]}" ]; then - TARGET_MKFSOPTS[i]="${ratio_opt}" - else - TARGET_MKFSOPTS[i]=${TARGET_MKFSOPTS[i]}" ${ratio_opt}" - fi - fi - - # Get the inode size option - inode_size_opt=$(get_I_opt ${host_name} ${TARGET_DEVNAMES[i]} \ - ${TARGET_DEVTYPES[i]} ${TARGET_FSNAMES[i]}) - if [ $? -ne 0 ]; then - error_output "${inode_size_opt}" - return 1 - fi - - if [ -n "${inode_size_opt}" ]; then - if [ -z "${TARGET_MKFSOPTS[i]}" ]; then - TARGET_MKFSOPTS[i]="${inode_size_opt}" - else - TARGET_MKFSOPTS[i]=${TARGET_MKFSOPTS[i]}" ${inode_size_opt}" - fi - fi - - if [ "${TARGET_MKFSOPTS[i]}" != "${TARGET_MKFSOPTS[i]#*,*}" ]; then - TARGET_MKFSOPTS[i]="\""${TARGET_MKFSOPTS[i]}"\"" - fi - done - return 0 -} - -# get_target_configs hostname -# Get the lustre target informations from the node @hostname -get_target_configs() { - declare -i i - local host_name=$1 - local ret_line line - - # Initialize the arrays - unset TARGET_CONFIGS - - # Get lustre target server names - if ! get_svnames ${host_name}; then - return 1 - fi - - # Get lustre target device names, mount points and loop device sizes - if ! get_devnames ${host_name}; then - return 1 - fi - - # Get lustre target device type, fsname, index, etc. - if ! get_ldds ${host_name}; then - return 1 - fi - - # Get mkfs options of lustre targets - if ! get_mkfsopts ${host_name}; then - return 1 - fi - - # Construct lustre target configs - for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do - [ -z "${TARGET_DEVNAMES[i]}" ] && continue - TARGET_CONFIGS[i]=${TARGET_DEVNAMES[i]},${TARGET_MNTPNTS[i]},${TARGET_DEVTYPES[i]},${TARGET_FSNAMES[i]},${TARGET_MGSNIDS[i]},${TARGET_INDEXES[i]},${TARGET_FMTOPTS[i]},${TARGET_MKFSOPTS[i]},${TARGET_MNTOPTS[i]},${TARGET_FAILNIDS[i]} - done - - return 0 -} - -# get_configs hostname -# Get all the informations needed to generate a csv file from -# the node @hostname -get_configs() { - # Check the hostname - if [ -z "$1" ]; then - error_output "get_configs():" \ - "Missing hostname!" - return 1 - fi - - # Get network module options - verbose_output "" - verbose_output "Collecting network module options from host $1..." - if ! get_module_opts $1; then - return 1 - fi - verbose_output "OK" - - # Get lustre target informations - verbose_output "Collecting Lustre targets informations from host $1..." - if ! get_target_configs $1; then - return 1 - fi - verbose_output "OK" - - # Get HA software configurations - if ! get_ha_configs $1; then - return 1 - fi - - return 0 -} - -# Collect linux MD/LVM device informations from the lustre cluster and -# append them to the csv file -get_mdlvm_info() { - declare -i idx - declare -i i - local line - - # Collect and append linux MD/LVM informations to the csv file - for ((idx = 0; idx < ${#HOST_NAMES[@]}; idx++)); do - [ -z "${HOST_NAMES[idx]}" ] && continue - - # Collect MD device informations - ! get_md_configs ${HOST_NAMES[idx]} && return 1 - - # Append MD device informations to the csv file - for ((i = 0; i < ${#MD_NAME[@]}; i++)); do - line=${HOST_NAMES[idx]},${MD_MARKER},${MD_NAME[i]},,,${MD_LEVEL[i]},${MD_DEVS[i]} - verbose_output "Informations of MD device ${MD_NAME[i]}" \ - "in host ${HOST_NAMES[idx]} are as follows:" - verbose_output "${line}" - echo "${line}" >> ${LUSTRE_CSV_FILE} - done - - # Collect PV informations - ! get_pv_configs ${HOST_NAMES[idx]} && return 1 - - # Append PV informations to the csv file - if [ -n "${PV_NAMES}" ]; then - line=${HOST_NAMES[idx]},${PV_MARKER},${PV_NAMES} - verbose_output "Informations of PVs" \ - "in host ${HOST_NAMES[idx]} are as follows:" - verbose_output "${line}" - echo "${line}" >> ${LUSTRE_CSV_FILE} - fi - - # Collect VG informations - ! get_vg_configs ${HOST_NAMES[idx]} && return 1 - - # Append VG informations to the csv file - for ((i = 0; i < ${#VG_NAME[@]}; i++)); do - line=${HOST_NAMES[idx]},${VG_MARKER},${VG_NAME[i]},,,${VG_PVNAMES[i]} - verbose_output "Informations of VG ${VG_NAME[i]}" \ - "in host ${HOST_NAMES[idx]} are as follows:" - verbose_output "${line}" - echo "${line}" >> ${LUSTRE_CSV_FILE} - done - - # Collect LV informations - ! get_lv_configs ${HOST_NAMES[idx]} && return 1 - - # Append LV informations to the csv file - for ((i = 0; i < ${#LV_NAME[@]}; i++)); do - line=${HOST_NAMES[idx]},${LV_MARKER},${LV_NAME[i]},,,${LV_SIZE[i]},${LV_VGNAME[i]} - verbose_output "Informations of LV /dev/${LV_VGNAME[i]}/${LV_NAME[i]}"\ - "in host ${HOST_NAMES[idx]} are as follows:" - verbose_output "${line}" - echo "${line}" >> ${LUSTRE_CSV_FILE} - done - done - return 0 -} - -# Generate the csv file from the lustre cluster -gen_csvfile() { - declare -i idx - declare -i i - local line - - # Get lustre cluster node names - verbose_output "Collecting Lustre cluster node names..." - if ! get_hostnames; then - return 1 - fi - verbose_output "OK" - - : > ${LUSTRE_CSV_FILE} - - ${GET_MDLVM_INFO} && get_mdlvm_info - - # Collect and append lustre target informations to the csv file - for ((idx = 0; idx < ${#HOST_NAMES[@]}; idx++)); do - # Collect informations - if ! get_configs ${HOST_NAMES[idx]}; then - rm -f ${LUSTRE_CSV_FILE} - return 1 - fi - - # Append informations to the csv file - for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do - [ -z "${TARGET_DEVNAMES[i]}" ] && continue - - if [ -z "${HA_CONFIGS[i]}" ]; then - line=${HOST_NAMES[idx]},${MODULE_OPTS},${TARGET_CONFIGS[i]} - else - line=${HOST_NAMES[idx]},${MODULE_OPTS},${TARGET_CONFIGS[i]},${HA_CONFIGS[i]} - fi - verbose_output "Informations of target ${TARGET_DEVNAMES[i]}" \ - "in host ${HOST_NAMES[idx]} are as follows:" - verbose_output "${line}" - echo "" >> ${LUSTRE_CSV_FILE} - echo "${line}" >> ${LUSTRE_CSV_FILE} - done - done - - return 0 -} - -# Main flow -echo "`basename $0`: ******** Generate csv file -- ${LUSTRE_CSV_FILE} START ********" -if ! gen_csvfile; then - exit 1 -fi -echo "`basename $0`: ******** Generate csv file -- ${LUSTRE_CSV_FILE} OK **********" - -exit 0 diff --git a/lustre/scripts/lustre_req_history b/lustre/scripts/lustre_req_history index 72d592f..0bbab70 100644 --- a/lustre/scripts/lustre_req_history +++ b/lustre/scripts/lustre_req_history @@ -45,10 +45,10 @@ fi TMP_DIR_CLIENT=`mktemp -d /tmp/src_req_history.XXXXX` chmod ugo+rwx $TMP_DIR_CLIENT -MDS_NODE=`cat /proc/fs/lustre/mdc/*/mds_conn_uuid | cut -d @ -f1` +MDS_NODE=$(lctl get_param -n mdc.*.mds_conn_uuid | cut -d @ -f1) TARGET="OSS" i=0 -for NODE in `cat /proc/fs/lustre/osc/*-osc-*/ost_conn_uuid | cut -d @ -f1` ; do +for NODE in $(lctl get_param -n osc.*-osc-*.ost_conn_uuid | cut -d @ -f1) ; do NODE_ARRAY[$i]=$NODE i=`expr $i + 1` done @@ -56,7 +56,7 @@ done # Get the Histories from all the OSS's k=0 -for NODE in `cat /proc/fs/lustre/osc/*-osc-*/ost_conn_uuid | cut -d @ -f1` ; do +for NODE in $(lctl get_param -n osc.*-osc-*.ost_conn_uuid | cut -d @ -f1) ; do SAME_OST=0 j=0 @@ -77,18 +77,17 @@ for NODE in `cat /proc/fs/lustre/osc/*-osc-*/ost_conn_uuid | cut -d @ -f1` ; do TMPFILE2=`mktemp /tmp/temp_histories_$NODE.XXXXXXXXXX` rm -f $TMPFILE1 rm -f $TMPFILE2 - ( echo cd /proc/fs/lustre - echo touch $TMPFILE1 + ( echo touch $TMPFILE1 echo touch $TMPFILE2 echo chmod go+rw $TMPFILE1 echo chmod go+rw $TMPFILE2 for FILE in $REQ_FILES_OST; do SERVICE=`echo $FILE | cut -d "/" -f3` - echo "cat $FILE |cut -d\" \" -f1 | sed s/$/:$TARGET/| sed s/$/:$SERVICE/ >> $TMPFILE1" + echo "lctl get_param -n $FILE |cut -d\" \" -f1 | sed s/$/:$TARGET/| sed s/$/:$SERVICE/ >> $TMPFILE1" done for FILE in $REQ_FILES_CLIENT; do SERVICE=`echo $FILE | cut -d "/" -f3` - echo "cat $FILE | sed s/$/:$TARGET/ | sed s/$/:$SERVICE/ >> $TMPFILE1" + echo "lctl get_param -n $FILE | sed s/$/:$TARGET/ | sed s/$/:$SERVICE/ >> $TMPFILE1" done if [ $CLIENT = $NODE ] then @@ -104,19 +103,18 @@ done # Get the Histories from the MDS TARGET="MDS" -for NODE in `cat /proc/fs/lustre/mdc/*/mds_conn_uuid | cut -d @ -f1`; do +for NODE in $(lctl get_param -n mdc.*.mds_conn_uuid | cut -d @ -f1); do TMPFILE1=`mktemp /tmp/temp_histories_$NODE.XXXXXXXXXX` rm -f $TMPFILE1 TMPFILE2=`mktemp /tmp/temp_histories_$NODE.XXXXXXXXXX` rm -f $TMPFILE2 - ( echo cd /proc/fs/lustre - echo touch $TMPFILE1 + ( echo touch $TMPFILE1 echo touch $TMPFILE2 echo chmod go+rw $TMPFILE1 echo chmod go+rw $TMPFILE2 for FILE in $REQ_FILES_MDT; do SERVICE=`echo $FILE | cut -d "/" -f3` - echo "cat $FILE | sed s/$/:$TARGET/ | sed s/$/:$SERVICE/ >> $TMPFILE1" + echo "lctl get_param -n $FILE | sed s/$/:$TARGET/ | sed s/$/:$SERVICE/ >> $TMPFILE1" done if [ $CLIENT = $NODE ] then @@ -137,7 +135,7 @@ TARGET="CLIENT" for FILE in $REQ_FILES_CLIENT; do SERVICE=`echo $FILE | cut -d "/" -f3` - cat /proc/fs/lustre/$FILE | sed s/$/:$TARGET/ | sed s/$/:$SERVICE/ >> $TMPFILE1 + lctl get_param -n $FILE | sed s/$/:$TARGET/ | sed s/$/:$SERVICE/ >> $TMPFILE1 done grep -r 0@lo $TMPFILE1 >> $TMPFILE2 diff --git a/lustre/tests/qos.sh b/lustre/tests/qos.sh index db33465..0318af1 100644 --- a/lustre/tests/qos.sh +++ b/lustre/tests/qos.sh @@ -17,20 +17,16 @@ rm -fr $MOUNT/* sleep 1 # to ensure we get up-to-date statfs info set_qos() { - for i in `ls /proc/fs/lustre/lov/*/qos_threshold`; do - echo $(($1/1024)) > $i - done - for i in `ls /proc/fs/lustre/lov/*/qos_maxage`; do - echo $2 > $i - done + lctl set_param lov.*.qos_threshold=$(($1/1024)) + lctl set_param lov.*.qos_maxage=$2 } # assume all osts has same free space -OSTCOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1` -TOTALAVAIL=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1` +OSTCOUNT=$(lctl get_param -n lov.*.activeobd | head -n 1) +TOTALAVAIL=$(lctl get_param -n llite.*.kbytesavail | head -n 1) SINGLEAVAIL=$(($TOTALAVAIL/$OSTCOUNT)) MINFREE=$((1024 * 4)) # 4M -TOTALFFREE=`cat /proc/fs/lustre/llite/*/filesfree | head -n 1` +TOTALFFREE=$(lctl get_param -n llite.*.filesfree | head -n 1) if [ $SINGLEAVAIL -lt $MINFREE ]; then echo "ERROR: single ost free size($SINGLEAVAIL kb) is too low!" diff --git a/lustre/tests/runiozone b/lustre/tests/runiozone index 5eacb9c..b364abe 100755 --- a/lustre/tests/runiozone +++ b/lustre/tests/runiozone @@ -8,7 +8,7 @@ [ $1 ] && SIZE=$1 LOOP=0 rm -f endiozone -echo 0 > /proc/sys/lnet/debug +lctl set_param -n debug=0 while date; do LOOP=`expr $LOOP + 1` echo "Test #$LOOP" diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0f7fc43..a5570fd 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -9187,7 +9187,17 @@ test_133e() { run_test 133e "Verifying OST {read,write}_bytes nid stats =================" test_133f() { - local proc_dirs="/proc/fs/lustre/ /proc/sys/lnet/ /proc/sys/lustre/" + local proc_dirs + + local dirs="/proc/fs/lustre/ /proc/sys/lnet/ /proc/sys/lustre/ \ +/sys/fs/lustre/ /sys/fs/lnet/" + local dir + for dir in $dirs; do + if [ -d $dir ]; then + proc_dirs="$proc_dirs $dir" + fi + done + local facet remote_mds_nodsh && skip "remote MDS with nodsh" && return @@ -9216,7 +9226,17 @@ test_133f() { run_test 133f "Check for LBUGs/Oopses/unreadable files in /proc" test_133g() { - local proc_dirs="/proc/fs/lustre/ /proc/sys/lnet/ /proc/sys/lustre/" + local proc_dirs + + local dirs="/proc/fs/lustre/ /proc/sys/lnet/ /proc/sys/lustre/ \ +/sys/fs/lustre/ /sys/fs/lnet/" + local dir + for dir in $dirs; do + if [ -d $dir ]; then + proc_dirs="$proc_dirs $dir" + fi + done + local facet # Second verifying readability. @@ -11411,7 +11431,7 @@ run_test 214 "hash-indexed directory test - bug 20133" # having "abc" as 1st arg, creates $TMP/lnet_abc.out and $TMP/lnet_abc.sys create_lnet_proc_files() { - cat /proc/sys/lnet/$1 >$TMP/lnet_$1.out || error "cannot read /proc/sys/lnet/$1" + lctl get_param -n $1 >$TMP/lnet_$1.out || error "cannot read lnet.$1" sysctl lnet.$1 >$TMP/lnet_$1.sys_tmp || error "cannot read lnet.$1" sed "s/^lnet.$1\ =\ //g" "$TMP/lnet_$1.sys_tmp" >$TMP/lnet_$1.sys @@ -11467,14 +11487,13 @@ test_215() { # for bugs 18102, 21079, 21517 local L2 # regexp for 2nd line (optional) local BR # regexp for the rest (body) - # /proc/sys/lnet/stats should look as 11 space-separated non-negative numerics + # lnet.stats should look as 11 space-separated non-negative numerics BR="^$N $N $N $N $N $N $N $N $N $N $N$" create_lnet_proc_files "stats" - check_lnet_proc_stats "stats.out" "/proc/sys/lnet/stats" "$BR" check_lnet_proc_stats "stats.sys" "lnet.stats" "$BR" remove_lnet_proc_files "stats" - # /proc/sys/lnet/routes should look like this: + # lnet.routes should look like this: # Routing disabled/enabled # net hops priority state router # where net is a string like tcp0, hops > 0, priority >= 0, @@ -11484,11 +11503,10 @@ test_215() { # for bugs 18102, 21079, 21517 L2="^net +hops +priority +state +router$" BR="^$NET +$N +(0|1) +(up|down) +$NID$" create_lnet_proc_files "routes" - check_lnet_proc_entry "routes.out" "/proc/sys/lnet/routes" "$BR" "$L1" "$L2" check_lnet_proc_entry "routes.sys" "lnet.routes" "$BR" "$L1" "$L2" remove_lnet_proc_files "routes" - # /proc/sys/lnet/routers should look like this: + # lnet.routers should look like this: # ref rtr_ref alive_cnt state last_ping ping_sent deadline down_ni router # where ref > 0, rtr_ref > 0, alive_cnt >= 0, state is up/down, # last_ping >= 0, ping_sent is boolean (0/1), deadline and down_ni are @@ -11496,11 +11514,10 @@ test_215() { # for bugs 18102, 21079, 21517 L1="^ref +rtr_ref +alive_cnt +state +last_ping +ping_sent +deadline +down_ni +router$" BR="^$P +$P +$N +(up|down) +$N +(0|1) +$I +$I +$NID$" create_lnet_proc_files "routers" - check_lnet_proc_entry "routers.out" "/proc/sys/lnet/routers" "$BR" "$L1" check_lnet_proc_entry "routers.sys" "lnet.routers" "$BR" "$L1" remove_lnet_proc_files "routers" - # /proc/sys/lnet/peers should look like this: + # lnet.peers should look like this: # nid refs state last max rtr min tx min queue # where nid is a string like 192.168.1.1@tcp2, refs > 0, # state is up/down/NA, max >= 0. last, rtr, min, tx, min are @@ -11508,21 +11525,19 @@ test_215() { # for bugs 18102, 21079, 21517 L1="^nid +refs +state +last +max +rtr +min +tx +min +queue$" BR="^$NID +$P +(up|down|NA) +$I +$N +$I +$I +$I +$I +$N$" create_lnet_proc_files "peers" - check_lnet_proc_entry "peers.out" "/proc/sys/lnet/peers" "$BR" "$L1" check_lnet_proc_entry "peers.sys" "lnet.peers" "$BR" "$L1" remove_lnet_proc_files "peers" - # /proc/sys/lnet/buffers should look like this: + # lnet.buffers should look like this: # pages count credits min # where pages >=0, count >=0, credits and min are numeric (0 or >0 or <0) L1="^pages +count +credits +min$" BR="^ +$N +$N +$I +$I$" create_lnet_proc_files "buffers" - check_lnet_proc_entry "buffers.out" "/proc/sys/lnet/buffers" "$BR" "$L1" check_lnet_proc_entry "buffers.sys" "lnet.buffers" "$BR" "$L1" remove_lnet_proc_files "buffers" - # /proc/sys/lnet/nis should look like this: + # lnet.nis should look like this: # nid status alive refs peer rtr max tx min # where nid is a string like 192.168.1.1@tcp2, status is up/down, # alive is numeric (0 or >0 or <0), refs >= 0, peer >= 0, @@ -11530,15 +11545,14 @@ test_215() { # for bugs 18102, 21079, 21517 L1="^nid +status +alive +refs +peer +rtr +max +tx +min$" BR="^$NID +(up|down) +$I +$N +$N +$N +$N +$I +$I$" create_lnet_proc_files "nis" - check_lnet_proc_entry "nis.out" "/proc/sys/lnet/nis" "$BR" "$L1" check_lnet_proc_entry "nis.sys" "lnet.nis" "$BR" "$L1" remove_lnet_proc_files "nis" - # can we successfully write to /proc/sys/lnet/stats? - echo "0" >/proc/sys/lnet/stats || error "cannot write to /proc/sys/lnet/stats" + # can we successfully write to lnet.stats? + lctl set_param -n stats=0 || error "cannot write to lnet.stats" sysctl -w lnet.stats=0 || error "cannot write to lnet.stats" } -run_test 215 "/proc/sys/lnet exists and has proper content - bugs 18102, 21079, 21517" +run_test 215 "lnet exists and has proper content - bugs 18102, 21079, 21517" test_216() { # bug 20317 [ $PARALLEL == "yes" ] && skip "skip parallel run" && return @@ -12971,10 +12985,9 @@ test_900() { local ls #define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 $LCTL set_param fail_loc=0x903 - # cancel_lru_locks mgc - does not work due to lctl set_param syntax - for ls in /proc/fs/lustre/ldlm/namespaces/MGC*/lru_size; do - echo "clear" > $ls - done + + cancel_lru_locks MGC + FAIL_ON_ERROR=true cleanup FAIL_ON_ERROR=true setup } diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index adbc37b..9f87683 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -1027,8 +1027,9 @@ run_test 34 "no lock timeout under IO" test_35() { # bug 17645 local generation=[] local count=0 - for imp in /proc/fs/lustre/mdc/$FSNAME-MDT*-mdc-*; do - g=$(awk '/generation/{print $2}' $imp/import) + gen=$(lctl get_param mdc.$FSNAME-MDT*-mdc-*.import | grep generation | + awk '/generation/{print $2}') + for g in $gen; do generation[count]=$g let count=count+1 done @@ -1070,10 +1071,20 @@ test_35() { # bug 17645 do_facet client "lctl set_param fail_loc=0x0" df -h $MOUNT1 $MOUNT2 count=0 - for imp in /proc/fs/lustre/mdc/$FSNAME-MDT*-mdc-*; do - g=$(awk '/generation/{print $2}' $imp/import) - if ! test "$g" -eq "${generation[count]}"; then - error "Eviction happened on import $(basename $imp)" + gen=$(lctl get_param mdc.$FSNAME-MDT*-mdc-*.import | grep generation | + awk '/generation/{print $2}') + for g in $gen; do + if ! test "$g" -eq "${generation[count]}"; then + list=$(lctl list_param mdc.$FSNAME-MDT*-mdc-*.import) + local c = 0 + for imp in $list; do + if [ $c = $count ]; then + break + fi + c=c+1 + done + imp=$(echo "$imp" | awk -F"." '{print $2}') + error "Eviction happened on import $imp" fi let count=count+1 done diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 95b4c1b..dd84abb 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -10,7 +10,6 @@ export EJOURNAL=${EJOURNAL:-""} export REFORMAT=${REFORMAT:-""} export WRITECONF=${WRITECONF:-""} export VERBOSE=${VERBOSE:-false} -export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe} export GSS=false export GSS_KRB5=false export GSS_PIPEFS=false @@ -719,7 +718,8 @@ init_gss() { fi if [ -n "$LGSS_KEYRING_DEBUG" ]; then - echo $LGSS_KEYRING_DEBUG > /proc/fs/lustre/sptlrpc/gss/lgss_keyring/debug_level + lctl set_param -n \ + sptlrpc.gss.lgss_keyring.debug_level=$LGSS_KEYRING_DEBUG fi fi } @@ -1947,8 +1947,12 @@ stop_client_loads() { # verify that lustre actually cleaned up properly cleanup_check() { - [ -f "$CATASTROPHE" ] && [[ $(< $CATASTROPHE) -ne 0 ]] && - error "LBUG/LASSERT detected" + VAR=$(lctl get_param -n catastrophe 2>&1) + if [ $? = 0 ] ; then + if [ $VAR != 0 ]; then + error "LBUG/LASSERT detected" + fi + fi BUSY=$(dmesg | grep -i destruct || true) if [ -n "$BUSY" ]; then echo "$BUSY" 1>&2 @@ -4415,10 +4419,31 @@ set_nodes_failloc () { cancel_lru_locks() { $LCTL mark "cancel_lru_locks $1 start" - for d in `lctl get_param -N ldlm.namespaces.*.lru_size | egrep -i $1`; do - $LCTL set_param -n $d=clear - done - $LCTL get_param ldlm.namespaces.*.lock_unused_count | egrep -i $1 | grep -v '=0' + + if [ $1 != "MGC" ]; then + for d in $(lctl get_param -N ldlm.namespaces.*.lru_size | + egrep -i $1); do + $LCTL set_param -n $d=clear + done + $LCTL get_param ldlm.namespaces.*.lock_unused_count | egrep -i $1 | + grep -v '=0' + else + for d in $(find \ + /{proc,sys}/fs/lustre/ldlm/namespaces/*$1*/lru_size \ + 2> /dev/null); do + echo "clear" > $d + done + + for d in $(find \ + /{proc,sys}/fs/lustre/ldlm/namespaces/*$1*/lock_unused_count \ + 2> /dev/null); do + if [ $(cat $d) != 0 ]; then + echo "ldlm.namespaces.$(echo "$d" | + cut -f 7 -d'/').lock_unused_count=$(cat $d)" + fi + done + fi + $LCTL mark "cancel_lru_locks $1 stop" } @@ -5500,14 +5525,17 @@ restore_lustre_params() { check_catastrophe() { local rnodes=${1:-$(comma_list $(remote_nodes_list))} - local C=$CATASTROPHE - [ -f $C ] && [ $(cat $C) -ne 0 ] && return 1 + VAR=$(lctl get_param -n catastrophe 2>&1) + if [ $? = 0 ] ; then + if [ $VAR != 0 ]; then + return 1 + fi + fi [ -z "$rnodes" ] && return 0 local data - data=$(do_nodes "$rnodes" "rc=\\\$([ -f $C ] && - echo \\\$(< $C) || echo 0); + data=$(do_nodes "$rnodes" "rc=\\\$(lctl get_param -n catastrophe); if [ \\\$rc -ne 0 ]; then echo \\\$(hostname): \\\$rc; fi exit \\\$rc") local rc=$? diff --git a/lustre/utils/llobdstat b/lustre/utils/llobdstat index 733cb4b..7a89318 100755 --- a/lustre/utils/llobdstat +++ b/lustre/utils/llobdstat @@ -5,13 +5,12 @@ my $pname = $0; -my $defaultpath = "/proc/fs/lustre"; my $obdstats = "stats"; sub usage() { print STDERR "Usage: $pname [ [}]\n"; - print STDERR "where ost_name : ost name under $defaultpath/obdfilter\n"; + print STDERR "where ost_name : ost name under /{proc,sys}/fs/lustre/obdfilter\n"; print STDERR " interval : sample interaval in seconds\n"; print STDERR "example: $pname lustre-OST0000 2\n"; print STDERR "Use CTRL + C to stop statistics printing\n"; @@ -33,9 +32,14 @@ if (($#ARGV < 0) || ($#ARGV > 2)) { } elsif ( -f "$ARGV[0]/$obdstats" ) { $statspath = "$ARGV[0]/$obdstats"; } else { - my $st = "$defaultpath/obdfilter/$ARGV[0]/$obdstats"; + my $st = glob ("/{proc,sys}/fs/lustre/obdfilter/$ARGV[0]"); if ( -f "$st" ) { $statspath = $st; + } else { + my $st = glob("/{proc,sys}/fs/lustre/obdfilter/$ARGV[0]/$obdstats"); + if ( -f "$st" ) { + $statspath = $st; + } } } if ( $statspath =~ /^None$/ ) { @@ -49,7 +53,7 @@ if (($#ARGV < 0) || ($#ARGV > 2)) { } } -print "$pname on $statspath\n"; +print "$pname on $ARGV[0]\n"; my %cur; my %last; diff --git a/lustre/utils/llstat b/lustre/utils/llstat index d3ddd0f..918209b 100644 --- a/lustre/utils/llstat +++ b/lustre/utils/llstat @@ -10,12 +10,12 @@ sub usage() { print STDERR "Usage: $pname [-c] [-g] [-i ] [-h ] \n"; - print STDERR " stats_file : /proc/fs/lustre/.../stat\n"; + print STDERR " stats_file : lustre/.../stat\n"; print STDERR " -i interval: polling period\n"; print STDERR " -c : clear stats file first\n"; print STDERR " -g : graphable output format\n"; print STDERR " -h : help, display this information\n"; - print STDERR "example: $pname -i 1 ost (monitors /proc/fs/lustre/ost/OSS/ost/stats)\n"; + print STDERR "example: $pname -i 1 ost (monitors lustre/ost/OSS/ost/stats)\n"; print STDERR "Use CTRL + C to stop statistics printing\n"; exit 1; } @@ -125,7 +125,6 @@ sub readstat() #Globals $pname = $0; -$defaultpath = "/proc/fs/lustre"; $obdstats = "stats"; $clear = 0; $graphable = 0; @@ -168,11 +167,11 @@ if ( -f $obddev ) { } elsif ( -f "$obddev/$obdstats" ) { $statspath = "$obddev/$obdstats"; } else { - my $st = glob("$defaultpath/*/$obddev/$obdstats"); + my $st = glob("/{proc,sys}/fs/lustre/*/$obddev/$obdstats"); if ( -f "$st" ) { $statspath = $st; } else { - $st = glob("$defaultpath/*/*/$obddev/$obdstats"); + $st = glob("/{proc,sys}/fs/lustre/*/*/$obddev/$obdstats"); if ( -f "$st" ) { $statspath = $st; } -- 1.8.3.1