From dfa0760e7d8c7f5f56ebb5ee2e766f0a05cc4e67 Mon Sep 17 00:00:00 2001 From: "John L. Hammond" Date: Tue, 27 Apr 2021 09:38:30 -0500 Subject: [PATCH] Squashed 'lipe/' changes from 38f79e56ec..8251fae87b 8251fae87b Update lipe version to 1.17. 87ee780007 EX-1613 scripts: Use ticket to start/stop hotpools 0ce8cdc011 EX-3078 lipe: quote FIDs in remote commands dcd24fe01e EX-3034 lamigo: check for available agents early 0ff4506a1d EX-3043 lamigo: remove debugging leftover 22cf972d45 EX-3043 lamigo: simplify changelog cleaning check de214cee10 EX-3009 lamigo: dump changelog status 8e7ee6cc80 EX-3017 lpurge: for stats for skipped objects b0bce08ec5 EX-2768 lamigo: don't register a SIGCHLD handler 8981cfa704 EX-3036 lipe: version and revision support 1f562d542b EX-3020 lamigo: prevent out of order changelog clearing d859e4bff2 EX-3021 lipe: refactor lipe_ssh context handling 4124346b0d EX-2718 lpurge: use local mountpoint for purge operations 7caf05b672 EX-3030 lipe: join multiple threads in lamigo_check_jobs() 911cf5018b EX-2994 lipe: update lpurge purged stats correctly 3d1af585e3 Update lipe version to 1.16. 842d8c5aad EX-2962 lipe: Fix config autodetect df423e9539 EX-2948 build: less checks in lipe configuration 6745370c5b EX-2983 lamigo: reduce log level in lamigo_exec_cmd() 77e90c1df3 EX-2979 lamigo: do not count setprefer as replication eda4f09711 EX-2608 scripts: Auto detect previous values 4ca909e2ea Update lipe version to 1.15. 2f80b12aaf EX-2770 lpurge: set lop_mdt_idx before spawning thread 370ba3a118 EX-2930 lipe: fix errno.h include 4f672e1346 EX-2921 lipe: merge tools/lipe to lipe subtree 17a2a63533 EX-2778 lipe: lipe.spec fixes git-subtree-dir: lipe git-subtree-split: 8251fae87b508e36caab6397b1063b308dcb2b05 --- Makefile.am | 1 + configure.ac | 39 +- hotpools-1.17-release-notes.md | 456 +++++++++++++++++ lipe-revision.sh | 29 ++ lipe.spec.in | 26 +- scripts/stratagem-hp-config.sh | 249 +++++++--- scripts/stratagem-hp-convert.sh | 76 +++ scripts/stratagem-hp-start.sh | 28 +- scripts/stratagem-hp-stop.sh | 30 +- scripts/stratagem-hp-teardown-client.sh | 90 ++++ scripts/stratagem-hp-teardown.sh | 86 +++- src/Makefile.am | 6 +- src/lamigo.c | 220 ++++++--- src/lamigo_alr.c | 27 +- src/lipe_ssh.c | 238 +++++---- src/lipe_ssh.h | 14 +- src/lipe_version.c | 32 ++ src/lipe_version.h | 8 + src/lpurge.c | 837 +++++++++++++++----------------- src/lustre_ea.c | 4 +- version-gen.sh | 13 +- 21 files changed, 1693 insertions(+), 816 deletions(-) create mode 100644 hotpools-1.17-release-notes.md create mode 100755 lipe-revision.sh create mode 100755 scripts/stratagem-hp-convert.sh create mode 100755 scripts/stratagem-hp-teardown-client.sh create mode 100644 src/lipe_version.c create mode 100644 src/lipe_version.h diff --git a/Makefile.am b/Makefile.am index 07c0811..a9f0a1b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -55,6 +55,7 @@ PYTHON_COMMANDS = \ EXTRA_DIST= \ $(PYTHON_COMMANDS) \ detect-distro.sh \ + lipe-revision.sh \ example_configs/clownfish/seperate_mgs/clownfish.conf \ example_configs/clownfish/seperate_mgs/lipe_virt.conf \ example_configs/lipe/lipe_install.conf \ diff --git a/configure.ac b/configure.ac index dc74bf9..88a47d5 100644 --- a/configure.ac +++ b/configure.ac @@ -108,43 +108,13 @@ if test "x$idle_user_header" = "xyes"; then AC_DEFINE(IDLE_USER_HEADER, 1, [Lustre uses idle user header]) fi -# -------- check whether DoM is supported by Lustre head files -------- -AC_MSG_CHECKING([Lustre has PFL support]) -saved_libs=$LIBS -LIBS="-llustreapi" -AC_LINK_IFELSE([AC_LANG_SOURCE([ - #include - - int main(void) { - int a = LOV_USER_MAGIC_COMP_V1; - } -])],[ +# few defines which aren't conditional anymore AC_DEFINE([HAVE_LUSTRE_PFL], 1, [Lustre has PFL support]) -have_lustre_pfl=yes -], [have_lustre_pfl="no"]) -LIBS=$saved_libs -AC_MSG_RESULT([$have_lustre_pfl]) - -# -------- check for llapi_layout_get_by_xattr() -------- -AC_MSG_CHECKING([Lustre have llapi_layout_get_by_xattr() and LSoM]) -saved_libs=$LIBS -LIBS="-llustreapi" -AC_LINK_IFELSE([AC_LANG_SOURCE([ - #include - - int main(void) { - llapi_layout_get_by_xattr(NULL, 0, 0); - } -])],[ AC_DEFINE([HAVE_LAYOUT_BY_XATTR], 1, [have llapi_layout_get_by_xattr()]) AC_DEFINE([HAVE_LAZY_SIZE_ON_MDT], 1, [have lazy size on MDT]) -have_layout_by_xattr=yes -], [have_layout_by_xattr="no"]) -LIBS=$saved_libs -AC_MSG_RESULT([$have_layout_by_xattr]) # -------- check for llapi_changelog_in_buf() -------- AC_MSG_CHECKING([Lustre have llapi_changelog_in_buf()]) @@ -337,8 +307,15 @@ AC_SEARCH_LIBS([ext2fs_open2], [ext2fs]) LIPE_RELEASE="1" AC_DEFINE_UNQUOTED(RELEASE, "$LIPE_RELEASE", [release info] ) +AC_DEFINE_UNQUOTED(LIPE_RELEASE, "$LIPE_RELEASE", [release info] ) AC_SUBST(LIPE_RELEASE) +AC_MSG_CHECKING([for lipe revision]) +LIPE_REVISION=$(bash lipe-revision.sh) +AC_MSG_RESULT([$LIPE_REVISION]) +AC_DEFINE_UNQUOTED(LIPE_REVISION, "$LIPE_REVISION", [revision info] ) +AC_SUBST(LIPE_REVISION) + # for exporting to spec file AC_SUBST(ac_configure_args) AC_CONFIG_FILES([Makefile diff --git a/hotpools-1.17-release-notes.md b/hotpools-1.17-release-notes.md new file mode 100644 index 0000000..14c6763 --- /dev/null +++ b/hotpools-1.17-release-notes.md @@ -0,0 +1,456 @@ +# Hotpools Hotfix (lipe-server-1.17) Release Notes + +## lamigo agents + +lamigo will continue to ssh to "agents" and invoke lfs to mirror and +resync files. An "agent" need not be an EXAScaler server (VM). Instead +an agent can be any node that lamigo can reach via ssh as root, has a +Lustre client mount, and has the lfs utility installed. The lamigo +configuration file determines the set of agents used: +``` + # MDT=myfs1-MDT0042 + # grep '^agent=' /etc/lamigo/${MDT}.conf + agent=ai400x-0a12-vm00:/lustre/myfs1/client:4 + agent=ai400x-0a12-vm01:/lustre/myfs1/client:4 + agent=ai400x-0a12-vm02:/lustre/myfs1/client:4 + agent=ai400x-0a12-vm03:/lustre/myfs1/client:4 +``` +In the triple `ai400x-0a12-vm00:/lustre/myfs1/client:4` the first +component is the hostname of the agent, the second is the mount point +of the Lustre client on the agent, and the third is the maximum number +of concurrent mirror operations. + +Using agents other than the EXAScaler servers will alleviate many of +the HA issues seen to date (DDN-1920, EX-1613). The +`stratagem-hp-config.sh` scripts will use the EXAScaler servers as +mirroring agents for lamigo. + +Previous version of lamigo and lpurge failed to startup in some +certain cases when agents were unreachable (DDN-2070, EX-3021). All +known issues in this area are believed to be fixed. + +In the example above we have 4 agents with 4 concurrent operations +each for a total of 16 concurrent operations on behalf of this lamigo +service. Some basic benchmarking on a mix of small and medium files +has shown that we can expect about twice as many (2 * 16 = 32) +replications per second from lamigo. + +## agentless lpurge + +lpurge will no longer use agents or external commands (lfs mirror +split ...) to purge files. Instead it will use the Lustre client mount +point (`/lustre/$FS/client`) on the server where lpurge runs. For +backwards compatibility lpurge will ignore any agents specified in the +config file. Note that lpurge.conf uses lines of the form +``` + mds=0:ai400x-0a12-vm00:/lustre/myfs1/client + mds=1:ai400x-0a12-vm01:/lustre/myfs1/client +``` +to specify an agent. Since they will be ignored, we recommend just +leaving them in the config file. This will make downgrade easier if +needed. + +## Online upgrade of lamigo and lpurge + +To upgrade lamigo and lpurge on a live system we suggest installing +the new lipe-server RPM to all EXAScaler servers and then running +``` + # clush -a killall lamigo lpurge +``` +In this case systemd will automatically restart lamigo and lpurge +(using the newly installed binaries) without invoking HA. This is +preferred over using `stratagem-hp-stop.sh && stratagem-hp-start.sh`. + +## Converting hotpools to use ticketed HA configuration + +Using the Lustre client mount Pacemaker resource to control the +starting and stopping of hotpools introduced cyclic resource +dependencies which in turn made stopping hotpools impossible in some +cases (usually leading to STONITH, see DDN-1920 and EX-1613). To +address this `stratagem-hp-config.sh` now uses a new Pacemaker +resource (`$FS-hotpool`) and ticket (`$FS-hotpool-allocated`) to start +and stop hotpools. Running `stratagem-hp-stop.sh` stops lamigo and +lpurge but not the Lustre client mounts on the servers. + +Alongside the changes to `stratagem-hp-config.sh`, this release +provides two new HA scripts: + +`stratagem-hp-convert.sh` converts a previously configured hotpools +installation to a ticketed configuration. Conversion may be performed +with hotpools running or stopped. + +`stratagem-hp-teardown-client.sh` stops and unconfigures the Lustre +client mount on the servers. + +## Permanently stopping hotpools + +If hotpools is to be stopped permanently or indefinitely then the +changelog users registered for lamigo should be deregistered to avoid +client operations failing due to full changelogs and increased +changelog consumption on the MDTs (upto about 512GB). This should be +done for each MDT. + +## Restarting hotpools + +If hotpools was previously enabled and subsequently stopped then +some maintenance around changelogs should be performed before +re-enabling it. Remove any old '.chlg' files: +``` + # clush -a "find /var/lib -name 'lamigo-*.chlg' -delete" +``` +For each MDT in $FS-MDT0000..$FS-MDTffff perform the following steps. + +1. Check that the changelog user specified in the lamigo config file +agrees with the changelog user registered on the MDT: +``` + # MDT=myfs1-MDT0042 + # grep '^user=' /etc/lamigo/${MDT}.conf + user=cl23 + # clush -aN lctl get_param mdd.${MDT}.changelog_users 2> /dev/null + mdd.myfs1-MDT0042.changelog_users= + current index: 83896534 + ID index (idle seconds) + cl23 82748303 (481516) +``` +If there are other changelog users registered then they may need to be +removed. Please consult with the site admins to ensure that they are +not in use by another changelog consumer (Robinhood, laudit, +...). Other registered changelog users with high `idle seconds` values +should be regarded with suspicion. + +The number of changelog entries stored on disk by the MDT is the +difference between the "current index" and the minimum of the indexes +of all registered users: +``` + 83896534 - 82748303 = 1148231 +``` +2. Check the total changelog space consumption on the MDT: +``` + # clush -aN lctl get_param mdd.${MDT}.changelog_size 2> /dev/null + mdd.myfs1-MDT0042.changelog_size=161358368 +``` +A changelog entry consumes about 144 bytes on average. So this looks +about right +``` + 144 * 1148231 = 165345264 +``` +Contact L3 if there are no changelog users registered but +`changelog_size` is reporting a large value or if `changelog_size` is much +larger than the expected size. + +3. Clear old changelogs: + +If lamigo is the only changelog consumer and there are many unconsumed +entries on the MDT then clear the changelog manually before restarting +hotpools: +``` + # clush -aN lctl get_param mdd.${MDT}.changelog_users 2> /dev/null + mdd.myfs1-MDT0042.changelog_users= + current index: 83896534 + ID index (idle seconds) + cl23 82748303 (481516) + # CL=cl23 + # lfs changelog_clear $MDT $CL 0 +``` +Clearing a large amount of old changelog entries will take some +time. (Do not interrupt it!) Changelogs on *different* MDTs may be +cleared in parallel. + +The steps listed above should be repeated for each MDT! + +## Tuning lamigo (optimization) + +Consider increasing the number of agents or the number of concurrent +jobs per agent. + +## Tuning lpurge (optimization) + +Consider increasing the max_jobs config parameter from the default of +8. + +## Reducing the changelog mask (optimization) + +If there are no changelog consumers other than lamigo then the +changelog mask may be reduced from the default to include only the +types needed by hotpools (`MARK UNLNK CLOSE`). Note the spelling of +'UNLNK'. To see the current changelog mask run: +``` + # clush -aN lctl get_param "mdd.${FS}-*.changelog_mask" + mdd.myfs1-MDT0042.changelog_mask= + MARK CREAT MKDIR HLINK SLINK MKNOD UNLNK RMDIR RENME RNMTO CLOSE LYOUT TRUNC SATTR XATTR HSM MTIME CTIME MIGRT FLRW RESYNC +``` +To make a persistent setting run the following on the MGS: +``` + # lctl set_param -P "mdd.${FS}-*.changelog_mask"='MARK UNLNK CLOSE' +``` +## Creating a nodemap for hotpools agents (optimization) + +As lamigo consumes changelogs (and OST access logs) it dispatches +mirroring operation to agents which in turn generate yet more +changelog entries that lamigo must consume. This feedback loop is not +fatal but does affect performance. It can be avoided by creating a +nodemap that containing the EXAScaler servers (and any configured +agents) and setting `audit_mode=0` for that nodemap. + +(First consult site admins to determine if nodemaps are already in +use. If so then any changes made must be consistent with current +nodemap configuration.) + +To proceed collect the set of possible LNet NIDs for the hotpools +agents. By default this will be the NIDs of the EXAScaler server VMs. +``` + # clush -aN lctl list_nids | sort + 172.16.0.48@o2ib + 172.16.0.49@o2ib + 172.16.0.50@o2ib + 172.16.0.51@o2ib + 172.16.0.52@o2ib + 172.16.0.53@o2ib + 172.16.0.54@o2ib + 172.16.0.55@o2ib +``` +This list of NIDs must be converted to a comma separated list or range +expression. In this example: +``` + 172.16.0.[48-55]@o2ib +``` +(See https://github.com/DDNStorage/lustre_manual_markdown/blob/master/03.16-Mapping%20UIDs%20and%20GIDs%20with%20Nodemap.md for the full syntax.) +To configure and enable the nodemap run the following on the MGS: +``` + # HP_NODEMAP=hp_agents + # HP_AGENT_NIDS="172.16.0.[48-55]@o2ib" + # lctl nodemap_modify --name default --property admin --value 1 + # lctl nodemap_modify --name default --property trusted --value 1 + # lctl nodemap_add $HP_NODEMAP + # lctl nodemap_modify --name $HP_NODEMAP --property admin --value 1 + # lctl nodemap_modify --name $HP_NODEMAP --property trusted --value 1 + # lctl nodemap_modify --name $HP_NODEMAP --property audit_mode --value 0 + # lctl nodemap_add_range --name $HP_NODEMAP --range "$HP_AGENT_NIDS" + # lctl nodemap_add_range --name $HP_NODEMAP --range 0@lo + # lctl nodemap_activate 1 +``` + +## Monitoring lamigo and lpurge + +There are several ways to monitor hotpools beyond checking the +utilization of the fast pool. lamigo maintains several counters and +statistics which it will write out on receiving `SIGUSR1`. The following +commands were run on the VM where `${MDT}` is mounted. +``` + # MDT=myfs1-MDT0042 + # cat /var/run/lamigo-${MDT}.pid + 20223 + # ps -Fp $(cat /var/run/lamigo-${MDT}.pid) + UID PID PPID C SZ RSS PSR STIME TTY TIME CMD + root 14434 9331 0 71699 5120 3 Apr20 ? 00:59:16 /usr/sbin/lamigo -f /etc/lamigo/myfs1-MDT0042.conf + # pkill --exact --pidfile=/var/run/lamigo-${MDT}.pid --signal=USR1 lamigo + # sleep 1 + # cat /var/run/lamigo-${MDT}.stats + version: 1.16-1 + revision: da7d99c2761e0f294a3e0a150ef59972bd04bc89 + config: + chlg_user: cl23 + mdtname: myfs1-MDT0042 + mountpoint: /lustre/myfs1/client + source_pool: ddn_ssd + target_pool: ddn_hdd + min_age: 600 + max_cache: 268435456 + rescan: 0 + thread_count: 1 + pool_refresh: 600 + progress_interval: 600 + periods: 16 + period_time: 10 + warmup_k: 20 + cooldown_k: 15 + ofd_interval: 5 + hot_fraction: 20 + hot_after_idle: 3 + src_free: 70 + tgt_free: 10 + pool ddn_ssd: + osts: 8 + avail: 22974591116 + total: 31824761344 + open: 1 + pool ddn_hdd: + osts: 2 + avail: 436791025224 + total: 494282835936 + open: 1 + stats: + read: 6188174 + skipped: 369235 + processed: 5818939 + removed: 0 + dups: 311924 + spawned: 7111691 + replicated: 4429160 + busy: 0 + queued: 1226018 + skip_hot: 0 + ro2hot: 0 + rw2hot: 0 + rw2cold: 3382984 + changelog: + last_processed_idx: 84139503 + last_cleared_idx: 82909792 + last_cleared_time: 1619120075 # ( Thu Apr 22 19:34:35 2021 ) + agents: + agent3: { host: ai400x-0a12-vm03, mnt: /lustre/myfs1/client, jobs: 4, max: 4, state: active } + agent2: { host: ai400x-0a12-vm02, mnt: /lustre/myfs1/client, jobs: 3, max: 4, state: active } + ... + jobs: + job0: { tid: 140569798792960, fid: [0x2000079ac:0xfb4e:0x0], index: 0, agent: 3, start: 1619120075, command: extend } + job1: { tid: 140568524986112, fid: [0x2000079ac:0xfb02:0x0], index: 0, agent: 3, start: 1619120075, command: extend } + job2: { tid: 140569807185664, fid: [0x2000079ac:0xfaff:0x0], index: 0, agent: 3, start: 1619120075, command: extend } + ... +``` +Note that the difference between the MDT current changelog index and +the last index cleared by lamigo indicates the degree to which +lamigo's consumption of changelog entries is keeping up the production +of new entries (both values can be seen using the `changelog_users` +parameter as shown above). A 10 minute (min-age) lag between reading +and clearing is normal. + +Similarly for lpurge, the following commands were run on the VM where +`${OST}` is mounted: +``` + # OST=myfs1-OST0117 + # cat /var/run/lpurge-${OST}.pid + 20222 + # ps -Fp $(cat /var/run/lpurge-${OST}.pid) + UID PID PPID C SZ RSS PSR STIME TTY TIME CMD + root 20222 1 6 171957 13352 16 Apr20 ? 03:09:06 /usr/sbin/lpurge -f /etc/lpurge/myfs1/OST0117.conf + # pkill --exact --pidfile=/var/run/lpurge-${OST}.pid --signal=USR1 lpurge + # sleep 1 + # cat /var/run/lpurge-${OST}.stats + version: 1.16-1 + revision: b465db8b9b99e217d175f31230d04e10a9a17906 + config: + free_high: 80 + free_low: 50 + ostname: myfs1-OST0117 + mountpoint: /lustre/myfs1/client + pool: ddn_ssd + max_jobs: 8 + check_interval: 30 + scan_rate: 10000 + scan_threads: 1 + slot_size: 1048576 + stats: + scans: 10 + scan_time: 346 + slow_scans: 5 + queued: 3210711 + started: 3202330 + purged: 363985 + failed: 2838337 + low_hits: 6 + space: + kbfree: 2997655148 + low: 1989047584 + hi: 3182476134 + hlists: +``` + +# Known issues + +## If lamigo is falling behind on changelog consumption + +Append a `rescan` directive to the lamigo configuration file, +manually clear the changelog file, and then restart lamigo. +``` + # MDT=myfs1-MDT0042 + # echo rescan >> /etc/lamigo/${MDT}.conf + # CL=... + # lfs changelog_clear $MDT $CL 0 + # pkill --exact --pidfile=/var/run/lamigo-${MDT}.pid lamigo +``` + +## lamigo SSH errors + +If lamigo reports +``` + lamigo: Error authenticating pubkey: Failed to read private key: /root/.ssh/id_rsa +``` +or +``` + lamigo: cannot authenticate SSH session to host HOST: ... +``` +then first check that `ssh` works between the affected hosts. If so +then check that the private key file (usually `/root/.ssh/id_rsa`) is +in PEM format. If the key file starts with +``` + -----BEGIN RSA PRIVATE KEY----- +``` +then it is likely in PEM format. If, instead, it starts with +``` + -----BEGIN OPENSSH PRIVATE KEY----- +``` +then it is in the newer OpenSSH format. To generate a key in PEM +format, add `-m PEM` to the `ssh-keygen` command line. + +Second, ensure that the key file does not contain any comments (lines +starting with `#`). + +## Working around HA/Pacemaker/STONITH when stopping hotpools + +Formerly "stratagem-hp-stop-no-more-tears.txt". These instructions are +obsolete after converting hotpools to a ticketed HA configuration. +``` +# Temporary process to stop hotpools in a production environment +# without invoking HA STONITH due to busy client mount points. +# +# XXX This is not an automated process. You need to run each command in +# turn and look at the output. Maybe even understand it. Sorry. +# +# See https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/high_availability_add-on_reference/s1-managedresource-haar +# for more information. +# +# Author: John L. Hammond + +# Put all lamigo and lpurge HA resources in unmanaged mode. +crm resource unmanage 'lamigo*' +crm resource unmanage 'lpurge*' + +# In the output of hastatus the lamigo and lpurge resources should now +# show up as "Started (unmanaged)" +hastatus + +# Use systemctl to actually stop the services corresponding to the HA +# resources you just unmanaged. +clush -a systemctl stop 'lamigo@*' +clush -a systemctl stop 'lpurge@*' + +# In the output of hastatus the lpurge resources should now show up as "Stopped (unmanaged)" +hastatus + +# Verify that the processes have actually terminated. +clush -a ps -FC lamigo,lpurge + +# Wait for any mirroring operations to drain from the client mounts. +clush -a lsof /lustre/$FS/client/ +clush -a ps -FC lfs + +# Once they have drained you can use the stop script. +stratagem-hp-stop.sh + +# To restart: +crm resource manage 'lamigo*' +crm resource manage 'lpurge*' +stratagem-hp-start.sh + +# Verify that things are back to normal: +hastatus + +# After all of this 'hastatus' may show some 'Failed Resource Actions'. For example: +# +# Failed Resource Actions: +# * lpurge-fs0a12-OST0004_monitor_30000 on ai400x-0a12-vm02 'not running' (7): call=316, status=complete, exitreason='', +# last-rc-change='Thu Apr 8 18:01:57 2021', queued=0ms, exec=0ms +# +# To clean these up do 'crm resource cleanup lpurge-fs0a12-OST0004' + +``` diff --git a/lipe-revision.sh b/lipe-revision.sh new file mode 100755 index 0000000..32933ff --- /dev/null +++ b/lipe-revision.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Try to get the current revision (commit hash). Don't fail. + +function get_revision { + local src="$1" + local val="$2" + + if [[ -n "${val}" ]]; then + echo "$0: using '${src}' = '${val}'" >&2 + printf "%s\n" "${val}" + exit 0 + fi +} + +# Manual override. +get_revision LIPE_REVISION "${LIPE_REVISION:-}" + +# Release build on https://es-build.datadirectnet.com/ +get_revision ES_BUILD_LUSTRE_REVISION "${ES_BUILD_LUSTRE_REVISION:-}" + +# Reviews build from Gerrit. +get_revision GERRIT_PATCHSET_REVISION "${GERRIT_PATCHSET_REVISION:-}" + +# Build from source. +GIT_REVISION="$(git rev-parse HEAD 2> /dev/null)" +get_revision 'git rev-parse HEAD' "${GIT_REVISION:-}" + +printf "None\n" diff --git a/lipe.spec.in b/lipe.spec.in index 69c9a4b..a18d53a 100644 --- a/lipe.spec.in +++ b/lipe.spec.in @@ -21,7 +21,7 @@ Prefix: %{_prefix} %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")} Release: @LIPE_RELEASE@%{?dist} - +VCS: @LIPE_REVISION@ Summary: lipe - Lustre Integrated Policy Engine License: All rights reserved DataDirect Networks Inc. Group: Applications/System @@ -328,13 +328,13 @@ cp -a example_configs/hotpool/* $RPM_BUILD_ROOT%{_sysconfdir}/ install -m 0744 -D init.d/lipe_test_scheduler \ $RPM_BUILD_ROOT%{_sysconfdir}/rc.d/init.d/lipe_test_scheduler %endif -install -g 0 -o 0 -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/ -install -g 0 -o 0 -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/ -install -g 0 -o 0 -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/ +install -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/ +install -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/ +install -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/ %if %{with laudit} -install -g 0 -o 0 -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/ -install -g 0 -o 0 -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/ -install -g 0 -o 0 -m 0644 man/laudit.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/ +install -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/ +install -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/ +install -m 0644 man/laudit.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/ %endif cp -a lhsm_tests $RPM_BUILD_ROOT%{_libdir}/ @@ -446,15 +446,15 @@ rm -rf $RPM_BUILD_ROOT %changelog -* Thu Jun 30 2020 Gu Zheng 1.5 +* Tue Jun 30 2020 Gu Zheng [1.5] - Update version to 1.5 -* Thu Apr 3 2019 Gu Zheng 1.4 +* Wed Apr 03 2019 Gu Zheng [1.4] - Devide RPM by function -* Thu Jan 3 2019 Gu Zheng 1.4 +* Thu Jan 03 2019 Gu Zheng [1.4] - Add pyltest RPM -* Wed Mar 7 2018 Sebastien Buisson 1.1 +* Wed Mar 07 2018 Sebastien Buisson [1.1] - Add laudit and laudit-report -* Mon Oct 11 2017 Li Xi 1.0 +* Wed Oct 11 2017 Li Xi [1.0] - Add license-server RPM -* Mon Feb 27 2017 Qian Yingjin 0.1 +* Mon Feb 27 2017 Qian Yingjin [0.1] - Initial import diff --git a/scripts/stratagem-hp-config.sh b/scripts/stratagem-hp-config.sh index 8e1e963..67db1d4 100755 --- a/scripts/stratagem-hp-config.sh +++ b/scripts/stratagem-hp-config.sh @@ -1,39 +1,82 @@ #!/bin/bash -# Copyright DDN 2020 # -*- indent-tabs-mode: nil -*- +# Copyright DDN 2021 +# +# Pacemaker Ticket: +# $FS-hotpool-allocated (controlled by resource $FS-hotpool) +# +# Pacemaker Resources: +# $FS-hotpool (controlls $FS-hotpool-allocated ticket) +# cl-$FS-client (cloned client mount) +# lamigo-$FS-MDTXXXX (for each MDT) +# lpurge-$FS-OSTXXXX (for each OST in fast pool) +# +# Files Created +# /etc/lamigo/$FS-MDTXXXX.conf (for each MDT) +# /etc/lpurge/$FS/OSTXXXX.conf (for each OST in fast pool) +# /etc/fstab updated with client mount line +# +# Lustre Changes +# changelog user created for each MDT # CONFIGURATION VARIABLES -# ASSUMES last configlog user is for lamigo, or creates one if one doesn't exist - -LAMIGO_MINAGE=600 -LPURGE_FREEHI=80 -LPURGE_FREELO=50 -FAST_POOL=ddn_ssd -SLOW_POOL=ddn_hdd +DEF_LAMIGO_MINAGE=600 +DEF_LPURGE_FREEHI=80 +DEF_LPURGE_FREELO=50 +DEF_FAST_POOL=ddn_ssd +DEF_SLOW_POOL=ddn_hdd function usage() { - echo "Usage: $(basename $0) -m MINAGE -H FREEHI -L FREELO -f FASTPOOL -s SLOWPOOL [ FILESYSTEM ]" - exit 0 + local rc=${1:-0} + local values=${2:-1} + echo "Usage: $(basename $0) [-h] [-n] [-m MINAGE -H FREEHI -L FREELO -f FASTPOOL -s SLOWPOOL] [ FILESYSTEM ]" + echo " Automatically configures Stratagem Hotpools in a existing HA environment" + echo " -n - No auto-detect value" + echo " -h - This help message" + echo " -m - lamigo minimum age" + echo " -H - lpurge free-hi percent" + echo " -L - lpurge free-lo percent" + echo " -f - Fast OST pool" + echo " -s - Slow OST pool" + if (( values == 1 )); then + echo "Values:" + echo " Filesystem : $FS" + echo " Fast OST Pool : $FAST_POOL" + echo " Slow OST Pool : $SLOW_POOL" + echo " LAmigo MinAge : $LAMIGO_MINAGE" + echo " LPurge FreeHI : ${LPURGE_FREEHI:-${DEF_LPURGE_FREEHI}}" + echo " LPurge FreeLO : ${LPURGE_FREELO:-${DEF_LPURGE_FREELO}}" + fi + exit $rc } -while getopts "hm:H:L:f:s:" opt; do +HELP=false +AUTO=true + +while getopts "hm:H:L:f:s:n" opt; do case "$opt" in - m) LAMIGO_MINAGE=$OPTARG ;; - H) LPURGE_FREEHI=$OPTARG ;; - L) LPURGE_FREELO=$OPTARG ;; - f) FAST_POOL=$OPTARG ;; - s) SLOW_POOL=$OPTARG ;; - h) usage ;; + m) LAMIGO_MINAGE=$OPTARG ;; + H) LPURGE_FREEHI=$OPTARG ;; + L) LPURGE_FREELO=$OPTARG ;; + f) FAST_POOL=$OPTARG ;; + s) SLOW_POOL=$OPTARG ;; + n) AUTO=false ;; + h) HELP=true ;; esac done shift $((OPTIND-1)) +FS=$1 + # Location of /lustre/$FS/ mounts and /lustre/$FS/client mount ROOT=/lustre -FS=$1 +function locate_resource() { + local res=$1 + clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true" +} function is_valid_percent { local p="${1:-}" @@ -45,62 +88,95 @@ function is_minage_ok { [[ "${p}" =~ ^[[:digit:]]+$ ]] && ((p >= 5)) } -if ! is_valid_percent "${LPURGE_FREEHI}"; then - echo "Invalid FREEHI percentage '${LPURGE_FREEHI}'" - exit 1 -fi - -if ! is_valid_percent "${LPURGE_FREELO}"; then - echo "Invalid FREELO percentage '${LPURGE_FREELO}'" - exit 1 -fi - -if ! ((LPURGE_FREELO < LPURGE_FREEHI)); then - echo "FREELO (${LPURGE_FREELO}) must be less than FREEHI (${LPURGE_FREEHI})" - exit 1 -fi - -if ! is_minage_ok "${LAMIGO_MINAGE}"; then - echo "MINAGE (${LAMIGO_MINAGE}) must be positive integer >= 5" - exit 1 -fi - if [ -z "$FS" ]; then # Try automatic detection - FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$) + FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|sort -u) if [ $(echo "$FS" |wc -w) -ne 1 ]; then - echo "Usage: $0 FSNAME" - exit 1 + echo "Error: Unable to determine filesystem automatically" + usage 22 0 fi fi -# Script from here down -function locate_resource() { - res=$1 - clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true" -} +if $AUTO && [ ! -f /etc/lamigo/$FS-MDT0000.conf ]; then + echo "Disabling auto-detection of settings ($FS-MDT0000 config missing)" + AUTO=false +fi MDSHOST=$(locate_resource mdt0000-$FS) if [ -z "$MDSHOST" ]; then # if FS is not an resident filesystem, mdt0 will not exist for it echo Failed to find mdt0 for filesystem: $FS - exit 1 + exit 5 +fi + +# reuse of changelog user handled below +# get lpurge options - this gets the first OST from the FAST pool +if $AUTO; then + AUTO_FAST_POOL=$(awk -F = '/^src=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf) + AUTO_SLOW_POOL=$(awk -F = '/^tgt=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf) + AUTO_LAMIGO_MINAGE=$(awk -F = '/^min-age=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf) fi +# Set value: if not explicitly set, set auto, if no auto, set default + +LAMIGO_MINAGE=${LAMIGO_MINAGE:-${AUTO_LAMIGO_MINAGE:-${DEF_LAMIGO_MINAGE}}} +FAST_POOL=${FAST_POOL:-${AUTO_FAST_POOL:-${DEF_FAST_POOL}}} +SLOW_POOL=${SLOW_POOL:-${AUTO_SLOW_POOL:-${DEF_SLOW_POOL}}} + # List of %04x formated OST indexes -FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -e 's/.*OST\(....\).*/\1/p;d') +FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p') if [ -z "$FAST_OSTLIST" ]; then echo "Failed to find OSTs in Fast pool ($FS.$FAST_POOL)" - exit 1 + usage 2 fi -SLOW_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$SLOW_POOL|sed -e 's/.*OST\(....\).*/\1/p;d') +SLOW_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$SLOW_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p') if [ -z "$SLOW_OSTLIST" ]; then echo "Failed to find OSTs in Slow pool ($FS.$SLOW_POOL)" - exit 1 + usage 2 +fi + +if $AUTO; then + for OST in $FAST_OSTLIST; do + F=/etc/lpurge/$FS/OST${OST}.conf + if [ -f $F ]; then + AUTO_LPURGE_FREEHI=$(awk -F = '/^freehi=/{ print $2 }' $F) + AUTO_LPURGE_FREELO=$(awk -F = '/^freelo=/{ print $2 }' $F) + break + fi + done +fi + +LPURGE_FREEHI=${LPURGE_FREEHI:-${AUTO_LPURGE_FREEHI:-${DEF_LPURGE_FREEHI}}} +LPURGE_FREELO=${LPURGE_FREELO:-${AUTO_LPURGE_FREELO:-${DEF_LPURGE_FREELO}}} + +if ! is_valid_percent "${LPURGE_FREEHI}"; then + echo "Invalid FREEHI percentage '${LPURGE_FREEHI}'" + usage 34 +fi + +if ! is_valid_percent "${LPURGE_FREELO}"; then + echo "Invalid FREELO percentage '${LPURGE_FREELO}'" + usage 34 fi +if ! ((LPURGE_FREELO < LPURGE_FREEHI)); then + echo "FREELO (${LPURGE_FREELO}) must be less than FREEHI (${LPURGE_FREEHI})" + usage 34 +fi + +if ! is_minage_ok "${LAMIGO_MINAGE}"; then + echo "MINAGE (${LAMIGO_MINAGE}) must be positive integer >= 5" + usage 34 +fi + +if $HELP; then + usage 0 +fi + +echo "Using: $0 -m ${LAMIGO_MINAGE} -H ${LPURGE_FREEHI} -L ${LPURGE_FREELO} -f ${FAST_POOL} -s ${SLOW_POOL} $FS" + OSSLIST=$(es_config_get --option fs_settings.$FS.oss_list) # Die on errors @@ -108,32 +184,44 @@ set -e MOUNTSPEC=$(/opt/ddn/es/tools/mount_lustre_client --dry-run --fs $FS |awk '{ print $4 }') -echo "Creating config directories" -clush -a mkdir -p /etc/lpurge/$FS/ /etc/lamigo/ $ROOT/$FS/client +# Create client mount if it doesn't exist +if ! crm_resource -QW -r cl-$FS-client > /dev/null 2>&1; then -echo "Creating lustre client mounts for $FS" -clush -a "grep -q $FS/client /etc/fstab || (echo $MOUNTSPEC $ROOT/$FS/client lustre x-systemd.mount-timeout=20m,noauto,_netdev >> /etc/fstab)" -clush -a mkdir -p /etc/systemd/system/lustre-$FS-client.mount.d/ -cat << EOF > /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf + echo "Creating config directories" + clush -a mkdir -p /etc/lpurge/$FS/ /etc/lamigo/ $ROOT/$FS/client + + echo "Creating lustre client mounts for $FS" + clush -a "grep -q $FS/client /etc/fstab || (echo $MOUNTSPEC $ROOT/$FS/client lustre x-systemd.mount-timeout=20m,noauto,_netdev >> /etc/fstab)" + clush -a mkdir -p /etc/systemd/system/lustre-$FS-client.mount.d/ + cat << EOF > /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf [Mount] TimeoutSec=20m EOF -clush -ca /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf -clush -a systemctl daemon-reload -echo "Creating lustre client resource agent" -clush -g ha_heads crm configure </dev/null 2&>1 && crm configure << EOF || true + clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true primitive lpurge-$FS-OST$OST systemd:lpurge@$FS-OST$OST.service op monitor interval=30s colocation lpurge-$FS-$OST-with-ost inf: lpurge-$FS-OST$OST ost$INDEX-$FS order lpurge-$FS-$OST-after-ost ost$INDEX-$FS lpurge-$FS-OST$OST -order lpurge-$FS-$OST-after-client cl-$FS-client lpurge-$FS-OST$OST +rsc_ticket ticket-$FS-hotpool-allocated-lpurge-$FS-OST$OST $FS-hotpool-allocated: lpurge-$FS-OST$OST loss-policy=stop EOF" done @@ -183,10 +271,21 @@ echo "Setting up lamigo (From $FAST_POOL to $SLOW_POOL aged at least $LAMIGO_MIN for INDEX in $MDTLIST; do MDT=$(printf "%04x" $((10#$INDEX)) ) MDSHOST=$(locate_resource mdt$INDEX-$FS) - CL=$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|tail -1|awk '/^cl/{ print $1 }') + CL="" + + # reuse existing changelog user + if [ -f /etc/lamigo/$FS-MDT$MDT.conf ]; then + CL=$(awk -F = '/^user=/{ print $2 }' /etc/lamigo/$FS-MDT$MDT.conf) + # ensure changelog user exists + if [ -z "$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|grep $CL[[:space:]])" ]; then + echo "Previous changelog user for $FS-MDT$MDT: $CL is missing" + CL="" + fi + fi + # create new user if none was previously configured if [ -z "$CL" ]; then - ssh $MDSHOST lctl --device $FS-MDT$MDT changelog_register - CL=$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|tail -1|awk '/^cl/{ print $1 }') + CL=$(ssh $MDSHOST lctl --device $FS-MDT$MDT changelog_register | awk -F \' '{print $2 }') + echo "Registered new changelog user for $FS-MDT$MDT: $CL" fi echo Creating lamigo config for $FS-MDT$MDT @@ -199,7 +298,7 @@ src=$FAST_POOL tgt=$SLOW_POOL EOF for HOST in $OSSLIST; do - echo oss=$HOST >> /etc/lamigo/$FS-MDT$MDT.conf + echo oss=$HOST >> /etc/lamigo/$FS-MDT$MDT.conf done for HOST in $(cluset -e @all); do echo agent=$HOST:$ROOT/$FS/client:4 >> /etc/lamigo/$FS-MDT$MDT.conf @@ -214,15 +313,13 @@ EOF clush -ca /etc/lamigo/$FS-MDT$MDT.conf /etc/systemd/system/lamigo@$FS-MDT$MDT.service.d/override.conf echo Creating lamigo resource for $FS-MDT$MDT - clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2&>1 && crm configure << EOF || true + clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true primitive lamigo-$FS-MDT$MDT systemd:lamigo@$FS-MDT$MDT.service op start timeout=15m op monitor interval=30s colocation lamigo-$FS-MDT$MDT-with-mdt inf: lamigo-$FS-MDT$MDT mdt$INDEX-$FS order lamigo-$FS-$MDT-after-mdt mdt$INDEX-$FS lamigo-$FS-MDT$MDT -order lamigo-$FS-$MDT-after-client cl-$FS-client lamigo-$FS-MDT$MDT +rsc_ticket ticket-$FS-hotpool-allocated-lamigo-$FS-MDT$MDT $FS-hotpool-allocated: lamigo-$FS-MDT$MDT loss-policy=stop EOF" done echo "Configuration complete. Run the following command to start services:" -echo " clush -qS --group=ha_heads crm resource start cl-$FS-client" -echo "or" echo " stratagem-hp-start.sh $FS" diff --git a/scripts/stratagem-hp-convert.sh b/scripts/stratagem-hp-convert.sh new file mode 100755 index 0000000..2e342a9 --- /dev/null +++ b/scripts/stratagem-hp-convert.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# -*- indent-tabs-mode: nil -*- +# Copyright DDN 2021 + +FS=$1 + +function usage() { + echo "Usage: $(basename $0) [-h] [FILESYSTEM]" + echo " Convert Stratagem Hotpools HA environment to be ticket based" +} + +if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then + usage + exit 0 +fi + +function locate_resource() { + local res=$1 + clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true" +} + +if [ -z "$FS" ]; then + # Try automatic detection + FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq) + + if [ $(echo "$FS" |wc -w) -ne 1 ]; then + echo "Error: Unable to determine filesystem automatically, please specify" + usage + exit 1 + fi +fi + +if crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then + echo "Hotpools for $FS already using ticket" + exit 0 +fi + +MDSHOST=$(locate_resource mdt0000-$FS) +MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed -ne "s/mdt\(.*\)-$FS$/\1/p") +FAST_POOL=$(awk -F = '/^src=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf) +FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p') + +ROLE=Started +if crm_resource -r $FS-client -W |& grep -q NOT; then + ROLE=Stopped +fi + +echo "Creating hotpool ticket for $FS" +clush -g ha_heads crm configure </dev/null 2>&1 && crm configure << EOF || true +delete lpurge-$FS-$OST-after-client +rsc_ticket ticket-$FS-hotpool-allocated-lpurge-$FS-OST$OST $FS-hotpool-allocated: lpurge-$FS-OST$OST loss-policy=stop +EOF" +done + +# MDTLIST is derived from resources so it's already in decimal but formatted %04d +for INDEX in $MDTLIST; do + MDT=$(printf "%04x" $((10#$INDEX)) ) + + echo "Updating lamigo HA for $FS-MDT$MDT" + + clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true +delete lamigo-$FS-$MDT-after-client +rsc_ticket ticket-$FS-hotpool-allocated-lamigo-$FS-MDT$MDT $FS-hotpool-allocated: lamigo-$FS-MDT$MDT loss-policy=stop +EOF" +done diff --git a/scripts/stratagem-hp-start.sh b/scripts/stratagem-hp-start.sh index 0c996f0..214262d 100755 --- a/scripts/stratagem-hp-start.sh +++ b/scripts/stratagem-hp-start.sh @@ -4,15 +4,37 @@ FS=$1 +function usage() { + echo "Usage: $(basename $0) [-h] [FILESYSTEM]" + echo " Start Stratagem Hotpools HA environment" +} + +if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then + usage + exit 0 +fi + if [ -z "$FS" ]; then # Try automatic detection - FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$) + FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq) if [ $(echo "$FS" |wc -w) -ne 1 ]; then - echo "Usage: $0 FSNAME" + echo "Error: Could not automatically find filesystem, please specify" + usage exit 1 fi fi +if ! crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then + if clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then + echo "Hotpools needs conversion" + echo " Please run: stratagem-hp-convert.sh" + else + echo "Hotpools not configured" + echo " Please run: stratagem-hp-config.sh" + fi + exit 1 +fi + echo Starting Hotpools for $FS -clush -qS --group=ha_heads crm res start cl-$FS-client +clush -qS --group=ha_heads crm res start cl-$FS-client $FS-hotpool diff --git a/scripts/stratagem-hp-stop.sh b/scripts/stratagem-hp-stop.sh index 1f2c7bd..dba265b 100755 --- a/scripts/stratagem-hp-stop.sh +++ b/scripts/stratagem-hp-stop.sh @@ -4,15 +4,37 @@ FS=$1 +function usage() { + echo "Usage: $(basename $0) [-h] [FILESYSTEM]" + echo " Stop stratagem hotpools in HA environment" +} + +if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then + usage + exit 0 +fi + if [ -z "$FS" ]; then # Try automatic detection - FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$) + FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq) if [ $(echo "$FS" |wc -w) -ne 1 ]; then - echo "Usage: $0 FSNAME" + echo "Error: Could not automatically find filesystem, please specify" + usage exit 1 fi fi -echo Stopping Hotpools for $FS -clush -qS --group=ha_heads crm res stop cl-$FS-client +if crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then + echo Stopping Hotpools for $FS + clush -qS --group=ha_heads crm res stop $FS-hotpool + +elif clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then + echo "Stopping old style Hotpools (see stratagem-hp-convert.sh)" + clush -qS -g ha_heads crm res stop cl-$FS-client + +else + echo "Hotpools not configured" + echo " Please run: stratagem-hp-config.sh" + exit 1 +fi diff --git a/scripts/stratagem-hp-teardown-client.sh b/scripts/stratagem-hp-teardown-client.sh new file mode 100755 index 0000000..b579878 --- /dev/null +++ b/scripts/stratagem-hp-teardown-client.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# -*- indent-tabs-mode: nil -*- +# Copyright DDN 2021 +# +# Stops and remove cloned client mount +# Removes: +# resources $FS-mount (the cloned mount) +# client mount from /etc/fstab + +FS=$1 + +# Set a 10+ minute timeout for waitfor() +TIMEOUT=${TIMEOUT:-600} + +function usage() { + echo "Usage: $(basename $0) [-h] [-t SECS] [FILESYSTEM]" + echo " Tear down server client mount HA environment" + echo " -t SECS - Max timeout to wait for services to stop (default: $TIMEOUT)" +} + +while getopts "ht:" opt; do + case $opt in + h) usage; exit 0;; + t) TIMEOUT=$OPTARG;; + esac +done + +if [ "$FS" = "--help" ]; then + usage + exit 0 +fi + +if [ -z "$FS" ]; then + # Try automatic detection + FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq) + + if [ $(echo "$FS" |wc -w) -ne 1 ]; then + echo "Usage: $0 FSNAME" + exit 1 + fi +fi + +waitfor () { + local ids=$* + + [ -z "$ids" ] && return + + echo -n "Waiting for $ids" + + local deadline=$((SECONDS+TIMEOUT)) + + local done=false + until $done || ((deadline < SECONDS)); do + + done=true + for i in $ids; do + # This finds the host the resource is active on (empty it is stopped) + res=$(clush -qg ha_heads crm_resource -QW -r $i 2> /dev/null) + if [ -n "$res" ]; then + done=false + echo -n "." + sleep 1 + break + fi + done + done + + echo "" + + if ! $done; then + echo "Waiting for $ids TIMED OUT!" + exit 1 + fi +} + +res=$(clush -qNg ha_heads crm_resource -l 2> /dev/null|grep lamigo-$FS-MDT0000) +if [ -n "$res" ]; then + echo "ERROR: Please tear down Hotpools first" + exit 1 +fi + +echo Stopping Client +clush -qS --group=ha_heads crm res stop cl-$FS-client + +waitfor cl-$FS-client + +clush -qS --group=ha_heads crm config del $FS-client + +echo "Removing client mount from fstab" +clush -a "sed -i '/\/$FS\/client.*lustre/d' /etc/fstab" diff --git a/scripts/stratagem-hp-teardown.sh b/scripts/stratagem-hp-teardown.sh index bc7b489..fdff7b7 100755 --- a/scripts/stratagem-hp-teardown.sh +++ b/scripts/stratagem-hp-teardown.sh @@ -1,15 +1,44 @@ #!/bin/bash # -*- indent-tabs-mode: nil -*- # Copyright DDN 2020 +# +# After stopping hotpools (via ticket) +# Removes: +# resources lamigo-$FS-* +# resources lpurge-$FS-* +# resource $FS-hotpool +# ticket $FS-hotpool-allocated +# +# Leaves: +# all lamigo & lpurge config files +# client mount FS=$1 -# Set a 10+ minute timeout for waitfor() +# Set a default 10+ minute timeout for waitfor() TIMEOUT=${TIMEOUT:-600} +function usage() { + echo "Usage: $(basename $0) [-h] [-t SECS] [FILESYSTEM]" + echo " Tear down Stratagem Hotpools HA environment" + echo " -t SECS - Max timeout to wait for services to stop (default: $TIMEOUT)" +} + +while getopts "ht:" opt; do + case $opt in + h) usage; exit 0;; + t) TIMEOUT=$OPTARG;; + esac +done + +if [ "$FS" = "--help" ]; then + usage + exit 0 +fi + if [ -z "$FS" ]; then # Try automatic detection - FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$) + FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq) if [ $(echo "$FS" |wc -w) -ne 1 ]; then echo "Usage: $0 FSNAME" @@ -18,42 +47,54 @@ if [ -z "$FS" ]; then fi waitfor () { - IDS=$* + local ids=$* - [ -z "$IDS" ] && return + [ -z "$ids" ] && return - echo -n "Waiting for $IDS" + echo -n "Waiting for $ids" - local timeout=$TIMEOUT + local deadline=$((SECONDS+TIMEOUT)) - local NOTDONE=true - while $NOTDONE && ((timeout > 0)); do + local done=false + until $done || ((deadline < SECONDS)); do - NOTDONE=false - for i in $IDS; do + done=true + for i in $ids; do # This finds the host the resource is active on (empty it is stopped) res=$(clush -qg ha_heads crm_resource -QW -r $i 2> /dev/null) if [ -n "$res" ]; then - NOTDONE=true + done=false echo -n "." sleep 1 break fi done - - let timeout-=1 done echo "" - if ((timeout == 0)); then - echo "Waiting for $IDS TIMED OUT!" + if ! $done; then + echo "Waiting for $ids TIMED OUT!" exit 1 fi } +if ! crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then + if clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then + echo "Hotpools needs conversion" + echo " Please run: stratagem-hp-convert.sh" + exit 1 + else + echo "Hotpools not configured" + exit 0 + fi +fi + +echo "Stopping Hotpools for $FS" +clush -qS --group=ha_heads crm res stop $FS-hotpool + for host in $(cluset -e @ha_heads); do - IDS=$(ssh $host crm_resource --list-raw|egrep "(lamigo|lpurge)-$FS") + IDS=$(ssh $host crm_resource --list-raw|egrep "^(lamigo|lpurge)-$FS-") if [ -n "$IDS" ]; then echo Stopping lamigo and lpurge on $host ssh $host crm res stop $IDS @@ -64,13 +105,10 @@ for host in $(cluset -e @ha_heads); do fi done +echo "Removing Hotpool ticket for $FS" +clush -qS --group=ha_heads crm config del $FS-hotpool -echo Stopping Client -clush -qS --group=ha_heads crm res stop cl-$FS-client - -waitfor cl-$FS-client - -clush -qS --group=ha_heads crm config del $FS-client +clush -qS --group=ha_heads crm_ticket --ticket $FS-hotpool-allocated --cleanup -echo "Removing client mount from fstab" -clush -a "sed -i '/\/$FS\/client.*lustre/d' /etc/fstab" +echo "Local client is still configured" +echo " to remove run: stratagem-hp-teardown-client.sh" diff --git a/src/Makefile.am b/src/Makefile.am index 901c554..792c8ff 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -26,6 +26,8 @@ LIPE_SOURCES = cmd.c cmd.h debug.c debug.h fake_lustre_disk.h \ lustre_ea.c lustre_ea.h \ lipe_ldiskfs.c lipe_ldiskfs.h \ lipe_object_attrs.h \ + lipe_version.c \ + lipe_version.h \ list.h policy.h policy.c \ ldiskfs_read_ldd.c ldiskfs_read_ldd.h \ general_policy.h general_policy.c \ @@ -67,9 +69,9 @@ lamigo_SOURCES = lamigo.c lamigo.h lamigo_alr.c lamigo_hash.c lamigo_hash.h \ lamigo_CFLAGS = $(LIPE_CFLAGS) lamigo_LDFLAGS = $(LIPE_LDFLAGS) -lssh -lssh_threads -lpurge_SOURCES = lpurge.c $(LIPE_SOURCES) $(lipe_ssh_SOURCES) +lpurge_SOURCES = lpurge.c $(LIPE_SOURCES) lpurge_CFLAGS = $(LIPE_CFLAGS) -lpurge_LDFLAGS = $(LIPE_LDFLAGS) -lssh -lssh_threads +lpurge_LDFLAGS = $(LIPE_LDFLAGS) lipe_expression_test_SOURCES = lipe_expression_test.c $(LIPE_SOURCES) lipe_expression_test_CFLAGS = $(LIPE_CFLAGS) diff --git a/src/lamigo.c b/src/lamigo.c index 68b5763..323a5aa 100644 --- a/src/lamigo.c +++ b/src/lamigo.c @@ -21,6 +21,7 @@ * - auto-config with lfs df -v to find flash OSTs * - tests */ +#include #include #include #include @@ -60,6 +61,7 @@ #include "general_policy.h" #include "list.h" #include "lipe_config.h" +#include "lipe_version.h" #define DEF_POOL_REFRESH_INTV (10 * 60) #define DEF_PROGRESS_INTV (10 * 60) @@ -130,7 +132,8 @@ static void usage(void) "\t--hot-fraction, files with heat >= are hot (default: %d)\n" "\t--hot-after-idle, hot files after N idle periods to be mirrored (default: %d)\n" "\t--src-free, mirroring to source OST pool is prohibited if pool has less space (default: %d%%)\n" - "\t--tgt-free, mirroring to target OST pool is prohibited if pool has less space (default: %d%%)\n", + "\t--tgt-free, mirroring to target OST pool is prohibited if pool has less space (default: %d%%)\n" + "\t--version, print version information and exit\n", program_invocation_short_name, DEF_MIN_AGE, DEF_CACHE_SIZE >> 20, @@ -264,6 +267,8 @@ struct mirror_opts { }; __u64 lamigo_last_processed_idx; /* changelog index */ +static time_t lamigo_last_cleared; +static __u64 lamigo_last_cleared_index; struct lipe_list_head lamigo_agent_list; static int lamigo_agent_count; int lamigo_max_jobs; /* max jobs for all agents */ @@ -288,6 +293,17 @@ struct pool_list *src_pools; /* source pools */ struct pool_list *tgt_pools; /* target pool */ static void *lamigo_refresh_statfs_thread(void *arg); +inline int are_agents_busy(void) +{ + if (opt.o_iml_re_socket) + return 0; + + if (lamigo_jobs_running >= lamigo_max_jobs) + return 1; + + return 0; +} + /* Convert human readable size string to and int; "1k" -> 1000 */ static int strsize2int(long *sizep, char *str) { @@ -324,6 +340,24 @@ static int strsize2int(long *sizep, char *str) } } +static void systemf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +static void systemf(const char *fmt, ...) +{ + char *cmd = NULL; + va_list args; + int rc; + + va_start(args, fmt); + rc = vasprintf(&cmd, fmt, args); + va_end(args); + + if (rc < 0) + return; + + system(cmd); + free(cmd); +} + static struct lipe_flist *lamigo_flist_init(char *socket) { struct lipe_flist *flist = flist_alloc(NULL, 1024, @@ -344,6 +378,8 @@ void lamigo_usr1_handle(int sig) struct resync_agent *a; struct resync_job *j; struct pool_list *pl; + struct tm *tmtime; + char timestr[32]; FILE *f; int i; @@ -355,11 +391,15 @@ void lamigo_usr1_handle(int sig) llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "can't open dump file\n"); return; } - fprintf(f, "config:\n" + fprintf(f, + "version: %s-%s\n" + "revision: %s\n" + "config:\n" " chlg_user: %s\n" " mdtname: %s\n" " mountpoint: %s\n" " source_pool: ", + PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION, opt.o_chlg_user, opt.o_mdtname, opt.o_mntpt); i = 0; for (pl = src_pools; pl != NULL; pl = pl->pl_next, i++) @@ -429,6 +469,16 @@ void lamigo_usr1_handle(int sig) stats.s_skip_hot, stats.s_replicate_ro2hot, stats.s_replicate_rw2hot, stats.s_replicate_rw2cold); + tmtime = localtime(&lamigo_last_cleared); + strftime(timestr, sizeof(timestr), "%c", tmtime); + + fprintf(f, "changelog:\n" + " last_processed_idx: %llu\n" + " last_cleared_idx: %llu\n" + " last_cleared_time: %lu # ( %s )\n", + lamigo_last_processed_idx, lamigo_last_cleared_index, + lamigo_last_cleared, timestr); + #define AGENT_FMT \ " agent%d: { host: %s, mnt: %s, jobs: %u, max: %u, state: %s }\n" @@ -510,7 +560,6 @@ static void lamigo_init_vars(void) signal(SIGUSR1, lamigo_null_handler); signal(SIGUSR2, lamigo_null_handler); - signal(SIGCHLD, lamigo_null_handler); LIPE_INIT_LIST_HEAD(&head.lh_list); LIPE_INIT_LIST_HEAD(&lamigo_job_list); @@ -534,7 +583,7 @@ static void lamigo_cleanup(void) lipe_list_for_each_entry_safe(rss, tmp, &agent->rag_ssh_list, rss_list) { - lipe_cleanup_ssh_session(&rss->rss_ctx); + lipe_ssh_context_destroy(&rss->rss_ctx); lipe_list_del(&rss->rss_list); free(rss); } @@ -586,19 +635,12 @@ static int lamigo_exec_cmd(struct resync_agent *a, char *cmd) lipe_list_del(&rss->rss_list); pthread_mutex_unlock(&a->rag_ssh_lock); - if (lipe_ssh_status(&rss->rss_ctx) != SSH_OK) { - rc = lipe_init_ssh_session(&rss->rss_ctx, a->rag_hostname); - if (rc != SSH_OK) - goto out; - } - rc = lipe_ssh_exec(&rss->rss_ctx, cmd); if (rc) - llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, + llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, "error executing ssh command '%s' on '%s': rc = %d\n", cmd, a->rag_hostname, rc); -out: pthread_mutex_lock(&a->rag_ssh_lock); lipe_list_add(&rss->rss_list, &a->rag_ssh_list); pthread_cond_signal(&a->rag_ssh_cond); @@ -617,8 +659,8 @@ void *lamigo_replicate_one(void *args) if (rj->rj_setprefer) { snprintf(cmd, sizeof(cmd), - "lfs setstripe --comp-set --comp-flags=prefer -p %s " - "%s/.lustre/fid/"DFID" >&/dev/null", rj->rj_pool, + "lfs setstripe --comp-set --comp-flags=prefer --pool='%s' " + "'%s/.lustre/fid/"DFID"' >&/dev/null", rj->rj_pool, agent->rag_mountpoint, PFID(&rj->rj_fid)); llapi_printf(LLAPI_MSG_DEBUG, "set prefer on "DFID"\n", @@ -627,20 +669,20 @@ void *lamigo_replicate_one(void *args) int i; i = snprintf(cmd, sizeof(cmd), - "%s --pool=%s", + "%s --pool='%s'", opt.o_mirror_cmd, rj->rj_pool); if (rj->rj_stripes > 0) i += snprintf(cmd + i, sizeof(cmd) - i, - " -c %d", rj->rj_stripes); + " --stripe-count=%d", rj->rj_stripes); if (rj->rj_mirror_opts) i += snprintf(cmd + i, sizeof(cmd) - i, - " --flags=%s", rj->rj_mirror_opts); + " --flags='%s'", rj->rj_mirror_opts); i += snprintf(cmd + i, sizeof(cmd) - i, - " %s/.lustre/fid/"DFID" > /dev/null 2>&1", + " '%s/.lustre/fid/"DFID"' > /dev/null 2>&1", agent->rag_mountpoint, PFID(&rj->rj_fid)); } else if (resync == AMIGO_RESYNC_RESYNC) { snprintf(cmd, sizeof(cmd), - "lfs mirror resync %s/.lustre/fid/"DFID" > /dev/null 2>&1", + "lfs mirror resync '%s/.lustre/fid/"DFID"' > /dev/null 2>&1", agent->rag_mountpoint, PFID(&rj->rj_fid)); } else { @@ -1057,7 +1099,7 @@ static int lamigo_get_attrs(const struct lu_fid *fid, /* the subsequent steps to fetch EAs are very expensive * skip if possible */ - if (need_attr == 0 && 0) + if (need_attr == 0) return 0; /* XXX: would be great to get all EAs with a single call */ @@ -1180,7 +1222,7 @@ void *lamigo_check_agent_func(void *args) char cmd[PATH_MAX]; struct resync_agent *a = (struct resync_agent *)args; - snprintf(cmd, sizeof(cmd), "lfs path2fid %s > /dev/null 2>&1", + snprintf(cmd, sizeof(cmd), "lfs path2fid '%s' > /dev/null 2>&1", a->rag_mountpoint); rc = lamigo_exec_cmd(a, cmd); @@ -1296,6 +1338,12 @@ static int lamigo_update_one(struct fid_rec *f) return 0; } + if (are_agents_busy()) { + /* all the agents are busy */ + llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "no agents avilable (max: %d)\n", lamigo_max_jobs); + return 1; + } + /* prevent hot file migration from hot pool to slow */ rc = lamigo_alr_check_is_hot(&f->fr_fh.fh_fid, &ah); if (rc) { @@ -1331,12 +1379,6 @@ static int lamigo_update_one(struct fid_rec *f) return 0; } - if (lamigo_jobs_running >= lamigo_max_jobs) { - /* all the agents are busy */ - llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "no agents avilable (max: %d)\n", lamigo_max_jobs); - return 1; - } - rj = calloc(1, sizeof(struct resync_job)); if (rj == NULL) { llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "can't allocate for a job\n"); @@ -1390,7 +1432,7 @@ static int lamigo_check_sync(void) int count; /* try to resubmit failed jobs first */ - while (!lipe_list_empty(&lamigo_failed_job_list)) { + while (!lipe_list_empty(&lamigo_failed_job_list) && !are_agents_busy()) { struct resync_job *rj; rj = lipe_list_entry(lamigo_failed_job_list.next, @@ -1528,9 +1570,6 @@ static unsigned long get_fid_cache_size(int pct) return cache_size; } -static time_t lamigo_last_cleared; -static __u64 lamigo_last_cleared_index; - static void lamigo_check_and_clear_changelog(void) { struct resync_job *tmp; @@ -1542,10 +1581,9 @@ static void lamigo_check_and_clear_changelog(void) index = lamigo_last_processed_idx; /* this checks prevents too frequent in-memory-search for - * the minimal record-in-work: every 256th record and not - * frequently than once a 5 seconds + * the minimal record-in-work, not frequently than once a 3 seconds */ - if ((index & 256) == 0 && time(NULL) - lamigo_last_cleared < 5) + if (time(NULL) - lamigo_last_cleared < 3) return; lamigo_last_cleared = time(NULL); @@ -1564,68 +1602,56 @@ static void lamigo_check_and_clear_changelog(void) index = tmp->rj_index; } - if (index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency) + lipe_list_for_each_entry(tmp, &lamigo_failed_job_list, rj_list) { + if (tmp->rj_check_job || !tmp->rj_index) + continue; + if (tmp->rj_index < index) + index = tmp->rj_index; + } + + if (index < lamigo_last_cleared_index || + index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency) return; llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "CLEAR upto %llu in %s (%llu last)\n", index, opt.o_chlg_user, lamigo_last_processed_idx); lamigo_last_cleared_index = index; rc = llapi_changelog_clear(opt.o_mdtname, opt.o_chlg_user, index); - if (rc) + if (rc < 0) { llapi_error(LLAPI_MSG_ERROR, rc, - "failed to clear changelog record: %s:%llu", - opt.o_chlg_user, index); + "failed to clear changelog record %s:%llu (%llu last)", + opt.o_chlg_user, index, lamigo_last_cleared_index); + systemf("lctl get_param 'mdd.%s.changelog_users'", opt.o_mdtname); + } } -static void lamigo_check_jobs(void) +static void lamigo_job_fini(struct resync_job *rj, intptr_t retval) { - struct resync_job *rj = NULL, *tmp; - int rc = 0; - intptr_t retval; - - lipe_list_for_each_entry(tmp, &lamigo_job_list, rj_list) { - rc = pthread_tryjoin_np(tmp->rj_pid, (void **)&retval); - if (rc == 0) { - rj = tmp; - break; - } else if (rc != EBUSY) { - break; - } - } - - if (rj == NULL) { - if (rc && rc != EBUSY) - llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, "unknown job %d exited\n", - rc); - return; - } - llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n", rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start, retval, rj->rj_agent->rag_bad); - lipe_list_del(&rj->rj_list); - if (rj->rj_check_job) { rj->rj_agent->rag_check_in_progress = 0; if (retval == 0) { /* the agent is back */ - llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is back\n", - rj->rj_agent->rag_hostname); if (rj->rj_agent->rag_bad) { + llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is back\n", + rj->rj_agent->rag_hostname); rj->rj_agent->rag_bad = false; lamigo_max_jobs += rj->rj_agent->rag_maxjobs; } } else { /* the agent is still bad */ if (rj->rj_agent->rag_bad == false) { + llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is bad\n", + rj->rj_agent->rag_hostname); + assert(lamigo_max_jobs >= rj->rj_agent->rag_maxjobs); lamigo_max_jobs -= rj->rj_agent->rag_maxjobs; rj->rj_agent->rag_bad = true; } - llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is still bad\n", - rj->rj_agent->rag_hostname); } free(rj); return; @@ -1662,19 +1688,40 @@ static void lamigo_check_jobs(void) /* no striping info or file disappeared */ /* no need to handle, forget the changelog record */ } else if (retval == 0) { - stats.s_replicated++; + if (!rj->rj_setprefer) + stats.s_replicated++; } if (rj) free(rj); } +static void lamigo_check_jobs(void) +{ + struct resync_job *rj, *tmp; + int rc = 0; + intptr_t retval; + + lipe_list_for_each_entry_safe(rj, tmp, &lamigo_job_list, rj_list) { + rc = pthread_tryjoin_np(rj->rj_pid, (void **)&retval); + if (rc == EBUSY) { + continue; + } else if (rc != 0) { + llapi_error(LLAPI_MSG_ERROR, rc, "cannot join thread %lld", + (long long)rj->rj_pid); + } else { + lipe_list_del(&rj->rj_list); + lamigo_job_fini(rj, retval); + } + } +} + static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) { struct resync_agent *a; int i; - a = malloc(sizeof(*a)); + a = calloc(1, sizeof(*a)); if (!a) { llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, "can't allocate memory for agent\n"); @@ -1715,8 +1762,8 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) /* ssh context per job, and one more for agent heartbeat */ for (i = 0; i < a->rag_maxjobs + 1; i++) { - struct resync_ssh_session *rss = - malloc(sizeof(struct resync_ssh_session)); + struct resync_ssh_session *rss = calloc(1, sizeof(*rss)); + int rc; if (!rss) { llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, @@ -1724,6 +1771,14 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) exit(1); } + rc = lipe_ssh_context_init(&rss->rss_ctx, a->rag_hostname); + if (rc < 0) { + llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, + "cannot create SSH context for '%s'\n", + a->rag_hostname); + exit(1); + } + lipe_list_add(&rss->rss_list, &a->rag_ssh_list); } @@ -1750,6 +1805,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) #define LAMIGO_OPT_STATFS_REFRESH 15 #define LAMIGO_OPT_SRC_FREE 16 #define LAMIGO_OPT_TGT_FREE 17 +#define LAMIGO_OPT_VERSION 18 static struct option options[] = { { "ignore-reads", no_argument, NULL, LAMIGO_OPT_IGNORE_READS}, @@ -1790,6 +1846,7 @@ static struct option options[] = { { "tgt", required_argument, NULL, 't'}, { "user", required_argument, 0, 'u'}, { "verbose", no_argument, NULL, 'v'}, + { "version", no_argument, NULL, LAMIGO_OPT_VERSION }, { NULL } }; @@ -2082,6 +2139,9 @@ void lamigo_process_opt(int c, char *optarg) exit(1); } break; + case LAMIGO_OPT_VERSION: + lipe_version(); + exit(0); case 'a': opt.o_min_age = strtol(optarg, &endptr, 10); if (*endptr != '\0' || opt.o_min_age < 5) { @@ -2594,7 +2654,7 @@ int lamigo_lipe_callback(struct lipe_instance *instance, lamigo_check_jobs(); do { - while (lamigo_jobs_running >= lamigo_max_jobs) { + while (are_agents_busy()) { lamigo_check_jobs(); lamigo_check_sync(); lamigo_check_bad_agents(); @@ -3194,6 +3254,7 @@ int main(int argc, char **argv) int ret = 0; pthread_t pid; + lipe_version_init(); ssh_threads_set_callbacks(ssh_threads_get_pthread()); ssh_init(); @@ -3208,6 +3269,10 @@ int main(int argc, char **argv) * followed by the MDT name ("lamigo lustre-MDT0000"). */ llapi_set_command_name(opt.o_mdtname); + llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, + "version %s-%s, revision %s\n", + PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION); + rc = lamigo_init_cache(); if (rc < 0) { llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, "can't init cache\n"); @@ -3276,11 +3341,15 @@ int main(int argc, char **argv) if (enable_heat) lamigo_check_hot(); - rc = lamigo_check_sync(); - if (rc < 0) { - stop = true; - ret = rc; + + if (!are_agents_busy()) { + rc = lamigo_check_sync(); + if (rc < 0) { + stop = true; + ret = rc; + } } + lamigo_check_jobs(); lamigo_check_and_clear_changelog(); lamigo_check_bad_agents(); @@ -3305,6 +3374,7 @@ int main(int argc, char **argv) out: lamigo_cleanup(); llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, "exited\n"); + lipe_version_fini(); return ret; } diff --git a/src/lamigo_alr.c b/src/lamigo_alr.c index fa28ce2..aceb3ce 100644 --- a/src/lamigo_alr.c +++ b/src/lamigo_alr.c @@ -261,20 +261,14 @@ void *lamigo_alr_data_collection_thread(void *arg) opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); repeat: - rc = lipe_init_ssh_session(&ala->ala_ctx, ala->ala_host); - if (rc) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "failed to create connection to agent [%s], %d\n", - ala->ala_host, rc); - goto again; - } - - rc = lipe_ssh_start_cmd(ala->ala_ctx.lsc_session, cmd, &channel); + rc = lipe_ssh_start_cmd(&ala->ala_ctx, cmd, &channel); if (rc != SSH_OK) { - llapi_error(LLAPI_MSG_ERROR, rc, - "cannot start ofd_access_log_readed\n"); + llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, + "cannot start ofd_access_log_reader on host '%s': rc = %d\n", + ala->ala_host, rc); goto err; } + llapi_err_noerrno(LLAPI_MSG_DEBUG, "alr agent on %s is running\n", ala->ala_host); @@ -303,7 +297,6 @@ err: ssh_channel_close(channel); ssh_channel_free(channel); -again: /* wait for a while */ /* XXX: should be configurable? */ sleep(5); @@ -907,11 +900,17 @@ void lamigo_alr_init(void) void lamigo_add_alr_agent(const char *host) { struct alr_agent *ala; + int rc; - ala = calloc(sizeof(*ala), 1); - assert(ala); + ala = calloc(1, sizeof(*ala)); + assert(ala != NULL); ala->ala_host = strdup(host); + assert(ala->ala_host != NULL); + + rc = lipe_ssh_context_init(&ala->ala_ctx, ala->ala_host); + assert(rc == SSH_OK); + lipe_list_add(&ala->ala_list, &alr_agent_list); } diff --git a/src/lipe_ssh.c b/src/lipe_ssh.c index 5df17d9..43ac483 100644 --- a/src/lipe_ssh.c +++ b/src/lipe_ssh.c @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -6,61 +8,59 @@ #include "lipe_ssh.h" -int lipe_ssh_authenticate_pubkey(ssh_session session) -{ - int rc; - - rc = ssh_userauth_publickey_auto(session, NULL, NULL); - if (rc != SSH_AUTH_SUCCESS) { - llapi_printf(LLAPI_MSG_ERROR, "Authentication failed: %s\n", - ssh_get_error(session)); - return rc; - } - - return SSH_AUTH_SUCCESS; -} +#define lipe_ssh_debug(fmt, args...) \ + llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, fmt, ##args) +#define lipe_ssh_error(fmt, args...) \ + llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, fmt, ##args) -int lipe_ssh_start_cmd(ssh_session session, const char *cmd, ssh_channel *rch) +static int lipe_ssh_session_start_cmd(ssh_session session, const char *cmd, ssh_channel *pchannel) { - ssh_channel channel; + ssh_channel channel = NULL; int rc; + assert(session != NULL); + channel = ssh_channel_new(session); if (channel == NULL) { - llapi_printf(LLAPI_MSG_ERROR, - "Error allocating a new channel: %s\n", - ssh_get_error(session)); - return SSH_ERROR; + lipe_ssh_error("canot create a new SSH channel: %s\n", + ssh_get_error(session)); + rc = SSH_ERROR; + goto out; } rc = ssh_channel_open_session(channel); if (rc != SSH_OK) { - llapi_printf(LLAPI_MSG_ERROR, - "Error opening a session channel: %s\n", - ssh_get_error(session)); - ssh_channel_free(channel); - return rc; + lipe_ssh_error("cannot open SSH session channel: %s\n", + ssh_get_error(session)); + goto out; } rc = ssh_channel_request_exec(channel, cmd); if (rc != SSH_OK) { - llapi_printf(LLAPI_MSG_ERROR, "Error executing command: %s\n", - ssh_get_error(session)); - ssh_channel_close(channel); - ssh_channel_free(channel); - return rc; + lipe_ssh_error("cannot execute SSH command: %s\n", + ssh_get_error(session)); + goto out; } - *rch = channel; - return SSH_OK; + + *pchannel = channel; + channel = NULL; + rc = SSH_OK; +out: + if (channel != NULL) + ssh_channel_close(channel); + + ssh_channel_free(channel); + + return rc; } -int lipe_ssh_exec_cmd(ssh_session session, const char *cmd) +static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd) { ssh_channel channel; int rc; - rc = lipe_ssh_start_cmd(session, cmd, &channel); + rc = lipe_ssh_session_start_cmd(session, cmd, &channel); if (rc != SSH_OK) return rc; @@ -72,114 +72,140 @@ int lipe_ssh_exec_cmd(ssh_session session, const char *cmd) return rc; } -int lipe_init_ssh_session(struct lipe_ssh_context *ctx, - const char *host) +static void lipe_ssh_session_destroy(ssh_session *psession) +{ + if (*psession != NULL) + ssh_disconnect(*psession); + + ssh_free(*psession); + *psession = NULL; +} + +static int lipe_ssh_session_create(ssh_session *psession, const char *host) { - int timeout = 5; + ssh_session session = NULL; + long timeout = 5; int rc; - ctx->lsc_host = strdup(host); - if (ctx->lsc_host == NULL) - return SSH_ERROR; + assert(SSH_OK == 0); + assert(SSH_AUTH_SUCCESS == 0); + assert(SSH_ERROR < 0); + + assert(psession != NULL); + assert(*psession == NULL); + assert(host != NULL); - pthread_mutex_init(&ctx->lsc_session_mutex, NULL); - /* Create a new ssh session and set options */ - ctx->lsc_session = ssh_new(); - if (ctx->lsc_session == NULL) { - llapi_printf(LLAPI_MSG_ERROR, - "Error creating a new ssh session\n"); + lipe_ssh_debug("creating new SSH session for host '%s'\n", host); + + session = ssh_new(); + if (session == NULL) { + lipe_ssh_error("cannot create a new SSH session: %s\n", + strerror(ENOMEM)); /* Probably. */ rc = SSH_ERROR; - goto out_host; + goto out; } - ssh_options_set(ctx->lsc_session, SSH_OPTIONS_HOST, host); + rc = ssh_options_set(session, SSH_OPTIONS_HOST, host); + if (rc != SSH_OK) { + lipe_ssh_error("cannot set SSH session host to '%s: %s'\n", + host, ssh_get_error(session)); + goto out; + } - if (ssh_options_set(ctx->lsc_session, SSH_OPTIONS_TIMEOUT, - &timeout) < 0) { - llapi_printf(LLAPI_MSG_FATAL, - "Error setting ssh timeout\n"); - rc = SSH_ERROR; + rc = ssh_options_set(session, SSH_OPTIONS_TIMEOUT, &timeout); + if (rc != SSH_OK) { + lipe_ssh_error("cannot set SSH timeout to %ld: %s\n", + timeout, ssh_get_error(session)); goto out; } /* Connect to the ssh server */ - rc = ssh_connect(ctx->lsc_session); + rc = ssh_connect(session); if (rc != SSH_OK) { - llapi_printf(LLAPI_MSG_ERROR, - "Error connecting to host %s: %s\n", - host, - ssh_get_error(ctx->lsc_session)); + lipe_ssh_error("cannot connect SSH session to host '%s': %s\n", + host, ssh_get_error(session)); goto out; } /* Automatically authenticate with public key */ - rc = lipe_ssh_authenticate_pubkey(ctx->lsc_session); - if (rc == SSH_AUTH_SUCCESS) - return rc; - - llapi_printf(LLAPI_MSG_ERROR, - "Error authenticating pubkey: %s\n", - ssh_get_error(ctx->lsc_session)); + rc = ssh_userauth_publickey_auto(session, NULL, NULL); + if (rc != SSH_AUTH_SUCCESS) { + lipe_ssh_error("cannot authenticate SSH session to host '%s': %s\n", + host, ssh_get_error(session)); + goto out; + } - ssh_disconnect(ctx->lsc_session); + *psession = session; + session = NULL; + rc = SSH_OK; out: - ssh_free(ctx->lsc_session); - ctx->lsc_session = NULL; -out_host: - free(ctx->lsc_host); + lipe_ssh_debug("create new SSH session for host '%s': rc = %d\n", host, rc); + lipe_ssh_session_destroy(&session); return rc; } -int lipe_reset_ssh_session(struct lipe_ssh_context *ctx) +static int lipe_ssh_context_check(struct lipe_ssh_context *ctx) { - int rc; - char *hostname; - - pthread_mutex_lock(&ctx->lsc_session_mutex); - hostname = strdup(ctx->lsc_host); - if (!hostname) { - pthread_mutex_unlock(&ctx->lsc_session_mutex); - return -ENOMEM; - } - lipe_cleanup_ssh_session(ctx); - rc = lipe_init_ssh_session(ctx, hostname); - pthread_mutex_unlock(&ctx->lsc_session_mutex); - free(hostname); + if (ctx->lsc_session == NULL) + return lipe_ssh_session_create(&ctx->lsc_session, ctx->lsc_host); - return rc; + return SSH_OK; } -int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd) +static void lipe_ssh_context_fail(struct lipe_ssh_context *ctx) { - int rc; - - /* Execute a remote command */ - rc = lipe_ssh_exec_cmd(ctx->lsc_session, cmd); - if (rc == SSH_OK) - return SSH_OK; - - /* Reset ssh session and try again */ - if (rc == SSH_ERROR || rc == SSH_AGAIN) { - rc = lipe_reset_ssh_session(ctx); - if (rc) - return rc; - rc = lipe_ssh_exec_cmd(ctx->lsc_session, cmd); - } - - return rc; + lipe_ssh_debug("fail SSH context for host '%s'\n", ctx->lsc_host); + lipe_ssh_session_destroy(&ctx->lsc_session); } -void lipe_cleanup_ssh_session(struct lipe_ssh_context *ctx) +void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx) { - ssh_disconnect(ctx->lsc_session); - ssh_free(ctx->lsc_session); + lipe_ssh_session_destroy(&ctx->lsc_session); free(ctx->lsc_host); + ctx->lsc_host = NULL; } -int lipe_ssh_status(struct lipe_ssh_context *ctx) +int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host) { - if (ctx->lsc_session == NULL) + memset(ctx, 0, sizeof(*ctx)); + + ctx->lsc_host = strdup(host); + if (ctx->lsc_host == NULL) return SSH_ERROR; + + /* Session creation will be done on demand. */ + return SSH_OK; } + +int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel) +{ + int rc; + + rc = lipe_ssh_context_check(ctx); + if (rc != SSH_OK) + return rc; + + rc = lipe_ssh_session_start_cmd(ctx->lsc_session, cmd, pchannel); + if (rc != SSH_OK) + lipe_ssh_context_fail(ctx); + + return rc; +} + +int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd) +{ + int rc; + + rc = lipe_ssh_context_check(ctx); + if (rc != SSH_OK) + return rc; + + /* Execute a remote command */ + rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd); + if (rc < 0) + lipe_ssh_context_fail(ctx); + + return rc; +} diff --git a/src/lipe_ssh.h b/src/lipe_ssh.h index 5d5e224..57ab9d8 100644 --- a/src/lipe_ssh.h +++ b/src/lipe_ssh.h @@ -4,23 +4,17 @@ #ifndef _LIPE_SSH_H_ #define _LIPE_SSH_H_ -#include #include struct lipe_ssh_context { char *lsc_host; ssh_session lsc_session; - pthread_mutex_t lsc_session_mutex; }; -int lipe_ssh_authenticate_pubkey(ssh_session session); -int lipe_ssh_exec_cmd(ssh_session session, const char *cmd); -int lipe_init_ssh_session(struct lipe_ssh_context *ctx, - const char *host); -void lipe_cleanup_ssh_session(struct lipe_ssh_context *ctx); -int lipe_reset_ssh_session(struct lipe_ssh_context *ctx); +int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host); +void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx); + int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd); -int lipe_ssh_start_cmd(ssh_session session, const char *cmd, ssh_channel *rch); -int lipe_ssh_status(struct lipe_ssh_context *ctx); +int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel); #endif /* _LIPE_SSH_H_ */ diff --git a/src/lipe_version.c b/src/lipe_version.c new file mode 100644 index 0000000..e8b2342 --- /dev/null +++ b/src/lipe_version.c @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include "lipe_version.h" + +/* A grep-able string for stripped binaries. */ +static const char lipe_revision_string[] = "LiPE Revision: "LIPE_REVISION; + +/* A grep-able string for core dumps. */ +static char *lipe_revision_string_copy; + +void lipe_version(void) +{ + printf("%s (%s) %s-%s\n%s\n", + program_invocation_short_name, + PACKAGE_NAME, + PACKAGE_VERSION, + LIPE_RELEASE, + lipe_revision_string); +} + +void lipe_version_init(void) +{ + if (lipe_revision_string_copy == NULL) + lipe_revision_string_copy = strdup(lipe_revision_string); +} + +void lipe_version_fini(void) +{ + free(lipe_revision_string_copy); +} diff --git a/src/lipe_version.h b/src/lipe_version.h new file mode 100644 index 0000000..49e3c3f --- /dev/null +++ b/src/lipe_version.h @@ -0,0 +1,8 @@ +#ifndef _LIPE_VERSION_H_ +#define _LIPE_VERSION_H_ + +void lipe_version(void); +void lipe_version_init(void); +void lipe_version_fini(void); + +#endif diff --git a/src/lpurge.c b/src/lpurge.c index 17cff87..c1361c1 100644 --- a/src/lpurge.c +++ b/src/lpurge.c @@ -27,7 +27,6 @@ /* * TODO * - convert blocks to 1K units - * - multiple --mirror-id in lfs split to save on ssh sessions * - take OST load into account * */ @@ -59,13 +58,11 @@ #include #include #include -#include -#include #include "lipe_object_attrs.h" +#include "lipe_version.h" #include "list.h" #include "policy.h" #include "flist.h" -#include "lipe_ssh.h" #define container_of(ptr, type, member) ({ \ const typeof(((type *) 0)->member) * __mptr = (ptr); \ @@ -76,23 +73,15 @@ #define LPURGE_FIDS_DUMPFILE "/var/run/lpurge-%s.fids" struct lpurge_object { + struct lipe_list_head lo_list; struct lu_fid lo_fid; /* Used space in bytes */ __u32 lo_blocks; - __u32 lo_comp_id; + __u32 lo_mirror_id; /* Last use time */ time_t lo_last_utime; }; -struct lpurge_object_pack { - struct lipe_list_head lop_list; - unsigned int lop_max; - unsigned int lop_nr; - unsigned int lop_mdt_idx; - pthread_t lop_pid; - struct lpurge_object lop_objs[]; -}; - struct lpurge_slot { time_t ls_min_utime; time_t ls_max_utime; @@ -104,25 +93,15 @@ struct lpurge_slot { unsigned int ls_scan; struct lipe_list_head ls_obj_list; pthread_mutex_t ls_mutex; + /* stats for objects we can't release */ + unsigned long ls_nomirror_objs; /* no replicated objects */ + unsigned long ls_nomirror_space; + unsigned long ls_nopfid_objs; /* no PFID, can't find parent */ + unsigned long ls_nopfid_space; + unsigned long ls_notfirst_objs; /* not a first stripe */ + unsigned long ls_notfirst_space; }; -struct mds_ssh_session { - struct lipe_list_head mss_list; - struct lipe_ssh_context mss_ctx; -}; - -struct mds { - char *host; - char *mnt; - struct lipe_list_head m_ssh_list; - pthread_mutex_t m_ssh_lock; - pthread_cond_t m_ssh_cond; -}; - -#define MAX_MDTS 128 -struct mds mds[MAX_MDTS]; -int mdsnr; - #define LPURGE_HIST_MAX 16 struct lpurge_slot lpurge_hist[LPURGE_HIST_MAX]; @@ -131,8 +110,11 @@ struct stats { unsigned long s_scans; unsigned long s_low_hits; unsigned long s_slow_scans; - unsigned long s_spawned; - unsigned long s_purged; /* files */ + unsigned long s_queued; /* # files queued for purge */ + unsigned long s_started; /* # files dequeued for purge by worker */ + unsigned long s_purged; /* # files we successfully purged */ + unsigned long s_failed; /* # files we failed to purge */ + /* TODO count mirrors we tried to purge but found last non stale */ }; struct stats stats; @@ -177,6 +159,7 @@ char *ostprefix; char ost_mntdev[PATH_MAX]; char *ost_mntpt; int lustre_fd = -1; +int open_by_fid_fd = -1; unsigned long oldest; time_t scan_started_time; time_t scan_finished_time; @@ -191,13 +174,6 @@ char *prog; void load_config(char *name); -#define PACK_SIZE (64*1024) -#define PACK_NR ((PACK_SIZE - offsetof(struct lpurge_object_pack, lop_objs[0])) / \ - sizeof(struct lpurge_object)) - -#define FIDS_PER_CALL 128 /* XXX: bump to a bigger value */ -#define PM_SIZE (offsetof(struct lpurge_object_pack, lop_objs[FIDS_PER_CALL])) - struct lipe_flist *lflist; #define LPURGE_FLIST_SIZE (1024 * 1024) @@ -225,25 +201,11 @@ static int lpurge_init_flist(void) static void sig_handler(int signal) { - int i; - psignal(signal, "exiting"); if (lflist) flist_free(lflist); - for (i = 0; i < mdsnr; i++) { - struct mds *m = &mds[i]; - struct mds_ssh_session *mss, *tmp; - - lipe_list_for_each_entry_safe(mss, tmp, &m->m_ssh_list, - mss_list) { - lipe_cleanup_ssh_session(&mss->mss_ctx); - lipe_list_del(&mss->mss_list); - free(mss); - } - } - - _exit(1); + _exit(0); } static void lpurge_null_handler(int signal) @@ -277,7 +239,7 @@ void lpurge_init_result(void) void lpurge_reset_result(void) { - struct lpurge_object_pack *s, *t; + struct lpurge_object *s, *t; int i; for (i = 0; i < LPURGE_HIST_MAX; i++) { @@ -287,9 +249,15 @@ void lpurge_reset_result(void) lpurge_hist[i].ls_scan = 1; lpurge_hist[i].ls_min_utime = ~0UL; lpurge_hist[i].ls_max_utime = 0; + lpurge_hist[i].ls_nopfid_objs = 0; + lpurge_hist[i].ls_nopfid_space = 0; + lpurge_hist[i].ls_nomirror_objs = 0; + lpurge_hist[i].ls_nomirror_space = 0; + lpurge_hist[i].ls_notfirst_objs = 0; + lpurge_hist[i].ls_notfirst_space = 0; lipe_list_for_each_entry_safe(s, t, &lpurge_hist[i].ls_obj_list, - lop_list) { - lipe_list_del(&s->lop_list); + lo_list) { + lipe_list_del(&s->lo_list); free(s); } } @@ -312,7 +280,8 @@ void usage(void) "\t-R, --scan_rate, slow scanning rate, objs/sec (default: %u)\n" "\t-S, --slot_size, objects to store in memory (default: %u)\n" "\t-t, --scan_threads, scanning threads (default: %u)\n" - "\t-w, --dump, stats file (via USR1 signal, default: %s)\n", + "\t-w, --dump, stats file (via USR1 signal, default: %s)\n" + "\t--version, print version information and exit\n", program_invocation_short_name, DEF_FREEHI, DEF_INTERVAL, @@ -549,7 +518,7 @@ static void lpurge_reclaim_slot(unsigned int index) int i = 0; struct lpurge_slot *ls_1 = lpurge_hist; /* youngest slot */ struct lpurge_slot *ls_2; - struct lpurge_object_pack *p, *t; + struct lpurge_object *p, *t; if (index >= LPURGE_HIST_MAX - 1) return; @@ -562,8 +531,8 @@ static void lpurge_reclaim_slot(unsigned int index) ls_1->ls_min_utime = ~0UL; /* reclaim youngest slot */ lipe_list_for_each_entry_safe(p, t, &ls_1->ls_obj_list, - lop_list) { - lipe_list_del(&p->lop_list); + lo_list) { + lipe_list_del(&p->lo_list); free(p); } pthread_mutex_unlock(&ls_1->ls_mutex); @@ -630,7 +599,7 @@ int lpurge_lipe_callback(struct lipe_instance *instance, struct lipe_object_attrs *attrs) { struct lpurge_slot *ls = NULL; - struct lpurge_object_pack *lp = NULL; + struct lpurge_object *lo = NULL; time_t age, last_used; int index; @@ -642,25 +611,6 @@ int lpurge_lipe_callback(struct lipe_instance *instance, if ((attrs->loa_mode & S_IFMT) != S_IFREG) goto out; - if ((attrs->loa_attr_bits & LIPE_OBJECT_ATTR_FILTER_FID) == 0) - goto out; - - /* to avoid N OSTs to 1 MDT scalability issue we only consider - * objects which store 1st stripe - */ - if (attrs->loa_filter_fid.ff_parent.f_ver != 0) - goto out; - - /* if the object has got ost_layout structure which encodes - * whether the object has a mirror, then we can skip objects - * with mirror_id=0 (no mirror) - */ - if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) { - if (mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id) - == 0) - goto out; - } - last_used = attrs->loa_mtime_ms; if (last_used < attrs->loa_atime_ms) last_used = attrs->loa_atime_ms; @@ -685,13 +635,45 @@ int lpurge_lipe_callback(struct lipe_instance *instance, goto out; } + pthread_mutex_lock(&ls->ls_mutex); + + if ((attrs->loa_attr_bits & LIPE_OBJECT_ATTR_FILTER_FID) == 0) { + ls->ls_nopfid_objs++; + ls->ls_nopfid_space += attrs->loa_blocks >> 10; + pthread_mutex_unlock(&ls->ls_mutex); + goto out; + } + + /* to avoid N OSTs to 1 MDT scalability issue we only consider + * objects which store 1st stripe + */ + if (attrs->loa_filter_fid.ff_parent.f_ver != 0) { + ls->ls_notfirst_objs++; + ls->ls_notfirst_space += attrs->loa_blocks >> 10; + pthread_mutex_unlock(&ls->ls_mutex); + goto out; + } + + /* if the object has got ost_layout structure which encodes + * whether the object has a mirror, then we can skip objects + * with mirror_id=0 (no mirror) + */ + if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) { + if (mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id) + == 0) { + ls->ls_nomirror_objs++; + ls->ls_nomirror_space += attrs->loa_blocks >> 10; + pthread_mutex_unlock(&ls->ls_mutex); + goto out; + } + } + llapi_printf(LLAPI_MSG_DEBUG, "found under "DFID": size %ld block %ld age %ld slot %d\n", PFID(&attrs->loa_filter_fid.ff_parent), (unsigned long)attrs->loa_size, (unsigned long)attrs->loa_blocks >> 10, age, index); - pthread_mutex_lock(&ls->ls_mutex); ls->ls_found++; ls->ls_space += attrs->loa_blocks >> 10; lpurge_scanned_since++; @@ -723,42 +705,31 @@ int lpurge_lipe_callback(struct lipe_instance *instance, } } - if (!lipe_list_empty(&ls->ls_obj_list)) - lp = lipe_list_entry(ls->ls_obj_list.next, - struct lpurge_object_pack, lop_list); + lo = calloc(1, sizeof(*lo)); + if (lo == NULL) + goto out; - if (!lp || lp->lop_nr >= lp->lop_max) { - lp = calloc(1, PACK_SIZE); - if (!lp) { - llapi_printf(LLAPI_MSG_ERROR, - "can't alloca pack\n"); - exit(1); - } - lp->lop_pid = 0; - lp->lop_max = PACK_NR; - lp->lop_nr = 0; - lipe_list_add(&lp->lop_list, &ls->ls_obj_list); - } - lp->lop_objs[lp->lop_nr].lo_fid = attrs->loa_filter_fid.ff_parent; - lp->lop_objs[lp->lop_nr].lo_blocks = attrs->loa_blocks >> 10; - lp->lop_objs[lp->lop_nr].lo_comp_id = 0; - lp->lop_objs[lp->lop_nr].lo_last_utime = last_used; + lo->lo_fid = attrs->loa_filter_fid.ff_parent; + lo->lo_blocks = attrs->loa_blocks >> 10; + lo->lo_last_utime = last_used; if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) { __u32 id; id = mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id); - lp->lop_objs[lp->lop_nr].lo_comp_id = id; + lo->lo_mirror_id = id; } - lp->lop_nr++; + + lipe_list_add(&lo->lo_list, &ls->ls_obj_list); ls->ls_stored++; + if (ls->ls_max_utime < last_used) ls->ls_max_utime = last_used; if (ls->ls_min_utime > last_used) ls->ls_min_utime = last_used; pthread_mutex_unlock(&ls->ls_mutex); - llapi_printf(LLAPI_MSG_DEBUG, "add "DFID" to %p/%d\n", - PFID(&attrs->loa_filter_fid.ff_parent), lp, lp->lop_nr); + llapi_printf(LLAPI_MSG_DEBUG, "add "DFID" to %p/%lu\n", + PFID(&attrs->loa_filter_fid.ff_parent), ls, ls->ls_stored); out: /* @@ -786,257 +757,324 @@ out: return 0; } -static int lpurge_exec_cmd(struct mds *m, char *cmd) +struct collect_ids_data { + __u16 *cid_ids; + int cid_count; + __u16 cid_exclude; +}; + +static int collect_mirror_id(struct llapi_layout *layout, void *cbdata) +{ + struct collect_ids_data *cid = cbdata; + uint32_t id; + int rc; + + rc = llapi_layout_mirror_id_get(layout, &id); + if (rc < 0) + return rc; + + if ((__u16)id != cid->cid_exclude) { + int i; + + for (i = 0; i < cid->cid_count; i++) { + /* already collected the mirror id */ + if (id == cid->cid_ids[i]) + return LLAPI_LAYOUT_ITER_CONT; + } + cid->cid_ids[cid->cid_count] = id; + cid->cid_count++; + } + + return LLAPI_LAYOUT_ITER_CONT; +} + +static bool last_non_stale_mirror(__u16 mirror_id, struct llapi_layout *layout) { - struct mds_ssh_session *mss; + __u16 mirror_ids[128] = { 0 }; + struct collect_ids_data cid = { + .cid_ids = mirror_ids, + .cid_count = 0, + .cid_exclude = mirror_id, + }; + int i; + + llapi_layout_comp_iterate(layout, collect_mirror_id, &cid); + + for (i = 0; i < cid.cid_count; i++) { + struct llapi_resync_comp comp_array[1024] = { { 0 } }; + int comp_size = 0; + + comp_size = llapi_mirror_find_stale(layout, comp_array, + ARRAY_SIZE(comp_array), + &mirror_ids[i], 1); + if (comp_size == 0) + return false; + } + + return true; +} + +static int +lpurge_mirror_delete(const struct lu_fid *fid, unsigned int mirror_id) +{ + char fid_buf[FID_LEN + 1]; + char vname_buf[PATH_MAX]; + struct ll_ioc_lease *lil = NULL; + struct llapi_layout *layout = NULL; + int mdt_index = -1; + int fd = -1; + int vfd = -1; int rc; - pthread_mutex_lock(&m->m_ssh_lock); - /* - * this should not happen since we limited the max count of launched jobs - * just in case. + /* Inline replacement for + * lfs mirror split -d --mirror-id mirror_id $MOUNTPOINT/.lustre/fid/FID */ - while (lipe_list_empty(&m->m_ssh_list)) - pthread_cond_wait(&m->m_ssh_cond, &m->m_ssh_lock); - /* pop one session */ - mss = container_of(m->m_ssh_list.next, struct mds_ssh_session, - mss_list); - lipe_list_del(&mss->mss_list); + snprintf(fid_buf, sizeof(fid_buf), DFID, PFID(fid)); - pthread_mutex_unlock(&m->m_ssh_lock); + fd = openat(open_by_fid_fd, fid_buf, O_RDWR); + if (fd < 0) { + rc = -errno; + llapi_printf(LLAPI_MSG_DEBUG, + "cannot open "DFID" for split: %s\n", + PFID(fid), strerror(errno)); + goto out; + } - rc = lipe_ssh_exec(&mss->mss_ctx, cmd); - if (rc != 0 && rc != EUCLEAN) - llapi_printf(LLAPI_MSG_FATAL, - "error executing ssh command '%s' on '%s': rc = %d\n", - cmd, m->host, rc); - pthread_mutex_lock(&m->m_ssh_lock); - lipe_list_add(&mss->mss_list, &m->m_ssh_list); - pthread_cond_signal(&m->m_ssh_cond); - pthread_mutex_unlock(&m->m_ssh_lock); + rc = llapi_file_fget_mdtidx(fd, &mdt_index); + if (rc < 0) + goto out; + + snprintf(vname_buf, sizeof(vname_buf), LUSTRE_VOLATILE_HDR":%.4X:%.4X", + mdt_index, (unsigned int)random()); + + vfd = openat(open_by_fid_fd, vname_buf, + O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW|O_LOV_DELAY_CREATE); + if (vfd < 0) { + rc = -errno; + goto out; + } + + rc = llapi_lease_acquire(fd, LL_LEASE_WRLCK); + if (rc < 0) + goto out; + + layout = llapi_layout_get_by_fd(fd, 0); + if (layout == NULL) { + rc = -errno; + goto out; + } + + if (last_non_stale_mirror(mirror_id, layout)) { + rc = -EUCLEAN; + goto out; + } + + lil = calloc(1, offsetof(typeof(*lil), lil_ids[2])); + if (lil == NULL) { + rc = -ENOMEM; + goto out; + } + + lil->lil_mode = LL_LEASE_UNLCK; + lil->lil_flags = LL_LEASE_LAYOUT_SPLIT; + lil->lil_count = 2; + lil->lil_ids[0] = vfd; + lil->lil_ids[1] = mirror_id; + + rc = llapi_lease_set(fd, lil); + if (rc < 0) { + goto out; + } else if (rc == 0) { + /* lost lease lock */ + rc = -EBUSY; + } else { + rc = 0; + } +out: + if (rc < 0) + llapi_printf(LLAPI_MSG_DEBUG, + "cannot delete mirror %u of "DFID": rc = %d\n", + mirror_id, PFID(fid), rc); + else + llapi_printf(LLAPI_MSG_DEBUG, + "deleted mirror %u of "DFID"\n", + mirror_id, PFID(fid)); + + llapi_layout_free(layout); + + free(lil); + + if (!(fd < 0)) + close(fd); + + if (!(vfd < 0)) + close(vfd); return rc; } -#define CMDSIZE (64 * 1024) +static bool lpurge_work_should_run; +static size_t lpurge_work_thread_count; +static pthread_t *lpurge_work_threads; +static LIPE_LIST_HEAD(lpurge_work_list); +static pthread_mutex_t lpurge_work_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t lpurge_work_cond = PTHREAD_COND_INITIALIZER; +static pthread_cond_t lpurge_work_done = PTHREAD_COND_INITIALIZER; -void *lpurge_spawn_one(void *args) +static void *lpurge_work_func(void *data) { - struct lpurge_object_pack *lp = (struct lpurge_object_pack *)args; - char buf[PATH_MAX]; - int rc, i, idx = lp->lop_mdt_idx; - - for (i = 0; i < lp->lop_nr; i++) { - if (lp->lop_objs[i].lo_comp_id) { - snprintf(buf, sizeof(buf), - "lfs mirror split -d --mirror-id %u " - "%s/.lustre/fid/"DFID" > /dev/null 2>&1", - lp->lop_objs[i].lo_comp_id, - mds[idx].mnt, PFID(&lp->lop_objs[i].lo_fid)); - } else { - snprintf(buf, sizeof(buf), - "lfs mirror split -d -p %s " - "%s/.lustre/fid/"DFID" > /dev/null 2>&1", - opt.o_pool, mds[idx].mnt, PFID(&lp->lop_objs[i].lo_fid)); - } + struct lpurge_object *lo; + int rc; - rc = lpurge_exec_cmd(&mds[idx], buf); - if (rc) - llapi_printf(LLAPI_MSG_DEBUG, - "couldn't remove %u from "DFID": rc=%d\n", - lp->lop_objs[i].lo_comp_id, - PFID(&lp->lop_objs[i].lo_fid), rc); + pthread_mutex_lock(&lpurge_work_lock); + + while (1) { + while (lpurge_work_should_run && lipe_list_empty(&lpurge_work_list)) + pthread_cond_wait(&lpurge_work_cond, &lpurge_work_lock); + + if (!lpurge_work_should_run) + break; + + lo = lipe_list_entry(lpurge_work_list.next, + struct lpurge_object, lo_list); + lipe_list_del(&lo->lo_list); + + stats.s_started++; + pthread_mutex_unlock(&lpurge_work_lock); + + rc = lpurge_mirror_delete(&lo->lo_fid, lo->lo_mirror_id); + free(lo); + + pthread_mutex_lock(&lpurge_work_lock); + + if (rc < 0) + stats.s_failed++; + else + stats.s_purged++; + + assert(stats.s_purged + stats.s_failed <= stats.s_queued); + + if (stats.s_purged + stats.s_failed == stats.s_queued) + pthread_cond_signal(&lpurge_work_done); } - /* ignore errors, in the worst case we'll find this object again */ - pthread_exit((void *)0); + pthread_mutex_unlock(&lpurge_work_lock); + + return NULL; } -int lpurge_spawn(struct lipe_list_head *pm_list) +static void lpurge_work_threads_stop(void) { - int jobs[MAX_MDTS] = { 0 }; - struct lipe_list_head list; /* list of jobs in execution */ - struct lpurge_object_pack *lp, *tmp; - int total = 0; - int i, repeat; - - LIPE_INIT_LIST_HEAD(&list); - -next: - repeat = 0; - for (i = 0; i < MAX_MDTS; i++) { - while (!lipe_list_empty(&pm_list[i])) { - pthread_t pid; - int rc; + size_t i; + int rc; - if (jobs[i] >= opt.o_max_jobs) { - repeat = 1; - break; - } + pthread_mutex_lock(&lpurge_work_lock); + lpurge_work_should_run = false; + pthread_cond_broadcast(&lpurge_work_cond); + pthread_mutex_unlock(&lpurge_work_lock); - lp = lipe_list_entry(pm_list[i].next, - struct lpurge_object_pack, lop_list); - lipe_list_del(&lp->lop_list); - - /* now fork and execute */ - rc = pthread_create(&pid, NULL, lpurge_spawn_one, lp); - if (rc == 0) { - stats.s_spawned++; - stats.s_purged = lp->lop_nr; - - lp->lop_pid = pid; - lp->lop_mdt_idx = i; - lipe_list_add_tail(&lp->lop_list, &list); - jobs[i]++; - total++; - } + for (i = 0; i < lpurge_work_thread_count; i++) { + rc = pthread_join(lpurge_work_threads[i], NULL); + if (rc != 0) { + llapi_printf(LLAPI_MSG_FATAL, + "cannot join work thread: %s\n", + strerror(rc)); } } -wait: - assert(total > 0); - lp = NULL; - lipe_list_for_each_entry_safe(lp, tmp, &list, lop_list) { - int rc; - intptr_t retval; + lpurge_work_thread_count = 0; - rc = pthread_tryjoin_np(lp->lop_pid, (void **)&retval); - if (rc) - continue; - lipe_list_del(&lp->lop_list); - i = lp->lop_mdt_idx; - llapi_printf(LLAPI_MSG_DEBUG, - "thread %lu MDT#%d completed: %"PRIdPTR"\n", - lp->lop_pid, i, retval); - free(lp); - assert(jobs[i] > 0); - jobs[i]--; - total--; - } - - if (repeat) - goto next; - if (total) - goto wait; - return 0; -} + free(lpurge_work_threads); + lpurge_work_threads = NULL; -/* round-robin distribution for unknown (w/o mappping, likely new) MDS */ -static int fid2idx_rr; + /* TODO: Free entries in lpurge_work_list. */ +} -static int lpurge_fid2idx(const struct lu_fid *fid) +static int lpurge_work_threads_start(size_t thread_count) { - int rc, idx; + int rc = 0; - /* XXX: local cache to avoid ioctl() for each FID? */ - rc = llapi_get_mdt_index_by_fid(lustre_fd, fid, &idx); - if (rc < 0) { - llapi_printf(LLAPI_MSG_ERROR, - "can't lookup "DFID": rc=%d\n", - PFID(fid), rc); - return rc; + assert(thread_count > 0); + + lpurge_work_should_run = true; + + lpurge_work_threads = calloc(thread_count, sizeof(lpurge_work_threads[0])); + if (lpurge_work_threads == NULL) { + rc = -ENOMEM; + goto out; } - if (idx >= mdsnr || !mds[idx].host) { - /* - * no MDS registered for this index - * but we can use any, it's just less efficient - * due to over-network RPCs to open-close-modify. - * so choose some MDS to send replica removal to - */ - do { - fid2idx_rr++; - if (fid2idx_rr == mdsnr) - fid2idx_rr = 0; - } while (mds[fid2idx_rr].host == NULL); + while (lpurge_work_thread_count < thread_count) { + rc = pthread_create(&lpurge_work_threads[lpurge_work_thread_count], + NULL /* attr */, + &lpurge_work_func, + NULL /* data */); + if (rc != 0) { + llapi_printf(LLAPI_MSG_FATAL, + "cannot create work thread: %s\n", + strerror(rc)); + rc = -rc; + goto out; + } - idx = fid2idx_rr; + lpurge_work_thread_count++; } +out: + if (rc < 0) + lpurge_work_threads_stop(); + + return rc; +} - return idx; +/* Unused. */ +void lpurge_work_submit(struct lpurge_object *lo) +{ + pthread_mutex_lock(&lpurge_work_lock); + assert(lpurge_work_should_run); + stats.s_queued++; + lipe_list_add_tail(&lo->lo_list, &lpurge_work_list); + pthread_cond_signal(&lpurge_work_cond); + pthread_mutex_unlock(&lpurge_work_lock); } void lpurge_purge_slot(struct lpurge_slot *ls, long long target) { - struct lpurge_object_pack *lp, *pm; - struct lipe_list_head pm_list[MAX_MDTS]; - __u64 blocks[MAX_MDTS] = { 0 }; + struct lpurge_object *lo; __u64 total, was, kbfree; int i, rc; /* try to remove some replicas */ - - for (i = 0; i < MAX_MDTS; i++) - LIPE_INIT_LIST_HEAD(&pm_list[i]); - again: llapi_printf(LLAPI_MSG_DEBUG, "release upto %llu (expect %lu in %lu)\n", target, ls->ls_space, ls->ls_found); total = 0; - /* take few FIDs */ + assert(!lipe_list_empty(&ls->ls_obj_list)); - lp = lipe_list_entry(ls->ls_obj_list.next, struct lpurge_object_pack, - lop_list); - for (i = lp->lop_nr - 1; i >= 0; i--) { - struct lu_fid fid; - int idx; - lp->lop_nr--; + pthread_mutex_lock(&lpurge_work_lock); + assert(lpurge_work_should_run); - /* split by MDT index, we could do this at scanning, but that can - * result in many useless lookups if only part of slots are subject - * to removal - */ - fid = lp->lop_objs[i].lo_fid; - if (opt.o_iml_socket) { - rc = flist_add_json_fid(lflist, &fid); - if (rc < 0) { - llapi_printf(LLAPI_MSG_ERROR, - "can't add "DFID": rc=%d\n", - PFID(&fid), rc); - continue; - } - } else { - idx = lpurge_fid2idx(&fid); - if (idx < 0) - continue; - /* how many blocks we expect to free */ - blocks[idx] += lp->lop_objs[i].lo_blocks; - total += lp->lop_objs[i].lo_blocks; - ls->ls_space -= lp->lop_objs[i].lo_blocks; - ls->ls_found--; - ls->ls_stored--; - - /* put FID into per-MDT pack */ - pm = NULL; - if (!lipe_list_empty(&pm_list[idx])) - pm = lipe_list_entry(pm_list[idx].next, - struct lpurge_object_pack, - lop_list); - if (!pm || pm->lop_nr >= pm->lop_max) { - pm = calloc(1, PM_SIZE); - if (!pm) { - llapi_printf(LLAPI_MSG_ERROR, - "can't alloca pack\n"); - exit(1); - } - pm->lop_max = FIDS_PER_CALL; - pm->lop_nr = 0; - lipe_list_add(&pm->lop_list, &pm_list[idx]); - } - pm->lop_objs[pm->lop_nr] = lp->lop_objs[i]; - pm->lop_nr++; - } + while (!lipe_list_empty(&ls->ls_obj_list)) { + lo = lipe_list_entry(ls->ls_obj_list.next, struct lpurge_object, + lo_list); + + /* how many blocks we expect to free */ + total += lo->lo_blocks; + ls->ls_space -= lo->lo_blocks; + ls->ls_found--; + ls->ls_stored--; + + lipe_list_move_tail(&lo->lo_list, &lpurge_work_list); + stats.s_queued++; /* if current collection of objects may free target space, then stop */ if (total >= target) break; } - if (!lp->lop_nr) { - lipe_list_del(&lp->lop_list); - free(lp); - } + + pthread_cond_broadcast(&lpurge_work_cond); + pthread_mutex_unlock(&lpurge_work_lock); /* fork, start and wait for completion against each MDT */ /* give some time to OST_DESTROY to pass through */ @@ -1049,10 +1087,11 @@ again: llapi_printf(LLAPI_MSG_DEBUG, "spawn, expect %llu back\n", total); - if (opt.o_iml_socket) - flist_write(lflist, true); - else - lpurge_spawn(pm_list); + /* Wait for purge threads to complete all submitted work. */ + pthread_mutex_lock(&lpurge_work_lock); + while (stats.s_purged + stats.s_failed < stats.s_queued) + pthread_cond_wait(&lpurge_work_done, &lpurge_work_lock); + pthread_mutex_unlock(&lpurge_work_lock); /* XXX: 20 is probably too short for ZFS as changes need to * get committed on MDS first, then sent to OST, then get @@ -1245,7 +1284,7 @@ void parse_mountpoint(const char *name) struct lu_fid fid; int rc, idx; - lustre_fd = open(name, O_RDONLY/* | O_DIRECTORY*/); + lustre_fd = open(name, O_RDONLY); if (lustre_fd < 0) { llapi_printf(LLAPI_MSG_FATAL, "can't open %s: %d\n", name, errno); @@ -1260,89 +1299,18 @@ void parse_mountpoint(const char *name) "%s isn't Lustre mountpoint: %d\n", name, rc); exit(1); } -} - -void parse_mds(char *args) -{ - struct mds *m; - char *host, *mnt, *sidx; - int idx, rc; - int j; - char *endptr; - - /* mds# hostname mountpoint */ - sidx = strsep(&args, ":\n"); - idx = strtol(sidx, &endptr, 10); - if (*endptr != '\0' || idx < 0) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_FATAL, rc, - "invalid MDS index: '%s'\n", sidx); - exit(1); - } - - if (idx >= MAX_MDTS) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_FATAL, rc, "too high MDS index\n"); - exit(1); - } - m = &mds[idx]; - if (m->host) { - llapi_printf(LLAPI_MSG_FATAL, - "mds #%d is defined already\n", idx); - exit(1); - } - host = strsep(&args, ":\n"); - mnt = strsep(&args, ":\n"); - if (!host || !mnt) { - llapi_printf(LLAPI_MSG_FATAL, - "no host or mntpt: %s\n", args); + open_by_fid_fd = openat(lustre_fd, ".lustre/fid", O_RDONLY); + if (open_by_fid_fd < 0) { + llapi_error(LLAPI_MSG_FATAL, -errno, + "cannot open '%s/.lustre/fid'", name); exit(1); } - - if (idx >= MAX_MDTS) { - llapi_printf(LLAPI_MSG_FATAL, - "MDT index %u > max index %u: %s\n", - idx, MAX_MDTS, args); - exit(1); - } - - m->host = strdup(host); - m->mnt = strdup(mnt); - - /* init ssh session to target mds */ - pthread_mutex_init(&m->m_ssh_lock, NULL); - pthread_cond_init(&m->m_ssh_cond, NULL); - LIPE_INIT_LIST_HEAD(&m->m_ssh_list); - - for (j = 0; j < opt.o_max_jobs; j++) { - struct mds_ssh_session *mss; - - mss = calloc(1, sizeof(*mss)); - if (mss == NULL) { - llapi_printf(LLAPI_MSG_FATAL, - "cannot allocate ssh session\n"); - exit(1); - } - - rc = lipe_init_ssh_session(&mss->mss_ctx, host); - if (rc != SSH_AUTH_SUCCESS) { - llapi_printf(LLAPI_MSG_FATAL, - "cannot create ssh session for host %s: rc = %d\n", - host, rc); - exit(1); - } - - lipe_list_add(&mss->mss_list, &m->m_ssh_list); - } - - mdsnr++; - llapi_printf(LLAPI_MSG_DEBUG, "add mds #%d at %s@%s\n", - idx, host, mnt); } #define LPURGE_INTERNAL_DUMP_FIDS 1 #define LPURGE_OPT_IML_SOCKET 2 +#define LPURGE_OPT_VERSION 3 static struct option options[] = { { "device", required_argument, NULL, 'D'}, @@ -1363,6 +1331,7 @@ static struct option options[] = { { "scan_rate", required_argument, NULL, 'R'}, { "scan_threads", required_argument, NULL, 't'}, { "slot_size", required_argument, NULL, 'S'}, + { "version", no_argument, NULL, LPURGE_OPT_VERSION}, { NULL } }; @@ -1387,6 +1356,9 @@ void lpurge_process_opt(int c, char *optarg) case LPURGE_INTERNAL_DUMP_FIDS: opt.o_fids_dumpfile = strdup(optarg); break; + case LPURGE_OPT_VERSION: + lipe_version(); + exit(0); case 'b': llapi_msg_set_level(LLAPI_MSG_MAX); break; @@ -1441,7 +1413,7 @@ void lpurge_process_opt(int c, char *optarg) opt.o_freelo = value; break; case 'm': - parse_mds(optarg); + /* parse_mds(optarg); */ break; case 'M': opt.o_mountpoint = strdup(optarg); @@ -1565,7 +1537,6 @@ void load_config(char *name) void lpurge_verify_opts(void) { char buf[1024]; - int i; if (!opt.o_pool) { opt.o_pool = DEF_POOL; @@ -1659,42 +1630,6 @@ void lpurge_verify_opts(void) return; } - // Stand-alone Operation Checks - if (mdsnr == 0) { - llapi_printf(LLAPI_MSG_FATAL, - "mds is not configured\n"); - exit(1); - } - - for (i = 0; i < mdsnr; i++) { - int rc; - char cmd[PATH_MAX]; - - if (!mds[i].host) { - llapi_printf(LLAPI_MSG_FATAL, - "MDS host is not configured\n"); - exit(1); - } - - if (!mds[i].mnt) { - llapi_error(LLAPI_MSG_FATAL, -EINVAL, - "mountpoint on MDS host '%s' is not configured\n", - mds[i].host); - exit(1); - } - - /* check mountpoint on MDS */ - snprintf(cmd, sizeof(cmd), "lfs path2fid %s > /dev/null 2>&1", - mds[i].mnt); - rc = lpurge_exec_cmd(&mds[i], cmd); - if (rc) { - llapi_error(LLAPI_MSG_FATAL, -EINVAL, - "invalid mountpoint '%s' on MDS host: '%s'\n", - mds[i].mnt, mds[i].host); - exit(1); - } - } - if (opt.o_max_jobs < 1 || opt.o_max_jobs > 1024) { llapi_printf(LLAPI_MSG_FATAL, "invalid max_jobs: %d\n", opt.o_max_jobs); @@ -1744,24 +1679,23 @@ void lpurge_usr1_handle(int sig) return; } llapi_printf(LLAPI_MSG_DEBUG, "dump to %s\n", opt.o_dumpfile); - fprintf(f, "config:\n" + fprintf(f, + "version: %s-%s\n" + "revision: %s\n" + "config:\n" " free_high: %u\n" " free_low: %u\n" " ostname: %s\n" " mountpoint: %s\n" " pool: %s\n" - " mds: ", - opt.o_freehi, opt.o_freelo, - opt.o_device, opt.o_mountpoint, opt.o_pool); - for (i = 0; i < mdsnr; i++) - fprintf(f, "%s%d:%s:%s", i ? "," : "", i, - mds[i].host, mds[i].mnt); - fprintf(f, "\n" " max_jobs: %u\n" " check_interval: %u\n" " scan_rate: %u\n" " scan_threads: %u\n" " slot_size: %u\n", + PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION, + opt.o_freehi, opt.o_freelo, + opt.o_device, opt.o_mountpoint, opt.o_pool, opt.o_max_jobs, opt.o_interval, opt.o_scan_rate, opt.o_scan_threads, opt.o_slot_size); if (opt.o_iml_socket) @@ -1771,11 +1705,14 @@ void lpurge_usr1_handle(int sig) " scans: %lu\n" " scan_time: %llu\n" " slow_scans: %lu\n" - " spawned: %lu\n" + " queued: %lu\n" + " started: %lu\n" " purged: %lu\n" + " failed: %lu\n" " low_hits: %lu\n", stats.s_scans, stats.s_scan_time, stats.s_slow_scans, - stats.s_spawned, stats.s_purged, stats.s_low_hits); + stats.s_queued, stats.s_started, stats.s_purged, stats.s_failed, + stats.s_low_hits); lpurge_kbfree(&kbfree); fprintf(f, "space:\n" " kbfree: %llu\n" @@ -1784,7 +1721,7 @@ void lpurge_usr1_handle(int sig) kbfree, freelo, freehi); #define HIST_FMT \ - " hist%u: { age: %lu, found: %lu, space: %lu, stored: %lu }\n" + " hist%u: { age: %lu, found: %lu, space: %lu, stored: %lu, nomirror_cnt: %lu, nomirror_space: %lu, nopfid_cnt: %lu, nopfid_space: %lu, notfirst_cnt: %lu, notfirst_space: %lu }\n" fprintf(f, "hlists:\n"); for (i = LPURGE_HIST_MAX - 1; i >= 0; i--) { @@ -1794,7 +1731,10 @@ void lpurge_usr1_handle(int sig) continue; fprintf(f, HIST_FMT, i, ls->ls_age, ls->ls_found, - ls->ls_space, ls->ls_stored); + ls->ls_space, ls->ls_stored, + ls->ls_nomirror_objs,ls->ls_nomirror_space, + ls->ls_nopfid_objs, ls->ls_nopfid_space, + ls->ls_notfirst_objs, ls->ls_notfirst_space); } fflush(f); @@ -1924,17 +1864,13 @@ void lpurge_usr2_handle(int sig) * {"fid":"[0x200000401:0x6:0x0]", "last_use_time": 123456, "slot_id": $SLOT_ID} */ for (i = LPURGE_HIST_MAX - 1; i >= 0; i--) { - struct lpurge_object_pack *lp; struct lpurge_slot *ls = lpurge_hist + i; + struct lpurge_object *lo; if (ls->ls_found == 0) continue; - lp = lipe_list_entry(ls->ls_obj_list.next, - struct lpurge_object_pack, - lop_list); - - for (i = lp->lop_nr - 1; i >= 0; i--) { + lipe_list_for_each_entry(lo, &ls->ls_obj_list, lo_list) { int rc; char fid_buf[FID_LEN + 1]; struct lu_fid fid; @@ -1945,13 +1881,13 @@ void lpurge_usr2_handle(int sig) obj_stat = json_object_new_object(); - fid = lp->lop_objs[i].lo_fid; + fid = lo->lo_fid; snprintf(fid_buf, sizeof(fid_buf), DFID, PFID(&fid)); json_object_object_add(obj_stat, "fid", json_object_new_string(fid_buf)); json_object_object_add(obj_stat, "last_use_time", json_object_new_int64( - lp->lop_objs[i].lo_last_utime)); + lo->lo_last_utime)); json_object_object_add(obj_stat, "slot_id", json_object_new_int(i)); output = json_object_to_json_string_ext(obj_stat, @@ -2041,14 +1977,10 @@ static void lpurge_register_signal_handlers(void) int main(int argc, char **argv) { - ssh_threads_set_callbacks(ssh_threads_get_pthread()); - ssh_init(); - + lipe_version_init(); setlinebuf(stdout); setlinebuf(stderr); - memset(mds, 0, sizeof(mds)); - llapi_msg_set_level(LLAPI_MSG_INFO); signal(SIGUSR1, lpurge_null_handler); @@ -2061,6 +1993,10 @@ int main(int argc, char **argv) * followed by the OST name ("lpurge lustre-OST0000"). */ llapi_set_command_name(ostname); + llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, + "version %s-%s, revision %s\n", + PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION); + lpurge_get_ost_mntpt(); lpurge_configure_thresholds(); @@ -2070,6 +2006,8 @@ int main(int argc, char **argv) lpurge_register_signal_handlers(); + lpurge_work_threads_start(opt.o_max_jobs); + while (1) { /* XXX: do lazy scanning in the background * before the real need, so that when @@ -2091,4 +2029,9 @@ int main(int argc, char **argv) /* device size can change runtime.. */ lpurge_configure_thresholds(); } + + /* Unreached. */ + lipe_version_fini(); + + return 0; } diff --git a/src/lustre_ea.c b/src/lustre_ea.c index a58525b..fd674bf 100644 --- a/src/lustre_ea.c +++ b/src/lustre_ea.c @@ -7,12 +7,12 @@ * * Author: Li Xi */ -#include +#include #include +#include #include "fake_lustre_idl.h" #include "lustre_ea.h" #include "debug.h" -#include "errno.h" int decode_linkea(char *buf, ssize_t size) { diff --git a/version-gen.sh b/version-gen.sh index 1ebad2d..d731ccb 100755 --- a/version-gen.sh +++ b/version-gen.sh @@ -1,13 +1,8 @@ #!/bin/sh -# Please update all the version references if VERSION changed. -# lipe.spec.in -# man/lipe_scan.1 -# pylipe/lipe_test.py - -VERSION="1.14" -if [ -d .git ]; then - VERSION=$(git describe|sed 's/-[0-9]*-/./') -fi +VERSION="1.17" +# if [ -d .git ]; then +# VERSION=$(git describe|sed 's/-[0-9]*-/./') +# fi printf "%s" "$VERSION" -- 1.8.3.1