Squashed 'lipe/' changes from 38f79e56ec..8251fae87b
author John L. Hammond <jhammond@whamcloud.com>
Tue, 27 Apr 2021 14:38:30 +0000 (09:38 -0500)
committer John L. Hammond <jhammond@whamcloud.com>
Tue, 27 Apr 2021 14:38:30 +0000 (09:38 -0500)
8251fae87b Update lipe version to 1.17.
87ee780007 EX-1613 scripts: Use ticket to start/stop hotpools
0ce8cdc011 EX-3078 lipe: quote FIDs in remote commands
dcd24fe01e EX-3034 lamigo: check for available agents early
0ff4506a1d EX-3043 lamigo: remove debugging leftover
22cf972d45 EX-3043 lamigo: simplify changelog cleaning check
de214cee10 EX-3009 lamigo: dump changelog status
8e7ee6cc80 EX-3017 lpurge: for stats for skipped objects
b0bce08ec5 EX-2768 lamigo: don't register a SIGCHLD handler
8981cfa704 EX-3036 lipe: version and revision support
1f562d542b EX-3020 lamigo: prevent out of order changelog clearing
d859e4bff2 EX-3021 lipe: refactor lipe_ssh context handling
4124346b0d EX-2718 lpurge: use local mountpoint for purge operations
7caf05b672 EX-3030 lipe: join multiple threads in lamigo_check_jobs()
911cf5018b EX-2994 lipe: update lpurge purged stats correctly
3d1af585e3 Update lipe version to 1.16.
842d8c5aad EX-2962 lipe: Fix config autodetect
df423e9539 EX-2948 build: less checks in lipe configuration
6745370c5b EX-2983 lamigo: reduce log level in lamigo_exec_cmd()
77e90c1df3 EX-2979 lamigo: do not count setprefer as replication
eda4f09711 EX-2608 scripts: Auto detect previous values
4ca909e2ea Update lipe version to 1.15.
2f80b12aaf EX-2770 lpurge: set lop_mdt_idx before spawning thread
370ba3a118 EX-2930 lipe: fix errno.h include
4f672e1346 EX-2921 lipe: merge tools/lipe to lipe subtree
17a2a63533 EX-2778 lipe: lipe.spec fixes

git-subtree-dir: lipe
git-subtree-split: 8251fae87b508e36caab6397b1063b308dcb2b05

21 files changed:
Makefile.am
configure.ac
hotpools-1.17-release-notes.md [new file with mode: 0644]
lipe-revision.sh [new file with mode: 0755]
lipe.spec.in
scripts/stratagem-hp-config.sh
scripts/stratagem-hp-convert.sh [new file with mode: 0755]
scripts/stratagem-hp-start.sh
scripts/stratagem-hp-stop.sh
scripts/stratagem-hp-teardown-client.sh [new file with mode: 0755]
scripts/stratagem-hp-teardown.sh
src/Makefile.am
src/lamigo.c
src/lamigo_alr.c
src/lipe_ssh.c
src/lipe_ssh.h
src/lipe_version.c [new file with mode: 0644]
src/lipe_version.h [new file with mode: 0644]
src/lpurge.c
src/lustre_ea.c
version-gen.sh

index 07c0811..a9f0a1b 100644 (file)
@@ -55,6 +55,7 @@ PYTHON_COMMANDS = \
 EXTRA_DIST= \
        $(PYTHON_COMMANDS) \
        detect-distro.sh \
+       lipe-revision.sh \
        example_configs/clownfish/seperate_mgs/clownfish.conf \
        example_configs/clownfish/seperate_mgs/lipe_virt.conf \
        example_configs/lipe/lipe_install.conf \
index dc74bf9..88a47d5 100644 (file)
@@ -108,43 +108,13 @@ if test "x$idle_user_header" = "xyes"; then
         AC_DEFINE(IDLE_USER_HEADER, 1, [Lustre uses idle user header])
 fi
 
-# -------- check whether DoM is supported by Lustre head files --------
-AC_MSG_CHECKING([Lustre has PFL support])
-saved_libs=$LIBS
-LIBS="-llustreapi"
-AC_LINK_IFELSE([AC_LANG_SOURCE([       
-       #include <lustre/lustre_user.h>
-
-       int main(void) {
-               int a = LOV_USER_MAGIC_COMP_V1;
-       }
-])],[
+# A few defines that are no longer conditional
 AC_DEFINE([HAVE_LUSTRE_PFL], 1,
          [Lustre has PFL support])
-have_lustre_pfl=yes
-], [have_lustre_pfl="no"])
-LIBS=$saved_libs
-AC_MSG_RESULT([$have_lustre_pfl])
-
-# -------- check for llapi_layout_get_by_xattr() --------
-AC_MSG_CHECKING([Lustre have llapi_layout_get_by_xattr() and LSoM])
-saved_libs=$LIBS
-LIBS="-llustreapi"
-AC_LINK_IFELSE([AC_LANG_SOURCE([       
-       #include <lustre/lustreapi.h>
-
-       int main(void) {
-               llapi_layout_get_by_xattr(NULL, 0, 0);
-       }
-])],[
 AC_DEFINE([HAVE_LAYOUT_BY_XATTR], 1,
          [have llapi_layout_get_by_xattr()])
 AC_DEFINE([HAVE_LAZY_SIZE_ON_MDT], 1,
          [have lazy size on MDT])
-have_layout_by_xattr=yes
-], [have_layout_by_xattr="no"])
-LIBS=$saved_libs
-AC_MSG_RESULT([$have_layout_by_xattr])
 
 # -------- check for llapi_changelog_in_buf() --------
 AC_MSG_CHECKING([Lustre have llapi_changelog_in_buf()])
@@ -337,8 +307,15 @@ AC_SEARCH_LIBS([ext2fs_open2], [ext2fs])
 
 LIPE_RELEASE="1"
 AC_DEFINE_UNQUOTED(RELEASE, "$LIPE_RELEASE", [release info] )
+AC_DEFINE_UNQUOTED(LIPE_RELEASE, "$LIPE_RELEASE", [release info] )
 AC_SUBST(LIPE_RELEASE)
 
+AC_MSG_CHECKING([for lipe revision])
+LIPE_REVISION=$(bash lipe-revision.sh)
+AC_MSG_RESULT([$LIPE_REVISION])
+AC_DEFINE_UNQUOTED(LIPE_REVISION, "$LIPE_REVISION", [revision info] )
+AC_SUBST(LIPE_REVISION)
+
 # for exporting to spec file
 AC_SUBST(ac_configure_args)
 AC_CONFIG_FILES([Makefile
diff --git a/hotpools-1.17-release-notes.md b/hotpools-1.17-release-notes.md
new file mode 100644 (file)
index 0000000..14c6763
--- /dev/null
@@ -0,0 +1,456 @@
+# Hotpools Hotfix (lipe-server-1.17) Release Notes
+
+## lamigo agents
+
+lamigo will continue to ssh to "agents" and invoke lfs to mirror and
+resync files. An "agent" need not be an EXAScaler server (VM). Instead
+an agent can be any node that lamigo can reach via ssh as root, has a
+Lustre client mount, and has the lfs utility installed. The lamigo
+configuration file determines the set of agents used:
+```
+    # MDT=myfs1-MDT0042
+    # grep '^agent=' /etc/lamigo/${MDT}.conf
+    agent=ai400x-0a12-vm00:/lustre/myfs1/client:4
+    agent=ai400x-0a12-vm01:/lustre/myfs1/client:4
+    agent=ai400x-0a12-vm02:/lustre/myfs1/client:4
+    agent=ai400x-0a12-vm03:/lustre/myfs1/client:4
+```
+In the triple `ai400x-0a12-vm00:/lustre/myfs1/client:4` the first
+component is the hostname of the agent, the second is the mount point
+of the Lustre client on the agent, and the third is the maximum number
+of concurrent mirror operations.
+
+Using agents other than the EXAScaler servers will alleviate many of
+the HA issues seen to date (DDN-1920, EX-1613). The
+`stratagem-hp-config.sh` script will use the EXAScaler servers as
+mirroring agents for lamigo.
+
+Previous versions of lamigo and lpurge failed to start up in certain
+cases when agents were unreachable (DDN-2070, EX-3021). All known
+issues in this area are believed to be fixed.
+
+In the example above we have 4 agents with 4 concurrent operations
+each, for a total of 16 concurrent operations on behalf of this lamigo
+service. Some basic benchmarking on a mix of small and medium files
+has shown that we can expect about two replications per second per
+concurrent operation, i.e. about 32 (2 * 16) replications per second
+from this lamigo service.
+
+## agentless lpurge
+
+lpurge will no longer use agents or external commands (lfs mirror
+split ...) to purge files. Instead it will use the Lustre client mount
+point (`/lustre/$FS/client`) on the server where lpurge runs. For
+backwards compatibility lpurge will ignore any agents specified in the
+config file. Note that lpurge.conf uses lines of the form
+```
+    mds=0:ai400x-0a12-vm00:/lustre/myfs1/client
+    mds=1:ai400x-0a12-vm01:/lustre/myfs1/client
+```
+to specify an agent. Since they will be ignored, we recommend just
+leaving them in the config file. This will make downgrade easier if
+needed.
+
+## Online upgrade of lamigo and lpurge
+
+To upgrade lamigo and lpurge on a live system we suggest installing
+the new lipe-server RPM to all EXAScaler servers and then running
+```
+    # clush -a killall lamigo lpurge
+```
+In this case systemd will automatically restart lamigo and lpurge
+(using the newly installed binaries) without invoking HA. This is
+preferred over using `stratagem-hp-stop.sh && stratagem-hp-start.sh`.
+
+## Converting hotpools to use ticketed HA configuration
+
+Using the Lustre client mount Pacemaker resource to control the
+starting and stopping of hotpools introduced cyclic resource
+dependencies which in turn made stopping hotpools impossible in some
+cases (usually leading to STONITH, see DDN-1920 and EX-1613). To
+address this, `stratagem-hp-config.sh` now uses a new Pacemaker
+resource (`$FS-hotpool`) and ticket (`$FS-hotpool-allocated`) to start
+and stop hotpools. Running `stratagem-hp-stop.sh` stops lamigo and
+lpurge but not the Lustre client mounts on the servers.
+
+Alongside the changes to `stratagem-hp-config.sh`, this release
+provides two new HA scripts:
+
+`stratagem-hp-convert.sh` converts a previously configured hotpools
+installation to a ticketed configuration. Conversion may be performed
+with hotpools running or stopped.
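+
+For example (a sketch, assuming the filesystem is named `myfs1`; when
+run without an argument the script attempts to detect the filesystem
+automatically):
+```
+    # stratagem-hp-convert.sh myfs1
+```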
+
+`stratagem-hp-teardown-client.sh` stops and unconfigures the Lustre
+client mount on the servers.
+
+## Permanently stopping hotpools
+
+If hotpools is to be stopped permanently or indefinitely then the
+changelog users registered for lamigo should be deregistered. This
+avoids increased changelog space consumption on the MDTs (up to about
+512GB) and client operations failing once the changelogs fill. This
+should be done for each MDT.
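+
+For example, a sketch of deregistering the lamigo changelog user for
+one MDT (run on the server where the MDT is mounted and substitute the
+user ID from the corresponding lamigo config file):
+```
+    # MDT=myfs1-MDT0042
+    # grep '^user=' /etc/lamigo/${MDT}.conf
+    user=cl23
+    # lctl --device $MDT changelog_deregister cl23
+```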
+
+## Restarting hotpools
+
+If hotpools was previously enabled and subsequently stopped then
+some maintenance around changelogs should be performed before
+re-enabling it. Remove any old '.chlg' files:
+```
+    # clush -a "find /var/lib -name 'lamigo-*.chlg' -delete"
+```
+For each MDT in $FS-MDT0000..$FS-MDTffff perform the following steps.
+
+1. Check that the changelog user specified in the lamigo config file
+agrees with the changelog user registered on the MDT:
+```
+    # MDT=myfs1-MDT0042
+    # grep '^user=' /etc/lamigo/${MDT}.conf
+    user=cl23
+    # clush -aN lctl get_param mdd.${MDT}.changelog_users 2> /dev/null
+    mdd.myfs1-MDT0042.changelog_users=
+    current index: 83896534
+    ID    index (idle seconds)
+    cl23  82748303 (481516)
+```
+If there are other changelog users registered then they may need to be
+removed. Please consult with the site admins to ensure that they are
+not in use by another changelog consumer (Robinhood, laudit,
+...). Other registered changelog users with high `idle seconds` values
+should be regarded with suspicion.
+
+The number of changelog entries stored on disk by the MDT is the
+difference between the "current index" and the minimum of the indexes
+of all registered users:
+```
+    83896534 - 82748303 = 1148231
+```
+2. Check the total changelog space consumption on the MDT:
+```
+    # clush -aN lctl get_param mdd.${MDT}.changelog_size 2> /dev/null
+    mdd.myfs1-MDT0042.changelog_size=161358368
+```
+A changelog entry consumes about 144 bytes on average, so this value
+looks about right:
+```
+    144 * 1148231 = 165345264
+```
+Contact L3 if there are no changelog users registered but
+`changelog_size` is reporting a large value or if `changelog_size` is much
+larger than the expected size.
+
+3. Clear old changelogs:
+
+If lamigo is the only changelog consumer and there are many unconsumed
+entries on the MDT then clear the changelog manually before restarting
+hotpools:
+```
+    # clush -aN lctl get_param mdd.${MDT}.changelog_users 2> /dev/null
+    mdd.myfs1-MDT0042.changelog_users=
+    current index: 83896534
+    ID    index (idle seconds)
+    cl23  82748303 (481516)
+    # CL=cl23
+    # lfs changelog_clear $MDT $CL 0
+```
+Clearing a large number of old changelog entries will take some
+time. (Do not interrupt it!) Changelogs on *different* MDTs may be
+cleared in parallel.
+
+The steps listed above should be repeated for each MDT!
+
+## Tuning lamigo (optimization)
+
+Consider increasing the number of agents or the number of concurrent
+jobs per agent.
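+
+A hypothetical sketch: raising the per-agent concurrency from 4 to 8
+by editing the third field of each `agent=` line in the lamigo config
+(lamigo reads its config at startup, so restart it afterwards as
+described in the online upgrade section above):
+```
+    # MDT=myfs1-MDT0042
+    # sed -i 's/^\(agent=.*\):4$/\1:8/' /etc/lamigo/${MDT}.conf
+    # grep '^agent=' /etc/lamigo/${MDT}.conf
+    agent=ai400x-0a12-vm00:/lustre/myfs1/client:8
+    agent=ai400x-0a12-vm01:/lustre/myfs1/client:8
+    ...
+```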
+
+## Tuning lpurge (optimization)
+
+Consider increasing the `max_jobs` config parameter from the default
+of 8.
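+
+To check the value currently in effect, use the stats dump described
+in the monitoring section below, then raise the corresponding setting
+in `/etc/lpurge/$FS/OSTXXXX.conf` and restart lpurge. A sketch (the
+exact key spelling should be verified against the existing config
+files):
+```
+    # OST=myfs1-OST0117
+    # pkill --exact --pidfile=/var/run/lpurge-${OST}.pid --signal=USR1 lpurge
+    # grep max_jobs /var/run/lpurge-${OST}.stats
+        max_jobs: 8
+```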
+
+## Reducing the changelog mask (optimization)
+
+If there are no changelog consumers other than lamigo then the
+changelog mask may be reduced from the default to include only the
+types needed by hotpools (`MARK UNLNK CLOSE`). Note the spelling of
+'UNLNK'. To see the current changelog mask run:
+```
+    # clush -aN lctl get_param "mdd.${FS}-*.changelog_mask"
+    mdd.myfs1-MDT0042.changelog_mask=
+    MARK CREAT MKDIR HLINK SLINK MKNOD UNLNK RMDIR RENME RNMTO CLOSE LYOUT TRUNC SATTR XATTR HSM MTIME CTIME MIGRT FLRW RESYNC 
+```
+To make a persistent setting run the following on the MGS:
+```
+    # lctl set_param -P "mdd.${FS}-*.changelog_mask"='MARK UNLNK CLOSE'
+```
+## Creating a nodemap for hotpools agents (optimization)
+
+As lamigo consumes changelogs (and OST access logs) it dispatches
+mirroring operations to agents, which in turn generate yet more
+changelog entries that lamigo must consume. This feedback loop is not
+fatal but does affect performance. It can be avoided by creating a
+nodemap containing the EXAScaler servers (and any configured
+agents) and setting `audit_mode=0` for that nodemap.
+
+(First consult site admins to determine if nodemaps are already in
+use. If so then any changes made must be consistent with current
+nodemap configuration.)
+
+To proceed collect the set of possible LNet NIDs for the hotpools
+agents. By default this will be the NIDs of the EXAScaler server VMs.
+```
+    # clush -aN lctl list_nids | sort
+    172.16.0.48@o2ib
+    172.16.0.49@o2ib
+    172.16.0.50@o2ib
+    172.16.0.51@o2ib
+    172.16.0.52@o2ib
+    172.16.0.53@o2ib
+    172.16.0.54@o2ib
+    172.16.0.55@o2ib
+```
+This list of NIDs must be converted to a comma-separated list or range
+expression. In this example:
+```
+    172.16.0.[48-55]@o2ib
+```
+(See https://github.com/DDNStorage/lustre_manual_markdown/blob/master/03.16-Mapping%20UIDs%20and%20GIDs%20with%20Nodemap.md for the full syntax.)
+To configure and enable the nodemap run the following on the MGS:
+```
+    # HP_NODEMAP=hp_agents
+    # HP_AGENT_NIDS="172.16.0.[48-55]@o2ib"
+    # lctl nodemap_modify --name default --property admin --value 1
+    # lctl nodemap_modify --name default --property trusted --value 1
+    # lctl nodemap_add $HP_NODEMAP
+    # lctl nodemap_modify --name $HP_NODEMAP --property admin --value 1
+    # lctl nodemap_modify --name $HP_NODEMAP --property trusted --value 1
+    # lctl nodemap_modify --name $HP_NODEMAP --property audit_mode --value 0
+    # lctl nodemap_add_range --name $HP_NODEMAP --range "$HP_AGENT_NIDS"
+    # lctl nodemap_add_range --name $HP_NODEMAP --range 0@lo
+    # lctl nodemap_activate 1
+```
+
+## Monitoring lamigo and lpurge
+
+There are several ways to monitor hotpools beyond checking the
+utilization of the fast pool. lamigo maintains several counters and
+statistics which it will write out on receiving `SIGUSR1`. The following
+commands were run on the VM where `${MDT}` is mounted.
+```
+    # MDT=myfs1-MDT0042
+    # cat /var/run/lamigo-${MDT}.pid
+    20223
+    # ps -Fp $(cat /var/run/lamigo-${MDT}.pid)
+    UID        PID  PPID  C    SZ   RSS PSR STIME TTY          TIME CMD
+    root     14434  9331  0 71699  5120   3 Apr20 ?        00:59:16 /usr/sbin/lamigo -f /etc/lamigo/myfs1-MDT0042.conf
+    # pkill --exact --pidfile=/var/run/lamigo-${MDT}.pid --signal=USR1 lamigo
+    # sleep 1
+    # cat /var/run/lamigo-${MDT}.stats
+    version: 1.16-1
+    revision: da7d99c2761e0f294a3e0a150ef59972bd04bc89
+    config:
+        chlg_user: cl23
+        mdtname: myfs1-MDT0042
+        mountpoint: /lustre/myfs1/client
+        source_pool: ddn_ssd
+        target_pool: ddn_hdd
+        min_age: 600
+        max_cache: 268435456
+        rescan: 0
+        thread_count: 1
+        pool_refresh: 600
+        progress_interval: 600
+        periods: 16
+        period_time: 10
+        warmup_k: 20
+        cooldown_k: 15
+        ofd_interval: 5
+        hot_fraction: 20
+        hot_after_idle: 3
+        src_free: 70
+        tgt_free: 10
+    pool ddn_ssd:
+        osts: 8
+        avail: 22974591116
+        total: 31824761344
+        open: 1
+    pool ddn_hdd:
+        osts: 2
+        avail: 436791025224
+        total: 494282835936
+        open: 1
+    stats:
+        read: 6188174
+        skipped: 369235
+        processed: 5818939
+        removed: 0
+        dups: 311924
+        spawned: 7111691
+        replicated: 4429160
+        busy: 0
+        queued: 1226018
+        skip_hot: 0
+        ro2hot: 0
+        rw2hot: 0
+        rw2cold: 3382984
+    changelog:
+        last_processed_idx: 84139503
+        last_cleared_idx: 82909792
+        last_cleared_time: 1619120075 # ( Thu Apr 22 19:34:35 2021 )
+    agents:
+        agent3: { host: ai400x-0a12-vm03, mnt: /lustre/myfs1/client, jobs: 4, max: 4, state: active }
+        agent2: { host: ai400x-0a12-vm02, mnt: /lustre/myfs1/client, jobs: 3, max: 4, state: active }
+        ...
+    jobs:
+        job0: { tid: 140569798792960, fid: [0x2000079ac:0xfb4e:0x0], index: 0, agent: 3, start: 1619120075, command: extend }
+        job1: { tid: 140568524986112, fid: [0x2000079ac:0xfb02:0x0], index: 0, agent: 3, start: 1619120075, command: extend }
+        job2: { tid: 140569807185664, fid: [0x2000079ac:0xfaff:0x0], index: 0, agent: 3, start: 1619120075, command: extend }
+        ...
+```
+Note that the difference between the MDT's current changelog index and
+the last index cleared by lamigo indicates how well lamigo's
+consumption of changelog entries is keeping up with the production of
+new entries (both values can be seen using the `changelog_users`
+parameter as shown above). A 10-minute (min-age) lag between reading
+and clearing is normal.
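+
+For example, using the values shown above (taken from two snapshots
+that were not captured at exactly the same moment, so the result is
+only approximate):
+```
+    current index (changelog_users): 83896534
+    last_cleared_idx (lamigo stats): 82909792
+    unconsumed backlog:              83896534 - 82909792 = 986742
+```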
+
+Similarly for lpurge, the following commands were run on the VM where
+`${OST}` is mounted:
+```
+    # OST=myfs1-OST0117
+    # cat /var/run/lpurge-${OST}.pid
+    20222
+    # ps -Fp $(cat /var/run/lpurge-${OST}.pid)
+    UID      PID  PPID  C     SZ   RSS PSR STIME TTY     TIME CMD
+    root   20222     1  6 171957 13352  16 Apr20   ? 03:09:06 /usr/sbin/lpurge -f /etc/lpurge/myfs1/OST0117.conf
+    # pkill --exact --pidfile=/var/run/lpurge-${OST}.pid --signal=USR1 lpurge
+    # sleep 1
+    # cat /var/run/lpurge-${OST}.stats
+    version: 1.16-1
+    revision: b465db8b9b99e217d175f31230d04e10a9a17906
+    config:
+        free_high: 80
+        free_low: 50
+        ostname: myfs1-OST0117
+        mountpoint: /lustre/myfs1/client
+        pool: ddn_ssd
+        max_jobs: 8
+        check_interval: 30
+        scan_rate: 10000
+        scan_threads: 1
+        slot_size: 1048576
+    stats:
+        scans: 10
+        scan_time: 346
+        slow_scans: 5
+        queued: 3210711
+        started: 3202330
+        purged: 363985
+        failed: 2838337
+        low_hits: 6
+    space:
+        kbfree: 2997655148
+        low: 1989047584
+        hi: 3182476134
+    hlists:
+```
+
+# Known issues
+
+## If lamigo is falling behind on changelog consumption
+
+Append a `rescan` directive to the lamigo configuration file,
+manually clear the changelog, and then restart lamigo:
+```
+    # MDT=myfs1-MDT0042
+    # echo rescan >> /etc/lamigo/${MDT}.conf
+    # CL=...
+    # lfs changelog_clear $MDT $CL 0
+    # pkill --exact --pidfile=/var/run/lamigo-${MDT}.pid lamigo
+```
+
+## lamigo SSH errors
+
+If lamigo reports
+```
+    lamigo: Error authenticating pubkey: Failed to read private key: /root/.ssh/id_rsa
+```
+or
+```
+    lamigo: cannot authenticate SSH session to host HOST: ...
+```
+then first check that `ssh` works between the affected hosts. If so
+then check that the private key file (usually `/root/.ssh/id_rsa`) is
+in PEM format. If the key file starts with
+```
+    -----BEGIN RSA PRIVATE KEY-----
+```
+then it is likely in PEM format. If, instead, it starts with
+```
+    -----BEGIN OPENSSH PRIVATE KEY-----
+```
+then it is in the newer OpenSSH format. To generate a key in PEM
+format, add `-m PEM` to the `ssh-keygen` command line.
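+
+For example, one way to convert an existing, passphrase-less key to
+PEM format in place (a sketch; back up the key first):
+```
+    # cp -a /root/.ssh/id_rsa /root/.ssh/id_rsa.bak
+    # ssh-keygen -p -m PEM -N '' -f /root/.ssh/id_rsa
+```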
+
+Second, ensure that the key file does not contain any comments (lines
+starting with `#`).
+
+## Working around HA/Pacemaker/STONITH when stopping hotpools
+
+Formerly "stratagem-hp-stop-no-more-tears.txt". These instructions are
+obsolete after converting hotpools to a ticketed HA configuration.
+```
+# Temporary process to stop hotpools in a production environment
+# without invoking HA STONITH due to busy client mount points.
+#
+# XXX This is not an automated process. You need to run each command in
+# turn and look at the output. Maybe even understand it. Sorry.
+#
+# See https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/high_availability_add-on_reference/s1-managedresource-haar
+# for more information.
+#
+# Author: John L. Hammond <jhammond@whamcloud.com>
+
+# Put all lamigo and lpurge HA resources in unmanaged mode.
+crm resource unmanage 'lamigo*'
+crm resource unmanage 'lpurge*'
+
+# In the output of hastatus the lamigo and lpurge resources should now
+# show up as "Started (unmanaged)"
+hastatus
+
+# Use systemctl to actually stop the services corresponding to the HA
+# resources you just unmanaged.
+clush -a systemctl stop 'lamigo@*'
+clush -a systemctl stop 'lpurge@*'
+
+# In the output of hastatus the lamigo and lpurge resources should now show up as "Stopped (unmanaged)"
+hastatus
+
+# Verify that the processes have actually terminated.
+clush -a ps -FC lamigo,lpurge
+
+# Wait for any mirroring operations to drain from the client mounts.
+clush -a lsof /lustre/$FS/client/
+clush -a ps -FC lfs
+
+# Once they have drained you can use the stop script.
+stratagem-hp-stop.sh
+
+# To restart:
+crm resource manage 'lamigo*'
+crm resource manage 'lpurge*'
+stratagem-hp-start.sh
+
+# Verify that things are back to normal:
+hastatus
+
+# After all of this 'hastatus' may show some 'Failed Resource Actions'. For example:
+#
+# Failed Resource Actions:
+# * lpurge-fs0a12-OST0004_monitor_30000 on ai400x-0a12-vm02 'not running' (7): call=316, status=complete, exitreason='',
+#     last-rc-change='Thu Apr 8 18:01:57 2021', queued=0ms, exec=0ms
+#
+# To clean these up do 'crm resource cleanup lpurge-fs0a12-OST0004'
+
+```
diff --git a/lipe-revision.sh b/lipe-revision.sh
new file mode 100755 (executable)
index 0000000..32933ff
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Try to get the current revision (commit hash). Don't fail.
+
+function get_revision {
+       local src="$1"
+       local val="$2"
+
+       if [[ -n "${val}" ]]; then
+               echo "$0: using '${src}' = '${val}'" >&2
+               printf "%s\n" "${val}"
+               exit 0
+       fi
+}
+
+# Manual override.
+get_revision LIPE_REVISION "${LIPE_REVISION:-}"
+
+# Release build on https://es-build.datadirectnet.com/
+get_revision ES_BUILD_LUSTRE_REVISION "${ES_BUILD_LUSTRE_REVISION:-}"
+
+# Reviews build from Gerrit.
+get_revision GERRIT_PATCHSET_REVISION "${GERRIT_PATCHSET_REVISION:-}"
+
+# Build from source.
+GIT_REVISION="$(git rev-parse HEAD 2> /dev/null)"
+get_revision 'git rev-parse HEAD' "${GIT_REVISION:-}"
+
+printf "None\n"
index 69c9a4b..a18d53a 100644 (file)
@@ -21,7 +21,7 @@ Prefix: %{_prefix}
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 
 Release: @LIPE_RELEASE@%{?dist}
-
+VCS: @LIPE_REVISION@
 Summary: lipe - Lustre Integrated Policy Engine
 License: All rights reserved DataDirect Networks Inc.
 Group: Applications/System
@@ -328,13 +328,13 @@ cp -a example_configs/hotpool/* $RPM_BUILD_ROOT%{_sysconfdir}/
        install -m 0744 -D init.d/lipe_test_scheduler \
                $RPM_BUILD_ROOT%{_sysconfdir}/rc.d/init.d/lipe_test_scheduler
 %endif
-install -g 0 -o 0 -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/
 %if %{with laudit}
-install -g 0 -o 0 -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/laudit.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/
+install -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/laudit.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/
 %endif
 cp -a lhsm_tests $RPM_BUILD_ROOT%{_libdir}/
 
@@ -446,15 +446,15 @@ rm -rf $RPM_BUILD_ROOT
 
 
 %changelog
-* Thu Jun 30 2020 Gu Zheng <gzheng@ddn.com> 1.5
+* Tue Jun 30 2020 Gu Zheng <gzheng@ddn.com> [1.5]
 - Update version to 1.5
-* Thu Apr 3 2019 Gu Zheng <gzheng@ddn.com> 1.4
+* Wed Apr 03 2019 Gu Zheng <gzheng@ddn.com> [1.4]
 - Devide RPM by function
-* Thu Jan 3 2019 Gu Zheng <gzheng@ddn.com> 1.4
+* Thu Jan 03 2019 Gu Zheng <gzheng@ddn.com> [1.4]
 - Add pyltest RPM
-* Wed Mar 7 2018 Sebastien Buisson <sbuisson@ddn.com> 1.1
+* Wed Mar 07 2018 Sebastien Buisson <sbuisson@ddn.com> [1.1]
 - Add laudit and laudit-report
-* Mon Oct 11 2017 Li Xi <lixi@ddn.com> 1.0
+* Wed Oct 11 2017 Li Xi <lixi@ddn.com> [1.0]
 - Add license-server RPM
-* Mon Feb 27 2017 Qian Yingjin <qian@ddn.com> 0.1
+* Mon Feb 27 2017 Qian Yingjin <qian@ddn.com> [0.1]
 - Initial import
index 8e1e963..67db1d4 100755 (executable)
@@ -1,39 +1,82 @@
 #!/bin/bash
-# Copyright DDN 2020
 # -*- indent-tabs-mode: nil -*-
+# Copyright DDN 2021
+#
+# Pacemaker Ticket:
+# $FS-hotpool-allocated (controlled by resource $FS-hotpool)
+#
+# Pacemaker Resources:
+# $FS-hotpool (controls $FS-hotpool-allocated ticket)
+# cl-$FS-client (cloned client mount)
+# lamigo-$FS-MDTXXXX (for each MDT)
+# lpurge-$FS-OSTXXXX (for each OST in fast pool)
+#
+# Files Created
+# /etc/lamigo/$FS-MDTXXXX.conf (for each MDT)
+# /etc/lpurge/$FS/OSTXXXX.conf (for each OST in fast pool)
+# /etc/fstab updated with client mount line
+#
+# Lustre Changes
+# changelog user created for each MDT
 
 # CONFIGURATION VARIABLES
 
-# ASSUMES last configlog user is for lamigo, or creates one if one doesn't exist
-
-LAMIGO_MINAGE=600
-LPURGE_FREEHI=80
-LPURGE_FREELO=50
-FAST_POOL=ddn_ssd
-SLOW_POOL=ddn_hdd
+DEF_LAMIGO_MINAGE=600
+DEF_LPURGE_FREEHI=80
+DEF_LPURGE_FREELO=50
+DEF_FAST_POOL=ddn_ssd
+DEF_SLOW_POOL=ddn_hdd
 
 function usage()
 {
-    echo "Usage: $(basename $0) -m MINAGE -H FREEHI -L FREELO -f FASTPOOL -s SLOWPOOL [ FILESYSTEM ]"
-    exit 0
+    local rc=${1:-0}
+    local values=${2:-1}
+    echo "Usage: $(basename $0) [-h] [-n] [-m MINAGE -H FREEHI -L FREELO -f FASTPOOL -s SLOWPOOL] [ FILESYSTEM ]"
+    echo " Automatically configures Stratagem Hotpools in a existing HA environment"
+    echo "  -n - No auto-detect value"
+    echo "  -h - This help message"
+    echo "  -m - lamigo minimum age"
+    echo "  -H - lpurge free-hi percent"
+    echo "  -L - lpurge free-lo percent"
+    echo "  -f - Fast OST pool"
+    echo "  -s - Slow OST pool"
+    if (( values == 1 )); then
+        echo "Values:"
+        echo " Filesystem    : $FS"
+        echo " Fast OST Pool : $FAST_POOL"
+        echo " Slow OST Pool : $SLOW_POOL"
+        echo " LAmigo MinAge : $LAMIGO_MINAGE"
+        echo " LPurge FreeHI : ${LPURGE_FREEHI:-${DEF_LPURGE_FREEHI}}"
+        echo " LPurge FreeLO : ${LPURGE_FREELO:-${DEF_LPURGE_FREELO}}"
+    fi
+    exit $rc
 }
 
-while getopts "hm:H:L:f:s:" opt; do
+HELP=false
+AUTO=true
+
+while getopts "hm:H:L:f:s:n" opt; do
     case "$opt" in
-       m) LAMIGO_MINAGE=$OPTARG ;;
-       H) LPURGE_FREEHI=$OPTARG ;;
-       L) LPURGE_FREELO=$OPTARG ;;
-       f) FAST_POOL=$OPTARG ;;
-       s) SLOW_POOL=$OPTARG ;;
-       h) usage ;;
+        m) LAMIGO_MINAGE=$OPTARG ;;
+        H) LPURGE_FREEHI=$OPTARG ;;
+        L) LPURGE_FREELO=$OPTARG ;;
+        f) FAST_POOL=$OPTARG ;;
+        s) SLOW_POOL=$OPTARG ;;
+        n) AUTO=false ;;
+        h) HELP=true ;;
     esac
 done
 shift $((OPTIND-1))
 
+FS=$1
+
 # Location of /lustre/$FS/<SERVER> mounts and /lustre/$FS/client mount
 ROOT=/lustre
 
-FS=$1
+function locate_resource() {
+    local res=$1
+    clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true"
+}
 
 function is_valid_percent {
     local p="${1:-}"
@@ -45,62 +88,95 @@ function is_minage_ok {
     [[ "${p}" =~ ^[[:digit:]]+$ ]] && ((p >= 5))
 }
 
-if ! is_valid_percent "${LPURGE_FREEHI}"; then
-    echo "Invalid FREEHI percentage '${LPURGE_FREEHI}'"
-    exit 1
-fi
-
-if ! is_valid_percent "${LPURGE_FREELO}"; then
-    echo "Invalid FREELO percentage '${LPURGE_FREELO}'"
-    exit 1
-fi
-
-if ! ((LPURGE_FREELO < LPURGE_FREEHI)); then
-    echo "FREELO (${LPURGE_FREELO}) must be less than FREEHI (${LPURGE_FREEHI})"
-    exit 1
-fi
-
-if ! is_minage_ok "${LAMIGO_MINAGE}"; then
-    echo "MINAGE (${LAMIGO_MINAGE}) must be positive integer >= 5"
-    exit 1
-fi
-
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|sort -u)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
-        echo "Usage: $0 FSNAME"
-        exit 1
+        echo "Error: Unable to determine filesystem automatically"
+        usage 22 0
     fi
 fi
 
-# Script from here down
-function locate_resource() {
-    res=$1
-    clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true"
-}
+if $AUTO && [ ! -f /etc/lamigo/$FS-MDT0000.conf ]; then
+    echo "Disabling auto-detection of settings ($FS-MDT0000 config missing)"
+    AUTO=false
+fi
 
 MDSHOST=$(locate_resource mdt0000-$FS)
 if [ -z "$MDSHOST" ]; then
     # if FS is not a resident filesystem, mdt0 will not exist for it
     echo Failed to find mdt0 for filesystem: $FS
-    exit 1
+    exit 5
+fi
+
+# reuse of changelog user handled below
+# get lpurge options - this gets the first OST from the FAST pool
+if $AUTO; then
+    AUTO_FAST_POOL=$(awk -F = '/^src=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
+    AUTO_SLOW_POOL=$(awk -F = '/^tgt=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
+    AUTO_LAMIGO_MINAGE=$(awk -F = '/^min-age=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
 fi
 
+# Set values: use the explicit option if given, else the auto-detected value, else the default
+
+LAMIGO_MINAGE=${LAMIGO_MINAGE:-${AUTO_LAMIGO_MINAGE:-${DEF_LAMIGO_MINAGE}}}
+FAST_POOL=${FAST_POOL:-${AUTO_FAST_POOL:-${DEF_FAST_POOL}}}
+SLOW_POOL=${SLOW_POOL:-${AUTO_SLOW_POOL:-${DEF_SLOW_POOL}}}
+
 # List of %04x formatted OST indexes
-FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -e 's/.*OST\(....\).*/\1/p;d')
+FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p')
 if [ -z "$FAST_OSTLIST" ]; then
     echo "Failed to find OSTs in Fast pool ($FS.$FAST_POOL)"
-    exit 1
+    usage 2
 fi
 
-SLOW_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$SLOW_POOL|sed -e 's/.*OST\(....\).*/\1/p;d')
+SLOW_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$SLOW_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p')
 if [ -z "$SLOW_OSTLIST" ]; then
     echo "Failed to find OSTs in Slow pool ($FS.$SLOW_POOL)"
-    exit 1
+    usage 2
+fi
+
+if $AUTO; then
+    for OST in $FAST_OSTLIST; do
+       F=/etc/lpurge/$FS/OST${OST}.conf
+       if [ -f $F ]; then
+           AUTO_LPURGE_FREEHI=$(awk -F = '/^freehi=/{ print $2 }' $F)
+           AUTO_LPURGE_FREELO=$(awk -F = '/^freelo=/{ print $2 }' $F)
+           break
+       fi
+    done
+fi
+
+LPURGE_FREEHI=${LPURGE_FREEHI:-${AUTO_LPURGE_FREEHI:-${DEF_LPURGE_FREEHI}}}
+LPURGE_FREELO=${LPURGE_FREELO:-${AUTO_LPURGE_FREELO:-${DEF_LPURGE_FREELO}}}
+
+if ! is_valid_percent "${LPURGE_FREEHI}"; then
+    echo "Invalid FREEHI percentage '${LPURGE_FREEHI}'"
+    usage 34
+fi
+
+if ! is_valid_percent "${LPURGE_FREELO}"; then
+    echo "Invalid FREELO percentage '${LPURGE_FREELO}'"
+    usage 34
 fi
 
+if ! ((LPURGE_FREELO < LPURGE_FREEHI)); then
+    echo "FREELO (${LPURGE_FREELO}) must be less than FREEHI (${LPURGE_FREEHI})"
+    usage 34
+fi
+
+if ! is_minage_ok "${LAMIGO_MINAGE}"; then
+    echo "MINAGE (${LAMIGO_MINAGE}) must be positive integer >= 5"
+    usage 34
+fi
+
+if $HELP; then
+    usage 0
+fi
+
+echo "Using: $0 -m ${LAMIGO_MINAGE} -H ${LPURGE_FREEHI} -L ${LPURGE_FREELO} -f ${FAST_POOL} -s ${SLOW_POOL} $FS"
+
 OSSLIST=$(es_config_get --option fs_settings.$FS.oss_list)
 
 # Die on errors
@@ -108,32 +184,44 @@ set -e
 
 MOUNTSPEC=$(/opt/ddn/es/tools/mount_lustre_client --dry-run --fs $FS |awk '{ print $4 }')
 
-echo "Creating config directories"
-clush -a mkdir -p /etc/lpurge/$FS/ /etc/lamigo/ $ROOT/$FS/client
+# Create client mount if it doesn't exist
+if ! crm_resource -QW -r cl-$FS-client > /dev/null 2>&1; then
 
-echo "Creating lustre client mounts for $FS"
-clush -a "grep -q $FS/client /etc/fstab || (echo $MOUNTSPEC $ROOT/$FS/client lustre x-systemd.mount-timeout=20m,noauto,_netdev >> /etc/fstab)"
-clush -a mkdir -p /etc/systemd/system/lustre-$FS-client.mount.d/
-cat << EOF > /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
+    echo "Creating config directories"
+    clush -a mkdir -p /etc/lpurge/$FS/ /etc/lamigo/ $ROOT/$FS/client
+
+    echo "Creating lustre client mounts for $FS"
+    clush -a "grep -q $FS/client /etc/fstab || (echo $MOUNTSPEC $ROOT/$FS/client lustre x-systemd.mount-timeout=20m,noauto,_netdev >> /etc/fstab)"
+    clush -a mkdir -p /etc/systemd/system/lustre-$FS-client.mount.d/
+    cat << EOF > /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
 [Mount]
 TimeoutSec=20m
 EOF
-clush -ca /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
-clush -a systemctl daemon-reload
-echo "Creating lustre client resource agent"
-clush -g ha_heads crm configure <<EOF
-primitive $FS-client systemd:lustre-$FS-client.mount op start timeout=900s op monitor interval=60s meta target-role=Stopped
+    clush -ca /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
+    clush -a systemctl daemon-reload
+    echo "Creating lustre client resource agent"
+    clush -g ha_heads crm configure <<EOF
+primitive $FS-client systemd:lustre-$FS-client.mount meta target-role=Stopped op start timeout=900s op monitor interval=60s op stop timeout=900s
 clone cl-$FS-client $FS-client
 rsc_ticket ticket-$FS-allocated-client $FS-allocated: cl-$FS-client loss-policy=stop
 EOF
 
-# create order constraints on all lustre-server template types (mdt or ost)
-clush -g ha_heads "cibadmin -e --query --xpath '//template[@type=\"lustre-server\"]'|awk -F \"'\" '/lustre-$FS-/{ print \$2 }'|awk -F - '{ print \"order $FS-client-after-\"\$3\" 0: lustre-$FS-\"\$3\":start cl-$FS-client:start\" }'|crm configure"
+    # create order constraint for client on all lustre-server template types (mdt or ost)
+    clush -g ha_heads "cibadmin -e --query --xpath '//template[@type=\"lustre-server\"]'|awk -F \"'\" '/lustre-$FS-/{ print \$2 }'|awk -F - '{ print \"order $FS-client-after-\"\$3\" 0: lustre-$FS-\"\$3\":start cl-$FS-client:start\" }'|crm configure"
 
-ssh $MDSHOST crm configure <<EOF
+    ssh $MDSHOST crm configure <<EOF
 order $FS-client-after-mgs 0: mgs:start cl-$FS-client:start
 EOF
 
+fi # ! crm_resource client
+
+# ticket
+echo "Creating Hotpool ticket for $FS"
+clush -g ha_heads crm configure <<EOF
+primitive $FS-hotpool ocf:ddn:Ticketer params name=$FS-hotpool-allocated meta allow-migrate=true priority=199 target-role=Stopped op start interval=0 timeout=30 op monitor interval=30 timeout=30 op stop interval=0 timeout=30
+order $FS-hotpool-after-cl-$FS-client 0: cl-$FS-client $FS-hotpool
+EOF
+
 echo "Configuring OFD access logs"
 MGSHOST=$(locate_resource mgs)
 ssh $MGSHOST lctl set_param -P "obdfilter.${FS}-*.access_log_size=1048576"
@@ -143,7 +231,7 @@ echo "Setting up lpurge (HI:$LPURGE_FREEHI to LO:$LPURGE_FREELO)"
 # this results in MDTLIST being a list of MDT indexes in format "%04d"
 # This format is problematic for values greater than 0007, since by
 # default bash will interpret it as octal
-MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed "s/mdt\(.*\)-$FS$/\1/p;d")
+MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed -ne "s/mdt\(.*\)-$FS$/\1/p")
 
 for OST in $FAST_OSTLIST; do
     INDEX=$(printf "%04d" 0x$OST)
@@ -169,11 +257,11 @@ EOF
     clush -ca /etc/lpurge/$FS/OST$OST.conf
 
     echo Creating lpurge resource for $FS-OST$OST
-    clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2&>1 && crm configure << EOF || true
+    clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
 primitive lpurge-$FS-OST$OST systemd:lpurge@$FS-OST$OST.service op monitor interval=30s
 colocation lpurge-$FS-$OST-with-ost inf: lpurge-$FS-OST$OST ost$INDEX-$FS
 order lpurge-$FS-$OST-after-ost ost$INDEX-$FS lpurge-$FS-OST$OST
-order lpurge-$FS-$OST-after-client cl-$FS-client lpurge-$FS-OST$OST
+rsc_ticket ticket-$FS-hotpool-allocated-lpurge-$FS-OST$OST $FS-hotpool-allocated: lpurge-$FS-OST$OST loss-policy=stop
 EOF"
 done
 
@@ -183,10 +271,21 @@ echo "Setting up lamigo (From $FAST_POOL to $SLOW_POOL aged at least $LAMIGO_MIN
 for INDEX in $MDTLIST; do
     MDT=$(printf "%04x" $((10#$INDEX)) )
     MDSHOST=$(locate_resource mdt$INDEX-$FS)
-    CL=$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|tail -1|awk '/^cl/{ print $1 }')
+    CL=""
+
+    # reuse existing changelog user
+    if [ -f /etc/lamigo/$FS-MDT$MDT.conf ]; then
+        CL=$(awk -F = '/^user=/{ print $2 }' /etc/lamigo/$FS-MDT$MDT.conf)
+        # ensure changelog user exists
+        if [ -z "$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|grep $CL[[:space:]])" ]; then
+            echo "Previous changelog user for $FS-MDT$MDT: $CL is missing"
+            CL=""
+        fi
+    fi
+    # create new user if none was previously configured
     if [ -z "$CL" ]; then
-        ssh $MDSHOST lctl --device $FS-MDT$MDT changelog_register
-        CL=$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|tail -1|awk '/^cl/{ print $1 }')
+        CL=$(ssh $MDSHOST lctl --device $FS-MDT$MDT changelog_register | awk -F \' '{print $2 }')
+        echo "Registered new changelog user for $FS-MDT$MDT: $CL"
     fi
 
     echo Creating lamigo config for $FS-MDT$MDT
@@ -199,7 +298,7 @@ src=$FAST_POOL
 tgt=$SLOW_POOL
 EOF
     for HOST in $OSSLIST; do
-       echo oss=$HOST >> /etc/lamigo/$FS-MDT$MDT.conf
+        echo oss=$HOST >> /etc/lamigo/$FS-MDT$MDT.conf
     done
     for HOST in $(cluset -e @all); do
         echo agent=$HOST:$ROOT/$FS/client:4 >> /etc/lamigo/$FS-MDT$MDT.conf
@@ -214,15 +313,13 @@ EOF
     clush -ca /etc/lamigo/$FS-MDT$MDT.conf /etc/systemd/system/lamigo@$FS-MDT$MDT.service.d/override.conf
 
     echo Creating lamigo resource for $FS-MDT$MDT
-    clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2&>1 && crm configure << EOF || true
+    clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
 primitive lamigo-$FS-MDT$MDT systemd:lamigo@$FS-MDT$MDT.service op start timeout=15m op monitor interval=30s
 colocation lamigo-$FS-MDT$MDT-with-mdt inf: lamigo-$FS-MDT$MDT mdt$INDEX-$FS
 order lamigo-$FS-$MDT-after-mdt mdt$INDEX-$FS lamigo-$FS-MDT$MDT
-order lamigo-$FS-$MDT-after-client cl-$FS-client lamigo-$FS-MDT$MDT
+rsc_ticket ticket-$FS-hotpool-allocated-lamigo-$FS-MDT$MDT $FS-hotpool-allocated: lamigo-$FS-MDT$MDT loss-policy=stop
 EOF"
 done
 
 echo "Configuration complete.  Run the following command to start services:"
-echo "  clush -qS --group=ha_heads crm resource start cl-$FS-client"
-echo "or"
 echo "  stratagem-hp-start.sh $FS"
diff --git a/scripts/stratagem-hp-convert.sh b/scripts/stratagem-hp-convert.sh
new file mode 100755 (executable)
index 0000000..2e342a9
--- /dev/null
@@ -0,0 +1,76 @@
+#!/bin/bash
+# -*- indent-tabs-mode: nil -*-
+# Copyright DDN 2021
+
+FS=$1
+
+function usage() {
+    echo "Usage: $(basename $0) [-h] [FILESYSTEM]"
+    echo "  Convert Stratagem Hotpools HA environment to be ticket based"
+}
+
+if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
+function locate_resource() {
+    local res=$1
+    clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true"
+}
+
+if [ -z "$FS" ]; then
+    # Try automatic detection
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
+
+    if [ $(echo "$FS" |wc -w) -ne 1 ]; then
+        echo "Error: Unable to determine filesystem automatically, please specify"
+        usage
+        exit 1
+    fi
+fi
+
+if crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    echo "Hotpools for $FS already using ticket"
+    exit 0
+fi
+
+MDSHOST=$(locate_resource mdt0000-$FS)
+MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed -ne "s/mdt\(.*\)-$FS$/\1/p")
+FAST_POOL=$(awk -F = '/^src=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
+FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p')
+
+ROLE=Started
+if crm_resource -r $FS-client -W |& grep -q NOT; then
+    ROLE=Stopped
+fi
+
+echo "Creating hotpool ticket for $FS"
+clush -g ha_heads crm configure <<EOF
+primitive $FS-hotpool ocf:ddn:Ticketer params name=$FS-hotpool-allocated meta allow-migrate=true priority=199 target-role=$ROLE op start interval=0 timeout=30 op monitor interval=30 timeout=30 op stop interval=0 timeout=30
+order $FS-hotpool-after-cl-$FS-client 0: cl-$FS-client $FS-hotpool
+EOF
+
+for OST in $FAST_OSTLIST; do
+    INDEX=$(printf "%04d" 0x$OST)
+    echo Updating lpurge config for $FS-OST$OST
+
+    echo "Updating lpurge HA for $FS-OST$OST"
+
+    clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
+delete lpurge-$FS-$OST-after-client
+rsc_ticket ticket-$FS-hotpool-allocated-lpurge-$FS-OST$OST $FS-hotpool-allocated: lpurge-$FS-OST$OST loss-policy=stop
+EOF"
+done
+
+# MDTLIST is derived from resources so it's already in decimal but formatted %04d
+for INDEX in $MDTLIST; do
+    MDT=$(printf "%04x" $((10#$INDEX)) )
+
+    echo "Updating lamigo HA for $FS-MDT$MDT"
+
+    clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
+delete lamigo-$FS-$MDT-after-client
+rsc_ticket ticket-$FS-hotpool-allocated-lamigo-$FS-MDT$MDT $FS-hotpool-allocated: lamigo-$FS-MDT$MDT loss-policy=stop
+EOF"
+done
index 0c996f0..214262d 100755 (executable)
@@ -4,15 +4,37 @@
 
 FS=$1
 
+function usage() {
+    echo "Usage: $(basename $0) [-h] [FILESYSTEM]"
+    echo "  Start Stratagem Hotpools HA environment"
+}
+
+if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
-        echo "Usage: $0 FSNAME"
+       echo "Error: Could not automatically find filesystem, please specify"
+        usage
         exit 1
     fi
 fi
 
+if ! crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    if clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then
+       echo "Hotpools needs conversion"
+       echo "  Please run: stratagem-hp-convert.sh"
+    else
+       echo "Hotpools not configured"
+       echo "  Please run: stratagem-hp-config.sh"
+    fi
+    exit 1
+fi
+
 echo Starting Hotpools for $FS
-clush -qS --group=ha_heads crm res start cl-$FS-client
+clush -qS --group=ha_heads crm res start cl-$FS-client $FS-hotpool
index 1f2c7bd..dba265b 100755 (executable)
@@ -4,15 +4,37 @@
 
 FS=$1
 
+function usage() {
+    echo "Usage: $(basename $0) [-h] [FILESYSTEM]"
+    echo "  Stop stratagem hotpools in HA environment"
+}
+
+if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
-        echo "Usage: $0 FSNAME"
+       echo "Error: Could not automatically find filesystem, please specify"
+        usage
         exit 1
     fi
 fi
 
-echo Stopping Hotpools for $FS
-clush -qS --group=ha_heads crm res stop cl-$FS-client
+if crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    echo Stopping Hotpools for $FS
+    clush -qS --group=ha_heads crm res stop $FS-hotpool
+
+elif clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then
+    echo "Stopping old style Hotpools (see stratagem-hp-convert.sh)"
+    clush -qS -g ha_heads crm res stop cl-$FS-client
+
+else
+    echo "Hotpools not configured"
+    echo "  Please run: stratagem-hp-config.sh"
+    exit 1
+fi
diff --git a/scripts/stratagem-hp-teardown-client.sh b/scripts/stratagem-hp-teardown-client.sh
new file mode 100755 (executable)
index 0000000..b579878
--- /dev/null
@@ -0,0 +1,90 @@
+#!/bin/bash
+# -*- indent-tabs-mode: nil -*-
+# Copyright DDN 2021
+#
+# Stops and removes the cloned client mount
+# Removes:
+# resource $FS-client and its clone cl-$FS-client
+# client mount from /etc/fstab
+
+FS=$1
+
+# Set a 10+ minute timeout for waitfor()
+TIMEOUT=${TIMEOUT:-600}
+
+function usage() {
+    echo "Usage: $(basename $0) [-h] [-t SECS] [FILESYSTEM]"
+    echo "  Tear down server client mount HA environment"
+    echo "   -t SECS - Max timeout to wait for services to stop (default: $TIMEOUT)"
+}
+
+while getopts "ht:" opt; do
+    case $opt in
+        h) usage; exit 0;;
+        t) TIMEOUT=$OPTARG;;
+    esac
+done
+
+if [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
+if [ -z "$FS" ]; then
+    # Try automatic detection
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
+
+    if [ $(echo "$FS" |wc -w) -ne 1 ]; then
+        echo "Usage: $0 FSNAME"
+        exit 1
+    fi
+fi
+
+waitfor () {
+    local ids=$*
+
+    [ -z "$ids" ] && return
+
+    echo -n "Waiting for $ids"
+
+    local deadline=$((SECONDS+TIMEOUT))
+
+    local done=false
+    until $done || ((deadline < SECONDS)); do
+
+        done=true
+        for i in $ids; do
+            # This finds the host the resource is active on (empty it is stopped)
+            res=$(clush -qg ha_heads crm_resource -QW -r $i 2> /dev/null)
+            if [ -n "$res" ]; then
+                done=false
+                echo -n "."
+                sleep 1
+                break
+            fi
+        done
+    done
+
+    echo ""
+
+    if ! $done; then
+        echo "Waiting for $ids TIMED OUT!"
+        exit 1
+    fi
+}
+
+res=$(clush -qNg ha_heads crm_resource -l 2> /dev/null|grep lamigo-$FS-MDT0000)
+if [ -n "$res" ]; then
+    echo "ERROR: Please tear down Hotpools first"
+    exit 1
+fi
+
+echo Stopping Client
+clush -qS --group=ha_heads crm res stop cl-$FS-client
+
+waitfor cl-$FS-client
+
+clush -qS --group=ha_heads crm config del $FS-client
+
+echo "Removing client mount from fstab"
+clush -a "sed -i '/\/$FS\/client.*lustre/d' /etc/fstab"
index bc7b489..fdff7b7 100755 (executable)
@@ -1,15 +1,44 @@
 #!/bin/bash
 # -*- indent-tabs-mode: nil -*-
 # Copyright DDN 2020
+#
+# After stopping hotpools (via ticket)
+# Removes:
+# resources lamigo-$FS-*
+# resources lpurge-$FS-*
+# resource $FS-hotpool
+# ticket $FS-hotpool-allocated
+#
+# Leaves:
+# all lamigo & lpurge config files
+# client mount
 
 FS=$1
 
-# Set a 10+ minute timeout for waitfor()
+# Set a default 10+ minute timeout for waitfor()
 TIMEOUT=${TIMEOUT:-600}
 
+function usage() {
+    echo "Usage: $(basename $0) [-h] [-t SECS] [FILESYSTEM]"
+    echo "  Tear down Stratagem Hotpools HA environment"
+    echo "   -t SECS - Max timeout to wait for services to stop (default: $TIMEOUT)"
+}
+
+while getopts "ht:" opt; do
+    case $opt in
+        h) usage; exit 0;;
+        t) TIMEOUT=$OPTARG;;
+    esac
+done
+
+if [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
         echo "Usage: $0 FSNAME"
@@ -18,42 +47,54 @@ if [ -z "$FS" ]; then
 fi
 
 waitfor () {
-    IDS=$*
+    local ids=$*
 
-    [ -z "$IDS" ] && return
+    [ -z "$ids" ] && return
 
-    echo -n "Waiting for $IDS"
+    echo -n "Waiting for $ids"
 
-    local timeout=$TIMEOUT
+    local deadline=$((SECONDS+TIMEOUT))
 
-    local NOTDONE=true
-    while $NOTDONE && ((timeout > 0)); do
+    local done=false
+    until $done || ((deadline < SECONDS)); do
 
-        NOTDONE=false
-        for i in $IDS; do
+        done=true
+        for i in $ids; do
             # This finds the host the resource is active on (empty it is stopped)
             res=$(clush -qg ha_heads crm_resource -QW -r $i 2> /dev/null)
             if [ -n "$res" ]; then
-                NOTDONE=true
+                done=false
                 echo -n "."
                 sleep 1
                 break
             fi
         done
-
-        let timeout-=1
     done
 
     echo ""
 
-    if ((timeout == 0)); then
-        echo "Waiting for $IDS TIMED OUT!"
+    if ! $done; then
+        echo "Waiting for $ids TIMED OUT!"
         exit 1
     fi
 }
 
+if ! crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    if clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then
+       echo "Hotpools needs conversion"
+       echo "  Please run: stratagem-hp-convert.sh"
+        exit 1
+    else
+       echo "Hotpools not configured"
+        exit 0
+    fi
+fi
+
+echo "Stopping Hotpools for $FS"
+clush -qS --group=ha_heads crm res stop $FS-hotpool
+
 for host in $(cluset -e @ha_heads); do
-    IDS=$(ssh $host crm_resource --list-raw|egrep "(lamigo|lpurge)-$FS")
+    IDS=$(ssh $host crm_resource --list-raw|egrep "^(lamigo|lpurge)-$FS-")
     if [ -n "$IDS" ]; then
         echo Stopping lamigo and lpurge on $host
         ssh $host crm res stop $IDS
@@ -64,13 +105,10 @@ for host in $(cluset -e @ha_heads); do
     fi
 done
 
+echo "Removing Hotpool ticket for $FS"
+clush -qS --group=ha_heads crm config del $FS-hotpool
 
-echo Stopping Client
-clush -qS --group=ha_heads crm res stop cl-$FS-client
-
-waitfor cl-$FS-client
-
-clush -qS --group=ha_heads crm config del $FS-client
+clush -qS --group=ha_heads crm_ticket --ticket $FS-hotpool-allocated --cleanup
 
-echo "Removing client mount from fstab"
-clush -a "sed -i '/\/$FS\/client.*lustre/d' /etc/fstab"
+echo "Local client is still configured"
+echo "  to remove run: stratagem-hp-teardown-client.sh"
index 901c554..792c8ff 100644 (file)
@@ -26,6 +26,8 @@ LIPE_SOURCES = cmd.c cmd.h debug.c debug.h fake_lustre_disk.h \
                lustre_ea.c lustre_ea.h \
                lipe_ldiskfs.c lipe_ldiskfs.h \
                lipe_object_attrs.h \
+               lipe_version.c \
+               lipe_version.h \
                list.h policy.h policy.c \
                ldiskfs_read_ldd.c ldiskfs_read_ldd.h \
                general_policy.h general_policy.c \
@@ -67,9 +69,9 @@ lamigo_SOURCES = lamigo.c lamigo.h lamigo_alr.c lamigo_hash.c lamigo_hash.h \
 lamigo_CFLAGS = $(LIPE_CFLAGS)
 lamigo_LDFLAGS = $(LIPE_LDFLAGS) -lssh -lssh_threads
 
-lpurge_SOURCES = lpurge.c $(LIPE_SOURCES) $(lipe_ssh_SOURCES)
+lpurge_SOURCES = lpurge.c $(LIPE_SOURCES)
 lpurge_CFLAGS = $(LIPE_CFLAGS)
-lpurge_LDFLAGS = $(LIPE_LDFLAGS) -lssh -lssh_threads
+lpurge_LDFLAGS = $(LIPE_LDFLAGS)
 
 lipe_expression_test_SOURCES = lipe_expression_test.c $(LIPE_SOURCES)
 lipe_expression_test_CFLAGS = $(LIPE_CFLAGS)
index 68b5763..323a5aa 100644 (file)
@@ -21,6 +21,7 @@
  *  - auto-config with lfs df -v to find flash OSTs
  *  - tests
  */
+#include <stdarg.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <errno.h>
@@ -60,6 +61,7 @@
 #include "general_policy.h"
 #include "list.h"
 #include "lipe_config.h"
+#include "lipe_version.h"
 
 #define DEF_POOL_REFRESH_INTV  (10 * 60)
 #define DEF_PROGRESS_INTV      (10 * 60)
@@ -130,7 +132,8 @@ static void usage(void)
               "\t--hot-fraction, files with heat >= <hottest*fraction/100> are hot (default: %d)\n"
               "\t--hot-after-idle, hot files after N idle periods to be mirrored (default: %d)\n"
               "\t--src-free, mirroring to source OST pool is prohibited if pool has less space (default: %d%%)\n"
-              "\t--tgt-free, mirroring to target OST pool is prohibited if pool has less space (default: %d%%)\n",
+              "\t--tgt-free, mirroring to target OST pool is prohibited if pool has less space (default: %d%%)\n"
+              "\t--version, print version information and exit\n",
               program_invocation_short_name,
               DEF_MIN_AGE,
               DEF_CACHE_SIZE >> 20,
@@ -264,6 +267,8 @@ struct mirror_opts {
 };
 
 __u64 lamigo_last_processed_idx; /* changelog index */
+static time_t lamigo_last_cleared;
+static __u64 lamigo_last_cleared_index;
 struct lipe_list_head lamigo_agent_list;
 static int lamigo_agent_count;
 int lamigo_max_jobs; /* max jobs for all agents */
@@ -288,6 +293,17 @@ struct pool_list *src_pools; /* source pools */
 struct pool_list *tgt_pools; /* target pool */
 static void *lamigo_refresh_statfs_thread(void *arg);
 
+inline int are_agents_busy(void)
+{
+       if (opt.o_iml_re_socket)
+               return 0;
+
+       if (lamigo_jobs_running >= lamigo_max_jobs)
+               return 1;
+
+       return 0;
+}
+
 /* Convert human readable size string to and int; "1k" -> 1000 */
 static int strsize2int(long *sizep, char *str)
 {
@@ -324,6 +340,24 @@ static int strsize2int(long *sizep, char *str)
        }
 }
 
+static void systemf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
+static void systemf(const char *fmt, ...)
+{
+       char *cmd = NULL;
+       va_list args;
+       int rc;
+
+       va_start(args, fmt);
+       rc = vasprintf(&cmd, fmt, args);
+       va_end(args);
+
+       if (rc < 0)
+               return;
+
+       system(cmd);
+       free(cmd);
+}
+
 static struct lipe_flist *lamigo_flist_init(char *socket)
 {
        struct lipe_flist *flist = flist_alloc(NULL, 1024,
@@ -344,6 +378,8 @@ void lamigo_usr1_handle(int sig)
        struct resync_agent *a;
        struct resync_job *j;
        struct pool_list *pl;
+       struct tm *tmtime;
+       char timestr[32];
        FILE *f;
        int i;
 
@@ -355,11 +391,15 @@ void lamigo_usr1_handle(int sig)
                llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "can't open dump file\n");
                return;
        }
-       fprintf(f, "config:\n"
+       fprintf(f,
+               "version: %s-%s\n"
+               "revision: %s\n"
+               "config:\n"
                "    chlg_user: %s\n"
                "    mdtname: %s\n"
                "    mountpoint: %s\n"
                "    source_pool: ",
+               PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION,
                opt.o_chlg_user, opt.o_mdtname, opt.o_mntpt);
        i = 0;
        for (pl = src_pools; pl != NULL; pl = pl->pl_next, i++)
@@ -429,6 +469,16 @@ void lamigo_usr1_handle(int sig)
                stats.s_skip_hot, stats.s_replicate_ro2hot,
                stats.s_replicate_rw2hot, stats.s_replicate_rw2cold);
 
+       tmtime = localtime(&lamigo_last_cleared);
+       strftime(timestr, sizeof(timestr), "%c", tmtime);
+
+       fprintf(f, "changelog:\n"
+               "    last_processed_idx: %llu\n"
+               "    last_cleared_idx: %llu\n"
+               "    last_cleared_time: %lu # ( %s )\n",
+               lamigo_last_processed_idx, lamigo_last_cleared_index,
+               lamigo_last_cleared, timestr);
+
 #define AGENT_FMT \
        "    agent%d: { host: %s, mnt: %s, jobs: %u, max: %u, state: %s }\n"
 
@@ -510,7 +560,6 @@ static void lamigo_init_vars(void)
 
        signal(SIGUSR1, lamigo_null_handler);
        signal(SIGUSR2, lamigo_null_handler);
-       signal(SIGCHLD, lamigo_null_handler);
 
        LIPE_INIT_LIST_HEAD(&head.lh_list);
        LIPE_INIT_LIST_HEAD(&lamigo_job_list);
@@ -534,7 +583,7 @@ static void lamigo_cleanup(void)
                lipe_list_for_each_entry_safe(rss, tmp,
                                              &agent->rag_ssh_list,
                                              rss_list) {
-                       lipe_cleanup_ssh_session(&rss->rss_ctx);
+                       lipe_ssh_context_destroy(&rss->rss_ctx);
                        lipe_list_del(&rss->rss_list);
                        free(rss);
                }
@@ -586,19 +635,12 @@ static int lamigo_exec_cmd(struct resync_agent *a, char *cmd)
        lipe_list_del(&rss->rss_list);
        pthread_mutex_unlock(&a->rag_ssh_lock);
 
-       if (lipe_ssh_status(&rss->rss_ctx) != SSH_OK) {
-               rc = lipe_init_ssh_session(&rss->rss_ctx, a->rag_hostname);
-               if (rc != SSH_OK)
-                       goto out;
-       }
-
        rc = lipe_ssh_exec(&rss->rss_ctx, cmd);
        if (rc)
-               llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
+               llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
                        "error executing ssh command '%s' on '%s': rc = %d\n",
                        cmd, a->rag_hostname, rc);
 
-out:
        pthread_mutex_lock(&a->rag_ssh_lock);
        lipe_list_add(&rss->rss_list, &a->rag_ssh_list);
        pthread_cond_signal(&a->rag_ssh_cond);
@@ -617,8 +659,8 @@ void *lamigo_replicate_one(void *args)
 
        if (rj->rj_setprefer) {
                snprintf(cmd, sizeof(cmd),
-                        "lfs setstripe --comp-set --comp-flags=prefer -p %s "
-                        "%s/.lustre/fid/"DFID" >&/dev/null", rj->rj_pool,
+                        "lfs setstripe --comp-set --comp-flags=prefer --pool='%s' "
+                        "'%s/.lustre/fid/"DFID"' >&/dev/null", rj->rj_pool,
                         agent->rag_mountpoint,
                         PFID(&rj->rj_fid));
                llapi_printf(LLAPI_MSG_DEBUG, "set prefer on "DFID"\n",
@@ -627,20 +669,20 @@ void *lamigo_replicate_one(void *args)
                int i;
 
                i = snprintf(cmd, sizeof(cmd),
-                            "%s --pool=%s",
+                            "%s --pool='%s'",
                             opt.o_mirror_cmd, rj->rj_pool);
                if (rj->rj_stripes > 0)
                        i += snprintf(cmd + i, sizeof(cmd) - i,
-                                     " -%d", rj->rj_stripes);
+                                     " --stripe-count=%d", rj->rj_stripes);
                if (rj->rj_mirror_opts)
                        i += snprintf(cmd + i, sizeof(cmd) - i,
-                                     " --flags=%s", rj->rj_mirror_opts);
+                                     " --flags='%s'", rj->rj_mirror_opts);
                i += snprintf(cmd + i, sizeof(cmd) - i,
-                             " %s/.lustre/fid/"DFID" > /dev/null 2>&1",
+                             " '%s/.lustre/fid/"DFID"' > /dev/null 2>&1",
                              agent->rag_mountpoint, PFID(&rj->rj_fid));
        } else if (resync == AMIGO_RESYNC_RESYNC) {
                snprintf(cmd, sizeof(cmd),
-                        "lfs mirror resync %s/.lustre/fid/"DFID" > /dev/null 2>&1",
+                        "lfs mirror resync '%s/.lustre/fid/"DFID"' > /dev/null 2>&1",
                         agent->rag_mountpoint,
                         PFID(&rj->rj_fid));
        } else {
@@ -1057,7 +1099,7 @@ static int lamigo_get_attrs(const struct lu_fid *fid,
 
        /* the subsequent steps to fetch EAs are very expensive
         * skip if possible */
-       if (need_attr == 0 && 0)
+       if (need_attr == 0)
                return 0;
 
        /* XXX: would be great to get all EAs with a single call */
@@ -1180,7 +1222,7 @@ void *lamigo_check_agent_func(void *args)
        char cmd[PATH_MAX];
        struct resync_agent *a = (struct resync_agent *)args;
 
-       snprintf(cmd, sizeof(cmd), "lfs path2fid %s > /dev/null 2>&1",
+       snprintf(cmd, sizeof(cmd), "lfs path2fid '%s' > /dev/null 2>&1",
                 a->rag_mountpoint);
 
        rc = lamigo_exec_cmd(a, cmd);
@@ -1296,6 +1338,12 @@ static int lamigo_update_one(struct fid_rec *f)
                return 0;
        }
 
+       if (are_agents_busy()) {
+               /* all the agents are busy */
+               llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "no agents available (max: %d)\n", lamigo_max_jobs);
+               return 1;
+       }
+
        /* prevent hot file migration from hot pool to slow */
        rc = lamigo_alr_check_is_hot(&f->fr_fh.fh_fid, &ah);
        if (rc) {
@@ -1331,12 +1379,6 @@ static int lamigo_update_one(struct fid_rec *f)
                return 0;
        }
 
-       if (lamigo_jobs_running >= lamigo_max_jobs) {
-               /* all the agents are busy */
-               llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "no agents avilable (max: %d)\n", lamigo_max_jobs);
-               return 1;
-       }
-
        rj = calloc(1, sizeof(struct resync_job));
        if (rj == NULL) {
                llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "can't allocate for a job\n");
@@ -1390,7 +1432,7 @@ static int lamigo_check_sync(void)
        int count;
 
        /* try to resubmit failed jobs first */
-       while (!lipe_list_empty(&lamigo_failed_job_list)) {
+       while (!lipe_list_empty(&lamigo_failed_job_list) && !are_agents_busy()) {
                struct resync_job *rj;
 
                rj = lipe_list_entry(lamigo_failed_job_list.next,
@@ -1528,9 +1570,6 @@ static unsigned long get_fid_cache_size(int pct)
        return cache_size;
 }
 
-static time_t lamigo_last_cleared;
-static __u64 lamigo_last_cleared_index;
-
 static void lamigo_check_and_clear_changelog(void)
 {
        struct resync_job *tmp;
@@ -1542,10 +1581,9 @@ static void lamigo_check_and_clear_changelog(void)
        index = lamigo_last_processed_idx;
 
        /* this check prevents too frequent in-memory searches for
-        * the minimal record-in-work: every 256th record and not
-        * frequently than once a 5 seconds
+        * the minimal record-in-work, no more often than once every 3 seconds
         */
-       if ((index & 256) == 0 && time(NULL) - lamigo_last_cleared < 5)
+       if (time(NULL) - lamigo_last_cleared < 3)
                return;
        lamigo_last_cleared = time(NULL);
 
@@ -1564,68 +1602,56 @@ static void lamigo_check_and_clear_changelog(void)
                        index = tmp->rj_index;
        }
 
-       if (index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency)
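+       /* Failed jobs still reference changelog records, so they too bound
+        * the lowest index that may be cleared. */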
+       lipe_list_for_each_entry(tmp, &lamigo_failed_job_list, rj_list) {
+               if (tmp->rj_check_job || !tmp->rj_index)
+                       continue;
+               if (tmp->rj_index < index)
+                       index = tmp->rj_index;
+       }
+
+       if (index < lamigo_last_cleared_index ||
+           index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency)
                return;
 
        llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "CLEAR upto %llu in %s (%llu last)\n",
                     index, opt.o_chlg_user, lamigo_last_processed_idx);
        lamigo_last_cleared_index = index;
        rc = llapi_changelog_clear(opt.o_mdtname, opt.o_chlg_user, index);
-       if (rc)
+       if (rc < 0) {
                llapi_error(LLAPI_MSG_ERROR, rc,
-                           "failed to clear changelog record: %s:%llu",
-                           opt.o_chlg_user, index);
+                           "failed to clear changelog record %s:%llu (%llu last)",
+                           opt.o_chlg_user, index, lamigo_last_cleared_index);
+               systemf("lctl get_param 'mdd.%s.changelog_users'", opt.o_mdtname);
+       }
 }
 
-static void lamigo_check_jobs(void)
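+/* Consume the result of a completed replication or agent-check thread and
+ * update agent state and statistics accordingly. */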
+static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 {
-       struct resync_job *rj = NULL, *tmp;
-       int rc = 0;
-       intptr_t retval;
-
-       lipe_list_for_each_entry(tmp, &lamigo_job_list, rj_list) {
-               rc = pthread_tryjoin_np(tmp->rj_pid, (void **)&retval);
-               if (rc == 0) {
-                       rj = tmp;
-                       break;
-               } else if (rc != EBUSY) {
-                       break;
-               }
-       }
-
-       if (rj == NULL) {
-               if (rc && rc != EBUSY)
-                       llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, "unknown job %d exited\n",
-                                    rc);
-               return;
-       }
-
        llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0,
                    "job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n",
                    rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start,
                    retval, rj->rj_agent->rag_bad);
 
-       lipe_list_del(&rj->rj_list);
-
        if (rj->rj_check_job) {
                rj->rj_agent->rag_check_in_progress = 0;
                if (retval == 0) {
                        /* the agent is back */
-                       llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is back\n",
-                                    rj->rj_agent->rag_hostname);
                        if (rj->rj_agent->rag_bad) {
+                               llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is back\n",
+                                           rj->rj_agent->rag_hostname);
                                rj->rj_agent->rag_bad = false;
                                lamigo_max_jobs += rj->rj_agent->rag_maxjobs;
                        }
                } else {
                        /* the agent is still bad */
                        if (rj->rj_agent->rag_bad == false) {
+                               llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is bad\n",
+                                           rj->rj_agent->rag_hostname);
+
                                assert(lamigo_max_jobs >= rj->rj_agent->rag_maxjobs);
                                lamigo_max_jobs -= rj->rj_agent->rag_maxjobs;
                                rj->rj_agent->rag_bad = true;
                        }
-                       llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is still bad\n",
-                             rj->rj_agent->rag_hostname);
                }
                free(rj);
                return;
@@ -1662,19 +1688,40 @@ static void lamigo_check_jobs(void)
                /* no striping info or file disappeared */
                /* no need to handle, forget the changelog record */
        } else if (retval == 0) {
-               stats.s_replicated++;
+               if (!rj->rj_setprefer)
+                       stats.s_replicated++;
        }
 
        if (rj)
                free(rj);
 }
 
+static void lamigo_check_jobs(void)
+{
+       struct resync_job *rj, *tmp;
+       int rc = 0;
+       intptr_t retval;
+
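+       /* Reap all finished replication threads; EBUSY means the thread is
+        * still running, any other join error is logged and the job kept. */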
+       lipe_list_for_each_entry_safe(rj, tmp, &lamigo_job_list, rj_list) {
+               rc = pthread_tryjoin_np(rj->rj_pid, (void **)&retval);
+               if (rc == EBUSY) {
+                       continue;
+               } else if (rc != 0) {
+                       llapi_error(LLAPI_MSG_ERROR, rc, "cannot join thread %lld",
+                                   (long long)rj->rj_pid);
+               } else {
+                       lipe_list_del(&rj->rj_list);
+                       lamigo_job_fini(rj, retval);
+               }
+       }
+}
+
 static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 {
        struct resync_agent *a;
        int i;
 
-       a = malloc(sizeof(*a));
+       a = calloc(1, sizeof(*a));
        if (!a) {
                llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
                             "can't allocate memory for agent\n");
@@ -1715,8 +1762,8 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 
        /* ssh context per job, and one more for agent heartbeat */
        for (i = 0; i < a->rag_maxjobs + 1; i++) {
-               struct resync_ssh_session *rss =
-                               malloc(sizeof(struct resync_ssh_session));
+               struct resync_ssh_session *rss = calloc(1, sizeof(*rss));
+               int rc;
 
                if (!rss) {
                        llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
@@ -1724,6 +1771,14 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
                        exit(1);
                }
 
+               rc = lipe_ssh_context_init(&rss->rss_ctx, a->rag_hostname);
+               if (rc < 0) {
+                       llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
+                                   "cannot create SSH context for '%s'\n",
+                                   a->rag_hostname);
+                       exit(1);
+               }
+
                lipe_list_add(&rss->rss_list, &a->rag_ssh_list);
        }
 
@@ -1750,6 +1805,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 #define LAMIGO_OPT_STATFS_REFRESH      15
 #define LAMIGO_OPT_SRC_FREE            16
 #define LAMIGO_OPT_TGT_FREE            17
+#define LAMIGO_OPT_VERSION             18
 
 static struct option options[] = {
        { "ignore-reads", no_argument, NULL, LAMIGO_OPT_IGNORE_READS},
@@ -1790,6 +1846,7 @@ static struct option options[] = {
        { "tgt", required_argument, NULL, 't'},
        { "user", required_argument, 0, 'u'},
        { "verbose", no_argument, NULL, 'v'},
+       { "version", no_argument, NULL, LAMIGO_OPT_VERSION },
        { NULL }
 };
 
@@ -2082,6 +2139,9 @@ void lamigo_process_opt(int c, char *optarg)
                        exit(1);
                }
                break;
+       case LAMIGO_OPT_VERSION:
+               lipe_version();
+               exit(0);
        case 'a':
                opt.o_min_age = strtol(optarg, &endptr, 10);
                if (*endptr != '\0' || opt.o_min_age < 5) {
@@ -2594,7 +2654,7 @@ int lamigo_lipe_callback(struct lipe_instance *instance,
        lamigo_check_jobs();
 
        do {
-               while (lamigo_jobs_running >= lamigo_max_jobs) {
+               while (are_agents_busy()) {
                        lamigo_check_jobs();
                        lamigo_check_sync();
                        lamigo_check_bad_agents();
@@ -3194,6 +3254,7 @@ int main(int argc, char **argv)
        int      ret = 0;
        pthread_t pid;
 
+       lipe_version_init();
        ssh_threads_set_callbacks(ssh_threads_get_pthread());
        ssh_init();
 
@@ -3208,6 +3269,10 @@ int main(int argc, char **argv)
         * followed by the MDT name ("lamigo lustre-MDT0000"). */
        llapi_set_command_name(opt.o_mdtname);
 
+       llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
+                   "version %s-%s, revision %s\n",
+                   PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION);
+
        rc = lamigo_init_cache();
        if (rc < 0) {
                llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, "can't init cache\n");
@@ -3276,11 +3341,15 @@ int main(int argc, char **argv)
 
                if (enable_heat)
                        lamigo_check_hot();
-               rc = lamigo_check_sync();
-               if (rc < 0) {
-                       stop = true;
-                       ret = rc;
+
+               if (!are_agents_busy()) {
+                       rc = lamigo_check_sync();
+                       if (rc < 0) {
+                               stop = true;
+                               ret = rc;
+                       }
                }
+
                lamigo_check_jobs();
                lamigo_check_and_clear_changelog();
                lamigo_check_bad_agents();
@@ -3305,6 +3374,7 @@ int main(int argc, char **argv)
 out:
        lamigo_cleanup();
        llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, "exited\n");
+       lipe_version_fini();
 
        return ret;
 }
index fa28ce2..aceb3ce 100644 (file)
@@ -261,20 +261,14 @@ void *lamigo_alr_data_collection_thread(void *arg)
                opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args);
 
 repeat:
-       rc = lipe_init_ssh_session(&ala->ala_ctx, ala->ala_host);
-       if (rc) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                               "failed to create connection to agent [%s], %d\n",
-                               ala->ala_host, rc);
-               goto again;
-       }
-
-       rc = lipe_ssh_start_cmd(ala->ala_ctx.lsc_session, cmd, &channel);
+       rc = lipe_ssh_start_cmd(&ala->ala_ctx, cmd, &channel);
        if (rc != SSH_OK) {
-               llapi_error(LLAPI_MSG_ERROR, rc,
-                           "cannot start ofd_access_log_readed\n");
+               llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0,
+                           "cannot start ofd_access_log_reader on host '%s': rc = %d\n",
+                           ala->ala_host, rc);
                goto err;
        }
+
        llapi_err_noerrno(LLAPI_MSG_DEBUG, "alr agent on %s is running\n",
                          ala->ala_host);
 
@@ -303,7 +297,6 @@ err:
        ssh_channel_close(channel);
        ssh_channel_free(channel);
 
-again:
        /* wait for a while */
        /* XXX: should be configurable? */
        sleep(5);
@@ -907,11 +900,17 @@ void lamigo_alr_init(void)
 void lamigo_add_alr_agent(const char *host)
 {
        struct alr_agent *ala;
+       int rc;
 
-       ala = calloc(sizeof(*ala), 1);
-       assert(ala);
+       ala = calloc(1, sizeof(*ala));
+       assert(ala != NULL);
 
        ala->ala_host = strdup(host);
+       assert(ala->ala_host != NULL);
+
+       rc = lipe_ssh_context_init(&ala->ala_ctx, ala->ala_host);
+       assert(rc == SSH_OK);
+
        lipe_list_add(&ala->ala_list, &alr_agent_list);
 }
 
index 5df17d9..43ac483 100644 (file)
@@ -1,4 +1,6 @@
 #include <stdlib.h>
+#include <assert.h>
+#include <malloc.h>
 #include <string.h>
 #include <errno.h>
 
@@ -6,61 +8,59 @@
 
 #include "lipe_ssh.h"
 
-int lipe_ssh_authenticate_pubkey(ssh_session session)
-{
-       int rc;
-
-       rc = ssh_userauth_publickey_auto(session, NULL, NULL);
-       if (rc != SSH_AUTH_SUCCESS) {
-               llapi_printf(LLAPI_MSG_ERROR, "Authentication failed: %s\n",
-                            ssh_get_error(session));
-               return rc;
-       }
-
-       return SSH_AUTH_SUCCESS;
-}
+#define lipe_ssh_debug(fmt, args...) \
+       llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, fmt, ##args)
 
+#define lipe_ssh_error(fmt, args...) \
+       llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, fmt, ##args)
 
-int lipe_ssh_start_cmd(ssh_session session, const char *cmd, ssh_channel *rch)
+static int lipe_ssh_session_start_cmd(ssh_session session, const char *cmd, ssh_channel *pchannel)
 {
-       ssh_channel channel;
+       ssh_channel channel = NULL;
        int rc;
 
+       assert(session != NULL);
+
        channel = ssh_channel_new(session);
        if (channel == NULL) {
-               llapi_printf(LLAPI_MSG_ERROR,
-                            "Error allocating a new channel: %s\n",
-                            ssh_get_error(session));
-               return SSH_ERROR;
+               lipe_ssh_error("cannot create a new SSH channel: %s\n",
+                              ssh_get_error(session));
+               rc = SSH_ERROR;
+               goto out;
        }
 
        rc = ssh_channel_open_session(channel);
        if (rc != SSH_OK) {
-               llapi_printf(LLAPI_MSG_ERROR,
-                            "Error opening a session channel: %s\n",
-                            ssh_get_error(session));
-               ssh_channel_free(channel);
-               return rc;
+               lipe_ssh_error("cannot open SSH session channel: %s\n",
+                              ssh_get_error(session));
+               goto out;
        }
 
        rc = ssh_channel_request_exec(channel, cmd);
        if (rc != SSH_OK) {
-               llapi_printf(LLAPI_MSG_ERROR, "Error executing command: %s\n",
-                            ssh_get_error(session));
-               ssh_channel_close(channel);
-               ssh_channel_free(channel);
-               return rc;
+               lipe_ssh_error("cannot execute SSH command: %s\n",
+                              ssh_get_error(session));
+               goto out;
        }
-       *rch = channel;
-       return SSH_OK;
+
+       *pchannel = channel;
+       channel = NULL;
+       rc = SSH_OK;
+out:
+       if (channel != NULL)
+               ssh_channel_close(channel);
+
+       ssh_channel_free(channel);
+
+       return rc;
 }
 
-int lipe_ssh_exec_cmd(ssh_session session, const char *cmd)
+static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd)
 {
        ssh_channel channel;
        int rc;
 
-       rc = lipe_ssh_start_cmd(session, cmd, &channel);
+       rc = lipe_ssh_session_start_cmd(session, cmd, &channel);
        if (rc != SSH_OK)
                return rc;
 
@@ -72,114 +72,140 @@ int lipe_ssh_exec_cmd(ssh_session session, const char *cmd)
        return rc;
 }
 
-int lipe_init_ssh_session(struct lipe_ssh_context *ctx,
-                         const char *host)
+static void lipe_ssh_session_destroy(ssh_session *psession)
+{
+       if (*psession != NULL)
+               ssh_disconnect(*psession);
+
+       ssh_free(*psession);
+       *psession = NULL;
+}
+
+static int lipe_ssh_session_create(ssh_session *psession, const char *host)
 {
-       int timeout = 5;
+       ssh_session session = NULL;
+       long timeout = 5;
        int rc;
 
-       ctx->lsc_host = strdup(host);
-       if (ctx->lsc_host == NULL)
-               return SSH_ERROR;
+       assert(SSH_OK == 0);
+       assert(SSH_AUTH_SUCCESS == 0);
+       assert(SSH_ERROR < 0);
+
+       assert(psession != NULL);
+       assert(*psession == NULL);
+       assert(host != NULL);
 
-       pthread_mutex_init(&ctx->lsc_session_mutex, NULL);
-       /* Create a new ssh session and set options */
-       ctx->lsc_session = ssh_new();
-       if (ctx->lsc_session == NULL) {
-               llapi_printf(LLAPI_MSG_ERROR,
-                            "Error creating a new ssh session\n");
+       lipe_ssh_debug("creating new SSH session for host '%s'\n", host);
+
+       session = ssh_new();
+       if (session == NULL) {
+               lipe_ssh_error("cannot create a new SSH session: %s\n",
+                              strerror(ENOMEM)); /* Probably. */
                rc = SSH_ERROR;
-               goto out_host;
+               goto out;
        }
 
-       ssh_options_set(ctx->lsc_session, SSH_OPTIONS_HOST, host);
+       rc = ssh_options_set(session, SSH_OPTIONS_HOST, host);
+       if (rc != SSH_OK) {
+               lipe_ssh_error("cannot set SSH session host to '%s': %s\n",
+                              host, ssh_get_error(session));
+               goto out;
+       }
 
-       if (ssh_options_set(ctx->lsc_session, SSH_OPTIONS_TIMEOUT,
-                           &timeout) < 0) {
-               llapi_printf(LLAPI_MSG_FATAL,
-                            "Error setting ssh timeout\n");
-               rc = SSH_ERROR;
+       rc = ssh_options_set(session, SSH_OPTIONS_TIMEOUT, &timeout);
+       if (rc != SSH_OK) {
+               lipe_ssh_error("cannot set SSH timeout to %ld: %s\n",
+                              timeout, ssh_get_error(session));
                goto out;
        }
 
        /* Connect to the ssh server */
-       rc = ssh_connect(ctx->lsc_session);
+       rc = ssh_connect(session);
        if (rc != SSH_OK) {
-               llapi_printf(LLAPI_MSG_ERROR,
-                            "Error connecting to host %s: %s\n",
-                            host,
-                            ssh_get_error(ctx->lsc_session));
+               lipe_ssh_error("cannot connect SSH session to host '%s': %s\n",
+                              host, ssh_get_error(session));
                goto out;
        }
 
        /* Automatically authenticate with public key */
-       rc = lipe_ssh_authenticate_pubkey(ctx->lsc_session);
-       if (rc == SSH_AUTH_SUCCESS)
-               return rc;
-
-       llapi_printf(LLAPI_MSG_ERROR,
-                    "Error authenticating pubkey: %s\n",
-                    ssh_get_error(ctx->lsc_session));
+       rc = ssh_userauth_publickey_auto(session, NULL, NULL);
+       if (rc != SSH_AUTH_SUCCESS) {
+               lipe_ssh_error("cannot authenticate SSH session to host '%s': %s\n",
+                              host, ssh_get_error(session));
+               goto out;
+       }
 
-       ssh_disconnect(ctx->lsc_session);
+       *psession = session;
+       session = NULL;
+       rc = SSH_OK;
 out:
-       ssh_free(ctx->lsc_session);
-       ctx->lsc_session = NULL;
-out_host:
-       free(ctx->lsc_host);
+       lipe_ssh_debug("create new SSH session for host '%s': rc = %d\n", host, rc);
+       lipe_ssh_session_destroy(&session);
 
        return rc;
 }
 
-int lipe_reset_ssh_session(struct lipe_ssh_context *ctx)
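+/* Lazily connect: create the SSH session on first use (or after a failure
+ * dropped it), otherwise reuse the cached session. */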
+static int lipe_ssh_context_check(struct lipe_ssh_context *ctx)
 {
-       int rc;
-       char *hostname;
-
-       pthread_mutex_lock(&ctx->lsc_session_mutex);
-       hostname = strdup(ctx->lsc_host);
-       if (!hostname) {
-               pthread_mutex_unlock(&ctx->lsc_session_mutex);
-               return -ENOMEM;
-       }
-       lipe_cleanup_ssh_session(ctx);
-       rc = lipe_init_ssh_session(ctx, hostname);
-       pthread_mutex_unlock(&ctx->lsc_session_mutex);
-       free(hostname);
+       if (ctx->lsc_session == NULL)
+               return lipe_ssh_session_create(&ctx->lsc_session, ctx->lsc_host);
 
-       return rc;
+       return SSH_OK;
 }
 
-int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd)
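+/* Drop the cached session so the next command reconnects from scratch. */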
+static void lipe_ssh_context_fail(struct lipe_ssh_context *ctx)
 {
-       int rc;
-
-       /* Execute a remote command */
-       rc = lipe_ssh_exec_cmd(ctx->lsc_session, cmd);
-       if (rc == SSH_OK)
-               return SSH_OK;
-
-       /* Reset ssh session and try again */
-       if (rc == SSH_ERROR || rc == SSH_AGAIN) {
-               rc = lipe_reset_ssh_session(ctx);
-               if (rc)
-                       return rc;
-               rc = lipe_ssh_exec_cmd(ctx->lsc_session, cmd);
-       }
-
-       return rc;
+       lipe_ssh_debug("fail SSH context for host '%s'\n", ctx->lsc_host);
+       lipe_ssh_session_destroy(&ctx->lsc_session);
 }
 
-void lipe_cleanup_ssh_session(struct lipe_ssh_context *ctx)
+void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx)
 {
-       ssh_disconnect(ctx->lsc_session);
-       ssh_free(ctx->lsc_session);
+       lipe_ssh_session_destroy(&ctx->lsc_session);
        free(ctx->lsc_host);
+       ctx->lsc_host = NULL;
 }
 
-int lipe_ssh_status(struct lipe_ssh_context *ctx)
+int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host)
 {
-       if (ctx->lsc_session == NULL)
+       memset(ctx, 0, sizeof(*ctx));
+
+       ctx->lsc_host = strdup(host);
+       if (ctx->lsc_host == NULL)
                return SSH_ERROR;
+
+       /* Session creation will be done on demand. */
+
        return SSH_OK;
 }
+
+int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel)
+{
+       int rc;
+
+       rc = lipe_ssh_context_check(ctx);
+       if (rc != SSH_OK)
+               return rc;
+
+       rc = lipe_ssh_session_start_cmd(ctx->lsc_session, cmd, pchannel);
+       if (rc != SSH_OK)
+               lipe_ssh_context_fail(ctx);
+
+       return rc;
+}
+
+int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd)
+{
+       int rc;
+
+       rc = lipe_ssh_context_check(ctx);
+       if (rc != SSH_OK)
+               return rc;
+
+       /* Execute a remote command */
+       rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd);
+       if (rc < 0)
+               lipe_ssh_context_fail(ctx);
+
+       return rc;
+}
index 5d5e224..57ab9d8 100644 (file)
@@ -4,23 +4,17 @@
 #ifndef _LIPE_SSH_H_
 #define _LIPE_SSH_H_
 
-#include <pthread.h>
 #include <libssh/libssh.h>
 
 struct lipe_ssh_context {
        char *lsc_host;
        ssh_session lsc_session;
-       pthread_mutex_t lsc_session_mutex;
 };
 
-int lipe_ssh_authenticate_pubkey(ssh_session session);
-int lipe_ssh_exec_cmd(ssh_session session, const char *cmd);
-int lipe_init_ssh_session(struct lipe_ssh_context *ctx,
-                         const char *host);
-void lipe_cleanup_ssh_session(struct lipe_ssh_context *ctx);
-int lipe_reset_ssh_session(struct lipe_ssh_context *ctx);
+int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host);
+void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx);
+
 int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd);
-int lipe_ssh_start_cmd(ssh_session session, const char *cmd, ssh_channel *rch);
-int lipe_ssh_status(struct lipe_ssh_context *ctx);
+int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel);
 
 #endif /* _LIPE_SSH_H_ */
diff --git a/src/lipe_version.c b/src/lipe_version.c
new file mode 100644 (file)
index 0000000..e8b2342
--- /dev/null
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include "lipe_version.h"
+
+/* A grep-able string for stripped binaries. */
+static const char lipe_revision_string[] = "LiPE Revision: "LIPE_REVISION;
+
+/* A grep-able string for core dumps. */
+static char *lipe_revision_string_copy;
+
+void lipe_version(void)
+{
+       printf("%s (%s) %s-%s\n%s\n",
+              program_invocation_short_name,
+              PACKAGE_NAME,
+              PACKAGE_VERSION,
+              LIPE_RELEASE,
+              lipe_revision_string);
+}
+
+void lipe_version_init(void)
+{
+       if (lipe_revision_string_copy == NULL)
+               lipe_revision_string_copy = strdup(lipe_revision_string);
+}
+
+void lipe_version_fini(void)
+{
+       free(lipe_revision_string_copy);
+}
diff --git a/src/lipe_version.h b/src/lipe_version.h
new file mode 100644 (file)
index 0000000..49e3c3f
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef _LIPE_VERSION_H_
+#define _LIPE_VERSION_H_
+
+void lipe_version(void);
+void lipe_version_init(void);
+void lipe_version_fini(void);
+
+#endif
index 17cff87..c1361c1 100644 (file)
@@ -27,7 +27,6 @@
 /*
  * TODO
  *  - convert blocks to 1K units
- *  - multiple --mirror-id in lfs split to save on ssh sessions
  *  - take OST load into account
  *
  */
 #include <linux/lustre/lustre_fid.h>
 #include <linux/lustre/lustre_cfg.h>
 #include <json-c/json.h>
-#include <libssh/libssh.h>
-#include <libssh/callbacks.h>
 #include "lipe_object_attrs.h"
+#include "lipe_version.h"
 #include "list.h"
 #include "policy.h"
 #include "flist.h"
-#include "lipe_ssh.h"
 
 #define container_of(ptr, type, member) ({                      \
        const typeof(((type *) 0)->member) * __mptr = (ptr);     \
 #define LPURGE_FIDS_DUMPFILE  "/var/run/lpurge-%s.fids"
 
 struct lpurge_object {
+       struct lipe_list_head lo_list;
        struct lu_fid lo_fid;
        /* Used space in bytes */
        __u32 lo_blocks;
-       __u32 lo_comp_id;
+       __u32 lo_mirror_id;
        /* Last use time */
        time_t lo_last_utime;
 };
 
-struct lpurge_object_pack {
-       struct lipe_list_head lop_list;
-       unsigned int lop_max;
-       unsigned int lop_nr;
-       unsigned int lop_mdt_idx;
-       pthread_t lop_pid;
-       struct lpurge_object lop_objs[];
-};
-
 struct lpurge_slot {
        time_t ls_min_utime;
        time_t ls_max_utime;
@@ -104,25 +93,15 @@ struct lpurge_slot {
        unsigned int ls_scan;
        struct lipe_list_head ls_obj_list;
        pthread_mutex_t ls_mutex;
+       /* stats for objects we can't release */
+       unsigned long ls_nomirror_objs; /* no replicated objects */
+       unsigned long ls_nomirror_space;
+       unsigned long ls_nopfid_objs; /* no PFID, can't find parent */
+       unsigned long ls_nopfid_space;
+       unsigned long ls_notfirst_objs; /* not a first stripe */
+       unsigned long ls_notfirst_space;
 };
 
-struct mds_ssh_session {
-       struct lipe_list_head mss_list;
-       struct lipe_ssh_context mss_ctx;
-};
-
-struct mds {
-       char *host;
-       char *mnt;
-       struct lipe_list_head m_ssh_list;
-       pthread_mutex_t m_ssh_lock;
-       pthread_cond_t m_ssh_cond;
-};
-
-#define MAX_MDTS       128
-struct mds mds[MAX_MDTS];
-int mdsnr;
-
 #define LPURGE_HIST_MAX        16
 struct lpurge_slot lpurge_hist[LPURGE_HIST_MAX];
 
@@ -131,8 +110,11 @@ struct stats {
        unsigned long s_scans;
        unsigned long s_low_hits;
        unsigned long s_slow_scans;
-       unsigned long s_spawned;
-       unsigned long s_purged; /* files */
+       unsigned long s_queued; /* # files queued for purge */
+       unsigned long s_started; /* # files dequeued for purge by worker */
+       unsigned long s_purged; /* # files we successfully purged */
+       unsigned long s_failed; /* # files we failed to purge */
+       /* TODO: count mirrors we tried to purge but found to be the last non-stale copy */
 };
 struct stats stats;
 
@@ -177,6 +159,7 @@ char *ostprefix;
 char ost_mntdev[PATH_MAX];
 char *ost_mntpt;
 int lustre_fd = -1;
+int open_by_fid_fd = -1;
 unsigned long oldest;
 time_t scan_started_time;
 time_t scan_finished_time;
@@ -191,13 +174,6 @@ char *prog;
 
 void load_config(char *name);
 
-#define PACK_SIZE      (64*1024)
-#define PACK_NR ((PACK_SIZE - offsetof(struct lpurge_object_pack, lop_objs[0])) / \
-       sizeof(struct lpurge_object))
-
-#define FIDS_PER_CALL  128 /* XXX: bump to a bigger value */
-#define PM_SIZE        (offsetof(struct lpurge_object_pack, lop_objs[FIDS_PER_CALL]))
-
 struct lipe_flist *lflist;
 #define LPURGE_FLIST_SIZE (1024 * 1024)
 
@@ -225,25 +201,11 @@ static int lpurge_init_flist(void)
 
 static void sig_handler(int signal)
 {
-       int i;
-
        psignal(signal, "exiting");
        if (lflist)
                flist_free(lflist);
 
-       for (i = 0; i < mdsnr; i++) {
-               struct mds *m = &mds[i];
-               struct mds_ssh_session *mss, *tmp;
-
-               lipe_list_for_each_entry_safe(mss, tmp, &m->m_ssh_list,
-                                       mss_list) {
-                       lipe_cleanup_ssh_session(&mss->mss_ctx);
-                       lipe_list_del(&mss->mss_list);
-                       free(mss);
-               }
-       }
-
-       _exit(1);
+       _exit(0);
 }
 
 static void lpurge_null_handler(int signal)
@@ -277,7 +239,7 @@ void lpurge_init_result(void)
 
 void lpurge_reset_result(void)
 {
-       struct lpurge_object_pack *s, *t;
+       struct lpurge_object *s, *t;
        int i;
 
        for (i = 0; i < LPURGE_HIST_MAX; i++) {
@@ -287,9 +249,15 @@ void lpurge_reset_result(void)
                lpurge_hist[i].ls_scan = 1;
                lpurge_hist[i].ls_min_utime = ~0UL;
                lpurge_hist[i].ls_max_utime = 0;
+               lpurge_hist[i].ls_nopfid_objs = 0;
+               lpurge_hist[i].ls_nopfid_space = 0;
+               lpurge_hist[i].ls_nomirror_objs = 0;
+               lpurge_hist[i].ls_nomirror_space = 0;
+               lpurge_hist[i].ls_notfirst_objs = 0;
+               lpurge_hist[i].ls_notfirst_space = 0;
                lipe_list_for_each_entry_safe(s, t, &lpurge_hist[i].ls_obj_list,
-                                             lop_list) {
-                       lipe_list_del(&s->lop_list);
+                                             lo_list) {
+                       lipe_list_del(&s->lo_list);
                        free(s);
                }
        }
@@ -312,7 +280,8 @@ void usage(void)
               "\t-R, --scan_rate, slow scanning rate, objs/sec (default: %u)\n"
               "\t-S, --slot_size, objects to store in memory (default: %u)\n"
               "\t-t, --scan_threads, scanning threads (default: %u)\n"
-              "\t-w, --dump, stats file (via USR1 signal, default: %s)\n",
+              "\t-w, --dump, stats file (via USR1 signal, default: %s)\n"
+              "\t--version, print version information and exit\n",
               program_invocation_short_name,
               DEF_FREEHI,
               DEF_INTERVAL,
@@ -549,7 +518,7 @@ static void lpurge_reclaim_slot(unsigned int index)
        int i = 0;
        struct lpurge_slot *ls_1 = lpurge_hist; /* youngest slot */
        struct lpurge_slot *ls_2;
-       struct lpurge_object_pack *p, *t;
+       struct lpurge_object *p, *t;
 
        if (index >= LPURGE_HIST_MAX - 1)
                return;
@@ -562,8 +531,8 @@ static void lpurge_reclaim_slot(unsigned int index)
        ls_1->ls_min_utime = ~0UL;
        /* reclaim youngest slot */
        lipe_list_for_each_entry_safe(p, t, &ls_1->ls_obj_list,
-                                     lop_list) {
-               lipe_list_del(&p->lop_list);
+                                     lo_list) {
+               lipe_list_del(&p->lo_list);
                free(p);
        }
        pthread_mutex_unlock(&ls_1->ls_mutex);
@@ -630,7 +599,7 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
                         struct lipe_object_attrs *attrs)
 {
        struct lpurge_slot *ls = NULL;
-       struct lpurge_object_pack *lp = NULL;
+       struct lpurge_object *lo = NULL;
        time_t age, last_used;
        int index;
 
@@ -642,25 +611,6 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
        if ((attrs->loa_mode & S_IFMT) != S_IFREG)
                goto out;
 
-       if ((attrs->loa_attr_bits & LIPE_OBJECT_ATTR_FILTER_FID) == 0)
-               goto out;
-
-       /* to avoid N OSTs to 1 MDT scalability issue we only consider
-        * objects which store 1st stripe
-        */
-       if (attrs->loa_filter_fid.ff_parent.f_ver != 0)
-               goto out;
-
-       /* if the object has got ost_layout structure which encodes
-        * whether the object has a mirror, then we can skip objects
-        * with mirror_id=0 (no mirror)
-        */
-       if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) {
-               if (mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id)
-                   == 0)
-                       goto out;
-       }
-
        last_used = attrs->loa_mtime_ms;
        if (last_used < attrs->loa_atime_ms)
                last_used = attrs->loa_atime_ms;
@@ -685,13 +635,45 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
                goto out;
        }
 
+       pthread_mutex_lock(&ls->ls_mutex);
+
+       if ((attrs->loa_attr_bits & LIPE_OBJECT_ATTR_FILTER_FID) == 0) {
+               ls->ls_nopfid_objs++;
+               ls->ls_nopfid_space += attrs->loa_blocks >> 10;
+               pthread_mutex_unlock(&ls->ls_mutex);
+               goto out;
+       }
+
+       /* to avoid the N-OSTs-to-1-MDT scalability issue we only consider
+        * objects that store the 1st stripe
+        */
+       if (attrs->loa_filter_fid.ff_parent.f_ver != 0) {
+               ls->ls_notfirst_objs++;
+               ls->ls_notfirst_space += attrs->loa_blocks >> 10;
+               pthread_mutex_unlock(&ls->ls_mutex);
+               goto out;
+       }
+
+       /* if the object has an ost_layout structure, which encodes
+        * whether the object is mirrored, then we can skip objects
+        * with mirror_id=0 (no mirror)
+        */
+       if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) {
+               if (mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id)
+                   == 0) {
+                       ls->ls_nomirror_objs++;
+                       ls->ls_nomirror_space += attrs->loa_blocks >> 10;
+                       pthread_mutex_unlock(&ls->ls_mutex);
+                       goto out;
+               }
+       }
+
        llapi_printf(LLAPI_MSG_DEBUG,
                     "found under "DFID": size %ld block %ld age %ld slot %d\n",
                     PFID(&attrs->loa_filter_fid.ff_parent),
                     (unsigned long)attrs->loa_size,
                     (unsigned long)attrs->loa_blocks >> 10, age, index);
 
-       pthread_mutex_lock(&ls->ls_mutex);
        ls->ls_found++;
        ls->ls_space += attrs->loa_blocks >> 10;
        lpurge_scanned_since++;
@@ -723,42 +705,31 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
                }
        }
 
-       if (!lipe_list_empty(&ls->ls_obj_list))
-               lp = lipe_list_entry(ls->ls_obj_list.next,
-                                    struct lpurge_object_pack, lop_list);
+       lo = calloc(1, sizeof(*lo));
+       if (lo == NULL)
+               goto out;
 
-       if (!lp || lp->lop_nr >= lp->lop_max) {
-               lp = calloc(1, PACK_SIZE);
-               if (!lp) {
-                       llapi_printf(LLAPI_MSG_ERROR,
-                                    "can't alloca pack\n");
-                       exit(1);
-               }
-               lp->lop_pid = 0;
-               lp->lop_max = PACK_NR;
-               lp->lop_nr = 0;
-               lipe_list_add(&lp->lop_list, &ls->ls_obj_list);
-       }
-       lp->lop_objs[lp->lop_nr].lo_fid = attrs->loa_filter_fid.ff_parent;
-       lp->lop_objs[lp->lop_nr].lo_blocks = attrs->loa_blocks >> 10;
-       lp->lop_objs[lp->lop_nr].lo_comp_id = 0;
-       lp->lop_objs[lp->lop_nr].lo_last_utime = last_used;
+       lo->lo_fid = attrs->loa_filter_fid.ff_parent;
+       lo->lo_blocks = attrs->loa_blocks >> 10;
+       lo->lo_last_utime = last_used;
        if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) {
                __u32 id;
 
                id = mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id);
-               lp->lop_objs[lp->lop_nr].lo_comp_id = id;
+               lo->lo_mirror_id = id;
        }
-       lp->lop_nr++;
+
+       lipe_list_add(&lo->lo_list, &ls->ls_obj_list);
        ls->ls_stored++;
+
        if (ls->ls_max_utime < last_used)
                ls->ls_max_utime = last_used;
        if (ls->ls_min_utime > last_used)
                ls->ls_min_utime = last_used;
        pthread_mutex_unlock(&ls->ls_mutex);
 
-       llapi_printf(LLAPI_MSG_DEBUG, "add "DFID" to %p/%d\n",
-                    PFID(&attrs->loa_filter_fid.ff_parent), lp, lp->lop_nr);
+       llapi_printf(LLAPI_MSG_DEBUG, "add "DFID" to %p/%lu\n",
+                    PFID(&attrs->loa_filter_fid.ff_parent), ls, ls->ls_stored);
 
 out:
        /*
@@ -786,257 +757,324 @@ out:
        return 0;
 }
 
-static int lpurge_exec_cmd(struct mds *m, char *cmd)
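+/* State for llapi_layout_comp_iterate(): collect the unique mirror IDs in a
+ * layout, skipping cid_exclude. */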
+struct collect_ids_data {
+       __u16   *cid_ids;
+       int     cid_count;
+       __u16   cid_exclude;
+};
+
+static int collect_mirror_id(struct llapi_layout *layout, void *cbdata)
+{
+       struct collect_ids_data *cid = cbdata;
+       uint32_t id;
+       int rc;
+
+       rc = llapi_layout_mirror_id_get(layout, &id);
+       if (rc < 0)
+               return rc;
+
+       if ((__u16)id != cid->cid_exclude) {
+               int i;
+
+               for (i = 0; i < cid->cid_count; i++) {
+                       /* already collected the mirror id */
+                       if (id == cid->cid_ids[i])
+                               return LLAPI_LAYOUT_ITER_CONT;
+               }
+               cid->cid_ids[cid->cid_count] = id;
+               cid->cid_count++;
+       }
+
+       return LLAPI_LAYOUT_ITER_CONT;
+}
+
+static bool last_non_stale_mirror(__u16 mirror_id, struct llapi_layout *layout)
 {
-       struct mds_ssh_session *mss;
+       __u16 mirror_ids[128] = { 0 };
+       struct collect_ids_data cid = {
+               .cid_ids = mirror_ids,
+               .cid_count = 0,
+               .cid_exclude = mirror_id,
+       };
+       int i;
+
+       llapi_layout_comp_iterate(layout, collect_mirror_id, &cid);
+
+       for (i = 0; i < cid.cid_count; i++) {
+               struct llapi_resync_comp comp_array[1024] = { { 0 } };
+               int comp_size = 0;
+
+               comp_size = llapi_mirror_find_stale(layout, comp_array,
+                                                   ARRAY_SIZE(comp_array),
+                                                   &mirror_ids[i], 1);
+               if (comp_size == 0)
+                       return false;
+       }
+
+       return true;
+}
+
+static int
+lpurge_mirror_delete(const struct lu_fid *fid, unsigned int mirror_id)
+{
+       char fid_buf[FID_LEN + 1];
+       char vname_buf[PATH_MAX];
+       struct ll_ioc_lease *lil = NULL;
+       struct llapi_layout *layout = NULL;
+       int mdt_index = -1;
+       int fd = -1;
+       int vfd = -1;
        int rc;
 
-       pthread_mutex_lock(&m->m_ssh_lock);
-       /*
-        * this should not happen since we limited the max count of launched jobs
-        * just in case.
+       /* Inline replacement for
+        * lfs mirror split -d --mirror-id mirror_id $MOUNTPOINT/.lustre/fid/FID
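+        *
+        * Steps: open the file by FID, create a volatile file on the same MDT,
+        * take a write layout lease, check that this is not the last non-stale
+        * mirror, then detach the mirror with llapi_lease_set() and
+        * LL_LEASE_LAYOUT_SPLIT.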
         */
-       while (lipe_list_empty(&m->m_ssh_list))
-               pthread_cond_wait(&m->m_ssh_cond, &m->m_ssh_lock);
 
-       /* pop one session */
-       mss = container_of(m->m_ssh_list.next, struct mds_ssh_session,
-                       mss_list);
-       lipe_list_del(&mss->mss_list);
+       snprintf(fid_buf, sizeof(fid_buf), DFID, PFID(fid));
 
-       pthread_mutex_unlock(&m->m_ssh_lock);
+       fd = openat(open_by_fid_fd, fid_buf, O_RDWR);
+       if (fd < 0) {
+               rc = -errno;
+               llapi_printf(LLAPI_MSG_DEBUG,
+                            "cannot open "DFID" for split: %s\n",
+                            PFID(fid), strerror(errno));
+               goto out;
+       }
 
-       rc = lipe_ssh_exec(&mss->mss_ctx, cmd);
-       if (rc != 0 && rc != EUCLEAN)
-               llapi_printf(LLAPI_MSG_FATAL,
-                       "error executing ssh command '%s' on '%s': rc = %d\n",
-                       cmd, m->host, rc);
-       pthread_mutex_lock(&m->m_ssh_lock);
-       lipe_list_add(&mss->mss_list, &m->m_ssh_list);
-       pthread_cond_signal(&m->m_ssh_cond);
-       pthread_mutex_unlock(&m->m_ssh_lock);
+       rc = llapi_file_fget_mdtidx(fd, &mdt_index);
+       if (rc < 0)
+               goto out;
+
+       snprintf(vname_buf, sizeof(vname_buf), LUSTRE_VOLATILE_HDR":%.4X:%.4X",
+                mdt_index, (unsigned int)random());
+
+       vfd = openat(open_by_fid_fd, vname_buf,
+                    O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW|O_LOV_DELAY_CREATE);
+       if (vfd < 0) {
+               rc = -errno;
+               goto out;
+       }
+
+       rc = llapi_lease_acquire(fd, LL_LEASE_WRLCK);
+       if (rc < 0)
+               goto out;
+
+       layout = llapi_layout_get_by_fd(fd, 0);
+       if (layout == NULL) {
+               rc = -errno;
+               goto out;
+       }
+
+       if (last_non_stale_mirror(mirror_id, layout)) {
+               rc = -EUCLEAN;
+               goto out;
+       }
+
+       lil = calloc(1, offsetof(typeof(*lil), lil_ids[2]));
+       if (lil == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       lil->lil_mode = LL_LEASE_UNLCK;
+       lil->lil_flags = LL_LEASE_LAYOUT_SPLIT;
+       lil->lil_count = 2;
+       lil->lil_ids[0] = vfd;
+       lil->lil_ids[1] = mirror_id;
+
+       rc = llapi_lease_set(fd, lil);
+       if (rc < 0) {
+               goto out;
+       } else if (rc == 0) {
+               /* lost lease lock */
+               rc = -EBUSY;
+       } else {
+               rc = 0;
+       }
+out:
+       if (rc < 0)
+               llapi_printf(LLAPI_MSG_DEBUG,
+                            "cannot delete mirror %u of "DFID": rc = %d\n",
+                            mirror_id, PFID(fid), rc);
+       else
+               llapi_printf(LLAPI_MSG_DEBUG,
+                            "deleted mirror %u of "DFID"\n",
+                            mirror_id, PFID(fid));
+
+       llapi_layout_free(layout);
+
+       free(lil);
+
+       if (!(fd < 0))
+               close(fd);
+
+       if (!(vfd < 0))
+               close(vfd);
 
        return rc;
 }
 
-#define CMDSIZE                (64 * 1024)
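+/* Producer/consumer purge queue: the scanner queues lpurge_object entries on
+ * lpurge_work_list and the worker threads detach the corresponding mirrors. */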
+static bool lpurge_work_should_run;
+static size_t lpurge_work_thread_count;
+static pthread_t *lpurge_work_threads;
+static LIPE_LIST_HEAD(lpurge_work_list);
+static pthread_mutex_t lpurge_work_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t lpurge_work_cond = PTHREAD_COND_INITIALIZER;
+static pthread_cond_t lpurge_work_done = PTHREAD_COND_INITIALIZER;
 
-void *lpurge_spawn_one(void *args)
+static void *lpurge_work_func(void *data)
 {
-       struct lpurge_object_pack *lp = (struct lpurge_object_pack *)args;
-       char buf[PATH_MAX];
-       int rc, i, idx = lp->lop_mdt_idx;
-
-       for (i = 0; i < lp->lop_nr; i++) {
-               if (lp->lop_objs[i].lo_comp_id) {
-                       snprintf(buf, sizeof(buf),
-                               "lfs mirror split -d --mirror-id %u "
-                               "%s/.lustre/fid/"DFID" > /dev/null 2>&1",
-                               lp->lop_objs[i].lo_comp_id,
-                               mds[idx].mnt, PFID(&lp->lop_objs[i].lo_fid));
-               } else {
-                       snprintf(buf, sizeof(buf),
-                               "lfs mirror split -d -p %s "
-                               "%s/.lustre/fid/"DFID" > /dev/null 2>&1",
-                               opt.o_pool, mds[idx].mnt, PFID(&lp->lop_objs[i].lo_fid));
-               }
+       struct lpurge_object *lo;
+       int rc;
 
-               rc = lpurge_exec_cmd(&mds[idx], buf);
-               if (rc)
-                       llapi_printf(LLAPI_MSG_DEBUG,
-                                    "couldn't remove %u from "DFID": rc=%d\n",
-                                    lp->lop_objs[i].lo_comp_id,
-                                    PFID(&lp->lop_objs[i].lo_fid), rc);
+       pthread_mutex_lock(&lpurge_work_lock);
+
+       while (1) {
+               while (lpurge_work_should_run && lipe_list_empty(&lpurge_work_list))
+                       pthread_cond_wait(&lpurge_work_cond, &lpurge_work_lock);
+
+               if (!lpurge_work_should_run)
+                       break;
+
+               lo = lipe_list_entry(lpurge_work_list.next,
+                                    struct lpurge_object, lo_list);
+               lipe_list_del(&lo->lo_list);
+
+               stats.s_started++;
+               pthread_mutex_unlock(&lpurge_work_lock);
+
+               rc = lpurge_mirror_delete(&lo->lo_fid, lo->lo_mirror_id);
+               free(lo);
+
+               pthread_mutex_lock(&lpurge_work_lock);
+
+               if (rc < 0)
+                       stats.s_failed++;
+               else
+                       stats.s_purged++;
+
+               assert(stats.s_purged + stats.s_failed <= stats.s_queued);
+
+               if (stats.s_purged + stats.s_failed == stats.s_queued)
+                       pthread_cond_signal(&lpurge_work_done);
        }
 
-       /* ignore errors, in the worst case we'll find this object again */
-       pthread_exit((void *)0);
+       pthread_mutex_unlock(&lpurge_work_lock);
+
+       return NULL;
 }
 
-int lpurge_spawn(struct lipe_list_head *pm_list)
+static void lpurge_work_threads_stop(void)
 {
-       int jobs[MAX_MDTS] = { 0 };
-       struct lipe_list_head list; /* list of jobs in execution */
-       struct lpurge_object_pack *lp, *tmp;
-       int total = 0;
-       int i, repeat;
-
-       LIPE_INIT_LIST_HEAD(&list);
-
-next:
-       repeat = 0;
-       for (i = 0; i < MAX_MDTS; i++) {
-               while (!lipe_list_empty(&pm_list[i])) {
-                       pthread_t pid;
-                       int rc;
+       size_t i;
+       int rc;
 
-                       if (jobs[i] >= opt.o_max_jobs) {
-                               repeat = 1;
-                               break;
-                       }
+       pthread_mutex_lock(&lpurge_work_lock);
+       lpurge_work_should_run = false;
+       pthread_cond_broadcast(&lpurge_work_cond);
+       pthread_mutex_unlock(&lpurge_work_lock);
 
-                       lp = lipe_list_entry(pm_list[i].next,
-                                       struct lpurge_object_pack, lop_list);
-                       lipe_list_del(&lp->lop_list);
-
-                       /* now fork and execute */
-                       rc = pthread_create(&pid, NULL, lpurge_spawn_one, lp);
-                       if (rc == 0) {
-                               stats.s_spawned++;
-                               stats.s_purged = lp->lop_nr;
-
-                               lp->lop_pid = pid;
-                               lp->lop_mdt_idx = i;
-                               lipe_list_add_tail(&lp->lop_list, &list);
-                               jobs[i]++;
-                               total++;
-                       }
+       for (i = 0; i < lpurge_work_thread_count; i++) {
+               rc = pthread_join(lpurge_work_threads[i], NULL);
+               if (rc != 0) {
+                       llapi_printf(LLAPI_MSG_FATAL,
+                                    "cannot join work thread: %s\n",
+                                    strerror(rc));
                }
        }
 
-wait:
-       assert(total > 0);
-       lp = NULL;
-       lipe_list_for_each_entry_safe(lp, tmp, &list, lop_list) {
-               int rc;
-               intptr_t retval;
+       lpurge_work_thread_count = 0;
 
-               rc = pthread_tryjoin_np(lp->lop_pid, (void **)&retval);
-               if (rc)
-                       continue;
-               lipe_list_del(&lp->lop_list);
-               i = lp->lop_mdt_idx;
-               llapi_printf(LLAPI_MSG_DEBUG,
-                            "thread %lu MDT#%d completed: %"PRIdPTR"\n",
-                            lp->lop_pid, i, retval);
-               free(lp);
-               assert(jobs[i] > 0);
-               jobs[i]--;
-               total--;
-       }
-
-       if (repeat)
-               goto next;
-       if (total)
-               goto wait;
-       return 0;
-}
+       free(lpurge_work_threads);
+       lpurge_work_threads = NULL;
 
-/* round-robin distribution for unknown (w/o mappping, likely new) MDS */
-static int fid2idx_rr;
+       /* TODO: Free entries in lpurge_work_list. */
+}
 
-static int lpurge_fid2idx(const struct lu_fid *fid)
+static int lpurge_work_threads_start(size_t thread_count)
 {
-       int rc, idx;
+       int rc = 0;
 
-       /* XXX: local cache to avoid ioctl() for each FID? */
-       rc = llapi_get_mdt_index_by_fid(lustre_fd, fid, &idx);
-       if (rc < 0) {
-               llapi_printf(LLAPI_MSG_ERROR,
-                               "can't lookup "DFID": rc=%d\n",
-                               PFID(fid), rc);
-               return rc;
+       assert(thread_count > 0);
+
+       lpurge_work_should_run = true;
+
+       lpurge_work_threads = calloc(thread_count, sizeof(lpurge_work_threads[0]));
+       if (lpurge_work_threads == NULL) {
+               rc = -ENOMEM;
+               goto out;
        }
 
-       if (idx >= mdsnr || !mds[idx].host) {
-               /*
-                * no MDS registered for this index
-                * but we can use any, it's just less efficient
-                * due to over-network RPCs to open-close-modify.
-                * so choose some MDS to send replica removal to
-                */
-               do {
-                       fid2idx_rr++;
-                       if (fid2idx_rr == mdsnr)
-                               fid2idx_rr = 0;
-               } while (mds[fid2idx_rr].host == NULL);
+       while (lpurge_work_thread_count < thread_count) {
+               rc = pthread_create(&lpurge_work_threads[lpurge_work_thread_count],
+                                   NULL /* attr */,
+                                   &lpurge_work_func,
+                                   NULL /* data */);
+               if (rc != 0) {
+                       llapi_printf(LLAPI_MSG_FATAL,
+                                    "cannot create work thread: %s\n",
+                                    strerror(rc));
+                       rc = -rc;
+                       goto out;
+               }
 
-               idx = fid2idx_rr;
+               lpurge_work_thread_count++;
        }
+out:
+       if (rc < 0)
+               lpurge_work_threads_stop();
+
+       return rc;
+}
 
-       return idx;
+/* Unused. */
+void lpurge_work_submit(struct lpurge_object *lo)
+{
+       pthread_mutex_lock(&lpurge_work_lock);
+       assert(lpurge_work_should_run);
+       stats.s_queued++;
+       lipe_list_add_tail(&lo->lo_list, &lpurge_work_list);
+       pthread_cond_signal(&lpurge_work_cond);
+       pthread_mutex_unlock(&lpurge_work_lock);
 }
 
 void lpurge_purge_slot(struct lpurge_slot *ls, long long target)
 {
-       struct lpurge_object_pack *lp, *pm;
-       struct lipe_list_head pm_list[MAX_MDTS];
-       __u64 blocks[MAX_MDTS] = { 0 };
+       struct lpurge_object *lo;
        __u64 total, was, kbfree;
        int i, rc;
 
        /* try to remove some replicas */
-
-       for (i = 0; i < MAX_MDTS; i++)
-               LIPE_INIT_LIST_HEAD(&pm_list[i]);
-
 again:
        llapi_printf(LLAPI_MSG_DEBUG, "release upto %llu (expect %lu in %lu)\n",
                     target, ls->ls_space, ls->ls_found);
        total = 0;
-       /* take few FIDs */
+
        assert(!lipe_list_empty(&ls->ls_obj_list));
-       lp = lipe_list_entry(ls->ls_obj_list.next, struct lpurge_object_pack,
-                            lop_list);
-       for (i = lp->lop_nr - 1; i >= 0; i--) {
-               struct lu_fid fid;
-               int idx;
 
-               lp->lop_nr--;
+       pthread_mutex_lock(&lpurge_work_lock);
+       assert(lpurge_work_should_run);
 
-               /* split by MDT index, we could do this at scanning, but that can
-                * result in many useless lookups if only part of slots are subject
-                * to removal
-                */
-               fid = lp->lop_objs[i].lo_fid;
-               if (opt.o_iml_socket) {
-                       rc = flist_add_json_fid(lflist, &fid);
-                       if (rc < 0) {
-                               llapi_printf(LLAPI_MSG_ERROR,
-                                            "can't add "DFID": rc=%d\n",
-                                            PFID(&fid), rc);
-                               continue;
-                       }
-               } else {
-                       idx = lpurge_fid2idx(&fid);
-                       if (idx < 0)
-                               continue;
-                       /* how many blocks we expect to free */
-                       blocks[idx] += lp->lop_objs[i].lo_blocks;
-                       total += lp->lop_objs[i].lo_blocks;
-                       ls->ls_space -= lp->lop_objs[i].lo_blocks;
-                       ls->ls_found--;
-                       ls->ls_stored--;
-
-                       /* put FID into per-MDT pack */
-                       pm = NULL;
-                       if (!lipe_list_empty(&pm_list[idx]))
-                               pm = lipe_list_entry(pm_list[idx].next,
-                                                    struct lpurge_object_pack,
-                                                    lop_list);
-                       if (!pm || pm->lop_nr >= pm->lop_max) {
-                               pm = calloc(1, PM_SIZE);
-                               if (!pm) {
-                                       llapi_printf(LLAPI_MSG_ERROR,
-                                                       "can't alloca pack\n");
-                                       exit(1);
-                               }
-                               pm->lop_max = FIDS_PER_CALL;
-                               pm->lop_nr = 0;
-                               lipe_list_add(&pm->lop_list, &pm_list[idx]);
-                       }
-                       pm->lop_objs[pm->lop_nr] = lp->lop_objs[i];
-                       pm->lop_nr++;
-               }
+       while (!lipe_list_empty(&ls->ls_obj_list)) {
+               lo = lipe_list_entry(ls->ls_obj_list.next, struct lpurge_object,
+                                    lo_list);
+
+               /* how many blocks we expect to free */
+               total += lo->lo_blocks;
+               ls->ls_space -= lo->lo_blocks;
+               ls->ls_found--;
+               ls->ls_stored--;
+
+               lipe_list_move_tail(&lo->lo_list, &lpurge_work_list);
+               stats.s_queued++;
 
                /* if current collection of objects may free target space, then stop */
                if (total >= target)
                        break;
        }
-       if (!lp->lop_nr) {
-               lipe_list_del(&lp->lop_list);
-               free(lp);
-       }
+
+       pthread_cond_broadcast(&lpurge_work_cond);
+       pthread_mutex_unlock(&lpurge_work_lock);
 
        /* fork, start and wait for completion against each MDT */
        /* give some time to OST_DESTROY to pass through */
@@ -1049,10 +1087,11 @@ again:
 
        llapi_printf(LLAPI_MSG_DEBUG, "spawn, expect %llu back\n", total);
 
-       if (opt.o_iml_socket)
-               flist_write(lflist, true);
-       else
-               lpurge_spawn(pm_list);
+       /* Wait for purge threads to complete all submitted work. */
+       pthread_mutex_lock(&lpurge_work_lock);
+       while (stats.s_purged + stats.s_failed < stats.s_queued)
+               pthread_cond_wait(&lpurge_work_done, &lpurge_work_lock);
+       pthread_mutex_unlock(&lpurge_work_lock);
 
        /* XXX: 20 is probably too short for ZFS as changes need to
         * get committed on MDS first, then sent to OST, then get
@@ -1245,7 +1284,7 @@ void parse_mountpoint(const char *name)
        struct lu_fid fid;
        int rc, idx;
 
-       lustre_fd = open(name, O_RDONLY/* | O_DIRECTORY*/);
+       lustre_fd = open(name, O_RDONLY);
        if (lustre_fd < 0) {
                llapi_printf(LLAPI_MSG_FATAL,
                             "can't open %s: %d\n", name, errno);
@@ -1260,89 +1299,18 @@ void parse_mountpoint(const char *name)
                             "%s isn't Lustre mountpoint: %d\n", name, rc);
                exit(1);
        }
-}
-
-void parse_mds(char *args)
-{
-       struct mds *m;
-       char *host, *mnt, *sidx;
-       int idx, rc;
-       int j;
-       char *endptr;
-
-       /* mds# hostname mountpoint */
-       sidx = strsep(&args, ":\n");
-       idx = strtol(sidx, &endptr, 10);
-       if (*endptr != '\0' || idx < 0) {
-               rc = -EINVAL;
-               llapi_error(LLAPI_MSG_FATAL, rc,
-                           "invalid MDS index: '%s'\n", sidx);
-               exit(1);
-       }
-
-       if (idx >= MAX_MDTS) {
-               rc = -EINVAL;
-               llapi_error(LLAPI_MSG_FATAL, rc, "too high MDS index\n");
-               exit(1);
-       }
 
-       m = &mds[idx];
-       if (m->host) {
-               llapi_printf(LLAPI_MSG_FATAL,
-                            "mds #%d is defined already\n", idx);
-               exit(1);
-       }
-       host = strsep(&args, ":\n");
-       mnt = strsep(&args, ":\n");
-       if (!host || !mnt) {
-               llapi_printf(LLAPI_MSG_FATAL,
-                            "no host or mntpt: %s\n", args);
+       open_by_fid_fd = openat(lustre_fd, ".lustre/fid", O_RDONLY);
+       if (open_by_fid_fd < 0) {
+               llapi_error(LLAPI_MSG_FATAL, -errno,
+                           "cannot open '%s/.lustre/fid'", name);
                exit(1);
        }
-
-       if (idx >= MAX_MDTS) {
-               llapi_printf(LLAPI_MSG_FATAL,
-                            "MDT index %u > max index %u: %s\n",
-                            idx, MAX_MDTS, args);
-               exit(1);
-       }
-
-       m->host = strdup(host);
-       m->mnt = strdup(mnt);
-
-       /* init ssh session to target mds */
-       pthread_mutex_init(&m->m_ssh_lock, NULL);
-       pthread_cond_init(&m->m_ssh_cond, NULL);
-       LIPE_INIT_LIST_HEAD(&m->m_ssh_list);
-
-       for (j = 0; j < opt.o_max_jobs; j++) {
-               struct mds_ssh_session *mss;
-
-               mss = calloc(1, sizeof(*mss));
-               if (mss == NULL) {
-                       llapi_printf(LLAPI_MSG_FATAL,
-                               "cannot allocate ssh session\n");
-                       exit(1);
-               }
-
-               rc = lipe_init_ssh_session(&mss->mss_ctx, host);
-               if (rc != SSH_AUTH_SUCCESS) {
-                       llapi_printf(LLAPI_MSG_FATAL,
-                            "cannot create ssh session for host %s: rc = %d\n",
-                               host, rc);
-                       exit(1);
-               }
-
-               lipe_list_add(&mss->mss_list, &m->m_ssh_list);
-       }
-
-       mdsnr++;
-       llapi_printf(LLAPI_MSG_DEBUG, "add mds #%d at %s@%s\n",
-                    idx, host, mnt);
 }
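
parse_mountpoint() now also caches a descriptor on the filesystem's ".lustre/fid" directory (open_by_fid_fd), presumably so that purge operations can reach objects by FID through the local client mount rather than over ssh to an MDS host, whose configuration handling is removed elsewhere in this patch. The stand-alone sketch below shows the same open-by-FID idea using the public llapi_open_by_fid() helper instead of a cached dirfd; it assumes liblustreapi is available (link with -llustreapi), takes the FID as "0xseq:0xoid:0xver", and only checks the sign of the return value because the error convention differs across liblustreapi versions.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <lustre/lustreapi.h>

int main(int argc, char **argv)
{
        unsigned long long seq;
        unsigned int oid, ver;
        struct lu_fid fid;
        int fd;

        if (argc != 3 ||
            sscanf(argv[2], "0x%llx:0x%x:0x%x", &seq, &oid, &ver) != 3) {
                fprintf(stderr, "usage: %s <mountpoint> <fid>\n", argv[0]);
                return 1;
        }
        fid.f_seq = seq;
        fid.f_oid = oid;
        fid.f_ver = ver;

        /* llapi_open_by_fid() resolves the FID under <mount>/.lustre/fid;
         * on failure it returns a negative value (either -1 with errno set
         * or -errno, depending on the liblustreapi version). */
        fd = llapi_open_by_fid(argv[1], &fid, O_RDONLY);
        if (fd < 0) {
                fprintf(stderr, "cannot open "DFID"\n", PFID(&fid));
                return 1;
        }

        printf("opened "DFID" as fd %d\n", PFID(&fid), fd);
        close(fd);

        return 0;
}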
 
 #define LPURGE_INTERNAL_DUMP_FIDS      1
 #define LPURGE_OPT_IML_SOCKET          2
+#define LPURGE_OPT_VERSION             3
 
 static struct option options[] = {
        { "device", required_argument, NULL, 'D'},
@@ -1363,6 +1331,7 @@ static struct option options[] = {
        { "scan_rate", required_argument, NULL, 'R'},
        { "scan_threads", required_argument, NULL, 't'},
        { "slot_size", required_argument, NULL, 'S'},
+       { "version", no_argument, NULL, LPURGE_OPT_VERSION},
        { NULL }
 };
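
The new --version entry has no single-character alias, so its val member is the small integer LPURGE_OPT_VERSION, which getopt_long() returns directly and the option switch dispatches on before exiting. A minimal stand-alone sketch of that long-only-option pattern, with made-up names:

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>

#define OPT_VERSION 3   /* any value outside the short-option characters */

int main(int argc, char **argv)
{
        static const struct option opts[] = {
                { "verbose", no_argument, NULL, 'b' },
                { "version", no_argument, NULL, OPT_VERSION },
                { NULL }
        };
        int c;

        while ((c = getopt_long(argc, argv, "b", opts, NULL)) != -1) {
                switch (c) {
                case OPT_VERSION:
                        /* long-only option: getopt_long() returned "val" */
                        printf("toy version 1.0\n");
                        exit(0);
                case 'b':
                        printf("verbose enabled\n");
                        break;
                default:
                        return 1;
                }
        }

        return 0;
}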
 
@@ -1387,6 +1356,9 @@ void lpurge_process_opt(int c, char *optarg)
        case LPURGE_INTERNAL_DUMP_FIDS:
                opt.o_fids_dumpfile = strdup(optarg);
                break;
+       case LPURGE_OPT_VERSION:
+               lipe_version();
+               exit(0);
        case 'b':
                llapi_msg_set_level(LLAPI_MSG_MAX);
                break;
@@ -1441,7 +1413,7 @@ void lpurge_process_opt(int c, char *optarg)
                opt.o_freelo = value;
                break;
        case 'm':
-               parse_mds(optarg);
+               /* parse_mds(optarg); */
                break;
        case 'M':
                opt.o_mountpoint = strdup(optarg);
@@ -1565,7 +1537,6 @@ void load_config(char *name)
 void lpurge_verify_opts(void)
 {
        char buf[1024];
-       int i;
 
        if (!opt.o_pool) {
                opt.o_pool = DEF_POOL;
@@ -1659,42 +1630,6 @@ void lpurge_verify_opts(void)
                return;
        }
 
-       // Stand-alone Operation Checks
-       if (mdsnr == 0) {
-               llapi_printf(LLAPI_MSG_FATAL,
-                            "mds is not configured\n");
-               exit(1);
-       }
-
-       for (i = 0; i < mdsnr; i++) {
-               int rc;
-               char cmd[PATH_MAX];
-
-               if (!mds[i].host) {
-                       llapi_printf(LLAPI_MSG_FATAL,
-                                    "MDS host is not configured\n");
-                       exit(1);
-               }
-
-               if (!mds[i].mnt) {
-                       llapi_error(LLAPI_MSG_FATAL, -EINVAL,
-                                   "mountpoint on MDS host '%s' is not configured\n",
-                                   mds[i].host);
-                       exit(1);
-               }
-
-               /* check mountpoint on MDS */
-               snprintf(cmd, sizeof(cmd), "lfs path2fid %s > /dev/null 2>&1",
-                       mds[i].mnt);
-               rc = lpurge_exec_cmd(&mds[i], cmd);
-               if (rc) {
-                       llapi_error(LLAPI_MSG_FATAL, -EINVAL,
-                                   "invalid mountpoint '%s' on MDS host: '%s'\n",
-                                   mds[i].mnt, mds[i].host);
-                       exit(1);
-               }
-       }
-
        if (opt.o_max_jobs < 1 || opt.o_max_jobs > 1024) {
                llapi_printf(LLAPI_MSG_FATAL,
                             "invalid max_jobs: %d\n", opt.o_max_jobs);
@@ -1744,24 +1679,23 @@ void lpurge_usr1_handle(int sig)
                return;
        }
        llapi_printf(LLAPI_MSG_DEBUG, "dump to %s\n", opt.o_dumpfile);
-       fprintf(f, "config:\n"
+       fprintf(f,
+               "version: %s-%s\n"
+               "revision: %s\n"
+               "config:\n"
                "    free_high: %u\n"
                "    free_low: %u\n"
                "    ostname: %s\n"
                "    mountpoint: %s\n"
                "    pool: %s\n"
-               "    mds: ",
-               opt.o_freehi, opt.o_freelo,
-               opt.o_device, opt.o_mountpoint, opt.o_pool);
-       for (i = 0; i < mdsnr; i++)
-               fprintf(f, "%s%d:%s:%s", i ? "," : "", i,
-                       mds[i].host, mds[i].mnt);
-       fprintf(f, "\n"
                "    max_jobs: %u\n"
                "    check_interval: %u\n"
                "    scan_rate: %u\n"
                "    scan_threads: %u\n"
                "    slot_size: %u\n",
+               PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION,
+               opt.o_freehi, opt.o_freelo,
+               opt.o_device, opt.o_mountpoint, opt.o_pool,
                opt.o_max_jobs, opt.o_interval, opt.o_scan_rate,
                opt.o_scan_threads, opt.o_slot_size);
        if (opt.o_iml_socket)
@@ -1771,11 +1705,14 @@ void lpurge_usr1_handle(int sig)
                "    scans: %lu\n"
                "    scan_time: %llu\n"
                "    slow_scans: %lu\n"
-               "    spawned: %lu\n"
+               "    queued: %lu\n"
+               "    started: %lu\n"
                "    purged: %lu\n"
+               "    failed: %lu\n"
                "    low_hits: %lu\n",
                stats.s_scans, stats.s_scan_time, stats.s_slow_scans,
-               stats.s_spawned, stats.s_purged, stats.s_low_hits);
+               stats.s_queued, stats.s_started, stats.s_purged, stats.s_failed,
+               stats.s_low_hits);
        lpurge_kbfree(&kbfree);
        fprintf(f, "space:\n"
                "    kbfree: %llu\n"
@@ -1784,7 +1721,7 @@ void lpurge_usr1_handle(int sig)
                kbfree, freelo, freehi);
 
 #define HIST_FMT \
-       "    hist%u: { age: %lu, found: %lu, space: %lu, stored: %lu }\n"
+       "    hist%u: { age: %lu, found: %lu, space: %lu, stored: %lu, nomirror_cnt: %lu, nomirror_space: %lu, nopfid_cnt: %lu, nopfid_space: %lu, notfirst_cnt: %lu, notfirst_space: %lu }\n"
 
        fprintf(f, "hlists:\n");
        for (i = LPURGE_HIST_MAX - 1; i >= 0; i--) {
@@ -1794,7 +1731,10 @@ void lpurge_usr1_handle(int sig)
                        continue;
 
                fprintf(f, HIST_FMT, i, ls->ls_age, ls->ls_found,
-                       ls->ls_space, ls->ls_stored);
+                       ls->ls_space, ls->ls_stored,
+                       ls->ls_nomirror_objs, ls->ls_nomirror_space,
+                       ls->ls_nopfid_objs, ls->ls_nopfid_space,
+                       ls->ls_notfirst_objs, ls->ls_notfirst_space);
        }
 
        fflush(f);
@@ -1924,17 +1864,13 @@ void lpurge_usr2_handle(int sig)
         * {"fid":"[0x200000401:0x6:0x0]", "last_use_time": 123456, "slot_id": $SLOT_ID}
         */
        for (i = LPURGE_HIST_MAX - 1; i >= 0; i--) {
-               struct lpurge_object_pack *lp;
                struct lpurge_slot *ls = lpurge_hist + i;
+               struct lpurge_object *lo;
 
                if (ls->ls_found == 0)
                        continue;
 
-               lp = lipe_list_entry(ls->ls_obj_list.next,
-                                    struct lpurge_object_pack,
-                                    lop_list);
-
-               for (i = lp->lop_nr - 1; i >= 0; i--) {
+               lipe_list_for_each_entry(lo, &ls->ls_obj_list, lo_list) {
                        int rc;
                        char fid_buf[FID_LEN + 1];
                        struct lu_fid fid;
@@ -1945,13 +1881,13 @@ void lpurge_usr2_handle(int sig)
 
                        obj_stat = json_object_new_object();
 
-                       fid = lp->lop_objs[i].lo_fid;
+                       fid = lo->lo_fid;
                        snprintf(fid_buf, sizeof(fid_buf), DFID, PFID(&fid));
                        json_object_object_add(obj_stat, "fid",
                                               json_object_new_string(fid_buf));
                        json_object_object_add(obj_stat, "last_use_time",
                                               json_object_new_int64(
-                                              lp->lop_objs[i].lo_last_utime));
+                                                      lo->lo_last_utime));
                        json_object_object_add(obj_stat, "slot_id",
                                               json_object_new_int(i));
                        output = json_object_to_json_string_ext(obj_stat,
@@ -2041,14 +1977,10 @@ static void lpurge_register_signal_handlers(void)
 
 int main(int argc, char **argv)
 {
-       ssh_threads_set_callbacks(ssh_threads_get_pthread());
-       ssh_init();
-
+       lipe_version_init();
        setlinebuf(stdout);
        setlinebuf(stderr);
 
-       memset(mds, 0, sizeof(mds));
-
        llapi_msg_set_level(LLAPI_MSG_INFO);
 
        signal(SIGUSR1, lpurge_null_handler);
@@ -2061,6 +1993,10 @@ int main(int argc, char **argv)
         * followed by the OST name ("lpurge lustre-OST0000"). */
        llapi_set_command_name(ostname);
 
+       llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
+                   "version %s-%s, revision %s\n",
+                   PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION);
+
        lpurge_get_ost_mntpt();
        lpurge_configure_thresholds();
 
@@ -2070,6 +2006,8 @@ int main(int argc, char **argv)
 
        lpurge_register_signal_handlers();
 
+       lpurge_work_threads_start(opt.o_max_jobs);
+
        while (1) {
                /* XXX: do lazy scanning in the background
                 * before the real need, so that when
@@ -2091,4 +2029,9 @@ int main(int argc, char **argv)
                /* device size can change runtime.. */
                lpurge_configure_thresholds();
        }
+
+       /* Unreached. */
+       lipe_version_fini();
+
+       return 0;
 }
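
For reference, the SIGUSR2 handler above serializes each tracked object as a one-line json-c record of the form shown in its comment ({"fid": ..., "last_use_time": ..., "slot_id": ...}). The stand-alone sketch below builds one such record with the same json-c calls; the field values are placeholders for illustration, and the program only prints the record (build with -ljson-c).

#include <stdio.h>
#include <json-c/json.h>

int main(void)
{
        struct json_object *rec = json_object_new_object();

        /* Field names follow the record format documented in the USR2
         * handler; the values here are invented. */
        json_object_object_add(rec, "fid",
                               json_object_new_string("[0x200000401:0x6:0x0]"));
        json_object_object_add(rec, "last_use_time",
                               json_object_new_int64(123456));
        json_object_object_add(rec, "slot_id", json_object_new_int(7));

        printf("%s\n",
               json_object_to_json_string_ext(rec, JSON_C_TO_STRING_PLAIN));

        json_object_put(rec);   /* drop the reference, freeing the object */

        return 0;
}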
index a58525b..fd674bf 100644 (file)
@@ -7,12 +7,12 @@
  *
  * Author: Li Xi <lixi@ddn.com>
  */
-#include <lustre/lustreapi.h>
+#include <errno.h>
 #include <fnmatch.h>
+#include <lustre/lustreapi.h>
 #include "fake_lustre_idl.h"
 #include "lustre_ea.h"
 #include "debug.h"
-#include "errno.h"
 
 int decode_linkea(char *buf, ssize_t size)
 {
index 1ebad2d..d731ccb 100755 (executable)
@@ -1,13 +1,8 @@
 #!/bin/sh
 
-# Please update all the version references if VERSION changed.
-# lipe.spec.in
-# man/lipe_scan.1
-# pylipe/lipe_test.py
-
-VERSION="1.14"
-if [ -d .git ]; then
-       VERSION=$(git describe|sed 's/-[0-9]*-/./')
-fi
+VERSION="1.17"
+# if [ -d .git ]; then
+#      VERSION=$(git describe|sed 's/-[0-9]*-/./')
+# fi
 
 printf "%s" "$VERSION"