From dfa0760e7d8c7f5f56ebb5ee2e766f0a05cc4e67 Mon Sep 17 00:00:00 2001
From: "John L. Hammond" <jhammond@whamcloud.com>
Date: Tue, 27 Apr 2021 09:38:30 -0500
Subject: [PATCH] Squashed 'lipe/' changes from 38f79e56ec..8251fae87b

8251fae87b Update lipe version to 1.17.
87ee780007 EX-1613 scripts: Use ticket to start/stop hotpools
0ce8cdc011 EX-3078 lipe: quote FIDs in remote commands
dcd24fe01e EX-3034 lamigo: check for available agents early
0ff4506a1d EX-3043 lamigo: remove debugging leftover
22cf972d45 EX-3043 lamigo: simplify changelog cleaning check
de214cee10 EX-3009 lamigo: dump changelog status
8e7ee6cc80 EX-3017 lpurge: for stats for skipped objects
b0bce08ec5 EX-2768 lamigo: don't register a SIGCHLD handler
8981cfa704 EX-3036 lipe: version and revision support
1f562d542b EX-3020 lamigo: prevent out of order changelog clearing
d859e4bff2 EX-3021 lipe: refactor lipe_ssh context handling
4124346b0d EX-2718 lpurge: use local mountpoint for purge operations
7caf05b672 EX-3030 lipe: join multiple threads in lamigo_check_jobs()
911cf5018b EX-2994 lipe: update lpurge purged stats correctly
3d1af585e3 Update lipe version to 1.16.
842d8c5aad EX-2962 lipe: Fix config autodetect
df423e9539 EX-2948 build: less checks in lipe configuration
6745370c5b EX-2983 lamigo: reduce log level in lamigo_exec_cmd()
77e90c1df3 EX-2979 lamigo: do not count setprefer as replication
eda4f09711 EX-2608 scripts: Auto detect previous values
4ca909e2ea Update lipe version to 1.15.
2f80b12aaf EX-2770 lpurge: set lop_mdt_idx before spawning thread
370ba3a118 EX-2930 lipe: fix errno.h include
4f672e1346 EX-2921 lipe: merge tools/lipe to lipe subtree
17a2a63533 EX-2778 lipe: lipe.spec fixes

git-subtree-dir: lipe
git-subtree-split: 8251fae87b508e36caab6397b1063b308dcb2b05
---
 Makefile.am                             |   1 +
 configure.ac                            |  39 +-
 hotpools-1.17-release-notes.md          | 456 +++++++++++++++++
 lipe-revision.sh                        |  29 ++
 lipe.spec.in                            |  26 +-
 scripts/stratagem-hp-config.sh          | 249 +++++++---
 scripts/stratagem-hp-convert.sh         |  76 +++
 scripts/stratagem-hp-start.sh           |  28 +-
 scripts/stratagem-hp-stop.sh            |  30 +-
 scripts/stratagem-hp-teardown-client.sh |  90 ++++
 scripts/stratagem-hp-teardown.sh        |  86 +++-
 src/Makefile.am                         |   6 +-
 src/lamigo.c                            | 220 ++++++---
 src/lamigo_alr.c                        |  27 +-
 src/lipe_ssh.c                          | 238 +++++----
 src/lipe_ssh.h                          |  14 +-
 src/lipe_version.c                      |  32 ++
 src/lipe_version.h                      |   8 +
 src/lpurge.c                            | 837 +++++++++++++++-----------------
 src/lustre_ea.c                         |   4 +-
 version-gen.sh                          |  13 +-
 21 files changed, 1693 insertions(+), 816 deletions(-)
 create mode 100644 hotpools-1.17-release-notes.md
 create mode 100755 lipe-revision.sh
 create mode 100755 scripts/stratagem-hp-convert.sh
 create mode 100755 scripts/stratagem-hp-teardown-client.sh
 create mode 100644 src/lipe_version.c
 create mode 100644 src/lipe_version.h

diff --git a/Makefile.am b/Makefile.am
index 07c0811..a9f0a1b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -55,6 +55,7 @@ PYTHON_COMMANDS = \
 EXTRA_DIST= \
 	$(PYTHON_COMMANDS) \
 	detect-distro.sh \
+	lipe-revision.sh \
 	example_configs/clownfish/seperate_mgs/clownfish.conf \
 	example_configs/clownfish/seperate_mgs/lipe_virt.conf \
 	example_configs/lipe/lipe_install.conf \
diff --git a/configure.ac b/configure.ac
index dc74bf9..88a47d5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -108,43 +108,13 @@ if test "x$idle_user_header" = "xyes"; then
         AC_DEFINE(IDLE_USER_HEADER, 1, [Lustre uses idle user header])
 fi
 
-# -------- check whether DoM is supported by Lustre head files --------
-AC_MSG_CHECKING([Lustre has PFL support])
-saved_libs=$LIBS
-LIBS="-llustreapi"
-AC_LINK_IFELSE([AC_LANG_SOURCE([	
-	#include <lustre/lustre_user.h>
-
-	int main(void) {
-		int a = LOV_USER_MAGIC_COMP_V1;
-	}
-])],[
+# few defines which aren't conditional anymore
 AC_DEFINE([HAVE_LUSTRE_PFL], 1,
 	  [Lustre has PFL support])
-have_lustre_pfl=yes
-], [have_lustre_pfl="no"])
-LIBS=$saved_libs
-AC_MSG_RESULT([$have_lustre_pfl])
-
-# -------- check for llapi_layout_get_by_xattr() --------
-AC_MSG_CHECKING([Lustre have llapi_layout_get_by_xattr() and LSoM])
-saved_libs=$LIBS
-LIBS="-llustreapi"
-AC_LINK_IFELSE([AC_LANG_SOURCE([	
-	#include <lustre/lustreapi.h>
-
-	int main(void) {
-		llapi_layout_get_by_xattr(NULL, 0, 0);
-	}
-])],[
 AC_DEFINE([HAVE_LAYOUT_BY_XATTR], 1,
 	  [have llapi_layout_get_by_xattr()])
 AC_DEFINE([HAVE_LAZY_SIZE_ON_MDT], 1,
 	  [have lazy size on MDT])
-have_layout_by_xattr=yes
-], [have_layout_by_xattr="no"])
-LIBS=$saved_libs
-AC_MSG_RESULT([$have_layout_by_xattr])
 
 # -------- check for llapi_changelog_in_buf() --------
 AC_MSG_CHECKING([Lustre have llapi_changelog_in_buf()])
@@ -337,8 +307,15 @@ AC_SEARCH_LIBS([ext2fs_open2], [ext2fs])
 
 LIPE_RELEASE="1"
 AC_DEFINE_UNQUOTED(RELEASE, "$LIPE_RELEASE", [release info] )
+AC_DEFINE_UNQUOTED(LIPE_RELEASE, "$LIPE_RELEASE", [release info] )
 AC_SUBST(LIPE_RELEASE)
 
+AC_MSG_CHECKING([for lipe revision])
+LIPE_REVISION=$(bash lipe-revision.sh)
+AC_MSG_RESULT([$LIPE_REVISION])
+AC_DEFINE_UNQUOTED(LIPE_REVISION, "$LIPE_REVISION", [revision info] )
+AC_SUBST(LIPE_REVISION)
+
 # for exporting to spec file
 AC_SUBST(ac_configure_args)
 AC_CONFIG_FILES([Makefile
diff --git a/hotpools-1.17-release-notes.md b/hotpools-1.17-release-notes.md
new file mode 100644
index 0000000..14c6763
--- /dev/null
+++ b/hotpools-1.17-release-notes.md
@@ -0,0 +1,456 @@
+# Hotpools Hotfix (lipe-server-1.17) Release Notes
+
+## lamigo agents
+
+lamigo will continue to ssh to "agents" and invoke lfs to mirror and
+resync files. An "agent" need not be an EXAScaler server (VM). Instead
+an agent can be any node that lamigo can reach via ssh as root, has a
+Lustre client mount, and has the lfs utility installed. The lamigo
+configuration file determines the set of agents used:
+```
+    # MDT=myfs1-MDT0042
+    # grep '^agent=' /etc/lamigo/${MDT}.conf
+    agent=ai400x-0a12-vm00:/lustre/myfs1/client:4
+    agent=ai400x-0a12-vm01:/lustre/myfs1/client:4
+    agent=ai400x-0a12-vm02:/lustre/myfs1/client:4
+    agent=ai400x-0a12-vm03:/lustre/myfs1/client:4
+```
+In the triple `ai400x-0a12-vm00:/lustre/myfs1/client:4` the first
+component is the hostname of the agent, the second is the mount point
+of the Lustre client on the agent, and the third is the maximum number
+of concurrent mirror operations.
+
+Using agents other than the EXAScaler servers will alleviate many of
+the HA issues seen to date (DDN-1920, EX-1613). The
+`stratagem-hp-config.sh` scripts will use the EXAScaler servers as
+mirroring agents for lamigo.
+
+Previous version of lamigo and lpurge failed to startup in some
+certain cases when agents were unreachable (DDN-2070, EX-3021). All
+known issues in this area are believed to be fixed.
+
+In the example above we have 4 agents with 4 concurrent operations
+each for a total of 16 concurrent operations on behalf of this lamigo
+service. Some basic benchmarking on a mix of small and medium files
+has shown that we can expect about twice as many (2 * 16 = 32)
+replications per second from lamigo.
+
+## agentless lpurge
+
+lpurge will no longer use agents or external commands (lfs mirror
+split ...) to purge files. Instead it will use the Lustre client mount
+point (`/lustre/$FS/client`) on the server where lpurge runs. For
+backwards compatibility lpurge will ignore any agents specified in the
+config file. Note that lpurge.conf uses lines of the form
+```
+    mds=0:ai400x-0a12-vm00:/lustre/myfs1/client
+    mds=1:ai400x-0a12-vm01:/lustre/myfs1/client
+```
+to specify an agent. Since they will be ignored, we recommend just
+leaving them in the config file. This will make downgrade easier if
+needed.
+
+## Online upgrade of lamigo and lpurge
+
+To upgrade lamigo and lpurge on a live system we suggest installing
+the new lipe-server RPM to all EXAScaler servers and then running
+```
+    # clush -a killall lamigo lpurge
+```
+In this case systemd will automatically restart lamigo and lpurge
+(using the newly installed binaries) without invoking HA. This is
+preferred over using `stratagem-hp-stop.sh && stratagem-hp-start.sh`.
+
+## Converting hotpools to use ticketed HA configuration
+
+Using the Lustre client mount Pacemaker resource to control the
+starting and stopping of hotpools introduced cyclic resource
+dependencies which in turn made stopping hotpools impossible in some
+cases (usually leading to STONITH, see DDN-1920 and EX-1613). To
+address this `stratagem-hp-config.sh` now uses a new Pacemaker
+resource (`$FS-hotpool`) and ticket (`$FS-hotpool-allocated`) to start
+and stop hotpools. Running `stratagem-hp-stop.sh` stops lamigo and
+lpurge but not the Lustre client mounts on the servers.
+
+Alongside the changes to `stratagem-hp-config.sh`, this release
+provides two new HA scripts:
+
+`stratagem-hp-convert.sh` converts a previously configured hotpools
+installation to a ticketed configuration. Conversion may be performed
+with hotpools running or stopped.
+
+`stratagem-hp-teardown-client.sh` stops and unconfigures the Lustre
+client mount on the servers.
+
+## Permanently stopping hotpools
+
+If hotpools is to be stopped permanently or indefinitely then the
+changelog users registered for lamigo should be deregistered to avoid
+client operations failing due to full changelogs and increased
+changelog consumption on the MDTs (upto about 512GB). This should be
+done for each MDT.
+
+## Restarting hotpools
+
+If hotpools was previously enabled and subsequently stopped then
+some maintenance around changelogs should be performed before
+re-enabling it. Remove any old '.chlg' files:
+```
+    # clush -a "find /var/lib -name 'lamigo-*.chlg' -delete"
+```
+For each MDT in $FS-MDT0000..$FS-MDTffff perform the following steps.
+
+1. Check that the changelog user specified in the lamigo config file
+agrees with the changelog user registered on the MDT:
+```
+    # MDT=myfs1-MDT0042
+    # grep '^user=' /etc/lamigo/${MDT}.conf
+    user=cl23
+    # clush -aN lctl get_param mdd.${MDT}.changelog_users 2> /dev/null
+    mdd.myfs1-MDT0042.changelog_users=
+    current index: 83896534
+    ID    index (idle seconds)
+    cl23  82748303 (481516)
+```
+If there are other changelog users registered then they may need to be
+removed. Please consult with the site admins to ensure that they are
+not in use by another changelog consumer (Robinhood, laudit,
+...). Other registered changelog users with high `idle seconds` values
+should be regarded with suspicion.
+
+The number of changelog entries stored on disk by the MDT is the
+difference between the "current index" and the minimum of the indexes
+of all registered users:
+```
+    83896534 - 82748303 = 1148231
+```
+2. Check the total changelog space consumption on the MDT:
+```
+    # clush -aN lctl get_param mdd.${MDT}.changelog_size 2> /dev/null
+    mdd.myfs1-MDT0042.changelog_size=161358368
+```
+A changelog entry consumes about 144 bytes on average. So this looks
+about right
+```
+    144 * 1148231 = 165345264
+```
+Contact L3 if there are no changelog users registered but
+`changelog_size` is reporting a large value or if `changelog_size` is much
+larger than the expected size.
+
+3. Clear old changelogs:
+
+If lamigo is the only changelog consumer and there are many unconsumed
+entries on the MDT then clear the changelog manually before restarting
+hotpools:
+```
+    # clush -aN lctl get_param mdd.${MDT}.changelog_users 2> /dev/null
+    mdd.myfs1-MDT0042.changelog_users=
+    current index: 83896534
+    ID    index (idle seconds)
+    cl23  82748303 (481516)
+    # CL=cl23
+    # lfs changelog_clear $MDT $CL 0
+```
+Clearing a large amount of old changelog entries will take some
+time. (Do not interrupt it!) Changelogs on *different* MDTs may be
+cleared in parallel.
+
+The steps listed above should be repeated for each MDT!
+
+## Tuning lamigo (optimization)
+
+Consider increasing the number of agents or the number of concurrent
+jobs per agent.
+
+## Tuning lpurge (optimization)
+
+Consider increasing the max_jobs config parameter from the default of
+8.
+
+## Reducing the changelog mask (optimization)
+
+If there are no changelog consumers other than lamigo then the
+changelog mask may be reduced from the default to include only the
+types needed by hotpools (`MARK UNLNK CLOSE`). Note the spelling of
+'UNLNK'. To see the current changelog mask run:
+```
+    # clush -aN lctl get_param "mdd.${FS}-*.changelog_mask"
+    mdd.myfs1-MDT0042.changelog_mask=
+    MARK CREAT MKDIR HLINK SLINK MKNOD UNLNK RMDIR RENME RNMTO CLOSE LYOUT TRUNC SATTR XATTR HSM MTIME CTIME MIGRT FLRW RESYNC 
+```
+To make a persistent setting run the following on the MGS:
+```
+    # lctl set_param -P "mdd.${FS}-*.changelog_mask"='MARK UNLNK CLOSE'
+```
+## Creating a nodemap for hotpools agents (optimization)
+
+As lamigo consumes changelogs (and OST access logs) it dispatches
+mirroring operation to agents which in turn generate yet more
+changelog entries that lamigo must consume. This feedback loop is not
+fatal but does affect performance. It can be avoided by creating a
+nodemap that containing the EXAScaler servers (and any configured
+agents) and setting `audit_mode=0` for that nodemap.
+
+(First consult site admins to determine if nodemaps are already in
+use. If so then any changes made must be consistent with current
+nodemap configuration.)
+
+To proceed collect the set of possible LNet NIDs for the hotpools
+agents. By default this will be the NIDs of the EXAScaler server VMs.
+```
+    # clush -aN lctl list_nids | sort
+    172.16.0.48@o2ib
+    172.16.0.49@o2ib
+    172.16.0.50@o2ib
+    172.16.0.51@o2ib
+    172.16.0.52@o2ib
+    172.16.0.53@o2ib
+    172.16.0.54@o2ib
+    172.16.0.55@o2ib
+```
+This list of NIDs must be converted to a comma separated list or range
+expression. In this example:
+```
+    172.16.0.[48-55]@o2ib
+```
+(See https://github.com/DDNStorage/lustre_manual_markdown/blob/master/03.16-Mapping%20UIDs%20and%20GIDs%20with%20Nodemap.md for the full syntax.)
+To configure and enable the nodemap run the following on the MGS:
+```
+    # HP_NODEMAP=hp_agents
+    # HP_AGENT_NIDS="172.16.0.[48-55]@o2ib"
+    # lctl nodemap_modify --name default --property admin --value 1
+    # lctl nodemap_modify --name default --property trusted --value 1
+    # lctl nodemap_add $HP_NODEMAP
+    # lctl nodemap_modify --name $HP_NODEMAP --property admin --value 1
+    # lctl nodemap_modify --name $HP_NODEMAP --property trusted --value 1
+    # lctl nodemap_modify --name $HP_NODEMAP --property audit_mode --value 0
+    # lctl nodemap_add_range --name $HP_NODEMAP --range "$HP_AGENT_NIDS"
+    # lctl nodemap_add_range --name $HP_NODEMAP --range 0@lo
+    # lctl nodemap_activate 1
+```
+
+## Monitoring lamigo and lpurge
+
+There are several ways to monitor hotpools beyond checking the
+utilization of the fast pool. lamigo maintains several counters and
+statistics which it will write out on receiving `SIGUSR1`. The following
+commands were run on the VM where `${MDT}` is mounted.
+```
+    # MDT=myfs1-MDT0042
+    # cat /var/run/lamigo-${MDT}.pid
+    20223
+    # ps -Fp $(cat /var/run/lamigo-${MDT}.pid)
+    UID        PID  PPID  C    SZ   RSS PSR STIME TTY          TIME CMD
+    root     14434  9331  0 71699  5120   3 Apr20 ?        00:59:16 /usr/sbin/lamigo -f /etc/lamigo/myfs1-MDT0042.conf
+    # pkill --exact --pidfile=/var/run/lamigo-${MDT}.pid --signal=USR1 lamigo
+    # sleep 1
+    # cat /var/run/lamigo-${MDT}.stats
+    version: 1.16-1
+    revision: da7d99c2761e0f294a3e0a150ef59972bd04bc89
+    config:
+        chlg_user: cl23
+        mdtname: myfs1-MDT0042
+        mountpoint: /lustre/myfs1/client
+        source_pool: ddn_ssd
+        target_pool: ddn_hdd
+        min_age: 600
+        max_cache: 268435456
+        rescan: 0
+        thread_count: 1
+        pool_refresh: 600
+        progress_interval: 600
+        periods: 16
+        period_time: 10
+        warmup_k: 20
+        cooldown_k: 15
+        ofd_interval: 5
+        hot_fraction: 20
+        hot_after_idle: 3
+        src_free: 70
+        tgt_free: 10
+    pool ddn_ssd:
+        osts: 8
+        avail: 22974591116
+        total: 31824761344
+        open: 1
+    pool ddn_hdd:
+        osts: 2
+        avail: 436791025224
+        total: 494282835936
+        open: 1
+    stats:
+        read: 6188174
+        skipped: 369235
+        processed: 5818939
+        removed: 0
+        dups: 311924
+        spawned: 7111691
+        replicated: 4429160
+        busy: 0
+        queued: 1226018
+        skip_hot: 0
+        ro2hot: 0
+        rw2hot: 0
+        rw2cold: 3382984
+    changelog:
+        last_processed_idx: 84139503
+        last_cleared_idx: 82909792
+        last_cleared_time: 1619120075 # ( Thu Apr 22 19:34:35 2021 )
+    agents:
+        agent3: { host: ai400x-0a12-vm03, mnt: /lustre/myfs1/client, jobs: 4, max: 4, state: active }
+        agent2: { host: ai400x-0a12-vm02, mnt: /lustre/myfs1/client, jobs: 3, max: 4, state: active }
+        ...
+    jobs:
+        job0: { tid: 140569798792960, fid: [0x2000079ac:0xfb4e:0x0], index: 0, agent: 3, start: 1619120075, command: extend }
+        job1: { tid: 140568524986112, fid: [0x2000079ac:0xfb02:0x0], index: 0, agent: 3, start: 1619120075, command: extend }
+        job2: { tid: 140569807185664, fid: [0x2000079ac:0xfaff:0x0], index: 0, agent: 3, start: 1619120075, command: extend }
+        ...
+```
+Note that the difference between the MDT current changelog index and
+the last index cleared by lamigo indicates the degree to which
+lamigo's consumption of changelog entries is keeping up the production
+of new entries (both values can be seen using the `changelog_users`
+parameter as shown above). A 10 minute (min-age) lag between reading
+and clearing is normal.
+
+Similarly for lpurge, the following commands were run on the VM where
+`${OST}` is mounted:
+```
+    # OST=myfs1-OST0117
+    # cat /var/run/lpurge-${OST}.pid
+    20222
+    # ps -Fp $(cat /var/run/lpurge-${OST}.pid)
+    UID      PID  PPID  C     SZ   RSS PSR STIME TTY     TIME CMD
+    root   20222     1  6 171957 13352  16 Apr20   ? 03:09:06 /usr/sbin/lpurge -f /etc/lpurge/myfs1/OST0117.conf
+    # pkill --exact --pidfile=/var/run/lpurge-${OST}.pid --signal=USR1 lpurge
+    # sleep 1
+    # cat /var/run/lpurge-${OST}.stats
+    version: 1.16-1
+    revision: b465db8b9b99e217d175f31230d04e10a9a17906
+    config:
+        free_high: 80
+        free_low: 50
+        ostname: myfs1-OST0117
+        mountpoint: /lustre/myfs1/client
+        pool: ddn_ssd
+        max_jobs: 8
+        check_interval: 30
+        scan_rate: 10000
+        scan_threads: 1
+        slot_size: 1048576
+    stats:
+        scans: 10
+        scan_time: 346
+        slow_scans: 5
+        queued: 3210711
+        started: 3202330
+        purged: 363985
+        failed: 2838337
+        low_hits: 6
+    space:
+        kbfree: 2997655148
+        low: 1989047584
+        hi: 3182476134
+    hlists:
+```
+
+# Known issues
+
+## If lamigo is falling behind on changelog consumption
+
+Append a `rescan` directive to the lamigo configuration file,
+manually clear the changelog file, and then restart lamigo.
+```
+    # MDT=myfs1-MDT0042
+    # echo rescan >> /etc/lamigo/${MDT}.conf
+    # CL=...
+    # lfs changelog_clear $MDT $CL 0
+    # pkill --exact --pidfile=/var/run/lamigo-${MDT}.pid lamigo
+```
+
+## lamigo SSH errors
+
+If lamigo reports
+```
+    lamigo: Error authenticating pubkey: Failed to read private key: /root/.ssh/id_rsa
+```
+or
+```
+    lamigo: cannot authenticate SSH session to host HOST: ...
+```
+then first check that `ssh` works between the affected hosts. If so
+then check that the private key file (usually `/root/.ssh/id_rsa`) is
+in PEM format. If the key file starts with
+```
+    -----BEGIN RSA PRIVATE KEY-----
+```
+then it is likely in PEM format. If, instead, it starts with
+```
+    -----BEGIN OPENSSH PRIVATE KEY-----
+```
+then it is in the newer OpenSSH format. To generate a key in PEM
+format, add `-m PEM` to the `ssh-keygen` command line.
+
+Second, ensure that the key file does not contain any comments (lines
+starting with `#`).
+
+## Working around HA/Pacemaker/STONITH when stopping hotpools
+
+Formerly "stratagem-hp-stop-no-more-tears.txt". These instructions are
+obsolete after converting hotpools to a ticketed HA configuration.
+```
+# Temporary process to stop hotpools in a production environment
+# without invoking HA STONITH due to busy client mount points.
+#
+# XXX This is not an automated process. You need to run each command in
+# turn and look at the output. Maybe even understand it. Sorry.
+#
+# See https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/high_availability_add-on_reference/s1-managedresource-haar
+# for more information.
+#
+# Author: John L. Hammond <jhammond@whamcloud.com>
+
+# Put all lamigo and lpurge HA resources in unmanaged mode.
+crm resource unmanage 'lamigo*'
+crm resource unmanage 'lpurge*'
+
+# In the output of hastatus the lamigo and lpurge resources should now
+# show up as "Started (unmanaged)"
+hastatus
+
+# Use systemctl to actually stop the services corresponding to the HA
+# resources you just unmanaged.
+clush -a systemctl stop 'lamigo@*'
+clush -a systemctl stop 'lpurge@*'
+
+# In the output of hastatus the lpurge resources should now show up as "Stopped (unmanaged)"
+hastatus
+
+# Verify that the processes have actually terminated.
+clush -a ps -FC lamigo,lpurge
+
+# Wait for any mirroring operations to drain from the client mounts.
+clush -a lsof /lustre/$FS/client/
+clush -a ps -FC lfs
+
+# Once they have drained you can use the stop script.
+stratagem-hp-stop.sh
+
+# To restart:
+crm resource manage 'lamigo*'
+crm resource manage 'lpurge*'
+stratagem-hp-start.sh
+
+# Verify that things are back to normal:
+hastatus
+
+# After all of this 'hastatus' may show some 'Failed Resource Actions'. For example:
+#
+# Failed Resource Actions:
+# * lpurge-fs0a12-OST0004_monitor_30000 on ai400x-0a12-vm02 'not running' (7): call=316, status=complete, exitreason='',
+#     last-rc-change='Thu Apr 8 18:01:57 2021', queued=0ms, exec=0ms
+#
+# To clean these up do 'crm resource cleanup lpurge-fs0a12-OST0004'
+
+```
diff --git a/lipe-revision.sh b/lipe-revision.sh
new file mode 100755
index 0000000..32933ff
--- /dev/null
+++ b/lipe-revision.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Try to get the current revision (commit hash). Don't fail.
+
+function get_revision {
+	local src="$1"
+	local val="$2"
+
+	if [[ -n "${val}" ]]; then
+		echo "$0: using '${src}' = '${val}'" >&2
+		printf "%s\n" "${val}"
+		exit 0
+	fi
+}
+
+# Manual override.
+get_revision LIPE_REVISION "${LIPE_REVISION:-}"
+
+# Release build on https://es-build.datadirectnet.com/
+get_revision ES_BUILD_LUSTRE_REVISION "${ES_BUILD_LUSTRE_REVISION:-}"
+
+# Reviews build from Gerrit.
+get_revision GERRIT_PATCHSET_REVISION "${GERRIT_PATCHSET_REVISION:-}"
+
+# Build from source.
+GIT_REVISION="$(git rev-parse HEAD 2> /dev/null)"
+get_revision 'git rev-parse HEAD' "${GIT_REVISION:-}"
+
+printf "None\n"
diff --git a/lipe.spec.in b/lipe.spec.in
index 69c9a4b..a18d53a 100644
--- a/lipe.spec.in
+++ b/lipe.spec.in
@@ -21,7 +21,7 @@ Prefix: %{_prefix}
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 
 Release: @LIPE_RELEASE@%{?dist}
-
+VCS: @LIPE_REVISION@
 Summary: lipe - Lustre Integrated Policy Engine
 License: All rights reserved DataDirect Networks Inc.
 Group: Applications/System
@@ -328,13 +328,13 @@ cp -a example_configs/hotpool/* $RPM_BUILD_ROOT%{_sysconfdir}/
 	install -m 0744 -D init.d/lipe_test_scheduler \
 		$RPM_BUILD_ROOT%{_sysconfdir}/rc.d/init.d/lipe_test_scheduler
 %endif
-install -g 0 -o 0 -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/
 %if %{with laudit}
-install -g 0 -o 0 -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/
-install -g 0 -o 0 -m 0644 man/laudit.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/
+install -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/laudit.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/
 %endif
 cp -a lhsm_tests $RPM_BUILD_ROOT%{_libdir}/
 
@@ -446,15 +446,15 @@ rm -rf $RPM_BUILD_ROOT
 
 
 %changelog
-* Thu Jun 30 2020 Gu Zheng <gzheng@ddn.com> 1.5
+* Tue Jun 30 2020 Gu Zheng <gzheng@ddn.com> [1.5]
 - Update version to 1.5
-* Thu Apr 3 2019 Gu Zheng <gzheng@ddn.com> 1.4
+* Wed Apr 03 2019 Gu Zheng <gzheng@ddn.com> [1.4]
 - Devide RPM by function
-* Thu Jan 3 2019 Gu Zheng <gzheng@ddn.com> 1.4
+* Thu Jan 03 2019 Gu Zheng <gzheng@ddn.com> [1.4]
 - Add pyltest RPM
-* Wed Mar 7 2018 Sebastien Buisson <sbuisson@ddn.com> 1.1
+* Wed Mar 07 2018 Sebastien Buisson <sbuisson@ddn.com> [1.1]
 - Add laudit and laudit-report
-* Mon Oct 11 2017 Li Xi <lixi@ddn.com> 1.0
+* Wed Oct 11 2017 Li Xi <lixi@ddn.com> [1.0]
 - Add license-server RPM
-* Mon Feb 27 2017 Qian Yingjin <qian@ddn.com> 0.1
+* Mon Feb 27 2017 Qian Yingjin <qian@ddn.com> [0.1]
 - Initial import
diff --git a/scripts/stratagem-hp-config.sh b/scripts/stratagem-hp-config.sh
index 8e1e963..67db1d4 100755
--- a/scripts/stratagem-hp-config.sh
+++ b/scripts/stratagem-hp-config.sh
@@ -1,39 +1,82 @@
 #!/bin/bash
-# Copyright DDN 2020
 # -*- indent-tabs-mode: nil -*-
+# Copyright DDN 2021
+#
+# Pacemaker Ticket:
+# $FS-hotpool-allocated (controlled by resource $FS-hotpool)
+#
+# Pacemaker Resources:
+# $FS-hotpool (controlls $FS-hotpool-allocated ticket)
+# cl-$FS-client (cloned client mount)
+# lamigo-$FS-MDTXXXX (for each MDT)
+# lpurge-$FS-OSTXXXX (for each OST in fast pool)
+#
+# Files Created
+# /etc/lamigo/$FS-MDTXXXX.conf (for each MDT)
+# /etc/lpurge/$FS/OSTXXXX.conf (for each OST in fast pool)
+# /etc/fstab updated with client mount line
+#
+# Lustre Changes
+# changelog user created for each MDT
 
 # CONFIGURATION VARIABLES
 
-# ASSUMES last configlog user is for lamigo, or creates one if one doesn't exist
-
-LAMIGO_MINAGE=600
-LPURGE_FREEHI=80
-LPURGE_FREELO=50
-FAST_POOL=ddn_ssd
-SLOW_POOL=ddn_hdd
+DEF_LAMIGO_MINAGE=600
+DEF_LPURGE_FREEHI=80
+DEF_LPURGE_FREELO=50
+DEF_FAST_POOL=ddn_ssd
+DEF_SLOW_POOL=ddn_hdd
 
 function usage()
 {
-    echo "Usage: $(basename $0) -m MINAGE -H FREEHI -L FREELO -f FASTPOOL -s SLOWPOOL [ FILESYSTEM ]"
-    exit 0
+    local rc=${1:-0}
+    local values=${2:-1}
+    echo "Usage: $(basename $0) [-h] [-n] [-m MINAGE -H FREEHI -L FREELO -f FASTPOOL -s SLOWPOOL] [ FILESYSTEM ]"
+    echo " Automatically configures Stratagem Hotpools in a existing HA environment"
+    echo "  -n - No auto-detect value"
+    echo "  -h - This help message"
+    echo "  -m - lamigo minimum age"
+    echo "  -H - lpurge free-hi percent"
+    echo "  -L - lpurge free-lo percent"
+    echo "  -f - Fast OST pool"
+    echo "  -s - Slow OST pool"
+    if (( values == 1 )); then
+        echo "Values:"
+        echo " Filesystem    : $FS"
+        echo " Fast OST Pool : $FAST_POOL"
+        echo " Slow OST Pool : $SLOW_POOL"
+        echo " LAmigo MinAge : $LAMIGO_MINAGE"
+        echo " LPurge FreeHI : ${LPURGE_FREEHI:-${DEF_LPURGE_FREEHI}}"
+        echo " LPurge FreeLO : ${LPURGE_FREELO:-${DEF_LPURGE_FREELO}}"
+    fi
+    exit $rc
 }
 
-while getopts "hm:H:L:f:s:" opt; do
+HELP=false
+AUTO=true
+
+while getopts "hm:H:L:f:s:n" opt; do
     case "$opt" in
-	m) LAMIGO_MINAGE=$OPTARG ;;
-	H) LPURGE_FREEHI=$OPTARG ;;
-	L) LPURGE_FREELO=$OPTARG ;;
-	f) FAST_POOL=$OPTARG ;;
-	s) SLOW_POOL=$OPTARG ;;
-	h) usage ;;
+        m) LAMIGO_MINAGE=$OPTARG ;;
+        H) LPURGE_FREEHI=$OPTARG ;;
+        L) LPURGE_FREELO=$OPTARG ;;
+        f) FAST_POOL=$OPTARG ;;
+        s) SLOW_POOL=$OPTARG ;;
+        n) AUTO=false ;;
+        h) HELP=true ;;
     esac
 done
 shift $((OPTIND-1))
 
+FS=$1
+
 # Location of /lustre/$FS/<SERVER> mounts and /lustre/$FS/client mount
 ROOT=/lustre
 
-FS=$1
+function locate_resource() {
+    local res=$1
+    clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true"
+}
 
 function is_valid_percent {
     local p="${1:-}"
@@ -45,62 +88,95 @@ function is_minage_ok {
     [[ "${p}" =~ ^[[:digit:]]+$ ]] && ((p >= 5))
 }
 
-if ! is_valid_percent "${LPURGE_FREEHI}"; then
-    echo "Invalid FREEHI percentage '${LPURGE_FREEHI}'"
-    exit 1
-fi
-
-if ! is_valid_percent "${LPURGE_FREELO}"; then
-    echo "Invalid FREELO percentage '${LPURGE_FREELO}'"
-    exit 1
-fi
-
-if ! ((LPURGE_FREELO < LPURGE_FREEHI)); then
-    echo "FREELO (${LPURGE_FREELO}) must be less than FREEHI (${LPURGE_FREEHI})"
-    exit 1
-fi
-
-if ! is_minage_ok "${LAMIGO_MINAGE}"; then
-    echo "MINAGE (${LAMIGO_MINAGE}) must be positive integer >= 5"
-    exit 1
-fi
-
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|sort -u)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
-        echo "Usage: $0 FSNAME"
-        exit 1
+        echo "Error: Unable to determine filesystem automatically"
+        usage 22 0
     fi
 fi
 
-# Script from here down
-function locate_resource() {
-    res=$1
-    clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true"
-}
+if $AUTO && [ ! -f /etc/lamigo/$FS-MDT0000.conf ]; then
+    echo "Disabling auto-detection of settings ($FS-MDT0000 config missing)"
+    AUTO=false
+fi
 
 MDSHOST=$(locate_resource mdt0000-$FS)
 if [ -z "$MDSHOST" ]; then
     # if FS is not an resident filesystem, mdt0 will not exist for it
     echo Failed to find mdt0 for filesystem: $FS
-    exit 1
+    exit 5
+fi
+
+# reuse of changelog user handled below
+# get lpurge options - this gets the first OST from the FAST pool
+if $AUTO; then
+    AUTO_FAST_POOL=$(awk -F = '/^src=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
+    AUTO_SLOW_POOL=$(awk -F = '/^tgt=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
+    AUTO_LAMIGO_MINAGE=$(awk -F = '/^min-age=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
 fi
 
+# Set value: if not explicitly set, set auto, if no auto, set default
+
+LAMIGO_MINAGE=${LAMIGO_MINAGE:-${AUTO_LAMIGO_MINAGE:-${DEF_LAMIGO_MINAGE}}}
+FAST_POOL=${FAST_POOL:-${AUTO_FAST_POOL:-${DEF_FAST_POOL}}}
+SLOW_POOL=${SLOW_POOL:-${AUTO_SLOW_POOL:-${DEF_SLOW_POOL}}}
+
 # List of %04x formated OST indexes
-FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -e 's/.*OST\(....\).*/\1/p;d')
+FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p')
 if [ -z "$FAST_OSTLIST" ]; then
     echo "Failed to find OSTs in Fast pool ($FS.$FAST_POOL)"
-    exit 1
+    usage 2
 fi
 
-SLOW_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$SLOW_POOL|sed -e 's/.*OST\(....\).*/\1/p;d')
+SLOW_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$SLOW_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p')
 if [ -z "$SLOW_OSTLIST" ]; then
     echo "Failed to find OSTs in Slow pool ($FS.$SLOW_POOL)"
-    exit 1
+    usage 2
+fi
+
+if $AUTO; then
+    for OST in $FAST_OSTLIST; do
+	F=/etc/lpurge/$FS/OST${OST}.conf
+	if [ -f $F ]; then
+	    AUTO_LPURGE_FREEHI=$(awk -F = '/^freehi=/{ print $2 }' $F)
+	    AUTO_LPURGE_FREELO=$(awk -F = '/^freelo=/{ print $2 }' $F)
+	    break
+	fi
+    done
+fi
+
+LPURGE_FREEHI=${LPURGE_FREEHI:-${AUTO_LPURGE_FREEHI:-${DEF_LPURGE_FREEHI}}}
+LPURGE_FREELO=${LPURGE_FREELO:-${AUTO_LPURGE_FREELO:-${DEF_LPURGE_FREELO}}}
+
+if ! is_valid_percent "${LPURGE_FREEHI}"; then
+    echo "Invalid FREEHI percentage '${LPURGE_FREEHI}'"
+    usage 34
+fi
+
+if ! is_valid_percent "${LPURGE_FREELO}"; then
+    echo "Invalid FREELO percentage '${LPURGE_FREELO}'"
+    usage 34
 fi
 
+if ! ((LPURGE_FREELO < LPURGE_FREEHI)); then
+    echo "FREELO (${LPURGE_FREELO}) must be less than FREEHI (${LPURGE_FREEHI})"
+    usage 34
+fi
+
+if ! is_minage_ok "${LAMIGO_MINAGE}"; then
+    echo "MINAGE (${LAMIGO_MINAGE}) must be positive integer >= 5"
+    usage 34
+fi
+
+if $HELP; then
+    usage 0
+fi
+
+echo "Using: $0 -m ${LAMIGO_MINAGE} -H ${LPURGE_FREEHI} -L ${LPURGE_FREELO} -f ${FAST_POOL} -s ${SLOW_POOL} $FS"
+
 OSSLIST=$(es_config_get --option fs_settings.$FS.oss_list)
 
 # Die on errors
@@ -108,32 +184,44 @@ set -e
 
 MOUNTSPEC=$(/opt/ddn/es/tools/mount_lustre_client --dry-run --fs $FS |awk '{ print $4 }')
 
-echo "Creating config directories"
-clush -a mkdir -p /etc/lpurge/$FS/ /etc/lamigo/ $ROOT/$FS/client
+# Create client mount if it doesn't exist
+if ! crm_resource -QW -r cl-$FS-client > /dev/null 2>&1; then
 
-echo "Creating lustre client mounts for $FS"
-clush -a "grep -q $FS/client /etc/fstab || (echo $MOUNTSPEC $ROOT/$FS/client lustre x-systemd.mount-timeout=20m,noauto,_netdev >> /etc/fstab)"
-clush -a mkdir -p /etc/systemd/system/lustre-$FS-client.mount.d/
-cat << EOF > /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
+    echo "Creating config directories"
+    clush -a mkdir -p /etc/lpurge/$FS/ /etc/lamigo/ $ROOT/$FS/client
+
+    echo "Creating lustre client mounts for $FS"
+    clush -a "grep -q $FS/client /etc/fstab || (echo $MOUNTSPEC $ROOT/$FS/client lustre x-systemd.mount-timeout=20m,noauto,_netdev >> /etc/fstab)"
+    clush -a mkdir -p /etc/systemd/system/lustre-$FS-client.mount.d/
+    cat << EOF > /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
 [Mount]
 TimeoutSec=20m
 EOF
-clush -ca /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
-clush -a systemctl daemon-reload
-echo "Creating lustre client resource agent"
-clush -g ha_heads crm configure <<EOF
-primitive $FS-client systemd:lustre-$FS-client.mount op start timeout=900s op monitor interval=60s meta target-role=Stopped
+    clush -ca /etc/systemd/system/lustre-$FS-client.mount.d/timeout.conf
+    clush -a systemctl daemon-reload
+    echo "Creating lustre client resource agent"
+    clush -g ha_heads crm configure <<EOF
+primitive $FS-client systemd:lustre-$FS-client.mount meta target-role=Stopped op start timeout=900s op monitor interval=60s op stop timeout=900s
 clone cl-$FS-client $FS-client
 rsc_ticket ticket-$FS-allocated-client $FS-allocated: cl-$FS-client loss-policy=stop
 EOF
 
-# create order constraints on all lustre-server template types (mdt or ost)
-clush -g ha_heads "cibadmin -e --query --xpath '//template[@type=\"lustre-server\"]'|awk -F \"'\" '/lustre-$FS-/{ print \$2 }'|awk -F - '{ print \"order $FS-client-after-\"\$3\" 0: lustre-$FS-\"\$3\":start cl-$FS-client:start\" }'|crm configure"
+    # create order constraint for client on all lustre-server template types (mdt or ost)
+    clush -g ha_heads "cibadmin -e --query --xpath '//template[@type=\"lustre-server\"]'|awk -F \"'\" '/lustre-$FS-/{ print \$2 }'|awk -F - '{ print \"order $FS-client-after-\"\$3\" 0: lustre-$FS-\"\$3\":start cl-$FS-client:start\" }'|crm configure"
 
-ssh $MDSHOST crm configure <<EOF
+    ssh $MDSHOST crm configure <<EOF
 order $FS-client-after-mgs 0: mgs:start cl-$FS-client:start
 EOF
 
+fi # ! crm_resource client
+
+# ticket
+echo "Creating Hotpool ticket for $FS"
+clush -g ha_heads crm configure <<EOF
+primitive $FS-hotpool ocf:ddn:Ticketer params name=$FS-hotpool-allocated meta allow-migrate=true priority=199 target-role=Stopped op start interval=0 timeout=30 op monitor interval=30 timeout=30 op stop interval=0 timeout=30
+order $FS-hotpool-after-cl-$FS-client 0: cl-$FS-client $FS-hotpool
+EOF
+
 echo "Configuring OFD access logs"
 MGSHOST=$(locate_resource mgs)
 ssh $MGSHOST lctl set_param -P "obdfilter.${FS}-*.access_log_size=1048576"
@@ -143,7 +231,7 @@ echo "Setting up lpurge (HI:$LPURGE_FREEHI to LO:$LPURGE_FREELO)"
 # this results in MDTLIST being a list of MDT indexes in format "%04d"
 # This format is problematic for values greater than 0007, since by
 # default bash will interpret it as octal
-MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed "s/mdt\(.*\)-$FS$/\1/p;d")
+MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed -ne "s/mdt\(.*\)-$FS$/\1/p")
 
 for OST in $FAST_OSTLIST; do
     INDEX=$(printf "%04d" 0x$OST)
@@ -169,11 +257,11 @@ EOF
     clush -ca /etc/lpurge/$FS/OST$OST.conf
 
     echo Creating lpurge resource for $FS-OST$OST
-    clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2&>1 && crm configure << EOF || true
+    clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
 primitive lpurge-$FS-OST$OST systemd:lpurge@$FS-OST$OST.service op monitor interval=30s
 colocation lpurge-$FS-$OST-with-ost inf: lpurge-$FS-OST$OST ost$INDEX-$FS
 order lpurge-$FS-$OST-after-ost ost$INDEX-$FS lpurge-$FS-OST$OST
-order lpurge-$FS-$OST-after-client cl-$FS-client lpurge-$FS-OST$OST
+rsc_ticket ticket-$FS-hotpool-allocated-lpurge-$FS-OST$OST $FS-hotpool-allocated: lpurge-$FS-OST$OST loss-policy=stop
 EOF"
 done
 
@@ -183,10 +271,21 @@ echo "Setting up lamigo (From $FAST_POOL to $SLOW_POOL aged at least $LAMIGO_MIN
 for INDEX in $MDTLIST; do
     MDT=$(printf "%04x" $((10#$INDEX)) )
     MDSHOST=$(locate_resource mdt$INDEX-$FS)
-    CL=$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|tail -1|awk '/^cl/{ print $1 }')
+    CL=""
+
+    # reuse existing changelog user
+    if [ -f /etc/lamigo/$FS-MDT$MDT.conf ]; then
+        CL=$(awk -F = '/^user=/{ print $2 }' /etc/lamigo/$FS-MDT$MDT.conf)
+        # ensure changelog user exists
+        if [ -z "$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|grep $CL[[:space:]])" ]; then
+            echo "Previous changelog user for $FS-MDT$MDT: $CL is missing"
+            CL=""
+        fi
+    fi
+    # create new user if none was previously configured
     if [ -z "$CL" ]; then
-        ssh $MDSHOST lctl --device $FS-MDT$MDT changelog_register
-        CL=$(ssh $MDSHOST lctl get_param -n mdd.$FS-MDT$MDT.changelog_users|tail -1|awk '/^cl/{ print $1 }')
+        CL=$(ssh $MDSHOST lctl --device $FS-MDT$MDT changelog_register | awk -F \' '{print $2 }')
+        echo "Registered new changelog user for $FS-MDT$MDT: $CL"
     fi
 
     echo Creating lamigo config for $FS-MDT$MDT
@@ -199,7 +298,7 @@ src=$FAST_POOL
 tgt=$SLOW_POOL
 EOF
     for HOST in $OSSLIST; do
-	echo oss=$HOST >> /etc/lamigo/$FS-MDT$MDT.conf
+        echo oss=$HOST >> /etc/lamigo/$FS-MDT$MDT.conf
     done
     for HOST in $(cluset -e @all); do
         echo agent=$HOST:$ROOT/$FS/client:4 >> /etc/lamigo/$FS-MDT$MDT.conf
@@ -214,15 +313,13 @@ EOF
     clush -ca /etc/lamigo/$FS-MDT$MDT.conf /etc/systemd/system/lamigo@$FS-MDT$MDT.service.d/override.conf
 
     echo Creating lamigo resource for $FS-MDT$MDT
-    clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2&>1 && crm configure << EOF || true
+    clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
 primitive lamigo-$FS-MDT$MDT systemd:lamigo@$FS-MDT$MDT.service op start timeout=15m op monitor interval=30s
 colocation lamigo-$FS-MDT$MDT-with-mdt inf: lamigo-$FS-MDT$MDT mdt$INDEX-$FS
 order lamigo-$FS-$MDT-after-mdt mdt$INDEX-$FS lamigo-$FS-MDT$MDT
-order lamigo-$FS-$MDT-after-client cl-$FS-client lamigo-$FS-MDT$MDT
+rsc_ticket ticket-$FS-hotpool-allocated-lamigo-$FS-MDT$MDT $FS-hotpool-allocated: lamigo-$FS-MDT$MDT loss-policy=stop
 EOF"
 done
 
 echo "Configuration complete.  Run the following command to start services:"
-echo "  clush -qS --group=ha_heads crm resource start cl-$FS-client"
-echo "or"
 echo "  stratagem-hp-start.sh $FS"
diff --git a/scripts/stratagem-hp-convert.sh b/scripts/stratagem-hp-convert.sh
new file mode 100755
index 0000000..2e342a9
--- /dev/null
+++ b/scripts/stratagem-hp-convert.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# -*- indent-tabs-mode: nil -*-
+# Copyright DDN 2021
+
+FS=$1
+
+function usage() {
+    echo "Usage: $(basename $0) [-h] [FILESYSTEM]"
+    echo "  Convert Stratagem Hotpools HA environment to be ticket based"
+}
+
+if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
+function locate_resource() {
+    local res=$1
+    clush -N --group=ha_heads "crm_resource -QW -r $res 2> /dev/null || true"
+}
+
+if [ -z "$FS" ]; then
+    # Try automatic detection
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
+
+    if [ $(echo "$FS" |wc -w) -ne 1 ]; then
+        echo "Error: Unable to determine filesystem automatically, please specify"
+        usage
+        exit 1
+    fi
+fi
+
+if crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    echo "Hotpools for $FS already using ticket"
+    exit 0
+fi
+
+MDSHOST=$(locate_resource mdt0000-$FS)
+MDTLIST=$(clush -qN -g ha_heads crm_resource --list-raw|sed -ne "s/mdt\(.*\)-$FS$/\1/p")
+FAST_POOL=$(awk -F = '/^src=/{ print $2 }' /etc/lamigo/$FS-MDT0000.conf)
+FAST_OSTLIST=$(ssh $MDSHOST lctl pool_list $FS.$FAST_POOL|sed -ne 's/.*-OST\(....\)_UUID/\1/p')
+
+ROLE=Started
+if crm_resource -r $FS-client -W |& grep -q NOT; then
+    ROLE=Stopped
+fi
+
+echo "Creating hotpool ticket for $FS"
+clush -g ha_heads crm configure <<EOF
+primitive $FS-hotpool ocf:ddn:Ticketer params name=$FS-hotpool-allocated meta allow-migrate=true priority=199 target-role=$ROLE op start interval=0 timeout=30 op monitor interval=30 timeout=30 op stop interval=0 timeout=30
+order $FS-hotpool-after-cl-$FS-client 0: cl-$FS-client $FS-hotpool
+EOF
+
+for OST in $FAST_OSTLIST; do
+    INDEX=$(printf "%04d" 0x$OST)
+    echo Updating lpurge config for $FS-OST$OST
+
+    echo "Updating lpurge HA for $FS-OST$OST"
+
+    clush -qS --group=ha_heads "crm_resource -QW -r ost$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
+delete lpurge-$FS-$OST-after-client
+rsc_ticket ticket-$FS-hotpool-allocated-lpurge-$FS-OST$OST $FS-hotpool-allocated: lpurge-$FS-OST$OST loss-policy=stop
+EOF"
+done
+
+# MDTLIST is derived from resources so it's already in decimal but formatted %04d
+for INDEX in $MDTLIST; do
+    MDT=$(printf "%04x" $((10#$INDEX)) )
+
+    echo "Updating lamigo HA for $FS-MDT$MDT"
+
+    clush -qS --group=ha_heads "crm_resource -QW -r mdt$INDEX-$FS >/dev/null 2>&1 && crm configure << EOF || true
+delete lamigo-$FS-$MDT-after-client
+rsc_ticket ticket-$FS-hotpool-allocated-lamigo-$FS-MDT$MDT $FS-hotpool-allocated: lamigo-$FS-MDT$MDT loss-policy=stop
+EOF"
+done
diff --git a/scripts/stratagem-hp-start.sh b/scripts/stratagem-hp-start.sh
index 0c996f0..214262d 100755
--- a/scripts/stratagem-hp-start.sh
+++ b/scripts/stratagem-hp-start.sh
@@ -4,15 +4,37 @@
 
 FS=$1
 
+function usage() {
+    echo "Usage: $(basename $0) [-h] [FILESYSTEM]"
+    echo "  Start Stratagem Hotpools HA environment"
+}
+
+if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
-        echo "Usage: $0 FSNAME"
+	echo "Error: Could not automatically find filesystem, please specify"
+        usage
         exit 1
     fi
 fi
 
+if ! crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    if clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then
+	echo "Hotpools needs conversion"
+	echo "  Please run: stratagem-hp-convert.sh"
+    else
+	echo "Hotpools not configured"
+	echo "  Please run: stratagem-hp-config.sh"
+    fi
+    exit 1
+fi
+
 echo Starting Hotpools for $FS
-clush -qS --group=ha_heads crm res start cl-$FS-client
+clush -qS --group=ha_heads crm res start cl-$FS-client $FS-hotpool
diff --git a/scripts/stratagem-hp-stop.sh b/scripts/stratagem-hp-stop.sh
index 1f2c7bd..dba265b 100755
--- a/scripts/stratagem-hp-stop.sh
+++ b/scripts/stratagem-hp-stop.sh
@@ -4,15 +4,37 @@
 
 FS=$1
 
+function usage() {
+    echo "Usage: $(basename $0) [-h] [FILESYSTEM]"
+    echo "  Stop stratagem hotpools in HA environment"
+}
+
+if [ "$FS" = "-h" ] || [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
-        echo "Usage: $0 FSNAME"
+	echo "Error: Could not automatically find filesystem, please specify"
+        usage
         exit 1
     fi
 fi
 
-echo Stopping Hotpools for $FS
-clush -qS --group=ha_heads crm res stop cl-$FS-client
+if crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    echo Stopping Hotpools for $FS
+    clush -qS --group=ha_heads crm res stop $FS-hotpool
+
+elif clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then
+    echo "Stopping old style Hotpools (see stratagem-hp-convert.sh)"
+    clush -qS -g ha_heads crm res stop cl-$FS-client
+
+else
+    echo "Hotpools not configured"
+    echo "  Please run: stratagem-hp-config.sh"
+    exit 1
+fi
diff --git a/scripts/stratagem-hp-teardown-client.sh b/scripts/stratagem-hp-teardown-client.sh
new file mode 100755
index 0000000..b579878
--- /dev/null
+++ b/scripts/stratagem-hp-teardown-client.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# -*- indent-tabs-mode: nil -*-
+# Copyright DDN 2021
+#
+# Stops and remove cloned client mount
+# Removes:
+# resources $FS-mount (the cloned mount)
+# client mount from /etc/fstab
+
+FS=$1
+
+# Set a 10+ minute timeout for waitfor()
+TIMEOUT=${TIMEOUT:-600}
+
+function usage() {
+    echo "Usage: $(basename $0) [-h] [-t SECS] [FILESYSTEM]"
+    echo "  Tear down server client mount HA environment"
+    echo "   -t SECS - Max timeout to wait for services to stop (default: $TIMEOUT)"
+}
+
+while getopts "ht:" opt; do
+    case $opt in
+        h) usage; exit 0;;
+        t) TIMEOUT=$OPTARG;;
+    esac
+done
+
+if [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
+if [ -z "$FS" ]; then
+    # Try automatic detection
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
+
+    if [ $(echo "$FS" |wc -w) -ne 1 ]; then
+        echo "Usage: $0 FSNAME"
+        exit 1
+    fi
+fi
+
+waitfor () {
+    local ids=$*
+
+    [ -z "$ids" ] && return
+
+    echo -n "Waiting for $ids"
+
+    local deadline=$((SECONDS+TIMEOUT))
+
+    local done=false
+    until $done || ((deadline < SECONDS)); do
+
+        done=true
+        for i in $ids; do
+            # This finds the host the resource is active on (empty it is stopped)
+            res=$(clush -qg ha_heads crm_resource -QW -r $i 2> /dev/null)
+            if [ -n "$res" ]; then
+                done=false
+                echo -n "."
+                sleep 1
+                break
+            fi
+        done
+    done
+
+    echo ""
+
+    if ! $done; then
+        echo "Waiting for $ids TIMED OUT!"
+        exit 1
+    fi
+}
+
+res=$(clush -qNg ha_heads crm_resource -l 2> /dev/null|grep lamigo-$FS-MDT0000)
+if [ -n "$res" ]; then
+    echo "ERROR: Please tear down Hotpools first"
+    exit 1
+fi
+
+echo Stopping Client
+clush -qS --group=ha_heads crm res stop cl-$FS-client
+
+waitfor cl-$FS-client
+
+clush -qS --group=ha_heads crm config del $FS-client
+
+echo "Removing client mount from fstab"
+clush -a "sed -i '/\/$FS\/client.*lustre/d' /etc/fstab"
diff --git a/scripts/stratagem-hp-teardown.sh b/scripts/stratagem-hp-teardown.sh
index bc7b489..fdff7b7 100755
--- a/scripts/stratagem-hp-teardown.sh
+++ b/scripts/stratagem-hp-teardown.sh
@@ -1,15 +1,44 @@
 #!/bin/bash
 # -*- indent-tabs-mode: nil -*-
 # Copyright DDN 2020
+#
+# After stopping hotpools (via ticket)
+# Removes:
+# resources lamigo-$FS-*
+# resources lpurge-$FS-*
+# resource $FS-hotpool
+# ticket $FS-hotpool-allocated
+#
+# Leaves:
+# all lamigo & lpurge config files
+# client mount
 
 FS=$1
 
-# Set a 10+ minute timeout for waitfor()
+# Set a default 10+ minute timeout for waitfor()
 TIMEOUT=${TIMEOUT:-600}
 
+function usage() {
+    echo "Usage: $(basename $0) [-h] [-t SECS] [FILESYSTEM]"
+    echo "  Tear down Stratagem Hotpools HA environment"
+    echo "   -t SECS - Max timeout to wait for services to stop (default: $TIMEOUT)"
+}
+
+while getopts "ht:" opt; do
+    case $opt in
+        h) usage; exit 0;;
+        t) TIMEOUT=$OPTARG;;
+    esac
+done
+
+if [ "$FS" = "--help" ]; then
+    usage
+    exit 0
+fi
+
 if [ -z "$FS" ]; then
     # Try automatic detection
-    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$)
+    FS=$(crm_ticket -l|cut -f 1 -d -|grep -v ^lustre$|uniq)
 
     if [ $(echo "$FS" |wc -w) -ne 1 ]; then
         echo "Usage: $0 FSNAME"
@@ -18,42 +47,54 @@ if [ -z "$FS" ]; then
 fi
 
 waitfor () {
-    IDS=$*
+    local ids=$*
 
-    [ -z "$IDS" ] && return
+    [ -z "$ids" ] && return
 
-    echo -n "Waiting for $IDS"
+    echo -n "Waiting for $ids"
 
-    local timeout=$TIMEOUT
+    local deadline=$((SECONDS+TIMEOUT))
 
-    local NOTDONE=true
-    while $NOTDONE && ((timeout > 0)); do
+    local done=false
+    until $done || ((deadline < SECONDS)); do
 
-        NOTDONE=false
-        for i in $IDS; do
+        done=true
+        for i in $ids; do
             # This finds the host the resource is active on (empty it is stopped)
             res=$(clush -qg ha_heads crm_resource -QW -r $i 2> /dev/null)
             if [ -n "$res" ]; then
-                NOTDONE=true
+                done=false
                 echo -n "."
                 sleep 1
                 break
             fi
         done
-
-        let timeout-=1
     done
 
     echo ""
 
-    if ((timeout == 0)); then
-        echo "Waiting for $IDS TIMED OUT!"
+    if ! $done; then
+        echo "Waiting for $ids TIMED OUT!"
         exit 1
     fi
 }
 
+if ! crm_resource -QW -r $FS-hotpool > /dev/null 2>&1; then
+    if clush -Ng ha_heads crm_resource -l|egrep -q "^(lamigo|lpurge)-$FS-"; then
+	echo "Hotpools needs conversion"
+	echo "  Please run: stratagem-hp-convert.sh"
+        exit 1
+    else
+	echo "Hotpools not configured"
+        exit 0
+    fi
+fi
+
+echo "Stopping Hotpools for $FS"
+clush -qS --group=ha_heads crm res stop $FS-hotpool
+
 for host in $(cluset -e @ha_heads); do
-    IDS=$(ssh $host crm_resource --list-raw|egrep "(lamigo|lpurge)-$FS")
+    IDS=$(ssh $host crm_resource --list-raw|egrep "^(lamigo|lpurge)-$FS-")
     if [ -n "$IDS" ]; then
         echo Stopping lamigo and lpurge on $host
         ssh $host crm res stop $IDS
@@ -64,13 +105,10 @@ for host in $(cluset -e @ha_heads); do
     fi
 done
 
+echo "Removing Hotpool ticket for $FS"
+clush -qS --group=ha_heads crm config del $FS-hotpool
 
-echo Stopping Client
-clush -qS --group=ha_heads crm res stop cl-$FS-client
-
-waitfor cl-$FS-client
-
-clush -qS --group=ha_heads crm config del $FS-client
+clush -qS --group=ha_heads crm_ticket --ticket $FS-hotpool-allocated --cleanup
 
-echo "Removing client mount from fstab"
-clush -a "sed -i '/\/$FS\/client.*lustre/d' /etc/fstab"
+echo "Local client is still configured"
+echo "  to remove run: stratagem-hp-teardown-client.sh"
diff --git a/src/Makefile.am b/src/Makefile.am
index 901c554..792c8ff 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -26,6 +26,8 @@ LIPE_SOURCES = cmd.c cmd.h debug.c debug.h fake_lustre_disk.h \
 		lustre_ea.c lustre_ea.h \
 		lipe_ldiskfs.c lipe_ldiskfs.h \
 		lipe_object_attrs.h \
+		lipe_version.c \
+		lipe_version.h \
 		list.h policy.h policy.c \
 		ldiskfs_read_ldd.c ldiskfs_read_ldd.h \
 		general_policy.h general_policy.c \
@@ -67,9 +69,9 @@ lamigo_SOURCES = lamigo.c lamigo.h lamigo_alr.c lamigo_hash.c lamigo_hash.h \
 lamigo_CFLAGS = $(LIPE_CFLAGS)
 lamigo_LDFLAGS = $(LIPE_LDFLAGS) -lssh -lssh_threads
 
-lpurge_SOURCES = lpurge.c $(LIPE_SOURCES) $(lipe_ssh_SOURCES)
+lpurge_SOURCES = lpurge.c $(LIPE_SOURCES)
 lpurge_CFLAGS = $(LIPE_CFLAGS)
-lpurge_LDFLAGS = $(LIPE_LDFLAGS) -lssh -lssh_threads
+lpurge_LDFLAGS = $(LIPE_LDFLAGS)
 
 lipe_expression_test_SOURCES = lipe_expression_test.c $(LIPE_SOURCES)
 lipe_expression_test_CFLAGS = $(LIPE_CFLAGS)
diff --git a/src/lamigo.c b/src/lamigo.c
index 68b5763..323a5aa 100644
--- a/src/lamigo.c
+++ b/src/lamigo.c
@@ -21,6 +21,7 @@
  *  - auto-config with lfs df -v to find flash OSTs
  *  - tests
  */
+#include <stdarg.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <errno.h>
@@ -60,6 +61,7 @@
 #include "general_policy.h"
 #include "list.h"
 #include "lipe_config.h"
+#include "lipe_version.h"
 
 #define DEF_POOL_REFRESH_INTV	(10 * 60)
 #define DEF_PROGRESS_INTV	(10 * 60)
@@ -130,7 +132,8 @@ static void usage(void)
 	       "\t--hot-fraction, files with heat >= <hottest*fraction/100> are hot (default: %d)\n"
 	       "\t--hot-after-idle, hot files after N idle periods to be mirrored (default: %d)\n"
 	       "\t--src-free, mirroring to source OST pool is prohibited if pool has less space (default: %d%%)\n"
-	       "\t--tgt-free, mirroring to target OST pool is prohibited if pool has less space (default: %d%%)\n",
+	       "\t--tgt-free, mirroring to target OST pool is prohibited if pool has less space (default: %d%%)\n"
+	       "\t--version, print version information and exit\n",
 	       program_invocation_short_name,
 	       DEF_MIN_AGE,
 	       DEF_CACHE_SIZE >> 20,
@@ -264,6 +267,8 @@ struct mirror_opts {
 };
 
 __u64 lamigo_last_processed_idx; /* changelog index */
+static time_t lamigo_last_cleared;
+static __u64 lamigo_last_cleared_index;
 struct lipe_list_head lamigo_agent_list;
 static int lamigo_agent_count;
 int lamigo_max_jobs; /* max jobs for all agents */
@@ -288,6 +293,17 @@ struct pool_list *src_pools; /* source pools */
 struct pool_list *tgt_pools; /* target pool */
 static void *lamigo_refresh_statfs_thread(void *arg);
 
+inline int are_agents_busy(void)
+{
+	if (opt.o_iml_re_socket)
+		return 0;
+
+	if (lamigo_jobs_running >= lamigo_max_jobs)
+		return 1;
+
+	return 0;
+}
+
 /* Convert human readable size string to and int; "1k" -> 1000 */
 static int strsize2int(long *sizep, char *str)
 {
@@ -324,6 +340,24 @@ static int strsize2int(long *sizep, char *str)
 	}
 }
 
+static void systemf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
+static void systemf(const char *fmt, ...)
+{
+	char *cmd = NULL;
+	va_list args;
+	int rc;
+
+	va_start(args, fmt);
+	rc = vasprintf(&cmd, fmt, args);
+	va_end(args);
+
+	if (rc < 0)
+		return;
+
+	system(cmd);
+	free(cmd);
+}
+
 static struct lipe_flist *lamigo_flist_init(char *socket)
 {
 	struct lipe_flist *flist = flist_alloc(NULL, 1024,
@@ -344,6 +378,8 @@ void lamigo_usr1_handle(int sig)
 	struct resync_agent *a;
 	struct resync_job *j;
 	struct pool_list *pl;
+	struct tm *tmtime;
+	char timestr[32];
 	FILE *f;
 	int i;
 
@@ -355,11 +391,15 @@ void lamigo_usr1_handle(int sig)
 		llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "can't open dump file\n");
 		return;
 	}
-	fprintf(f, "config:\n"
+	fprintf(f,
+		"version: %s-%s\n"
+		"revision: %s\n"
+		"config:\n"
 		"    chlg_user: %s\n"
 		"    mdtname: %s\n"
 		"    mountpoint: %s\n"
 		"    source_pool: ",
+		PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION,
 		opt.o_chlg_user, opt.o_mdtname, opt.o_mntpt);
 	i = 0;
 	for (pl = src_pools; pl != NULL; pl = pl->pl_next, i++)
@@ -429,6 +469,16 @@ void lamigo_usr1_handle(int sig)
 		stats.s_skip_hot, stats.s_replicate_ro2hot,
 		stats.s_replicate_rw2hot, stats.s_replicate_rw2cold);
 
+	tmtime = localtime(&lamigo_last_cleared);
+	strftime(timestr, sizeof(timestr), "%c", tmtime);
+
+	fprintf(f, "changelog:\n"
+		"    last_processed_idx: %llu\n"
+		"    last_cleared_idx: %llu\n"
+		"    last_cleared_time: %lu # ( %s )\n",
+		lamigo_last_processed_idx, lamigo_last_cleared_index,
+		lamigo_last_cleared, timestr);
+
 #define AGENT_FMT \
 	"    agent%d: { host: %s, mnt: %s, jobs: %u, max: %u, state: %s }\n"
 
@@ -510,7 +560,6 @@ static void lamigo_init_vars(void)
 
 	signal(SIGUSR1, lamigo_null_handler);
 	signal(SIGUSR2, lamigo_null_handler);
-	signal(SIGCHLD, lamigo_null_handler);
 
 	LIPE_INIT_LIST_HEAD(&head.lh_list);
 	LIPE_INIT_LIST_HEAD(&lamigo_job_list);
@@ -534,7 +583,7 @@ static void lamigo_cleanup(void)
 		lipe_list_for_each_entry_safe(rss, tmp,
 					      &agent->rag_ssh_list,
 					      rss_list) {
-			lipe_cleanup_ssh_session(&rss->rss_ctx);
+			lipe_ssh_context_destroy(&rss->rss_ctx);
 			lipe_list_del(&rss->rss_list);
 			free(rss);
 		}
@@ -586,19 +635,12 @@ static int lamigo_exec_cmd(struct resync_agent *a, char *cmd)
 	lipe_list_del(&rss->rss_list);
 	pthread_mutex_unlock(&a->rag_ssh_lock);
 
-	if (lipe_ssh_status(&rss->rss_ctx) != SSH_OK) {
-		rc = lipe_init_ssh_session(&rss->rss_ctx, a->rag_hostname);
-		if (rc != SSH_OK)
-			goto out;
-	}
-
 	rc = lipe_ssh_exec(&rss->rss_ctx, cmd);
 	if (rc)
-		llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
+		llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
 			"error executing ssh command '%s' on '%s': rc = %d\n",
 			cmd, a->rag_hostname, rc);
 
-out:
 	pthread_mutex_lock(&a->rag_ssh_lock);
 	lipe_list_add(&rss->rss_list, &a->rag_ssh_list);
 	pthread_cond_signal(&a->rag_ssh_cond);
@@ -617,8 +659,8 @@ void *lamigo_replicate_one(void *args)
 
 	if (rj->rj_setprefer) {
 		snprintf(cmd, sizeof(cmd),
-			 "lfs setstripe --comp-set --comp-flags=prefer -p %s "
-			 "%s/.lustre/fid/"DFID" >&/dev/null", rj->rj_pool,
+			 "lfs setstripe --comp-set --comp-flags=prefer --pool='%s' "
+			 "'%s/.lustre/fid/"DFID"' >&/dev/null", rj->rj_pool,
 			 agent->rag_mountpoint,
 			 PFID(&rj->rj_fid));
 		llapi_printf(LLAPI_MSG_DEBUG, "set prefer on "DFID"\n",
@@ -627,20 +669,20 @@ void *lamigo_replicate_one(void *args)
 		int i;
 
 		i = snprintf(cmd, sizeof(cmd),
-			     "%s --pool=%s",
+			     "%s --pool='%s'",
 			     opt.o_mirror_cmd, rj->rj_pool);
 		if (rj->rj_stripes > 0)
 			i += snprintf(cmd + i, sizeof(cmd) - i,
-				      " -c %d", rj->rj_stripes);
+				      " --stripe-count=%d", rj->rj_stripes);
 		if (rj->rj_mirror_opts)
 			i += snprintf(cmd + i, sizeof(cmd) - i,
-				      " --flags=%s", rj->rj_mirror_opts);
+				      " --flags='%s'", rj->rj_mirror_opts);
 		i += snprintf(cmd + i, sizeof(cmd) - i,
-			      " %s/.lustre/fid/"DFID" > /dev/null 2>&1",
+			      " '%s/.lustre/fid/"DFID"' > /dev/null 2>&1",
 			      agent->rag_mountpoint, PFID(&rj->rj_fid));
 	} else if (resync == AMIGO_RESYNC_RESYNC) {
 		snprintf(cmd, sizeof(cmd),
-			 "lfs mirror resync %s/.lustre/fid/"DFID" > /dev/null 2>&1",
+			 "lfs mirror resync '%s/.lustre/fid/"DFID"' > /dev/null 2>&1",
 			 agent->rag_mountpoint,
 			 PFID(&rj->rj_fid));
 	} else {
@@ -1057,7 +1099,7 @@ static int lamigo_get_attrs(const struct lu_fid *fid,
 
 	/* the subsequent steps to fetch EAs are very expensive
 	 * skip if possible */
-	if (need_attr == 0 && 0)
+	if (need_attr == 0)
 		return 0;
 
 	/* XXX: would be great to get all EAs with a single call */
@@ -1180,7 +1222,7 @@ void *lamigo_check_agent_func(void *args)
 	char cmd[PATH_MAX];
 	struct resync_agent *a = (struct resync_agent *)args;
 
-	snprintf(cmd, sizeof(cmd), "lfs path2fid %s > /dev/null 2>&1",
+	snprintf(cmd, sizeof(cmd), "lfs path2fid '%s' > /dev/null 2>&1",
 		 a->rag_mountpoint);
 
 	rc = lamigo_exec_cmd(a, cmd);
@@ -1296,6 +1338,12 @@ static int lamigo_update_one(struct fid_rec *f)
 		return 0;
 	}
 
+	if (are_agents_busy()) {
+		/* all the agents are busy */
+	        llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "no agents avilable (max: %d)\n", lamigo_max_jobs);
+		return 1;
+	}
+
 	/* prevent hot file migration from hot pool to slow */
 	rc = lamigo_alr_check_is_hot(&f->fr_fh.fh_fid, &ah);
 	if (rc) {
@@ -1331,12 +1379,6 @@ static int lamigo_update_one(struct fid_rec *f)
 		return 0;
 	}
 
-	if (lamigo_jobs_running >= lamigo_max_jobs) {
-		/* all the agents are busy */
-	        llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "no agents avilable (max: %d)\n", lamigo_max_jobs);
-		return 1;
-	}
-
 	rj = calloc(1, sizeof(struct resync_job));
 	if (rj == NULL) {
 		llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "can't allocate for a job\n");
@@ -1390,7 +1432,7 @@ static int lamigo_check_sync(void)
 	int count;
 
 	/* try to resubmit failed jobs first */
-	while (!lipe_list_empty(&lamigo_failed_job_list)) {
+	while (!lipe_list_empty(&lamigo_failed_job_list) && !are_agents_busy()) {
 		struct resync_job *rj;
 
 		rj = lipe_list_entry(lamigo_failed_job_list.next,
@@ -1528,9 +1570,6 @@ static unsigned long get_fid_cache_size(int pct)
 	return cache_size;
 }
 
-static time_t lamigo_last_cleared;
-static __u64 lamigo_last_cleared_index;
-
 static void lamigo_check_and_clear_changelog(void)
 {
 	struct resync_job *tmp;
@@ -1542,10 +1581,9 @@ static void lamigo_check_and_clear_changelog(void)
 	index = lamigo_last_processed_idx;
 
 	/* this checks prevents too frequent in-memory-search for
-	 * the minimal record-in-work: every 256th record and not
-	 * frequently than once a 5 seconds
+	 * the minimal record-in-work, not frequently than once a 3 seconds
 	 */
-	if ((index & 256) == 0 && time(NULL) - lamigo_last_cleared < 5)
+	if (time(NULL) - lamigo_last_cleared < 3)
 		return;
 	lamigo_last_cleared = time(NULL);
 
@@ -1564,68 +1602,56 @@ static void lamigo_check_and_clear_changelog(void)
 			index = tmp->rj_index;
 	}
 
-	if (index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency)
+	lipe_list_for_each_entry(tmp, &lamigo_failed_job_list, rj_list) {
+		if (tmp->rj_check_job || !tmp->rj_index)
+			continue;
+		if (tmp->rj_index < index)
+			index = tmp->rj_index;
+	}
+
+	if (index < lamigo_last_cleared_index ||
+	    index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency)
 		return;
 
 	llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "CLEAR upto %llu in %s (%llu last)\n",
 		     index, opt.o_chlg_user, lamigo_last_processed_idx);
 	lamigo_last_cleared_index = index;
 	rc = llapi_changelog_clear(opt.o_mdtname, opt.o_chlg_user, index);
-	if (rc)
+	if (rc < 0) {
 		llapi_error(LLAPI_MSG_ERROR, rc,
-			    "failed to clear changelog record: %s:%llu",
-			    opt.o_chlg_user, index);
+			    "failed to clear changelog record %s:%llu (%llu last)",
+			    opt.o_chlg_user, index, lamigo_last_cleared_index);
+		systemf("lctl get_param 'mdd.%s.changelog_users'", opt.o_mdtname);
+	}
 }
 
-static void lamigo_check_jobs(void)
+static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 {
-	struct resync_job *rj = NULL, *tmp;
-	int rc = 0;
-	intptr_t retval;
-
-	lipe_list_for_each_entry(tmp, &lamigo_job_list, rj_list) {
-		rc = pthread_tryjoin_np(tmp->rj_pid, (void **)&retval);
-		if (rc == 0) {
-			rj = tmp;
-			break;
-		} else if (rc != EBUSY) {
-			break;
-		}
-	}
-
-	if (rj == NULL) {
-		if (rc && rc != EBUSY)
-			llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, "unknown job %d exited\n",
-				     rc);
-		return;
-	}
-
 	llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0,
 		    "job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n",
 		    rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start,
 		    retval, rj->rj_agent->rag_bad);
 
-	lipe_list_del(&rj->rj_list);
-
 	if (rj->rj_check_job) {
 		rj->rj_agent->rag_check_in_progress = 0;
 		if (retval == 0) {
 			/* the agent is back */
-			llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is back\n",
-				     rj->rj_agent->rag_hostname);
 			if (rj->rj_agent->rag_bad) {
+				llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is back\n",
+					    rj->rj_agent->rag_hostname);
 				rj->rj_agent->rag_bad = false;
 				lamigo_max_jobs += rj->rj_agent->rag_maxjobs;
 			}
 		} else {
 			/* the agent is still bad */
 			if (rj->rj_agent->rag_bad == false) {
+				llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is bad\n",
+					    rj->rj_agent->rag_hostname);
+
 				assert(lamigo_max_jobs >= rj->rj_agent->rag_maxjobs);
 				lamigo_max_jobs -= rj->rj_agent->rag_maxjobs;
 				rj->rj_agent->rag_bad = true;
 			}
-			llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, "agent %s is still bad\n",
-			      rj->rj_agent->rag_hostname);
 		}
 		free(rj);
 		return;
@@ -1662,19 +1688,40 @@ static void lamigo_check_jobs(void)
 		/* no striping info or file disappeared */
 		/* no need to handle, forget the changelog record */
 	} else if (retval == 0) {
-		stats.s_replicated++;
+		if (!rj->rj_setprefer)
+			stats.s_replicated++;
 	}
 
 	if (rj)
 		free(rj);
 }
 
+static void lamigo_check_jobs(void)
+{
+	struct resync_job *rj, *tmp;
+	int rc = 0;
+	intptr_t retval;
+
+	lipe_list_for_each_entry_safe(rj, tmp, &lamigo_job_list, rj_list) {
+		rc = pthread_tryjoin_np(rj->rj_pid, (void **)&retval);
+		if (rc == EBUSY) {
+			continue;
+		} else if (rc != 0) {
+			llapi_error(LLAPI_MSG_ERROR, rc, "cannot join thread %lld",
+				    (long long)rj->rj_pid);
+		} else {
+			lipe_list_del(&rj->rj_list);
+			lamigo_job_fini(rj, retval);
+		}
+	}
+}
+
 static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 {
 	struct resync_agent *a;
 	int i;
 
-	a = malloc(sizeof(*a));
+	a = calloc(1, sizeof(*a));
 	if (!a) {
 		llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
 			     "can't allocate memory for agent\n");
@@ -1715,8 +1762,8 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 
 	/* ssh context per job, and one more for agent heartbeat */
 	for (i = 0; i < a->rag_maxjobs + 1; i++) {
-		struct resync_ssh_session *rss =
-				malloc(sizeof(struct resync_ssh_session));
+		struct resync_ssh_session *rss = calloc(1, sizeof(*rss));
+		int rc;
 
 		if (!rss) {
 			llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
@@ -1724,6 +1771,14 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 			exit(1);
 		}
 
+		rc = lipe_ssh_context_init(&rss->rss_ctx, a->rag_hostname);
+		if (rc < 0) {
+			llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
+				    "cannot create SSH context for '%s'\n",
+				    a->rag_hostname);
+			exit(1);
+		}
+
 		lipe_list_add(&rss->rss_list, &a->rag_ssh_list);
 	}
 
@@ -1750,6 +1805,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 #define LAMIGO_OPT_STATFS_REFRESH	15
 #define LAMIGO_OPT_SRC_FREE		16
 #define LAMIGO_OPT_TGT_FREE		17
+#define LAMIGO_OPT_VERSION		18
 
 static struct option options[] = {
 	{ "ignore-reads", no_argument, NULL, LAMIGO_OPT_IGNORE_READS},
@@ -1790,6 +1846,7 @@ static struct option options[] = {
 	{ "tgt", required_argument, NULL, 't'},
 	{ "user", required_argument, 0, 'u'},
 	{ "verbose", no_argument, NULL, 'v'},
+	{ "version", no_argument, NULL, LAMIGO_OPT_VERSION },
 	{ NULL }
 };
 
@@ -2082,6 +2139,9 @@ void lamigo_process_opt(int c, char *optarg)
 			exit(1);
 		}
 		break;
+	case LAMIGO_OPT_VERSION:
+		lipe_version();
+		exit(0);
 	case 'a':
 		opt.o_min_age = strtol(optarg, &endptr, 10);
 		if (*endptr != '\0' || opt.o_min_age < 5) {
@@ -2594,7 +2654,7 @@ int lamigo_lipe_callback(struct lipe_instance *instance,
 	lamigo_check_jobs();
 
 	do {
-		while (lamigo_jobs_running >= lamigo_max_jobs) {
+		while (are_agents_busy()) {
 			lamigo_check_jobs();
 			lamigo_check_sync();
 			lamigo_check_bad_agents();
@@ -3194,6 +3254,7 @@ int main(int argc, char **argv)
 	int	 ret = 0;
 	pthread_t pid;
 
+	lipe_version_init();
 	ssh_threads_set_callbacks(ssh_threads_get_pthread());
 	ssh_init();
 
@@ -3208,6 +3269,10 @@ int main(int argc, char **argv)
 	 * followed by the MDT name ("lamigo lustre-MDT0000"). */
 	llapi_set_command_name(opt.o_mdtname);
 
+	llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
+		    "version %s-%s, revision %s\n",
+		    PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION);
+
 	rc = lamigo_init_cache();
 	if (rc < 0) {
 		llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, "can't init cache\n");
@@ -3276,11 +3341,15 @@ int main(int argc, char **argv)
 
 		if (enable_heat)
 			lamigo_check_hot();
-		rc = lamigo_check_sync();
-		if (rc < 0) {
-			stop = true;
-			ret = rc;
+
+		if (!are_agents_busy()) {
+			rc = lamigo_check_sync();
+			if (rc < 0) {
+				stop = true;
+				ret = rc;
+			}
 		}
+
 		lamigo_check_jobs();
 		lamigo_check_and_clear_changelog();
 		lamigo_check_bad_agents();
@@ -3305,6 +3374,7 @@ int main(int argc, char **argv)
 out:
 	lamigo_cleanup();
 	llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, "exited\n");
+	lipe_version_fini();
 
 	return ret;
 }
diff --git a/src/lamigo_alr.c b/src/lamigo_alr.c
index fa28ce2..aceb3ce 100644
--- a/src/lamigo_alr.c
+++ b/src/lamigo_alr.c
@@ -261,20 +261,14 @@ void *lamigo_alr_data_collection_thread(void *arg)
 		opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args);
 
 repeat:
-	rc = lipe_init_ssh_session(&ala->ala_ctx, ala->ala_host);
-	if (rc) {
-		llapi_err_noerrno(LLAPI_MSG_ERROR,
-				"failed to create connection to agent [%s], %d\n",
-				ala->ala_host, rc);
-		goto again;
-	}
-
-	rc = lipe_ssh_start_cmd(ala->ala_ctx.lsc_session, cmd, &channel);
+	rc = lipe_ssh_start_cmd(&ala->ala_ctx, cmd, &channel);
 	if (rc != SSH_OK) {
-		llapi_error(LLAPI_MSG_ERROR, rc,
-			    "cannot start ofd_access_log_readed\n");
+		llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0,
+			    "cannot start ofd_access_log_reader on host '%s': rc = %d\n",
+			    ala->ala_host, rc);
 		goto err;
 	}
+
 	llapi_err_noerrno(LLAPI_MSG_DEBUG, "alr agent on %s is running\n",
 			  ala->ala_host);
 
@@ -303,7 +297,6 @@ err:
 	ssh_channel_close(channel);
 	ssh_channel_free(channel);
 
-again:
 	/* wait for a while */
 	/* XXX: should be configurable? */
 	sleep(5);
@@ -907,11 +900,17 @@ void lamigo_alr_init(void)
 void lamigo_add_alr_agent(const char *host)
 {
 	struct alr_agent *ala;
+	int rc;
 
-	ala = calloc(sizeof(*ala), 1);
-	assert(ala);
+	ala = calloc(1, sizeof(*ala));
+	assert(ala != NULL);
 
 	ala->ala_host = strdup(host);
+	assert(ala->ala_host != NULL);
+
+	rc = lipe_ssh_context_init(&ala->ala_ctx, ala->ala_host);
+	assert(rc == SSH_OK);
+
 	lipe_list_add(&ala->ala_list, &alr_agent_list);
 }
 
diff --git a/src/lipe_ssh.c b/src/lipe_ssh.c
index 5df17d9..43ac483 100644
--- a/src/lipe_ssh.c
+++ b/src/lipe_ssh.c
@@ -1,4 +1,6 @@
 #include <stdlib.h>
+#include <assert.h>
+#include <malloc.h>
 #include <string.h>
 #include <errno.h>
 
@@ -6,61 +8,59 @@
 
 #include "lipe_ssh.h"
 
-int lipe_ssh_authenticate_pubkey(ssh_session session)
-{
-	int rc;
-
-	rc = ssh_userauth_publickey_auto(session, NULL, NULL);
-	if (rc != SSH_AUTH_SUCCESS) {
-		llapi_printf(LLAPI_MSG_ERROR, "Authentication failed: %s\n",
-			     ssh_get_error(session));
-		return rc;
-	}
-
-	return SSH_AUTH_SUCCESS;
-}
+#define lipe_ssh_debug(fmt, args...) \
+	llapi_error(LLAPI_MSG_DEBUG|LLAPI_MSG_NO_ERRNO, 0, fmt, ##args)
 
+#define lipe_ssh_error(fmt, args...) \
+	llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, fmt, ##args)
 
-int lipe_ssh_start_cmd(ssh_session session, const char *cmd, ssh_channel *rch)
+static int lipe_ssh_session_start_cmd(ssh_session session, const char *cmd, ssh_channel *pchannel)
 {
-	ssh_channel channel;
+	ssh_channel channel = NULL;
 	int rc;
 
+	assert(session != NULL);
+
 	channel = ssh_channel_new(session);
 	if (channel == NULL) {
-		llapi_printf(LLAPI_MSG_ERROR,
-			     "Error allocating a new channel: %s\n",
-			     ssh_get_error(session));
-		return SSH_ERROR;
+		lipe_ssh_error("canot create a new SSH channel: %s\n",
+			       ssh_get_error(session));
+		rc = SSH_ERROR;
+		goto out;
 	}
 
 	rc = ssh_channel_open_session(channel);
 	if (rc != SSH_OK) {
-		llapi_printf(LLAPI_MSG_ERROR,
-			     "Error opening a session channel: %s\n",
-			     ssh_get_error(session));
-		ssh_channel_free(channel);
-		return rc;
+		lipe_ssh_error("cannot open SSH session channel: %s\n",
+			       ssh_get_error(session));
+		goto out;
 	}
 
 	rc = ssh_channel_request_exec(channel, cmd);
 	if (rc != SSH_OK) {
-		llapi_printf(LLAPI_MSG_ERROR, "Error executing command: %s\n",
-			     ssh_get_error(session));
-		ssh_channel_close(channel);
-		ssh_channel_free(channel);
-		return rc;
+		lipe_ssh_error("cannot execute SSH command: %s\n",
+			       ssh_get_error(session));
+		goto out;
 	}
-	*rch = channel;
-	return SSH_OK;
+
+	*pchannel = channel;
+	channel = NULL;
+	rc = SSH_OK;
+out:
+	if (channel != NULL)
+		ssh_channel_close(channel);
+
+	ssh_channel_free(channel);
+
+	return rc;
 }
 
-int lipe_ssh_exec_cmd(ssh_session session, const char *cmd)
+static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd)
 {
 	ssh_channel channel;
 	int rc;
 
-	rc = lipe_ssh_start_cmd(session, cmd, &channel);
+	rc = lipe_ssh_session_start_cmd(session, cmd, &channel);
 	if (rc != SSH_OK)
 		return rc;
 
@@ -72,114 +72,140 @@ int lipe_ssh_exec_cmd(ssh_session session, const char *cmd)
 	return rc;
 }
 
-int lipe_init_ssh_session(struct lipe_ssh_context *ctx,
-			  const char *host)
+static void lipe_ssh_session_destroy(ssh_session *psession)
+{
+	if (*psession != NULL)
+		ssh_disconnect(*psession);
+
+	ssh_free(*psession);
+	*psession = NULL;
+}
+
+static int lipe_ssh_session_create(ssh_session *psession, const char *host)
 {
-	int timeout = 5;
+	ssh_session session = NULL;
+	long timeout = 5;
 	int rc;
 
-	ctx->lsc_host = strdup(host);
-	if (ctx->lsc_host == NULL)
-		return SSH_ERROR;
+	assert(SSH_OK == 0);
+	assert(SSH_AUTH_SUCCESS == 0);
+	assert(SSH_ERROR < 0);
+
+	assert(psession != NULL);
+	assert(*psession == NULL);
+	assert(host != NULL);
 
-	pthread_mutex_init(&ctx->lsc_session_mutex, NULL);
-	/* Create a new ssh session and set options */
-	ctx->lsc_session = ssh_new();
-	if (ctx->lsc_session == NULL) {
-		llapi_printf(LLAPI_MSG_ERROR,
-			     "Error creating a new ssh session\n");
+	lipe_ssh_debug("creating new SSH session for host '%s'\n", host);
+
+	session = ssh_new();
+	if (session == NULL) {
+		lipe_ssh_error("cannot create a new SSH session: %s\n",
+			       strerror(ENOMEM)); /* Probably. */
 		rc = SSH_ERROR;
-		goto out_host;
+		goto out;
 	}
 
-	ssh_options_set(ctx->lsc_session, SSH_OPTIONS_HOST, host);
+	rc = ssh_options_set(session, SSH_OPTIONS_HOST, host);
+	if (rc != SSH_OK) {
+		lipe_ssh_error("cannot set SSH session host to '%s: %s'\n",
+			       host, ssh_get_error(session));
+		goto out;
+	}
 
-	if (ssh_options_set(ctx->lsc_session, SSH_OPTIONS_TIMEOUT,
-			    &timeout) < 0) {
-		llapi_printf(LLAPI_MSG_FATAL,
-			     "Error setting ssh timeout\n");
-		rc = SSH_ERROR;
+	rc = ssh_options_set(session, SSH_OPTIONS_TIMEOUT, &timeout);
+	if (rc != SSH_OK) {
+		lipe_ssh_error("cannot set SSH timeout to %ld: %s\n",
+			       timeout, ssh_get_error(session));
 		goto out;
 	}
 
 	/* Connect to the ssh server */
-	rc = ssh_connect(ctx->lsc_session);
+	rc = ssh_connect(session);
 	if (rc != SSH_OK) {
-		llapi_printf(LLAPI_MSG_ERROR,
-			     "Error connecting to host %s: %s\n",
-			     host,
-			     ssh_get_error(ctx->lsc_session));
+		lipe_ssh_error("cannot connect SSH session to host '%s': %s\n",
+			       host, ssh_get_error(session));
 		goto out;
 	}
 
 	/* Automatically authenticate with public key */
-	rc = lipe_ssh_authenticate_pubkey(ctx->lsc_session);
-	if (rc == SSH_AUTH_SUCCESS)
-		return rc;
-
-	llapi_printf(LLAPI_MSG_ERROR,
-		     "Error authenticating pubkey: %s\n",
-		     ssh_get_error(ctx->lsc_session));
+	rc = ssh_userauth_publickey_auto(session, NULL, NULL);
+	if (rc != SSH_AUTH_SUCCESS) {
+		lipe_ssh_error("cannot authenticate SSH session to host '%s': %s\n",
+			       host, ssh_get_error(session));
+		goto out;
+	}
 
-	ssh_disconnect(ctx->lsc_session);
+	*psession = session;
+	session = NULL;
+	rc = SSH_OK;
 out:
-	ssh_free(ctx->lsc_session);
-	ctx->lsc_session = NULL;
-out_host:
-	free(ctx->lsc_host);
+	lipe_ssh_debug("create new SSH session for host '%s': rc = %d\n", host, rc);
+	lipe_ssh_session_destroy(&session);
 
 	return rc;
 }
 
-int lipe_reset_ssh_session(struct lipe_ssh_context *ctx)
+static int lipe_ssh_context_check(struct lipe_ssh_context *ctx)
 {
-	int rc;
-	char *hostname;
-
-	pthread_mutex_lock(&ctx->lsc_session_mutex);
-	hostname = strdup(ctx->lsc_host);
-	if (!hostname) {
-		pthread_mutex_unlock(&ctx->lsc_session_mutex);
-		return -ENOMEM;
-	}
-	lipe_cleanup_ssh_session(ctx);
-	rc = lipe_init_ssh_session(ctx, hostname);
-	pthread_mutex_unlock(&ctx->lsc_session_mutex);
-	free(hostname);
+	if (ctx->lsc_session == NULL)
+		return lipe_ssh_session_create(&ctx->lsc_session, ctx->lsc_host);
 
-	return rc;
+	return SSH_OK;
 }
 
-int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd)
+static void lipe_ssh_context_fail(struct lipe_ssh_context *ctx)
 {
-	int rc;
-
-	/* Execute a remote command */
-	rc = lipe_ssh_exec_cmd(ctx->lsc_session, cmd);
-	if (rc == SSH_OK)
-		return SSH_OK;
-
-	/* Reset ssh session and try again */
-	if (rc == SSH_ERROR || rc == SSH_AGAIN) {
-		rc = lipe_reset_ssh_session(ctx);
-		if (rc)
-			return rc;
-		rc = lipe_ssh_exec_cmd(ctx->lsc_session, cmd);
-	}
-
-	return rc;
+	lipe_ssh_debug("fail SSH context for host '%s'\n", ctx->lsc_host);
+	lipe_ssh_session_destroy(&ctx->lsc_session);
 }
 
-void lipe_cleanup_ssh_session(struct lipe_ssh_context *ctx)
+void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx)
 {
-	ssh_disconnect(ctx->lsc_session);
-	ssh_free(ctx->lsc_session);
+	lipe_ssh_session_destroy(&ctx->lsc_session);
 	free(ctx->lsc_host);
+	ctx->lsc_host = NULL;
 }
 
-int lipe_ssh_status(struct lipe_ssh_context *ctx)
+int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host)
 {
-	if (ctx->lsc_session == NULL)
+	memset(ctx, 0, sizeof(*ctx));
+
+	ctx->lsc_host = strdup(host);
+	if (ctx->lsc_host == NULL)
 		return SSH_ERROR;
+
+	/* Session creation will be done on demand. */
+
 	return SSH_OK;
 }
+
+int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel)
+{
+	int rc;
+
+	rc = lipe_ssh_context_check(ctx);
+	if (rc != SSH_OK)
+		return rc;
+
+	rc = lipe_ssh_session_start_cmd(ctx->lsc_session, cmd, pchannel);
+	if (rc != SSH_OK)
+		lipe_ssh_context_fail(ctx);
+
+	return rc;
+}
+
+int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd)
+{
+	int rc;
+
+	rc = lipe_ssh_context_check(ctx);
+	if (rc != SSH_OK)
+		return rc;
+
+	/* Execute a remote command */
+	rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd);
+	if (rc < 0)
+		lipe_ssh_context_fail(ctx);
+
+	return rc;
+}
diff --git a/src/lipe_ssh.h b/src/lipe_ssh.h
index 5d5e224..57ab9d8 100644
--- a/src/lipe_ssh.h
+++ b/src/lipe_ssh.h
@@ -4,23 +4,17 @@
 #ifndef _LIPE_SSH_H_
 #define _LIPE_SSH_H_
 
-#include <pthread.h>
 #include <libssh/libssh.h>
 
 struct lipe_ssh_context {
 	char *lsc_host;
 	ssh_session lsc_session;
-	pthread_mutex_t lsc_session_mutex;
 };
 
-int lipe_ssh_authenticate_pubkey(ssh_session session);
-int lipe_ssh_exec_cmd(ssh_session session, const char *cmd);
-int lipe_init_ssh_session(struct lipe_ssh_context *ctx,
-			  const char *host);
-void lipe_cleanup_ssh_session(struct lipe_ssh_context *ctx);
-int lipe_reset_ssh_session(struct lipe_ssh_context *ctx);
+int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host);
+void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx);
+
 int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd);
-int lipe_ssh_start_cmd(ssh_session session, const char *cmd, ssh_channel *rch);
-int lipe_ssh_status(struct lipe_ssh_context *ctx);
+int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel);
 
 #endif /* _LIPE_SSH_H_ */
diff --git a/src/lipe_version.c b/src/lipe_version.c
new file mode 100644
index 0000000..e8b2342
--- /dev/null
+++ b/src/lipe_version.c
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include "lipe_version.h"
+
+/* A grep-able string for stripped binaries. */
+static const char lipe_revision_string[] = "LiPE Revision: "LIPE_REVISION;
+
+/* A grep-able string for core dumps. */
+static char *lipe_revision_string_copy;
+
+void lipe_version(void)
+{
+	printf("%s (%s) %s-%s\n%s\n",
+	       program_invocation_short_name,
+	       PACKAGE_NAME,
+	       PACKAGE_VERSION,
+	       LIPE_RELEASE,
+	       lipe_revision_string);
+}
+
+void lipe_version_init(void)
+{
+	if (lipe_revision_string_copy == NULL)
+		lipe_revision_string_copy = strdup(lipe_revision_string);
+}
+
+void lipe_version_fini(void)
+{
+	free(lipe_revision_string_copy);
+}
diff --git a/src/lipe_version.h b/src/lipe_version.h
new file mode 100644
index 0000000..49e3c3f
--- /dev/null
+++ b/src/lipe_version.h
@@ -0,0 +1,8 @@
+#ifndef _LIPE_VERSION_H_
+#define _LIPE_VERSION_H_
+
+void lipe_version(void);
+void lipe_version_init(void);
+void lipe_version_fini(void);
+
+#endif
diff --git a/src/lpurge.c b/src/lpurge.c
index 17cff87..c1361c1 100644
--- a/src/lpurge.c
+++ b/src/lpurge.c
@@ -27,7 +27,6 @@
 /*
  * TODO
  *  - convert blocks to 1K units
- *  - multiple --mirror-id in lfs split to save on ssh sessions
  *  - take OST load into account
  *
  */
@@ -59,13 +58,11 @@
 #include <linux/lustre/lustre_fid.h>
 #include <linux/lustre/lustre_cfg.h>
 #include <json-c/json.h>
-#include <libssh/libssh.h>
-#include <libssh/callbacks.h>
 #include "lipe_object_attrs.h"
+#include "lipe_version.h"
 #include "list.h"
 #include "policy.h"
 #include "flist.h"
-#include "lipe_ssh.h"
 
 #define container_of(ptr, type, member) ({                      \
 	const typeof(((type *) 0)->member) * __mptr = (ptr);     \
@@ -76,23 +73,15 @@
 #define LPURGE_FIDS_DUMPFILE  "/var/run/lpurge-%s.fids"
 
 struct lpurge_object {
+	struct lipe_list_head lo_list;
 	struct lu_fid lo_fid;
 	/* Used space in bytes */
 	__u32 lo_blocks;
-	__u32 lo_comp_id;
+	__u32 lo_mirror_id;
 	/* Last use time */
 	time_t lo_last_utime;
 };
 
-struct lpurge_object_pack {
-	struct lipe_list_head lop_list;
-	unsigned int lop_max;
-	unsigned int lop_nr;
-	unsigned int lop_mdt_idx;
-	pthread_t lop_pid;
-	struct lpurge_object lop_objs[];
-};
-
 struct lpurge_slot {
 	time_t ls_min_utime;
 	time_t ls_max_utime;
@@ -104,25 +93,15 @@ struct lpurge_slot {
 	unsigned int ls_scan;
 	struct lipe_list_head ls_obj_list;
 	pthread_mutex_t ls_mutex;
+	/* stats for objects we can't release */
+	unsigned long ls_nomirror_objs; /* no replicated objects */
+	unsigned long ls_nomirror_space;
+	unsigned long ls_nopfid_objs; /* no PFID, can't find parent */
+	unsigned long ls_nopfid_space;
+	unsigned long ls_notfirst_objs; /* not a first stripe */
+	unsigned long ls_notfirst_space;
 };
 
-struct mds_ssh_session {
-	struct lipe_list_head mss_list;
-	struct lipe_ssh_context mss_ctx;
-};
-
-struct mds {
-	char *host;
-	char *mnt;
-	struct lipe_list_head m_ssh_list;
-	pthread_mutex_t m_ssh_lock;
-	pthread_cond_t m_ssh_cond;
-};
-
-#define MAX_MDTS       128
-struct mds mds[MAX_MDTS];
-int mdsnr;
-
 #define LPURGE_HIST_MAX        16
 struct lpurge_slot lpurge_hist[LPURGE_HIST_MAX];
 
@@ -131,8 +110,11 @@ struct stats {
 	unsigned long s_scans;
 	unsigned long s_low_hits;
 	unsigned long s_slow_scans;
-	unsigned long s_spawned;
-	unsigned long s_purged; /* files */
+	unsigned long s_queued; /* # files queued for purge */
+	unsigned long s_started; /* # files dequeued for purge by worker */
+	unsigned long s_purged; /* # files we successfully purged */
+	unsigned long s_failed; /* # files we failed to purge */
+	/* TODO count mirrors we tried to purge but found last non stale */
 };
 struct stats stats;
 
@@ -177,6 +159,7 @@ char *ostprefix;
 char ost_mntdev[PATH_MAX];
 char *ost_mntpt;
 int lustre_fd = -1;
+int open_by_fid_fd = -1;
 unsigned long oldest;
 time_t scan_started_time;
 time_t scan_finished_time;
@@ -191,13 +174,6 @@ char *prog;
 
 void load_config(char *name);
 
-#define PACK_SIZE	(64*1024)
-#define PACK_NR ((PACK_SIZE - offsetof(struct lpurge_object_pack, lop_objs[0])) / \
-	sizeof(struct lpurge_object))
-
-#define FIDS_PER_CALL	128 /* XXX: bump to a bigger value */
-#define PM_SIZE	(offsetof(struct lpurge_object_pack, lop_objs[FIDS_PER_CALL]))
-
 struct lipe_flist *lflist;
 #define LPURGE_FLIST_SIZE (1024 * 1024)
 
@@ -225,25 +201,11 @@ static int lpurge_init_flist(void)
 
 static void sig_handler(int signal)
 {
-	int i;
-
 	psignal(signal, "exiting");
 	if (lflist)
 		flist_free(lflist);
 
-	for (i = 0; i < mdsnr; i++) {
-		struct mds *m = &mds[i];
-		struct mds_ssh_session *mss, *tmp;
-
-		lipe_list_for_each_entry_safe(mss, tmp, &m->m_ssh_list,
-					mss_list) {
-			lipe_cleanup_ssh_session(&mss->mss_ctx);
-			lipe_list_del(&mss->mss_list);
-			free(mss);
-		}
-	}
-
-	_exit(1);
+	_exit(0);
 }
 
 static void lpurge_null_handler(int signal)
@@ -277,7 +239,7 @@ void lpurge_init_result(void)
 
 void lpurge_reset_result(void)
 {
-	struct lpurge_object_pack *s, *t;
+	struct lpurge_object *s, *t;
 	int i;
 
 	for (i = 0; i < LPURGE_HIST_MAX; i++) {
@@ -287,9 +249,15 @@ void lpurge_reset_result(void)
 		lpurge_hist[i].ls_scan = 1;
 		lpurge_hist[i].ls_min_utime = ~0UL;
 		lpurge_hist[i].ls_max_utime = 0;
+		lpurge_hist[i].ls_nopfid_objs = 0;
+		lpurge_hist[i].ls_nopfid_space = 0;
+		lpurge_hist[i].ls_nomirror_objs = 0;
+		lpurge_hist[i].ls_nomirror_space = 0;
+		lpurge_hist[i].ls_notfirst_objs = 0;
+		lpurge_hist[i].ls_notfirst_space = 0;
 		lipe_list_for_each_entry_safe(s, t, &lpurge_hist[i].ls_obj_list,
-					      lop_list) {
-			lipe_list_del(&s->lop_list);
+					      lo_list) {
+			lipe_list_del(&s->lo_list);
 			free(s);
 		}
 	}
@@ -312,7 +280,8 @@ void usage(void)
 	       "\t-R, --scan_rate, slow scanning rate, objs/sec (default: %u)\n"
 	       "\t-S, --slot_size, objects to store in memory (default: %u)\n"
 	       "\t-t, --scan_threads, scanning threads (default: %u)\n"
-	       "\t-w, --dump, stats file (via USR1 signal, default: %s)\n",
+	       "\t-w, --dump, stats file (via USR1 signal, default: %s)\n"
+	       "\t--version, print version information and exit\n",
 	       program_invocation_short_name,
 	       DEF_FREEHI,
 	       DEF_INTERVAL,
@@ -549,7 +518,7 @@ static void lpurge_reclaim_slot(unsigned int index)
 	int i = 0;
 	struct lpurge_slot *ls_1 = lpurge_hist; /* youngest slot */
 	struct lpurge_slot *ls_2;
-	struct lpurge_object_pack *p, *t;
+	struct lpurge_object *p, *t;
 
 	if (index >= LPURGE_HIST_MAX - 1)
 		return;
@@ -562,8 +531,8 @@ static void lpurge_reclaim_slot(unsigned int index)
 	ls_1->ls_min_utime = ~0UL;
 	/* reclaim youngest slot */
 	lipe_list_for_each_entry_safe(p, t, &ls_1->ls_obj_list,
-				      lop_list) {
-		lipe_list_del(&p->lop_list);
+				      lo_list) {
+		lipe_list_del(&p->lo_list);
 		free(p);
 	}
 	pthread_mutex_unlock(&ls_1->ls_mutex);
@@ -630,7 +599,7 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
 			 struct lipe_object_attrs *attrs)
 {
 	struct lpurge_slot *ls = NULL;
-	struct lpurge_object_pack *lp = NULL;
+	struct lpurge_object *lo = NULL;
 	time_t age, last_used;
 	int index;
 
@@ -642,25 +611,6 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
 	if ((attrs->loa_mode & S_IFMT) != S_IFREG)
 		goto out;
 
-	if ((attrs->loa_attr_bits & LIPE_OBJECT_ATTR_FILTER_FID) == 0)
-		goto out;
-
-	/* to avoid N OSTs to 1 MDT scalability issue we only consider
-	 * objects which store 1st stripe
-	 */
-	if (attrs->loa_filter_fid.ff_parent.f_ver != 0)
-		goto out;
-
-	/* if the object has got ost_layout structure which encodes
-	 * whether the object has a mirror, then we can skip objects
-	 * with mirror_id=0 (no mirror)
-	 */
-	if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) {
-		if (mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id)
-		    == 0)
-			goto out;
-	}
-
 	last_used = attrs->loa_mtime_ms;
 	if (last_used < attrs->loa_atime_ms)
 		last_used = attrs->loa_atime_ms;
@@ -685,13 +635,45 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
 		goto out;
 	}
 
+	pthread_mutex_lock(&ls->ls_mutex);
+
+	if ((attrs->loa_attr_bits & LIPE_OBJECT_ATTR_FILTER_FID) == 0) {
+		ls->ls_nopfid_objs++;
+		ls->ls_nopfid_space += attrs->loa_blocks >> 10;
+		pthread_mutex_unlock(&ls->ls_mutex);
+		goto out;
+	}
+
+	/* to avoid N OSTs to 1 MDT scalability issue we only consider
+	 * objects which store 1st stripe
+	 */
+	if (attrs->loa_filter_fid.ff_parent.f_ver != 0) {
+		ls->ls_notfirst_objs++;
+		ls->ls_notfirst_space += attrs->loa_blocks >> 10;
+		pthread_mutex_unlock(&ls->ls_mutex);
+		goto out;
+	}
+
+	/* if the object has got ost_layout structure which encodes
+	 * whether the object has a mirror, then we can skip objects
+	 * with mirror_id=0 (no mirror)
+	 */
+	if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) {
+		if (mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id)
+		    == 0) {
+			ls->ls_nomirror_objs++;
+			ls->ls_nomirror_space += attrs->loa_blocks >> 10;
+			pthread_mutex_unlock(&ls->ls_mutex);
+			goto out;
+		}
+	}
+
 	llapi_printf(LLAPI_MSG_DEBUG,
 		     "found under "DFID": size %ld block %ld age %ld slot %d\n",
 		     PFID(&attrs->loa_filter_fid.ff_parent),
 		     (unsigned long)attrs->loa_size,
 		     (unsigned long)attrs->loa_blocks >> 10, age, index);
 
-	pthread_mutex_lock(&ls->ls_mutex);
 	ls->ls_found++;
 	ls->ls_space += attrs->loa_blocks >> 10;
 	lpurge_scanned_since++;
@@ -723,42 +705,31 @@ int lpurge_lipe_callback(struct lipe_instance *instance,
 		}
 	}
 
-	if (!lipe_list_empty(&ls->ls_obj_list))
-		lp = lipe_list_entry(ls->ls_obj_list.next,
-				     struct lpurge_object_pack, lop_list);
+	lo = calloc(1, sizeof(*lo));
+	if (lo == NULL)
+		goto out;
 
-	if (!lp || lp->lop_nr >= lp->lop_max) {
-		lp = calloc(1, PACK_SIZE);
-		if (!lp) {
-			llapi_printf(LLAPI_MSG_ERROR,
-				     "can't alloca pack\n");
-			exit(1);
-		}
-		lp->lop_pid = 0;
-		lp->lop_max = PACK_NR;
-		lp->lop_nr = 0;
-		lipe_list_add(&lp->lop_list, &ls->ls_obj_list);
-	}
-	lp->lop_objs[lp->lop_nr].lo_fid = attrs->loa_filter_fid.ff_parent;
-	lp->lop_objs[lp->lop_nr].lo_blocks = attrs->loa_blocks >> 10;
-	lp->lop_objs[lp->lop_nr].lo_comp_id = 0;
-	lp->lop_objs[lp->lop_nr].lo_last_utime = last_used;
+	lo->lo_fid = attrs->loa_filter_fid.ff_parent;
+	lo->lo_blocks = attrs->loa_blocks >> 10;
+	lo->lo_last_utime = last_used;
 	if (attrs->loa_filter_fid_size >= sizeof(struct filter_fid)) {
 		__u32 id;
 
 		id = mirror_id_of(attrs->loa_filter_fid.ff_layout.ol_comp_id);
-		lp->lop_objs[lp->lop_nr].lo_comp_id = id;
+		lo->lo_mirror_id = id;
 	}
-	lp->lop_nr++;
+
+	lipe_list_add(&lo->lo_list, &ls->ls_obj_list);
 	ls->ls_stored++;
+
 	if (ls->ls_max_utime < last_used)
 		ls->ls_max_utime = last_used;
 	if (ls->ls_min_utime > last_used)
 		ls->ls_min_utime = last_used;
 	pthread_mutex_unlock(&ls->ls_mutex);
 
-	llapi_printf(LLAPI_MSG_DEBUG, "add "DFID" to %p/%d\n",
-		     PFID(&attrs->loa_filter_fid.ff_parent), lp, lp->lop_nr);
+	llapi_printf(LLAPI_MSG_DEBUG, "add "DFID" to %p/%lu\n",
+		     PFID(&attrs->loa_filter_fid.ff_parent), ls, ls->ls_stored);
 
 out:
 	/*
@@ -786,257 +757,324 @@ out:
 	return 0;
 }
 
-static int lpurge_exec_cmd(struct mds *m, char *cmd)
+struct collect_ids_data {
+	__u16	*cid_ids;
+	int	cid_count;
+	__u16	cid_exclude;
+};
+
+static int collect_mirror_id(struct llapi_layout *layout, void *cbdata)
+{
+	struct collect_ids_data *cid = cbdata;
+	uint32_t id;
+	int rc;
+
+	rc = llapi_layout_mirror_id_get(layout, &id);
+	if (rc < 0)
+		return rc;
+
+	if ((__u16)id != cid->cid_exclude) {
+		int i;
+
+		for (i = 0; i < cid->cid_count; i++) {
+			/* already collected the mirror id */
+			if (id == cid->cid_ids[i])
+				return LLAPI_LAYOUT_ITER_CONT;
+		}
+		cid->cid_ids[cid->cid_count] = id;
+		cid->cid_count++;
+	}
+
+	return LLAPI_LAYOUT_ITER_CONT;
+}
+
+static bool last_non_stale_mirror(__u16 mirror_id, struct llapi_layout *layout)
 {
-	struct mds_ssh_session *mss;
+	__u16 mirror_ids[128] = { 0 };
+	struct collect_ids_data cid = {
+		.cid_ids = mirror_ids,
+		.cid_count = 0,
+		.cid_exclude = mirror_id,
+	};
+	int i;
+
+	llapi_layout_comp_iterate(layout, collect_mirror_id, &cid);
+
+	for (i = 0; i < cid.cid_count; i++) {
+		struct llapi_resync_comp comp_array[1024] = { { 0 } };
+		int comp_size = 0;
+
+		comp_size = llapi_mirror_find_stale(layout, comp_array,
+						    ARRAY_SIZE(comp_array),
+						    &mirror_ids[i], 1);
+		if (comp_size == 0)
+			return false;
+	}
+
+	return true;
+}
+
+static int
+lpurge_mirror_delete(const struct lu_fid *fid, unsigned int mirror_id)
+{
+	char fid_buf[FID_LEN + 1];
+	char vname_buf[PATH_MAX];
+	struct ll_ioc_lease *lil = NULL;
+	struct llapi_layout *layout = NULL;
+	int mdt_index = -1;
+	int fd = -1;
+	int vfd = -1;
 	int rc;
 
-	pthread_mutex_lock(&m->m_ssh_lock);
-	/*
-	 * this should not happen since we limited the max count of launched jobs
-	 * just in case.
+	/* Inline replacement for
+	 * lfs mirror split -d --mirror-id mirror_id $MOUNTPOINT/.lustre/fid/FID
 	 */
-	while (lipe_list_empty(&m->m_ssh_list))
-		pthread_cond_wait(&m->m_ssh_cond, &m->m_ssh_lock);
 
-	/* pop one session */
-	mss = container_of(m->m_ssh_list.next, struct mds_ssh_session,
-			mss_list);
-	lipe_list_del(&mss->mss_list);
+	snprintf(fid_buf, sizeof(fid_buf), DFID, PFID(fid));
 
-	pthread_mutex_unlock(&m->m_ssh_lock);
+	fd = openat(open_by_fid_fd, fid_buf, O_RDWR);
+	if (fd < 0) {
+		rc = -errno;
+		llapi_printf(LLAPI_MSG_DEBUG,
+			     "cannot open "DFID" for split: %s\n",
+			     PFID(fid), strerror(errno));
+		goto out;
+	}
 
-	rc = lipe_ssh_exec(&mss->mss_ctx, cmd);
-	if (rc != 0 && rc != EUCLEAN)
-		llapi_printf(LLAPI_MSG_FATAL,
-			"error executing ssh command '%s' on '%s': rc = %d\n",
-			cmd, m->host, rc);
-	pthread_mutex_lock(&m->m_ssh_lock);
-	lipe_list_add(&mss->mss_list, &m->m_ssh_list);
-	pthread_cond_signal(&m->m_ssh_cond);
-	pthread_mutex_unlock(&m->m_ssh_lock);
+	rc = llapi_file_fget_mdtidx(fd, &mdt_index);
+	if (rc < 0)
+		goto out;
+
+	snprintf(vname_buf, sizeof(vname_buf), LUSTRE_VOLATILE_HDR":%.4X:%.4X",
+		 mdt_index, (unsigned int)random());
+
+	vfd = openat(open_by_fid_fd, vname_buf,
+		     O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW|O_LOV_DELAY_CREATE);
+	if (vfd < 0) {
+		rc = -errno;
+		goto out;
+	}
+
+	rc = llapi_lease_acquire(fd, LL_LEASE_WRLCK);
+	if (rc < 0)
+		goto out;
+
+	layout = llapi_layout_get_by_fd(fd, 0);
+	if (layout == NULL) {
+		rc = -errno;
+		goto out;
+	}
+
+	if (last_non_stale_mirror(mirror_id, layout)) {
+		rc = -EUCLEAN;
+		goto out;
+	}
+
+	lil = calloc(1, offsetof(typeof(*lil), lil_ids[2]));
+	if (lil == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	lil->lil_mode = LL_LEASE_UNLCK;
+	lil->lil_flags = LL_LEASE_LAYOUT_SPLIT;
+	lil->lil_count = 2;
+	lil->lil_ids[0] = vfd;
+	lil->lil_ids[1] = mirror_id;
+
+	rc = llapi_lease_set(fd, lil);
+	if (rc < 0) {
+		goto out;
+	} else if (rc == 0) {
+		/* lost lease lock */
+		rc = -EBUSY;
+	} else {
+		rc = 0;
+	}
+out:
+	if (rc < 0)
+		llapi_printf(LLAPI_MSG_DEBUG,
+			     "cannot delete mirror %u of "DFID": rc = %d\n",
+			     mirror_id, PFID(fid), rc);
+	else
+		llapi_printf(LLAPI_MSG_DEBUG,
+			     "deleted mirror %u of "DFID"\n",
+			     mirror_id, PFID(fid));
+
+	llapi_layout_free(layout);
+
+	free(lil);
+
+	if (!(fd < 0))
+		close(fd);
+
+	if (!(vfd < 0))
+		close(vfd);
 
 	return rc;
 }
 
-#define CMDSIZE		(64 * 1024)
+static bool lpurge_work_should_run;
+static size_t lpurge_work_thread_count;
+static pthread_t *lpurge_work_threads;
+static LIPE_LIST_HEAD(lpurge_work_list);
+static pthread_mutex_t lpurge_work_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t lpurge_work_cond = PTHREAD_COND_INITIALIZER;
+static pthread_cond_t lpurge_work_done = PTHREAD_COND_INITIALIZER;
 
-void *lpurge_spawn_one(void *args)
+static void *lpurge_work_func(void *data)
 {
-	struct lpurge_object_pack *lp = (struct lpurge_object_pack *)args;
-	char buf[PATH_MAX];
-	int rc, i, idx = lp->lop_mdt_idx;
-
-	for (i = 0; i < lp->lop_nr; i++) {
-		if (lp->lop_objs[i].lo_comp_id) {
-			snprintf(buf, sizeof(buf),
-				"lfs mirror split -d --mirror-id %u "
-				"%s/.lustre/fid/"DFID" > /dev/null 2>&1",
-				lp->lop_objs[i].lo_comp_id,
-				mds[idx].mnt, PFID(&lp->lop_objs[i].lo_fid));
-		} else {
-			snprintf(buf, sizeof(buf),
-				"lfs mirror split -d -p %s "
-				"%s/.lustre/fid/"DFID" > /dev/null 2>&1",
-				opt.o_pool, mds[idx].mnt, PFID(&lp->lop_objs[i].lo_fid));
-		}
+	struct lpurge_object *lo;
+	int rc;
 
-		rc = lpurge_exec_cmd(&mds[idx], buf);
-		if (rc)
-			llapi_printf(LLAPI_MSG_DEBUG,
-				     "couldn't remove %u from "DFID": rc=%d\n",
-				     lp->lop_objs[i].lo_comp_id,
-				     PFID(&lp->lop_objs[i].lo_fid), rc);
+	pthread_mutex_lock(&lpurge_work_lock);
+
+	while (1) {
+		while (lpurge_work_should_run && lipe_list_empty(&lpurge_work_list))
+			pthread_cond_wait(&lpurge_work_cond, &lpurge_work_lock);
+
+		if (!lpurge_work_should_run)
+			break;
+
+		lo = lipe_list_entry(lpurge_work_list.next,
+				     struct lpurge_object, lo_list);
+		lipe_list_del(&lo->lo_list);
+
+		stats.s_started++;
+		pthread_mutex_unlock(&lpurge_work_lock);
+
+		rc = lpurge_mirror_delete(&lo->lo_fid, lo->lo_mirror_id);
+		free(lo);
+
+		pthread_mutex_lock(&lpurge_work_lock);
+
+		if (rc < 0)
+			stats.s_failed++;
+		else
+			stats.s_purged++;
+
+		assert(stats.s_purged + stats.s_failed <= stats.s_queued);
+
+		if (stats.s_purged + stats.s_failed == stats.s_queued)
+			pthread_cond_signal(&lpurge_work_done);
 	}
 
-	/* ignore errors, in the worst case we'll find this object again */
-	pthread_exit((void *)0);
+	pthread_mutex_unlock(&lpurge_work_lock);
+
+	return NULL;
 }
 
-int lpurge_spawn(struct lipe_list_head *pm_list)
+static void lpurge_work_threads_stop(void)
 {
-	int jobs[MAX_MDTS] = { 0 };
-	struct lipe_list_head list; /* list of jobs in execution */
-	struct lpurge_object_pack *lp, *tmp;
-	int total = 0;
-	int i, repeat;
-
-	LIPE_INIT_LIST_HEAD(&list);
-
-next:
-	repeat = 0;
-	for (i = 0; i < MAX_MDTS; i++) {
-		while (!lipe_list_empty(&pm_list[i])) {
-			pthread_t pid;
-			int rc;
+	size_t i;
+	int rc;
 
-			if (jobs[i] >= opt.o_max_jobs) {
-				repeat = 1;
-				break;
-			}
+	pthread_mutex_lock(&lpurge_work_lock);
+	lpurge_work_should_run = false;
+	pthread_cond_broadcast(&lpurge_work_cond);
+	pthread_mutex_unlock(&lpurge_work_lock);
 
-			lp = lipe_list_entry(pm_list[i].next,
-					struct lpurge_object_pack, lop_list);
-			lipe_list_del(&lp->lop_list);
-
-			/* now fork and execute */
-			rc = pthread_create(&pid, NULL, lpurge_spawn_one, lp);
-			if (rc == 0) {
-				stats.s_spawned++;
-				stats.s_purged = lp->lop_nr;
-
-				lp->lop_pid = pid;
-				lp->lop_mdt_idx = i;
-				lipe_list_add_tail(&lp->lop_list, &list);
-				jobs[i]++;
-				total++;
-			}
+	for (i = 0; i < lpurge_work_thread_count; i++) {
+		rc = pthread_join(lpurge_work_threads[i], NULL);
+		if (rc != 0) {
+			llapi_printf(LLAPI_MSG_FATAL,
+				     "cannot join work thread: %s\n",
+				     strerror(rc));
 		}
 	}
 
-wait:
-	assert(total > 0);
-	lp = NULL;
-	lipe_list_for_each_entry_safe(lp, tmp, &list, lop_list) {
-		int rc;
-		intptr_t retval;
+	lpurge_work_thread_count = 0;
 
-		rc = pthread_tryjoin_np(lp->lop_pid, (void **)&retval);
-		if (rc)
-			continue;
-		lipe_list_del(&lp->lop_list);
-		i = lp->lop_mdt_idx;
-		llapi_printf(LLAPI_MSG_DEBUG,
-			     "thread %lu MDT#%d completed: %"PRIdPTR"\n",
-			     lp->lop_pid, i, retval);
-		free(lp);
-		assert(jobs[i] > 0);
-		jobs[i]--;
-		total--;
-	}
-
-	if (repeat)
-		goto next;
-	if (total)
-		goto wait;
-	return 0;
-}
+	free(lpurge_work_threads);
+	lpurge_work_threads = NULL;
 
-/* round-robin distribution for unknown (w/o mappping, likely new) MDS */
-static int fid2idx_rr;
+	/* TODO: Free entries in lpurge_work_list. */
+}
 
-static int lpurge_fid2idx(const struct lu_fid *fid)
+static int lpurge_work_threads_start(size_t thread_count)
 {
-	int rc, idx;
+	int rc = 0;
 
-	/* XXX: local cache to avoid ioctl() for each FID? */
-	rc = llapi_get_mdt_index_by_fid(lustre_fd, fid, &idx);
-	if (rc < 0) {
-		llapi_printf(LLAPI_MSG_ERROR,
-				"can't lookup "DFID": rc=%d\n",
-				PFID(fid), rc);
-		return rc;
+	assert(thread_count > 0);
+
+	lpurge_work_should_run = true;
+
+	lpurge_work_threads = calloc(thread_count, sizeof(lpurge_work_threads[0]));
+	if (lpurge_work_threads == NULL) {
+		rc = -ENOMEM;
+		goto out;
 	}
 
-	if (idx >= mdsnr || !mds[idx].host) {
-		/*
-		 * no MDS registered for this index
-		 * but we can use any, it's just less efficient
-		 * due to over-network RPCs to open-close-modify.
-		 * so choose some MDS to send replica removal to
-		 */
-		do {
-			fid2idx_rr++;
-			if (fid2idx_rr == mdsnr)
-				fid2idx_rr = 0;
-		} while (mds[fid2idx_rr].host == NULL);
+	while (lpurge_work_thread_count < thread_count) {
+		rc = pthread_create(&lpurge_work_threads[lpurge_work_thread_count],
+				    NULL /* attr */,
+				    &lpurge_work_func,
+				    NULL /* data */);
+		if (rc != 0) {
+			llapi_printf(LLAPI_MSG_FATAL,
+				     "cannot create work thread: %s\n",
+				     strerror(rc));
+			rc = -rc;
+			goto out;
+		}
 
-		idx = fid2idx_rr;
+		lpurge_work_thread_count++;
 	}
+out:
+	if (rc < 0)
+		lpurge_work_threads_stop();
+
+	return rc;
+}
 
-	return idx;
+/* Unused. */
+void lpurge_work_submit(struct lpurge_object *lo)
+{
+	pthread_mutex_lock(&lpurge_work_lock);
+	assert(lpurge_work_should_run);
+	stats.s_queued++;
+	lipe_list_add_tail(&lo->lo_list, &lpurge_work_list);
+	pthread_cond_signal(&lpurge_work_cond);
+	pthread_mutex_unlock(&lpurge_work_lock);
 }
 
 void lpurge_purge_slot(struct lpurge_slot *ls, long long target)
 {
-	struct lpurge_object_pack *lp, *pm;
-	struct lipe_list_head pm_list[MAX_MDTS];
-	__u64 blocks[MAX_MDTS] = { 0 };
+	struct lpurge_object *lo;
 	__u64 total, was, kbfree;
 	int i, rc;
 
 	/* try to remove some replicas */
-
-	for (i = 0; i < MAX_MDTS; i++)
-		LIPE_INIT_LIST_HEAD(&pm_list[i]);
-
 again:
 	llapi_printf(LLAPI_MSG_DEBUG, "release upto %llu (expect %lu in %lu)\n",
 		     target, ls->ls_space, ls->ls_found);
 	total = 0;
-	/* take few FIDs */
+
 	assert(!lipe_list_empty(&ls->ls_obj_list));
-	lp = lipe_list_entry(ls->ls_obj_list.next, struct lpurge_object_pack,
-			     lop_list);
-	for (i = lp->lop_nr - 1; i >= 0; i--) {
-		struct lu_fid fid;
-		int idx;
 
-		lp->lop_nr--;
+	pthread_mutex_lock(&lpurge_work_lock);
+	assert(lpurge_work_should_run);
 
-		/* split by MDT index, we could do this at scanning, but that can
-		 * result in many useless lookups if only part of slots are subject
-		 * to removal
-		 */
-		fid = lp->lop_objs[i].lo_fid;
-		if (opt.o_iml_socket) {
-			rc = flist_add_json_fid(lflist, &fid);
-			if (rc < 0) {
-				llapi_printf(LLAPI_MSG_ERROR,
-					     "can't add "DFID": rc=%d\n",
-					     PFID(&fid), rc);
-				continue;
-			}
-		} else {
-			idx = lpurge_fid2idx(&fid);
-			if (idx < 0)
-				continue;
-			/* how many blocks we expect to free */
-			blocks[idx] += lp->lop_objs[i].lo_blocks;
-			total += lp->lop_objs[i].lo_blocks;
-			ls->ls_space -= lp->lop_objs[i].lo_blocks;
-			ls->ls_found--;
-			ls->ls_stored--;
-
-			/* put FID into per-MDT pack */
-			pm = NULL;
-			if (!lipe_list_empty(&pm_list[idx]))
-				pm = lipe_list_entry(pm_list[idx].next,
-						     struct lpurge_object_pack,
-						     lop_list);
-			if (!pm || pm->lop_nr >= pm->lop_max) {
-				pm = calloc(1, PM_SIZE);
-				if (!pm) {
-					llapi_printf(LLAPI_MSG_ERROR,
-							"can't alloca pack\n");
-					exit(1);
-				}
-				pm->lop_max = FIDS_PER_CALL;
-				pm->lop_nr = 0;
-				lipe_list_add(&pm->lop_list, &pm_list[idx]);
-			}
-			pm->lop_objs[pm->lop_nr] = lp->lop_objs[i];
-			pm->lop_nr++;
-		}
+	while (!lipe_list_empty(&ls->ls_obj_list)) {
+		lo = lipe_list_entry(ls->ls_obj_list.next, struct lpurge_object,
+				     lo_list);
+
+		/* how many blocks we expect to free */
+		total += lo->lo_blocks;
+		ls->ls_space -= lo->lo_blocks;
+		ls->ls_found--;
+		ls->ls_stored--;
+
+		lipe_list_move_tail(&lo->lo_list, &lpurge_work_list);
+		stats.s_queued++;
 
 		/* if current collection of objects may free target space, then stop */
 		if (total >= target)
 			break;
 	}
-	if (!lp->lop_nr) {
-		lipe_list_del(&lp->lop_list);
-		free(lp);
-	}
+
+	pthread_cond_broadcast(&lpurge_work_cond);
+	pthread_mutex_unlock(&lpurge_work_lock);
 
 	/* fork, start and wait for completion against each MDT */
 	/* give some time to OST_DESTROY to pass through */
@@ -1049,10 +1087,11 @@ again:
 
 	llapi_printf(LLAPI_MSG_DEBUG, "spawn, expect %llu back\n", total);
 
-	if (opt.o_iml_socket)
-		flist_write(lflist, true);
-	else
-		lpurge_spawn(pm_list);
+	/* Wait for purge threads to complete all submitted work. */
+	pthread_mutex_lock(&lpurge_work_lock);
+	while (stats.s_purged + stats.s_failed < stats.s_queued)
+		pthread_cond_wait(&lpurge_work_done, &lpurge_work_lock);
+	pthread_mutex_unlock(&lpurge_work_lock);
 
 	/* XXX: 20 is probably too short for ZFS as changes need to
 	 * get committed on MDS first, then sent to OST, then get
@@ -1245,7 +1284,7 @@ void parse_mountpoint(const char *name)
 	struct lu_fid fid;
 	int rc, idx;
 
-	lustre_fd = open(name, O_RDONLY/* | O_DIRECTORY*/);
+	lustre_fd = open(name, O_RDONLY);
 	if (lustre_fd < 0) {
 		llapi_printf(LLAPI_MSG_FATAL,
 			     "can't open %s: %d\n", name, errno);
@@ -1260,89 +1299,18 @@ void parse_mountpoint(const char *name)
 			     "%s isn't Lustre mountpoint: %d\n", name, rc);
 		exit(1);
 	}
-}
-
-void parse_mds(char *args)
-{
-	struct mds *m;
-	char *host, *mnt, *sidx;
-	int idx, rc;
-	int j;
-	char *endptr;
-
-	/* mds# hostname mountpoint */
-	sidx = strsep(&args, ":\n");
-	idx = strtol(sidx, &endptr, 10);
-	if (*endptr != '\0' || idx < 0) {
-		rc = -EINVAL;
-		llapi_error(LLAPI_MSG_FATAL, rc,
-			    "invalid MDS index: '%s'\n", sidx);
-		exit(1);
-	}
-
-	if (idx >= MAX_MDTS) {
-		rc = -EINVAL;
-		llapi_error(LLAPI_MSG_FATAL, rc, "too high MDS index\n");
-		exit(1);
-	}
 
-	m = &mds[idx];
-	if (m->host) {
-		llapi_printf(LLAPI_MSG_FATAL,
-			     "mds #%d is defined already\n", idx);
-		exit(1);
-	}
-	host = strsep(&args, ":\n");
-	mnt = strsep(&args, ":\n");
-	if (!host || !mnt) {
-		llapi_printf(LLAPI_MSG_FATAL,
-			     "no host or mntpt: %s\n", args);
+	open_by_fid_fd = openat(lustre_fd, ".lustre/fid", O_RDONLY);
+	if (open_by_fid_fd < 0) {
+		llapi_error(LLAPI_MSG_FATAL, -errno,
+			    "cannot open '%s/.lustre/fid'", name);
 		exit(1);
 	}
-
-	if (idx >= MAX_MDTS) {
-		llapi_printf(LLAPI_MSG_FATAL,
-			     "MDT index %u > max index %u: %s\n",
-			     idx, MAX_MDTS, args);
-		exit(1);
-	}
-
-	m->host = strdup(host);
-	m->mnt = strdup(mnt);
-
-	/* init ssh session to target mds */
-	pthread_mutex_init(&m->m_ssh_lock, NULL);
-	pthread_cond_init(&m->m_ssh_cond, NULL);
-	LIPE_INIT_LIST_HEAD(&m->m_ssh_list);
-
-	for (j = 0; j < opt.o_max_jobs; j++) {
-		struct mds_ssh_session *mss;
-
-		mss = calloc(1, sizeof(*mss));
-		if (mss == NULL) {
-			llapi_printf(LLAPI_MSG_FATAL,
-				"cannot allocate ssh session\n");
-			exit(1);
-		}
-
-		rc = lipe_init_ssh_session(&mss->mss_ctx, host);
-		if (rc != SSH_AUTH_SUCCESS) {
-			llapi_printf(LLAPI_MSG_FATAL,
-			     "cannot create ssh session for host %s: rc = %d\n",
-				host, rc);
-			exit(1);
-		}
-
-		lipe_list_add(&mss->mss_list, &m->m_ssh_list);
-	}
-
-	mdsnr++;
-	llapi_printf(LLAPI_MSG_DEBUG, "add mds #%d at %s@%s\n",
-		     idx, host, mnt);
 }
 
 #define LPURGE_INTERNAL_DUMP_FIDS	1
 #define LPURGE_OPT_IML_SOCKET		2
+#define LPURGE_OPT_VERSION 3
 
 static struct option options[] = {
 	{ "device", required_argument, NULL, 'D'},
@@ -1363,6 +1331,7 @@ static struct option options[] = {
 	{ "scan_rate", required_argument, NULL, 'R'},
 	{ "scan_threads", required_argument, NULL, 't'},
 	{ "slot_size", required_argument, NULL, 'S'},
+	{ "version", no_argument, NULL, LPURGE_OPT_VERSION},
 	{ NULL }
 };
 
@@ -1387,6 +1356,9 @@ void lpurge_process_opt(int c, char *optarg)
 	case LPURGE_INTERNAL_DUMP_FIDS:
 		opt.o_fids_dumpfile = strdup(optarg);
 		break;
+	case LPURGE_OPT_VERSION:
+		lipe_version();
+		exit(0);
 	case 'b':
 		llapi_msg_set_level(LLAPI_MSG_MAX);
 		break;
@@ -1441,7 +1413,7 @@ void lpurge_process_opt(int c, char *optarg)
 		opt.o_freelo = value;
 		break;
 	case 'm':
-		parse_mds(optarg);
+		/* parse_mds(optarg); */
 		break;
 	case 'M':
 		opt.o_mountpoint = strdup(optarg);
@@ -1565,7 +1537,6 @@ void load_config(char *name)
 void lpurge_verify_opts(void)
 {
 	char buf[1024];
-	int i;
 
 	if (!opt.o_pool) {
 		opt.o_pool = DEF_POOL;
@@ -1659,42 +1630,6 @@ void lpurge_verify_opts(void)
 		return;
 	}
 
-	// Stand-alone Operation Checks
-	if (mdsnr == 0) {
-		llapi_printf(LLAPI_MSG_FATAL,
-			     "mds is not configured\n");
-		exit(1);
-	}
-
-	for (i = 0; i < mdsnr; i++) {
-		int rc;
-		char cmd[PATH_MAX];
-
-		if (!mds[i].host) {
-			llapi_printf(LLAPI_MSG_FATAL,
-				     "MDS host is not configured\n");
-			exit(1);
-		}
-
-		if (!mds[i].mnt) {
-			llapi_error(LLAPI_MSG_FATAL, -EINVAL,
-				    "mountpoint on MDS host '%s' is not configured\n",
-				    mds[i].host);
-			exit(1);
-		}
-
-		/* check mountpoint on MDS */
-		snprintf(cmd, sizeof(cmd), "lfs path2fid %s > /dev/null 2>&1",
-			mds[i].mnt);
-		rc = lpurge_exec_cmd(&mds[i], cmd);
-		if (rc) {
-			llapi_error(LLAPI_MSG_FATAL, -EINVAL,
-				    "invalid mountpoint '%s' on MDS host: '%s'\n",
-				    mds[i].mnt, mds[i].host);
-			exit(1);
-		}
-	}
-
 	if (opt.o_max_jobs < 1 || opt.o_max_jobs > 1024) {
 		llapi_printf(LLAPI_MSG_FATAL,
 			     "invalid max_jobs: %d\n", opt.o_max_jobs);
@@ -1744,24 +1679,23 @@ void lpurge_usr1_handle(int sig)
 		return;
 	}
 	llapi_printf(LLAPI_MSG_DEBUG, "dump to %s\n", opt.o_dumpfile);
-	fprintf(f, "config:\n"
+	fprintf(f,
+		"version: %s-%s\n"
+		"revision: %s\n"
+		"config:\n"
 		"    free_high: %u\n"
 		"    free_low: %u\n"
 		"    ostname: %s\n"
 		"    mountpoint: %s\n"
 		"    pool: %s\n"
-		"    mds: ",
-		opt.o_freehi, opt.o_freelo,
-		opt.o_device, opt.o_mountpoint, opt.o_pool);
-	for (i = 0; i < mdsnr; i++)
-		fprintf(f, "%s%d:%s:%s", i ? "," : "", i,
-			mds[i].host, mds[i].mnt);
-	fprintf(f, "\n"
 		"    max_jobs: %u\n"
 		"    check_interval: %u\n"
 		"    scan_rate: %u\n"
 		"    scan_threads: %u\n"
 		"    slot_size: %u\n",
+		PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION,
+		opt.o_freehi, opt.o_freelo,
+		opt.o_device, opt.o_mountpoint, opt.o_pool,
 		opt.o_max_jobs, opt.o_interval, opt.o_scan_rate,
 		opt.o_scan_threads, opt.o_slot_size);
 	if (opt.o_iml_socket)
@@ -1771,11 +1705,14 @@ void lpurge_usr1_handle(int sig)
 		"    scans: %lu\n"
 		"    scan_time: %llu\n"
 		"    slow_scans: %lu\n"
-		"    spawned: %lu\n"
+		"    queued: %lu\n"
+		"    started: %lu\n"
 		"    purged: %lu\n"
+		"    failed: %lu\n"
 		"    low_hits: %lu\n",
 		stats.s_scans, stats.s_scan_time, stats.s_slow_scans,
-		stats.s_spawned, stats.s_purged, stats.s_low_hits);
+		stats.s_queued, stats.s_started, stats.s_purged, stats.s_failed,
+		stats.s_low_hits);
 	lpurge_kbfree(&kbfree);
 	fprintf(f, "space:\n"
 		"    kbfree: %llu\n"
@@ -1784,7 +1721,7 @@ void lpurge_usr1_handle(int sig)
 		kbfree, freelo, freehi);
 
 #define HIST_FMT \
-	"    hist%u: { age: %lu, found: %lu, space: %lu, stored: %lu }\n"
+	"    hist%u: { age: %lu, found: %lu, space: %lu, stored: %lu, nomirror_cnt: %lu, nomirror_space: %lu, nopfid_cnt: %lu, nopfid_space: %lu, notfirst_cnt: %lu, notfirst_space: %lu }\n"
 
 	fprintf(f, "hlists:\n");
 	for (i = LPURGE_HIST_MAX - 1; i >= 0; i--) {
@@ -1794,7 +1731,10 @@ void lpurge_usr1_handle(int sig)
 			continue;
 
 		fprintf(f, HIST_FMT, i, ls->ls_age, ls->ls_found,
-			ls->ls_space, ls->ls_stored);
+			ls->ls_space, ls->ls_stored,
+			ls->ls_nomirror_objs,ls->ls_nomirror_space,
+			ls->ls_nopfid_objs, ls->ls_nopfid_space,
+			ls->ls_notfirst_objs, ls->ls_notfirst_space);
 	}
 
 	fflush(f);
@@ -1924,17 +1864,13 @@ void lpurge_usr2_handle(int sig)
 	 * {"fid":"[0x200000401:0x6:0x0]", "last_use_time": 123456, "slot_id": $SLOT_ID}
 	 */
 	for (i = LPURGE_HIST_MAX - 1; i >= 0; i--) {
-		struct lpurge_object_pack *lp;
 		struct lpurge_slot *ls = lpurge_hist + i;
+		struct lpurge_object *lo;
 
 		if (ls->ls_found == 0)
 			continue;
 
-		lp = lipe_list_entry(ls->ls_obj_list.next,
-				     struct lpurge_object_pack,
-				     lop_list);
-
-		for (i = lp->lop_nr - 1; i >= 0; i--) {
+		lipe_list_for_each_entry(lo, &ls->ls_obj_list, lo_list) {
 			int rc;
 			char fid_buf[FID_LEN + 1];
 			struct lu_fid fid;
@@ -1945,13 +1881,13 @@ void lpurge_usr2_handle(int sig)
 
 			obj_stat = json_object_new_object();
 
-			fid = lp->lop_objs[i].lo_fid;
+			fid = lo->lo_fid;
 			snprintf(fid_buf, sizeof(fid_buf), DFID, PFID(&fid));
 			json_object_object_add(obj_stat, "fid",
 					       json_object_new_string(fid_buf));
 			json_object_object_add(obj_stat, "last_use_time",
 					       json_object_new_int64(
-					       lp->lop_objs[i].lo_last_utime));
+						       lo->lo_last_utime));
 			json_object_object_add(obj_stat, "slot_id",
 					       json_object_new_int(i));
 			output = json_object_to_json_string_ext(obj_stat,
@@ -2041,14 +1977,10 @@ static void lpurge_register_signal_handlers(void)
 
 int main(int argc, char **argv)
 {
-	ssh_threads_set_callbacks(ssh_threads_get_pthread());
-	ssh_init();
-
+	lipe_version_init();
 	setlinebuf(stdout);
 	setlinebuf(stderr);
 
-	memset(mds, 0, sizeof(mds));
-
 	llapi_msg_set_level(LLAPI_MSG_INFO);
 
 	signal(SIGUSR1, lpurge_null_handler);
@@ -2061,6 +1993,10 @@ int main(int argc, char **argv)
 	 * followed by the OST name ("lpurge lustre-OST0000"). */
 	llapi_set_command_name(ostname);
 
+	llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
+		    "version %s-%s, revision %s\n",
+		    PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION);
+
 	lpurge_get_ost_mntpt();
 	lpurge_configure_thresholds();
 
@@ -2070,6 +2006,8 @@ int main(int argc, char **argv)
 
 	lpurge_register_signal_handlers();
 
+	lpurge_work_threads_start(opt.o_max_jobs);
+
 	while (1) {
 		/* XXX: do lazy scanning in the background
 		 * before the real need, so that when
@@ -2091,4 +2029,9 @@ int main(int argc, char **argv)
 		/* device size can change runtime.. */
 		lpurge_configure_thresholds();
 	}
+
+	/* Unreached. */
+	lipe_version_fini();
+
+	return 0;
 }
diff --git a/src/lustre_ea.c b/src/lustre_ea.c
index a58525b..fd674bf 100644
--- a/src/lustre_ea.c
+++ b/src/lustre_ea.c
@@ -7,12 +7,12 @@
  *
  * Author: Li Xi <lixi@ddn.com>
  */
-#include <lustre/lustreapi.h>
+#include <errno.h>
 #include <fnmatch.h>
+#include <lustre/lustreapi.h>
 #include "fake_lustre_idl.h"
 #include "lustre_ea.h"
 #include "debug.h"
-#include "errno.h"
 
 int decode_linkea(char *buf, ssize_t size)
 {
diff --git a/version-gen.sh b/version-gen.sh
index 1ebad2d..d731ccb 100755
--- a/version-gen.sh
+++ b/version-gen.sh
@@ -1,13 +1,8 @@
 #!/bin/sh
 
-# Please update all the version references if VERSION changed.
-# lipe.spec.in
-# man/lipe_scan.1
-# pylipe/lipe_test.py
-
-VERSION="1.14"
-if [ -d .git ]; then
-	VERSION=$(git describe|sed 's/-[0-9]*-/./')
-fi
+VERSION="1.17"
+# if [ -d .git ]; then
+# 	VERSION=$(git describe|sed 's/-[0-9]*-/./')
+# fi
 
 printf "%s" "$VERSION"
-- 
1.8.3.1