From 2ad1df3c62663ce61e82856ebe4f576f751d617d Mon Sep 17 00:00:00 2001
From: braam <braam>
Date: Mon, 29 Apr 2002 20:36:26 +0000
Subject: [PATCH] - documentation update for MDS recovery - remove unused MGR_
 constants - remove rpc fallout from Andreas mergers - add last committed
 updates to close/reint - add handling of last committed to client file system
 - add replay handling for recovery to client fs & rpc - mark requests as
 completed and committed on the client to be agnostic of the ordering of
 these events - state machine for recovd - basics in place - last_committed
 and last_received moved into the lustre_msg from body - client cleanup is called
 when system cleans up - set transaction numbers properly on MDS - mds_connect
 call completed - obd interface for high availability new connection
 announcements

---
 lustre/include/linux/lustre_ha.h   |  13 +-
 lustre/include/linux/lustre_idl.h  |   7 +-
 lustre/include/linux/lustre_lite.h |   2 +-
 lustre/include/linux/lustre_net.h  |  55 +++++---
 lustre/include/linux/obd.h         |   5 +-
 lustre/include/linux/obd_support.h |   2 +
 lustre/ldlm/ldlm_lockd.c           |   6 +-
 lustre/llite/Makefile.am           |   2 +-
 lustre/llite/commit_callback.c     |   9 +-
 lustre/llite/file.c                |  13 +-
 lustre/llite/super.c               |   9 +-
 lustre/mdc/mdc_request.c           |  17 ++-
 lustre/mds/handler.c               |  37 +++---
 lustre/mds/mds_reint.c             |   4 +-
 lustre/osc/osc_request.c           |   8 +-
 lustre/ptlrpc/Makefile.am          |   2 +-
 lustre/ptlrpc/client.c             | 149 ++++++++++++++++++++--
 lustre/ptlrpc/niobuf.c             |  12 +-
 lustre/ptlrpc/recovd.c             | 253 +++++++++++--------------------------
 lustre/ptlrpc/rpc.c                |  25 +++-
 lustre/tests/llmount.sh            |  15 +--
 lustre/tests/openunlink.c          |   2 +-
 lustre/tests/runfailure-mds        |  25 ++--
 lustre/utils/ha_assist.sh          |   9 ++
 lustre/utils/obdctl.c              |  20 +++
 25 files changed, 402 insertions(+), 299 deletions(-)
diff --git a/lustre/include/linux/lustre_ha.h b/lustre/include/linux/lustre_ha.h
index 4701457..6598be0 100644
--- a/lustre/include/linux/lustre_ha.h
+++ b/lustre/include/linux/lustre_ha.h
@@ -7,14 +7,6 @@
 
 #include <linux/lustre_net.h>
 
-#define MGR_STOPPING   1
-#define MGR_RUNNING    2
-#define MGR_STOPPED    4
-#define MGR_KILLED     8
-#define MGR_EVENT      16
-#define MGR_WORKING    32
-#define MGR_SIGNAL     64
-
 #define LUSTRE_HA_NAME "ptlrpc"
 
 #define CONNMGR_CONNECT 1
@@ -36,8 +28,9 @@ struct connmgr_body {
 int connmgr_connect(struct recovd_obd *mgr, struct ptlrpc_connection *conn);
 int connmgr_handle(struct obd_device *dev, struct ptlrpc_service *svc,
                    struct ptlrpc_request *req);
-void connmgr_cli_fail(struct ptlrpc_client *cli);
-void connmgr_cli_manage(struct recovd_obd *mgr, struct ptlrpc_client *cli);
+void recovd_cli_fail(struct ptlrpc_client *cli);
+void recovd_cli_manage(struct recovd_obd *mgr, struct ptlrpc_client *cli);
+void recovd_cli_fixed(struct ptlrpc_client *cli);
 int recovd_setup(struct recovd_obd *mgr);
 int recovd_cleanup(struct recovd_obd *mgr);
 
diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h
index 58a4f60..1c9890f 100644
--- a/lustre/include/linux/lustre_idl.h
+++ b/lustre/include/linux/lustre_idl.h
@@ -51,6 +51,9 @@ struct lustre_msg {
 
         __u32 opc;
         __u32 xid;
+        __u64 last_rcvd;
+        __u64 last_committed;
+        __u64 transno;
         __u32 status;
         __u32 type;
         __u32   connid;
@@ -210,8 +213,6 @@ struct mds_body {
         __u32          nlink;
         __u32          generation;
         __u32          last_xid;
-        __u64          last_committed;
-        __u64          last_rcvd;
 };
 
 /* MDS update records */
@@ -555,6 +556,8 @@ static inline int obd_ioctl_getdata(char *buf, char *end, void *arg)
 #define OBD_IOC_NAME2DEV               _IOWR('f', 29, long)
 #define OBD_IOC_NEWDEV                 _IOWR('f', 30, long)
 
+#define OBD_RECOVD_NEWCONN             _IOWR('f', 31, long)
+
 #define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 32      )
 
 #endif
diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h
index dbbdda4..be70d95 100644
--- a/lustre/include/linux/lustre_lite.h
+++ b/lustre/include/linux/lustre_lite.h
@@ -25,6 +25,7 @@
 extern kmem_cache_t *ll_file_data_slab;
 struct ll_file_data { 
         __u64 fd_mdshandle; 
+        struct ptlrpc_request *fd_req; 
 };
 
 #define LL_INLINESZ      60
@@ -48,7 +49,6 @@ struct ll_sb_info {
         struct ptlrpc_client      ll_ost_client;
         struct ptlrpc_connection *ll_ost_conn;
 
-        struct list_head          ll_commitcbd_not_committed;
         wait_queue_head_t         ll_commitcbd_waitq;
         wait_queue_head_t         ll_commitcbd_ctl_waitq;
         int                       ll_commitcbd_flags;
diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h
index 1bc7c49..5914728 100644
--- a/lustre/include/linux/lustre_net.h
+++ b/lustre/include/linux/lustre_net.h
@@ -56,14 +56,20 @@
 /* default rpc ring length */
 #define RPC_RING_LENGTH    2
 
-#define SVC_STOPPING 1
-#define SVC_RUNNING  2
-#define SVC_STOPPED  4
-#define SVC_KILLED   8
-#define SVC_EVENT    16
-#define SVC_HA_EVENT 32
-#define SVC_SIGNAL   64
-
+#define SVC_KILLED 1
+#define SVC_EVENT  2
+#define SVC_SIGNAL 4
+#define SVC_RUNNING 8
+#define SVC_STOPPING 16
+#define SVC_STOPPED  32
+
+#define RECOVD_STOPPING      1     /* how cleanup tells recovd to quit */
+#define RECOVD_IDLE          2     /* normal state */
+#define RECOVD_STOPPED       4     /* after recovd has stopped */
+#define RECOVD_FAIL          8     /* RPC timeout: wakeup recovd, sets flag */
+#define RECOVD_TIMEOUT       16    /* set when recovd detects a timeout */
+#define RECOVD_UPCALL_WAIT   32    /* an upcall has been placed */
+#define RECOVD_UPCALL_ANSWER 64    /* an upcall has been answered */
 
 #define LUSTRE_CONN_NEW    1
 #define LUSTRE_CONN_CON    2
@@ -95,13 +101,18 @@ struct ptlrpc_client {
         struct obd_device *cli_obd;
         __u32 cli_request_portal;
         __u32 cli_reply_portal;
+        __u64 cli_last_rcvd;
+        __u64 cli_last_committed;
 
         struct semaphore cli_rpc_sem; /* limits outstanding requests */
 
         spinlock_t cli_lock; /* protects lists */
         struct list_head cli_sending_head;
         struct list_head cli_sent_head;
+        struct list_head cli_replied_head;
+        struct list_head cli_replay_head;
         struct list_head cli_ha_item; 
+        void (*cli_recover)(struct ptlrpc_client *); 
 
         struct recovd_obd *cli_recovd;
 };
@@ -111,13 +122,17 @@ struct ptlrpc_client {
 #define PTL_RPC_TYPE_REPLY   3
 
 /* state flags of requests */
-#define PTL_RPC_FL_INTR    1
-#define PTL_RPC_FL_REPLY   2
-#define PTL_RPC_FL_SENT    4
-#define PTL_BULK_FL_SENT   8
-#define PTL_BULK_FL_RCVD   16
-#define PTL_RPC_FL_ERR     32
-#define PTL_RPC_FL_TIMEOUT 64
+#define PTL_RPC_FL_INTR      (1 << 0)
+#define PTL_RPC_FL_REPLY     (1 << 1)
+#define PTL_RPC_FL_SENT      (1 << 2)
+#define PTL_BULK_FL_SENT     (1 << 3)
+#define PTL_BULK_FL_RCVD     (1 << 4)
+#define PTL_RPC_FL_ERR       (1 << 5)
+#define PTL_RPC_FL_TIMEOUT   (1 << 6)
+#define PTL_RPC_FL_RESEND    (1 << 7)
+#define PTL_RPC_FL_COMMITTED (1 << 8)
+#define PTL_RPC_FL_FINISHED  (1 << 9)
+#define PTL_RPC_FL_RETAIN    (1 << 10)
 
 struct ptlrpc_request { 
         int rq_type; /* one of PTL_RPC_REQUEST, PTL_RPC_REPLY, PTL_RPC_BULK */
@@ -127,17 +142,20 @@ struct ptlrpc_request {
         int rq_status;
         int rq_flags; 
         __u32 rq_connid;
+        atomic_t rq_refcount;
 
         int rq_reqlen;
         struct lustre_msg *rq_reqmsg;
 
         int rq_replen;
         struct lustre_msg *rq_repmsg;
+        __u64 rq_transno;
 
         char *rq_bulkbuf;
         int rq_bulklen;
 
         time_t rq_time;
+        time_t rq_timeout;
         //        void * rq_reply_handle;
         wait_queue_head_t rq_wait_for_rep;
 
@@ -225,12 +243,16 @@ int ptlrpc_register_bulk(struct ptlrpc_bulk_desc *);
 int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
 int ptlrpc_reply(struct ptlrpc_service *svc, struct ptlrpc_request *req);
 int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
 int ptl_send_rpc(struct ptlrpc_request *request);
 void ptlrpc_link_svc_me(struct ptlrpc_service *service, int i);
 
 /* rpc/client.c */
-void ptlrpc_init_client(struct recovd_obd *, int req_portal, int rep_portal,
+void ptlrpc_init_client(struct recovd_obd *, 
+                        void (*recover)(struct ptlrpc_client *),
+                        int req_portal, int rep_portal,
                         struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct ptlrpc_client *cli);
 __u8 *ptlrpc_req_to_uuid(struct ptlrpc_request *req);
 struct ptlrpc_connection *ptlrpc_uuid_to_connection(char *uuid);
 int ptlrpc_queue_wait(struct ptlrpc_request *req);
@@ -239,6 +261,7 @@ struct ptlrpc_request *ptlrpc_prep_req(struct ptlrpc_client *cl,
                                        int count, int *lengths, char **bufs);
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
 void ptlrpc_free_req(struct ptlrpc_request *request);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *);
 int ptlrpc_check_status(struct ptlrpc_request *req, int err);
 
diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h
index b4e07c1..36f3162 100644
--- a/lustre/include/linux/obd.h
+++ b/lustre/include/linux/obd.h
@@ -112,9 +112,10 @@ struct recovd_obd {
         struct ptlrpc_service *recovd_service;
         struct ptlrpc_client  *recovd_client;
         __u32                  recovd_flags; 
+        __u32                  recovd_wakeup_flag; 
         spinlock_t             recovd_lock;
-        struct list_head      recovd_connections_lh; /* connections managed by the mgr */
-        struct list_head      recovd_troubled_lh; /* connections in trouble */
+        struct list_head      recovd_clients_lh; /* clients managed  */
+        struct list_head      recovd_troubled_lh; /* clients in trouble */
         wait_queue_head_t     recovd_recovery_waitq;
         wait_queue_head_t     recovd_ctl_waitq;
         wait_queue_head_t     recovd_waitq;
diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h
index ac82272..209e3b9 100644
--- a/lustre/include/linux/obd_support.h
+++ b/lustre/include/linux/obd_support.h
@@ -94,6 +94,8 @@ do {                                                                         \
         if (OBD_FAIL_CHECK(id)) {                                            \
                 CERROR("obd_fail_loc=%x, fail operation rc=%d\n", id, ret);  \
                 obd_fail_loc |= OBD_FAILED;                                  \
+                if ((id) & OBD_FAIL_ONCE)                                    \
+                        obd_fail_loc |= OBD_FAIL_ONCE;                       \
                 RETURN(ret);                                                 \
         }                                                                    \
 } while(0)
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index 72a0622..fdf556b 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -248,7 +248,8 @@ static int ldlm_iocontrol(int cmd, struct obd_conn *conn, int len, void *karg,
                 RETURN(-EINVAL);
         }
 
-        ptlrpc_init_client(NULL, LDLM_REQUEST_PORTAL, LDLM_REPLY_PORTAL,
+        ptlrpc_init_client(NULL, NULL,
+                           LDLM_REQUEST_PORTAL, LDLM_REPLY_PORTAL,
                            obddev->u.ldlm.ldlm_client);
         connection = ptlrpc_uuid_to_connection("ldlm");
         if (!connection)
@@ -293,7 +294,8 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *data)
         OBD_ALLOC(ldlm->ldlm_client, sizeof(*ldlm->ldlm_client));
         if (ldlm->ldlm_client == NULL)
                 LBUG();
-        ptlrpc_init_client(NULL, LDLM_REQUEST_PORTAL, LDLM_REPLY_PORTAL,
+        ptlrpc_init_client(NULL, NULL,
+                           LDLM_REQUEST_PORTAL, LDLM_REPLY_PORTAL,
                            ldlm->ldlm_client);
 
         MOD_INC_USE_COUNT;
diff --git a/lustre/llite/Makefile.am b/lustre/llite/Makefile.am
index 4ce382f8..defc532 100644
--- a/lustre/llite/Makefile.am
+++ b/lustre/llite/Makefile.am
@@ -14,7 +14,7 @@ page.c:
 
 
 LINX=page.c
-llite_SOURCES =  commit_callback.c page.c super.c rw.c file.c dir.c sysctl.c namei.c symlink.c
+llite_SOURCES =  recover.c commit_callback.c page.c super.c rw.c file.c dir.c sysctl.c namei.c symlink.c
 
 dist-hook:
 	list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done
diff --git a/lustre/llite/commit_callback.c b/lustre/llite/commit_callback.c
index 17b25b8..fed2b3d 100644
--- a/lustre/llite/commit_callback.c
+++ b/lustre/llite/commit_callback.c
@@ -44,9 +44,6 @@ static int ll_commitcbd_check_event(struct ll_sb_info *sbi)
                 GOTO(out, rc = 1);
         }
 
-        if (!list_empty(&sbi->ll_commitcbd_not_committed))
-                GOTO(out, rc = 1);
-
  out:
         spin_unlock(&sbi->ll_commitcbd_lock);
         RETURN(rc);
@@ -83,10 +80,12 @@ static int ll_commitcbd_main(void *arg)
                 spin_lock(&sbi->ll_commitcbd_lock);
                 if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING) {
                         spin_unlock(&sbi->ll_commitcbd_lock);
-                        CERROR("lustre_hamgr quitting\n"); 
+                        CERROR("lustre_commitd quitting\n"); 
                         EXIT;
                         break;
                 }
+                if (!list_empty(&sbi->ll_mds_client.cli_replied_head))
+                        CERROR("** clean up committed reqs here **\n"); 
 
                 schedule_timeout(sbi->ll_commitcbd_timeout);
                 CERROR("commit callback daemon woken up - FIXME\n"); 
@@ -100,6 +99,8 @@ static int ll_commitcbd_main(void *arg)
         RETURN(0);
 }
 
+
+
 int ll_commitcbd_setup(struct ll_sb_info *sbi)
 {
         int rc;
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index 9bcfcaa..672b38c 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -65,15 +65,16 @@ static int ll_file_open(struct inode *inode, struct file *file)
 
         rc = mdc_open(&sbi->ll_mds_client, sbi->ll_mds_conn, inode->i_ino,
                       S_IFREG, file->f_flags, &fd->fd_mdshandle, &req); 
-        if (!fd->fd_mdshandle)
-                CERROR("mdc_open didn't assign fd_mdshandle\n");
-
-        ptlrpc_free_req(req);
+        fd->fd_req = req;
+        ptlrpc_req_finished(req);
         if (rc) {
                 if (rc > 0) 
                         rc = -rc;
                 GOTO(out, rc);
         }
+        if (!fd->fd_mdshandle)
+                CERROR("mdc_open didn't assign fd_mdshandle\n");
+        
 
         oa = ll_oa_from_inode(inode, (OBD_MD_FLMODE | OBD_MD_FLID));
         if (oa == NULL)
@@ -137,12 +138,14 @@ static int ll_file_release(struct inode *inode, struct file *file)
 
         rc = mdc_close(&sbi->ll_mds_client, sbi->ll_mds_conn, inode->i_ino,
                        S_IFREG, fd->fd_mdshandle, &req);
-        ptlrpc_free_req(req);
+        ptlrpc_req_finished(req);
         if (rc) { 
                 if (rc > 0) 
                         rc = -rc;
                 GOTO(out, rc);
         }
+        ptlrpc_free_req(fd->fd_req);
+
         EXIT; 
 
  out:
diff --git a/lustre/llite/super.c b/lustre/llite/super.c
index 7ff3176..21b2556 100644
--- a/lustre/llite/super.c
+++ b/lustre/llite/super.c
@@ -26,6 +26,8 @@ kmem_cache_t *ll_file_data_slab;
 extern struct address_space_operations ll_aops;
 extern struct address_space_operations ll_dir_aops;
 struct super_operations ll_super_operations;
+
+extern void ll_recover(struct ptlrpc_client *);
 extern int ll_commitcbd_setup(struct ll_sb_info *);
 extern int ll_commitcbd_cleanup(struct ll_sb_info *);
 
@@ -119,7 +121,8 @@ static struct super_block * ll_read_super(struct super_block *sb,
                 GOTO(out_free, sb = NULL);
         }
 
-        ptlrpc_init_client(ptlrpc_connmgr, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
+        ptlrpc_init_client(ptlrpc_connmgr, ll_recover,
+                           MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
                            &sbi->ll_mds_client);
 
         sbi->ll_mds_conn = ptlrpc_uuid_to_connection("mds");
@@ -164,7 +167,6 @@ static struct super_block * ll_read_super(struct super_block *sb,
         }
 
         /* initialize committed transaction callback daemon */
-        INIT_LIST_HEAD(&sbi->ll_commitcbd_not_committed);
         spin_lock_init(&sbi->ll_commitcbd_lock); 
         init_waitqueue_head(&sbi->ll_commitcbd_waitq);
         init_waitqueue_head(&sbi->ll_commitcbd_ctl_waitq);
@@ -208,6 +210,7 @@ static void ll_put_super(struct super_block *sb)
         ll_commitcbd_cleanup(sbi);
         obd_disconnect(&sbi->ll_conn);
         ptlrpc_put_connection(sbi->ll_mds_conn);
+        ptlrpc_cleanup_client(&sbi->ll_mds_client);
         OBD_FREE(sb->u.generic_sbp, sizeof(*sbi));
         MOD_DEC_USE_COUNT;
         EXIT;
@@ -283,7 +286,7 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
         if (err)
                 CERROR("mdc_setattr fails (%d)\n", err);
 
-        ptlrpc_free_req(request);
+        ptlrpc_req_finished(request);
 
         RETURN(err);
 }
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 029646f..2e18b6d 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -58,15 +58,15 @@ int mdc_connect(struct ptlrpc_client *cl, struct ptlrpc_connection *conn,
                 mds_unpack_rep_body(req);
                 body = lustre_msg_buf(req->rq_repmsg, 0);
                 memcpy(rootfid, &body->fid1, sizeof(*rootfid));
-                *last_committed = body->last_committed;
-                *last_rcvd = body->last_rcvd;
+                *last_committed = req->rq_repmsg->last_committed;
+                *last_rcvd = req->rq_repmsg->last_rcvd;
                 *last_xid = body->last_xid;
 
-                CDEBUG(D_NET, "root ino=%ld, last_committed=%ld, last_rcvd=%ld,"
+                CDEBUG(D_NET, "root ino=%ld, last_committed=%Lu, last_rcvd=%Lu,"
                        " last_xid=%d\n",
                        (unsigned long)rootfid->id,
-                       (unsigned long)body->last_committed,
-                       (unsigned long)body->last_rcvd,
+                       (unsigned long long)*last_committed,
+                       (unsigned long long)*last_rcvd,
                        body->last_xid);
         }
 
@@ -123,6 +123,7 @@ int mdc_open(struct ptlrpc_client *cl, struct ptlrpc_connection *conn,
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
+        req->rq_flags |= PTL_RPC_FL_RETAIN;
         body = lustre_msg_buf(req->rq_reqmsg, 0);
         ll_ino2fid(&body->fid1, ino, 0, type);
         body->flags = HTON__u32(flags);
@@ -159,7 +160,7 @@ int mdc_close(struct ptlrpc_client *cl, struct ptlrpc_connection *conn,
         ll_ino2fid(&body->fid1, ino, 0, type);
         body->objid = fh;
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(0, NULL);
 
         rc = ptlrpc_queue_wait(req);
         rc = ptlrpc_check_status(req, rc);
@@ -251,7 +252,8 @@ static int request_ioctl(struct inode *inode, struct file *file,
                 RETURN(-EINVAL);
         }
 
-        ptlrpc_init_client(NULL, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, &cl);
+        ptlrpc_init_client(NULL, NULL, 
+                           MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, &cl);
         conn = ptlrpc_uuid_to_connection("mds");
         if (!conn) {
                 CERROR("cannot create client\n");
@@ -345,6 +347,7 @@ static int request_ioctl(struct inode *inode, struct file *file,
  out:
         ptlrpc_free_req(request);
         ptlrpc_put_connection(conn);
+        ptlrpc_cleanup_client(&cl);
 
         RETURN(err);
 }
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index 4af1f85..167aae4 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -313,18 +313,7 @@ int mds_connect(struct ptlrpc_request *req)
                 CDEBUG(D_INFO, "found existing data for UUID '%s' at #%d\n",
                        mcd->mcd_uuid, mci->mci_off);
         }
-        /* Still not 100% sure whether we should reply with the server
-         * last_rcvd or that of this client.  I'm not sure it even makes
-         * a difference on a per-client basis, because last_rcvd is global
-         * and we are not supposed to allow transactions while in recovery.
-         */
-        body->last_xid = le32_to_cpu(mcd->mcd_last_xid);
-        body->last_rcvd = le64_to_cpu(mcd->mcd_last_rcvd);
-        //body->last_rcvd = mds->mds_last_rcvd;
-        body->last_committed = mds->mds_last_committed;
-        CDEBUG(D_INFO, "last_rcvd %ld, last_committed %ld, last_xid %d\n",
-               (unsigned long)body->last_rcvd,
-               (unsigned long)body->last_committed, body->last_xid);
+        body->last_xid = HTON__u32(mcd->mcd_last_xid);
         mds_pack_rep_body(req);
         RETURN(0);
 }
@@ -365,7 +354,6 @@ int mds_getattr(struct ptlrpc_request *req)
         body->mode = inode->i_mode;
         body->nlink = inode->i_nlink;
         body->valid = ~0;
-        body->last_committed = mds->mds_last_committed;
         mds_fs_get_objid(mds, inode, &body->objid);
         l_dput(de);
         RETURN(0);
@@ -404,7 +392,6 @@ int mds_open(struct ptlrpc_request *req)
 
         body = lustre_msg_buf(req->rq_repmsg, 0);
         body->objid = (__u64) (unsigned long)file;
-        body->last_committed = mds->mds_last_committed;
         RETURN(0);
 }
 
@@ -433,6 +420,7 @@ int mds_close(struct ptlrpc_request *req)
 
         file = (struct file *)(unsigned long)body->objid;
         req->rq_status = filp_close(file, 0);
+
         l_dput(de);
         mntput(mnt);
 
@@ -508,6 +496,7 @@ int mds_reint(struct ptlrpc_request *req)
 int mds_handle(struct obd_device *dev, struct ptlrpc_service *svc,
                struct ptlrpc_request *req)
 {
+        struct mds_obd *mds = &req->rq_obd->u.mds;
         int rc;
         ENTRY;
 
@@ -570,6 +559,16 @@ int mds_handle(struct obd_device *dev, struct ptlrpc_service *svc,
 
         EXIT;
 out:
+        /* Still not 100% sure whether we should reply with the server
+         * last_rcvd or that of this client.  I'm not sure it even makes
+         * a difference on a per-client basis, because last_rcvd is global
+         * and we are not supposed to allow transactions while in recovery.
+         */
+        req->rq_repmsg->last_rcvd = HTON__u64(mds->mds_last_rcvd);
+        req->rq_repmsg->last_committed = HTON__u64(mds->mds_last_committed);
+        CDEBUG(D_INFO, "last_rcvd %Lu, last_committed %Lu\n",
+               (unsigned long long)mds->mds_last_rcvd,
+               (unsigned long long)mds->mds_last_committed);
         if (rc) {
                 ptlrpc_error(svc, req);
         } else {
@@ -624,12 +623,12 @@ int mds_read_last_rcvd(struct mds_obd *mds, struct file *f)
          */
         last_rcvd = le64_to_cpu(msd->msd_last_rcvd);
         mds->mds_last_rcvd = last_rcvd;
-        CDEBUG(D_INODE, "got %Ld for server last_rcvd value\n",
+        CDEBUG(D_INODE, "got %Lu for server last_rcvd value\n",
                (unsigned long long)last_rcvd);
 
         last_mount = le64_to_cpu(msd->msd_mount_count);
         mds->mds_mount_count = last_mount;
-        CDEBUG(D_INODE, "got %Ld for server last_mount value\n",
+        CDEBUG(D_INODE, "got %Lu for server last_mount value\n",
                (unsigned long long)last_mount);
 
         for (off = MDS_LR_CLIENT, cl_off = 0, rc = sizeof(*mcd);
@@ -668,12 +667,12 @@ int mds_read_last_rcvd(struct mds_obd *mds, struct file *f)
 
                 if (last_rcvd > mds->mds_last_rcvd) {
                         CDEBUG(D_OTHER,
-                               "client at offset %d has last_rcvd = %Ld\n",
+                               "client at offset %d has last_rcvd = %Lu\n",
                                cl_off, (unsigned long long)last_rcvd);
                         mds->mds_last_rcvd = last_rcvd;
                 }
         }
-        CDEBUG(D_INODE, "got %Ld for highest last_rcvd value, %d clients\n",
+        CDEBUG(D_INODE, "got %Lu for highest last_rcvd value, %d clients\n",
                (unsigned long long)mds->mds_last_rcvd, mds->mds_client_count);
 
         /* After recovery, there can be no local uncommitted transactions */
@@ -824,7 +823,7 @@ int mds_update_server_data(struct mds_obd *mds)
         msd->msd_last_rcvd = cpu_to_le64(mds->mds_last_rcvd);
         msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count);
 
-        CDEBUG(D_SUPER, "MDS mount_count is %Ld, last_rcvd is %Ld\n",
+        CDEBUG(D_SUPER, "MDS mount_count is %Lu, last_rcvd is %Lu\n",
                (unsigned long long)mds->mds_mount_count,
                (unsigned long long)mds->mds_last_rcvd);
         push_ctxt(&saved, &mds->mds_ctxt);
diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c
index 3255b7e..79ed8a2 100644
--- a/lustre/mds/mds_reint.c
+++ b/lustre/mds/mds_reint.c
@@ -77,6 +77,7 @@ int mds_update_last_rcvd(struct mds_obd *mds, void *handle,
         off = MDS_LR_CLIENT + mci->mci_off * MDS_LR_SIZE;
 
         ++mds->mds_last_rcvd;   /* lock this, or make it an LDLM function? */
+        req->rq_repmsg->transno = HTON__u64(mds->mds_last_rcvd);
         mci->mci_mcd->mcd_last_rcvd = cpu_to_le64(mds->mds_last_rcvd);
         mci->mci_mcd->mcd_mount_count = cpu_to_le64(mds->mds_mount_count);
         mci->mci_mcd->mcd_last_xid = cpu_to_le32(req->rq_reqmsg->xid);
@@ -247,8 +248,6 @@ static int mds_reint_create(struct mds_update_record *rec,
                 body = lustre_msg_buf(req->rq_repmsg, 0);
                 body->ino = inode->i_ino;
                 body->generation = inode->i_generation;
-                body->last_rcvd = mds->mds_last_rcvd;
-                body->last_committed = mds->mds_last_committed;
         }
 
 out_create_commit:
@@ -499,5 +498,6 @@ int mds_reint_rec(struct mds_update_record *rec, struct ptlrpc_request *req)
         }
 
         rc = reinters[rec->ur_opcode](rec, req);
+
         return rc;
 }
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 6135db2..7b33e6c 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -456,7 +456,7 @@ int osc_brw_read(struct obd_conn *conn, obd_count num_oa, struct obdo **oa,
                         if (bulk[pages] == NULL)
                                 continue;
                         kunmap(buf[pages]);
-                        OBD_FREE(bulk[pages], sizeof(**bulk));
+                        ptlrpc_free_bulk(bulk[pages]);
                 }
         }
 
@@ -572,9 +572,9 @@ static int osc_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (osc->osc_ldlm_client == NULL)
                 GOTO(out_client, rc = -ENOMEM);
 
-        ptlrpc_init_client(NULL, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
+        ptlrpc_init_client(NULL, NULL, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
                            osc->osc_client);
-        ptlrpc_init_client(NULL, LDLM_REQUEST_PORTAL, LDLM_REPLY_PORTAL,
+        ptlrpc_init_client(NULL, NULL, LDLM_REQUEST_PORTAL, LDLM_REPLY_PORTAL,
                            osc->osc_ldlm_client);
 
         MOD_INC_USE_COUNT;
@@ -591,7 +591,9 @@ static int osc_cleanup(struct obd_device * obddev)
 {
         struct osc_obd *osc = &obddev->u.osc;
 
+        ptlrpc_cleanup_client(osc->osc_client);
         OBD_FREE(osc->osc_client, sizeof(*osc->osc_client));
+        ptlrpc_cleanup_client(osc->osc_ldlm_client);
         OBD_FREE(osc->osc_ldlm_client, sizeof(*osc->osc_ldlm_client));
         ptlrpc_put_connection(osc->osc_conn);
 
diff --git a/lustre/ptlrpc/Makefile.am b/lustre/ptlrpc/Makefile.am
index f28910f..e98189e 100644
--- a/lustre/ptlrpc/Makefile.am
+++ b/lustre/ptlrpc/Makefile.am
@@ -9,6 +9,6 @@ MODULE = ptlrpc
 modulefs_DATA = ptlrpc.o
 EXTRA_PROGRAMS = ptlrpc
 
-ptlrpc_SOURCES = recovd.c connection.c rpc.c events.c service.c client.c niobuf.c pack_generic.c
+ptlrpc_SOURCES = connmgr.c recovd.c connection.c rpc.c events.c service.c client.c niobuf.c pack_generic.c
 
 include $(top_srcdir)/Rules
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 02f3925..e462b56 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -26,18 +26,23 @@
 
 #include <linux/lustre_ha.h>
 
-void ptlrpc_init_client(struct recovd_obd *recovd, int req_portal,
+void ptlrpc_init_client(struct recovd_obd *recovd, 
+                        void (*recover)(struct ptlrpc_client *recover),
+                        int req_portal,
                         int rep_portal, struct ptlrpc_client *cl)
 {
         memset(cl, 0, sizeof(*cl));
         cl->cli_recovd = recovd;
+        cl->cli_recover = recover;
         if (recovd)
-                connmgr_cli_manage(recovd, cl);
+                recovd_cli_manage(recovd, cl);
         cl->cli_obd = NULL;
         cl->cli_request_portal = req_portal;
         cl->cli_reply_portal = rep_portal;
         INIT_LIST_HEAD(&cl->cli_sending_head);
         INIT_LIST_HEAD(&cl->cli_sent_head);
+        INIT_LIST_HEAD(&cl->cli_replied_head);
+        INIT_LIST_HEAD(&cl->cli_replay_head);
         spin_lock_init(&cl->cli_lock);
         sema_init(&cl->cli_rpc_sem, 32);
 }
@@ -81,12 +86,16 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *conn)
 
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk)
 {
-        if (bulk == NULL)
+        ENTRY;
+        if (bulk == NULL) {
+                EXIT;
                 return;
+        }
 
         ptlrpc_put_connection(bulk->b_connection);
 
         OBD_FREE(bulk, sizeof(*bulk));
+        EXIT;
 }
 
 struct ptlrpc_request *ptlrpc_prep_req(struct ptlrpc_client *cl,
@@ -111,7 +120,6 @@ struct ptlrpc_request *ptlrpc_prep_req(struct ptlrpc_client *cl,
                 RETURN(NULL);
         }
 
-        request->rq_time = CURRENT_TIME;
         request->rq_type = PTL_RPC_TYPE_REQUEST;
         request->rq_connection = ptlrpc_connection_addref(conn);
 
@@ -121,6 +129,9 @@ struct ptlrpc_request *ptlrpc_prep_req(struct ptlrpc_client *cl,
         request->rq_reqmsg->type = HTON__u32(PTL_RPC_MSG_REQUEST);
         INIT_LIST_HEAD(&request->rq_list);
 
+        /* this will be dec()d once in req_finished, once in free_committed */
+        atomic_set(&request->rq_refcount, 2);
+
         spin_lock(&conn->c_lock);
         request->rq_reqmsg->xid = HTON__u32(++conn->c_xid_out);
         spin_unlock(&conn->c_lock);
@@ -130,6 +141,20 @@ struct ptlrpc_request *ptlrpc_prep_req(struct ptlrpc_client *cl,
         RETURN(request);
 }
 
+void ptlrpc_req_finished(struct ptlrpc_request *request)
+{
+        if (request == NULL)
+                return;
+
+        if (request->rq_repmsg != NULL) { 
+                OBD_FREE(request->rq_repmsg, request->rq_replen);
+                request->rq_repmsg = NULL;
+        }
+
+        if (atomic_dec_and_test(&request->rq_refcount))
+                ptlrpc_free_req(request);
+}
+
 void ptlrpc_free_req(struct ptlrpc_request *request)
 {
         if (request == NULL)
@@ -137,6 +162,8 @@ void ptlrpc_free_req(struct ptlrpc_request *request)
 
         if (request->rq_repmsg != NULL)
                 OBD_FREE(request->rq_repmsg, request->rq_replen);
+        if (request->rq_reqmsg != NULL)
+                OBD_FREE(request->rq_reqmsg, request->rq_reqlen);
 
         if (request->rq_client) {
                 spin_lock(&request->rq_client->cli_lock);
@@ -153,18 +180,30 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
 {
         int rc = 0;
 
-        schedule_timeout(3 * HZ);  /* 3 second timeout */
         if (req->rq_repmsg != NULL) {
+                req->rq_transno = NTOH__u64(req->rq_repmsg->transno);
                 req->rq_flags |= PTL_RPC_FL_REPLY;
                 GOTO(out, rc = 1);
         }
 
-        if (CURRENT_TIME - req->rq_time >= 3) {
+        if (req->rq_flags & PTL_RPC_FL_RESEND) { 
+                CERROR("-- RESEND --\n");
+                req->rq_status = -EAGAIN;
+                GOTO(out, rc = 1);
+        }
+
+        if (CURRENT_TIME - req->rq_time >= req->rq_timeout) {
                 CERROR("-- REQ TIMEOUT --\n");
+                /* clear the timeout */
+                req->rq_timeout = 0;
                 req->rq_flags |= PTL_RPC_FL_TIMEOUT;
                 if (req->rq_client && req->rq_client->cli_recovd)
-                        connmgr_cli_fail(req->rq_client);
-                return 0;
+                        recovd_cli_fail(req->rq_client);
+                GOTO(out, rc = 0);
+        }
+
+        if (req->rq_timeout) { 
+                schedule_timeout(req->rq_timeout * HZ);
         }
 
         if (sigismember(&(current->pending.signal), SIGKILL) ||
@@ -231,13 +270,86 @@ static int ptlrpc_abort(struct ptlrpc_request *request)
         return 0;
 }
 
+/* Drop committed requests from the replied list; caller must hold cli_lock. */
+void ptlrpc_free_committed(struct ptlrpc_client *cli)
+{
+        struct list_head *tmp, *saved;
+        struct ptlrpc_request *req;
+        ENTRY;
+
+        list_for_each_safe(tmp, saved, &cli->cli_replied_head) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                /* the list is scanned only up to the first uncommitted entry */
+                if (req->rq_transno > cli->cli_last_committed)
+                        break;
+                /* unlink first: a request left here would be dec'd again */
+                list_del_init(&req->rq_list);
+                if (req->rq_flags & PTL_RPC_FL_RETAIN) {
+                        /* retain for replay if flagged */
+                        list_add(&req->rq_list, &cli->cli_replay_head);
+                        continue;
+                }
+                CDEBUG(D_INFO, "Marking request %p as committed ("
+                       "transno=%Lu, last_committed=%Lu)\n", req,
+                       req->rq_transno, cli->cli_last_committed);
+                if (!atomic_dec_and_test(&req->rq_refcount))
+                        continue;
+                req->rq_client = NULL; /* free_req must not retake cli_lock */
+                ptlrpc_free_req(req);
+        }
+        EXIT;
+}
+
+void ptlrpc_cleanup_client(struct ptlrpc_client *cli)
+{
+        struct list_head *tmp, *saved;
+        struct ptlrpc_request *req;
+        ENTRY;
+        /* Under cli_lock, free every request left on the client's four lists. */
+        spin_lock(&cli->cli_lock);
+        list_for_each_safe(tmp, saved, &cli->cli_replied_head) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                /* rq_client is cleared below so ptlrpc_free_req won't take cli_lock */
+                CDEBUG(D_INFO, "Cleaning req %p from replied head.\n", req);
+                list_del(&req->rq_list);
+                req->rq_client = NULL;
+                ptlrpc_free_req(req); /* freed directly; rq_refcount is not consulted */
+        }
+        list_for_each_safe(tmp, saved, &cli->cli_sent_head) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                CDEBUG(D_INFO, "Cleaning req %p from sent head.\n", req);
+                list_del(&req->rq_list);
+                req->rq_client = NULL;
+                ptlrpc_free_req(req); 
+        }
+        list_for_each_safe(tmp, saved, &cli->cli_replay_head) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                CERROR("Request %p is on the replay head at cleanup!\n", req);
+                list_del(&req->rq_list);
+                req->rq_client = NULL;
+                ptlrpc_free_req(req); 
+        }
+        list_for_each_safe(tmp, saved, &cli->cli_sending_head) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                CDEBUG(D_INFO, "Cleaning req %p from sending head.\n", req);
+                list_del(&req->rq_list);
+                req->rq_client = NULL;
+                ptlrpc_free_req(req); 
+        }
+        spin_unlock(&cli->cli_lock);
+        EXIT;
+        return;
+}
+
 int ptlrpc_queue_wait(struct ptlrpc_request *req)
 {
         int rc = 0;
         ENTRY;
 
         init_waitqueue_head(&req->rq_wait_for_rep);
-
+ resend:
+        req->rq_time = CURRENT_TIME;
+        req->rq_timeout = 3;
         rc = ptl_send_rpc(req);
         if (rc) {
                 CERROR("error %d, opcode %d\n", rc, req->rq_reqmsg->opc);
@@ -249,7 +361,13 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
         CDEBUG(D_OTHER, "-- sleeping\n");
         wait_event_interruptible(req->rq_wait_for_rep, ptlrpc_check_reply(req));
         CDEBUG(D_OTHER, "-- done\n");
-        ptlrpc_cleanup_request_buf(req);
+
+        if (req->rq_flags & PTL_RPC_FL_RESEND) {
+                req->rq_flags &= ~PTL_RPC_FL_RESEND;
+                goto resend;
+        }
+
+        //ptlrpc_cleanup_request_buf(req);
         up(&req->rq_client->cli_rpc_sem);
         if (req->rq_flags & PTL_RPC_FL_INTR) {
                 /* Clean up the dangling reply buffers */
@@ -270,11 +388,20 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 GOTO(out, rc);
         }
         CDEBUG(D_NET, "got rep %d\n", req->rq_repmsg->xid);
-
         if (req->rq_repmsg->status == 0)
                 CDEBUG(D_NET, "--> buf %p len %d status %d\n", req->rq_repmsg,
                        req->rq_replen, req->rq_repmsg->status);
 
+        spin_lock(&req->rq_client->cli_lock);
+        /* add to the tail of the replied head */
+        list_del(&req->rq_list);
+        list_add(&req->rq_list, req->rq_client->cli_replied_head.prev); 
+
+        req->rq_client->cli_last_rcvd = req->rq_repmsg->last_rcvd;
+        req->rq_client->cli_last_committed = req->rq_repmsg->last_committed;
+        ptlrpc_free_committed(req->rq_client); 
+        spin_unlock(&req->rq_client->cli_lock);
+
         EXIT;
  out:
         return rc;
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index 25503bf..caf94a4 100644
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -228,6 +228,16 @@ int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req)
         RETURN(rc);
 }
 
+/* Mark @req for resend, clear its timeout flag and wake whoever waits on it. */
+void ptlrpc_resend_req(struct ptlrpc_request *req)
+{
+        ENTRY;
+        req->rq_flags |= PTL_RPC_FL_RESEND;
+        req->rq_flags &= ~PTL_RPC_FL_TIMEOUT;
+        wake_up_interruptible(&req->rq_wait_for_rep);
+        EXIT;
+}
+
 int ptl_send_rpc(struct ptlrpc_request *request)
 {
         ptl_process_id_t local_id;
@@ -286,7 +296,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
 
         CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %u, portal %u\n",
                request->rq_replen, request->rq_reqmsg->xid,
-               request->rq_client->cli_request_portal);
+               request->rq_client->cli_reply_portal);
 
         spin_lock(&request->rq_client->cli_lock);
         list_add(&request->rq_list, &request->rq_client->cli_sending_head);
diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c
index 4541cc2..af917f1 100644
--- a/lustre/ptlrpc/recovd.c
+++ b/lustre/ptlrpc/recovd.c
@@ -23,21 +23,22 @@
 
 struct recovd_obd *ptlrpc_connmgr;
 
-void connmgr_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
+void recovd_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
 {
         ENTRY;
         cli->cli_recovd = recovd;
         spin_lock(&recovd->recovd_lock);
-        list_add(&cli->cli_ha_item, &recovd->recovd_connections_lh);
+        list_add(&cli->cli_ha_item, &recovd->recovd_clients_lh);
         spin_unlock(&recovd->recovd_lock);
         EXIT;
 }
 
-void connmgr_cli_fail(struct ptlrpc_client *cli)
+void recovd_cli_fail(struct ptlrpc_client *cli)
 {
         ENTRY;
         spin_lock(&cli->cli_recovd->recovd_lock);
-        cli->cli_recovd->recovd_flags |= SVC_HA_EVENT;
+        cli->cli_recovd->recovd_flags |= RECOVD_FAIL;
+        cli->cli_recovd->recovd_wakeup_flag = 1;
         list_del(&cli->cli_ha_item);
         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
         spin_unlock(&cli->cli_recovd->recovd_lock);
@@ -45,7 +46,16 @@ void connmgr_cli_fail(struct ptlrpc_client *cli)
         EXIT;
 }
 
-static int connmgr_upcall(void)
+/* Move @cli from the list it is on back to its recovd's recovd_clients_lh. */
+void recovd_cli_fixed(struct ptlrpc_client *cli)
+{
+        ENTRY;
+        list_del(&cli->cli_ha_item); /* NOTE(review): no recovd_lock, unlike recovd_cli_fail */
+        list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_clients_lh);
+        EXIT;
+}
+
+static int recovd_upcall(void)
 {
         char *argv[2];
         char *envp[3];
@@ -60,145 +70,6 @@ static int connmgr_upcall(void)
         return call_usermodehelper(argv[0], argv, envp);
 }
 
-static int connmgr_unpack_body(struct ptlrpc_request *req)
-{
-        struct connmgr_body *b = lustre_msg_buf(req->rq_repmsg, 0);
-        if (b == NULL) {
-                LBUG();
-                RETURN(-EINVAL);
-        }
-
-        b->generation = NTOH__u32(b->generation);
-
-        return 0;
-}
-
-int connmgr_connect(struct recovd_obd *recovd, struct ptlrpc_connection *conn)
-{
-        struct ptlrpc_request *req;
-        struct ptlrpc_client *cl;
-        struct connmgr_body *body;
-        int rc, size = sizeof(*body);
-        ENTRY;
-
-        if (!recovd) {
-                CERROR("no manager\n");
-                LBUG();
-                GOTO(out, rc = -EINVAL);
-        }
-        cl = recovd->recovd_client;
-
-        req = ptlrpc_prep_req(cl, conn, CONNMGR_CONNECT, 1, &size, NULL);
-        if (!req)
-                GOTO(out, rc = -ENOMEM);
-
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        body->generation = HTON__u32(conn->c_generation);
-        body->conn = (__u64)(unsigned long)conn;
-        body->conn_token = conn->c_token;
-        strncpy(body->conn_uuid, conn->c_local_uuid, sizeof(body->conn_uuid));
-
-        req->rq_replen = lustre_msg_size(1, &size);
-
-        rc = ptlrpc_queue_wait(req);
-        rc = ptlrpc_check_status(req, rc);
-        if (!rc) {
-                rc = connmgr_unpack_body(req);
-                if (rc)
-                        GOTO(out_free, rc);
-                body = lustre_msg_buf(req->rq_repmsg, 0);
-                CDEBUG(D_NET, "remote generation: %o\n", body->generation);
-                conn->c_level = LUSTRE_CONN_CON;
-                conn->c_remote_conn = body->conn;
-                conn->c_remote_token = body->conn_token;
-                strncpy(conn->c_remote_uuid, body->conn_uuid,
-                        sizeof(conn->c_remote_uuid));
-        }
-
-out_free:
-        ptlrpc_free_req(req);
-out:
-        RETURN(rc);
-}
-
-static int connmgr_handle_connect(struct ptlrpc_request *req)
-{
-        struct connmgr_body *body;
-        int rc, size = sizeof(*body);
-        ENTRY;
-
-        rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
-        if (rc) {
-                CERROR("connmgr: out of memory\n");
-                req->rq_status = -ENOMEM;
-                RETURN(0);
-        }
-
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        rc = connmgr_unpack_body(req);
-        if (rc) {
-                req->rq_status = rc;
-                RETURN(0);
-        }
-
-        req->rq_connection->c_remote_conn = body->conn;
-        req->rq_connection->c_remote_token = body->conn_token;
-        strncpy(req->rq_connection->c_remote_uuid, body->conn_uuid,
-                sizeof(req->rq_connection->c_remote_uuid));
-
-        CERROR("incoming generation %d\n", body->generation);
-        body = lustre_msg_buf(req->rq_repmsg, 0);
-        body->generation = 4711;
-        body->conn = (__u64)(unsigned long)req->rq_connection;
-        body->conn_token = req->rq_connection->c_token;
-        strncpy(body->conn_uuid, req->rq_connection->c_local_uuid,
-                sizeof(body->conn_uuid));
-
-        req->rq_connection->c_level = LUSTRE_CONN_CON;
-        RETURN(0);
-}
-
-int connmgr_handle(struct obd_device *dev, struct ptlrpc_service *svc,
-                   struct ptlrpc_request *req)
-{
-        int rc;
-        ENTRY;
-
-        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
-        if (rc) {
-                CERROR("Invalid request\n");
-                GOTO(out, rc);
-        }
-
-        if (req->rq_reqmsg->type != NTOH__u32(PTL_RPC_MSG_REQUEST)) {
-                CERROR("wrong packet type sent %d\n",
-                       req->rq_reqmsg->type);
-                GOTO(out, rc = -EINVAL);
-        }
-
-        switch (req->rq_reqmsg->opc) {
-        case CONNMGR_CONNECT:
-                CDEBUG(D_INODE, "connmgr connect\n");
-                rc = connmgr_handle_connect(req);
-                break;
-
-        default:
-                rc = ptlrpc_error(svc, req);
-                RETURN(rc);
-        }
-
-        EXIT;
-out:
-        if (rc) {
-                ptlrpc_error(svc, req);
-        } else {
-                CDEBUG(D_NET, "sending reply\n");
-                ptlrpc_reply(svc, req);
-        }
-
-        return 0;
-}
-
 static int recovd_check_event(struct recovd_obd *recovd)
 {
         int rc = 0;
@@ -206,61 +77,77 @@ static int recovd_check_event(struct recovd_obd *recovd)
 
         spin_lock(&recovd->recovd_lock);
 
-        if (!(recovd->recovd_flags & MGR_WORKING) &&
-            !list_empty(&recovd->recovd_troubled_lh)) {
-
-                CERROR("connection in trouble - state: WORKING, upcall\n");
-                recovd->recovd_flags = MGR_WORKING;
-
-                recovd->recovd_waketime = CURRENT_TIME;
-                recovd->recovd_timeout = 5 * HZ;
+        recovd->recovd_waketime = CURRENT_TIME;
+        if (recovd->recovd_timeout) 
                 schedule_timeout(recovd->recovd_timeout);
-        }
 
-        if (recovd->recovd_flags & MGR_WORKING &&
-            CURRENT_TIME <= recovd->recovd_waketime + recovd->recovd_timeout) {
-                CERROR("WORKING: new event\n");
+        if (recovd->recovd_wakeup_flag) {
+                CERROR("service woken\n"); 
+                GOTO(out, rc = 1);
+        }
 
-                recovd->recovd_waketime = CURRENT_TIME;
-                schedule_timeout(recovd->recovd_timeout);
+        if (recovd->recovd_timeout && 
+            CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
+                recovd->recovd_flags |= RECOVD_TIMEOUT;
+                CERROR("timeout\n");
+                GOTO(out, rc = 1);
         }
 
-        if (recovd->recovd_flags & MGR_STOPPING) {
-                CERROR("ha mgr stopping\n");
+        if (recovd->recovd_flags & RECOVD_STOPPING) {
+                CERROR("recovd stopping\n");
                 rc = 1;
         }
 
+ out:
+        recovd->recovd_wakeup_flag = 0;
         spin_unlock(&recovd->recovd_lock);
         RETURN(rc);
 }
 
 static int recovd_handle_event(struct recovd_obd *recovd)
 {
+        ENTRY;  /* act on FAIL / TIMEOUT / UPCALL_ANSWER in recovd_flags */
         spin_lock(&recovd->recovd_lock);
 
-        if (!(recovd->recovd_flags & MGR_WORKING) &&
-            !list_empty(&recovd->recovd_troubled_lh)) {
-
-                CERROR("connection in trouble - state: WORKING, upcall\n");
-                recovd->recovd_flags = MGR_WORKING;
+        if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
+            recovd->recovd_flags & RECOVD_FAIL) { 
 
+                CERROR("client in trouble: flags -> UPCALL_WAITING\n");
+                recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
 
-                connmgr_upcall();
+                recovd_upcall(); /* NOTE(review): usermode helper started with recovd_lock held */
                 recovd->recovd_waketime = CURRENT_TIME;
-                recovd->recovd_timeout = 5 * HZ;
+                recovd->recovd_timeout = 10 * HZ;
                 schedule_timeout(recovd->recovd_timeout);
         }
 
-        if (recovd->recovd_flags & MGR_WORKING &&
-            CURRENT_TIME <= recovd->recovd_waketime + recovd->recovd_timeout) {
-                CERROR("WORKING: new event\n");
+        if (recovd->recovd_flags & RECOVD_TIMEOUT) { 
+                CERROR("timeout - no news from upcall?\n");
+                recovd->recovd_flags &= ~RECOVD_TIMEOUT;
+        }
 
-                recovd->recovd_waketime = CURRENT_TIME;
-                schedule_timeout(recovd->recovd_timeout);
+        if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) { 
+                struct list_head *tmp, *pos;
+                CERROR("UPCALL_WAITING: upcall answer\n");
+                CERROR("** fill me in with recovery\n");
+                /* hand each troubled client to its cli_recover hook, if set */
+                list_for_each_safe(tmp, pos, &recovd->recovd_troubled_lh) { 
+                        struct ptlrpc_client *cli = list_entry
+                                (tmp, struct ptlrpc_client, cli_ha_item);
+                        /* unlink, then call the hook with recovd_lock dropped */
+                        list_del(&cli->cli_ha_item); 
+                        spin_unlock(&recovd->recovd_lock);
+                        if (cli->cli_recover)
+                                cli->cli_recover(cli); 
+                        spin_lock(&recovd->recovd_lock);
+                }
+
+                recovd->recovd_timeout = 0;
+                recovd->recovd_flags = RECOVD_IDLE; /* plain assignment: all other flag bits are cleared */
         }
 
         spin_unlock(&recovd->recovd_lock);
-        return 0;
+        RETURN(0);
 }
 
 static int recovd_main(void *arg)
@@ -280,7 +167,7 @@ static int recovd_main(void *arg)
 
         /* Record that the  thread is running */
         recovd->recovd_thread = current;
-        recovd->recovd_flags = MGR_RUNNING;
+        recovd->recovd_flags = RECOVD_IDLE;
         wake_up(&recovd->recovd_ctl_waitq);
 
         /* And now, loop forever on requests */
@@ -289,9 +176,9 @@ static int recovd_main(void *arg)
                                          recovd_check_event(recovd));
 
                 spin_lock(&recovd->recovd_lock);
-                if (recovd->recovd_flags & MGR_STOPPING) {
+                if (recovd->recovd_flags & RECOVD_STOPPING) {
                         spin_unlock(&recovd->recovd_lock);
-                        CERROR("lustre_hamgr quitting\n");
+                        CERROR("lustre_recovd stopping\n");
                         EXIT;
                         break;
                 }
@@ -301,7 +188,7 @@ static int recovd_main(void *arg)
         }
 
         recovd->recovd_thread = NULL;
-        recovd->recovd_flags = MGR_STOPPED;
+        recovd->recovd_flags = RECOVD_STOPPED;
         wake_up(&recovd->recovd_ctl_waitq);
         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
         RETURN(0);
@@ -312,7 +199,7 @@ int recovd_setup(struct recovd_obd *recovd)
         int rc;
         ENTRY;
 
-        INIT_LIST_HEAD(&recovd->recovd_connections_lh);
+        INIT_LIST_HEAD(&recovd->recovd_clients_lh);
         INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
         spin_lock_init(&recovd->recovd_lock);
 
@@ -326,17 +213,19 @@ int recovd_setup(struct recovd_obd *recovd)
                 CERROR("cannot start thread\n");
                 RETURN(-EINVAL);
         }
-        wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & MGR_RUNNING);
+        wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
 
         RETURN(0);
 }
 
 int recovd_cleanup(struct recovd_obd *recovd)
 {
-        recovd->recovd_flags = MGR_STOPPING;
-
+        spin_lock(&recovd->recovd_lock); /* request a stop under recovd_lock */
+        recovd->recovd_flags = RECOVD_STOPPING; /* replaces every other flag bit */
         wake_up(&recovd->recovd_waitq);
+        spin_unlock(&recovd->recovd_lock);
+        /* then wait until recovd_main sets RECOVD_STOPPED and wakes recovd_ctl_waitq */
         wait_event_interruptible(recovd->recovd_ctl_waitq,
-                                 (recovd->recovd_flags & MGR_STOPPED));
+                                 (recovd->recovd_flags & RECOVD_STOPPED));
         RETURN(0);
 }
diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c
index 0d783c7..659845b 100644
--- a/lustre/ptlrpc/rpc.c
+++ b/lustre/ptlrpc/rpc.c
@@ -55,7 +55,7 @@ int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf)
                 GOTO(err_recovd, err = -EINVAL);
         }
 
-        ptlrpc_init_client(NULL, CONNMGR_REQUEST_PORTAL, 
+        ptlrpc_init_client(NULL, NULL, CONNMGR_REQUEST_PORTAL, 
                            CONNMGR_REPLY_PORTAL, recovd->recovd_client);
 
         err = ptlrpc_start_thread(obddev, recovd->recovd_service, "lustre_connmgr");
@@ -95,17 +95,36 @@ int connmgr_cleanup(struct obd_device *dev)
         }
 
         OBD_FREE(recovd->recovd_service, sizeof(*recovd->recovd_service));
-        recovd->recovd_flags = MGR_STOPPING;
-
+        ptlrpc_cleanup_client(recovd->recovd_client);
         OBD_FREE(recovd->recovd_client, sizeof(*recovd->recovd_client));
         MOD_DEC_USE_COUNT;
         RETURN(0);
 }
 
+
+/* obd ioctl hook: OBD_RECOVD_NEWCONN flags the upcall answer and wakes recovd */
+int connmgr_iocontrol(int cmd, struct obd_conn *conn, int len, void *karg,
+                         void *uarg)
+{
+        struct recovd_obd *recovd = &conn->oc_dev->u.recovd;
+        ENTRY;
+        if (cmd == OBD_RECOVD_NEWCONN) {
+                spin_lock(&recovd->recovd_lock);
+                recovd->recovd_flags |= RECOVD_UPCALL_ANSWER;
+                recovd->recovd_wakeup_flag = 1;
+                wake_up(&recovd->recovd_waitq);
+                spin_unlock(&recovd->recovd_lock);
+        }
+        /* any other cmd is ignored (still 0); RETURN pairs with ENTRY on every path */
+        RETURN(0);
+}
+
+
 /* use obd ops to offer management infrastructure */
 static struct obd_ops recovd_obd_ops = {
         o_setup:       connmgr_setup,
         o_cleanup:     connmgr_cleanup,
+        o_iocontrol:     connmgr_iocontrol,
 };
 
 static int __init ptlrpc_init(void)
diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh
index 50c1c16..f42ddb7 100755
--- a/lustre/tests/llmount.sh
+++ b/lustre/tests/llmount.sh
@@ -15,29 +15,26 @@ read
 
 new_fs ext2 /tmp/ost 10000
 OST=$LOOPDEV
-MDSFS=ext2
+MDSFS=ext3
 new_fs ${MDSFS} /tmp/mds 10000
 MDS=$LOOPDEV
 
 echo 0xffffffff > /proc/sys/portals/debug
 
 $OBDCTL <<EOF
-device 0
+newdev
 attach mds MDSDEV
 setup ${MDS} ${MDSFS}
-device 1
+newdev
 attach obdext2 OBDDEV
 setup ${OST}
-device 2
+newdev
 attach ost OSTDEV
 setup \$OBDDEV
-device 3
-attach ptlrpc RPCDEV
-setup
-device 4
+newdev
 attach ldlm LDLMDEV
 setup
-device 5
+newdev
 attach osc OSCDEV
 setup -1
 quit
diff --git a/lustre/tests/openunlink.c b/lustre/tests/openunlink.c
index dddf6a8..d0ae419 100644
--- a/lustre/tests/openunlink.c
+++ b/lustre/tests/openunlink.c
@@ -15,7 +15,7 @@ int main(int argc, char **argv)
         int fd, rc;
 
         if (argc != 2) {
-                fprintf(stderr, "usage: %s filename\n", argv[1]); 
+                fprintf(stderr, "usage: %s filename\n", argv[0]); 
                 exit(1);
         } else { 
                 fprintf(stderr, "congratulations - program starting\n"); 
diff --git a/lustre/tests/runfailure-mds b/lustre/tests/runfailure-mds
index cff40b1..0f473ff 100755
--- a/lustre/tests/runfailure-mds
+++ b/lustre/tests/runfailure-mds
@@ -20,28 +20,25 @@ MDS=$LOOPDEV
 echo 0xffffffff > /proc/sys/portals/debug
 
 $OBDCTL <<EOF
-device 0
+newdev
 attach mds MDSDEV
 setup ${MDS} ${MDSFS}
-device 1
-attach obdfilter OBDDEV
+newdev
+attach obdext2 OBDDEV
 setup ${OST}
-device 2
+newdev
 attach ost OSTDEV
-setup 1
-device 3
-attach ptlrpc RPCDEV
-setup
-device 4
+setup \$OBDDEV
+newdev
 attach ldlm LDLMDEV
 setup
-device 5
+newdev
 attach osc OSCDEV
 setup -1
 quit
 EOF
 
-MNT='mount -t lustre_lite -o device=`$OBDCTL name2dev OSCDEV` none /mnt/lustre'
+MNT="mount -t lustre_lite -o device=`$OBDCTL name2dev OSCDEV` none /mnt/lustre"
 $MNT
 
 test_fail() {
@@ -53,7 +50,7 @@ test_fail() {
 	echo "Cleaning up and restarting MDS"
 	umount /mnt/lustre || fail "unable to unmount"
 	$OBDCTL <<- EOF
-	device 0
+	name2dev MDSDEV
 	cleanup
 	detach
 	quit
@@ -62,8 +59,8 @@ test_fail() {
 	echo 0 > /proc/sys/lustre/fail_loc
 
 	$OBDCTL <<- EOF
-	device 0
-	attach mds
+	newdev
+	attach mds MDSDEV
 	setup ${MDS} ${MDSFS}
 	quit
 	EOF
diff --git a/lustre/utils/ha_assist.sh b/lustre/utils/ha_assist.sh
index 285e6bd..832b274 100755
--- a/lustre/utils/ha_assist.sh
+++ b/lustre/utils/ha_assist.sh
@@ -2,3 +2,12 @@
 
 echo -n `date` >> /tmp/halog
 echo "- please supply a new mds" >> /tmp/halog
+
+echo "- suppose we have a new one" >> /tmp/halog
+sleep 1
+
+/usr/src/obd/utils/obdctl  <<EOF
+name2dev RPCDEV
+newconn
+EOF
+
diff --git a/lustre/utils/obdctl.c b/lustre/utils/obdctl.c
index b80042a..31fbfc3 100644
--- a/lustre/utils/obdctl.c
+++ b/lustre/utils/obdctl.c
@@ -913,6 +913,25 @@ static int jt_test_ldlm(int argc, char **argv)
         return rc;
 }
 
+/* "newconn" command: issue OBD_RECOVD_NEWCONN on fd; accepts no arguments. */
+static int jt_newconn(int argc, char **argv)
+{
+        struct obd_ioctl_data data;
+        int rc;
+
+        IOCINIT(data);
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", cmdname(argv[0]));
+                return -1;
+        }
+        rc = ioctl(fd, OBD_RECOVD_NEWCONN , &data);
+        if (rc >= 0)
+                return rc;
+        rc = errno;     /* the errno value is what gets returned on failure */
+        fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), strerror(rc));
+        return rc;
+}
+
 command_t cmdlist[] = {
         /* Metacommands */
         {"--device", jt__device, 0, "--device <devno> <command [args ...]>"},
@@ -938,6 +957,7 @@ command_t cmdlist[] = {
         {"destroy", jt_destroy, 0, "destroy <id>"},
         {"getattr", jt_getattr, 0, "getattr <id>"},
         {"setattr", jt_setattr, 0, "setattr <id> <mode>"},
+        {"newconn", jt_newconn, 0, "newconn [newuuid]"},
         {"test_getattr", jt_test_getattr, 0, "test_getattr <count> [verbose]"},
         {"test_brw", jt_test_brw, 0, "test_brw <count> [write [verbose]]"},
         {"test_ldlm", jt_test_ldlm, 0, "test lock manager (no args)"},
-- 
1.8.3.1