From: shaver <shaver>
Date: Mon, 28 Oct 2002 02:49:56 +0000 (+0000)
Subject: Short version: replaying create and rename works now, including all the fixups
X-Git-Tag: 0.5.16~111
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=320a34f3973da42af1b235ca5d6c52216f9f5303;p=fs%2Flustre-release.git

Short version: replaying create and rename works now, including all the fixups
               you could ever, ever want.  (OK, except lock replay, but that's
               easy now.)  Single-client recovery should now be much, much
               more robust.

- Added LDLM_FL_INTENT_ONLY to avoid acquiring locks during intent-lock replay.
- Fix leak of open requests that was due to a misunderstanding (mine) of the
  way that the ptlrpc_request refcounting code (mine) really worked.
- Store FID data after create, so that we can fix up the world afterwards.
- Poor man's lustre_spare_buf support through simple over-allocation of request
  buffers.
- Cancel unused locks when we lose contact with an MDS.
- When replaying create, fix up to-be-replayed and delayed-for-recovery requests
  to contain the new FID generation, in addition to the actual in-memory inode.
- Don't sweat a "failed" return code for replayed requests, but do squeak a bit
  if the old and new statuses don't match up (virtually guaranteed, in fact,
  but I'll tune that down later).
- Skip no-transno requests during replay, since they don't affect MDS state.
---

diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h
index 187605f..5798815 100644
--- a/lustre/include/linux/lustre_dlm.h
+++ b/lustre/include/linux/lustre_dlm.h
@@ -40,6 +40,7 @@ typedef enum {
 #define LDLM_FL_WAIT_NOREPROC  (1 << 7)
 #define LDLM_FL_CANCEL         (1 << 8)
 #define LDLM_FL_REPLAY         (1 << 9)
+#define LDLM_FL_INTENT_ONLY    (1 << 10) /* don't grant lock, just do intent */
 
 #define LDLM_CB_BLOCKING    1
 #define LDLM_CB_CANCELING   2
diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h
index 8b292ae..7eb76c7 100644
--- a/lustre/include/linux/lustre_mds.h
+++ b/lustre/include/linux/lustre_mds.h
@@ -189,6 +189,9 @@ int mdc_rename(struct lustre_handle *conn,
                struct ptlrpc_request **);
 int mdc_create_client(obd_uuid_t uuid, struct ptlrpc_client *cl);
 
+void mdc_store_create_replay_data(struct ptlrpc_request *req,
+                                  struct super_block *sb);
+
 extern int mds_client_add(struct mds_export_data *med, int cl_off);
 extern int mds_client_free(struct obd_export *exp);
 
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index 7778e01..e60ce2b 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -105,10 +105,8 @@ static int ll_file_open(struct inode *inode, struct file *file)
                       file->f_flags, lsm, &fd->fd_mdshandle, &req);
         fd->fd_req = req;
 
-        /* We don't call ptlrpc_req_finished here, because the request is
-         * preserved until we see a matching close, at which point it is
-         * released (and likely freed).  (See ll_file_release.)
-         */
+        /* This is the "reply" refcount. */
+        ptlrpc_req_finished(req);
         if (rc)
                 GOTO(out_req, -abs(rc));
         if (!fd->fd_mdshandle.addr ||
diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c
index 188b7e1..7264964 100644
--- a/lustre/llite/namei.c
+++ b/lustre/llite/namei.c
@@ -218,6 +218,7 @@ static struct dentry *ll_lookup2(struct inode *dir, struct dentry *dentry,
                 ino = lic.lic_body->fid1.id;
                 mode = lic.lic_body->mode;
                 if (it->it_op & (IT_CREAT | IT_MKDIR | IT_SYMLINK | IT_MKNOD)) {
+                        mdc_store_create_replay_data(request, dir->i_sb);
                         /* For create ops, we want the lookup to be negative,
                          * unless the create failed in a way that indicates
                          * that the file is already there */
diff --git a/lustre/llite/recover.c b/lustre/llite/recover.c
index fae9718..3692042 100644
--- a/lustre/llite/recover.c
+++ b/lustre/llite/recover.c
@@ -69,7 +69,7 @@ static void abort_inflight_for_import(struct obd_import *imp)
         }
 }
 
-static void prepare_ost(struct obd_import *imp)
+static void prepare_osc(struct obd_import *imp)
 {
         int rc;
         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
@@ -109,6 +109,12 @@ static void prepare_ost(struct obd_import *imp)
         }
 }
 
+/* Cancel the unused locks in the import's namespace (local only). */
+static void prepare_mdc(struct obd_import *imp)
+{
+        ldlm_cli_cancel_unused(imp->imp_obd->obd_namespace, NULL, 1);
+}
+
 static int ll_prepare_recovery(struct ptlrpc_connection *conn)
 {
         struct list_head *tmp;
@@ -118,17 +124,24 @@ static int ll_prepare_recovery(struct ptlrpc_connection *conn)
                                                     imp_chain);
 
                 if (imp->imp_obd->obd_type->typ_ops->o_brw)
-                        prepare_ost(imp);
+                        prepare_osc(imp);
+                else
+                        prepare_mdc(imp);
         }
 
         return ptlrpc_run_recovery_upcall(conn);
 }
 
-static void reconnect_ost(struct obd_import *imp)
+static void reconnect_osc(struct obd_import *imp)
 {
         (void)ptlrpc_reconnect_import(imp, OST_CONNECT);
 }
 
+static int reconnect_mdc(struct obd_import *imp)
+{
+        return ptlrpc_reconnect_import(imp, MDS_CONNECT); /* rc for caller */
+}
+
 static int ll_reconnect(struct ptlrpc_connection *conn)
 {
         struct list_head *tmp;
@@ -145,12 +158,11 @@ static int ll_reconnect(struct ptlrpc_connection *conn)
                                                     imp_chain);
                 if (imp->imp_obd->obd_type->typ_ops->o_brw) {
                         /* XXX what to do if we fail? */
-                        reconnect_ost(imp);
+                        reconnect_osc(imp);
                 } else {
-                        int rc = ptlrpc_reconnect_import(imp, MDS_CONNECT);
+                        int rc = reconnect_mdc(imp);
                         if (!rc) {
                                 need_replay = 1;
-                                /* XXX obd_cancel_unused */
                         }
                         /* make sure we don't try to replay for dead imps?
                          *
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 17787276..4478db2 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -196,6 +196,176 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
         RETURN(0);
 }
 
+struct create_replay_data {
+        struct super_block *sb;         /* sb passed to iget on replay */
+        u32                 generation; /* generation from the create reply */
+};
+
+/* iget match callback for create replay: @inode matches only if it still has
+ * the generation saved in request buffer 5; on a match its generation is
+ * replaced by the one in the replayed reply.  Returns 1 on match, else 0. */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+static int create_replay_find_inode(struct inode *inode, unsigned long ino,
+                                    void *opaque)
+#else
+static int create_replay_find_inode(struct inode *inode, void *opaque)
+#endif
+{
+        struct ptlrpc_request *req = opaque;
+        struct create_replay_data *saved;
+        struct mds_body *body;
+
+        saved = lustre_msg_buf(req->rq_reqmsg, 5); /* lock with intent */
+        if (saved->generation != inode->i_generation) {
+                CDEBUG(D_HA, "generation mismatch for ino %lu: "
+                       "saved %u != inode %u\n",
+                       inode->i_ino, saved->generation, inode->i_generation);
+                return 0;
+        }
+
+        body = lustre_msg_buf(req->rq_repmsg, 1);
+        /* XXX do I need more out of ll_update_inode? */
+        CDEBUG(D_HA, "updating inode %lu generation %u to %u\n",
+               inode->i_ino, inode->i_generation, body->generation);
+        inode->i_generation = body->generation;
+
+        return 1;
+}
+
+/* After @req's create has been replayed, rewrite FID generations in @fixreq.
+ * Same-import intent LDLM_ENQUEUE, MDS_REINT and MDS_OPEN requests whose FID
+ * id is @inode's i_ino get that generation set to inode->i_generation. */
+static void fixup_req_for_recreate(struct ptlrpc_request *fixreq,
+                                   struct ptlrpc_request *req,
+                                   struct inode *inode)
+{
+        struct ldlm_request *lockreq;
+        struct mds_rec_link *rec; /* representative, two-fid op structure */
+        int opc;
+
+        if (fixreq->rq_import != req->rq_import) {
+                DEBUG_REQ(D_HA, fixreq, "import mismatch, skipping");
+                return;
+        }
+
+        DEBUG_REQ(D_HA, fixreq, "fixing");
+        /* XXX check replay_state to see if we'll actually replay. */
+        /* We only care about LDLM_ENQUEUE, MDS_REINT and MDS_OPEN requests. */
+        if (fixreq->rq_reqmsg->opc == LDLM_ENQUEUE) {
+                lockreq = lustre_msg_buf(fixreq->rq_reqmsg, 0);
+                if (lockreq->lock_desc.l_resource.lr_type != LDLM_MDSINTENT) {
+                        DEBUG_REQ(D_HA, fixreq, "non-intent lock, skipping");
+                        return;
+                }
+                /* The record is buffer 2, so we need at least 3 buffers. */
+                if (fixreq->rq_reqmsg->bufcount < 3) {
+                        DEBUG_REQ(D_HA, fixreq,
+                                  "short intent (probably readdir), skipping");
+                        return;
+                }
+                /* XXX endianness is probably very very wrong here. Very. */
+                rec = lustre_msg_buf(fixreq->rq_reqmsg, 2);
+        } else if (fixreq->rq_reqmsg->opc == MDS_REINT) {
+                rec = lustre_msg_buf(fixreq->rq_reqmsg, 0);
+        } else if (fixreq->rq_reqmsg->opc == MDS_OPEN) {
+                struct mds_body *body = lustre_msg_buf(fixreq->rq_reqmsg, 0);
+                if (body->fid1.id != inode->i_ino)
+                        return; /* open of a different inode */
+                DEBUG_REQ(D_HA, fixreq, "fixing fid1: %u -> %u",
+                          body->fid1.generation, inode->i_generation);
+                body->fid1.generation = inode->i_generation;
+                return;
+        } else {
+                DEBUG_REQ(D_HA, fixreq, "not a replayable request, skipping");
+                return;
+        }
+
+        if (rec->lk_fid1.id == inode->i_ino) {
+                DEBUG_REQ(D_HA, fixreq, "fixing fid1: %u -> %u",
+                          rec->lk_fid1.generation, inode->i_generation);
+                rec->lk_fid1.generation = inode->i_generation;
+        }
+
+        /* Some ops have two FIDs. ZZZ We rely on the identical placement of
+         * that second FID in all such ops' messages. */
+        opc = rec->lk_opcode & REINT_OPCODE_MASK;
+        if ((opc == REINT_LINK || opc == REINT_UNLINK ||
+             opc == REINT_RENAME) &&
+            rec->lk_fid2.id == inode->i_ino) {
+                DEBUG_REQ(D_HA, fixreq, "fixing fid2: %u -> %u",
+                          rec->lk_fid2.generation, inode->i_generation);
+                rec->lk_fid2.generation = inode->i_generation;
+        }
+}
+
+/* rq_replay_cb for intent creates: move the in-memory inode to the replayed
+ * create's generation, then patch requests queued on the connection. */
+static void mdc_replay_create(struct ptlrpc_request *req)
+{
+        struct create_replay_data *saved;
+        struct mds_body *body;
+        struct inode *inode;
+        struct list_head *tmp;
+
+        if (req->rq_reqmsg->opc == MDS_REINT)
+                LBUG(); /* XXX don't handle the non-intent case yet */
+
+        body = lustre_msg_buf(req->rq_repmsg, 1);
+        saved = lustre_msg_buf(req->rq_reqmsg, 5); /* lock with intent */
+        CDEBUG(D_HA, "create of inode %d replayed; gen %u -> %u\n",
+               body->fid1.id, saved->generation, body->generation);
+        /* XXX cargo-culted right out of ll_iget */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        inode = iget4(saved->sb, body->fid1.id, create_replay_find_inode, req);
+#else
+        {
+                extern int ll_read_inode2(struct inode *inode, void *opaque);
+                inode = iget5_locked(saved->sb, body->fid1.id,
+                                     create_replay_find_inode, req);
+                if (inode && (inode->i_state & I_NEW))
+                        unlock_new_inode(inode);
+        }
+#endif
+        /* iget can return NULL in either branch; the fixups dereference it. */
+        if (!inode)
+                LBUG(); /* XXX ick */
+
+        /* Now that we've updated the generation, we need to go and find all
+         * the other requests that refer to this file and will be replayed,
+         * and teach them about our new generation. */
+        list_for_each(tmp, &req->rq_connection->c_sending_head) {
+                struct ptlrpc_request *fixreq =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                fixup_req_for_recreate(fixreq, req, inode);
+        }
+
+        list_for_each(tmp, &req->rq_connection->c_delayed_head) {
+                struct ptlrpc_request *fixreq =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                fixup_req_for_recreate(fixreq, req, inode);
+        }
+        /* NOTE(review): the iget reference is never dropped here -- confirm. */
+}
+
+/* Stash what create replay will need in request buffer 5 (the reply's inode
+ * generation and @sb) and arm mdc_replay_create as the replay callback. */
+void mdc_store_create_replay_data(struct ptlrpc_request *req,
+                                  struct super_block *sb)
+{
+        struct create_replay_data *stash = lustre_msg_buf(req->rq_reqmsg, 5);
+        struct mds_body *rep_body = lustre_msg_buf(req->rq_repmsg, 1);
+
+        if (req->rq_reqmsg->opc == MDS_REINT)
+                LBUG(); /* XXX don't handle the non-intent case yet */
+
+        stash->generation = rep_body->generation;
+        stash->sb = sb; /* XXX is this safe? */
+
+        req->rq_replay_cb = mdc_replay_create;
+}
+
 int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 struct lookup_intent *it, int lock_mode, struct inode *dir,
                 struct dentry *de, struct lustre_handle *lockh,
@@ -204,13 +374,14 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
         struct ptlrpc_request *req;
         struct obd_device *obddev = class_conn2obd(conn);
         __u64 res_id[RES_NAME_SIZE] = {dir->i_ino};
-        int size[5] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
+        int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
         int rc, flags = 0;
         int repsize[3] = {sizeof(struct ldlm_reply),
                           sizeof(struct mds_body),
                           obddev->u.cli.cl_max_mds_easize};
         struct ldlm_reply *dlm_rep;
         struct ldlm_intent *lit;
+        struct ldlm_request *lockreq;
         ENTRY;
 
         LDLM_DEBUG_NOLOCK("mdsintent %s dir %ld", ldlm_it2str(it->it_op),
@@ -234,7 +405,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 size[2] = sizeof(struct mds_rec_create);
                 size[3] = de->d_name.len + 1;
                 size[4] = tgtlen + 1;
-                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 5,
+                size[5] = sizeof(struct create_replay_data);
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 6,
                                       size, NULL);
                 if (!req)
                         RETURN(-ENOMEM);
@@ -357,6 +529,10 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 RETURN(rc);
         }
 
+        /* On replay, we don't want the lock granted. */
+        lockreq = lustre_msg_buf(req->rq_reqmsg, 0);
+        lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
+
         dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
         it->it_disposition = (int) dlm_rep->lock_policy_res1;
         it->it_status = (int) dlm_rep->lock_policy_res2;
@@ -595,5 +771,7 @@ EXPORT_SYMBOL(mdc_setattr);
 EXPORT_SYMBOL(mdc_close);
 EXPORT_SYMBOL(mdc_open);
 
+EXPORT_SYMBOL(mdc_store_create_replay_data);
+
 module_init(ptlrpc_request_init);
 module_exit(ptlrpc_request_exit);
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index 57919c6..cb340bc 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -1415,6 +1415,11 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie,
                         }
                 }
 
+
+                if (flags & LDLM_FL_INTENT_ONLY) {
+                        LDLM_DEBUG(lock, "INTENT_ONLY, aborting lock");
+                        RETURN(ELDLM_LOCK_ABORTED);
+                }
                 /* Give the client a lock on the child object, instead of the
                  * parent that it requested. */
                 new_resid[0] = NTOH__u32(mds_rep->ino);
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 7ef9ff1..ee6bd63 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -649,15 +649,13 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
 
 int ptlrpc_replay_req(struct ptlrpc_request *req)
 {
-        int rc = 0, old_level;
+        int rc = 0, old_level, old_status;
         // struct ptlrpc_client *cli = req->rq_import->imp_client;
         struct l_wait_info lwi;
         ENTRY;
 
         init_waitqueue_head(&req->rq_wait_for_rep);
-        CDEBUG(D_NET, "req "LPD64" opc %d level %d, conn level %d\n",
-               req->rq_xid, req->rq_reqmsg->opc, req->rq_level,
-               req->rq_connection->c_level);
+        DEBUG_REQ(D_NET, req, "");
 
         req->rq_timeout = obd_timeout;
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
@@ -665,6 +663,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
 
         /* temporarily set request to RECOVD level (reset at out:) */
         old_level = req->rq_level;
+        old_status = req->rq_repmsg->status;
         req->rq_level = LUSTRE_CONN_RECOVD;
         rc = ptl_send_rpc(req);
         if (rc) {
@@ -700,15 +699,9 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         if (req->rq_replay_cb)
                 req->rq_replay_cb(req);
 
-        if (req->rq_repmsg->status == 0) {
-                CDEBUG(D_NET, "--> buf %p len %d status %d\n", req->rq_repmsg,
-                       req->rq_replen, req->rq_repmsg->status);
-        } else {
-                CERROR("recovery failed: "); 
-                CERROR("req "LPD64" opc %d level %d, conn level %d\n", 
-                       req->rq_xid, req->rq_reqmsg->opc, req->rq_level,
-                       req->rq_connection->c_level);
-                LBUG();
+        if (req->rq_repmsg->status != old_status) {
+                DEBUG_REQ(D_HA, req, "status %d, old was %d",
+                          req->rq_repmsg->status, old_status);
         }
 
  out:
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c
index 272ffb9..060258f 100644
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -108,11 +108,12 @@ int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn)
         RETURN(0);
 }
 
-#define REPLAY_COMMITTED     0 /* Fully processed (commit + reply) */
-#define REPLAY_REPLAY        1 /* Forced-replay (e.g. open) */
+#define REPLAY_COMMITTED     0 /* Fully processed (commit + reply). */
+#define REPLAY_REPLAY        1 /* Forced-replay (e.g. open). */
 #define REPLAY_RESEND        2 /* Resend required. */
-#define REPLAY_RESEND_IGNORE 3 /* Resend, ignore the reply (already saw it) */
+#define REPLAY_RESEND_IGNORE 3 /* Resend, ignore the reply (already saw it). */
 #define REPLAY_RESTART       4 /* Have to restart the call, sorry! */
+#define REPLAY_NO_STATE      5 /* Request doesn't change MDS state: skip. */
 
 static int replay_state(struct ptlrpc_request *req, __u64 last_xid)
 {
@@ -123,6 +124,12 @@ static int replay_state(struct ptlrpc_request *req, __u64 last_xid)
         /* Uncommitted request */
         if (req->rq_xid > last_xid) {
                 if (req->rq_flags & PTL_RPC_FL_REPLIED) {
+                        if (req->rq_transno == 0) {
+                                /* If no transno was returned, no state was
+                                   altered on the MDS. */
+                                return REPLAY_NO_STATE;
+                        }
+
                         /* Saw reply, so resend and ignore new reply. */
                         return REPLAY_RESEND_IGNORE;
                 }
@@ -141,7 +148,8 @@ static int replay_state(struct ptlrpc_request *req, __u64 last_xid)
 
 static char *replay_state2str(int state) {
         static char *state_strings[] = {
-                "COMMITTED", "REPLAY", "RESEND", "RESEND_IGNORE", "RESTART"
+                "COMMITTED", "REPLAY", "RESEND", "RESEND_IGNORE", "RESTART",
+                "NO_STATE"
         };
         static char *unknown_state = "UNKNOWN";
 
@@ -206,6 +214,11 @@ int ptlrpc_replay(struct ptlrpc_connection *conn)
                         /* XXX commit now? */
                         break;
 
+                    case REPLAY_NO_STATE:
+                        DEBUG_REQ(D_HA, req, "NO_STATE:");
+                        /* XXX commit now? */
+                        break;
+
                     case REPLAY_RESEND_IGNORE:
                         DEBUG_REQ(D_HA, req, "RESEND_IGNORE:");
                         rc = ptlrpc_replay_req(req);