Whamcloud - gitweb
LU-2613 recovery: free open/close request promptly 65/6665/11
author Hongchao Zhang <hongchao.zhang@intel.com>
Sun, 21 Jul 2013 21:40:37 +0000 (05:40 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 6 Nov 2013 06:51:49 +0000 (06:51 +0000)
- For a non-create open or a committed open, the open request
  should be freed along with the close request as soon as the
  close is done, regardless of whether the transno of the
  open/close is greater than the last committed transno known
  by the client.

- Move committed open requests into a separate dedicated list,
  which avoids scanning a huge replay list on receiving each
  reply (when there are many open files).

Signed-off-by: Niu Yawei <yawei.niu@intel.com>
Signed-off-by: Hongchao Zhang <hongchao.zhang@intel.com>
Change-Id: I1a25a35fe7a16681368d92d16964680b6209b3ee
Reviewed-on: http://review.whamcloud.com/6665
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
21 files changed:
lustre/include/lustre/lustre_idl.h
lustre/include/lustre_export.h
lustre/include/lustre_import.h
lustre/include/lustre_net.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/liblustre/file.c
lustre/llite/file.c
lustre/llite/llite_lib.c
lustre/lmv/lmv_obd.c
lustre/mdc/mdc_internal.h
lustre/mdc/mdc_locks.c
lustre/mdc/mdc_reint.c
lustre/mdc/mdc_request.c
lustre/mdt/mdt_open.c
lustre/obdclass/genops.c
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/client.c
lustre/ptlrpc/import.c
lustre/ptlrpc/recover.c
lustre/tests/sanity.sh

index adf3095..f99ef62 100644 (file)
@@ -1298,6 +1298,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
 #define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
 #define OBD_CONNECT_PINGLESS   0x4000000000000ULL/* pings not required */
 #define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */
+#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/
 
 /* XXX README XXX:
  * Please DO NOT add flag values here before first ensuring that this same
@@ -1341,7 +1342,9 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
                                OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
                                OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\
-                               OBD_CONNECT_FLOCK_DEAD)
+                               OBD_CONNECT_FLOCK_DEAD | \
+                               OBD_CONNECT_DISP_STRIPE)
+
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
@@ -2114,6 +2117,7 @@ extern void lustre_swab_generic_32s (__u32 *val);
 #define DISP_ENQ_CREATE_REF  0x01000000
 #define DISP_OPEN_LOCK       0x02000000
 #define DISP_OPEN_LEASE      0x04000000
+#define DISP_OPEN_STRIPE     0x08000000
 
 /* INODE LOCK PARTS */
 #define MDS_INODELOCK_LOOKUP 0x000001       /* dentry, mode, owner, group */
index 649aa23..2a01b60 100644 (file)
@@ -377,6 +377,15 @@ static inline bool imp_connect_lvb_type(struct obd_import *imp)
                return false;
 }
 
+static inline bool imp_connect_disp_stripe(struct obd_import *imp)
+{
+       struct obd_connect_data *ocd;
+
+       LASSERT(imp != NULL);
+       ocd = &imp->imp_connect_data;
+       return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE;
+}
+
 extern struct obd_export *class_conn2export(struct lustre_handle *conn);
 extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
 
index d86f0e7..3beccba 100644 (file)
@@ -180,6 +180,17 @@ struct obd_import {
         cfs_list_t                imp_delayed_list;
         /** @} */
 
+       /**
+        * List of requests that are retained for committed open replay. Once
+        * open is committed, open replay request will be moved from the
+        * imp_replay_list into the imp_committed_list.
+        * The imp_replay_cursor is for accelerating searching during replay.
+        * @{
+        */
+       cfs_list_t                imp_committed_list;
+       cfs_list_t               *imp_replay_cursor;
+       /** @} */
+
         /** obd device for this import */
         struct obd_device        *imp_obd;
 
index df95d8a..3c9a2d7 100644 (file)
@@ -2911,6 +2911,8 @@ int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
  * request queues, request management, etc.
  * @{
  */
+void ptlrpc_request_committed(struct ptlrpc_request *req, int force);
+
 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
                         struct ptlrpc_client *);
 void ptlrpc_cleanup_client(struct obd_import *imp);
index 121063a..b1115d4 100644 (file)
@@ -1347,10 +1347,11 @@ struct lustre_md {
 };
 
 struct md_open_data {
-        struct obd_client_handle *mod_och;
-        struct ptlrpc_request    *mod_open_req;
-        struct ptlrpc_request    *mod_close_req;
-        cfs_atomic_t              mod_refcount;
+       struct obd_client_handle        *mod_och;
+       struct ptlrpc_request           *mod_open_req;
+       struct ptlrpc_request           *mod_close_req;
+       cfs_atomic_t                     mod_refcount;
+       bool                             mod_is_create;
 };
 
 struct lookup_intent;
@@ -1454,7 +1455,7 @@ struct md_ops {
 
        int (*m_set_open_replay_data)(struct obd_export *,
                                      struct obd_client_handle *,
-                                     struct ptlrpc_request *);
+                                     struct lookup_intent *);
 
        int (*m_clear_open_replay_data)(struct obd_export *,
                                        struct obd_client_handle *);
index fe1eccc..39c0dd8 100644 (file)
@@ -2080,13 +2080,13 @@ static inline int md_getxattr(struct obd_export *exp,
 }
 
 static inline int md_set_open_replay_data(struct obd_export *exp,
-                                          struct obd_client_handle *och,
-                                          struct ptlrpc_request *open_req)
+                                         struct obd_client_handle *och,
+                                         struct lookup_intent *it)
 {
-        ENTRY;
-        EXP_CHECK_MD_OP(exp, set_open_replay_data);
-        EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
-        RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req));
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, set_open_replay_data);
+       EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+       RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, it));
 }
 
 static inline int md_clear_open_replay_data(struct obd_export *exp,
index fac14db..be3bdd5 100644 (file)
@@ -173,8 +173,7 @@ int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
         fd->fd_mds_och.och_fid   = lli->lli_fid;
         lli->lli_file_data = fd;
         llu_ioepoch_open(lli, body->ioepoch);
-        md_set_open_replay_data(lli->lli_sbi->ll_md_exp,
-                                &fd->fd_mds_och, it->d.lustre.it_data);
+       md_set_open_replay_data(lli->lli_sbi->ll_md_exp, &fd->fd_mds_och, it);
 
         RETURN(0);
 }
index 58c1d73..edcdda4 100644 (file)
@@ -494,7 +494,7 @@ static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
        och->och_flags = it->it_flags;
 
-       return md_set_open_replay_data(md_exp, och, req);
+       return md_set_open_replay_data(md_exp, och, it);
 }
 
 int ll_local_open(struct file *file, struct lookup_intent *it,
index c6b85f1..a06abd3 100644 (file)
@@ -207,7 +207,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
                                  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
                                  OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS |
                                  OBD_CONNECT_MAX_EASIZE |
-                                 OBD_CONNECT_FLOCK_DEAD;
+                                 OBD_CONNECT_FLOCK_DEAD |
+                                 OBD_CONNECT_DISP_STRIPE;
 
         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
index 20fb834..c89dc69 100644 (file)
@@ -2719,19 +2719,19 @@ int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
 }
 
 int lmv_set_open_replay_data(struct obd_export *exp,
-                             struct obd_client_handle *och,
-                             struct ptlrpc_request *open_req)
+                            struct obd_client_handle *och,
+                            struct lookup_intent *it)
 {
-        struct obd_device       *obd = exp->exp_obd;
-        struct lmv_obd          *lmv = &obd->u.lmv;
-        struct lmv_tgt_desc     *tgt;
-        ENTRY;
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
 
-        tgt = lmv_find_target(lmv, &och->och_fid);
-        if (IS_ERR(tgt))
-                RETURN(PTR_ERR(tgt));
+       tgt = lmv_find_target(lmv, &och->och_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
 
-        RETURN(md_set_open_replay_data(tgt->ltd_exp, och, open_req));
+       RETURN(md_set_open_replay_data(tgt->ltd_exp, och, it));
 }
 
 int lmv_clear_open_replay_data(struct obd_export *exp,
index 7a807d7..adacb30 100644 (file)
@@ -121,8 +121,8 @@ int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req,
 int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
 
 int mdc_set_open_replay_data(struct obd_export *exp,
-                             struct obd_client_handle *och,
-                             struct ptlrpc_request *open_req);
+                            struct obd_client_handle *och,
+                            struct lookup_intent *it);
 
 int mdc_clear_open_replay_data(struct obd_export *exp,
                                struct obd_client_handle *och);
index 5d43c47..eabfbbd 100644 (file)
@@ -657,7 +657,7 @@ static int mdc_finish_enqueue(struct obd_export *exp,
                          * happens immediately after swabbing below, new reply
                          * is swabbed by that handler correctly.
                          */
-                        mdc_set_open_replay_data(NULL, NULL, req);
+                       mdc_set_open_replay_data(NULL, NULL, it);
                }
 
                 if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
index b28b332..56e2d74 100644 (file)
@@ -171,6 +171,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
                         req->rq_cb_data = *mod;
                         (*mod)->mod_open_req = req;
                         req->rq_commit_cb = mdc_commit_open;
+                       (*mod)->mod_is_create = true;
                         /**
                          * Take an extra reference on \var mod, it protects \var
                          * mod from being freed on eviction (commit callback is
index ab9a375..aa8e39d 100644 (file)
@@ -741,14 +741,15 @@ void mdc_commit_open(struct ptlrpc_request *req)
 }
 
 int mdc_set_open_replay_data(struct obd_export *exp,
-                             struct obd_client_handle *och,
-                             struct ptlrpc_request *open_req)
-{
-        struct md_open_data   *mod;
-        struct mdt_rec_create *rec;
-        struct mdt_body       *body;
-        struct obd_import     *imp = open_req->rq_import;
-        ENTRY;
+                            struct obd_client_handle *och,
+                            struct lookup_intent *it)
+{
+       struct md_open_data     *mod;
+       struct mdt_rec_create   *rec;
+       struct mdt_body         *body;
+       struct ptlrpc_request   *open_req = it->d.lustre.it_data;
+       struct obd_import       *imp = open_req->rq_import;
+       ENTRY;
 
         if (!open_req->rq_replay)
                 RETURN(0);
@@ -781,6 +782,8 @@ int mdc_set_open_replay_data(struct obd_export *exp,
                spin_lock(&open_req->rq_lock);
                och->och_mod = mod;
                mod->mod_och = och;
+               mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) ||
+                                    it_disposition(it, DISP_OPEN_STRIPE);
                mod->mod_open_req = open_req;
                open_req->rq_cb_data = mod;
                open_req->rq_commit_cb = mdc_commit_open;
@@ -801,6 +804,23 @@ int mdc_set_open_replay_data(struct obd_export *exp,
         RETURN(0);
 }
 
+static void mdc_free_open(struct md_open_data *mod)
+{
+       int committed = 0;
+
+       if (mod->mod_is_create == 0 &&
+           imp_connect_disp_stripe(mod->mod_open_req->rq_import))
+               committed = 1;
+
+       LASSERT(mod->mod_open_req->rq_replay == 0);
+
+       DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n");
+
+       ptlrpc_request_committed(mod->mod_open_req, committed);
+       if (mod->mod_close_req)
+               ptlrpc_request_committed(mod->mod_close_req, committed);
+}
+
 int mdc_clear_open_replay_data(struct obd_export *exp,
                                struct obd_client_handle *och)
 {
@@ -815,6 +835,8 @@ int mdc_clear_open_replay_data(struct obd_export *exp,
                 RETURN(0);
 
         LASSERT(mod != LP_POISON);
+       LASSERT(mod->mod_open_req != NULL);
+       mdc_free_open(mod);
 
         mod->mod_och = NULL;
         och->och_mod = NULL;
@@ -1014,6 +1036,9 @@ int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data,
         if (mod) {
                 if (rc != 0)
                         mod->mod_close_req = NULL;
+               LASSERT(mod->mod_open_req != NULL);
+               mdc_free_open(mod);
+
                 /* Since now, mod is accessed through setattr req only,
                  * thus DW req does not keep a reference on mod anymore. */
                 obd_mod_put(mod);
index 3712ae3..27e8fd8 100644 (file)
@@ -664,7 +664,8 @@ void mdt_mfd_set_mode(struct mdt_file_data *mfd, __u64 mode)
 }
 
 static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
-                        struct mdt_object *o, __u64 flags, int created)
+                       struct mdt_object *o, __u64 flags, int created,
+                       struct ldlm_reply *rep)
 {
         struct ptlrpc_request   *req = mdt_info_req(info);
         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
@@ -692,6 +693,9 @@ static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
                 rc = mdt_create_data(info, p, o);
                 if (rc)
                         RETURN(rc);
+
+               if (exp_connect_flags(req->rq_export) & OBD_CONNECT_DISP_STRIPE)
+                       mdt_set_disposition(info, rep, DISP_OPEN_STRIPE);
         }
 
         CDEBUG(D_INODE, "after open, ma_valid bit = "LPX64" lmm_size = %d\n",
@@ -979,15 +983,15 @@ int mdt_finish_open(struct mdt_thread_info *info,
                                         repbody->valid |= OBD_MD_FLEASIZE;
                         }
                        mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
-                        RETURN(0);
-                }
-        }
+                       RETURN(0);
+               }
+       }
 
-        rc = mdt_mfd_open(info, p, o, flags, created);
+       rc = mdt_mfd_open(info, p, o, flags, created, rep);
        if (!rc)
                mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
 
-        RETURN(rc);
+       RETURN(rc);
 }
 
 extern void mdt_req_from_lcd(struct ptlrpc_request *req,
index 9213b20..e9f95c0 100644 (file)
@@ -1035,6 +1035,8 @@ struct obd_import *class_new_import(struct obd_device *obd)
        CFS_INIT_LIST_HEAD(&imp->imp_replay_list);
        CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
        CFS_INIT_LIST_HEAD(&imp->imp_delayed_list);
+       CFS_INIT_LIST_HEAD(&imp->imp_committed_list);
+       imp->imp_replay_cursor = &imp->imp_committed_list;
        spin_lock_init(&imp->imp_lock);
        imp->imp_last_success_conn = 0;
        imp->imp_state = LUSTRE_IMP_NEW;
index 6356f2a..207c724 100644 (file)
@@ -879,6 +879,7 @@ static const char *obd_connect_names[] = {
        "short_io",
        "pingless",
        "flock_deadlock",
+       "disp_stripe",
        "unknown",
         NULL
 };
index 9c5fc00..7018182 100644 (file)
@@ -2416,6 +2416,39 @@ int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
 }
 EXPORT_SYMBOL(ptlrpc_unregister_reply);
 
+static void ptlrpc_free_request(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_lock);
+       req->rq_replay = 0;
+       spin_unlock(&req->rq_lock);
+
+       if (req->rq_commit_cb != NULL)
+               req->rq_commit_cb(req);
+       cfs_list_del_init(&req->rq_replay_list);
+
+       __ptlrpc_req_finished(req, 1);
+}
+
+/**
+ * the request is committed and dropped from the replay list of its import
+ */
+void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
+{
+       struct obd_import       *imp = req->rq_import;
+
+       spin_lock(&imp->imp_lock);
+       if (cfs_list_empty(&req->rq_replay_list)) {
+               spin_unlock(&imp->imp_lock);
+               return;
+       }
+
+       if (force || req->rq_transno <= imp->imp_peer_committed_transno)
+               ptlrpc_free_request(req);
+
+       spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_request_committed);
+
 /**
  * Iterates through replay_list on import and prunes
  * all requests have transno smaller than last_committed for the
@@ -2426,10 +2459,10 @@ EXPORT_SYMBOL(ptlrpc_unregister_reply);
  */
 void ptlrpc_free_committed(struct obd_import *imp)
 {
-        cfs_list_t *tmp, *saved;
-        struct ptlrpc_request *req;
-        struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
-        ENTRY;
+       struct ptlrpc_request   *req, *saved;
+       struct ptlrpc_request   *last_req = NULL; /* temporary fire escape */
+       bool                     skip_committed_list = true;
+       ENTRY;
 
        LASSERT(imp != NULL);
        LASSERT(spin_is_locked(&imp->imp_lock));
@@ -2445,13 +2478,15 @@ void ptlrpc_free_committed(struct obd_import *imp)
         CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
                imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
                imp->imp_generation);
+
+       if (imp->imp_generation != imp->imp_last_generation_checked)
+               skip_committed_list = false;
+
         imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
         imp->imp_last_generation_checked = imp->imp_generation;
 
-        cfs_list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
-                req = cfs_list_entry(tmp, struct ptlrpc_request,
-                                     rq_replay_list);
-
+       cfs_list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
+                                    rq_replay_list) {
                 /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
                 LASSERT(req != last_req);
                 last_req = req;
@@ -2465,29 +2500,37 @@ void ptlrpc_free_committed(struct obd_import *imp)
                         GOTO(free_req, 0);
                 }
 
-                if (req->rq_replay) {
-                        DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
-                        continue;
-                }
-
                 /* not yet committed */
                 if (req->rq_transno > imp->imp_peer_committed_transno) {
                         DEBUG_REQ(D_RPCTRACE, req, "stopping search");
                         break;
                 }
 
+               if (req->rq_replay) {
+                       DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+                       cfs_list_move_tail(&req->rq_replay_list,
+                                          &imp->imp_committed_list);
+                       continue;
+               }
+
                 DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
                           imp->imp_peer_committed_transno);
 free_req:
-               spin_lock(&req->rq_lock);
-               req->rq_replay = 0;
-               spin_unlock(&req->rq_lock);
-                if (req->rq_commit_cb != NULL)
-                        req->rq_commit_cb(req);
-                cfs_list_del_init(&req->rq_replay_list);
-                __ptlrpc_req_finished(req, 1);
+               ptlrpc_free_request(req);
         }
 
+       if (skip_committed_list)
+               GOTO(out, 0);
+
+       cfs_list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
+                                    rq_replay_list) {
+               LASSERT(req->rq_transno != 0);
+               if (req->rq_import_generation < imp->imp_generation) {
+                       DEBUG_REQ(D_RPCTRACE, req, "free stale open request");
+                       ptlrpc_free_request(req);
+               }
+       }
+out:
         EXIT;
         return;
 }
index 30e8471..db2c547 100644 (file)
@@ -567,20 +567,32 @@ static int import_select_connection(struct obd_import *imp)
  */
 static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
 {
-        struct ptlrpc_request *req;
-        cfs_list_t *tmp;
-
-        if (cfs_list_empty(&imp->imp_replay_list))
-                return 0;
-        tmp = imp->imp_replay_list.next;
-        req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
-        *transno = req->rq_transno;
-        if (req->rq_transno == 0) {
-                DEBUG_REQ(D_ERROR, req, "zero transno in replay");
-                LBUG();
-        }
-
-        return 1;
+       struct ptlrpc_request   *req;
+       cfs_list_t              *tmp;
+
+       /* The requests in committed_list always have smaller transnos than
+        * the requests in replay_list */
+       if (!cfs_list_empty(&imp->imp_committed_list)) {
+               tmp = imp->imp_committed_list.next;
+               req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+               *transno = req->rq_transno;
+               if (req->rq_transno == 0) {
+                       DEBUG_REQ(D_ERROR, req, "zero transno in committed_list");
+                       LBUG();
+               }
+               return 1;
+       }
+       if (!cfs_list_empty(&imp->imp_replay_list)) {
+               tmp = imp->imp_replay_list.next;
+               req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+               *transno = req->rq_transno;
+               if (req->rq_transno == 0) {
+                       DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
+                       LBUG();
+               }
+               return 1;
+       }
+       return 0;
 }
 
 /**
index 1dd4533..266e826 100644 (file)
@@ -114,23 +114,58 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          * imp_lock is being held by ptlrpc_replay, but it's not. it's
          * just a little race...
          */
-        cfs_list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
-                req = cfs_list_entry(tmp, struct ptlrpc_request,
-                                     rq_replay_list);
-
-                /* If need to resend the last sent transno (because a
-                   reconnect has occurred), then stop on the matching
-                   req and send it again. If, however, the last sent
-                   transno has been committed then we continue replay
-                   from the next request. */
-                if (req->rq_transno > last_transno) {
-                        if (imp->imp_resend_replay)
-                                lustre_msg_add_flags(req->rq_reqmsg,
-                                                     MSG_RESENT);
-                        break;
-                }
-                req = NULL;
-        }
+
+       /* Replay all the committed open requests on committed_list first */
+       if (!cfs_list_empty(&imp->imp_committed_list)) {
+               tmp = imp->imp_committed_list.prev;
+               req = cfs_list_entry(tmp, struct ptlrpc_request,
+                                    rq_replay_list);
+
+               /* The last request on committed_list hasn't been replayed */
+               if (req->rq_transno > last_transno) {
+                       /* Since the imp_committed_list is immutable before
+                        * all of it's requests being replayed, it's safe to
+                        * use a cursor to accelerate the search */
+                       imp->imp_replay_cursor = imp->imp_replay_cursor->next;
+
+                       while (imp->imp_replay_cursor !=
+                              &imp->imp_committed_list) {
+                               req = cfs_list_entry(imp->imp_replay_cursor,
+                                                    struct ptlrpc_request,
+                                                    rq_replay_list);
+                               if (req->rq_transno > last_transno)
+                                       break;
+
+                               req = NULL;
+                               imp->imp_replay_cursor =
+                                       imp->imp_replay_cursor->next;
+                       }
+               } else {
+                       /* All requests on committed_list have been replayed */
+                       imp->imp_replay_cursor = &imp->imp_committed_list;
+                       req = NULL;
+               }
+       }
+
+       /* All the requests in committed list have been replayed, let's replay
+        * the imp_replay_list */
+       if (req == NULL) {
+               cfs_list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+                       req = cfs_list_entry(tmp, struct ptlrpc_request,
+                                            rq_replay_list);
+
+                       if (req->rq_transno > last_transno)
+                               break;
+                       req = NULL;
+               }
+       }
+
+       /* If need to resend the last sent transno (because a reconnect
+        * has occurred), then stop on the matching req and send it again.
+        * If, however, the last sent transno has been committed then we 
+        * continue replay from the next request. */
+       if (req != NULL && imp->imp_resend_replay)
+               lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
 
        spin_lock(&imp->imp_lock);
        imp->imp_resend_replay = 0;
index b012490..c78359f 100644 (file)
@@ -10841,6 +10841,31 @@ test_208() {
 }
 run_test 208 "Exclusive open"
 
+test_209() {
+       [[ $($LCTL get_param -n mdc.*.connect_flags) == ~disp_stripe ]] &&
+               skip_env "must have disp_stripe" && return
+
+       touch $DIR/$tfile
+       sync; sleep 5; sync;
+
+       echo 3 > /proc/sys/vm/drop_caches
+       req_before=$(awk '/ptlrpc_cache / { print $2 }' /proc/slabinfo)
+
+       # open/close 500 times
+       for i in $(seq 500); do
+               cat $DIR/$tfile
+       done
+
+       echo 3 > /proc/sys/vm/drop_caches
+       req_after=$(awk '/ptlrpc_cache / { print $2 }' /proc/slabinfo)
+
+       echo "before: $req_before, after: $req_after"
+       [ $((req_after - req_before)) -ge 300 ] &&
+               error "open/close requests are not freed"
+       return 0
+}
+run_test 209 "read-only open/close requests should be freed promptly"
+
 test_212() {
        size=`date +%s`
        size=$((size % 8192 + 1))