#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */
#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */
#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */
+#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/
/* XXX README XXX:
* Please DO NOT add flag values here before first ensuring that this same
OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\
- OBD_CONNECT_FLOCK_DEAD)
+ OBD_CONNECT_FLOCK_DEAD | \
+ OBD_CONNECT_DISP_STRIPE)
+
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
#define DISP_ENQ_CREATE_REF 0x01000000
#define DISP_OPEN_LOCK 0x02000000
#define DISP_OPEN_LEASE 0x04000000
+#define DISP_OPEN_STRIPE 0x08000000
/* INODE LOCK PARTS */
#define MDS_INODELOCK_LOOKUP 0x000001 /* dentry, mode, owner, group */
return false;
}
+/* Returns true when the server this import is connected to negotiated
+ * OBD_CONNECT_DISP_STRIPE, i.e. it reports the stripe-creation
+ * disposition (DISP_OPEN_STRIPE) in open replies; used by
+ * mdc_free_open() to decide if a read-only open needs replay. */
+static inline bool imp_connect_disp_stripe(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd;
+
+ LASSERT(imp != NULL);
+ ocd = &imp->imp_connect_data;
+ return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE;
+}
+
extern struct obd_export *class_conn2export(struct lustre_handle *conn);
extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
cfs_list_t imp_delayed_list;
/** @} */
+ /**
+ * List of requests that are retained for committed open replay. Once
+ * open is committed, open replay request will be moved from the
+ * imp_replay_list into the imp_committed_list.
+ * The imp_replay_cursor is for accelerating searching during replay.
+ * @{
+ */
+ cfs_list_t imp_committed_list;
+ cfs_list_t *imp_replay_cursor;
+ /** @} */
+
/** obd device for this import */
struct obd_device *imp_obd;
* request queues, request management, etc.
* @{
*/
+void ptlrpc_request_committed(struct ptlrpc_request *req, int force);
+
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
struct ptlrpc_client *);
void ptlrpc_cleanup_client(struct obd_import *imp);
};
struct md_open_data {
- struct obd_client_handle *mod_och;
- struct ptlrpc_request *mod_open_req;
- struct ptlrpc_request *mod_close_req;
- cfs_atomic_t mod_refcount;
+ struct obd_client_handle *mod_och;
+ struct ptlrpc_request *mod_open_req;
+ struct ptlrpc_request *mod_close_req;
+ cfs_atomic_t mod_refcount;
+ bool mod_is_create; /* open created/striped an object (DISP_OPEN_CREATE
+ * or DISP_OPEN_STRIPE); such opens must be kept
+ * for replay until committed, see mdc_free_open() */
};
struct lookup_intent;
int (*m_set_open_replay_data)(struct obd_export *,
struct obd_client_handle *,
- struct ptlrpc_request *);
+ struct lookup_intent *);
int (*m_clear_open_replay_data)(struct obd_export *,
struct obd_client_handle *);
}
+/* MD-layer dispatcher for set_open_replay_data; the whole lookup intent
+ * (rather than just the bare open request) is passed down so the
+ * implementation can inspect dispositions such as DISP_OPEN_STRIPE. */
static inline int md_set_open_replay_data(struct obd_export *exp,
- struct obd_client_handle *och,
- struct ptlrpc_request *open_req)
+ struct obd_client_handle *och,
+ struct lookup_intent *it)
{
- ENTRY;
- EXP_CHECK_MD_OP(exp, set_open_replay_data);
- EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
- RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req));
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, set_open_replay_data);
+ EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+ RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, it));
}
static inline int md_clear_open_replay_data(struct obd_export *exp,
fd->fd_mds_och.och_fid = lli->lli_fid;
lli->lli_file_data = fd;
llu_ioepoch_open(lli, body->ioepoch);
- md_set_open_replay_data(lli->lli_sbi->ll_md_exp,
- &fd->fd_mds_och, it->d.lustre.it_data);
+ md_set_open_replay_data(lli->lli_sbi->ll_md_exp, &fd->fd_mds_och, it);
RETURN(0);
}
och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
och->och_flags = it->it_flags;
- return md_set_open_replay_data(md_exp, och, req);
+ return md_set_open_replay_data(md_exp, och, it);
}
int ll_local_open(struct file *file, struct lookup_intent *it,
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS |
OBD_CONNECT_MAX_EASIZE |
- OBD_CONNECT_FLOCK_DEAD;
+ OBD_CONNECT_FLOCK_DEAD |
+ OBD_CONNECT_DISP_STRIPE;
if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
data->ocd_connect_flags |= OBD_CONNECT_SOM;
}
+/* LMV: route set_open_replay_data to the MDC target owning och_fid. */
int lmv_set_open_replay_data(struct obd_export *exp,
- struct obd_client_handle *och,
- struct ptlrpc_request *open_req)
+ struct obd_client_handle *och,
+ struct lookup_intent *it)
{
- struct obd_device *obd = exp->exp_obd;
- struct lmv_obd *lmv = &obd->u.lmv;
- struct lmv_tgt_desc *tgt;
- ENTRY;
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ ENTRY;
- tgt = lmv_find_target(lmv, &och->och_fid);
- if (IS_ERR(tgt))
- RETURN(PTR_ERR(tgt));
+ tgt = lmv_find_target(lmv, &och->och_fid);
+ if (IS_ERR(tgt))
+ RETURN(PTR_ERR(tgt));
- RETURN(md_set_open_replay_data(tgt->ltd_exp, och, open_req));
+ RETURN(md_set_open_replay_data(tgt->ltd_exp, och, it));
}
int lmv_clear_open_replay_data(struct obd_export *exp,
int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
int mdc_set_open_replay_data(struct obd_export *exp,
- struct obd_client_handle *och,
- struct ptlrpc_request *open_req);
+ struct obd_client_handle *och,
+ struct lookup_intent *it);
int mdc_clear_open_replay_data(struct obd_export *exp,
struct obd_client_handle *och);
* happens immediately after swabbing below, new reply
* is swabbed by that handler correctly.
*/
- mdc_set_open_replay_data(NULL, NULL, req);
+ mdc_set_open_replay_data(NULL, NULL, it);
}
if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
req->rq_cb_data = *mod;
(*mod)->mod_open_req = req;
req->rq_commit_cb = mdc_commit_open;
+ (*mod)->mod_is_create = true;
/**
* Take an extra reference on \var mod, it protects \var
* mod from being freed on eviction (commit callback is
}
int mdc_set_open_replay_data(struct obd_export *exp,
- struct obd_client_handle *och,
- struct ptlrpc_request *open_req)
-{
- struct md_open_data *mod;
- struct mdt_rec_create *rec;
- struct mdt_body *body;
- struct obd_import *imp = open_req->rq_import;
- ENTRY;
+ struct obd_client_handle *och,
+ struct lookup_intent *it)
+{
+ struct md_open_data *mod;
+ struct mdt_rec_create *rec;
+ struct mdt_body *body;
+ struct ptlrpc_request *open_req = it->d.lustre.it_data;
+ struct obd_import *imp = open_req->rq_import;
+ ENTRY;
if (!open_req->rq_replay)
RETURN(0);
spin_lock(&open_req->rq_lock);
och->och_mod = mod;
mod->mod_och = och;
+ mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) ||
+ it_disposition(it, DISP_OPEN_STRIPE);
mod->mod_open_req = open_req;
open_req->rq_cb_data = mod;
open_req->rq_commit_cb = mdc_commit_open;
RETURN(0);
}
+/* Release the open (and any matching close) request held for replay.
+ * If the open created nothing (mod_is_create unset) and the server
+ * supports OBD_CONNECT_DISP_STRIPE, the requests need no replay at all
+ * and are force-freed immediately; otherwise they are only freed once
+ * covered by the peer's last committed transno.  Only called after the
+ * request has left replay (rq_replay == 0, asserted below). */
+static void mdc_free_open(struct md_open_data *mod)
+{
+ int committed = 0;
+
+ if (mod->mod_is_create == 0 &&
+ imp_connect_disp_stripe(mod->mod_open_req->rq_import))
+ committed = 1;
+
+ LASSERT(mod->mod_open_req->rq_replay == 0);
+
+ DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n");
+
+ ptlrpc_request_committed(mod->mod_open_req, committed);
+ if (mod->mod_close_req)
+ ptlrpc_request_committed(mod->mod_close_req, committed);
+}
+
int mdc_clear_open_replay_data(struct obd_export *exp,
struct obd_client_handle *och)
{
RETURN(0);
LASSERT(mod != LP_POISON);
+ LASSERT(mod->mod_open_req != NULL);
+ mdc_free_open(mod);
mod->mod_och = NULL;
och->och_mod = NULL;
if (mod) {
if (rc != 0)
mod->mod_close_req = NULL;
+ LASSERT(mod->mod_open_req != NULL);
+ mdc_free_open(mod);
+
/* Since now, mod is accessed through setattr req only,
* thus DW req does not keep a reference on mod anymore. */
obd_mod_put(mod);
}
static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
- struct mdt_object *o, __u64 flags, int created)
+ struct mdt_object *o, __u64 flags, int created,
+ struct ldlm_reply *rep)
{
struct ptlrpc_request *req = mdt_info_req(info);
struct mdt_export_data *med = &req->rq_export->exp_mdt_data;
rc = mdt_create_data(info, p, o);
if (rc)
RETURN(rc);
+
+ if (exp_connect_flags(req->rq_export) & OBD_CONNECT_DISP_STRIPE)
+ mdt_set_disposition(info, rep, DISP_OPEN_STRIPE);
}
CDEBUG(D_INODE, "after open, ma_valid bit = "LPX64" lmm_size = %d\n",
repbody->valid |= OBD_MD_FLEASIZE;
}
mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
- RETURN(0);
- }
- }
+ RETURN(0);
+ }
+ }
- rc = mdt_mfd_open(info, p, o, flags, created);
+ rc = mdt_mfd_open(info, p, o, flags, created, rep);
if (!rc)
mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
- RETURN(rc);
+ RETURN(rc);
}
extern void mdt_req_from_lcd(struct ptlrpc_request *req,
CFS_INIT_LIST_HEAD(&imp->imp_replay_list);
CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
CFS_INIT_LIST_HEAD(&imp->imp_delayed_list);
+ CFS_INIT_LIST_HEAD(&imp->imp_committed_list);
+ imp->imp_replay_cursor = &imp->imp_committed_list;
spin_lock_init(&imp->imp_lock);
imp->imp_last_success_conn = 0;
imp->imp_state = LUSTRE_IMP_NEW;
"short_io",
"pingless",
"flock_deadlock",
+ "disp_stripe",
"unknown",
NULL
};
}
EXPORT_SYMBOL(ptlrpc_unregister_reply);
+/* Drop a request from its import's replay machinery: clear rq_replay,
+ * run the commit callback (e.g. mdc_commit_open), unlink it from the
+ * replay list and release the replay reference.  Callers hold
+ * imp_lock (see ptlrpc_request_committed/ptlrpc_free_committed). */
+static void ptlrpc_free_request(struct ptlrpc_request *req)
+{
+ spin_lock(&req->rq_lock);
+ req->rq_replay = 0;
+ spin_unlock(&req->rq_lock);
+
+ if (req->rq_commit_cb != NULL)
+ req->rq_commit_cb(req);
+ cfs_list_del_init(&req->rq_replay_list);
+
+ __ptlrpc_req_finished(req, 1);
+}
+
+/**
+ * Drop \a req from the replay list of its import once the server has
+ * committed it (rq_transno <= imp_peer_committed_transno), or
+ * unconditionally when \a force is set (used when the server never
+ * needs the request replayed, see mdc_free_open()).  No-op if the
+ * request is not currently on a replay list.
+ */
+void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
+{
+ struct obd_import *imp = req->rq_import;
+
+ spin_lock(&imp->imp_lock);
+ if (cfs_list_empty(&req->rq_replay_list)) {
+ spin_unlock(&imp->imp_lock);
+ return;
+ }
+
+ if (force || req->rq_transno <= imp->imp_peer_committed_transno)
+ ptlrpc_free_request(req);
+
+ spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_request_committed);
+
/**
* Iterates through replay_list on import and prunes
* all requests have transno smaller than last_committed for the
*/
void ptlrpc_free_committed(struct obd_import *imp)
{
- cfs_list_t *tmp, *saved;
- struct ptlrpc_request *req;
- struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
- ENTRY;
+ struct ptlrpc_request *req, *saved;
+ struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+ bool skip_committed_list = true;
+ ENTRY;
LASSERT(imp != NULL);
LASSERT(spin_is_locked(&imp->imp_lock));
CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
imp->imp_generation);
+
+ if (imp->imp_generation != imp->imp_last_generation_checked)
+ skip_committed_list = false;
+
imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
imp->imp_last_generation_checked = imp->imp_generation;
- cfs_list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
- req = cfs_list_entry(tmp, struct ptlrpc_request,
- rq_replay_list);
-
+ cfs_list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
+ rq_replay_list) {
/* XXX ok to remove when 1357 resolved - rread 05/29/03 */
LASSERT(req != last_req);
last_req = req;
GOTO(free_req, 0);
}
- if (req->rq_replay) {
- DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
- continue;
- }
-
/* not yet committed */
if (req->rq_transno > imp->imp_peer_committed_transno) {
DEBUG_REQ(D_RPCTRACE, req, "stopping search");
break;
}
+ if (req->rq_replay) {
+ DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+ cfs_list_move_tail(&req->rq_replay_list,
+ &imp->imp_committed_list);
+ continue;
+ }
+
DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
imp->imp_peer_committed_transno);
free_req:
- spin_lock(&req->rq_lock);
- req->rq_replay = 0;
- spin_unlock(&req->rq_lock);
- if (req->rq_commit_cb != NULL)
- req->rq_commit_cb(req);
- cfs_list_del_init(&req->rq_replay_list);
- __ptlrpc_req_finished(req, 1);
+ ptlrpc_free_request(req);
}
+ if (skip_committed_list)
+ GOTO(out, 0);
+
+ cfs_list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
+ rq_replay_list) {
+ LASSERT(req->rq_transno != 0);
+ if (req->rq_import_generation < imp->imp_generation) {
+ DEBUG_REQ(D_RPCTRACE, req, "free stale open request");
+ ptlrpc_free_request(req);
+ }
+ }
+out:
EXIT;
return;
}
*/
+/* Store in \a *transno the lowest replayable transno on this import,
+ * checking the committed-open list first (its entries carry transnos
+ * no larger than those still on the replay list).  Returns 1 if a
+ * request was found, 0 if both lists are empty.  A zero transno on
+ * either list is a fatal inconsistency (LBUG). */
static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
{
- struct ptlrpc_request *req;
- cfs_list_t *tmp;
-
- if (cfs_list_empty(&imp->imp_replay_list))
- return 0;
- tmp = imp->imp_replay_list.next;
- req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
- *transno = req->rq_transno;
- if (req->rq_transno == 0) {
- DEBUG_REQ(D_ERROR, req, "zero transno in replay");
- LBUG();
- }
-
- return 1;
+ struct ptlrpc_request *req;
+ cfs_list_t *tmp;
+
+ /* The requests in committed_list always have smaller transnos than
+ * the requests in replay_list */
+ if (!cfs_list_empty(&imp->imp_committed_list)) {
+ tmp = imp->imp_committed_list.next;
+ req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+ *transno = req->rq_transno;
+ if (req->rq_transno == 0) {
+ DEBUG_REQ(D_ERROR, req, "zero transno in committed_list");
+ LBUG();
+ }
+ return 1;
+ }
+ if (!cfs_list_empty(&imp->imp_replay_list)) {
+ tmp = imp->imp_replay_list.next;
+ req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+ *transno = req->rq_transno;
+ if (req->rq_transno == 0) {
+ DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
+ LBUG();
+ }
+ return 1;
+ }
+ return 0;
}
/**
* imp_lock is being held by ptlrpc_replay, but it's not. it's
* just a little race...
*/
- cfs_list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
- req = cfs_list_entry(tmp, struct ptlrpc_request,
- rq_replay_list);
-
- /* If need to resend the last sent transno (because a
- reconnect has occurred), then stop on the matching
- req and send it again. If, however, the last sent
- transno has been committed then we continue replay
- from the next request. */
- if (req->rq_transno > last_transno) {
- if (imp->imp_resend_replay)
- lustre_msg_add_flags(req->rq_reqmsg,
- MSG_RESENT);
- break;
- }
- req = NULL;
- }
+
+ /* Replay all the committed open requests on committed_list first */
+ if (!cfs_list_empty(&imp->imp_committed_list)) {
+ tmp = imp->imp_committed_list.prev;
+ req = cfs_list_entry(tmp, struct ptlrpc_request,
+ rq_replay_list);
+
+ /* The last request on committed_list hasn't been replayed */
+ if (req->rq_transno > last_transno) {
+ /* Since the imp_committed_list is immutable before
+ * all of it's requests being replayed, it's safe to
+ * use a cursor to accelerate the search */
+ imp->imp_replay_cursor = imp->imp_replay_cursor->next;
+
+ while (imp->imp_replay_cursor !=
+ &imp->imp_committed_list) {
+ req = cfs_list_entry(imp->imp_replay_cursor,
+ struct ptlrpc_request,
+ rq_replay_list);
+ if (req->rq_transno > last_transno)
+ break;
+
+ req = NULL;
+ imp->imp_replay_cursor =
+ imp->imp_replay_cursor->next;
+ }
+ } else {
+ /* All requests on committed_list have been replayed */
+ imp->imp_replay_cursor = &imp->imp_committed_list;
+ req = NULL;
+ }
+ }
+
+ /* All the requests in committed list have been replayed, let's replay
+ * the imp_replay_list */
+ if (req == NULL) {
+ cfs_list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+ req = cfs_list_entry(tmp, struct ptlrpc_request,
+ rq_replay_list);
+
+ if (req->rq_transno > last_transno)
+ break;
+ req = NULL;
+ }
+ }
+
+ /* If need to resend the last sent transno (because a reconnect
+ * has occurred), then stop on the matching req and send it again.
+ * If, however, the last sent transno has been committed then we
+ * continue replay from the next request. */
+ if (req != NULL && imp->imp_resend_replay)
+ lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
spin_lock(&imp->imp_lock);
imp->imp_resend_replay = 0;
}
run_test 208 "Exclusive open"
+# Verify that read-only open/close RPCs are freed promptly (not held for
+# replay) when the server supports the disp_stripe connect flag: repeat
+# open/close on an existing file and check ptlrpc_cache growth stays low.
+test_209() {
+	# Skip unless disp_stripe was actually negotiated.  The previous
+	# check used '== ~disp_stripe' (tilde expansion + glob equality,
+	# never a substring match) and inverted the skip condition, so it
+	# skipped exactly when the feature WAS present.
+	[[ $($LCTL get_param -n mdc.*.connect_flags) =~ disp_stripe ]] ||
+		{ skip_env "must have disp_stripe"; return 0; }
+
+	touch $DIR/$tfile
+	sync; sleep 5; sync;
+
+	echo 3 > /proc/sys/vm/drop_caches
+	req_before=$(awk '/ptlrpc_cache / { print $2 }' /proc/slabinfo)
+
+	# open/close 500 times
+	for i in $(seq 500); do
+		cat $DIR/$tfile
+	done
+
+	echo 3 > /proc/sys/vm/drop_caches
+	req_after=$(awk '/ptlrpc_cache / { print $2 }' /proc/slabinfo)
+
+	echo "before: $req_before, after: $req_after"
+	[ $((req_after - req_before)) -ge 300 ] &&
+		error "open/close requests are not freed"
+	return 0
+}
+run_test 209 "read-only open/close requests should be freed promptly"
+
test_212() {
size=`date +%s`
size=$((size % 8192 + 1))