From: shaver Date: Mon, 28 Oct 2002 02:49:56 +0000 (+0000) Subject: Short version: replaying create and rename works now, including all the fixups X-Git-Tag: 0.5.16~111 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=320a34f3973da42af1b235ca5d6c52216f9f5303;p=fs%2Flustre-release.git Short version: replaying create and rename works now, including all the fixups you could ever, ever want. (OK, except lock replay, but that's easy now.) Single-client recovery should now be much, much more robust. - Added LDLM_FL_INTENT_ONLY to avoid acquiring locks during intent-lock replay. - Fix leak of open requests that was due to a misunderstanding (mine) of the way that the ptlrpc_request refcounting code (mine) really worked. - Store FID data after create, so that we can fix up the world afterwards. - Poor man's lustre_spare_buf support through simple over-allocation of request buffers. - Cancel unused locks when we lose contact with an MDS. - When replaying create, fix up to-be-replayed and delayed-for-recovery requests to contain the new FID generation, in addition to the actual in-memory inode. - Don't sweat a "failed" return code for replayed requests, but do squeak a bit if the old and new statuses don't match up (virtually guaranteed, in fact, but I'll tune that down later). - Skip no-transno requests during replay, since they don't affect MDS state. 
--- diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index 187605f..5798815 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -40,6 +40,7 @@ typedef enum { #define LDLM_FL_WAIT_NOREPROC (1 << 7) #define LDLM_FL_CANCEL (1 << 8) #define LDLM_FL_REPLAY (1 << 9) +#define LDLM_FL_INTENT_ONLY (1 << 10) /* don't grant lock, just do intent */ #define LDLM_CB_BLOCKING 1 #define LDLM_CB_CANCELING 2 diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index 8b292ae..7eb76c7 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -189,6 +189,9 @@ int mdc_rename(struct lustre_handle *conn, struct ptlrpc_request **); int mdc_create_client(obd_uuid_t uuid, struct ptlrpc_client *cl); +void mdc_store_create_replay_data(struct ptlrpc_request *req, + struct super_block *sb); + extern int mds_client_add(struct mds_export_data *med, int cl_off); extern int mds_client_free(struct obd_export *exp); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 7778e01..e60ce2b 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -105,10 +105,8 @@ static int ll_file_open(struct inode *inode, struct file *file) file->f_flags, lsm, &fd->fd_mdshandle, &req); fd->fd_req = req; - /* We don't call ptlrpc_req_finished here, because the request is - * preserved until we see a matching close, at which point it is - * released (and likely freed). (See ll_file_release.) - */ + /* This is the "reply" refcount. 
*/ + ptlrpc_req_finished(req); if (rc) GOTO(out_req, -abs(rc)); if (!fd->fd_mdshandle.addr || diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 188b7e1..7264964 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -218,6 +218,7 @@ static struct dentry *ll_lookup2(struct inode *dir, struct dentry *dentry, ino = lic.lic_body->fid1.id; mode = lic.lic_body->mode; if (it->it_op & (IT_CREAT | IT_MKDIR | IT_SYMLINK | IT_MKNOD)) { + mdc_store_create_replay_data(request, dir->i_sb); /* For create ops, we want the lookup to be negative, * unless the create failed in a way that indicates * that the file is already there */ diff --git a/lustre/llite/recover.c b/lustre/llite/recover.c index fae9718..3692042 100644 --- a/lustre/llite/recover.c +++ b/lustre/llite/recover.c @@ -69,7 +69,7 @@ static void abort_inflight_for_import(struct obd_import *imp) } } -static void prepare_ost(struct obd_import *imp) +static void prepare_osc(struct obd_import *imp) { int rc; struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; @@ -109,6 +109,12 @@ static void prepare_ost(struct obd_import *imp) } } +static void prepare_mdc(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + ldlm_cli_cancel_unused(ns, NULL, 1 /* local only */); +} + static int ll_prepare_recovery(struct ptlrpc_connection *conn) { struct list_head *tmp; @@ -118,17 +124,24 @@ static int ll_prepare_recovery(struct ptlrpc_connection *conn) imp_chain); if (imp->imp_obd->obd_type->typ_ops->o_brw) - prepare_ost(imp); + prepare_osc(imp); + else + prepare_mdc(imp); } return ptlrpc_run_recovery_upcall(conn); } -static void reconnect_ost(struct obd_import *imp) +static void reconnect_osc(struct obd_import *imp) { (void)ptlrpc_reconnect_import(imp, OST_CONNECT); } +static int reconnect_mdc(struct obd_import *imp) +{ + return ptlrpc_reconnect_import(imp, MDS_CONNECT); +} + static int ll_reconnect(struct ptlrpc_connection *conn) { struct list_head *tmp; @@ -145,12 +158,11 @@ 
static int ll_reconnect(struct ptlrpc_connection *conn) imp_chain); if (imp->imp_obd->obd_type->typ_ops->o_brw) { /* XXX what to do if we fail? */ - reconnect_ost(imp); + reconnect_osc(imp); } else { - int rc = ptlrpc_reconnect_import(imp, MDS_CONNECT); + int rc = reconnect_mdc(imp); if (!rc) { need_replay = 1; - /* XXX obd_cancel_unused */ } /* make sure we don't try to replay for dead imps? * diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 17787276..4478db2 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -196,6 +196,176 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, RETURN(0); } +struct create_replay_data { + struct super_block *sb; + u32 generation; +}; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +static int create_replay_find_inode(struct inode *inode, unsigned long ino, + void *opaque) +#else +static int create_replay_find_inode(struct inode *inode, void *opaque) +#endif +{ + struct ptlrpc_request *req = opaque; + struct create_replay_data *saved; + struct mds_body *body; + + saved = lustre_msg_buf(req->rq_reqmsg, 5); /* lock with intent */ + + if (saved->generation != inode->i_generation) { + CDEBUG(D_HA, + "generation mismatch for ino %u: saved %u != inode %u\n", + inode->i_ino, saved->generation, inode->i_generation); + return 0; + } + + body = lustre_msg_buf(req->rq_repmsg, 1); + + /* XXX do I need more out of ll_update_inode? 
*/ + CDEBUG(D_HA, "updating inode %u generation %u to %u\n", + inode->i_ino, inode->i_generation, body->generation); + + inode->i_generation = body->generation; + + return 1; +} + +static void fixup_req_for_recreate(struct ptlrpc_request *fixreq, + struct ptlrpc_request *req, + struct inode *inode) +{ + struct ldlm_request *lockreq; + struct mds_rec_link *rec; /* representative, two-fid op structure */ + int opc; + + if (fixreq->rq_import != req->rq_import) { + DEBUG_REQ(D_HA, fixreq, "import mismatch, skipping"); + return; + } + + DEBUG_REQ(D_HA, fixreq, "fixing"); + + /* XXX check replay_state to see if we'll actually replay. */ + + /* We only care about LDLM_ENQUEUE and MDS_REINT requests. */ + if (fixreq->rq_reqmsg->opc == LDLM_ENQUEUE) { + lockreq = lustre_msg_buf(fixreq->rq_reqmsg, 0); + + if (lockreq->lock_desc.l_resource.lr_type != LDLM_MDSINTENT) { + DEBUG_REQ(D_HA, fixreq, "non-intent lock, skipping"); + return; + } + + if (fixreq->rq_reqmsg->bufcount < 2) { + DEBUG_REQ(D_HA, fixreq, + "short intent (probably readdir), skipping"); + return; + } + + /* XXX endianness is probably very very wrong here. Very. */ + rec = lustre_msg_buf(fixreq->rq_reqmsg, 2); + } else if (fixreq->rq_reqmsg->opc == MDS_REINT) { + rec = lustre_msg_buf(fixreq->rq_reqmsg, 0); + } else if (fixreq->rq_reqmsg->opc == MDS_OPEN) { + struct mds_body *body = lustre_msg_buf(fixreq->rq_reqmsg, 0); + DEBUG_REQ(D_HA, fixreq, "fixing fid1: %u -> %u", + body->fid1.generation, inode->i_generation); + body->fid1.generation = inode->i_generation; + return; + } else { + DEBUG_REQ(D_HA, fixreq, "not a replayable request, skipping"); + return; + } + + if (rec->lk_fid1.id == inode->i_ino) { + DEBUG_REQ(D_HA, fixreq, "fixing fid1: %u -> %u", + rec->lk_fid1.generation, inode->i_generation); + rec->lk_fid1.generation = inode->i_generation; + } + + /* Some ops have two FIDs. ZZZ We rely on the identical + * placement of that second FID in all such ops' messages. 
+ */ + opc = rec->lk_opcode & REINT_OPCODE_MASK; + if ((opc == REINT_LINK || opc == REINT_UNLINK || + opc == REINT_RENAME) && + rec->lk_fid2.id == inode->i_ino) { + DEBUG_REQ(D_HA, fixreq, "fixing fid2: %u -> %u", + rec->lk_fid2.generation, inode->i_generation); + rec->lk_fid2.generation = inode->i_generation; + } +} + +static void mdc_replay_create(struct ptlrpc_request *req) +{ + struct create_replay_data *saved; + struct mds_body *body; + struct inode *inode; + struct list_head *tmp; + + if (req->rq_reqmsg->opc == MDS_REINT) + LBUG(); /* XXX don't handle the non-intent case yet */ + + body = lustre_msg_buf(req->rq_repmsg, 1); + saved = lustre_msg_buf(req->rq_reqmsg, 5); /* lock with intent */ + + CDEBUG(D_HA, "create of inode %d replayed; gen %u -> %u\n", + body->fid1.id, saved->generation, body->generation); + /* XXX cargo-culted right out of ll_iget */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + inode = iget4(saved->sb, body->fid1.id, create_replay_find_inode, req); +#else + { + extern int ll_read_inode2(struct inode *inode, void *opaque); + inode = iget5_locked(saved->sb, body->fid1.id, + create_replay_find_inode, req); + + if (!inode) + LBUG(); /* XXX ick */ + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + } +#endif + + /* Now that we've updated the generation, we need to go and find all + * the other requests that refer to this file and will be replayed, + * and teach them about our new generation. 
+ */ + list_for_each(tmp, &req->rq_connection->c_sending_head) { + struct ptlrpc_request *fixreq = + list_entry(tmp, struct ptlrpc_request, rq_list); + + fixup_req_for_recreate(fixreq, req, inode); + } + + list_for_each(tmp, &req->rq_connection->c_delayed_head) { + struct ptlrpc_request *fixreq = + list_entry(tmp, struct ptlrpc_request, rq_list); + + fixup_req_for_recreate(fixreq, req, inode); + } +} + +void mdc_store_create_replay_data(struct ptlrpc_request *req, + struct super_block *sb) +{ + struct create_replay_data *saved = + lustre_msg_buf(req->rq_reqmsg, 5); + struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + + + if (req->rq_reqmsg->opc == MDS_REINT) + LBUG(); /* XXX don't handle the non-intent case yet */ + + saved->generation = body->generation; + saved->sb = sb; /* XXX is this safe? */ + + req->rq_replay_cb = mdc_replay_create; +} + int mdc_enqueue(struct lustre_handle *conn, int lock_type, struct lookup_intent *it, int lock_mode, struct inode *dir, struct dentry *de, struct lustre_handle *lockh, @@ -204,13 +374,14 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, struct ptlrpc_request *req; struct obd_device *obddev = class_conn2obd(conn); __u64 res_id[RES_NAME_SIZE] = {dir->i_ino}; - int size[5] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; + int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; int rc, flags = 0; int repsize[3] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), obddev->u.cli.cl_max_mds_easize}; struct ldlm_reply *dlm_rep; struct ldlm_intent *lit; + struct ldlm_request *lockreq; ENTRY; LDLM_DEBUG_NOLOCK("mdsintent %s dir %ld", ldlm_it2str(it->it_op), @@ -234,7 +405,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, size[2] = sizeof(struct mds_rec_create); size[3] = de->d_name.len + 1; size[4] = tgtlen + 1; - req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 5, + size[5] = sizeof(struct create_replay_data); + req = 
ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 6, size, NULL); if (!req) RETURN(-ENOMEM); @@ -357,6 +529,10 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, RETURN(rc); } + /* On replay, we don't want the lock granted. */ + lockreq = lustre_msg_buf(req->rq_reqmsg, 0); + lockreq->lock_flags |= LDLM_FL_INTENT_ONLY; + dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); it->it_disposition = (int) dlm_rep->lock_policy_res1; it->it_status = (int) dlm_rep->lock_policy_res2; @@ -595,5 +771,7 @@ EXPORT_SYMBOL(mdc_setattr); EXPORT_SYMBOL(mdc_close); EXPORT_SYMBOL(mdc_open); +EXPORT_SYMBOL(mdc_store_create_replay_data); + module_init(ptlrpc_request_init); module_exit(ptlrpc_request_exit); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 57919c6..cb340bc 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1415,6 +1415,11 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie, } } + + if (flags & LDLM_FL_INTENT_ONLY) { + LDLM_DEBUG(lock, "INTENT_ONLY, aborting lock"); + RETURN(ELDLM_LOCK_ABORTED); + } /* Give the client a lock on the child object, instead of the * parent that it requested. 
*/ new_resid[0] = NTOH__u32(mds_rep->ino); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 7ef9ff1..ee6bd63 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -649,15 +649,13 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) int ptlrpc_replay_req(struct ptlrpc_request *req) { - int rc = 0, old_level; + int rc = 0, old_level, old_status; // struct ptlrpc_client *cli = req->rq_import->imp_client; struct l_wait_info lwi; ENTRY; init_waitqueue_head(&req->rq_wait_for_rep); - CDEBUG(D_NET, "req "LPD64" opc %d level %d, conn level %d\n", - req->rq_xid, req->rq_reqmsg->opc, req->rq_level, - req->rq_connection->c_level); + DEBUG_REQ(D_NET, req, ""); req->rq_timeout = obd_timeout; req->rq_reqmsg->addr = req->rq_import->imp_handle.addr; @@ -665,6 +663,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) /* temporarily set request to RECOVD level (reset at out:) */ old_level = req->rq_level; + old_status = req->rq_repmsg->status; req->rq_level = LUSTRE_CONN_RECOVD; rc = ptl_send_rpc(req); if (rc) { @@ -700,15 +699,9 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) if (req->rq_replay_cb) req->rq_replay_cb(req); - if (req->rq_repmsg->status == 0) { - CDEBUG(D_NET, "--> buf %p len %d status %d\n", req->rq_repmsg, - req->rq_replen, req->rq_repmsg->status); - } else { - CERROR("recovery failed: "); - CERROR("req "LPD64" opc %d level %d, conn level %d\n", - req->rq_xid, req->rq_reqmsg->opc, req->rq_level, - req->rq_connection->c_level); - LBUG(); + if (req->rq_repmsg->status != old_status) { + DEBUG_REQ(D_HA, req, "status %d, old was %d", + req->rq_repmsg->status, old_status); } out: diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 272ffb9..060258f 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -108,11 +108,12 @@ int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn) RETURN(0); } -#define REPLAY_COMMITTED 0 /* Fully processed (commit + reply) */ -#define REPLAY_REPLAY 1 /* Forced-replay 
(e.g. open) */ +#define REPLAY_COMMITTED 0 /* Fully processed (commit + reply). */ +#define REPLAY_REPLAY 1 /* Forced-replay (e.g. open). */ #define REPLAY_RESEND 2 /* Resend required. */ -#define REPLAY_RESEND_IGNORE 3 /* Resend, ignore the reply (already saw it) */ +#define REPLAY_RESEND_IGNORE 3 /* Resend, ignore the reply (already saw it). */ #define REPLAY_RESTART 4 /* Have to restart the call, sorry! */ +#define REPLAY_NO_STATE 5 /* Request doesn't change MDS state: skip. */ static int replay_state(struct ptlrpc_request *req, __u64 last_xid) { @@ -123,6 +124,12 @@ static int replay_state(struct ptlrpc_request *req, __u64 last_xid) /* Uncommitted request */ if (req->rq_xid > last_xid) { if (req->rq_flags & PTL_RPC_FL_REPLIED) { + if (req->rq_transno == 0) { + /* If no transno was returned, no state was + altered on the MDS. */ + return REPLAY_NO_STATE; + } + /* Saw reply, so resend and ignore new reply. */ return REPLAY_RESEND_IGNORE; } @@ -141,7 +148,8 @@ static int replay_state(struct ptlrpc_request *req, __u64 last_xid) static char *replay_state2str(int state) { static char *state_strings[] = { - "COMMITTED", "REPLAY", "RESEND", "RESEND_IGNORE", "RESTART" + "COMMITTED", "REPLAY", "RESEND", "RESEND_IGNORE", "RESTART", + "NO_STATE" }; static char *unknown_state = "UNKNOWN"; @@ -206,6 +214,11 @@ int ptlrpc_replay(struct ptlrpc_connection *conn) /* XXX commit now? */ break; + case REPLAY_NO_STATE: + DEBUG_REQ(D_HA, req, "NO_STATE:"); + /* XXX commit now? */ + break; + case REPLAY_RESEND_IGNORE: DEBUG_REQ(D_HA, req, "RESEND_IGNORE:"); rc = ptlrpc_replay_req(req);