From 401deb5075f9ab7f6c8c1831c56a84b0134e923c Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 1 Jun 2004 15:19:34 +0000 Subject: [PATCH] - mds->lmv->mdc propagate lower timeout down to import we need this because the client's timeout should be longer than inter-mds recovery may take - a few ugly hacks that allow a client to reconnect with its old (before failure took place) UUID - mds_preprw() should use l_dput() instead of f_dput() - SOCKNAL_IO_TIMEOUT has been set to 20 seconds to make discovery of stale connections faster - lconf generates persistent UUIDs for LMV clients: if each new LMV uses a fresh UUID, then each recovered MDS looks like a new client (the target MDS doesn't recognize it as an old one because of the new UUID). if the target MDS gets restarted, then it'll find more clients in LAST_RCVD than it actually had --- lnet/klnds/socklnd/socklnd.h | 2 +- lustre/include/linux/obd.h | 1 + lustre/ldlm/ldlm_lib.c | 14 ++++++++++---- lustre/lmv/lmv_obd.c | 29 ++++++++++++++++++++++++++++- lustre/mdc/mdc_request.c | 5 +++++ lustre/mds/mds_lmv.c | 9 ++++++--- lustre/portals/knals/socknal/socknal.h | 2 +- lustre/ptlrpc/import.c | 25 +++++++++++++++++++++---- lustre/ptlrpc/pinger.c | 5 ++++- lustre/utils/lconf | 6 ++---- 10 files changed, 79 insertions(+), 19 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index bd3c1fb..50ff5ce 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -77,7 +77,7 @@ #define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ /* default vals for runtime tunables */ -#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define SOCKNAL_IO_TIMEOUT 20 /* default comms timeout (seconds) */ #define SOCKNAL_EAGER_ACK 0 /* default eager ack (boolean) */ #define SOCKNAL_TYPED_CONNS 1 /* unidirectional large, bidirectional small?
*/ #define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 12cf9fb..ea6f615 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -425,6 +425,7 @@ struct lmv_obd { int connected; int max_easize; int max_cookiesize; + int server_timeout; }; struct niobuf_local { diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 57d1058..4f113f4 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -339,10 +339,13 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, struct lustre_handle *hdl; hdl = &exp->exp_imp_reverse->imp_remote_handle; /* Might be a re-connect after a partition. */ - if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { +#warning "FIXME ASAP" + memcpy(&hdl->cookie, &conn->cookie, sizeof(conn->cookie)); + if (1 || !memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { CERROR("%s reconnecting\n", cluuid->uuid); conn->cookie = exp->exp_handle.h_cookie; - RETURN(EALREADY); + /*RETURN(EALREADY);*/ + RETURN(0); } else { CERROR("%s reconnecting from %s, " "handle mismatch (ours "LPX64", theirs " @@ -393,6 +396,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (!target || target->obd_stopping || !target->obd_set_up) { CERROR("UUID '%s' is not available for connect\n", str); + GOTO(out, rc = -ENODEV); } @@ -447,7 +451,8 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) } else if (req->rq_reqmsg->conn_cnt == 1) { CERROR("%s reconnected with 1 conn_cnt; cookies not random?\n", cluuid.uuid); - GOTO(out, rc = -EALREADY); +#warning "FIXME ASAP" + /*GOTO(out, rc = -EALREADY);*/ } /* Tell the client if we're in recovery. 
*/ @@ -503,7 +508,8 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) LASSERT(export != NULL); spin_lock_irqsave(&export->exp_lock, flags); - if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) { +#warning "FIXME ASAP" + if (0 && export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) { CERROR("%s: already connected at a higher conn_cnt: %d > %d\n", cluuid.uuid, export->exp_conn_cnt, req->rq_reqmsg->conn_cnt); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 99e229e..824bc0e6 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -106,6 +106,27 @@ static int lmv_connect_fake(struct lustre_handle *conn, RETURN(0); } +void lmv_set_timeouts(struct obd_device *obd) +{ + struct lmv_tgt_desc *tgts; + struct lmv_obd *lmv; + int i; + + lmv = &obd->u.lmv; + if (lmv->server_timeout == 0) + return; + + if (lmv->connected == 0) + return; + + for (i = 0, tgts = lmv->tgts; i < lmv->count; i++, tgts++) { + if (tgts->exp == NULL) + continue; + obd_set_info(tgts->exp, strlen("inter_mds"), + "inter_mds", 0, NULL); + } +} + int lmv_connect(struct obd_device *obd) { struct lmv_obd *lmv = &obd->u.lmv; @@ -180,6 +201,8 @@ int lmv_connect(struct obd_device *obd) atomic_read(&obd->obd_refcount)); } + lmv_set_timeouts(obd); + class_export_put(exp); RETURN (0); @@ -1126,18 +1149,22 @@ int lmv_set_info(struct obd_export *exp, obd_count keylen, RETURN(-EINVAL); } lmv = &obd->u.lmv; - lmv_connect(obd); if (keylen >= strlen("client") && strcmp(key, "client") == 0) { struct lmv_tgt_desc *tgts; int i, rc; + lmv_connect(obd); for (i = 0, tgts = lmv->tgts; i < lmv->count; i++, tgts++) { rc = obd_set_info(tgts->exp, keylen, key, vallen, val); if (rc) RETURN(rc); } RETURN(0); + } else if (keylen >= strlen("inter_mds") && strcmp(key, "inter_mds") == 0) { + lmv->server_timeout = 1; + lmv_set_timeouts(obd); + RETURN(0); } RETURN(-EINVAL); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index f6fdd32..90665c1 100644 --- 
a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -707,6 +707,11 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen, rc = ptlrpc_queue_wait(req); ptlrpc_req_finished(req); RETURN(rc); + } else if (keylen >= strlen("inter_mds") && strcmp(key, "inter_mds") == 0) { + struct obd_import *imp = class_exp2cliimp(exp); + imp->imp_server_timeout = 1; + CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name); + RETURN(0); } RETURN(rc); diff --git a/lustre/mds/mds_lmv.c b/lustre/mds/mds_lmv.c index 9ea4849..863dba5 100644 --- a/lustre/mds/mds_lmv.c +++ b/lustre/mds/mds_lmv.c @@ -99,6 +99,10 @@ int mds_lmv_connect(struct obd_device *obd, char * lmv_name) GOTO(err_reg, rc); mds->mds_num = mdsize; + rc = obd_set_info(mds->mds_lmv_exp, strlen("inter_mds"), + "inter_mds", 0, NULL); + if (rc) + GOTO(err_reg, rc); RETURN(0); err_reg: @@ -518,7 +522,6 @@ static int filter_start_page_write(struct inode *inode, struct dentry *filter_fid2dentry(struct obd_device *obd, struct dentry *dir_dentry, obd_gr group, obd_id id); -void f_dput(struct dentry *dentry); int mds_preprw(int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, @@ -547,7 +550,7 @@ int mds_preprw(int cmd, struct obd_export *exp, struct obdo *oa, if (dentry->d_inode == NULL) { CERROR("trying to BRW to non-existent file "LPU64"\n", obj->ioo_id); - f_dput(dentry); + l_dput(dentry); GOTO(cleanup, rc = -ENOENT); } @@ -571,7 +574,7 @@ int mds_preprw(int cmd, struct obd_export *exp, struct obdo *oa, i, obj->ioo_bufcnt, dentry, rc); while (lnb-- > res) __free_pages(lnb->page, 0); - f_dput(dentry); + l_dput(dentry); GOTO(cleanup, rc); } tot_bytes += lnb->len; diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index bd3c1fb..50ff5ce 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -77,7 +77,7 @@ #define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ /* 
default vals for runtime tunables */ -#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define SOCKNAL_IO_TIMEOUT 20 /* default comms timeout (seconds) */ #define SOCKNAL_EAGER_ACK 0 /* default eager ack (boolean) */ #define SOCKNAL_TYPED_CONNS 1 /* unidirectional large, bidirectional small? */ #define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 374e46e..4b5fad3 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -148,6 +148,7 @@ void ptlrpc_deactivate_import(struct obd_import *imp) void ptlrpc_invalidate_import(struct obd_import *imp, int in_rpc) { struct l_wait_info lwi; + unsigned long timeout; int inflight = 0; int rc; @@ -160,8 +161,12 @@ void ptlrpc_invalidate_import(struct obd_import *imp, int in_rpc) inflight = 1; /* wait for all requests to error out and call completion callbacks */ - lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), NULL, - NULL, NULL); + if (imp->imp_server_timeout) + timeout = obd_timeout / 2; + else + timeout = obd_timeout; + timeout = MAX(timeout * HZ, 1); + lwi = LWI_TIMEOUT_INTR(timeout, NULL, NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == inflight), &lwi); @@ -441,6 +446,13 @@ finish: if (aa->pcaa_initial_connect && !imp->imp_initial_recov) { ptlrpc_deactivate_import(imp); } + /*if (rc == -ETIMEDOUT) { + CDEBUG(D_ERROR, "recovery of %s on %s failed (timeout)\n", + imp->imp_target_uuid.uuid, + (char *)imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp, NULL); + RETURN(0); + }*/ CDEBUG(D_ERROR, "recovery of %s on %s failed (%d)\n", imp->imp_target_uuid.uuid, (char *)imp->imp_connection->c_remote_uuid.uuid, rc); @@ -572,8 +584,13 @@ int ptlrpc_disconnect_import(struct obd_import *imp) if (ptlrpc_import_in_recovery(imp)) { struct l_wait_info lwi; - lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), back_to_sleep, - NULL, NULL); + unsigned long 
timeout; + if (imp->imp_server_timeout) + timeout = obd_timeout / 2; + else + timeout = obd_timeout; + timeout = MAX(timeout * HZ, 1); + lwi = LWI_TIMEOUT_INTR(timeout, back_to_sleep, NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 01d7d23..f1e68e7 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -337,8 +337,11 @@ static int pinger_check_rpcs(void *arg) if (level == LUSTRE_IMP_DISCON) { /* wait at least a timeout before trying recovery again. */ + unsigned long timeout = obd_timeout; + if (imp->imp_server_timeout) + timeout = obd_timeout / 2; imp->imp_next_ping = time(NULL) + - (obd_timeout * HZ); + (timeout * HZ); ptlrpc_initiate_recovery(imp); } else if (level != LUSTRE_IMP_FULL || diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 7c48e1b..a1d13a7 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1380,9 +1380,6 @@ class LMV(Module): if name_override != None: self.name = "lmv_%s" % name_override self.add_lustre_module('lmv', 'lmv') - self.mds_uuid = self.db.get_first_ref('mds') - mds = self.db.lookup(self.mds_uuid) - self.lmv_name = mds.getName() self.devlist = self.db.get_refs('mds') self.mdclist = [] self.desc_uuid = self.uuid @@ -1409,7 +1406,6 @@ class LMV(Module): except CommandError, e: print "Error preparing LMV %s\n" % mdc.uuid raise e - self.info(self.mds_uuid) lctl.lmv_setup(self.name, self.uuid, self.desc_uuid, string.join(self.devlist)) @@ -1517,6 +1513,7 @@ class MDSDEV(Module): # setup LMV if self.master_mds: client_uuid = generate_client_uuid(self.name) + client_uuid = self.name + "_lmv_" + "UUID" self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name) self.master_mds = self.master.name # modules @@ -1524,6 +1521,7 @@ class MDSDEV(Module): self.add_lustre_module('osc', 'osc') self.add_lustre_module('lov', 'lov') self.add_lustre_module('lmv', 'lmv') +
self.add_lustre_module('ost', 'ost') self.add_lustre_module('mds', 'mds') if self.fstype: self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype)) -- 1.8.3.1