From 1b7dd856aada8006b53b43e329b5c3b79d97f668 Mon Sep 17 00:00:00 2001 From: adilger Date: Fri, 27 Jan 2006 09:29:07 +0000 Subject: [PATCH] Branch b_release_1_4_6 Description: MDS or OST may oops/LBUG if a client is connecting multiple times Details : The client ptlrpc code may be trying to reconnect to a down server before a previous connection attempt has timed out. Increase the reconnect interval to be longer than the connection timeout interval to avoid sending duplicate connections to servers. This first part of the fix simply increases the reconnect interval to be longer than the connection request timeout, to avoid having multiple connect requests in flight from the same client at the same time. b=9635 r=nathan --- lustre/ChangeLog | 14 ++++++++++++-- lustre/include/linux/obd_support.h | 1 + lustre/ptlrpc/niobuf.c | 6 +++++- lustre/ptlrpc/pinger.c | 7 +++++-- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index cf04b63..d6c56ba 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -4,7 +4,7 @@ INTEROPERATE with older versions automatically. Please read the user documentation before upgrading any part of a live system. * WARNING: Lustre networking configuration changes are required with - this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 + this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 for details. * bug fixes @@ -521,6 +521,16 @@ Details : mds_cleanup() and filter_cleanup() need to drop the kernel lock the kernel lock is held, not whether it is this process that is holding it as 2.6 kernels do. +Severity : major +Frequency : rare +Bugzilla : 9635 +Description: MDS or OST may oops/LBUG if a client is connecting multiple times +Details : The client ptlrpc code may be trying to reconnect to a down + server before a previous connection attempt has timed out. + Increase the reconnect interval to be longer than the connection + timeout interval to avoid sending duplicate connections to + servers. + ------------------------------------------------------------------------------ 08-26-2005 Cluster File Systems, Inc. @@ -791,7 +801,7 @@ Severity: : enhancement Bugzilla : 3262, 6359 Description: Attempts to reconnect to servers are now more aggressive. Details : This builds on the enhanced upcall-less recovery that was added - in 1.4.2. When trying to reconnect to servers, clients will + in 1.4.2. When trying to reconnect to servers, clients will now try each server in the failover group every 10 seconds. By default, clients would previously try one server every 25 seconds. diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index ea0c5b2..ba3999d 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -39,6 +39,7 @@ extern unsigned int obd_fail_loc; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_timeout; /* seconds */ #define PING_INTERVAL max(obd_timeout / 4, 1U) +#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U) extern unsigned int ldlm_timeout; extern unsigned int obd_health_check_timeout; extern char obd_lustre_upcall[128]; diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index cfac594..2752ef4 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -339,11 +339,15 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) req->rq_repmsg->status = req->rq_status; req->rq_repmsg->opc = req->rq_reqmsg->opc; - if (req->rq_export == NULL) + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL); else conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + if (conn == NULL) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } atomic_inc (&svc->srv_outstanding_replies); ptlrpc_rs_addref(rs); /* +1 ref for the network */ diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 2aaef4b..78c62ad 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -68,7 +68,8 @@ int ptlrpc_ping(struct obd_import *imp) static void ptlrpc_update_next_ping(struct obd_import *imp) { imp->imp_next_ping = jiffies + HZ * - (imp->imp_state == LUSTRE_IMP_DISCON ? 10 : PING_INTERVAL); + (imp->imp_state == LUSTRE_IMP_DISCON ? RECONNECT_INTERVAL : + PING_INTERVAL); } void ptlrpc_ping_import_soon(struct obd_import *imp) @@ -319,6 +320,7 @@ void ptlrpc_pinger_wake_up() * the current implementation of pinger in liblustre is not optimized */ +#ifdef ENABLE_PINGER static struct pinger_data { int pd_recursion; unsigned long pd_this_ping; /* jiffies */ @@ -464,11 +466,12 @@ out: } static void *pinger_callback = NULL; +#endif /* ENABLE_PINGER */ int ptlrpc_start_pinger(void) { - memset(&pinger_args, 0, sizeof(pinger_args)); #ifdef ENABLE_PINGER + memset(&pinger_args, 0, sizeof(pinger_args)); pinger_callback = liblustre_register_wait_callback(&pinger_check_rpcs, &pinger_args); #endif -- 1.8.3.1