From 5940a46bbf0ed2a10108d196c2cfd98c24b44a86 Mon Sep 17 00:00:00 2001
From: alex
Date: Sat, 7 May 2005 15:40:03 +0000
Subject: [PATCH] b=6019 - workaround for cascading timeouts

Workaround for cascading timeouts, where the failure of one server node
causes failures of unrelated connections. This caused various problems,
for example lock cancel timeouts.

---
 Review note: the unused "struct obd_import *imp" local that this patch
 added to expired_request() has been dropped. The client.c hunk headers and
 the diffstat below are adjusted accordingly; the client.c post-image index
 hash is therefore stale.

 lustre/include/linux/lustre_ha.h     |   3 +
 lustre/include/linux/lustre_import.h |   7 +-
 lustre/ldlm/ldlm_lockd.c             |  21 +++++
 lustre/ldlm/ldlm_request.c           |   4 +
 lustre/obdclass/genops.c             |   1 +
 lustre/ptlrpc/client.c               |   9 ++-
 lustre/ptlrpc/import.c               |  12 ++-
 lustre/ptlrpc/niobuf.c               |   1 -
 lustre/ptlrpc/pinger.c               | 164 ++++++++++++++++++++++++++---------
 lustre/ptlrpc/recover.c              |   1 +
 10 files changed, 178 insertions(+), 45 deletions(-)

diff --git a/lustre/include/linux/lustre_ha.h b/lustre/include/linux/lustre_ha.h
index 739a875..ecc6543 100644
--- a/lustre/include/linux/lustre_ha.h
+++ b/lustre/include/linux/lustre_ha.h
@@ -24,5 +24,8 @@ void ptlrpc_deactivate_import(struct obd_import *imp);
 void ptlrpc_invalidate_import(struct obd_import *imp, int in_rpc);
 void ptlrpc_fail_import(struct obd_import *imp, int generation);
 void ptlrpc_fail_export(struct obd_export *exp);
+int ptlrpc_check_suspend(void);
+void ptlrpc_activate_timeouts(void);
+void ptlrpc_deactivate_timeouts(void);
 
 #endif
diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h
index 0f7b8b3..716028d 100644
--- a/lustre/include/linux/lustre_import.h
+++ b/lustre/include/linux/lustre_import.h
@@ -101,10 +101,15 @@ struct obd_import {
                                   imp_dlm_fake:1, imp_server_timeout:1,
                                   imp_initial_recov:1, imp_force_verify:1,
                                   imp_pingable:1, imp_resend_replay:1,
-                                  imp_deactive:1;
+                                  imp_deactive:1,
+                                  imp_waiting_ping_reply:1;
         __u32                     imp_connect_op;
         __u32                     imp_connect_flags;
         struct obd_connect_data   imp_connect_data;
+
+        unsigned long             imp_last_ping_xid;
+        int                       imp_reqs_replayed;
+        int                       imp_locks_replayed;
 };
 
 typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index 0a5d6a1..0634cb7 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -95,6 +95,8 @@ struct ldlm_bl_work_item {
 
 #ifdef __KERNEL__
 
+static int ldlm_add_waiting_lock(struct ldlm_lock *lock);
+
 static inline int have_expired_locks(void)
 {
         int need_to_run;
@@ -179,6 +181,7 @@ static void waiting_locks_callback(unsigned long unused)
         if (obd_dump_on_timeout)
                 portals_debug_dumplog();
 
+repeat:
         spin_lock_bh(&waiting_locks_spinlock);
         while (!list_empty(&waiting_locks_list)) {
                 lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
@@ -188,6 +191,24 @@ static void waiting_locks_callback(unsigned long unused)
                     (lock->l_req_mode == LCK_GROUP))
                         break;
 
+                if (ptlrpc_check_suspend()) {
+                        /* there is a case when we talk to one mds, holding
+                         * lock from another mds. this way we easily can get
+                         * here, if second mds is being recovered. so, we
+                         * suspend timeouts. bug 6019 */
+
+                        LDLM_ERROR(lock, "recharge timeout: %s@%s nid %s ",
+                                   lock->l_export->exp_client_uuid.uuid,
+                                   lock->l_export->exp_connection->c_remote_uuid.uuid,
+                                   ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer, str));
+
+                        list_del_init(&lock->l_pending_chain);
+                        spin_unlock_bh(&waiting_locks_spinlock);
+                        ldlm_add_waiting_lock(lock);
+
+                        goto repeat;
+                }
+
                 LDLM_ERROR(lock, "lock callback timer expired: evicting client "
                            "%s@%s nid %s ",
                            lock->l_export->exp_client_uuid.uuid,
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index 094ae033..90c988a 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -50,6 +50,9 @@ int ldlm_expired_completion_wait(void *data)
         if (lock->l_conn_export == NULL) {
                 static unsigned long next_dump = 0, last_dump = 0;
 
+                if (ptlrpc_check_suspend())
+                        RETURN(0);
+
                 LDLM_ERROR(lock, "lock timed out; not entering recovery in "
                            "server code, just going back to sleep");
                 if (time_after(jiffies, next_dump)) {
@@ -1036,6 +1039,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
 
         LDLM_DEBUG(lock, "replaying lock:");
 
+        imp->imp_locks_replayed++;
         atomic_inc(&req->rq_import->imp_replay_inflight);
         req->rq_async_args.pointer_arg[0] = lock;
         req->rq_interpret_reply = replay_lock_interpret;
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index bb7781b..36ae1e8 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -591,6 +591,7 @@ struct obd_import *class_new_import(void)
         INIT_LIST_HEAD(&imp->imp_conn_list);
         INIT_LIST_HEAD(&imp->imp_handle.h_link);
         class_handle_hash(&imp->imp_handle, import_handle_addref);
+        imp->imp_waiting_ping_reply = 0;
 
         return imp;
 }
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 8bbbf62..a3c8f3b 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1289,8 +1289,12 @@ void ptlrpc_restart_req(struct ptlrpc_request *req)
 static int expired_request(void *data)
 {
         struct ptlrpc_request *req = data;
         ENTRY;
 
+        /* some failure can suspend regular timeouts */
+        if (ptlrpc_check_suspend())
+                RETURN(1);
+
         RETURN(ptlrpc_expire_one_request(req));
 }
 
@@ -1465,9 +1469,12 @@ restart:
                 timeout = MAX(req->rq_timeout * HZ, 1);
                 DEBUG_REQ(D_NET, req, "-- sleeping for %d jiffies", timeout);
         }
+repeat:
         lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
                                req);
-        l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
+        rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
+        if (rc == -ETIMEDOUT && ptlrpc_check_and_wait_suspend(req))
+                goto repeat;
         DEBUG_REQ(D_NET, req, "-- done sleeping");
 
         CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:opc "
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 12e63b3..7cf82f1 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -101,6 +101,7 @@ int ptlrpc_set_import_discon(struct obd_import *imp)
                        imp->imp_obd->obd_name,
                        imp->imp_target_uuid.uuid,
                        imp->imp_connection->c_remote_uuid.uuid);
+                ptlrpc_deactivate_timeouts();
                 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
                 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
@@ -191,6 +192,7 @@ void ptlrpc_activate_import(struct obd_import *imp)
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
+        ptlrpc_activate_timeouts();
 }
 
 void ptlrpc_fail_import(struct obd_import *imp, int generation)
@@ -397,6 +399,8 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
                                      MSG_CONNECT_INITIAL);
                 imp->imp_replayable = 1;
         }
+
+        imp->imp_reqs_replayed = imp->imp_locks_replayed = 0;
         ptlrpcd_add_req(request);
         rc = 0;
 
@@ -459,6 +463,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 imp->imp_conn_cnt = request->rq_repmsg->conn_cnt;
                 imp->imp_remote_handle = request->rq_repmsg->handle;
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+                ptlrpc_pinger_sending_on_import(imp);
                 GOTO(finish, rc = 0);
         }
 
@@ -686,10 +691,13 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                         GOTO(out, rc);
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
                 ptlrpc_activate_import(imp);
-                CWARN("%s: connection restored to %s@%s\n",
+                CWARN("%s: connection restored to %s@%s, "
+                      "%d/%d req/lock replayed\n",
                       imp->imp_obd->obd_name,
                       imp->imp_target_uuid.uuid,
-                      imp->imp_connection->c_remote_uuid.uuid);
+                      imp->imp_connection->c_remote_uuid.uuid,
+                      imp->imp_reqs_replayed,
+                      imp->imp_locks_replayed);
         }
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index af48920..f883a48 100644
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -478,7 +478,6 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         ptlrpc_request_addref(request);       /* +1 ref for the SENT callback */
 
         request->rq_sent = LTIME_S(CURRENT_TIME);
-        ptlrpc_pinger_sending_on_import(request->rq_import);
         rc = ptl_send_buf(&request->rq_req_md_h,
                           request->rq_reqbuf, request->rq_reqdata_len,
                           PTL_NOACK_REQ, &request->rq_req_cbid,
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c
index 89b1191..756cd63 100644
--- a/lustre/ptlrpc/pinger.c
+++ b/lustre/ptlrpc/pinger.c
@@ -35,9 +35,33 @@
 #include
 #include "ptlrpc_internal.h"
 
+#define PINGER_RATE     3 /* how many pings we'll do in obd_timeout period */
+
 static DECLARE_MUTEX(pinger_sem);
 static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports);
 
+static int ptlrpc_ping_interpret(struct ptlrpc_request *req,
+                                 void *data, int rc)
+{
+        struct obd_import *imp = req->rq_import;
+        DEBUG_REQ(D_HA, req, "ping reply");
+        if (imp->imp_waiting_ping_reply == 0)
+                DEBUG_REQ(D_ERROR, req, "late ping reply?");
+        if (imp->imp_last_ping_xid != req->rq_xid)
+                DEBUG_REQ(D_ERROR, req, "uh, wrong ping reply on x%lx",
+                          imp->imp_last_ping_xid);
+        else
+                imp->imp_last_ping_xid = 0;
+
+        /* if ping reply is an error, don't drop "replied" flag
+         * on import, so pinger will invalidate it */
+        if (ptlrpc_client_replied(req) && req->rq_repmsg->type == PTL_RPC_MSG_ERR)
+                return 0;
+
+        imp->imp_waiting_ping_reply = 0;
+        return 0;
+}
+
 int ptlrpc_ping(struct obd_import *imp)
 {
         struct ptlrpc_request *req;
@@ -51,6 +75,10 @@ int ptlrpc_ping(struct obd_import *imp)
                           imp->imp_target_uuid.uuid);
                 req->rq_no_resend = req->rq_no_delay = 1;
                 req->rq_replen = lustre_msg_size(0, NULL);
+                req->rq_interpret_reply = ptlrpc_ping_interpret;
+                req->rq_timeout = obd_timeout / PINGER_RATE;
+                imp->imp_waiting_ping_reply = 1;
+                imp->imp_last_ping_xid = req->rq_xid;
                 ptlrpcd_add_req(req);
         } else {
                 CERROR("OOM trying to ping %s->%s\n",
@@ -63,12 +91,97 @@ int ptlrpc_ping(struct obd_import *imp)
 }
 
 #ifdef __KERNEL__
-int ptlrpc_next_ping(struct obd_import *imp)
+static inline unsigned long ptlrpc_next_ping(struct obd_import *imp)
+{
+        return jiffies + (obd_timeout / PINGER_RATE * HZ);
+}
+
+static inline unsigned long ptlrpc_next_reconnect(struct obd_import *imp)
 {
         if (imp->imp_server_timeout)
-                return jiffies + (obd_timeout / 4 * HZ);
-        else
                 return jiffies + (obd_timeout / 2 * HZ);
+        else
+                return jiffies + (obd_timeout * HZ);
+}
+
+static atomic_t suspend_timeouts = ATOMIC_INIT(0);
+static wait_queue_head_t suspend_timeouts_waitq;
+
+void ptlrpc_deactivate_timeouts(void)
+{
+        CDEBUG(D_HA, "deactivate timeouts\n");
+        atomic_inc(&suspend_timeouts);
+}
+
+void ptlrpc_activate_timeouts(void)
+{
+        CDEBUG(D_HA, "activate timeouts\n");
+        LASSERT(atomic_read(&suspend_timeouts) > 0);
+        if (atomic_dec_and_test(&suspend_timeouts))
+                wake_up(&suspend_timeouts_waitq);
+}
+
+int ptlrpc_check_suspend(void)
+{
+        if (atomic_read(&suspend_timeouts))
+                return 1;
+        return 0;
+}
+
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req)
+{
+        struct l_wait_info lwi;
+
+        if (atomic_read(&suspend_timeouts)) {
+                DEBUG_REQ(D_NET, req, "-- suspend %d regular timeout",
+                          atomic_read(&suspend_timeouts));
+                lwi = LWI_INTR(NULL, NULL);
+                l_wait_event(suspend_timeouts_waitq,
+                             atomic_read(&suspend_timeouts) == 0, &lwi);
+                DEBUG_REQ(D_NET, req, "-- recharge regular timeout");
+                return 1;
+        }
+        return 0;
+}
+
+static void ptlrpc_pinger_process_import(struct obd_import *imp,
+                                         unsigned long this_ping)
+{
+        unsigned long flags;
+        int force, level;
+
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        level = imp->imp_state;
+        force = imp->imp_force_verify;
+        if (force)
+                imp->imp_force_verify = 0;
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        if (imp->imp_next_ping > this_ping && force == 0)
+                return;
+
+        if (level == LUSTRE_IMP_DISCON && !imp->imp_deactive) {
+                /* wait at least a timeout before trying recovery again */
+                imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+                ptlrpc_initiate_recovery(imp);
+        } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov) {
+                CDEBUG(D_HA, "not pinging %s (in recovery "
+                       " or recovery disabled: %s)\n",
+                       imp->imp_target_uuid.uuid,
+                       ptlrpc_import_state_name(level));
+        } else if (level == LUSTRE_IMP_FULL && imp->imp_waiting_ping_reply &&
+                   imp->imp_next_ping >= this_ping && imp->imp_pingable) {
+                CDEBUG(D_HA, "%s: %s hasn't respond on ping x%lu\n",
+                       imp->imp_obd->obd_uuid.uuid,
+                       imp->imp_target_uuid.uuid, imp->imp_last_ping_xid);
+                CDEBUG(D_ERROR, "%s: %s hasn't respond on ping x%lu\n",
+                       imp->imp_obd->obd_uuid.uuid,
+                       imp->imp_target_uuid.uuid, imp->imp_last_ping_xid);
+                ptlrpc_fail_import(imp, 0);
+        } else if (imp->imp_pingable || force) {
+                imp->imp_next_ping = ptlrpc_next_ping(imp);
+                ptlrpc_ping(imp);
+        }
 }
 
 static int ptlrpc_pinger_main(void *arg)
@@ -110,45 +223,13 @@ static int ptlrpc_pinger_main(void *arg)
                         struct obd_import *imp =
                                 list_entry(iter, struct obd_import,
                                            imp_pinger_chain);
-                        int force, level;
-                        unsigned long flags;
-
-
-                        spin_lock_irqsave(&imp->imp_lock, flags);
-                        level = imp->imp_state;
-                        force = imp->imp_force_verify;
-                        if (force)
-                                imp->imp_force_verify = 0;
-                        spin_unlock_irqrestore(&imp->imp_lock, flags);
-
-                        if (imp->imp_next_ping <= this_ping || force) {
-                                if (level == LUSTRE_IMP_DISCON &&
-                                    !imp->imp_deactive) {
-                                        /* wait at least a timeout before
-                                           trying recovery again. */
-                                        imp->imp_next_ping =
-                                                ptlrpc_next_ping(imp);
-                                        ptlrpc_initiate_recovery(imp);
-                                } else if (level != LUSTRE_IMP_FULL ||
-                                           imp->imp_obd->obd_no_recov) {
-                                        CDEBUG(D_HA,
-                                               "not pinging %s (in recovery "
-                                               "or recovery disabled: %s)\n",
-                                               imp->imp_target_uuid.uuid,
-                                               ptlrpc_import_state_name(level));
-                                } else if (imp->imp_pingable || force) {
-                                        ptlrpc_ping(imp);
-                                }
-
-                        } else if (imp->imp_pingable) {
-                                CDEBUG(D_HA, "don't need to ping %s "
-                                       "(%lu > %lu)\n",
-                                       imp->imp_target_uuid.uuid,
-                                       imp->imp_next_ping, this_ping);
-                        }
+
+                        ptlrpc_pinger_process_import(imp, this_ping);
+
                         CDEBUG(D_OTHER, "%s: pingable %d, next_ping %lu(%lu)\n",
                                 imp->imp_target_uuid.uuid,
                                 imp->imp_pingable, imp->imp_next_ping, jiffies);
+
                         if (imp->imp_pingable && imp->imp_next_ping &&
                             imp->imp_next_ping - jiffies < time_to_next_ping &&
                             imp->imp_next_ping > jiffies)
@@ -194,6 +275,8 @@ int ptlrpc_start_pinger(void)
 #endif
         ENTRY;
 
+        LASSERT(obd_timeout > PINGER_RATE);
+
         if (pinger_thread != NULL)
                 RETURN(-EALREADY);
 
@@ -201,6 +284,7 @@ int ptlrpc_start_pinger(void)
         if (pinger_thread == NULL)
                 RETURN(-ENOMEM);
         init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+        init_waitqueue_head(&suspend_timeouts_waitq);
 
         d.name = "ll_ping";
         d.thread = pinger_thread;
@@ -246,7 +330,7 @@ int ptlrpc_stop_pinger(void)
 void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
 {
         down(&pinger_sem);
-        imp->imp_next_ping = jiffies + (obd_timeout * HZ);
+        imp->imp_next_ping = ptlrpc_next_ping(imp);
         up(&pinger_sem);
 }
 
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c
index 6731c7d..a09c8a2 100644
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -214,6 +214,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
                                LPD64"\n", rc, req->rq_xid);
                         RETURN(rc);
                 }
+                imp->imp_reqs_replayed++;
                 *inflight = 1;
         }
         RETURN(rc);
-- 
1.8.3.1