From b521d27574716cef7e5b0b3526859779f4edbc77 Mon Sep 17 00:00:00 2001 From: adilger Date: Mon, 12 Apr 2004 21:29:37 +0000 Subject: [PATCH] Fix resend race during recovery (don't set rq_timeout = 0). b=2950 r=robert --- lustre/include/linux/lustre_net.h | 2 +- lustre/include/linux/obd.h | 6 +++--- lustre/ptlrpc/client.c | 18 +++++++++++++++--- lustre/ptlrpc/events.c | 2 +- lustre/ptlrpc/niobuf.c | 1 + 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 23e72f6..43d5445 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -267,7 +267,7 @@ struct ptlrpc_request { unsigned int rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, - rq_no_delay:1; + rq_no_delay:1, rq_net_err:1; int rq_phase; /* client-side refcount for SENT race */ atomic_t rq_refcount; diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index b5d47f1..185424e 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -463,9 +463,9 @@ struct obd_device { struct obd_uuid obd_uuid; int obd_minor; - int obd_attached:1, obd_set_up:1, obd_recovering:1, - obd_abort_recovery:1, obd_replayable:1, obd_no_transno:1, - obd_no_recov:1, obd_stopping:1; + unsigned int obd_attached:1, obd_set_up:1, obd_recovering:1, + obd_abort_recovery:1, obd_replayable:1, obd_no_transno:1, + obd_no_recov:1, obd_stopping:1; atomic_t obd_refcount; wait_queue_head_t obd_refcount_waitq; struct proc_dir_entry *obd_proc_entry; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 70db906..45bae96 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -406,6 +406,13 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) DEBUG_REQ(D_NET, req, "REPLIED:"); GOTO(out, rc = 1); } + + if (req->rq_net_err && !req->rq_timedout) { + spin_unlock_irqrestore (&req->rq_lock, flags); + ptlrpc_expire_one_request(req); + spin_lock_irqsave (&req->rq_lock, flags); + GOTO(out, rc = 0); + } if (req->rq_err) { DEBUG_REQ(D_ERROR, req, "ABORTED:"); @@ -583,7 +590,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) rc = ptl_send_rpc(req); if (rc) { DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); - req->rq_timeout = 1; + req->rq_net_err = 1; RETURN(rc); } RETURN(0); @@ -656,6 +663,10 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) } if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req); + continue; + } if (req->rq_waiting || req->rq_resend) { int status; @@ -713,7 +724,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) DEBUG_REQ(D_HA, req, "send failed (%d)", rc); force_timer_recalc = 1; - req->rq_timeout = 0; + req->rq_net_err = 1; } /* need to reset the timeout */ force_timer_recalc = 1; @@ -1203,10 +1214,11 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) spin_lock_irqsave (&req->rq_lock, flags); req->rq_resend = 1; + req->rq_net_err = 0; req->rq_timedout = 0; if (req->rq_bulk) { __u64 old_xid = req->rq_xid; - + /* ensure previous bulk fails */ req->rq_xid = ptlrpc_next_xid(); CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n", diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 343ccba..3b366b3 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -57,7 +57,7 @@ void request_out_callback(ptl_event_t *ev) * like failing sends in client.c does currently... */ spin_lock_irqsave(&req->rq_lock, flags); - req->rq_timeout = 0; + req->rq_net_err = 1; spin_unlock_irqrestore(&req->rq_lock, flags); ptlrpc_wake_client_req(req); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index bb23f3f..4eff5c5 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -458,6 +458,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_replied = 0; request->rq_err = 0; request->rq_timedout = 0; + request->rq_net_err = 0; request->rq_resend = 0; request->rq_restart = 0; spin_unlock_irqrestore (&request->rq_lock, flags); -- 1.8.3.1