From: wangdi Date: Mon, 21 Nov 2011 06:57:03 +0000 (-0800) Subject: LU-868 ptlrpc: Fix the timeout for waiting next replay X-Git-Tag: 2.1.54~1 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=06f41b84901ad3bb4901b51a24e1553f9eeb6f1c LU-868 ptlrpc: Fix the timeout for waiting next replay During recovery, when the server sets the timeout for waiting for the next replay, it should also take into account the net latency (which is added into the timeout) and early replies: if the server sends an early reply for the request, the client may extend its timeout according to the current service time estimate. Signed-off-by: Wang di Change-Id: I23ebf1dc3f525f78573890be26474b2c79c65a6d Reviewed-on: http://review.whamcloud.com/1716 Tested-by: Hudson Reviewed-by: Jinshan Xiong Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 0a3948a..4645d30 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -991,7 +991,8 @@ struct ptlrpc_body { __u32 pb_op_flags; __u32 pb_conn_cnt; __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ - __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ __u32 pb_limit; __u64 pb_slv; /* VBR: pre-versions */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 98629a6..3a98991 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1776,9 +1776,20 @@ static int handle_recovery_req(struct ptlrpc_thread *thread, * Add request timeout to the recovery time so next request from * this client may come in recovery time */ - if (!AT_OFF) - to = lustre_msg_get_timeout(req->rq_reqmsg); - extend_recovery_timer(class_exp2obd(req->rq_export), to); + if (!AT_OFF) { + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + /* If the server sent early reply for this request, + * the client will recalculate 
the timeout according to + * current server estimate service time, so we will + * use the maxium timeout here for waiting the client + * sending the next req */ + to = max((int)at_est2timeout( + at_get(&svc->srv_at_estimate)), + (int)lustre_msg_get_timeout(req->rq_reqmsg)); + /* Add net_latency (see ptlrpc_replay_req) */ + to += lustre_msg_get_service_time(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), to); } reqcopy_put: RETURN(rc); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index de8af5b..485a328 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2650,6 +2650,10 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) /* Readjust the timeout for current conditions */ ptlrpc_at_set_req_timeout(req); + /* Tell server the net_latency, so the server can calculate how long + * it should wait for next replay */ + lustre_msg_set_service_time(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); DEBUG_REQ(D_HA, req, "REPLAY"); cfs_atomic_inc(&req->rq_import->imp_replay_inflight);