Whamcloud - gitweb
LU-5116 ptlrpc: race at req processing 71/10471/2
authorAlexander.Boyko <alexander_boyko@xyratex.com>
Wed, 28 May 2014 17:52:13 +0000 (21:52 +0400)
committerOleg Drokin <oleg.drokin@intel.com>
Fri, 30 May 2014 03:58:15 +0000 (03:58 +0000)
There is a race between ptlrpc_resend_req() and ptlrpc_check_set():
Thread 1 runs ptlrpc_check_set()->after_reply().
Thread 2 runs ptlrpc_resend_req().
The result is a request with rq_resend = 1 and the MSG_REPLAY flag set.
When this request reaches the server, it causes the client to be evicted.
The patch skips the ptlrpc_resend_req() logic if rq_replied is set,
and clears the rq_resend flag in reply_in_callback() when the client
receives a reply.

Signed-off-by: Alexander Boyko <alexander_boyko@xyratex.com>
Xyratex-bug-id: MRP-1888
Change-Id: If47b9a1f559ab16e4b416332ada7c73a8758d46c
Reviewed-on: http://review.whamcloud.com/10471
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Mike Pershin <mike.pershin@intel.com>
Reviewed-by: Chris Horn <hornc@cray.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/niobuf.c

index 3f3e220..ea8a206 100644 (file)
@@ -2565,10 +2565,19 @@ EXPORT_SYMBOL(ptlrpc_cleanup_client);
 void ptlrpc_resend_req(struct ptlrpc_request *req)
 {
         DEBUG_REQ(D_HA, req, "going to resend");
+       spin_lock(&req->rq_lock);
+
+       /* Request got reply but linked to the import list still.
+          Let ptlrpc_check_set() to process it. */
+       if (ptlrpc_client_replied(req)) {
+               spin_unlock(&req->rq_lock);
+               DEBUG_REQ(D_HA, req, "it has reply, so skip it");
+               return;
+       }
+
         lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
         req->rq_status = -EAGAIN;
 
-       spin_lock(&req->rq_lock);
         req->rq_resend = 1;
         req->rq_net_err = 0;
         req->rq_timedout = 0;
index 024d22c..d7f9a11 100644 (file)
@@ -153,6 +153,8 @@ void reply_in_callback(lnet_event_t *ev)
                 /* Real reply */
                 req->rq_rep_swab_mask = 0;
                 req->rq_replied = 1;
+               /* Got reply, no resend required */
+               req->rq_resend = 0;
                 req->rq_reply_off = ev->offset;
                 req->rq_nob_received = ev->mlength;
                 /* LNetMDUnlink can't be called under the LNET_LOCK,
index e75599a..f779054 100644 (file)
@@ -690,6 +690,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
         /* If this is a re-transmit, we're required to have disengaged
          * cleanly from the previous attempt */
         LASSERT(!request->rq_receiving_reply);
+       LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) &&
+               (request->rq_import->imp_state == LUSTRE_IMP_FULL)));
 
        if (unlikely(obd != NULL && obd->obd_fail)) {
                CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",