LU-8193 ptlrpc: set proper mbits for EINPROGRESS resend
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index d665545..ecb2384 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -128,19 +128,21 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw,
                (ptlrpc_is_bulk_desc_kvec(type) &&
                 ops->add_iov_frag != NULL));
 
+       OBD_ALLOC_PTR(desc);
+       if (desc == NULL)
+               return NULL;
        if (type & PTLRPC_BULK_BUF_KIOV) {
-               OBD_ALLOC(desc,
-                         offsetof(struct ptlrpc_bulk_desc,
-                                  bd_u.bd_kiov.bd_vec[nfrags]));
+               OBD_ALLOC_LARGE(GET_KIOV(desc),
+                               nfrags * sizeof(*GET_KIOV(desc)));
+               if (GET_KIOV(desc) == NULL)
+                       goto out;
        } else {
-               OBD_ALLOC(desc,
-                         offsetof(struct ptlrpc_bulk_desc,
-                                  bd_u.bd_kvec.bd_kvec[nfrags]));
+               OBD_ALLOC_LARGE(GET_KVEC(desc),
+                               nfrags * sizeof(*GET_KVEC(desc)));
+               if (GET_KVEC(desc) == NULL)
+                       goto out;
        }
 
-       if (!desc)
-               return NULL;
-
        spin_lock_init(&desc->bd_lock);
        init_waitqueue_head(&desc->bd_waitq);
        desc->bd_max_iov = nfrags;
@@ -157,6 +159,9 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw,
                LNetInvalidateHandle(&desc->bd_mds[i]);
 
        return desc;
+out:
+       OBD_FREE_PTR(desc);
+       return NULL;
 }
 
 /**
@@ -271,13 +276,12 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
                desc->bd_frag_ops->release_frags(desc);
 
        if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
-               OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
-                                       bd_u.bd_kiov.bd_vec[desc->bd_max_iov]));
+               OBD_FREE_LARGE(GET_KIOV(desc),
+                       desc->bd_max_iov * sizeof(*GET_KIOV(desc)));
        else
-               OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
-                                       bd_u.bd_kvec.bd_kvec[desc->
-                                               bd_max_iov]));
-
+               OBD_FREE_LARGE(GET_KVEC(desc),
+                       desc->bd_max_iov * sizeof(*GET_KVEC(desc)));
+       OBD_FREE_PTR(desc);
        EXIT;
 }
 EXPORT_SYMBOL(ptlrpc_free_bulk);
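
The two hunks above switch from one offsetof()-sized allocation to a two-step
scheme. A minimal sketch of that pattern, using plain calloc/free and a local
frag type in place of the OBD_ALLOC_PTR/OBD_ALLOC_LARGE helpers and the real
bd_vec/bd_kvec unions (those substitutions are assumptions for illustration):

#include <stddef.h>
#include <stdlib.h>

struct frag {                   /* stand-in for a bd_vec/bd_kvec entry */
	void   *base;
	size_t  len;
};

struct bulk_desc {              /* stand-in for struct ptlrpc_bulk_desc */
	unsigned int  max_iov;
	struct frag  *frags;    /* separate allocation, nfrags entries */
};

static struct bulk_desc *bulk_desc_new(unsigned int nfrags)
{
	struct bulk_desc *desc = calloc(1, sizeof(*desc));

	if (desc == NULL)
		return NULL;

	/* The variable-size part gets its own allocation; in the real
	 * code OBD_ALLOC_LARGE may fall back to vmalloc for big sizes. */
	desc->frags = calloc(nfrags, sizeof(*desc->frags));
	if (desc->frags == NULL)
		goto out;

	desc->max_iov = nfrags;
	return desc;
out:
	free(desc);
	return NULL;
}

static void bulk_desc_free(struct bulk_desc *desc)
{
	free(desc->frags);      /* mirror order of ptlrpc_free_bulk() */
	free(desc);
}

The likely benefit of the split: the old single OBD_ALLOC of
offsetof(..., bd_vec[nfrags]) had to be one physically contiguous chunk,
while a separate OBD_ALLOC_LARGE array can be vmalloc-backed when nfrags
is large.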
@@ -553,10 +557,10 @@ int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
                if (!msg) {
                        ptlrpc_request_cache_free(req);
                        return i;
-                }
-                req->rq_reqbuf = msg;
-                req->rq_reqbuf_len = size;
-                req->rq_pool = pool;
+               }
+               req->rq_reqbuf = msg;
+               req->rq_reqbuf_len = size;
+               req->rq_pool = pool;
                spin_lock(&pool->prp_lock);
                list_add_tail(&req->rq_list, &pool->prp_req_list);
        }
@@ -1048,6 +1052,9 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
 {
        LASSERT(list_empty(&req->rq_set_chain));
 
+       if (req->rq_allow_intr)
+               set->set_allow_intr = 1;
+
        /* The set takes over the caller's request reference */
        list_add_tail(&req->rq_set_chain, &set->set_requests);
        req->rq_set = set;
@@ -1651,8 +1658,14 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                                   rq_set_chain);
                struct obd_import *imp = req->rq_import;
                int unregistered = 0;
+               int async = 1;
                int rc = 0;
 
+               if (req->rq_phase == RQ_PHASE_COMPLETE) {
+                       list_move_tail(&req->rq_set_chain, &comp_reqs);
+                       continue;
+               }
+
                /* This schedule point is mainly for the ptlrpcd caller of this
                 * function.  Most ptlrpc sets are not long-lived and unbounded
                 * in length, but at the least the set used by the ptlrpcd is.
@@ -1669,16 +1682,18 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                        req->rq_status = -EINTR;
                        ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
 
+                       /* The request is being interpreted, and we must
+                        * wait for the reply to be unlinked first, so
+                        * use sync mode. */
+                       async = 0;
+
                        GOTO(interpret, req->rq_status);
                }
 
-                if (req->rq_phase == RQ_PHASE_NEW &&
-                    ptlrpc_send_new_req(req)) {
-                        force_timer_recalc = 1;
-                }
+               if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req))
+                       force_timer_recalc = 1;
 
-                /* delayed send - skip */
-                if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
+               /* delayed send - skip */
+               if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
                        continue;
 
                /* delayed resend - skip */
@@ -1686,11 +1701,10 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                    req->rq_sent > cfs_time_current_sec())
                        continue;
 
-                if (!(req->rq_phase == RQ_PHASE_RPC ||
-                      req->rq_phase == RQ_PHASE_BULK ||
-                      req->rq_phase == RQ_PHASE_INTERPRET ||
-                      req->rq_phase == RQ_PHASE_UNREGISTERING ||
-                      req->rq_phase == RQ_PHASE_COMPLETE)) {
+               if (!(req->rq_phase == RQ_PHASE_RPC ||
+                     req->rq_phase == RQ_PHASE_BULK ||
+                     req->rq_phase == RQ_PHASE_INTERPRET ||
+                     req->rq_phase == RQ_PHASE_UNREGISTERING)) {
                         DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
                         LBUG();
                 }
@@ -1730,11 +1744,6 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                         ptlrpc_rqphase_move(req, req->rq_next_phase);
                 }
 
-                if (req->rq_phase == RQ_PHASE_COMPLETE) {
-                       list_move_tail(&req->rq_set_chain, &comp_reqs);
-                        continue;
-               }
-
                 if (req->rq_phase == RQ_PHASE_INTERPRET)
                         GOTO(interpret, req->rq_status);
 
@@ -1951,27 +1960,27 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                        req->rq_status = -EIO;
                }
 
-                ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
 
-        interpret:
-                LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
+       interpret:
+               LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
 
-                /* This moves to "unregistering" phase we need to wait for
-                 * reply unlink. */
-                if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
-                        /* start async bulk unlink too */
-                        ptlrpc_unregister_bulk(req, 1);
-                        continue;
-                }
+               /* This moves the request to the "unregistering" phase,
+                * where we need to wait for the reply to unlink. */
+               if (!unregistered && !ptlrpc_unregister_reply(req, async)) {
+                       /* start async bulk unlink too */
+                       ptlrpc_unregister_bulk(req, 1);
+                       continue;
+               }
 
-                if (!ptlrpc_unregister_bulk(req, 1))
-                        continue;
+               if (!ptlrpc_unregister_bulk(req, async))
+                       continue;
 
-                /* When calling interpret receiving already should be
-                 * finished. */
-                LASSERT(!req->rq_receiving_reply);
+               /* By the time interpret is called, receiving should
+                * already be finished. */
+               LASSERT(!req->rq_receiving_reply);
 
-                ptlrpc_req_interpret(env, req, req->rq_status);
+               ptlrpc_req_interpret(env, req, req->rq_status);
 
                if (ptlrpcd_check_work(req)) {
                        atomic_dec(&set->set_remaining);
@@ -2057,8 +2066,8 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
                       "timed out for sent delay" : "timed out for slow reply"),
                   req->rq_sent, req->rq_real_sent);
 
-        if (imp != NULL && obd_debug_peer_on_timeout)
-                LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+       if (imp != NULL && obd_debug_peer_on_timeout)
+               LNetDebugPeer(imp->imp_connection->c_peer);
 
         ptlrpc_unregister_reply(req, async_unlink);
         ptlrpc_unregister_bulk(req, async_unlink);
@@ -2180,6 +2189,9 @@ static void ptlrpc_interrupted_set(void *data)
                struct ptlrpc_request *req =
                        list_entry(tmp, struct ptlrpc_request, rq_set_chain);
 
+               if (req->rq_intr)
+                       continue;
+
                if (req->rq_phase != RQ_PHASE_RPC &&
                    req->rq_phase != RQ_PHASE_UNREGISTERING &&
                    !req->rq_allow_intr)
@@ -2274,17 +2286,12 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set)
                 CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
                        set, timeout);
 
-               if (timeout == 0 && !signal_pending(current))
-                        /*
-                         * No requests are in-flight (ether timed out
-                         * or delayed), so we can allow interrupts.
-                         * We still want to block for a limited time,
-                         * so we allow interrupts during the timeout.
-                         */
-                       lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
-                                                   ptlrpc_expired_set,
-                                                   ptlrpc_interrupted_set, set);
-               else if (set->set_allow_intr)
+               if ((timeout == 0 && !signal_pending(current)) ||
+                   set->set_allow_intr)
+                       /* Either no requests are in-flight (all timed
+                        * out or delayed) or the set explicitly allows
+                        * interruption, so we can allow interrupts.
+                        * We still want to block for a limited time,
+                        * so we allow interrupts during the timeout. */
                        lwi = LWI_TIMEOUT_INTR_ALL(
                                        cfs_time_seconds(timeout ? timeout : 1),
                                        ptlrpc_expired_set,
@@ -2834,9 +2841,6 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                RETURN(-ENOMEM);
        }
 
-       if (req->rq_allow_intr)
-               set->set_allow_intr = 1;
-
        /* for distributed debugging */
        lustre_msg_set_status(req->rq_reqmsg, current_pid());
 
@@ -2927,6 +2931,49 @@ static int ptlrpc_replay_interpret(const struct lu_env *env,
                 DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
                           lustre_msg_get_status(req->rq_repmsg),
                           aa->praa_old_status);
+
+               /* Note: if the replay fails for MDT-MDT recovery,
+                * abort all of the following requests in the replay
+                * and sending lists, because MDT-MDT update requests
+                * depend on each other; see LU-7039. */
+               if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) {
+                       struct ptlrpc_request *free_req;
+                       struct ptlrpc_request *tmp;
+
+                       spin_lock(&imp->imp_lock);
+                       list_for_each_entry_safe(free_req, tmp,
+                                                &imp->imp_replay_list,
+                                                rq_replay_list) {
+                               ptlrpc_free_request(free_req);
+                       }
+
+                       list_for_each_entry_safe(free_req, tmp,
+                                                &imp->imp_committed_list,
+                                                rq_replay_list) {
+                               ptlrpc_free_request(free_req);
+                       }
+
+                       list_for_each_entry_safe(free_req, tmp,
+                                               &imp->imp_delayed_list,
+                                               rq_list) {
+                               spin_lock(&free_req->rq_lock);
+                               free_req->rq_err = 1;
+                               free_req->rq_status = -EIO;
+                               ptlrpc_client_wake_req(free_req);
+                               spin_unlock(&free_req->rq_lock);
+                       }
+
+                       list_for_each_entry_safe(free_req, tmp,
+                                               &imp->imp_sending_list,
+                                               rq_list) {
+                               spin_lock(&free_req->rq_lock);
+                               free_req->rq_err = 1;
+                               free_req->rq_status = -EIO;
+                               ptlrpc_client_wake_req(free_req);
+                               spin_unlock(&free_req->rq_lock);
+                       }
+                       spin_unlock(&imp->imp_lock);
+               }
         } else {
                 /* Put it back for re-replay. */
                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
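
The abort logic above relies on the kernel's list_for_each_entry_safe idiom:
the _safe variant caches a lookahead pointer so the entry under the cursor
can be freed or unlinked without breaking the walk. A minimal model on a
hand-rolled singly linked list (the struct and function names here are
illustrative, not the Lustre or kernel API):

#include <errno.h>
#include <stdlib.h>

struct req {
	struct req *next;
	int         err;        /* stand-in for rq_err/rq_status */
};

/* Replay/committed lists: every entry is freed, so the walk must
 * cache the next pointer before the current node goes away. */
static void drain_list(struct req **head)
{
	struct req *cur = *head;

	while (cur != NULL) {
		struct req *next = cur->next;   /* grab before free */

		free(cur);
		cur = next;
	}
	*head = NULL;
}

/* Delayed/sending lists: entries are failed in place with -EIO and,
 * in the real code, their waiters are woken rather than freed. */
static void fail_list(struct req *head)
{
	struct req *cur;

	for (cur = head; cur != NULL; cur = cur->next)
		cur->err = -EIO;
}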
@@ -3160,11 +3207,15 @@ void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req)
                __u64   old_mbits = req->rq_mbits;
 
                if ((bd->bd_import->imp_connect_data.ocd_connect_flags &
-                   OBD_CONNECT_BULK_MBITS) != 0)
+                   OBD_CONNECT_BULK_MBITS) != 0) {
                        req->rq_mbits = ptlrpc_next_xid();
-               else /* old version transfers rq_xid to peer as matchbits */
-                       req->rq_mbits = req->rq_xid = ptlrpc_next_xid();
-
+               } else { /* old version transfers rq_xid to peer as matchbits */
+                       spin_lock(&req->rq_import->imp_lock);
+                       list_del_init(&req->rq_unreplied_list);
+                       ptlrpc_assign_next_xid_nolock(req);
+                       req->rq_mbits = req->rq_xid;
+                       spin_unlock(&req->rq_import->imp_lock);
+               }
                CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
                       old_mbits, req->rq_mbits);
        }
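
This last hunk is the LU-8193 fix proper. On an EINPROGRESS resend, a peer
that negotiated OBD_CONNECT_BULK_MBITS only needs fresh match bits, but an
old peer derives the match bits from the XID, so the XID must advance too;
and because the unreplied list is sorted by XID, the request has to be
re-queued under imp_lock. A hedged sketch of that control flow with
simplified types (the mutex and counter here model imp_lock and
ptlrpc_next_xid, not the real API):

#include <pthread.h>
#include <stdint.h>

struct import {                 /* stand-in for struct obd_import */
	pthread_mutex_t lock;   /* models imp_lock */
	uint64_t        next_xid;
};

struct request {                /* stand-in for struct ptlrpc_request */
	struct import *imp;
	uint64_t       xid;     /* rq_xid */
	uint64_t       mbits;   /* rq_mbits */
};

static void rebind_bulk_mbits(struct request *req, int peer_has_bulk_mbits)
{
	struct import *imp = req->imp;

	pthread_mutex_lock(&imp->lock);
	if (peer_has_bulk_mbits) {
		/* New peers match bulk on rq_mbits alone, so a fresh
		 * value suffices and rq_xid is left untouched. */
		req->mbits = imp->next_xid++;
	} else {
		/* Old peers derive the match bits from rq_xid, so the
		 * request needs a fresh XID as well; the real code also
		 * re-inserts it on the XID-sorted unreplied list while
		 * holding imp_lock. */
		req->xid = imp->next_xid++;
		req->mbits = req->xid;
	}
	pthread_mutex_unlock(&imp->lock);
}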