Whamcloud - gitweb
LU-11514 lnet: separate ni state from recovery
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index a89fa41..e217e95 100644 (file)
@@ -865,7 +865,8 @@ lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now)
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the lnet_net_lock */
 static int
-lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
+lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp,
+                      struct lnet_msg *msg)
 {
        time64_t now = ktime_get_seconds();
 
@@ -876,6 +877,13 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
                return 1;
 
        /*
+        * If we're resending a message, let's attempt to send it even if
+        * the peer is down to fulfill our resend quota on the message
+        */
+       if (msg->msg_retry_count > 0)
+               return 1;
+
+       /*
         * Peer appears dead, but we should avoid frequent NI queries (at
         * most once per lnet_queryinterval seconds).
         */
@@ -933,9 +941,10 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
 
        /* NB 'lp' is always the next hop */
        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
-           lnet_peer_alive_locked(ni, lp) == 0) {
-               the_lnet.ln_counters[cpt]->drop_count++;
-               the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+           lnet_peer_alive_locked(ni, lp, msg) == 0) {
+               the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++;
+               the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length +=
+                       msg->msg_len;
                lnet_net_unlock(cpt);
                if (msg->msg_txpeer)
                        lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
@@ -948,10 +957,9 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
 
                CNETERR("Dropping message for %s: peer not alive\n",
                        libcfs_id2str(msg->msg_target));
-               if (do_send) {
-                       msg->msg_health_status = LNET_MSG_STATUS_LOCAL_DROPPED;
+               msg->msg_health_status = LNET_MSG_STATUS_LOCAL_DROPPED;
+               if (do_send)
                        lnet_finalize(msg, -EHOSTUNREACH);
-               }
 
                lnet_net_lock(cpt);
                return -EHOSTUNREACH;
@@ -1601,6 +1609,7 @@ lnet_handle_send(struct lnet_send_data *sd)
        __u32 send_case = sd->sd_send_case;
        int rc;
        __u32 routing = send_case & REMOTE_DST;
+        struct lnet_rsp_tracker *rspt;
 
        /*
         * Increment sequence number of the selected peer so that we
@@ -1693,17 +1702,30 @@ lnet_handle_send(struct lnet_send_data *sd)
                msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
        }
 
+       /*
+        * If we have a response tracker block, update it with the
+        * next-hop NID.
+        */
+       if (msg->msg_md) {
+               rspt = msg->msg_md->md_rspt_ptr;
+               if (rspt) {
+                       rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid;
+                       CDEBUG(D_NET, "rspt_next_hop_nid = %s\n",
+                              libcfs_nid2str(rspt->rspt_next_hop_nid));
+               }
+       }
+
        rc = lnet_post_send_locked(msg, 0);
 
        if (!rc)
-               CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n",
+               CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n",
                       libcfs_nid2str(msg->msg_hdr.src_nid),
                       libcfs_nid2str(msg->msg_txni->ni_nid),
                       libcfs_nid2str(sd->sd_src_nid),
                       libcfs_nid2str(msg->msg_hdr.dest_nid),
                       libcfs_nid2str(sd->sd_dst_nid),
                       libcfs_nid2str(msg->msg_txpeer->lpni_nid),
-                      lnet_msgtyp2str(msg->msg_type));
+                      lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count);
 
        return rc;
 }
@@ -2725,6 +2747,9 @@ lnet_finalize_expired_responses(bool force)
 
                        if (ktime_compare(ktime_get(), rspt->rspt_deadline) >= 0 ||
                            force) {
+                               struct lnet_peer_ni *lpni;
+                               lnet_nid_t nid;
+
                                md = lnet_handle2md(&rspt->rspt_mdh);
                                if (!md) {
                                        LNetInvalidateMDHandle(&rspt->rspt_mdh);
@@ -2738,14 +2763,30 @@ lnet_finalize_expired_responses(bool force)
                                lnet_res_unlock(i);
 
                                lnet_net_lock(i);
-                               the_lnet.ln_counters[i]->response_timeout_count++;
+                               the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++;
                                lnet_net_unlock(i);
 
                                list_del_init(&rspt->rspt_on_list);
 
-                               CDEBUG(D_NET, "Response timed out: md = %p\n", md);
+                               nid = rspt->rspt_next_hop_nid;
+
+                               CNETERR("Response timed out: md = %p: nid = %s\n",
+                                       md, libcfs_nid2str(nid));
                                LNetMDUnlink(rspt->rspt_mdh);
                                lnet_rspt_free(rspt, i);
+
+                               /*
+                                * If the response from the next hop timed
+                                * out, decrement its health value so that
+                                * we don't use it.
+                                */
+                               lnet_net_lock(0);
+                               lpni = lnet_find_peer_ni_locked(nid);
+                               if (lpni) {
+                                       lnet_handle_remote_failure_locked(lpni);
+                                       lnet_peer_ni_decref_locked(lpni);
+                               }
+                               lnet_net_unlock(0);
                        } else {
                                lnet_res_unlock(i);
                                break;
@@ -2808,11 +2849,12 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt)
                        lnet_peer_ni_decref_locked(lpni);
 
                        lnet_net_unlock(cpt);
-                       CDEBUG(D_NET, "resending %s->%s: %s recovery %d\n",
+                       CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n",
                               libcfs_nid2str(src_nid),
                               libcfs_id2str(msg->msg_target),
                               lnet_msgtyp2str(msg->msg_type),
-                              msg->msg_recovery);
+                              msg->msg_recovery,
+                              msg->msg_retry_count);
                        rc = lnet_send(src_nid, msg, LNET_NID_ANY);
                        if (rc) {
                                CERROR("Error sending %s to %s: %d\n",
@@ -2823,7 +2865,7 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt)
                        }
                        lnet_net_lock(cpt);
                        if (!rc)
-                               the_lnet.ln_counters[cpt]->resend_count++;
+                               the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++;
                }
        }
 }
@@ -2842,13 +2884,14 @@ lnet_resend_pending_msgs(void)
 
 /* called with cpt and ni_lock held */
 static void
-lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt)
+lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force)
 {
        struct lnet_handle_md recovery_mdh;
 
        LNetInvalidateMDHandle(&recovery_mdh);
 
-       if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING) {
+       if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING ||
+           force) {
                recovery_mdh = ni->ni_ping_mdh;
                LNetInvalidateMDHandle(&ni->ni_ping_mdh);
        }
@@ -2901,15 +2944,26 @@ lnet_recover_local_nis(void)
 
                lnet_net_lock(0);
                lnet_ni_lock(ni);
-               if (!(ni->ni_state & LNET_NI_STATE_ACTIVE) ||
+               if (ni->ni_state != LNET_NI_STATE_ACTIVE ||
                    healthv == LNET_MAX_HEALTH_VALUE) {
                        list_del_init(&ni->ni_recovery);
-                       lnet_unlink_ni_recovery_mdh_locked(ni, 0);
+                       lnet_unlink_ni_recovery_mdh_locked(ni, 0, false);
                        lnet_ni_unlock(ni);
                        lnet_ni_decref_locked(ni, 0);
                        lnet_net_unlock(0);
                        continue;
                }
+
+               /*
+                * If the local NI failed recovery we must unlink the MD,
+                * but keep the local NI on the recovery queue so we can
+                * continue the attempts to recover it.
+                */
+               if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) {
+                       lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
+                       ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED;
+               }
+
                lnet_ni_unlock(ni);
                lnet_net_unlock(0);
 
@@ -2918,8 +2972,8 @@ lnet_recover_local_nis(void)
                       libcfs_nid2str(ni->ni_nid));
 
                lnet_ni_lock(ni);
-               if (!(ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING)) {
-                       ni->ni_state |= LNET_NI_STATE_RECOVERY_PENDING;
+               if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) {
+                       ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING;
                        lnet_ni_unlock(ni);
 
                        LIBCFS_ALLOC(ev_info, sizeof(*ev_info));
@@ -2927,7 +2981,8 @@ lnet_recover_local_nis(void)
                                CERROR("out of memory. Can't recover %s\n",
                                       libcfs_nid2str(ni->ni_nid));
                                lnet_ni_lock(ni);
-                               ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+                               ni->ni_recovery_state &=
+                                 ~LNET_NI_RECOVERY_PENDING;
                                lnet_ni_unlock(ni);
                                continue;
                        }
@@ -2999,7 +3054,7 @@ lnet_recover_local_nis(void)
 
                        lnet_ni_lock(ni);
                        if (rc)
-                               ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+                               ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
                }
                lnet_ni_unlock(ni);
        }
@@ -3063,7 +3118,7 @@ lnet_clean_local_ni_recoveryq(void)
                                struct lnet_ni, ni_recovery);
                list_del_init(&ni->ni_recovery);
                lnet_ni_lock(ni);
-               lnet_unlink_ni_recovery_mdh_locked(ni, 0);
+               lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
                lnet_ni_unlock(ni);
                lnet_ni_decref_locked(ni, 0);
        }
@@ -3072,13 +3127,14 @@ lnet_clean_local_ni_recoveryq(void)
 }
 
 static void
-lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt)
+lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt,
+                                    bool force)
 {
        struct lnet_handle_md recovery_mdh;
 
        LNetInvalidateMDHandle(&recovery_mdh);
 
-       if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) {
+       if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) {
                recovery_mdh = lpni->lpni_recovery_ping_mdh;
                LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
        }
@@ -3101,7 +3157,7 @@ lnet_clean_peer_ni_recoveryq(void)
                                 lpni_recovery) {
                list_del_init(&lpni->lpni_recovery);
                spin_lock(&lpni->lpni_lock);
-               lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX);
+               lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true);
                spin_unlock(&lpni->lpni_lock);
                lnet_peer_ni_decref_locked(lpni);
        }
@@ -3169,12 +3225,23 @@ lnet_recover_peer_nis(void)
                if (lpni->lpni_state & LNET_PEER_NI_DELETING ||
                    healthv == LNET_MAX_HEALTH_VALUE) {
                        list_del_init(&lpni->lpni_recovery);
-                       lnet_unlink_lpni_recovery_mdh_locked(lpni, 0);
+                       lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false);
                        spin_unlock(&lpni->lpni_lock);
                        lnet_peer_ni_decref_locked(lpni);
                        lnet_net_unlock(0);
                        continue;
                }
+
+               /*
+                * If the peer NI has failed recovery we must unlink the
+                * MD, but keep the peer NI on the recovery queue so we
+                * can continue trying to recover it.
+                */
+               if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) {
+                       lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true);
+                       lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED;
+               }
+
                spin_unlock(&lpni->lpni_lock);
                lnet_net_unlock(0);
 
@@ -3259,7 +3326,10 @@ lnet_recover_peer_nis(void)
 static int
 lnet_monitor_thread(void *arg)
 {
-       int wakeup_counter = 0;
+       time64_t recovery_timeout = 0;
+       time64_t rsp_timeout = 0;
+       int interval;
+       time64_t now;
 
        /*
         * The monitor thread takes care of the following:
@@ -3274,20 +3344,23 @@ lnet_monitor_thread(void *arg)
        cfs_block_allsigs();
 
        while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
+               now = ktime_get_real_seconds();
+
                if (lnet_router_checker_active())
                        lnet_check_routers();
 
                lnet_resend_pending_msgs();
 
-               wakeup_counter++;
-               if (wakeup_counter >= lnet_transaction_timeout / 2) {
+               if (now >= rsp_timeout) {
                        lnet_finalize_expired_responses(false);
-                       wakeup_counter = 0;
+                       rsp_timeout = now + (lnet_transaction_timeout / 2);
                }
 
-               lnet_recover_local_nis();
-
-               lnet_recover_peer_nis();
+               if (now >= recovery_timeout) {
+                       lnet_recover_local_nis();
+                       lnet_recover_peer_nis();
+                       recovery_timeout = now + lnet_recovery_interval;
+               }
 
                /*
                 * TODO do we need to check if we should sleep without
@@ -3298,9 +3371,11 @@ lnet_monitor_thread(void *arg)
                 * cases where we get a complaint that an idle thread
                 * is waking up unnecessarily.
                 */
+               interval = min(lnet_recovery_interval,
+                              lnet_transaction_timeout / 2);
                wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
                                                false,
-                                               cfs_time_seconds(1));
+                                               cfs_time_seconds(interval));
        }
 
        /* clean up the router checker */
@@ -3394,12 +3469,15 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                        return;
                }
                lnet_ni_lock(ni);
-               ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+               ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
+               if (status)
+                       ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
                lnet_ni_unlock(ni);
                lnet_net_unlock(0);
 
                if (status != 0) {
-                       CERROR("local NI recovery failed with %d\n", status);
+                       CERROR("local NI (%s) recovery failed with %d\n",
+                              libcfs_nid2str(nid), status);
                        return;
                }
                /*
@@ -3422,12 +3500,15 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                }
                spin_lock(&lpni->lpni_lock);
                lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+               if (status)
+                       lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
                spin_unlock(&lpni->lpni_lock);
                lnet_peer_ni_decref_locked(lpni);
                lnet_net_unlock(cpt);
 
                if (status != 0)
-                       CERROR("peer NI recovery failed with %d\n", status);
+                       CERROR("peer NI (%s) recovery failed with %d\n",
+                              libcfs_nid2str(nid), status);
        }
 }
 
@@ -3457,6 +3538,7 @@ lnet_mt_event_handler(struct lnet_event *event)
                               libcfs_nid2str(ev_info->mt_nid),
                               (event->status) ? "unsuccessfully" :
                               "successfully", event->status);
+               lnet_handle_recovery_reply(ev_info, event->status);
                break;
        default:
                CERROR("Unexpected event: %d\n", event->type);
@@ -3591,8 +3673,8 @@ lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob,
 {
        lnet_net_lock(cpt);
        lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP);
-       the_lnet.ln_counters[cpt]->drop_count++;
-       the_lnet.ln_counters[cpt]->drop_length += nob;
+       the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++;
+       the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob;
        lnet_net_unlock(cpt);
 
        lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
@@ -4582,8 +4664,9 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg)
 
        lnet_net_lock(cpt);
        lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP);
-       the_lnet.ln_counters[cpt]->drop_count++;
-       the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+       the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++;
+       the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length +=
+               getmd->md_length;
        lnet_net_unlock(cpt);
 
        if (msg != NULL)