Whamcloud - gitweb
LU-13575 lnet: Ensure round robin selection of peer NIs
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 957c8ba..6117ccb 100644 (file)
@@ -759,7 +759,7 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg)
        if (rc != 0) {
                CERROR("recv from %s / send to %s aborted: "
                       "eager_recv failed %d\n",
-                      libcfs_nid2str(msg->msg_rxpeer->lpni_nid),
+                      libcfs_nidstr(&msg->msg_rxpeer->lpni_nid),
                       libcfs_id2str(msg->msg_target), rc);
                LASSERT(rc < 0); /* required by my callers */
        }
@@ -843,8 +843,7 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
 
        /* can't get here if we're sending to the loopback interface */
        if (the_lnet.ln_loni)
-               LASSERT(lp->lpni_nid !=
-                       lnet_nid_to_nid4(&the_lnet.ln_loni->ni_nid));
+               LASSERT(!nid_same(&lp->lpni_nid, &the_lnet.ln_loni->ni_nid));
 
        /* NB 'lp' is always the next hop */
        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
@@ -1338,8 +1337,8 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid,
 
                if (best_lpni)
                        CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n",
-                               libcfs_nid2str(lpni->lpni_nid),
-                               libcfs_nid2str(best_lpni->lpni_nid),
+                               libcfs_nidstr(&lpni->lpni_nid),
+                               libcfs_nidstr(&best_lpni->lpni_nid),
                                lpni_healthv, best_lpni_healthv,
                                lpni_sel_prio, best_sel_prio,
                                lpni->lpni_txcredits, best_lpni_credits,
@@ -1408,7 +1407,7 @@ select_lpni:
        }
 
        CDEBUG(D_NET, "sd_best_lpni = %s\n",
-              libcfs_nid2str(best_lpni->lpni_nid));
+              libcfs_nidstr(&best_lpni->lpni_nid));
 
        return best_lpni;
 }
@@ -1512,7 +1511,7 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
        list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
                if (!lnet_is_route_alive(route))
                        continue;
-               gw_pnid = route->lr_gateway->lp_primary_nid;
+               gw_pnid = lnet_nid_to_nid4(&route->lr_gateway->lp_primary_nid);
 
                /* no protection on below fields, but it's harmless */
                if (last_route && (last_route->lr_seq - route->lr_seq < 0))
@@ -1838,17 +1837,17 @@ lnet_handle_send(struct lnet_send_data *sd)
         * local ni and local net so that we pick the next ones
         * in Round Robin.
         */
-       best_lpni->lpni_seq++;
        best_lpni->lpni_peer_net->lpn_seq++;
-       best_ni->ni_seq++;
+       best_lpni->lpni_seq = best_lpni->lpni_peer_net->lpn_seq;
        best_ni->ni_net->net_seq++;
+       best_ni->ni_seq = best_ni->ni_net->net_seq;
 
        CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n",
               libcfs_nidstr(&best_ni->ni_nid),
               best_ni->ni_seq, best_ni->ni_net->net_seq,
               atomic_read(&best_ni->ni_tx_credits),
               best_ni->ni_sel_priority,
-              libcfs_nid2str(best_lpni->lpni_nid),
+              libcfs_nidstr(&best_lpni->lpni_nid),
               best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq,
               best_lpni->lpni_txcredits,
               best_lpni->lpni_sel_priority);
@@ -1868,7 +1867,7 @@ lnet_handle_send(struct lnet_send_data *sd)
         * the configuration has changed. We don't have a hold on the best_ni
         * yet, and it may have vanished.
         */
-       cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni);
+       cpt2 = lnet_cpt_of_nid_locked(&best_lpni->lpni_nid, best_ni);
        if (sd->sd_cpt != cpt2) {
                __u32 seq = lnet_get_dlc_seq_locked();
                lnet_net_unlock(sd->sd_cpt);
@@ -1899,7 +1898,8 @@ lnet_handle_send(struct lnet_send_data *sd)
         * what was originally set in the target or it will be the NID of
         * a router if this message should be routed
         */
-       msg->msg_target.nid = msg->msg_txpeer->lpni_nid;
+       /* FIXME handle large-addr nids */
+       msg->msg_target.nid = lnet_nid_to_nid4(&msg->msg_txpeer->lpni_nid);
 
        /*
         * lnet_msg_commit assigns the correct cpt to the message, which
@@ -1930,13 +1930,16 @@ lnet_handle_send(struct lnet_send_data *sd)
                 * lnet_select_pathway() function and is never changed.
                 * It's safe to use it here.
                 */
-               msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid);
+               /* FIXME handle large-addr nid */
+               msg->msg_hdr.dest_nid =
+                       cpu_to_le64(lnet_nid_to_nid4(&final_dst_lpni->lpni_nid));
        } else {
                /*
                 * if we're not routing set the dest_nid to the best peer
                 * ni NID that we picked earlier in the algorithm.
                 */
-               msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
+               msg->msg_hdr.dest_nid =
+                       cpu_to_le64(lnet_nid_to_nid4(&msg->msg_txpeer->lpni_nid));
        }
 
        /*
@@ -1946,9 +1949,10 @@ lnet_handle_send(struct lnet_send_data *sd)
        if (msg->msg_md) {
                rspt = msg->msg_md->md_rspt_ptr;
                if (rspt) {
-                       rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid;
+                       rspt->rspt_next_hop_nid =
+                               msg->msg_txpeer->lpni_nid;
                        CDEBUG(D_NET, "rspt_next_hop_nid = %s\n",
-                              libcfs_nid2str(rspt->rspt_next_hop_nid));
+                              libcfs_nidstr(&rspt->rspt_next_hop_nid));
                }
        }
 
@@ -1961,7 +1965,7 @@ lnet_handle_send(struct lnet_send_data *sd)
                       libcfs_nid2str(sd->sd_src_nid),
                       libcfs_nid2str(msg->msg_hdr.dest_nid),
                       libcfs_nid2str(sd->sd_dst_nid),
-                      libcfs_nid2str(msg->msg_txpeer->lpni_nid),
+                      libcfs_nidstr(&msg->msg_txpeer->lpni_nid),
                       libcfs_nid2str(sd->sd_rtr_nid),
                       lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count);
 
@@ -1976,7 +1980,7 @@ lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni,
            !lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) {
                CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n",
                       libcfs_nidstr(&lni->ni_nid),
-                      libcfs_nid2str(lpni->lpni_nid));
+                      libcfs_nidstr(&lpni->lpni_nid));
                lnet_peer_ni_set_non_mr_pref_nid(
                        lpni, lnet_nid_to_nid4(&lni->ni_nid));
        }
@@ -2031,8 +2035,8 @@ lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd)
        }
 
        if (sd->sd_best_lpni &&
-           sd->sd_best_lpni->lpni_nid ==
-           lnet_nid_to_nid4(&the_lnet.ln_loni->ni_nid))
+           nid_same(&sd->sd_best_lpni->lpni_nid,
+                     &the_lnet.ln_loni->ni_nid))
                return lnet_handle_lo_send(sd);
        else if (sd->sd_best_lpni)
                return lnet_handle_send(sd);
@@ -2100,7 +2104,7 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg,
                return rc;
        }
 
-       new_lpni = lnet_find_peer_ni_locked(lpni->lpni_nid);
+       new_lpni = lnet_find_peer_ni_locked(lnet_nid_to_nid4(&lpni->lpni_nid));
        if (!new_lpni) {
                lnet_peer_ni_decref_locked(lpni);
                return -ENOENT;
@@ -2133,7 +2137,7 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg,
        lnet_peer_ni_decref_locked(new_lpni);
 
        CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n",
-              msg, libcfs_nid2str(peer->lp_primary_nid));
+              msg, libcfs_nidstr(&peer->lp_primary_nid));
 
        return LNET_DC_WAIT;
 }
@@ -2554,7 +2558,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd)
                /* If there is no best_ni we don't have a route */
                if (!best_ni) {
                        CERROR("no path to %s from net %s\n",
-                               libcfs_nid2str(best_lpni->lpni_nid),
+                               libcfs_nidstr(&best_lpni->lpni_nid),
                                libcfs_net2str(best_lpni->lpni_net->net_id));
                        return -EHOSTUNREACH;
                }
@@ -2680,8 +2684,8 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
                 * network
                 */
                if (sd->sd_best_lpni &&
-                   sd->sd_best_lpni->lpni_nid ==
-                   lnet_nid_to_nid4(&the_lnet.ln_loni->ni_nid)) {
+                   nid_same(&sd->sd_best_lpni->lpni_nid,
+                            &the_lnet.ln_loni->ni_nid)) {
                        /*
                         * in case we initially started with a routed
                         * destination, let's reset to local
@@ -3216,7 +3220,7 @@ lnet_finalize_expired_responses(void)
                        if (ktime_compare(now, rspt->rspt_deadline) >= 0 ||
                            the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) {
                                struct lnet_peer_ni *lpni;
-                               lnet_nid_t nid;
+                               struct lnet_nid nid;
 
                                md = lnet_handle2md(&rspt->rspt_mdh);
                                if (!md) {
@@ -3274,7 +3278,7 @@ lnet_finalize_expired_responses(void)
 
                                CDEBUG(D_NET,
                                       "Response timeout: md = %p: nid = %s\n",
-                                      md, libcfs_nid2str(nid));
+                                      md, libcfs_nidstr(&nid));
 
                                /*
                                 * If there is a timeout on the response
@@ -3282,7 +3286,7 @@ lnet_finalize_expired_responses(void)
                                 * value so that we don't use it
                                 */
                                lnet_net_lock(0);
-                               lpni = lnet_find_peer_ni_locked(nid);
+                               lpni = lnet_peer_ni_find_locked(&nid);
                                if (lpni) {
                                        lnet_handle_remote_failure_locked(lpni);
                                        lnet_peer_ni_decref_locked(lpni);
@@ -3722,7 +3726,7 @@ lnet_recover_peer_nis(void)
                        LIBCFS_ALLOC(ev_info, sizeof(*ev_info));
                        if (!ev_info) {
                                CERROR("out of memory. Can't recover %s\n",
-                                      libcfs_nid2str(lpni->lpni_nid));
+                                      libcfs_nidstr(&lpni->lpni_nid));
                                spin_lock(&lpni->lpni_lock);
                                lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
                                spin_unlock(&lpni->lpni_lock);
@@ -3732,7 +3736,8 @@ lnet_recover_peer_nis(void)
                        /* look at the comments in lnet_recover_local_nis() */
                        mdh = lpni->lpni_recovery_ping_mdh;
                        LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
-                       nid = lpni->lpni_nid;
+                       /* FIXME handle large-addr nid */
+                       nid = lnet_nid_to_nid4(&lpni->lpni_nid);
                        lnet_net_lock(0);
                        list_del_init(&lpni->lpni_recovery);
                        lnet_peer_ni_decref_locked(lpni);
@@ -4562,7 +4567,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
                        return -EPROTO;
                }
 
-               if (lnet_islocalnid(dest_nid)) {
+               if (lnet_islocalnid4(dest_nid)) {
                        /* dest is another local NI; sender should have used
                         * this node's NID on its own network */
                        CERROR("%s, src %s: Bad dest nid %s "
@@ -4654,10 +4659,10 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
                                      cpt);
        if (IS_ERR(lpni)) {
                lnet_net_unlock(cpt);
-               CERROR("%s, src %s: Dropping %s "
-                      "(error %ld looking up sender)\n",
+               rc = PTR_ERR(lpni);
+               CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n",
                       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
-                      lnet_msgtyp2str(type), PTR_ERR(lpni));
+                      lnet_msgtyp2str(type), rc);
                lnet_msg_free(msg);
                if (rc == -ESHUTDOWN)
                        /* We are shutting down.  Don't do anything more */
@@ -5099,7 +5104,8 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg)
               libcfs_nidstr(&ni->ni_nid), libcfs_id2str(peer_id), getmd);
 
        /* setup information for lnet_build_msg_event */
-       msg->msg_initiator = getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid;
+       msg->msg_initiator =
+               lnet_nid_to_nid4(&getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid);
        msg->msg_from = peer_id.nid;
        msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
        msg->msg_hdr.src_nid = peer_id.nid;