Whamcloud - gitweb
LU-15860 socklnd: Duplicate ksock_conn_cb
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd.c
index ec7e069..f3fff31 100644 (file)
@@ -152,14 +152,14 @@ ksocknal_destroy_conn_cb(struct ksock_conn_cb *conn_cb)
 }
 
 static struct ksock_peer_ni *
-ksocknal_create_peer(struct lnet_ni *ni, struct lnet_process_id id)
+ksocknal_create_peer(struct lnet_ni *ni, struct lnet_processid *id)
 {
-       int cpt = lnet_cpt_of_nid(id.nid, ni);
+       int cpt = lnet_nid2cpt(&id->nid, ni);
        struct ksock_net *net = ni->ni_data;
        struct ksock_peer_ni *peer_ni;
 
-       LASSERT(id.nid != LNET_NID_ANY);
-       LASSERT(id.pid != LNET_PID_ANY);
+       LASSERT(!LNET_NID_IS_ANY(&id->nid));
+       LASSERT(id->pid != LNET_PID_ANY);
        LASSERT(!in_interrupt());
 
        if (!atomic_inc_unless_negative(&net->ksnn_npeers)) {
@@ -174,7 +174,7 @@ ksocknal_create_peer(struct lnet_ni *ni, struct lnet_process_id id)
        }
 
        peer_ni->ksnp_ni = ni;
-       peer_ni->ksnp_id = id;
+       peer_ni->ksnp_id = *id;
        refcount_set(&peer_ni->ksnp_refcount, 1); /* 1 ref for caller */
        peer_ni->ksnp_closing = 0;
        peer_ni->ksnp_accepting = 0;
@@ -197,7 +197,7 @@ ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni)
        struct ksock_net *net = peer_ni->ksnp_ni->ni_data;
 
        CDEBUG (D_NET, "peer_ni %s %p deleted\n",
-               libcfs_id2str(peer_ni->ksnp_id), peer_ni);
+               libcfs_idstr(&peer_ni->ksnp_id), peer_ni);
 
        LASSERT(refcount_read(&peer_ni->ksnp_refcount) == 0);
        LASSERT(peer_ni->ksnp_accepting == 0);
@@ -218,23 +218,24 @@ ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni)
 }
 
 struct ksock_peer_ni *
-ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id)
+ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_processid *id)
 {
        struct ksock_peer_ni *peer_ni;
+       unsigned long hash = nidhash(&id->nid);
 
        hash_for_each_possible(ksocknal_data.ksnd_peers, peer_ni,
-                              ksnp_list, id.nid) {
+                              ksnp_list, hash) {
                LASSERT(!peer_ni->ksnp_closing);
 
                if (peer_ni->ksnp_ni != ni)
                        continue;
 
-               if (peer_ni->ksnp_id.nid != id.nid ||
-                   peer_ni->ksnp_id.pid != id.pid)
+               if (!nid_same(&peer_ni->ksnp_id.nid, &id->nid) ||
+                   peer_ni->ksnp_id.pid != id->pid)
                        continue;
 
                CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d)\n",
-                      peer_ni, libcfs_id2str(id),
+                      peer_ni, libcfs_idstr(id),
                       refcount_read(&peer_ni->ksnp_refcount));
                return peer_ni;
        }
@@ -242,7 +243,7 @@ ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id)
 }
 
 struct ksock_peer_ni *
-ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id)
+ksocknal_find_peer(struct lnet_ni *ni, struct lnet_processid *id)
 {
        struct ksock_peer_ni *peer_ni;
 
@@ -252,7 +253,7 @@ ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id)
                ksocknal_peer_addref(peer_ni);
        read_unlock(&ksocknal_data.ksnd_global_lock);
 
-        return (peer_ni);
+       return peer_ni;
 }
 
 static void
@@ -290,7 +291,7 @@ ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni)
 
 static int
 ksocknal_get_peer_info(struct lnet_ni *ni, int index,
-                      struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip,
+                      struct lnet_processid *id, __u32 *myip, __u32 *peer_ip,
                       int *port, int *conn_count, int *share_count)
 {
        struct ksock_peer_ni *peer_ni;
@@ -418,7 +419,9 @@ ksocknal_incr_conn_count(struct ksock_conn_cb *conn_cb,
        switch (type) {
        case SOCKLND_CONN_CONTROL:
                conn_cb->ksnr_ctrl_conn_count++;
-               /* there's a single control connection per peer */
+               /* there's a single control connection per peer,
+                * two in case of loopback
+                */
                conn_cb->ksnr_connected |= BIT(type);
                break;
        case SOCKLND_CONN_BULK_IN:
@@ -444,6 +447,46 @@ ksocknal_incr_conn_count(struct ksock_conn_cb *conn_cb,
               type, conn_cb->ksnr_connected, conn_cb->ksnr_max_conns);
 }
 
+/* Drop one conn of @type from conn_cb's counters; update ksnr_connected */
+static void
+ksocknal_decr_conn_count(struct ksock_conn_cb *conn_cb,
+                        int type)
+{
+       conn_cb->ksnr_conn_count--;
+
+       /* clear connected bit if too few conns of this type remain */
+       switch (type) {
+       case SOCKLND_CONN_CONTROL:
+               conn_cb->ksnr_ctrl_conn_count--;
+               /* there's a single control connection per peer,
+                * two in case of loopback
+                */
+               if (conn_cb->ksnr_ctrl_conn_count == 0)
+                       conn_cb->ksnr_connected &= ~BIT(type);
+               break;
+       case SOCKLND_CONN_BULK_IN:
+               conn_cb->ksnr_blki_conn_count--;
+               if (conn_cb->ksnr_blki_conn_count < conn_cb->ksnr_max_conns)
+                       conn_cb->ksnr_connected &= ~BIT(type);
+               break;
+       case SOCKLND_CONN_BULK_OUT:
+               conn_cb->ksnr_blko_conn_count--;
+               if (conn_cb->ksnr_blko_conn_count < conn_cb->ksnr_max_conns)
+                       conn_cb->ksnr_connected &= ~BIT(type);
+               break;
+       case SOCKLND_CONN_ANY:
+               if (conn_cb->ksnr_conn_count < conn_cb->ksnr_max_conns)
+                       conn_cb->ksnr_connected &= ~BIT(type);
+               break;
+       default:
+               LBUG();
+               break;
+       }
+
+       CDEBUG(D_NET, "Del conn type %d, ksnr_connected %x ksnr_max_conns %d\n",
+              type, conn_cb->ksnr_connected, conn_cb->ksnr_max_conns);
+}
+
 static void
 ksocknal_associate_cb_conn_locked(struct ksock_conn_cb *conn_cb,
                                  struct ksock_conn *conn)
@@ -462,13 +505,13 @@ ksocknal_associate_cb_conn_locked(struct ksock_conn_cb *conn_cb,
                if (conn_cb->ksnr_myiface < 0) {
                        /* route wasn't bound locally yet (the initial route) */
                        CDEBUG(D_NET, "Binding %s %pIS to interface %d\n",
-                              libcfs_id2str(peer_ni->ksnp_id),
+                              libcfs_idstr(&peer_ni->ksnp_id),
                               &conn_cb->ksnr_addr,
                               conn_iface);
                } else {
                        CDEBUG(D_NET,
                               "Rebinding %s %pIS from interface %d to %d\n",
-                              libcfs_id2str(peer_ni->ksnp_id),
+                              libcfs_idstr(&peer_ni->ksnp_id),
                               &conn_cb->ksnr_addr,
                               conn_cb->ksnr_myiface,
                               conn_iface);
@@ -564,15 +607,15 @@ ksocknal_del_conn_cb_locked(struct ksock_conn_cb *conn_cb)
 }
 
 int
-ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id,
+ksocknal_add_peer(struct lnet_ni *ni, struct lnet_processid *id,
                  struct sockaddr *addr)
 {
        struct ksock_peer_ni *peer_ni;
        struct ksock_peer_ni *peer2;
        struct ksock_conn_cb *conn_cb;
 
-       if (id.nid == LNET_NID_ANY ||
-           id.pid == LNET_PID_ANY)
+       if (LNET_NID_IS_ANY(&id->nid) ||
+           id->pid == LNET_PID_ANY)
                return (-EINVAL);
 
        /* Have a brand new peer_ni ready... */
@@ -598,17 +641,21 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id,
                peer_ni = peer2;
        } else {
                /* peer_ni table takes my ref on peer_ni */
-               hash_add(ksocknal_data.ksnd_peers, &peer_ni->ksnp_list, id.nid);
+               hash_add(ksocknal_data.ksnd_peers, &peer_ni->ksnp_list,
+                        nidhash(&id->nid));
        }
 
-       ksocknal_add_conn_cb_locked(peer_ni, conn_cb);
-
-       /* Remember conns_per_peer setting at the time
-        * of connection initiation. It will define the
-        * max number of conns per type for this conn_cb
-        * while it's in use.
-        */
-       conn_cb->ksnr_max_conns = ksocknal_get_conns_per_peer(peer_ni);
+       if (peer_ni->ksnp_conn_cb) {
+               ksocknal_conn_cb_decref(conn_cb);
+       } else {
+               ksocknal_add_conn_cb_locked(peer_ni, conn_cb);
+               /* Remember conns_per_peer setting at the time
+                * of connection initiation. It will define the
+                * max number of conns per type for this conn_cb
+                * while it's in use.
+                */
+               conn_cb->ksnr_max_conns = ksocknal_get_conns_per_peer(peer_ni);
+       }
 
        write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
@@ -616,7 +663,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id,
 }
 
 static void
-ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip)
+ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni)
 {
        struct ksock_conn *conn;
        struct ksock_conn *cnxt;
@@ -639,7 +686,7 @@ ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip)
 }
 
 static int
-ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip)
+ksocknal_del_peer(struct lnet_ni *ni, struct lnet_processid *id)
 {
        LIST_HEAD(zombies);
        struct hlist_node *pnxt;
@@ -651,8 +698,9 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip)
 
        write_lock_bh(&ksocknal_data.ksnd_global_lock);
 
-       if (id.nid != LNET_NID_ANY) {
-               lo = hash_min(id.nid, HASH_BITS(ksocknal_data.ksnd_peers));
+       if (id && !LNET_NID_IS_ANY(&id->nid)) {
+               lo = hash_min(nidhash(&id->nid),
+                             HASH_BITS(ksocknal_data.ksnd_peers));
                hi = lo;
        } else {
                lo = 0;
@@ -666,15 +714,15 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip)
                        if (peer_ni->ksnp_ni != ni)
                                continue;
 
-                       if (!((id.nid == LNET_NID_ANY ||
-                              peer_ni->ksnp_id.nid == id.nid) &&
-                             (id.pid == LNET_PID_ANY ||
-                              peer_ni->ksnp_id.pid == id.pid)))
+                       if (!((!id || LNET_NID_IS_ANY(&id->nid) ||
+                              nid_same(&peer_ni->ksnp_id.nid, &id->nid)) &&
+                             (!id || id->pid == LNET_PID_ANY ||
+                              peer_ni->ksnp_id.pid == id->pid)))
                                continue;
 
                        ksocknal_peer_addref(peer_ni);  /* a ref for me... */
 
-                       ksocknal_del_peer_locked(peer_ni, ip);
+                       ksocknal_del_peer_locked(peer_ni);
 
                        if (peer_ni->ksnp_closing &&
                            !list_empty(&peer_ni->ksnp_tx_queue)) {
@@ -797,7 +845,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
 {
        rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
        LIST_HEAD(zombies);
-       struct lnet_process_id peerid;
+       struct lnet_processid peerid;
        u64 incarnation;
        struct ksock_conn *conn;
        struct ksock_conn *conn2;
@@ -883,11 +931,11 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
 #endif
                }
 
-               rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+               rc = ksocknal_send_hello(ni, conn, &peerid.nid, hello);
                if (rc != 0)
                        goto failed_1;
        } else {
-               peerid.nid = LNET_NID_ANY;
+               peerid.nid = LNET_ANY_NID;
                peerid.pid = LNET_PID_ANY;
 
                /* Passive, get protocol from peer_ni */
@@ -900,15 +948,15 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
 
        LASSERT(rc == 0 || active);
        LASSERT(conn->ksnc_proto != NULL);
-       LASSERT(peerid.nid != LNET_NID_ANY);
+       LASSERT(!LNET_NID_IS_ANY(&peerid.nid));
 
-       cpt = lnet_cpt_of_nid(peerid.nid, ni);
+       cpt = lnet_nid2cpt(&peerid.nid, ni);
 
        if (active) {
                ksocknal_peer_addref(peer_ni);
                write_lock_bh(global_lock);
        } else {
-               peer_ni = ksocknal_create_peer(ni, peerid);
+               peer_ni = ksocknal_create_peer(ni, &peerid);
                if (IS_ERR(peer_ni)) {
                        rc = PTR_ERR(peer_ni);
                        goto failed_1;
@@ -919,12 +967,12 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
                /* called with a ref on ni, so shutdown can't have started */
                LASSERT(atomic_read(&((struct ksock_net *)ni->ni_data)->ksnn_npeers) >= 0);
 
-               peer2 = ksocknal_find_peer_locked(ni, peerid);
+               peer2 = ksocknal_find_peer_locked(ni, &peerid);
                if (peer2 == NULL) {
                        /* NB this puts an "empty" peer_ni in the peer_ni
                         * table (which takes my ref) */
                        hash_add(ksocknal_data.ksnd_peers,
-                                &peer_ni->ksnp_list, peerid.nid);
+                                &peer_ni->ksnp_list, nidhash(&peerid.nid));
                } else {
                        ksocknal_peer_decref(peer_ni);
                        peer_ni = peer2;
@@ -937,7 +985,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
                /* Am I already connecting to this guy?  Resolve in
                 * favour of higher NID...
                 */
-               if (peerid.nid < ni->ni_nid &&
+               if (memcmp(&peerid.nid, &ni->ni_nid, sizeof(peerid.nid)) < 0 &&
                    ksocknal_connecting(peer_ni->ksnp_conn_cb,
                                        ((struct sockaddr *) &conn->ksnc_peeraddr))) {
                        rc = EALREADY;
@@ -1034,7 +1082,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
            !rpc_cmp_addr((struct sockaddr *)&conn_cb->ksnr_addr,
                          (struct sockaddr *)&conn->ksnc_peeraddr)) {
                CERROR("Route %s %pIS connected to %pIS\n",
-                      libcfs_id2str(peer_ni->ksnp_id),
+                      libcfs_idstr(&peer_ni->ksnp_id),
                       &conn_cb->ksnr_addr,
                       &conn->ksnc_peeraddr);
        }
@@ -1093,7 +1141,6 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
        }
 
        write_unlock_bh(global_lock);
-
        /* We've now got a new connection.  Any errors from here on are just
         * like "normal" comms errors and we close the connection normally.
         * NB (a) we still have to send the reply HELLO for passive
@@ -1104,13 +1151,13 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
 
        CDEBUG(D_NET, "New conn %s p %d.x %pIS -> %pISp"
               " incarnation:%lld sched[%d]\n",
-              libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+              libcfs_idstr(&peerid), conn->ksnc_proto->pro_version,
               &conn->ksnc_myaddr, &conn->ksnc_peeraddr,
               incarnation, cpt);
 
        if (!active) {
                hello->kshm_nips = 0;
-               rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+               rc = ksocknal_send_hello(ni, conn, &peerid.nid, hello);
        }
 
        LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg,
@@ -1167,10 +1214,10 @@ failed_2:
        if (warn != NULL) {
                if (rc < 0)
                        CERROR("Not creating conn %s type %d: %s\n",
-                              libcfs_id2str(peerid), conn->ksnc_type, warn);
+                              libcfs_idstr(&peerid), conn->ksnc_type, warn);
                else
                        CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
-                              libcfs_id2str(peerid), conn->ksnc_type, warn);
+                              libcfs_idstr(&peerid), conn->ksnc_type, warn);
        }
 
        if (!active) {
@@ -1180,7 +1227,7 @@ failed_2:
                         */
                        conn->ksnc_type = SOCKLND_CONN_NONE;
                        hello->kshm_nips = 0;
-                       ksocknal_send_hello(ni, conn, peerid.nid, hello);
+                       ksocknal_send_hello(ni, conn, &peerid.nid, hello);
                }
 
                write_lock_bh(global_lock);
@@ -1219,6 +1266,8 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
        struct ksock_peer_ni *peer_ni = conn->ksnc_peer;
        struct ksock_conn_cb *conn_cb;
        struct ksock_conn *conn2;
+       int conn_count;
+       int duplicate_count = 0;
 
        LASSERT(peer_ni->ksnp_error == 0);
        LASSERT(!conn->ksnc_closing);
@@ -1232,21 +1281,29 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
                /* dissociate conn from cb... */
                LASSERT(!conn_cb->ksnr_deleted);
 
+               conn_count = ksocknal_get_conn_count_by_type(conn_cb,
+                                                            conn->ksnc_type);
                /* connected bit is set only if all connections
                 * of the given type got created
                 */
-               if (ksocknal_get_conn_count_by_type(conn_cb, conn->ksnc_type) ==
-                   conn_cb->ksnr_max_conns)
+               if (conn_count == conn_cb->ksnr_max_conns)
                        LASSERT((conn_cb->ksnr_connected &
                                BIT(conn->ksnc_type)) != 0);
 
-               list_for_each_entry(conn2, &peer_ni->ksnp_conns, ksnc_list) {
-                       if (conn2->ksnc_conn_cb == conn_cb &&
-                           conn2->ksnc_type == conn->ksnc_type)
-                               goto conn2_found;
+               if (conn_count == 1) {
+                       list_for_each_entry(conn2, &peer_ni->ksnp_conns,
+                                           ksnc_list) {
+                               if (conn2->ksnc_conn_cb == conn_cb &&
+                                   conn2->ksnc_type == conn->ksnc_type)
+                                       duplicate_count += 1;
+                       }
+                       if (duplicate_count > 0)
+                               CERROR("Found %d duplicate conns type %d\n",
+                                      duplicate_count,
+                                      conn->ksnc_type);
                }
-               conn_cb->ksnr_connected &= ~BIT(conn->ksnc_type);
-conn2_found:
+               ksocknal_decr_conn_count(conn_cb, conn->ksnc_type);
+
                conn->ksnc_conn_cb = NULL;
 
                /* drop conn's ref on conn_cb */
@@ -1317,7 +1374,8 @@ ksocknal_peer_failed(struct ksock_peer_ni *peer_ni)
        read_unlock(&ksocknal_data.ksnd_global_lock);
 
        if (notify)
-               lnet_notify(peer_ni->ksnp_ni, peer_ni->ksnp_id.nid,
+               lnet_notify(peer_ni->ksnp_ni,
+                           lnet_nid_to_nid4(&peer_ni->ksnp_id.nid),
                            false, false, last_alive);
 }
 
@@ -1455,9 +1513,10 @@ ksocknal_destroy_conn(struct ksock_conn *conn)
                 last_rcv = conn->ksnc_rx_deadline -
                           ksocknal_timeout();
                CERROR("Completing partial receive from %s[%d], ip %pISp, with error, wanted: %d, left: %d, last alive is %lld secs ago\n",
-                       libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+                      libcfs_idstr(&conn->ksnc_peer->ksnp_id),
+                      conn->ksnc_type,
                       &conn->ksnc_peeraddr,
-                       conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+                      conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
                       ktime_get_seconds() - last_rcv);
                if (conn->ksnc_lnet_msg)
                        conn->ksnc_lnet_msg->msg_health_status =
@@ -1467,31 +1526,31 @@ ksocknal_destroy_conn(struct ksock_conn *conn)
        case SOCKNAL_RX_LNET_HEADER:
                if (conn->ksnc_rx_started)
                        CERROR("Incomplete receive of lnet header from %s, ip %pISp, with error, protocol: %d.x.\n",
-                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              libcfs_idstr(&conn->ksnc_peer->ksnp_id),
                               &conn->ksnc_peeraddr,
                               conn->ksnc_proto->pro_version);
                break;
-        case SOCKNAL_RX_KSM_HEADER:
-                if (conn->ksnc_rx_started)
+       case SOCKNAL_RX_KSM_HEADER:
+               if (conn->ksnc_rx_started)
                        CERROR("Incomplete receive of ksock message from %s, ip %pISp, with error, protocol: %d.x.\n",
-                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              libcfs_idstr(&conn->ksnc_peer->ksnp_id),
                               &conn->ksnc_peeraddr,
                               conn->ksnc_proto->pro_version);
-                break;
-        case SOCKNAL_RX_SLOP:
-                if (conn->ksnc_rx_started)
+               break;
+       case SOCKNAL_RX_SLOP:
+               if (conn->ksnc_rx_started)
                        CERROR("Incomplete receive of slops from %s, ip %pISp, with error\n",
-                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              libcfs_idstr(&conn->ksnc_peer->ksnp_id),
                               &conn->ksnc_peeraddr);
-               break;
-        default:
-                LBUG ();
-                break;
-        }
+               break;
+       default:
+               LBUG();
+               break;
+       }
 
-        ksocknal_peer_decref(conn->ksnc_peer);
+       ksocknal_peer_decref(conn->ksnc_peer);
 
-        LIBCFS_FREE (conn, sizeof (*conn));
+       LIBCFS_FREE(conn, sizeof(*conn));
 }
 
 int
@@ -1531,7 +1590,7 @@ ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why)
 }
 
 int
-ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr)
+ksocknal_close_matching_conns(struct lnet_processid *id, __u32 ipaddr)
 {
        struct ksock_peer_ni *peer_ni;
        struct hlist_node *pnxt;
@@ -1543,8 +1602,9 @@ ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr)
 
        write_lock_bh(&ksocknal_data.ksnd_global_lock);
 
-       if (id.nid != LNET_NID_ANY) {
-               lo = hash_min(id.nid, HASH_BITS(ksocknal_data.ksnd_peers));
+       if (!LNET_NID_IS_ANY(&id->nid)) {
+               lo = hash_min(nidhash(&id->nid),
+                             HASH_BITS(ksocknal_data.ksnd_peers));
                hi = lo;
        } else {
                lo = 0;
@@ -1557,10 +1617,10 @@ ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr)
                                          &ksocknal_data.ksnd_peers[i],
                                          ksnp_list) {
 
-                       if (!((id.nid == LNET_NID_ANY ||
-                              id.nid == peer_ni->ksnp_id.nid) &&
-                             (id.pid == LNET_PID_ANY ||
-                              id.pid == peer_ni->ksnp_id.pid)))
+                       if (!((LNET_NID_IS_ANY(&id->nid) ||
+                              nid_same(&id->nid, &peer_ni->ksnp_id.nid)) &&
+                             (id->pid == LNET_PID_ANY ||
+                              id->pid == peer_ni->ksnp_id.pid)))
                                continue;
 
                        count += ksocknal_close_peer_conns_locked(
@@ -1572,31 +1632,33 @@ ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr)
        write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
        /* wildcards always succeed */
-       if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+       if (LNET_NID_IS_ANY(&id->nid) || id->pid == LNET_PID_ANY ||
+           ipaddr == 0)
                return 0;
 
        return (count == 0 ? -ENOENT : 0);
 }
 
 void
-ksocknal_notify_gw_down(lnet_nid_t gw_nid)
+ksocknal_notify_gw_down(struct lnet_nid *gw_nid)
 {
        /* The router is telling me she's been notified of a change in
         * gateway state....
         */
-       struct lnet_process_id id = {
-               .nid    = gw_nid,
+       struct lnet_processid id = {
                .pid    = LNET_PID_ANY,
+               .nid    = *gw_nid,
        };
 
-       CDEBUG(D_NET, "gw %s down\n", libcfs_nid2str(gw_nid));
+       CDEBUG(D_NET, "gw %s down\n", libcfs_nidstr(gw_nid));
 
        /* If the gateway crashed, close all open connections... */
-       ksocknal_close_matching_conns(id, 0);
+       ksocknal_close_matching_conns(&id, 0);
        return;
 
        /* We can only establish new connections
-        * if we have autroutes, and these connect on demand. */
+        * if we have autoroutes, and these connect on demand.
+        */
 }
 
 static void
@@ -1630,15 +1692,16 @@ ksocknal_push_peer(struct ksock_peer_ni *peer_ni)
 }
 
 static int
-ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id)
+ksocknal_push(struct lnet_ni *ni, struct lnet_processid *id)
 {
        int lo;
        int hi;
        int bkt;
        int rc = -ENOENT;
 
-       if (id.nid != LNET_NID_ANY) {
-               lo = hash_min(id.nid, HASH_BITS(ksocknal_data.ksnd_peers));
+       if (!LNET_NID_IS_ANY(&id->nid)) {
+               lo = hash_min(nidhash(&id->nid),
+                             HASH_BITS(ksocknal_data.ksnd_peers));
                hi = lo;
        } else {
                lo = 0;
@@ -1656,10 +1719,11 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id)
                        hlist_for_each_entry(peer_ni,
                                             &ksocknal_data.ksnd_peers[bkt],
                                             ksnp_list) {
-                               if (!((id.nid == LNET_NID_ANY ||
-                                      id.nid == peer_ni->ksnp_id.nid) &&
-                                     (id.pid == LNET_PID_ANY ||
-                                      id.pid == peer_ni->ksnp_id.pid)))
+                               if (!((LNET_NID_IS_ANY(&id->nid) ||
+                                      nid_same(&id->nid,
+                                                &peer_ni->ksnp_id.nid)) &&
+                                     (id->pid == LNET_PID_ANY ||
+                                      id->pid == peer_ni->ksnp_id.pid)))
                                        continue;
 
                                if (i++ == peer_off) {
@@ -1683,7 +1747,7 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id)
 int
 ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
 {
-       struct lnet_process_id id = {0};
+       struct lnet_processid id = {};
        struct libcfs_ioctl_data *data = arg;
        int rc;
 
@@ -1715,43 +1779,43 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
                return rc;
         }
 
-        case IOC_LIBCFS_GET_PEER: {
-                __u32            myip = 0;
-                __u32            ip = 0;
-                int              port = 0;
-                int              conn_count = 0;
-                int              share_count = 0;
-
-                rc = ksocknal_get_peer_info(ni, data->ioc_count,
-                                            &id, &myip, &ip, &port,
-                                            &conn_count,  &share_count);
-                if (rc != 0)
-                        return rc;
+       case IOC_LIBCFS_GET_PEER: {
+               __u32 myip = 0;
+               __u32 ip = 0;
+               int port = 0;
+               int conn_count = 0;
+               int share_count = 0;
 
-                data->ioc_nid    = id.nid;
-                data->ioc_count  = share_count;
-                data->ioc_u32[0] = ip;
-                data->ioc_u32[1] = port;
-                data->ioc_u32[2] = myip;
-                data->ioc_u32[3] = conn_count;
-                data->ioc_u32[4] = id.pid;
-                return 0;
-        }
+               rc = ksocknal_get_peer_info(ni, data->ioc_count,
+                                           &id, &myip, &ip, &port,
+                                           &conn_count,  &share_count);
+               if (rc != 0)
+                       return rc;
+               if (!nid_is_nid4(&id.nid))
+                       return -EINVAL;
+               data->ioc_nid    = lnet_nid_to_nid4(&id.nid);
+               data->ioc_count  = share_count;
+               data->ioc_u32[0] = ip;
+               data->ioc_u32[1] = port;
+               data->ioc_u32[2] = myip;
+               data->ioc_u32[3] = conn_count;
+               data->ioc_u32[4] = id.pid;
+               return 0;
+       }
 
        case IOC_LIBCFS_ADD_PEER: {
                struct sockaddr_in sa = {.sin_family = AF_INET};
 
-               id.nid = data->ioc_nid;
                id.pid = LNET_PID_LUSTRE;
+               lnet_nid4_to_nid(data->ioc_nid, &id.nid);
                sa.sin_addr.s_addr = htonl(data->ioc_u32[0]);
                sa.sin_port = htons(data->ioc_u32[1]);
-               return ksocknal_add_peer(ni, id, (struct sockaddr *)&sa);
+               return ksocknal_add_peer(ni, &id, (struct sockaddr *)&sa);
        }
-        case IOC_LIBCFS_DEL_PEER:
-                id.nid = data->ioc_nid;
-                id.pid = LNET_PID_ANY;
-                return ksocknal_del_peer (ni, id,
-                                          data->ioc_u32[0]); /* IP */
+       case IOC_LIBCFS_DEL_PEER:
+               lnet_nid4_to_nid(data->ioc_nid, &id.nid);
+               id.pid = LNET_PID_ANY;
+               return ksocknal_del_peer(ni, &id);
 
         case IOC_LIBCFS_GET_CONN: {
                 int           txmem;
@@ -1766,9 +1830,9 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
 
                 ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
 
-                data->ioc_count  = txmem;
-                data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
-                data->ioc_flags  = nagle;
+               data->ioc_count = txmem;
+               data->ioc_nid = lnet_nid_to_nid4(&conn->ksnc_peer->ksnp_id.nid);
+               data->ioc_flags = nagle;
                if (psa->sin_family == AF_INET)
                        data->ioc_u32[0] = ntohl(psa->sin_addr.s_addr);
                else
@@ -1787,31 +1851,32 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
                 return 0;
         }
 
-        case IOC_LIBCFS_CLOSE_CONNECTION:
-                id.nid = data->ioc_nid;
-                id.pid = LNET_PID_ANY;
-                return ksocknal_close_matching_conns (id,
-                                                      data->ioc_u32[0]);
-
-        case IOC_LIBCFS_REGISTER_MYNID:
-                /* Ignore if this is a noop */
-                if (data->ioc_nid == ni->ni_nid)
-                        return 0;
-
-                CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
-                       libcfs_nid2str(data->ioc_nid),
-                       libcfs_nid2str(ni->ni_nid));
-                return -EINVAL;
-
-        case IOC_LIBCFS_PUSH_CONNECTION:
-                id.nid = data->ioc_nid;
-                id.pid = LNET_PID_ANY;
-                return ksocknal_push(ni, id);
-
-        default:
-                return -EINVAL;
-        }
-        /* not reached */
+       case IOC_LIBCFS_CLOSE_CONNECTION:
+               lnet_nid4_to_nid(data->ioc_nid, &id.nid);
+               id.pid = LNET_PID_ANY;
+               return ksocknal_close_matching_conns(&id,
+                                                    data->ioc_u32[0]);
+
+       case IOC_LIBCFS_REGISTER_MYNID:
+               /* Ignore if this is a noop */
+               if (nid_is_nid4(&ni->ni_nid) &&
+                   data->ioc_nid == lnet_nid_to_nid4(&ni->ni_nid))
+                       return 0;
+
+               CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                      libcfs_nid2str(data->ioc_nid),
+                      libcfs_nidstr(&ni->ni_nid));
+               return -EINVAL;
+
+       case IOC_LIBCFS_PUSH_CONNECTION:
+               lnet_nid4_to_nid(data->ioc_nid, &id.nid);
+               id.pid = LNET_PID_ANY;
+               return ksocknal_push(ni, &id);
+
+       default:
+               return -EINVAL;
+       }
+       /* not reached */
 }
 
 static void
@@ -1847,11 +1912,15 @@ static int ksocknal_get_link_status(struct net_device *dev)
 
        LASSERT(dev);
 
-       if (!netif_running(dev))
+       if (!netif_running(dev)) {
                ret = 0;
+               CDEBUG(D_NET, "device not running\n");
+       }
        /* Some devices may not be providing link settings */
-       else if (dev->ethtool_ops->get_link)
+       else if (dev->ethtool_ops->get_link) {
                ret = dev->ethtool_ops->get_link(dev);
+               CDEBUG(D_NET, "get_link returns %u\n", ret);
+       }
 
        return ret;
 }
@@ -1860,11 +1929,16 @@ static int
 ksocknal_handle_link_state_change(struct net_device *dev,
                                  unsigned char operstate)
 {
-       struct lnet_ni *ni;
+       struct lnet_ni *ni = NULL;
        struct ksock_net *net;
        struct ksock_net *cnxt;
        int ifindex;
        unsigned char link_down = !(operstate == IF_OPER_UP);
+       struct in_device *in_dev;
+       bool found_ip = false;
+       struct ksock_interface *ksi = NULL;
+       struct sockaddr_in *sa;
+       DECLARE_CONST_IN_IFADDR(ifa);
 
        ifindex = dev->ifindex;
 
@@ -1873,20 +1947,92 @@ ksocknal_handle_link_state_change(struct net_device *dev,
 
        list_for_each_entry_safe(net, cnxt, &ksocknal_data.ksnd_nets,
                                 ksnn_list) {
-               if (net->ksnn_interface.ksni_index != ifindex)
+
+               ksi = &net->ksnn_interface;
+               sa = (void *)&ksi->ksni_addr;
+               found_ip = false;
+
+               if (ksi->ksni_index != ifindex ||
+                   strcmp(ksi->ksni_name, dev->name))
                        continue;
+
                ni = net->ksnn_ni;
-               if (link_down)
+
+               in_dev = __in_dev_get_rtnl(dev);
+               if (!in_dev) {
+                       CDEBUG(D_NET, "Interface %s has no IPv4 status.\n",
+                              dev->name);
+                       CDEBUG(D_NET, "set link fatal state to 1\n");
+                       atomic_set(&ni->ni_fatal_error_on, 1);
+                       continue;
+               }
+               in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+                       if (sa->sin_addr.s_addr == ifa->ifa_local)
+                               found_ip = true;
+               }
+               endfor_ifa(in_dev);
+
+               if (!found_ip) {
+                       CDEBUG(D_NET, "Interface %s has no matching ip\n",
+                              dev->name);
+                       CDEBUG(D_NET, "set link fatal state to 1\n");
+                       atomic_set(&ni->ni_fatal_error_on, 1);
+                       continue;
+               }
+
+               if (link_down) {
+                       CDEBUG(D_NET, "set link fatal state to 1\n");
                        atomic_set(&ni->ni_fatal_error_on, link_down);
-               else
+               } else {
+                       CDEBUG(D_NET, "set link fatal state to %u\n",
+                              (ksocknal_get_link_status(dev) == 0));
                        atomic_set(&ni->ni_fatal_error_on,
                                   (ksocknal_get_link_status(dev) == 0));
+               }
        }
 out:
        return 0;
 }
 
 
+/*
+ * Apply an IPv4 address event to the matching socklnd nets.  A net matches
+ * when its interface has the ifindex and name of the device owning @ifa and
+ * its configured address equals ifa->ifa_local.  ni_fatal_error_on of each
+ * match becomes 1 for NETDEV_DOWN and 0 for any other @event.  Returns 0.
+ */
+static int
+ksocknal_handle_inetaddr_change(struct in_ifaddr *ifa, unsigned long event)
+{
+       struct net_device *event_netdev = ifa->ifa_dev->dev;
+       int fatal = (event == NETDEV_DOWN);
+       struct ksock_interface *iface;
+       struct sockaddr_in *sin;
+       struct ksock_net *net;
+       struct ksock_net *next;
+
+       if (!ksocknal_data.ksnd_nnets)
+               return 0;
+
+       list_for_each_entry_safe(net, next, &ksocknal_data.ksnd_nets,
+                                ksnn_list) {
+               iface = &net->ksnn_interface;
+               if (iface->ksni_index != event_netdev->ifindex ||
+                   strcmp(iface->ksni_name, event_netdev->name))
+                       continue;
+
+               /* ksni_addr is read through a sockaddr_in view */
+               sin = (void *)&iface->ksni_addr;
+               if (sin->sin_addr.s_addr != ifa->ifa_local)
+                       continue;
+
+               CDEBUG(D_NET, "set link fatal state to %u\n", fatal);
+               atomic_set(&net->ksnn_ni->ni_fatal_error_on, fatal);
+       }
+
+       return 0;
+}
+
 /************************************
  * Net device notifier event handler
  ************************************/
@@ -1898,6 +2044,9 @@ static int ksocknal_device_event(struct notifier_block *unused,
 
        operstate = dev->operstate;
 
+       CDEBUG(D_NET, "devevent: status=%ld, iface=%s ifindex %d state %u\n",
+              event, dev->name, dev->ifindex, operstate);
+
        switch (event) {
        case NETDEV_UP:
        case NETDEV_DOWN:
@@ -1909,10 +2058,36 @@ static int ksocknal_device_event(struct notifier_block *unused,
        return NOTIFY_OK;
 }
 
-static struct notifier_block ksocknal_notifier_block = {
+/*
+ * Inetaddr notifier callback.  @ptr is taken as the struct in_ifaddr the
+ * event refers to; @unused is not referenced.  NETDEV_UP, NETDEV_DOWN and
+ * NETDEV_CHANGE are handed to ksocknal_handle_inetaddr_change(), every other
+ * event is only logged.  Always returns NOTIFY_OK.
+ */
+static int ksocknal_inetaddr_event(struct notifier_block *unused,
+                                  unsigned long event, void *ptr)
+{
+       struct in_ifaddr *ifa = ptr;
+
+       /* event is unsigned long, so it is printed with %lu */
+       CDEBUG(D_NET, "addrevent: status %lu ip addr %pI4, netmask %pI4.\n",
+              event, &ifa->ifa_address, &ifa->ifa_mask);
+
+       if (event == NETDEV_UP || event == NETDEV_DOWN ||
+           event == NETDEV_CHANGE)
+               ksocknal_handle_inetaddr_change(ifa, event);
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block ksocknal_dev_notifier_block = {
        .notifier_call = ksocknal_device_event,
 };
 
+static struct notifier_block ksocknal_inetaddr_notifier_block = {
+       .notifier_call = ksocknal_inetaddr_event,
+};
+
 static void
 ksocknal_base_shutdown(void)
 {
@@ -1924,13 +2099,15 @@ ksocknal_base_shutdown(void)
               libcfs_kmem_read());
        LASSERT (ksocknal_data.ksnd_nnets == 0);
 
-       if (ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL)
-               unregister_netdevice_notifier(&ksocknal_notifier_block);
+       if (ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL) {
+               unregister_netdevice_notifier(&ksocknal_dev_notifier_block);
+               unregister_inetaddr_notifier(&ksocknal_inetaddr_notifier_block);
+       }
 
        switch (ksocknal_data.ksnd_init) {
        default:
                LASSERT(0);
-               /* fallthrough */
+               fallthrough;
 
        case SOCKNAL_INIT_ALL:
        case SOCKNAL_INIT_DATA:
@@ -2091,7 +2268,8 @@ ksocknal_base_startup(void)
                 goto failed;
         }
 
-       register_netdevice_notifier(&ksocknal_notifier_block);
+       register_netdevice_notifier(&ksocknal_dev_notifier_block);
+       register_inetaddr_notifier(&ksocknal_inetaddr_notifier_block);
 
         /* flag everything initialised */
         ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
@@ -2118,9 +2296,8 @@ ksocknal_debug_peerhash(struct lnet_ni *ni)
                if (peer_ni->ksnp_ni != ni)
                        continue;
 
-               CWARN("Active peer_ni on shutdown: %s, ref %d, "
-                     "closing %d, accepting %d, err %d, zcookie %llu, "
-                     "txq %d, zc_req %d\n", libcfs_id2str(peer_ni->ksnp_id),
+               CWARN("Active peer_ni on shutdown: %s, ref %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n",
+                     libcfs_idstr(&peer_ni->ksnp_id),
                      refcount_read(&peer_ni->ksnp_refcount),
                      peer_ni->ksnp_closing,
                      peer_ni->ksnp_accepting, peer_ni->ksnp_error,
@@ -2153,10 +2330,6 @@ void
 ksocknal_shutdown(struct lnet_ni *ni)
 {
        struct ksock_net *net = ni->ni_data;
-       struct lnet_process_id anyid = {
-               .nid = LNET_NID_ANY,
-               .pid = LNET_PID_ANY,
-       };
 
        LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
        LASSERT(ksocknal_data.ksnd_nnets > 0);
@@ -2165,7 +2338,7 @@ ksocknal_shutdown(struct lnet_ni *ni)
        atomic_add(SOCKNAL_SHUTDOWN_BIAS, &net->ksnn_npeers);
 
        /* Delete all peers */
-       ksocknal_del_peer(ni, anyid, 0);
+       ksocknal_del_peer(ni, NULL);
 
        /* Wait for all peer_ni state to clean up */
        wait_var_event_warning(&net->ksnn_npeers,
@@ -2348,10 +2521,8 @@ ksocknal_startup(struct lnet_ni *ni)
 
        LASSERT(ksi);
        LASSERT(ksi->ksni_addr.ss_family == AF_INET);
-       ni->ni_nid = LNET_MKNID(
-               LNET_NIDNET(ni->ni_nid),
-               ntohl(((struct sockaddr_in *)
-                      &ksi->ksni_addr)->sin_addr.s_addr));
+       ni->ni_nid.nid_addr[0] =
+               ((struct sockaddr_in *)&ksi->ksni_addr)->sin_addr.s_addr;
        list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
        net->ksnn_ni = ni;
        ksocknal_data.ksnd_nnets++;