Whamcloud - gitweb
* More work on config error messages + some code restructure to trap config
   errors better.
author: eeb <eeb>
Thu, 30 Jun 2005 17:33:32 +0000 (17:33 +0000)
committer: eeb <eeb>
Thu, 30 Jun 2005 17:33:32 +0000 (17:33 +0000)

*  Fixed bug in 'implicit loopback'

*  Completely removed duplicate interface usage checks from generic network
   setup.  The NAL should be doing this.   Currently socknal is the only one
   that supports multiple instances and it doesn't mind at all if 2 different
   network instances use the same interfaces, and that's dead useful for
   testing.

*  Tightened up lonal refcounting; there was a remote chance of a shutdown
   race.

14 files changed:
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-p30.h
lnet/include/lnet/lib-types.h
lnet/klnds/openiblnd/openiblnd.c
lnet/klnds/ralnd/ralnd.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/libcfs/linux/linux-tcpip.c
lnet/lnet/acceptor.c
lnet/lnet/api-ni.c
lnet/lnet/config.c
lnet/lnet/lib-move.c
lnet/lnet/lo.c

index 2fa1619..7f5a6dc 100644 (file)
@@ -490,7 +490,7 @@ extern void ptl_connect_console_error(int rc, ptl_nid_t peer_nid,
 
 extern int ptl_count_acceptor_nis(ptl_ni_t **first_ni);
 
-extern ptl_err_t ptl_accept(struct socket *sock, __u32 magic, int choose_ni);
+extern ptl_err_t ptl_accept(ptl_ni_t *blind_ni, struct socket *sock, __u32 magic);
 extern int       ptl_acceptor_timeout(void);
 extern int       ptl_acceptor_port(void);
 #endif
index 2fa1619..7f5a6dc 100644 (file)
@@ -490,7 +490,7 @@ extern void ptl_connect_console_error(int rc, ptl_nid_t peer_nid,
 
 extern int ptl_count_acceptor_nis(ptl_ni_t **first_ni);
 
-extern ptl_err_t ptl_accept(struct socket *sock, __u32 magic, int choose_ni);
+extern ptl_err_t ptl_accept(ptl_ni_t *blind_ni, struct socket *sock, __u32 magic);
 extern int       ptl_acceptor_timeout(void);
 extern int       ptl_acceptor_port(void);
 #endif
index 5f1b411..710fcb6 100644 (file)
@@ -356,6 +356,7 @@ typedef struct                                  /* loopback descriptor */
         }                lod_iov;
 } lo_desc_t;
 
+/* loopback descriptor types */
 #define LOD_IOV     0xeb105
 #define LOD_KIOV    0xeb106
 
index 497afce..fd86f9c 100644 (file)
@@ -333,7 +333,7 @@ kibnal_handle_svcqry (struct socket *sock)
                                       ptl_acceptor_timeout());
         } else {
                 /* This might be a generic acceptor connection request... */
-                rc = ptl_accept(sock, msg->ibm_magic, 0);
+                rc = ptl_accept(kibnal_data.kib_ni, sock, msg->ibm_magic);
                 if (rc != PTL_OK)
                         goto out;
 
index 9422b08..28218ae 100644 (file)
@@ -77,7 +77,7 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active)
             connreq->racr_magic != RANAL_MSG_MAGIC &&
             connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) {
                 /* Is this a generic acceptor connection request? */
-                rc = ptl_accept(sock, connreq->racr_magic, 0);
+                rc = ptl_accept(kranal_data.kra_ni, sock, connreq->racr_magic);
                 if (rc != PTL_OK)               /* nope */
                         return -EPROTO;
 
index 6effa0a..a552472 100644 (file)
@@ -311,17 +311,17 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
         if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
                 if (route->ksnr_myipaddr == 0) {
                         /* route wasn't bound locally yet (the initial route) */
-                        CWARN("Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
-                              libcfs_nid2str(peer->ksnp_nid),
-                              HIPQUAD(route->ksnr_ipaddr),
-                              HIPQUAD(conn->ksnc_myipaddr));
+                        CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+                               libcfs_nid2str(peer->ksnp_nid),
+                               HIPQUAD(route->ksnr_ipaddr),
+                               HIPQUAD(conn->ksnc_myipaddr));
                 } else {
-                        CWARN("Rebinding %s %u.%u.%u.%u from "
-                              "%u.%u.%u.%u to %u.%u.%u.%u\n",
-                              libcfs_nid2str(peer->ksnp_nid),
-                              HIPQUAD(route->ksnr_ipaddr),
-                              HIPQUAD(route->ksnr_myipaddr),
-                              HIPQUAD(conn->ksnc_myipaddr));
+                        CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+                               "%u.%u.%u.%u to %u.%u.%u.%u\n",
+                               libcfs_nid2str(peer->ksnp_nid),
+                               HIPQUAD(route->ksnr_ipaddr),
+                               HIPQUAD(route->ksnr_myipaddr),
+                               HIPQUAD(conn->ksnc_myipaddr));
 
                         iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
                                                   route->ksnr_myipaddr);
@@ -926,11 +926,14 @@ ksocknal_accept (ptl_ni_t *ni, struct socket *sock)
 
         PORTAL_ALLOC(cr, sizeof(*cr));
         if (cr == NULL) {
-                CWARN("ENOMEM allocating connection request from"
-                      "%u.%u.%u.%u\n", HIPQUAD(peer_ip));
+                LCONSOLE_ERROR("Dropping connection request from "
+                               "%u.%u.%u.%u: memory exhausted\n",
+                               HIPQUAD(peer_ip));
                 return PTL_FAIL;
         }
 
+        ptl_ni_addref(ni);
+        cr->ksncr_ni   = ni;
         cr->ksncr_sock = sock;
 
         spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
@@ -943,13 +946,13 @@ ksocknal_accept (ptl_ni_t *ni, struct socket *sock)
 }
 
 int
-ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
+ksocknal_create_conn (ptl_ni_t *ni, ksock_route_t *route, 
+                      struct socket *sock, int type)
 {
         rwlock_t          *global_lock = &ksocknal_data.ksnd_global_lock;
+        ksock_net_t       *net = (ksock_net_t *)ni->ni_data;
         __u32              ipaddrs[PTL_MAX_INTERFACES];
         int                nipaddrs;
-        ptl_ni_t          *ni;
-        ksock_net_t       *net;
         ptl_nid_t          nid;
         struct list_head  *tmp;
         __u64              incarnation;
@@ -1000,15 +1003,12 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 goto failed_1;
 
         if (route != NULL) {
-                ni = route->ksnr_peer->ksnp_ni;
-                net = (ksock_net_t *)ni->ni_data;
+                LASSERT(ni == route->ksnr_peer->ksnp_ni);
 
                 /* Active connection sends HELLO eagerly */
                 nipaddrs = ksocknal_local_ipvec(ni, ipaddrs);
 
-                rc = ksocknal_send_hello (conn, ni->ni_nid, 
-                                          net->ksnn_incarnation,
-                                          ipaddrs, nipaddrs);
+                rc = ksocknal_send_hello (ni, conn, ipaddrs, nipaddrs);
                 if (rc != 0)
                         goto failed_1;
         }
@@ -1018,7 +1018,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
          * Passive connections use the listener timeout since the peer sends
          * eagerly */
         nid = (route == NULL) ? PTL_NID_ANY : route->ksnr_peer->ksnp_nid;
-        rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs);
+        rc = ksocknal_recv_hello (ni, conn, &nid, &incarnation, ipaddrs);
         if (rc < 0)
                 goto failed_1;
         nipaddrs = rc;
@@ -1033,24 +1033,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                                        ipaddrs, nipaddrs);
                 rc = 0;
         } else {
-                ni = ptl_net2ni(PTL_NIDNET(nid));
-
-                if (ni == NULL) {
-                        CERROR("Refusing connection attempt "
-                               "(no matching net)\n");
-                        rc = -ECONNREFUSED;
-                        goto failed_1;
-                }
-
-                net = (ksock_net_t *)ni->ni_data;
                 rc = ksocknal_create_peer(&peer, ni, nid);
-
-                /* lose extra ref from ptl_net2ni NB we wait for all the peers
-                 * to be deleted before ni teardown can complete; i.e. ni can't
-                 * disappear until all its peer table entries has gone so
-                 * there's no need to account the peer's refs on ni. */
-                ptl_ni_decref(ni);
-
                 if (rc != 0)
                         goto failed_1;
 
@@ -1072,9 +1055,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 write_unlock_irqrestore(global_lock, flags);
 
                 nipaddrs = ksocknal_select_ips(peer, ipaddrs, nipaddrs);
-                rc = ksocknal_send_hello (conn, ni->ni_nid,
-                                          net->ksnn_incarnation,
-                                          ipaddrs, nipaddrs);
+                rc = ksocknal_send_hello (ni, conn, ipaddrs, nipaddrs);
                 if (rc < 0)
                         goto failed_2;
         }
@@ -1180,11 +1161,11 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 ksocknal_connsock_decref(conn);
         }
 
-        CWARN("New conn %s %u.%u.%u.%u -> %u.%u.%u.%u/%d"
-              " incarnation:"LPD64" sched[%d]/%d\n",
-              libcfs_nid2str(nid), HIPQUAD(conn->ksnc_myipaddr),
-              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
-              (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
+        CDEBUG(D_NET, "New conn %s %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+               " incarnation:"LPD64" sched[%d]/%d\n",
+               libcfs_nid2str(nid), HIPQUAD(conn->ksnc_myipaddr),
+               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
+               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
 
         ksocknal_conn_decref(conn);
         return (0);
@@ -1433,11 +1414,11 @@ ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation)
                 if (conn->ksnc_incarnation == incarnation)
                         continue;
 
-                CWARN("Closing stale conn %s ip:%08x/%d "
-                      "incarnation:"LPD64"("LPD64")\n",
-                      libcfs_nid2str(peer->ksnp_nid), 
-                      conn->ksnc_ipaddr, conn->ksnc_port,
-                      conn->ksnc_incarnation, incarnation);
+                CDEBUG(D_NET, "Closing stale conn %s ip:%08x/%d "
+                       "incarnation:"LPD64"("LPD64")\n",
+                       libcfs_nid2str(peer->ksnp_nid), 
+                       conn->ksnc_ipaddr, conn->ksnc_port,
+                       conn->ksnc_incarnation, incarnation);
 
                 count++;
                 ksocknal_close_conn_locked (conn, -ESTALE);
index f139ef1..d2789de 100644 (file)
@@ -382,6 +382,7 @@ typedef struct ksock_peer
 typedef struct ksock_connreq
 {
         struct list_head    ksncr_list;         /* stash on ksnd_connd_connreqs */
+        ptl_ni_t           *ksncr_ni;           /* chosen NI */
         struct socket      *ksncr_sock;         /* accepted socket */
 } ksock_connreq_t;
 
@@ -501,7 +502,7 @@ ptl_err_t ksocknal_accept(ptl_ni_t *ni, struct socket *sock);
 extern int ksocknal_add_peer(ptl_ni_t *ni, ptl_nid_t nid, __u32 ip, int port);
 extern ksock_peer_t *ksocknal_find_peer_locked (ptl_ni_t *ni, ptl_nid_t nid);
 extern ksock_peer_t *ksocknal_find_peer (ptl_ni_t *ni, ptl_nid_t nid);
-extern int ksocknal_create_conn (ksock_route_t *route,
+extern int ksocknal_create_conn (ptl_ni_t *ni, ksock_route_t *route,
                                  struct socket *sock, int type);
 extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
 extern void ksocknal_terminate_conn (ksock_conn_t *conn);
@@ -521,10 +522,11 @@ extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
 extern int ksocknal_scheduler (void *arg);
 extern int ksocknal_connd (void *arg);
 extern int ksocknal_reaper (void *arg);
-extern int ksocknal_send_hello (ksock_conn_t *conn, ptl_nid_t nid,
-                                __u64 incarnation, __u32 *ipaddrs, int nipaddrs);
-extern int ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, 
-                                __u64 *incarnation, __u32 *ipaddrs);
+extern int ksocknal_send_hello (ptl_ni_t *ni, ksock_conn_t *conn,
+                                __u32 *ipaddrs, int nipaddrs);
+extern int ksocknal_recv_hello (ptl_ni_t *ni, ksock_conn_t *conn, 
+                                ptl_nid_t *nid, __u64 *incarnation, 
+                                __u32 *ipaddrs);
 
 extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn);
 extern void ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn);
index f4dc566..3105543 100644 (file)
@@ -1258,11 +1258,11 @@ ksocknal_process_receive (ksock_conn_t *conn)
                 LASSERT (rc != -EAGAIN);
 
                 if (rc == 0)
-                        CWARN ("[%p] EOF from %s ip %d.%d.%d.%d:%d\n",
-                               conn, 
-                               libcfs_nid2str(conn->ksnc_peer->ksnp_nid),
-                               HIPQUAD(conn->ksnc_ipaddr),
-                               conn->ksnc_port);
+                        CDEBUG (D_NET, "[%p] EOF from %s ip %d.%d.%d.%d:%d\n",
+                                conn, 
+                                libcfs_nid2str(conn->ksnc_peer->ksnp_nid),
+                                HIPQUAD(conn->ksnc_ipaddr),
+                                conn->ksnc_port);
                 else if (!conn->ksnc_closing)
                         CERROR ("[%p] Error %d on read from %s"
                                 " ip %d.%d.%d.%d:%d\n",
@@ -1658,11 +1658,11 @@ void ksocknal_write_callback (ksock_conn_t *conn)
 }
 
 int
-ksocknal_send_hello (ksock_conn_t *conn, 
-                     ptl_nid_t srcnid, __u64 incarnation,
+ksocknal_send_hello (ptl_ni_t *ni, ksock_conn_t *conn, 
                      __u32 *ipaddrs, int nipaddrs)
 {
         /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+        ksock_net_t        *net = (ksock_net_t *)ni->ni_data;
         struct socket      *sock = conn->ksnc_sock;
         ptl_hdr_t           hdr;
         ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
@@ -1680,12 +1680,12 @@ ksocknal_send_hello (ksock_conn_t *conn,
         hmv->version_major = cpu_to_le16 (PTL_PROTO_TCP_VERSION_MAJOR);
         hmv->version_minor = cpu_to_le16 (PTL_PROTO_TCP_VERSION_MINOR);
 
-        hdr.src_nid        = cpu_to_le64 (srcnid);
+        hdr.src_nid        = cpu_to_le64 (ni->ni_nid);
         hdr.type           = cpu_to_le32 (PTL_MSG_HELLO);
         hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs));
 
         hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type);
-        hdr.msg.hello.incarnation = cpu_to_le64 (incarnation);
+        hdr.msg.hello.incarnation = cpu_to_le64 (net->ksnn_incarnation);
 
         for (i = 0; i < nipaddrs; i++) {
                 ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]);
@@ -1729,8 +1729,8 @@ ksocknal_invert_type(int type)
 }
 
 int
-ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
-                     __u64 *incarnation, __u32 *ipaddrs)
+ksocknal_recv_hello (ptl_ni_t *ni, ksock_conn_t *conn, 
+                     ptl_nid_t *nid, __u64 *incarnation, __u32 *ipaddrs)
 {
         struct socket      *sock = conn->ksnc_sock;
         int                 active;
@@ -1759,11 +1759,12 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         if (!active && 
             hmv->magic != le32_to_cpu (PTL_PROTO_TCP_MAGIC)) {
                 /* Is this a generic acceptor connection request? */
-                rc = ptl_accept(sock, hmv->magic, 0);
+                rc = ptl_accept(ni, sock, hmv->magic);
                 if (rc != PTL_OK)
                         return -EPROTO;
 
-                /* Yes it is! Start over again now I've skipping it. */
+                /* Yes it is! Start over again now that I've skipped the
+                 * generic request */
                 rc = libcfs_sock_read(sock, &hmv->magic, 
                                       sizeof (hmv->magic), timeout);
                 if (rc != 0) {
@@ -1906,7 +1907,7 @@ ksocknal_connect (ksock_route_t *route)
 {
         CFS_LIST_HEAD    (zombies);
         ksock_tx_t       *tx;
-        ksock_peer_t     *peer;
+        ksock_peer_t     *peer = route->ksnr_peer;
         unsigned long     flags;
         int               type;
         struct socket    *sock;
@@ -1933,15 +1934,15 @@ ksocknal_connect (ksock_route_t *route)
 
                 write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
 
-                rc = ptl_connect(&sock, route->ksnr_peer->ksnp_nid,
+                rc = ptl_connect(&sock, peer->ksnp_nid,
                                  route->ksnr_myipaddr, 
                                  route->ksnr_ipaddr, route->ksnr_port);
                 if (rc != PTL_OK)
                         goto failed;
 
-                rc = ksocknal_create_conn(route, sock, type);
+                rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
                 if (rc != 0) {
-                        ptl_connect_console_error(rc, route->ksnr_peer->ksnp_nid,
+                        ptl_connect_console_error(rc, peer->ksnp_nid,
                                                   route->ksnr_ipaddr, 
                                                   route->ksnr_port);
                         goto failed;
@@ -1958,7 +1959,6 @@ ksocknal_connect (ksock_route_t *route)
  failed:
         write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
 
-        peer = route->ksnr_peer;
         LASSERT (route->ksnr_connecting);
         route->ksnr_connecting = 0;
 
@@ -2043,7 +2043,9 @@ ksocknal_connd (void *arg)
                         spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, 
                                                flags);
                         
-                        ksocknal_create_conn(NULL, cr->ksncr_sock, SOCKNAL_CONN_NONE);
+                        ksocknal_create_conn(cr->ksncr_ni, NULL, 
+                                             cr->ksncr_sock, SOCKNAL_CONN_NONE);
+                        ptl_ni_decref(cr->ksncr_ni);
                         PORTAL_FREE(cr, sizeof(*cr));
                         
                         spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock,
index 02cfd34..bad8f5b 100644 (file)
@@ -357,7 +357,7 @@ libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
                         return rc;
 
                 if (rc == 0)
-                        return -ECONNABORTED;
+                        return -ECONNRESET;
 
                 buffer = ((char *)buffer) + rc;
                 nob -= rc;
index 726ce6c..b65ea5c 100644 (file)
@@ -76,7 +76,7 @@ ptl_connect_console_error (int rc, ptl_nid_t peer_nid,
         /* "normal" errors */
         case -ECONNREFUSED:
                 LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u "
-                               "on port %d was refused; "
+                               "on port %d was refused: "
                                "check that Lustre is running on that node.\n",
                                libcfs_nid2str(peer_nid),
                                HIPQUAD(peer_ip), peer_port);
@@ -84,28 +84,30 @@ ptl_connect_console_error (int rc, ptl_nid_t peer_nid,
         case -EHOSTUNREACH:
         case -ENETUNREACH:
                 LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u "
-                               "was unreachable; the network or that node may "
+                               "was unreachable: the network or that node may "
                                "be down, or Lustre may be misconfigured.\n",
                                libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
                 break;
         case -ETIMEDOUT:
                 LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u on "
-                               "port %d took too long; that node may be hung "
+                               "port %d took too long: that node may be hung "
                                "or experiencing high load.\n",
                                libcfs_nid2str(peer_nid),
                                HIPQUAD(peer_ip), peer_port);
                 break;
         case -ECONNRESET:
                 LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u on "
-                               "port %d was reset; "
-                               "Is it running a compatible version of Lustre?\n",
+                               "port %d was reset: "
+                               "is it running a compatible version of Lustre "
+                               "and is %s one of its NIDs?\n",
                                libcfs_nid2str(peer_nid),
-                               HIPQUAD(peer_ip), peer_port);
+                               HIPQUAD(peer_ip), peer_port,
+                               libcfs_nid2str(peer_nid));
                 break;
         case -EPROTO:
                 LCONSOLE_ERROR("Protocol error connecting to %s at host "
                                "%u.%u.%u.%u on port %d: "
-                               "Is it running a compatible version of Lustre?\n",
+                               "is it running a compatible version of Lustre?\n",
                                libcfs_nid2str(peer_nid),
                                HIPQUAD(peer_ip), peer_port);
                 break;
@@ -193,7 +195,7 @@ ptl_accept_magic(__u32 magic, __u32 constant)
 }
 
 ptl_err_t
-ptl_accept(struct socket *sock, __u32 magic, int choose_ni)
+ptl_accept(ptl_ni_t *blind_ni, struct socket *sock, __u32 magic)
 {
         ptl_acceptor_connreq_t  cr;
         __u32                   peer_ip;
@@ -201,41 +203,31 @@ ptl_accept(struct socket *sock, __u32 magic, int choose_ni)
         int                     rc;
         int                     flip;
         ptl_ni_t               *ni;
+        char                   *str;
 
         /* CAVEAT EMPTOR: I may be called by a NAL in any thread's context if I
          * passed the new socket "blindly" to the single NI that needed an
-         * acceptor.  If so, 'choose_ni' is FALSE... */
+         * acceptor.  If so, blind_ni != NULL... */
 
         LASSERT (sizeof(cr) <= 16);             /* not too big for the stack */
         
         rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
         LASSERT (rc == 0);                      /* we succeeded before */
 
-        if (ptl_accept_magic(magic, PTL_PROTO_TCP_MAGIC)) {
-                CERROR("Refusing connection from %u.%u.%u.%u: "
-                       " 'old' socknal/tcpnal acceptor protocol\n",
-                       HIPQUAD(peer_ip));
-                return PTL_FAIL;
-        }
-        
-        if (ptl_accept_magic(magic, PTL_PROTO_RA_MAGIC)) {
-                CERROR("Refusing connection from %u.%u.%u.%u: "
-                       " 'old' ranal acceptor protocol\n",
-                       HIPQUAD(peer_ip));
-                return PTL_FAIL;
-        }
-        
-        if (ptl_accept_magic(magic, PTL_PROTO_OPENIB_MAGIC)) {
-                CERROR("Refusing connection from %u.%u.%u.%u: "
-                       " 'old' openibnal acceptor protocol\n",
-                       HIPQUAD(peer_ip));
-                return PTL_FAIL;
-        }
-            
         if (!ptl_accept_magic(magic, PTL_PROTO_ACCEPTOR_MAGIC)) {
-                CERROR("Refusing connection from %u.%u.%u.%u: "
-                       " unrecognised magic %08x\n",
-                       HIPQUAD(peer_ip), magic);
+
+                if (magic == le32_to_cpu(PTL_PROTO_TCP_MAGIC))
+                        str = "'old' socknal/tcpnal";
+                else if (ptl_accept_magic(magic, PTL_PROTO_RA_MAGIC))
+                        str = "'old' ranal";
+                else if (ptl_accept_magic(magic, PTL_PROTO_OPENIB_MAGIC))
+                        str = "'old' openibnal";
+                else
+                        str = "unrecognised";
+            
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u magic %08x: "
+                               " %s acceptor protocol\n",
+                               HIPQUAD(peer_ip), magic, str);
                 return PTL_FAIL;
         }
 
@@ -260,45 +252,49 @@ ptl_accept(struct socket *sock, __u32 magic, int choose_ni)
         }
         
         if (cr.acr_version != PTL_PROTO_ACCEPTOR_VERSION) {
-                CERROR("Refusing connection from %u.%u.%u.%u: "
-                       " unrecognised protocol version %d\n",
-                       HIPQUAD(peer_ip), cr.acr_version);
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u: "
+                               " unrecognised protocol version %d\n",
+                               HIPQUAD(peer_ip), cr.acr_version);
                 return PTL_FAIL;
         }
 
-        if (!choose_ni) {
-                CDEBUG(D_WARNING, "Skipped %s from %u.%u.%u.%u\n", 
-                       libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
-                /* I got called just to skip the connection request */
-                return PTL_OK;
-        }
-
         ni = ptl_net2ni(PTL_NIDNET(cr.acr_nid));
         if (ni == NULL ||             /* no matching net */
             ni->ni_nid != cr.acr_nid) /* right NET, but wrong NID! */ {
                 if (ni != NULL)
                         ptl_ni_decref(ni);
-                CERROR("Refusing connection from %u.%u.%u.%u for %s: "
-                       " No matching NI\n",
-                       HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: "
+                               " No matching NI\n",
+                               HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
                 return PTL_FAIL;
         }
 
         if (ni->ni_nal->nal_accept == NULL) {
                 ptl_ni_decref(ni);
-                CERROR("Refusing connection from %u.%u.%u.%u for %s: "
-                       " NI doesn not accept IP connections\n",
-                       HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: "
+                               " NI doesn not accept IP connections\n",
+                               HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
                 return PTL_FAIL;
         }
                 
-        CDEBUG(D_WARNING, "Accept %s from %u.%u.%u.%u\n",
-               libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+        CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u%s\n",
+               libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip),
+               blind_ni == NULL ? "" : " (blind)");
 
-        rc = ni->ni_nal->nal_accept(ni, sock);
-        if (rc != PTL_OK)
-                CERROR("NI %s refused connection from %u.%u.%u.%u\n",
-                       libcfs_nid2str(ni->ni_nid), HIPQUAD(peer_ip));
+        if (blind_ni == NULL) {
+                rc = ni->ni_nal->nal_accept(ni, sock);
+                if (rc != PTL_OK)
+                        CERROR("NI %s refused connection from %u.%u.%u.%u\n",
+                               libcfs_nid2str(ni->ni_nid), HIPQUAD(peer_ip));
+        } else {
+                /* blind_ni is the only NI that needs me and it was given the
+                 * chance to handle this connection request itself in case it
+                 * was sent by an "old" socknal.  But this connection request
+                 * uses the new acceptor protocol and I'm just being called to
+                 * verify and skip it */
+                LASSERT (ni == blind_ni);
+                rc = PTL_OK;
+        }
 
         ptl_ni_decref(ni);
         return rc;
@@ -415,7 +411,7 @@ ptl_acceptor(void *arg)
                        goto failed;
                }
 
-                rc = ptl_accept(newsock, magic, 1);
+                rc = ptl_accept(NULL, newsock, magic);
                 if (rc != PTL_OK)
                         goto failed;
                 
index 53afa32..c51f882 100644 (file)
@@ -596,23 +596,28 @@ ptl_islocalnid (ptl_nid_t nid)
         struct list_head *tmp;
         ptl_ni_t         *ni;
         unsigned long     flags;
+        int               islocal = 0;
 
         PTL_LOCK(flags);
+
         list_for_each (tmp, &ptl_apini.apini_nis) {
                 ni = list_entry(tmp, ptl_ni_t, ni_list);
 
-                if (ni->ni_nid == nid)
-                        return 1;
+                if (ni->ni_nid == nid) {
+                        islocal = 1;
+                        break;
+                }
         }
         
         PTL_UNLOCK(flags);
-        return 0;
+        return islocal;
 }
 
 void
 ptl_shutdown_nalnis (void)
 {
         int                i;
+        int                islo;
         ptl_ni_t          *ni;
         unsigned long      flags;
 
@@ -638,6 +643,10 @@ ptl_shutdown_nalnis (void)
                 ptl_ni_decref_locked(ni); /* drop apini's ref (shutdown on last ref) */
         }
 
+        /* Drop the cached loopback NI. */
+        ptl_ni_decref_locked(ptl_loni);
+        ptl_loni = NULL;
+
         /* Now wait for the NI's I just nuked to show up on apini_zombie_nis
          * and shut them down in guaranteed thread context */
         i = 2;
@@ -660,13 +669,15 @@ ptl_shutdown_nalnis (void)
 
                 PTL_UNLOCK(flags);
 
+                islo = ni->ni_nal->nal_type == LONAL;
+
                 LASSERT (!in_interrupt());
                 (ni->ni_nal->nal_shutdown)(ni);
 
                 /* can't deref nal anymore now; it might have unregistered
                  * itself...  */
 
-                if (PTL_NETNAL(PTL_NIDNET(ni->ni_nid)) != LONAL)
+                if (!islo)
                         LCONSOLE(0, "Removed NI %s\n", 
                                  libcfs_nid2str(ni->ni_nid));
 
@@ -763,6 +774,9 @@ ptl_startup_nalnis (void)
                 PTL_UNLOCK(flags);
         }
 
+        ptl_loni = ptl_net2ni(PTL_MKNET(LONAL, 0));
+        LASSERT (ptl_loni != NULL);
+
         return PTL_OK;
         
  failed:
index 2ce8a18..b7a3072 100644 (file)
@@ -105,78 +105,47 @@ ptl_trimwhite(char *str)
 }
 
 int
-ptl_nis_conflict(ptl_ni_t *ni1, ptl_ni_t *ni2)
+ptl_net_unique(__u32 net, struct list_head *nilist)
 {
-        if (PTL_NETNAL(PTL_NIDNET(ni1->ni_nid)) != /* different NALs */
-            PTL_NETNAL(PTL_NIDNET(ni2->ni_nid)))
-                return 0;
-
-        if (ni1 != ni2 &&
-            PTL_NIDNET(ni1->ni_nid) == PTL_NIDNET(ni2->ni_nid)) {
-                CERROR("Duplicate network: %s\n",
-                       libcfs_net2str(PTL_NIDNET(ni1->ni_nid)));
-                return 1;
-        }
-
-        if (ni1->ni_interfaces[0] == NULL ||   
-            ni2->ni_interfaces[0] == NULL) {
-                /* one (or both) using all available interfaces */
-                if (ni1 != ni2) {
-                        CERROR("Interface conflict: %s, %s\n",
-                               libcfs_net2str(PTL_NIDNET(ni1->ni_nid)),
-                               libcfs_net2str(PTL_NIDNET(ni2->ni_nid)));
-                        return 1;
-                }
-                return 0;
-        }
-#if 0
-        /* leave this commented out so the same interface can be included explicitly in 2
-         * networks. */
-
-        for (i = 0; i < PTL_MAX_INTERFACES; i++) {
-                if (ni1->ni_interfaces[i] == NULL)
-                        break;
-
-                for (j = 0; j < PTL_MAX_INTERFACES; j++) {
-                        if (ni2->ni_interfaces[j] == NULL)
-                                break;
+        struct list_head *tmp;
+        ptl_ni_t         *ni;
 
-                        if (ni1 == ni2 && i == j)
-                                continue;
+        list_for_each (tmp, nilist) {
+                ni = list_entry(tmp, ptl_ni_t, ni_list);
 
-                        if (strcmp(ni1->ni_interfaces[i],
-                                   ni2->ni_interfaces[j]))
-                                continue;
-                        
-                        CERROR("Duplicate interface: %s(%s), %s(%s)\n",
-                               libcfs_net2str(PTL_NIDNET(ni1->ni_nid)),
-                               ni1->ni_interfaces[i],
-                               libcfs_net2str(PTL_NIDNET(ni2->ni_nid)),
-                               ni2->ni_interfaces[i]);
-                        return 1;
-                }
+                if (PTL_NIDNET(ni->ni_nid) == net)
+                        return 0;
         }
-#endif   
-        return 0;
+        
+        return 1;
 }
 
-ptl_err_t
-ptl_check_ni_conflicts(ptl_ni_t *ni, struct list_head *nilist)
+ptl_ni_t *
+ptl_new_ni(__u32 net, struct list_head *nilist)
 {
-        struct list_head *tmp;
-        ptl_ni_t         *ni2;
+        ptl_ni_t *ni;
 
-        /* Yes! ni _has_ just been added to this list. */
-        LASSERT (ni == list_entry(nilist->prev, ptl_ni_t, ni_list));
+        if (!ptl_net_unique(net, nilist)) {
+                LCONSOLE_ERROR("Duplicate network specified: %s\n",
+                               libcfs_net2str(net));
+                return NULL;
+        }
         
-        list_for_each (tmp, nilist) {
-                ni2 = list_entry(tmp, ptl_ni_t, ni_list);
-
-                if (ptl_nis_conflict(ni, ni2))
-                        return PTL_FAIL;
+        PORTAL_ALLOC(ni, sizeof(*ni));
+        if (ni == NULL) {
+                CERROR("Out of memory creating network %s\n",
+                       libcfs_net2str(net));
+                return NULL;
         }
         
-        return PTL_OK;
+        /* zero counters/flags, NULL pointers... */
+        memset(ni, 0, sizeof(*ni));
+
+        /* NAL will fill in the address part of the NID */
+        ni->ni_nid = PTL_MKNID(net, 0);
+
+        list_add_tail(&ni->ni_list, nilist);
+        return ni;
 }
 
 ptl_err_t
@@ -185,12 +154,12 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
        int       tokensize = strlen(networks) + 1;
         char     *tokens;
         char     *str;
-        ptl_ni_t *ni = NULL;
+        ptl_ni_t *ni;
         __u32     net;
 
        if (strlen(networks) > PTL_SINGLE_TEXTBUF_NOB) {
                /* _WAY_ conservative */
-               CERROR("Can't parse networks; string too long\n");
+               LCONSOLE_ERROR("Can't parse networks: string too long\n");
                return PTL_FAIL;
        }
 
@@ -204,17 +173,11 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
         ptl_apini.apini_network_tokens_nob = tokensize;
         memcpy (tokens, networks, tokensize);
        str = tokens;
-
-        PORTAL_ALLOC(ptl_loni, sizeof(*ptl_loni));
-        if (ptl_loni == NULL) {
-                CERROR("Can't allocate LO NI\n");
-                goto failed;
-        }
+        
         /* Add in the loopback network */
-        /* zero counters/flags, NULL pointers... */
-        memset(ptl_loni, 0, sizeof(*ptl_loni));
-        ptl_loni->ni_nid = PTL_MKNID(PTL_MKNET(LONAL, 0), 0);
-        list_add_tail(&ptl_loni->ni_list, nilist);
+        ni = ptl_new_ni(PTL_MKNET(LONAL, 0), nilist);
+        if (ni == NULL)
+                goto failed;
         
         while (str != NULL && *str != 0) {
                 char      *comma = strchr(str, ',');
@@ -222,17 +185,14 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
                 int        niface;
                char      *iface;
 
-                PORTAL_ALLOC(ni, sizeof(*ni));
-                if (ni == NULL) {
-                        CERROR ("ENOMEM parsing 'networks=\"%s\"'\n", networks);
-                        goto failed;
-                }
-                /* zero counters/flags, NULL pointers... */
-                memset(ni, 0, sizeof(*ni));
-                list_add_tail(&ni->ni_list, nilist);
-                
+                /* NB we don't check interface conflicts here; it's the NAL's
+                 * responsibility (if it cares at all) */
+
                 if (bracket == NULL ||
                    (comma != NULL && comma < bracket)) {
+
+                        /* no interface list specified */
+
                        if (comma != NULL)
                                *comma++ = 0;
                        net = libcfs_str2net(ptl_trimwhite(str));
@@ -243,8 +203,7 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
                                 goto failed;
                         }
 
-                        ni->ni_nid = PTL_MKNID(net, 0);
-                        if (ptl_check_ni_conflicts(ni, nilist) != PTL_OK)
+                        if (ptl_new_ni(net, nilist) == NULL)
                                 goto failed;
 
                        str = comma;
@@ -259,7 +218,9 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
                         goto failed;
                 } 
 
-                ni->ni_nid = PTL_MKNID(net, 0);
+                ni = ptl_new_ni(net, nilist);
+                if (ni == NULL)
+                        goto failed;
 
                 niface = 0;
                iface = bracket + 1;
@@ -285,8 +246,8 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
                         }
 
                         if (niface == PTL_MAX_INTERFACES) {
-                                LCONSOLE_ERROR("Too many interfaces for %s\n",
-                                               libcfs_net2str(PTL_NIDNET(ni->ni_nid)));
+                                LCONSOLE_ERROR("Too many interfaces for net %s\n",
+                                               libcfs_net2str(net));
                                 goto failed;
                         }
 
@@ -294,9 +255,6 @@ ptl_parse_networks(struct list_head *nilist, char *networks)
                        iface = comma;
                } while (iface != NULL);
 
-                if (ptl_check_ni_conflicts(ni, nilist) != PTL_OK)
-                        goto failed;
-                
                str = bracket + 1;
                comma = strchr(bracket + 1, ',');
                if (comma != NULL) {
index ba31c3f..74315ad 100644 (file)
@@ -579,9 +579,10 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg,
           ptl_hdr_t *hdr, int type, ptl_process_id_t target,
           ptl_libmd_t *md, ptl_size_t offset, ptl_size_t len)
 {
-        ptl_nid_t gw_nid;
-        int       routing = 0;
-        ptl_err_t rc;
+        unsigned long flags;
+        ptl_nid_t     gw_nid;
+        int           routing = 0;
+        ptl_err_t     rc;
 
         /* CAVEAT EMPTOR! ni != NULL == interface pre-determined (ACK) */
 
@@ -596,14 +597,25 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg,
                 return PTL_FAIL;
         }
 
-        if (PTL_NETNAL(PTL_NIDNET(ni->ni_nid)) != LONAL) {
+        if (ni->ni_nal->nal_type != LONAL) {
                 if (gw_nid != ni->ni_nid) {         /* it's not for me */
                         routing = gw_nid != target.nid; /* will gateway have to forward? */
                 } else if (allow_destination_aliases || /* force lonal? */
                            implicit_loopback) {
-                        ptl_ni_addref(ptl_loni);
-                        ptl_ni_decref(ni);
+
+                        PTL_LOCK(flags);
+                        ptl_ni_decref_locked(ni);
                         ni = ptl_loni;
+                        if (ni != NULL)
+                                ptl_ni_addref_locked(ni);
+                        PTL_UNLOCK(flags);
+                        
+                        if (ni == NULL)         /* shutdown in progress */
+                                return PTL_FAIL;
+
+                        if (implicit_loopback)
+                                target.nid = ni->ni_nid;
+
                 } else {                        /* barf */
                         ptl_ni_decref(ni);
                         CERROR("Attempt to send to self via %s, not LONAL\n",
@@ -1120,25 +1132,6 @@ ptl_parse(ptl_ni_t *ni, ptl_hdr_t *hdr, void *private)
         /* That's "OK I can parse it", not "OK I like it" :) */
 }
 
-ptl_ni_t *
-ptl_nid2ni (ptl_nid_t nid)
-{
-        struct list_head   *tmp;
-        ptl_ni_t           *ni;
-        
-        /* Called holding PTL_LOCK */
-
-        list_for_each (tmp, &ptl_apini.apini_nis) {
-                ni = list_entry(tmp, ptl_ni_t, ni_list);
-                
-                /* network type & number match in target NID and ni's NID */
-                if (((ni->ni_nid ^ nid)>>32) == 0)
-                        return ni;
-        }
-
-        return NULL;
-}
-
 ptl_err_t
 PtlPut(ptl_handle_md_t mdh, ptl_ack_req_t ack,
        ptl_process_id_t target, ptl_pt_index_t portal,
index 4727e54..a2d3f03 100644 (file)
@@ -255,7 +255,6 @@ void
 lonal_shutdown(ptl_ni_t *ni)
 {
        CDEBUG (D_NET, "shutdown\n");
-       LASSERT (ni == ptl_loni);
         LASSERT (lonal_instanced);
         
         lonal_instanced = 0;
@@ -265,7 +264,6 @@ ptl_err_t
 lonal_startup (ptl_ni_t *ni)
 {
        LASSERT (ni->ni_nal == &ptl_lonal);
-       LASSERT (ni == ptl_loni);
        LASSERT (!lonal_instanced);
         lonal_instanced = 1;