X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd.c;h=45058c5f351c107b1919784fead793b154c8a1dc;hb=96bbc5b674d8c766e375c4266a3a7859a47b172e;hp=764277014ed23ac71099a5af5a3dc03efb0d4ab5;hpb=fb1d86804d2e9f82045c5198b2a9850321c64fb9;p=fs%2Flustre-release.git diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 7642770..45058c5 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1,184 +1,66 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton + * GPL HEADER START * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
* - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton */ -#include "socknal.h" +#include "socklnd.h" -nal_t ksocknal_api; +lnd_t the_ksocklnd; ksock_nal_data_t ksocknal_data; -ptl_handle_ni_t ksocknal_ni; -ksock_tunables_t ksocknal_tunables; - -kpr_nal_interface_t ksocknal_router_interface = { - kprni_nalid: SOCKNAL, - kprni_arg: &ksocknal_data, - kprni_fwd: ksocknal_fwd_packet, - kprni_notify: ksocknal_notify, -}; - -#ifdef CONFIG_SYSCTL -#define SOCKNAL_SYSCTL 200 - -#define SOCKNAL_SYSCTL_TIMEOUT 1 -#define SOCKNAL_SYSCTL_EAGER_ACK 2 -#define SOCKNAL_SYSCTL_ZERO_COPY 3 -#define SOCKNAL_SYSCTL_TYPED 4 -#define SOCKNAL_SYSCTL_MIN_BULK 5 -#define SOCKNAL_SYSCTL_BUFFER_SIZE 6 -#define SOCKNAL_SYSCTL_NAGLE 7 -#define SOCKNAL_SYSCTL_IRQ_AFFINITY 8 -#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE 9 -#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10 -#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11 - -static ctl_table ksocknal_ctl_table[] = { - {SOCKNAL_SYSCTL_TIMEOUT, "timeout", - &ksocknal_tunables.ksnd_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", - 
&ksocknal_tunables.ksnd_eager_ack, sizeof (int), - 0644, NULL, &proc_dointvec}, -#if SOCKNAL_ZC - {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", - &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int), - 0644, NULL, &proc_dointvec}, -#endif - {SOCKNAL_SYSCTL_TYPED, "typed", - &ksocknal_tunables.ksnd_typed_conns, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", - &ksocknal_tunables.ksnd_min_bulk, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size", - &ksocknal_tunables.ksnd_buffer_size, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_NAGLE, "nagle", - &ksocknal_tunables.ksnd_nagle, sizeof(int), - 0644, NULL, &proc_dointvec}, -#if CPU_AFFINITY - {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity", - &ksocknal_tunables.ksnd_irq_affinity, sizeof(int), - 0644, NULL, &proc_dointvec}, -#endif - {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle", - &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count", - &ksocknal_tunables.ksnd_keepalive_count, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl", - &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table ksocknal_top_ctl_table[] = { - {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, - { 0 } -}; -#endif - -int -ksocknal_set_mynid(ptl_nid_t nid) -{ - lib_ni_t *ni = &ksocknal_lib.libnal_ni; - - /* FIXME: we have to do this because we call lib_init() at module - * insertion time, which is before we have 'mynid' available. lib_init - * sets the NAL's nid, which it uses to tell other nodes where packets - * are coming from. This is not a very graceful solution to this - * problem. 
*/ - - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - ni->ni_pid.nid = nid; - return (0); -} - -void -ksocknal_bind_irq (unsigned int irq) -{ -#if (defined(CONFIG_SMP) && CPU_AFFINITY) - int bind; - int cpu; - unsigned long flags; - char cmdline[64]; - ksock_irqinfo_t *info; - char *argv[] = {"/bin/sh", - "-c", - cmdline, - NULL}; - char *envp[] = {"HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL}; - - LASSERT (irq < NR_IRQS); - if (irq == 0) /* software NIC or affinity disabled */ - return; - - info = &ksocknal_data.ksnd_irqinfo[irq]; - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - LASSERT (info->ksni_valid); - bind = !info->ksni_bound; - info->ksni_bound = 1; - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - if (!bind) /* bound already */ - return; - - cpu = ksocknal_irqsched2cpu(info->ksni_sched); - snprintf (cmdline, sizeof (cmdline), - "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); - - printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); - - /* FIXME: Find a better method of setting IRQ affinity... 
- */ - - USERMODEHELPER(argv[0], argv, envp); -#endif -} ksock_interface_t * -ksocknal_ip2iface(__u32 ip) +ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip) { + ksock_net_t *net = ni->ni_data; int i; ksock_interface_t *iface; - for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { - LASSERT(i < SOCKNAL_MAX_INTERFACES); - iface = &ksocknal_data.ksnd_interfaces[i]; - + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + iface = &net->ksnn_interfaces[i]; + if (iface->ksni_ipaddr == ip) return (iface); } - + return (NULL); } @@ -187,16 +69,16 @@ ksocknal_create_route (__u32 ipaddr, int port) { ksock_route_t *route; - PORTAL_ALLOC (route, sizeof (*route)); + LIBCFS_ALLOC (route, sizeof (*route)); if (route == NULL) return (NULL); - atomic_set (&route->ksnr_refcount, 1); + cfs_atomic_set (&route->ksnr_refcount, 1); route->ksnr_peer = NULL; - route->ksnr_timeout = jiffies; - route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; + route->ksnr_scheduled = 0; route->ksnr_connecting = 0; route->ksnr_connected = 0; route->ksnr_deleted = 0; @@ -209,115 +91,127 @@ ksocknal_create_route (__u32 ipaddr, int port) void ksocknal_destroy_route (ksock_route_t *route) { - if (route->ksnr_peer != NULL) - ksocknal_put_peer (route->ksnr_peer); - - PORTAL_FREE (route, sizeof (*route)); -} - -void -ksocknal_put_route (ksock_route_t *route) -{ - CDEBUG (D_OTHER, "putting route[%p] (%d)\n", - route, atomic_read (&route->ksnr_refcount)); + LASSERT (cfs_atomic_read(&route->ksnr_refcount) == 0); - LASSERT (atomic_read (&route->ksnr_refcount) > 0); - if (!atomic_dec_and_test (&route->ksnr_refcount)) - return; + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); - ksocknal_destroy_route (route); + LIBCFS_FREE (route, sizeof (*route)); } -ksock_peer_t * -ksocknal_create_peer (ptl_nid_t nid) +int +ksocknal_create_peer 
(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) { - ksock_peer_t *peer; + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; - LASSERT (nid != PTL_NID_ANY); + LASSERT (id.nid != LNET_NID_ANY); + LASSERT (id.pid != LNET_PID_ANY); + LASSERT (!cfs_in_interrupt()); - PORTAL_ALLOC (peer, sizeof (*peer)); + LIBCFS_ALLOC (peer, sizeof (*peer)); if (peer == NULL) - return (NULL); + return -ENOMEM; memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */ - peer->ksnp_nid = nid; - atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_ni = ni; + peer->ksnp_id = id; + cfs_atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ peer->ksnp_closing = 0; - INIT_LIST_HEAD (&peer->ksnp_conns); - INIT_LIST_HEAD (&peer->ksnp_routes); - INIT_LIST_HEAD (&peer->ksnp_tx_queue); + peer->ksnp_accepting = 0; + peer->ksnp_proto = NULL; + peer->ksnp_last_alive = 0; + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; - atomic_inc (&ksocknal_data.ksnd_npeers); - return (peer); -} + CFS_INIT_LIST_HEAD (&peer->ksnp_conns); + CFS_INIT_LIST_HEAD (&peer->ksnp_routes); + CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue); + CFS_INIT_LIST_HEAD (&peer->ksnp_zc_req_list); + cfs_spin_lock_init(&peer->ksnp_lock); -void -ksocknal_destroy_peer (ksock_peer_t *peer) -{ - CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer); + cfs_spin_lock_bh (&net->ksnn_lock); - LASSERT (atomic_read (&peer->ksnp_refcount) == 0); - LASSERT (list_empty (&peer->ksnp_conns)); - LASSERT (list_empty (&peer->ksnp_routes)); - LASSERT (list_empty (&peer->ksnp_tx_queue)); + if (net->ksnn_shutdown) { + cfs_spin_unlock_bh (&net->ksnn_lock); - PORTAL_FREE (peer, sizeof (*peer)); + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; - /* NB a peer's connections and autoconnect routes keep a reference - * on their peer until they are destroyed, so we can be assured - * that _all_ state to do with 
this peer has been cleaned up when - * its refcount drops to zero. */ - atomic_dec (&ksocknal_data.ksnd_npeers); + cfs_spin_unlock_bh (&net->ksnn_lock); + + *peerp = peer; + return 0; } void -ksocknal_put_peer (ksock_peer_t *peer) +ksocknal_destroy_peer (ksock_peer_t *peer) { - CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", - peer, peer->ksnp_nid, - atomic_read (&peer->ksnp_refcount)); - - LASSERT (atomic_read (&peer->ksnp_refcount) > 0); - if (!atomic_dec_and_test (&peer->ksnp_refcount)) - return; - - ksocknal_destroy_peer (peer); + ksock_net_t *net = peer->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer %s %p deleted\n", + libcfs_id2str(peer->ksnp_id), peer); + + LASSERT (cfs_atomic_read (&peer->ksnp_refcount) == 0); + LASSERT (peer->ksnp_accepting == 0); + LASSERT (cfs_list_empty (&peer->ksnp_conns)); + LASSERT (cfs_list_empty (&peer->ksnp_routes)); + LASSERT (cfs_list_empty (&peer->ksnp_tx_queue)); + LASSERT (cfs_list_empty (&peer->ksnp_zc_req_list)); + + LIBCFS_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections and routes keep a reference on their peer + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer has been cleaned up when its refcount drops to + * zero. 
*/ + cfs_spin_lock_bh (&net->ksnn_lock); + net->ksnn_npeers--; + cfs_spin_unlock_bh (&net->ksnn_lock); } ksock_peer_t * -ksocknal_find_peer_locked (ptl_nid_t nid) +ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id) { - struct list_head *peer_list = ksocknal_nid2peerlist (nid); - struct list_head *tmp; + cfs_list_t *peer_list = ksocknal_nid2peerlist(id.nid); + cfs_list_t *tmp; ksock_peer_t *peer; - list_for_each (tmp, peer_list) { + cfs_list_for_each (tmp, peer_list) { - peer = list_entry (tmp, ksock_peer_t, ksnp_list); + peer = cfs_list_entry (tmp, ksock_peer_t, ksnp_list); LASSERT (!peer->ksnp_closing); - if (peer->ksnp_nid != nid) + if (peer->ksnp_ni != ni) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ksnp_refcount)); + if (peer->ksnp_id.nid != id.nid || + peer->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_id2str(id), + cfs_atomic_read(&peer->ksnp_refcount)); return (peer); } return (NULL); } ksock_peer_t * -ksocknal_get_peer (ptl_nid_t nid) +ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id) { ksock_peer_t *peer; - read_lock (&ksocknal_data.ksnd_global_lock); - peer = ksocknal_find_peer_locked (nid); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked (ni, id); if (peer != NULL) /* +1 ref for caller? 
*/ - atomic_inc (&peer->ksnp_refcount); - read_unlock (&ksocknal_data.ksnd_global_lock); + ksocknal_peer_addref(peer); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); return (peer); } @@ -327,49 +221,60 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer) { int i; __u32 ip; + ksock_interface_t *iface; for (i = 0; i < peer->ksnp_n_passive_ips; i++) { - LASSERT (i < SOCKNAL_MAX_INTERFACES); + LASSERT (i < LNET_MAX_INTERFACES); ip = peer->ksnp_passive_ips[i]; - ksocknal_ip2iface(ip)->ksni_npeers--; + iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + /* All IPs in peer->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. */ + LASSERT (iface != NULL); + + CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", + peer, iface, iface->ksni_nroutes); + iface->ksni_npeers--; } - LASSERT (list_empty(&peer->ksnp_conns)); - LASSERT (list_empty(&peer->ksnp_routes)); + LASSERT (cfs_list_empty(&peer->ksnp_conns)); + LASSERT (cfs_list_empty(&peer->ksnp_routes)); LASSERT (!peer->ksnp_closing); peer->ksnp_closing = 1; - list_del (&peer->ksnp_list); + cfs_list_del (&peer->ksnp_list); /* lose peerlist's ref */ - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); } int -ksocknal_get_peer_info (int index, ptl_nid_t *nid, - __u32 *myip, __u32 *peer_ip, int *port, - int *conn_count, int *share_count) +ksocknal_get_peer_info (lnet_ni_t *ni, int index, + lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) { ksock_peer_t *peer; - struct list_head *ptmp; + cfs_list_t *ptmp; ksock_route_t *route; - struct list_head *rtmp; + cfs_list_t *rtmp; int i; int j; int rc = -ENOENT; - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - - list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + cfs_list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = 
cfs_list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; if (peer->ksnp_n_passive_ips == 0 && - list_empty(&peer->ksnp_routes)) { + cfs_list_empty(&peer->ksnp_routes)) { if (index-- > 0) continue; - - *nid = peer->ksnp_nid; + + *id = peer->ksnp_id; *myip = 0; *peer_ip = 0; *port = 0; @@ -382,8 +287,8 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, for (j = 0; j < peer->ksnp_n_passive_ips; j++) { if (index-- > 0) continue; - - *nid = peer->ksnp_nid; + + *id = peer->ksnp_id; *myip = peer->ksnp_passive_ips[j]; *peer_ip = 0; *port = 0; @@ -392,15 +297,15 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, rc = 0; goto out; } - - list_for_each (rtmp, &peer->ksnp_routes) { + + cfs_list_for_each (rtmp, &peer->ksnp_routes) { if (index-- > 0) continue; - route = list_entry(rtmp, ksock_route_t, - ksnr_list); + route = cfs_list_entry(rtmp, ksock_route_t, + ksnr_list); - *nid = peer->ksnp_nid; + *id = peer->ksnp_id; *myip = route->ksnr_myipaddr; *peer_ip = route->ksnr_ipaddr; *port = route->ksnr_port; @@ -412,7 +317,7 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, } } out: - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); return (rc); } @@ -424,74 +329,75 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) ksock_interface_t *iface; conn->ksnc_route = route; - atomic_inc (&route->ksnr_refcount); + ksocknal_route_addref(route); if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { if (route->ksnr_myipaddr == 0) { /* route wasn't bound locally yet (the initial route) */ - CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n", - peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(conn->ksnc_myipaddr)); + CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(conn->ksnc_myipaddr)); } else { - CWARN("Rebinding "LPX64" %u.%u.%u.%u from " - "%u.%u.%u.%u to %u.%u.%u.%u\n", - 
peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(conn->ksnc_myipaddr)); - - iface = ksocknal_ip2iface(route->ksnr_myipaddr); - if (iface != NULL) + CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from " + "%u.%u.%u.%u to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(route->ksnr_myipaddr), + HIPQUAD(conn->ksnc_myipaddr)); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) iface->ksni_nroutes--; } route->ksnr_myipaddr = conn->ksnc_myipaddr; - iface = ksocknal_ip2iface(route->ksnr_myipaddr); - if (iface != NULL) + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) iface->ksni_nroutes++; } route->ksnr_connected |= (1<<type); - route->ksnr_connecting &= ~(1<<type); route->ksnr_conn_count++; /* Successful connection => further attempts can * proceed immediately */ - route->ksnr_timeout = jiffies; - route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + route->ksnr_retry_interval = 0; } void ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) { - struct list_head *tmp; + cfs_list_t *tmp; ksock_conn_t *conn; - int type; ksock_route_t *route2; + LASSERT (!peer->ksnp_closing); LASSERT (route->ksnr_peer == NULL); - LASSERT (route->ksnr_connecting == 0); + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); LASSERT (route->ksnr_connected == 0); /* LASSERT(unique) */ - list_for_each(tmp, &peer->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); + cfs_list_for_each(tmp, &peer->ksnp_routes) { + route2 = cfs_list_entry(tmp, ksock_route_t, ksnr_list); if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { - CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n", - peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr)); + CERROR ("Duplicate route %s %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr)); LBUG(); } } route->ksnr_peer = peer; - atomic_inc (&peer->ksnp_refcount); + 
ksocknal_peer_addref(peer); /* peer's routelist takes over my ref on 'route' */ - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); - - list_for_each(tmp, &peer->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - type = conn->ksnc_type; + cfs_list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + + cfs_list_for_each(tmp, &peer->ksnp_conns) { + conn = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); if (conn->ksnc_ipaddr != route->ksnr_ipaddr) continue; @@ -507,290 +413,254 @@ ksocknal_del_route_locked (ksock_route_t *route) ksock_peer_t *peer = route->ksnr_peer; ksock_interface_t *iface; ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; + cfs_list_t *ctmp; + cfs_list_t *cnxt; LASSERT (!route->ksnr_deleted); /* Close associated conns */ - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + cfs_list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = cfs_list_entry(ctmp, ksock_conn_t, ksnc_list); if (conn->ksnc_route != route) continue; - + ksocknal_close_conn_locked (conn, 0); } if (route->ksnr_myipaddr != 0) { - iface = ksocknal_ip2iface(route->ksnr_myipaddr); + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); if (iface != NULL) iface->ksni_nroutes--; } route->ksnr_deleted = 1; - list_del (&route->ksnr_list); - ksocknal_put_route (route); /* drop peer's ref */ + cfs_list_del (&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer's ref */ - if (list_empty (&peer->ksnp_routes) && - list_empty (&peer->ksnp_conns)) { - /* I've just removed the last autoconnect route of a peer - * with no active connections */ + if (cfs_list_empty (&peer->ksnp_routes) && + cfs_list_empty (&peer->ksnp_conns)) { + /* I've just removed the last route to a peer with no active + * connections */ ksocknal_unlink_peer_locked (peer); } } int -ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port) +ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, 
__u32 ipaddr, int port) { - unsigned long flags; - struct list_head *tmp; + cfs_list_t *tmp; ksock_peer_t *peer; ksock_peer_t *peer2; ksock_route_t *route; ksock_route_t *route2; - - if (nid == PTL_NID_ANY) + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) return (-EINVAL); /* Have a brand new peer ready... */ - peer = ksocknal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = ksocknal_create_peer(&peer, ni, id); + if (rc != 0) + return rc; route = ksocknal_create_route (ipaddr, port); if (route == NULL) { - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); return (-ENOMEM); } - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); - peer2 = ksocknal_find_peer_locked (nid); + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked (ni, id); if (peer2 != NULL) { - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); peer = peer2; } else { /* peer table takes my ref on peer */ - list_add_tail (&peer->ksnp_list, - ksocknal_nid2peerlist (nid)); + cfs_list_add_tail (&peer->ksnp_list, + ksocknal_nid2peerlist (id.nid)); } route2 = NULL; - list_for_each (tmp, &peer->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); - + cfs_list_for_each (tmp, &peer->ksnp_routes) { + route2 = cfs_list_entry(tmp, ksock_route_t, ksnr_list); + if (route2->ksnr_ipaddr == ipaddr) break; - + route2 = NULL; } if (route2 == NULL) { ksocknal_add_route_locked(peer, route); route->ksnr_share_count++; } else { - ksocknal_put_route(route); + ksocknal_route_decref(route); route2->ksnr_share_count++; } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (0); } void -ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share) +ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 
ip) { ksock_conn_t *conn; ksock_route_t *route; - struct list_head *tmp; - struct list_head *nxt; + cfs_list_t *tmp; + cfs_list_t *nxt; int nshared; LASSERT (!peer->ksnp_closing); - list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + /* Extra ref prevents peer disappearing until I'm done with it */ + ksocknal_peer_addref(peer); - if (single_share && route->ksnr_share_count == 0) - continue; + cfs_list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = cfs_list_entry(tmp, ksock_route_t, ksnr_list); /* no match */ if (!(ip == 0 || route->ksnr_ipaddr == ip)) continue; - if (!single_share) - route->ksnr_share_count = 0; - else if (route->ksnr_share_count > 0) - route->ksnr_share_count--; - - if (route->ksnr_share_count == 0) { - /* This deletes associated conns too */ - ksocknal_del_route_locked (route); - } - - if (single_share) - break; + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked (route); } nshared = 0; - list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + cfs_list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = cfs_list_entry(tmp, ksock_route_t, ksnr_list); nshared += route->ksnr_share_count; } - + if (nshared == 0) { /* remove everything else if there are no explicit entries * left */ - list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + cfs_list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = cfs_list_entry(tmp, ksock_route_t, ksnr_list); /* we should only be removing auto-entries */ LASSERT(route->ksnr_share_count == 0); ksocknal_del_route_locked (route); } - list_for_each_safe (tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + cfs_list_for_each_safe (tmp, nxt, &peer->ksnp_conns) { + conn = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); ksocknal_close_conn_locked(conn, 0); } } - + + 
ksocknal_peer_decref(peer); /* NB peer unlinks itself when last conn/route is removed */ } int -ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share) +ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip) { - unsigned long flags; - struct list_head *ptmp; - struct list_head *pnxt; + CFS_LIST_HEAD (zombies); + cfs_list_t *ptmp; + cfs_list_t *pnxt; ksock_peer_t *peer; int lo; int hi; int i; int rc = -ENOENT; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); - if (nid != PTL_NID_ANY) - lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); else { lo = 0; hi = ksocknal_data.ksnd_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + cfs_list_for_each_safe (ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer = cfs_list_entry (ptmp, ksock_peer_t, ksnp_list); - if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid)) + if (peer->ksnp_ni != ni) continue; - ksocknal_del_peer_locked (peer, ip, single_share); - rc = 0; /* matched! */ + if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) + continue; - if (single_share) - break; + ksocknal_peer_addref(peer); /* a ref for me... */ + + ksocknal_del_peer_locked (peer, ip); + + if (peer->ksnp_closing && + !cfs_list_empty(&peer->ksnp_tx_queue)) { + LASSERT (cfs_list_empty(&peer->ksnp_conns)); + LASSERT (cfs_list_empty(&peer->ksnp_routes)); + + cfs_list_splice_init(&peer->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer); /* ...till here */ + + rc = 0; /* matched! 
*/ } } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, 1); return (rc); } ksock_conn_t * -ksocknal_get_conn_by_idx (int index) +ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index) { ksock_peer_t *peer; - struct list_head *ptmp; + cfs_list_t *ptmp; ksock_conn_t *conn; - struct list_head *ctmp; + cfs_list_t *ctmp; int i; - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + cfs_list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = cfs_list_entry (ptmp, ksock_peer_t, ksnp_list); LASSERT (!peer->ksnp_closing); - list_for_each (ctmp, &peer->ksnp_conns) { + if (peer->ksnp_ni != ni) + continue; + + cfs_list_for_each (ctmp, &peer->ksnp_conns) { if (index-- > 0) continue; - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); - read_unlock (&ksocknal_data.ksnd_global_lock); + conn = cfs_list_entry (ctmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + cfs_read_unlock (&ksocknal_data. \ + ksnd_global_lock); return (conn); } } } - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); return (NULL); } -int -ksocknal_get_conn_addrs (ksock_conn_t *conn) -{ - struct sockaddr_in sin; - int len = sizeof (sin); - int rc; - - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 2); - /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ - LASSERT (!conn->ksnc_closing); - - if (rc != 0) { - CERROR ("Error %d getting sock peer IP\n", rc); - return rc; - } - - conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr); - conn->ksnc_port = ntohs (sin.sin_port); - - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 0); - if (rc != 0) { - CERROR ("Error %d getting sock local IP\n", rc); - return rc; - } - - conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr); - - return 0; -} - -unsigned int -ksocknal_sock_irq (struct socket *sock) -{ - int irq = 0; - struct dst_entry *dst; - - if (!ksocknal_tunables.ksnd_irq_affinity) - return 0; - - dst = sk_dst_get (sock->sk); - if (dst != NULL) { - if (dst->dev != NULL) { - irq = dst->dev->irq; - if (irq >= NR_IRQS) { - CERROR ("Unexpected IRQ %x\n", irq); - irq = 0; - } - } - dst_release (dst); - } - - return (irq); -} - ksock_sched_t * ksocknal_choose_scheduler_locked (unsigned int irq) { @@ -798,7 +668,7 @@ ksocknal_choose_scheduler_locked (unsigned int irq) ksock_irqinfo_t *info; int i; - LASSERT (irq < NR_IRQS); + LASSERT (irq < CFS_NR_IRQS); info = &ksocknal_data.ksnd_irqinfo[irq]; if (irq != 0 && /* hardware NIC */ @@ -816,32 +686,40 @@ ksocknal_choose_scheduler_locked (unsigned int irq) if (irq != 0) { /* Hardware NIC */ info->ksni_valid = 1; - info->ksni_sched = sched - ksocknal_data.ksnd_schedulers; + info->ksni_sched = (unsigned int)(sched - ksocknal_data.ksnd_schedulers); /* no overflow... 
*/ - LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers); + LASSERT (info->ksni_sched == (unsigned int)(sched - ksocknal_data.ksnd_schedulers)); } return (sched); } int -ksocknal_local_ipvec (__u32 *ipaddrs) +ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs) { + ksock_net_t *net = ni->ni_data; int i; int nip; - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); - nip = ksocknal_data.ksnd_ninterfaces; - for (i = 0; i < nip; i++) { - LASSERT (i < SOCKNAL_MAX_INTERFACES); + nip = net->ksnn_ninterfaces; + LASSERT (nip <= LNET_MAX_INTERFACES); + + /* Only offer interfaces for additional connections if I have + * more than one. */ + if (nip < 2) { + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); + return 0; + } - ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; LASSERT (ipaddrs[i] != 0); } - - read_unlock (&ksocknal_data.ksnd_global_lock); + + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); return (nip); } @@ -854,25 +732,25 @@ ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) int this_xor; int this_netmatch; int i; - + for (i = 0; i < nips; i++) { if (ips[i] == 0) continue; this_xor = (ips[i] ^ iface->ksni_ipaddr); this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 
1 : 0; - + if (!(best < 0 || best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && + (best_netmatch == this_netmatch && best_xor > this_xor))) continue; - + best = i; best_netmatch = this_netmatch; best_xor = this_xor; } - + LASSERT (best >= 0); return (best); } @@ -880,8 +758,8 @@ ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) int ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - unsigned long flags; + cfs_rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer->ksnp_ni->ni_data; ksock_interface_t *iface; ksock_interface_t *best_iface; int n_ips; @@ -901,27 +779,30 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) /* Also note that I'm not going to return more than n_peerips * interfaces, even if I have more myself */ - - write_lock_irqsave(global_lock, flags); - LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES); - LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); + cfs_write_lock_bh (global_lock); - n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces); + LASSERT (n_peerips <= LNET_MAX_INTERFACES); + LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + MIN(n_peerips, net->ksnn_ninterfaces); for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { /* ^ yes really... */ /* If we have any new interfaces, first tick off all the * peer IPs that match old interfaces, then choose new - * interfaces to match the remaining peer IPS. + * interfaces to match the remaining peer IPS. * We don't forget interfaces we've stopped using; we might * start using them again... */ - + if (i < peer->ksnp_n_passive_ips) { /* Old interface. 
*/ ip = peer->ksnp_passive_ips[i]; - best_iface = ksocknal_ip2iface(ip); + best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); /* peer passive ips are kept up to date */ LASSERT(best_iface != NULL); @@ -932,15 +813,15 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) best_iface = NULL; best_netmatch = 0; best_npeers = 0; - - for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { - iface = &ksocknal_data.ksnd_interfaces[j]; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; ip = iface->ksni_ipaddr; for (k = 0; k < peer->ksnp_n_passive_ips; k++) if (peer->ksnp_passive_ips[k] == ip) break; - + if (k < peer->ksnp_n_passive_ips) /* using it already */ continue; @@ -964,69 +845,82 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) peer->ksnp_passive_ips[i] = ip; peer->ksnp_n_passive_ips = i+1; } - + LASSERT (best_iface != NULL); /* mark the best matching peer IP used */ j = ksocknal_match_peerip(best_iface, peerips, n_peerips); peerips[j] = 0; } - + /* Overwrite input peer IP addresses */ memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); - - write_unlock_irqrestore(global_lock, flags); - + + cfs_write_unlock_bh (global_lock); + return (n_ips); } void -ksocknal_create_routes(ksock_peer_t *peer, int port, +ksocknal_create_routes(ksock_peer_t *peer, int port, __u32 *peer_ipaddrs, int npeer_ipaddrs) { - ksock_route_t *newroute = NULL; - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - unsigned long flags; - struct list_head *rtmp; - ksock_route_t *route; - ksock_interface_t *iface; - ksock_interface_t *best_iface; - int best_netmatch; - int this_netmatch; - int best_nroutes; - int i; - int j; + ksock_route_t *newroute = NULL; + cfs_rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + lnet_ni_t *ni = peer->ksnp_ni; + ksock_net_t *net = ni->ni_data; + cfs_list_t *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int 
best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; /* CAVEAT EMPTOR: We do all our interface matching with an * exclusive hold of global lock at IRQ priority. We're only * expecting to be dealing with small numbers of interfaces, so the * O(n**3)-ness here shouldn't matter */ - write_lock_irqsave(global_lock, flags); + cfs_write_lock_bh (global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + cfs_write_unlock_bh (global_lock); + return; + } + + LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES); - LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES); - for (i = 0; i < npeer_ipaddrs; i++) { if (newroute != NULL) { newroute->ksnr_ipaddr = peer_ipaddrs[i]; } else { - write_unlock_irqrestore(global_lock, flags); + cfs_write_unlock_bh (global_lock); newroute = ksocknal_create_route(peer_ipaddrs[i], port); if (newroute == NULL) return; - write_lock_irqsave(global_lock, flags); + cfs_write_lock_bh (global_lock); + } + + if (peer->ksnp_closing) { + /* peer got closed under me */ + break; } - + /* Already got a route? */ route = NULL; - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); + cfs_list_for_each(rtmp, &peer->ksnp_routes) { + route = cfs_list_entry(rtmp, ksock_route_t, ksnr_list); if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) break; - + route = NULL; } if (route != NULL) @@ -1036,15 +930,16 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, best_nroutes = 0; best_netmatch = 0; - LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); + LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); /* Select interface to connect from */ - for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { - iface = &ksocknal_data.ksnd_interfaces[j]; + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; /* Using this interface already? 
*/ - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); + cfs_list_for_each(rtmp, &peer->ksnp_routes) { + route = cfs_list_entry(rtmp, ksock_route_t, + ksnr_list); if (route->ksnr_myipaddr == iface->ksni_ipaddr) break; @@ -1054,21 +949,21 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, if (route != NULL) continue; - this_netmatch = (((iface->ksni_ipaddr ^ - newroute->ksnr_ipaddr) & + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & iface->ksni_netmask) == 0) ? 1 : 0; - + if (!(best_iface == NULL || best_netmatch < this_netmatch || (best_netmatch == this_netmatch && best_nroutes > iface->ksni_nroutes))) continue; - + best_iface = iface; best_netmatch = this_netmatch; best_nroutes = iface->ksni_nroutes; } - + if (best_iface == NULL) continue; @@ -1078,159 +973,272 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, ksocknal_add_route_locked(peer, newroute); newroute = NULL; } - - write_unlock_irqrestore(global_lock, flags); + + cfs_write_unlock_bh (global_lock); if (newroute != NULL) - ksocknal_put_route(newroute); + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept (lnet_ni_t *ni, cfs_socket_t *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from " + "%u.%u.%u.%u: memory exhausted\n", + HIPQUAD(peer_ip)); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock); + + cfs_list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + cfs_waitq_signal(&ksocknal_data.ksnd_connd_waitq); + + cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); + return 0; +} + +int +ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr) +{ + ksock_route_t *route; + + 
cfs_list_for_each_entry_typed (route, &peer->ksnp_routes, + ksock_route_t, ksnr_list) { + + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; } int -ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) +ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, + cfs_socket_t *sock, int type) { - int passive = (type == SOCKNAL_CONN_NONE); - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - __u32 ipaddrs[SOCKNAL_MAX_INTERFACES]; - int nipaddrs; - ptl_nid_t nid; - struct list_head *tmp; + cfs_rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + CFS_LIST_HEAD (zombies); + lnet_process_id_t peerid; + cfs_list_t *tmp; __u64 incarnation; - unsigned long flags; ksock_conn_t *conn; ksock_conn_t *conn2; ksock_peer_t *peer = NULL; ksock_peer_t *peer2; ksock_sched_t *sched; + ksock_hello_msg_t *hello; unsigned int irq; ksock_tx_t *tx; + ksock_tx_t *txtmp; int rc; + int active; + char *warn = NULL; - /* NB, sock has an associated file since (a) this connection might - * have been created in userland and (b) we need to refcount the - * socket so that we don't close it while I/O is being done on - * it, and sock->file has that pre-cooked... 
*/ - LASSERT (sock->file != NULL); - LASSERT (file_count(sock->file) > 0); - LASSERT (route == NULL || !passive); + active = (route != NULL); - rc = ksocknal_setup_sock (sock); - if (rc != 0) - return (rc); + LASSERT (active == (type != SOCKLND_CONN_NONE)); - irq = ksocknal_sock_irq (sock); + irq = ksocknal_lib_sock_irq (sock); - PORTAL_ALLOC(conn, sizeof(*conn)); - if (conn == NULL) - return (-ENOMEM); + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } memset (conn, 0, sizeof (*conn)); + conn->ksnc_peer = NULL; conn->ksnc_route = NULL; conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + cfs_atomic_set (&conn->ksnc_sock_refcount, 2); conn->ksnc_type = type; - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; - atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ + ksocknal_lib_save_callback(sock, conn); + cfs_atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ conn->ksnc_rx_ready = 0; conn->ksnc_rx_scheduled = 0; - ksocknal_new_packet (conn, 0); - INIT_LIST_HEAD (&conn->ksnc_tx_queue); + CFS_INIT_LIST_HEAD (&conn->ksnc_tx_queue); conn->ksnc_tx_ready = 0; conn->ksnc_tx_scheduled = 0; - atomic_set (&conn->ksnc_tx_nob, 0); + conn->ksnc_tx_carrier = NULL; + cfs_atomic_set (&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } /* stash conn's local and remote addrs */ - rc = ksocknal_get_conn_addrs (conn); + rc = ksocknal_lib_get_conn_addrs (conn); if (rc != 0) - goto failed_0; + goto failed_1; + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. 
+ * Passive connections use the listener timeout since the peer sends + * eagerly */ + + if (active) { + peer = route->ksnr_peer; + LASSERT(ni == peer->ksnp_ni); - if (!passive) { /* Active connection sends HELLO eagerly */ - rc = ksocknal_local_ipvec(ipaddrs); - if (rc < 0) - goto failed_0; - nipaddrs = rc; + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer->ksnp_id; + + cfs_write_lock_bh(global_lock); + conn->ksnc_proto = peer->ksnp_proto; + cfs_write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } - rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs); + rc = ksocknal_send_hello (ni, conn, peerid.nid, hello); if (rc != 0) - goto failed_0; + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer */ + conn->ksnc_proto = NULL; } - /* Find out/confirm peer's NID and connection type and get the - * vector of interfaces she's willing to let me connect to */ - nid = (route == NULL) ? 
PTL_NID_ANY : route->ksnr_peer->ksnp_nid; - rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs); + rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation); if (rc < 0) - goto failed_0; - nipaddrs = rc; - LASSERT (nid != PTL_NID_ANY); + goto failed_1; - if (route != NULL) { - peer = route->ksnr_peer; - atomic_inc(&peer->ksnp_refcount); + LASSERT (rc == 0 || active); + LASSERT (conn->ksnc_proto != NULL); + LASSERT (peerid.nid != LNET_NID_ANY); + + if (active) { + ksocknal_peer_addref(peer); + cfs_write_lock_bh (global_lock); } else { - peer = ksocknal_create_peer(nid); - if (peer == NULL) { - rc = -ENOMEM; - goto failed_0; - } + rc = ksocknal_create_peer(&peer, ni, peerid); + if (rc != 0) + goto failed_1; + + cfs_write_lock_bh (global_lock); - write_lock_irqsave(global_lock, flags); + /* called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); - peer2 = ksocknal_find_peer_locked(nid); + peer2 = ksocknal_find_peer_locked(ni, peerid); if (peer2 == NULL) { /* NB this puts an "empty" peer in the peer * table (which takes my ref) */ - list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(nid)); - } else { - ksocknal_put_peer(peer); + cfs_list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer); peer = peer2; } - /* +1 ref for me */ - atomic_inc(&peer->ksnp_refcount); - write_unlock_irqrestore(global_lock, flags); - } - - if (!passive) { - ksocknal_create_routes(peer, conn->ksnc_port, - ipaddrs, nipaddrs); - rc = 0; - } else { - rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs); - LASSERT (rc >= 0); - rc = ksocknal_send_hello (conn, ipaddrs, rc); + /* +1 ref for me */ + ksocknal_peer_addref(peer); + peer->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... 
*/ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } } - if (rc < 0) - goto failed_1; - - write_lock_irqsave (global_lock, flags); if (peer->ksnp_closing || - (route != NULL && route->ksnr_deleted)) { - /* route/peer got closed under me */ + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ rc = -ESTALE; + warn = "peer/route removed"; + goto failed_2; + } + + if (peer->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer + * wants a different protocol than the one I asked for. + */ + LASSERT (cfs_list_empty(&peer->ksnp_conns)); + + peer->ksnp_proto = conn->ksnc_proto; + peer->ksnp_incarnation = incarnation; + } + + if (peer->ksnp_proto != conn->ksnc_proto || + peer->ksnp_incarnation != incarnation) { + /* Peer rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer, 0, 0); + + peer->ksnp_proto = NULL; + rc = ESTALE; + warn = peer->ksnp_incarnation != incarnation ? 
+ "peer rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; goto failed_2; } - /* Refuse to duplicate an existing connection (both sides might - * autoconnect at once), unless this is a loopback connection */ + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + cfs_list_for_each(tmp, &peer->ksnp_conns) { + conn2 = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || conn2->ksnc_myipaddr != conn->ksnc_myipaddr || - conn2->ksnc_type != conn->ksnc_type || - conn2->ksnc_incarnation != incarnation) + conn2->ksnc_type != conn->ksnc_type) continue; - CWARN("Not creating duplicate connection to " - "%u.%u.%u.%u type %d\n", - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type); - rc = -EALREADY; + /* Reply on a passive connection attempt so the peer + * realises we're connected. */ + LASSERT (rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; goto failed_2; } } @@ -1238,10 +1246,10 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) /* If the connection created by this route didn't bind to the IP * address the route connected to, the connection/route matching * code below probably isn't going to work. 
*/ - if (route != NULL && + if (active && route->ksnr_ipaddr != conn->ksnc_ipaddr) { - CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n", - peer->ksnp_nid, + CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), HIPQUAD(route->ksnr_ipaddr), HIPQUAD(conn->ksnc_ipaddr)); } @@ -1250,94 +1258,162 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) * create an association. This allows incoming connections created * by routes in my peer to match my own route entries so I don't * continually create duplicate routes. */ - list_for_each (tmp, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + cfs_list_for_each (tmp, &peer->ksnp_routes) { + route = cfs_list_entry(tmp, ksock_route_t, ksnr_list); if (route->ksnr_ipaddr != conn->ksnc_ipaddr) continue; - + ksocknal_associate_route_conn_locked(route, conn); break; } - /* Give conn a ref on sock->file since we're going to return success */ - get_file(sock->file); - conn->ksnc_peer = peer; /* conn takes my ref on peer */ - conn->ksnc_incarnation = incarnation; - peer->ksnp_last_alive = jiffies; + peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_send_keepalive = 0; peer->ksnp_error = 0; sched = ksocknal_choose_scheduler_locked (irq); sched->kss_nconns++; conn->ksnc_scheduler = sched; + conn->ksnc_tx_last_post = cfs_time_current(); /* Set the deadline for the outgoing HELLO to drain */ - conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = jiffies + - ksocknal_tunables.ksnd_io_timeout * HZ; - mb(); /* order with adding to peer's conn list */ + conn->ksnc_tx_bufnob = libcfs_sock_wmem_queued(sock); + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + cfs_mb(); /* order with adding to peer's conn list */ - list_add (&conn->ksnc_list, &peer->ksnp_conns); - atomic_inc (&conn->ksnc_refcount); + cfs_list_add (&conn->ksnc_list, &peer->ksnp_conns); + ksocknal_conn_addref(conn); - /* NB my 
callbacks block while I hold ksnd_global_lock */ - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = ksocknal_data_ready; - sock->sk->sk_write_space = ksocknal_write_space; - - /* Take all the packets blocking for a connection. - * NB, it might be nicer to share these blocked packets among any - * other connections that are becoming established. */ - while (!list_empty (&peer->ksnp_tx_queue)) { - tx = list_entry (peer->ksnp_tx_queue.next, - ksock_tx_t, tx_list); - - list_del (&tx->tx_list); + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. */ + cfs_list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO) + continue; + + cfs_list_del (&tx->tx_list); ksocknal_queue_tx_locked (tx, conn); } - rc = ksocknal_close_stale_conns_locked(peer, incarnation); - if (rc != 0) - CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", - rc, conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr)); + cfs_write_unlock_bh (global_lock); - write_unlock_irqrestore (global_lock, flags); + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + ksocknal_lib_bind_irq (irq); - ksocknal_bind_irq (irq); + CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d" + " incarnation:"LPD64" sched[%d]/%d\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port, incarnation, + (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); - /* Call the callbacks right now to get things going. 
*/ - if (ksocknal_getconnsock(conn) == 0) { - ksocknal_data_ready (sock->sk, 0); - ksocknal_write_space (sock->sk); - ksocknal_putconnsock(conn); + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); } - CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d" - " incarnation:"LPX64" sched[%d]/%d\n", - nid, HIPQUAD(conn->ksnc_myipaddr), - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, - (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); - ksocknal_put_conn (conn); - return (0); + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); - failed_2: - if (!peer->ksnp_closing && - list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) - ksocknal_unlink_peer_locked(peer); - write_unlock_irqrestore(global_lock, flags); + cfs_write_lock_bh(global_lock); - failed_1: - ksocknal_put_peer (peer); + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); - failed_0: - PORTAL_FREE (conn, sizeof(*conn)); + if (!active) + peer->ksnp_accepting--; - LASSERT (rc != 0); - return (rc); + cfs_write_unlock_bh(global_lock); + + if (rc != 0) { + cfs_write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + cfs_write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. 
*/ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + + failed_2: + if (!peer->ksnp_closing && + cfs_list_empty (&peer->ksnp_conns) && + cfs_list_empty (&peer->ksnp_routes)) { + cfs_list_add(&zombies, &peer->ksnp_tx_queue); + cfs_list_del_init(&peer->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer); + } + + cfs_write_unlock_bh (global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + cfs_write_lock_bh(global_lock); + peer->ksnp_accepting--; + cfs_write_unlock_bh(global_lock); + } + + ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_peer_decref(peer); + + failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + LIBCFS_FREE (conn, sizeof(*conn)); + + failed_0: + libcfs_sock_release(sock); + return rc; } void @@ -1349,31 +1425,29 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) ksock_peer_t *peer = conn->ksnc_peer; ksock_route_t *route; ksock_conn_t *conn2; - struct list_head *tmp; + cfs_list_t *tmp; LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; - atomic_inc (&ksocknal_data.ksnd_nclosing_conns); - + /* ksnd_deathrow_conns takes over peer's ref */ - list_del (&conn->ksnc_list); + cfs_list_del (&conn->ksnc_list); route = conn->ksnc_route; if (route != NULL) { /* dissociate conn from route... 
*/ LASSERT (!route->ksnr_deleted); - LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0); LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); conn2 = NULL; - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); - + cfs_list_for_each(tmp, &peer->ksnp_conns) { + conn2 = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); + if (conn2->ksnc_route == route && conn2->ksnc_type == conn->ksnc_type) break; - + conn2 = NULL; } if (conn2 == NULL) @@ -1382,30 +1456,115 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) conn->ksnc_route = NULL; #if 0 /* irrelevent with only eager routes */ - list_del (&route->ksnr_list); /* make route least favourite */ - list_add_tail (&route->ksnr_list, &peer->ksnp_routes); + /* make route least favourite */ + cfs_list_del (&route->ksnr_list); + cfs_list_add_tail (&route->ksnr_list, &peer->ksnp_routes); #endif - ksocknal_put_route (route); /* drop conn's ref on route */ + ksocknal_route_decref(route); /* drop conn's ref on route */ } - if (list_empty (&peer->ksnp_conns)) { + if (cfs_list_empty (&peer->ksnp_conns)) { /* No more connections to this peer */ + if (!cfs_list_empty(&peer->ksnp_tx_queue)) { + ksock_tx_t *tx; + + LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be send to /dev/null by scheduler */ + cfs_list_for_each_entry(tx, &peer->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + cfs_spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + cfs_list_splice_init(&peer->ksnp_tx_queue, + &conn->ksnc_tx_queue); + cfs_spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + peer->ksnp_proto = NULL; /* renegotiate protocol version */ peer->ksnp_error = error; /* stash last conn close reason */ - if (list_empty (&peer->ksnp_routes)) { + if (cfs_list_empty (&peer->ksnp_routes)) { /* I've just closed last conn belonging to a - * non-autoconnecting peer */ + * peer with no routes to it */ 
ksocknal_unlink_peer_locked (peer); } } - spin_lock (&ksocknal_data.ksnd_reaper_lock); + cfs_spin_lock_bh (&ksocknal_data.ksnd_reaper_lock); + + cfs_list_add_tail (&conn->ksnc_list, + &ksocknal_data.ksnd_deathrow_conns); + cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq); + + cfs_spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed (ksock_peer_t *peer) +{ + int notify = 0; + cfs_time_t last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer is dead if it's to another kernel and + * there are no connections or connection attempts in existance. */ + + cfs_read_lock (&ksocknal_data.ksnd_global_lock); + + if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + cfs_list_empty(&peer->ksnp_conns) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + notify = 1; + last_alive = peer->ksnp_last_alive; + } + + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(ksock_conn_t *conn) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + CFS_LIST_HEAD (zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT (conn->ksnc_sock == NULL); + + cfs_spin_lock(&peer->ksnp_lock); + + cfs_list_for_each_entry_safe_typed(tx, tmp, &peer->ksnp_zc_req_list, + ksock_tx_t, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + if (tx->tx_resid == 0) + tx->tx_resid = -1; /* mark it as not-acked */ + cfs_list_del(&tx->tx_zc_list); + cfs_list_add(&tx->tx_zc_list, &zlist); + } + + cfs_spin_unlock(&peer->ksnp_lock); + + while (!cfs_list_empty(&zlist)) { + tx = cfs_list_entry(zlist.next, ksock_tx_t, tx_zc_list); - list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); - 
wake_up (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock (&ksocknal_data.ksnd_reaper_lock); + cfs_list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } } void @@ -1415,46 +1574,35 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * disengage the socket from its callbacks and close it. * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. */ - unsigned long flags; - ksock_peer_t *peer = conn->ksnc_peer; - ksock_sched_t *sched = conn->ksnc_scheduler; - struct timeval now; - time_t then = 0; - int notify = 0; + ksock_peer_t *peer = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; LASSERT(conn->ksnc_closing); /* wake up the scheduler to "send" all remaining packets to /dev/null */ - spin_lock_irqsave(&sched->kss_lock, flags); + cfs_spin_lock_bh (&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; if (!conn->ksnc_tx_scheduled && - !list_empty(&conn->ksnc_tx_queue)){ - list_add_tail (&conn->ksnc_tx_list, + !cfs_list_empty(&conn->ksnc_tx_queue)){ + cfs_list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); - /* a closing conn is always ready to tx */ - conn->ksnc_tx_ready = 1; conn->ksnc_tx_scheduled = 1; /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); - wake_up (&sched->kss_waitq); + cfs_waitq_signal (&sched->kss_waitq); } - spin_unlock_irqrestore (&sched->kss_lock, flags); + cfs_spin_unlock_bh (&sched->kss_lock); /* serialise with callbacks */ - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - /* Remove conn's network callbacks. - * NB I _have_ to restore the callback, rather than storing a noop, - * since the socket could survive past this module being unloaded!! 
*/ - conn->ksnc_sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space; + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); - /* A callback could be in progress already; they hold a read lock - * on ksnd_global_lock (to serialise with me) and NOOP if - * sk_user_data is NULL. */ - conn->ksnc_sock->sk->sk_user_data = NULL; + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); /* OK, so this conn may not be completely disengaged from its * scheduler yet, but it _has_ committed to terminate... */ @@ -1462,97 +1610,113 @@ ksocknal_terminate_conn (ksock_conn_t *conn) if (peer->ksnp_error != 0) { /* peer's last conn closed in error */ - LASSERT (list_empty (&peer->ksnp_conns)); - - /* convert peer's last-known-alive timestamp from jiffies */ - do_gettimeofday (&now); - then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ; - notify = 1; + LASSERT (cfs_list_empty (&peer->ksnp_conns)); + failed = 1; + peer->ksnp_error = 0; /* avoid multiple notifications */ } - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer); /* The socket is closed on the final put; either here, or in * ksocknal_{send,recv}msg(). Since we set up the linger2 option * when the connection was established, this will close the socket * immediately, aborting anything buffered in it. Any hung * zero-copy transmits will therefore complete in finite time. 
*/ - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); +} - if (notify) - kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, - 0, then); +void +ksocknal_queue_zombie_conn (ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT (cfs_atomic_read(&conn->ksnc_conn_refcount) == 0); + cfs_spin_lock_bh (&ksocknal_data.ksnd_reaper_lock); + + cfs_list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + cfs_waitq_signal(&ksocknal_data.ksnd_reaper_waitq); + + cfs_spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock); } void ksocknal_destroy_conn (ksock_conn_t *conn) { + cfs_time_t last_rcv; + /* Final coup-de-grace of the reaper */ CDEBUG (D_NET, "connection %p\n", conn); - LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (cfs_atomic_read (&conn->ksnc_conn_refcount) == 0); + LASSERT (cfs_atomic_read (&conn->ksnc_sock_refcount) == 0); + LASSERT (conn->ksnc_sock == NULL); LASSERT (conn->ksnc_route == NULL); LASSERT (!conn->ksnc_tx_scheduled); LASSERT (!conn->ksnc_rx_scheduled); - LASSERT (list_empty(&conn->ksnc_tx_queue)); + LASSERT (cfs_list_empty(&conn->ksnc_tx_queue)); /* complete current receive if any */ switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_BODY: - CERROR("Completing partial receive from "LPX64 - ", ip %d.%d.%d.%d:%d, with error\n", - conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL); + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d]" + ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, " + "last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize 
(conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); break; - case SOCKNAL_RX_BODY_FWD: - ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s" + ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_proto->pro_version); break; - case SOCKNAL_RX_HEADER: - case SOCKNAL_RX_SLOP: + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s" + ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_proto->pro_version); break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s" + ", ip %d.%d.%d.%d:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + break; default: LBUG (); break; } - ksocknal_put_peer (conn->ksnc_peer); + ksocknal_peer_decref(conn->ksnc_peer); - PORTAL_FREE (conn, sizeof (*conn)); - atomic_dec (&ksocknal_data.ksnd_nclosing_conns); -} - -void -ksocknal_put_conn (ksock_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", - conn, conn->ksnc_peer->ksnp_nid, - atomic_read (&conn->ksnc_refcount)); - - LASSERT (atomic_read (&conn->ksnc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ksnc_refcount)) - return; - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - - list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); - wake_up (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + LIBCFS_FREE (conn, sizeof (*conn)); } int ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why) { ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head 
*cnxt; + cfs_list_t *ctmp; + cfs_list_t *cnxt; int count = 0; - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + cfs_list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = cfs_list_entry (ctmp, ksock_conn_t, ksnc_list); if (ipaddr == 0 || conn->ksnc_ipaddr == ipaddr) { @@ -1565,157 +1729,140 @@ ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why) } int -ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation) -{ - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - - if (conn->ksnc_incarnation == incarnation) - continue; - - CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d " - "incarnation:"LPX64"("LPX64")\n", - peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port, - conn->ksnc_incarnation, incarnation); - - count++; - ksocknal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -int -ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) +ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) { ksock_peer_t *peer = conn->ksnc_peer; __u32 ipaddr = conn->ksnc_ipaddr; - unsigned long flags; int count; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); count = ksocknal_close_peer_conns_locked (peer, ipaddr, why); - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (count); } int -ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr) +ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr) { - unsigned long flags; ksock_peer_t *peer; - struct list_head *ptmp; - struct list_head *pnxt; + cfs_list_t *ptmp; + cfs_list_t *pnxt; int lo; int hi; int i; int count = 0; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + 
cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); - if (nid != PTL_NID_ANY) - lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); else { lo = 0; hi = ksocknal_data.ksnd_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { + cfs_list_for_each_safe (ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + peer = cfs_list_entry (ptmp, ksock_peer_t, ksnp_list); - if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid)) + if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) continue; count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0); } } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); /* wildcards always succeed */ - if (nid == PTL_NID_ANY || ipaddr == 0) + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) return (0); - + return (count == 0 ? -ENOENT : 0); } void -ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) +ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) { /* The router is telling me she's been notified of a change in * gateway state.... */ + lnet_process_id_t id = {0}; + + id.nid = gw_nid; + id.pid = LNET_PID_ANY; - CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); + CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); if (!alive) { /* If the gateway crashed, close all open connections... */ - ksocknal_close_matching_conns (gw_nid, 0); + ksocknal_close_matching_conns (id, 0); return; } - + /* ...otherwise do nothing. We can only establish new connections * if we have autroutes, and these connect on demand. 
*/ } -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -struct tcp_opt *sock2tcp_opt(struct sock *sk) -{ - return &(sk->tp_pinfo.af_tcp); -} -#else -struct tcp_opt *sock2tcp_opt(struct sock *sk) -{ - struct tcp_sock *s = (struct tcp_sock *)sk; - return &s->tcp; -} -#endif - void -ksocknal_push_conn (ksock_conn_t *conn) +ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { - struct sock *sk; - struct tcp_opt *tp; - int nonagle; - int val = 1; - int rc; - mm_segment_t oldmm; - - rc = ksocknal_getconnsock (conn); - if (rc != 0) /* being shut down */ - return; - - sk = conn->ksnc_sock->sk; - tp = sock2tcp_opt(sk); - - lock_sock (sk); - nonagle = tp->nonagle; - tp->nonagle = 1; - release_sock (sk); + int connect = 1; + cfs_time_t last_alive = 0; + ksock_peer_t *peer = NULL; + cfs_rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + + cfs_read_lock(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + cfs_list_t *tmp; + ksock_conn_t *conn; + int bufnob; + + cfs_list_for_each (tmp, &peer->ksnp_conns) { + conn = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); + bufnob = libcfs_sock_wmem_queued(conn->ksnc_sock); + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + } + } - oldmm = get_fs (); - set_fs (KERNEL_DS); + last_alive = peer->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer) == NULL) + connect = 0; + } - rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof (val)); - LASSERT (rc == 0); + cfs_read_unlock(glock); - set_fs (oldmm); + if (last_alive != 0) + *when = last_alive; - lock_sock (sk); - tp->nonagle = nonagle; - release_sock (sk); + if (!connect) + return; - ksocknal_putconnsock (conn); + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), 
lnet_acceptor_port()); + + cfs_write_lock_bh(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + ksocknal_launch_all_connections_locked(peer); + + cfs_write_unlock_bh(glock); + return; } void @@ -1723,76 +1870,75 @@ ksocknal_push_peer (ksock_peer_t *peer) { int index; int i; - struct list_head *tmp; + cfs_list_t *tmp; ksock_conn_t *conn; for (index = 0; ; index++) { - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); i = 0; conn = NULL; - list_for_each (tmp, &peer->ksnp_conns) { + cfs_list_for_each (tmp, &peer->ksnp_conns) { if (i++ == index) { - conn = list_entry (tmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); + conn = cfs_list_entry (tmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); break; } } - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); if (conn == NULL) break; - ksocknal_push_conn (conn); - ksocknal_put_conn (conn); + ksocknal_lib_push_conn (conn); + ksocknal_conn_decref(conn); } } int -ksocknal_push (ptl_nid_t nid) +ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id) { ksock_peer_t *peer; - struct list_head *tmp; + cfs_list_t *tmp; int index; int i; int j; int rc = -ENOENT; - if (nid != PTL_NID_ANY) { - peer = ksocknal_get_peer (nid); - - if (peer != NULL) { - rc = 0; - ksocknal_push_peer (peer); - ksocknal_put_peer (peer); - } - return (rc); - } - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { for (j = 0; ; j++) { - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); index = 0; peer = NULL; - list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + cfs_list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + peer = cfs_list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (!((id.nid == LNET_NID_ANY || + id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer->ksnp_id.pid))) { + peer = NULL; + continue; + } + if (index++ == j) { - peer 
= list_entry(tmp, ksock_peer_t, - ksnp_list); - atomic_inc (&peer->ksnp_refcount); + ksocknal_peer_addref(peer); break; } } - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); if (peer != NULL) { rc = 0; ksocknal_push_peer (peer); - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); } } @@ -1802,32 +1948,32 @@ ksocknal_push (ptl_nid_t nid) } int -ksocknal_add_interface(__u32 ipaddress, __u32 netmask) +ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask) { - unsigned long flags; + ksock_net_t *net = ni->ni_data; ksock_interface_t *iface; int rc; int i; int j; - struct list_head *ptmp; + cfs_list_t *ptmp; ksock_peer_t *peer; - struct list_head *rtmp; + cfs_list_t *rtmp; ksock_route_t *route; if (ipaddress == 0 || netmask == 0) return (-EINVAL); - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); - iface = ksocknal_ip2iface(ipaddress); + iface = ksocknal_ip2iface(ni, ipaddress); if (iface != NULL) { /* silently ignore dups */ rc = 0; - } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) { + } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { rc = -ENOSPC; } else { - iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++]; + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; iface->ksni_ipaddr = ipaddress; iface->ksni_netmask = netmask; @@ -1835,16 +1981,19 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask) iface->ksni_npeers = 0; for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + cfs_list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = cfs_list_entry(ptmp, ksock_peer_t, + ksnp_list); - for (j = 0; i < peer->ksnp_n_passive_ips; j++) + for (j = 0; j < peer->ksnp_n_passive_ips; j++) if (peer->ksnp_passive_ips[j] == ipaddress) iface->ksni_npeers++; - - 
list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); - + + cfs_list_for_each(rtmp, &peer->ksnp_routes) { + route = cfs_list_entry(rtmp, + ksock_route_t, + ksnr_list); + if (route->ksnr_myipaddr == ipaddress) iface->ksni_nroutes++; } @@ -1854,8 +2003,8 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask) rc = 0; /* NB only new connections will pay attention to the new interface! */ } - - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (rc); } @@ -1863,8 +2012,8 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask) void ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) { - struct list_head *tmp; - struct list_head *nxt; + cfs_list_t *tmp; + cfs_list_t *nxt; ksock_route_t *route; ksock_conn_t *conn; int i; @@ -1879,12 +2028,12 @@ ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) break; } - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry (tmp, ksock_route_t, ksnr_list); - + cfs_list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = cfs_list_entry (tmp, ksock_route_t, ksnr_list); + if (route->ksnr_myipaddr != ipaddr) continue; - + if (route->ksnr_share_count != 0) { /* Manually created; keep, but unbind */ route->ksnr_myipaddr = 0; @@ -1892,31 +2041,31 @@ ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) ksocknal_del_route_locked(route); } } - - list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - + + cfs_list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); + if (conn->ksnc_myipaddr == ipaddr) ksocknal_close_conn_locked (conn, 0); } } int -ksocknal_del_interface(__u32 ipaddress) +ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress) { + ksock_net_t *net = ni->ni_data; int rc = -ENOENT; - unsigned long flags; - struct list_head *tmp; - struct list_head 
*nxt; + cfs_list_t *tmp; + cfs_list_t *nxt; ksock_peer_t *peer; __u32 this_ip; int i; int j; - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); - for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { - this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; if (!(ipaddress == 0 || ipaddress == this_ip)) @@ -1924,325 +2073,267 @@ ksocknal_del_interface(__u32 ipaddress) rc = 0; - for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++) - ksocknal_data.ksnd_interfaces[j-1] = - ksocknal_data.ksnd_interfaces[j]; - - ksocknal_data.ksnd_ninterfaces--; + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { - list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) { - peer = list_entry(tmp, ksock_peer_t, ksnp_list); - + cfs_list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer = cfs_list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + ksocknal_peer_del_interface_locked(peer, this_ip); } } } - - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); - + + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); + return (rc); } int -ksocknal_cmd(struct portals_cfg *pcfg, void * private) +ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { + lnet_process_id_t id = {0}; + struct libcfs_ioctl_data *data = arg; int rc; - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_INTERFACE: { + switch(cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; ksock_interface_t *iface; - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); - if (pcfg->pcfg_count < 0 || - pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) { + if (data->ioc_count >= 
(__u32)net->ksnn_ninterfaces) { rc = -ENOENT; } else { rc = 0; - iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count]; + iface = &net->ksnn_interfaces[data->ioc_count]; - pcfg->pcfg_id = iface->ksni_ipaddr; - pcfg->pcfg_misc = iface->ksni_netmask; - pcfg->pcfg_fd = iface->ksni_npeers; - pcfg->pcfg_count = iface->ksni_nroutes; + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; } - - read_unlock (&ksocknal_data.ksnd_global_lock); - break; - } - case NAL_CMD_ADD_INTERFACE: { - rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */ - pcfg->pcfg_misc); /* net mask */ - break; - } - case NAL_CMD_DEL_INTERFACE: { - rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */ - break; - } - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - __u32 myip = 0; - __u32 ip = 0; - int port = 0; - int conn_count = 0; - int share_count = 0; - - rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid, - &myip, &ip, &port, - &conn_count, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = myip; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = port; - pcfg->pcfg_count = conn_count; - pcfg->pcfg_wait = share_count; - break; - } - case NAL_CMD_ADD_PEER: { - rc = ksocknal_add_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_misc); /* port */ - break; - } - case NAL_CMD_DEL_PEER: { - rc = ksocknal_del_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_flags); /* single_share? 
*/ - break; + + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); + return rc; } - case NAL_CMD_GET_CONN: { - ksock_conn_t *conn = ksocknal_get_conn_by_idx (pcfg->pcfg_count); - if (conn == NULL) - rc = -ENOENT; - else { - int txmem; - int rxmem; - int nagle; + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ - ksocknal_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + case IOC_LIBCFS_DEL_INTERFACE: + return ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ - rc = 0; - pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; - pcfg->pcfg_id = conn->ksnc_ipaddr; - pcfg->pcfg_misc = conn->ksnc_port; - pcfg->pcfg_fd = conn->ksnc_myipaddr; - pcfg->pcfg_flags = conn->ksnc_type; - pcfg->pcfg_gw_nal = conn->ksnc_scheduler - - ksocknal_data.ksnd_schedulers; - pcfg->pcfg_count = txmem; - pcfg->pcfg_size = rxmem; - pcfg->pcfg_wait = nagle; - ksocknal_put_conn (conn); - } - break; + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; } - case NAL_CMD_REGISTER_PEER_FD: { - struct socket *sock = sockfd_lookup (pcfg->pcfg_fd, &rc); - int type = pcfg->pcfg_misc; - if (sock == NULL) - break; + case IOC_LIBCFS_ADD_PEER: + id.nid = data->ioc_nid; + id.pid = LUSTRE_SRV_LNET_PID; + return ksocknal_add_peer (ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ - switch (type) { - case SOCKNAL_CONN_NONE: - case SOCKNAL_CONN_ANY: - case SOCKNAL_CONN_CONTROL: - case SOCKNAL_CONN_BULK_IN: - case SOCKNAL_CONN_BULK_OUT: - rc = ksocknal_create_conn(NULL, sock, type); - 
break; - default: - rc = -EINVAL; - break; - } - fput (sock->file); - break; - } - case NAL_CMD_CLOSE_CONNECTION: { - rc = ksocknal_close_matching_conns (pcfg->pcfg_nid, - pcfg->pcfg_id); - break; - } - case NAL_CMD_REGISTER_MYNID: { - rc = ksocknal_set_mynid (pcfg->pcfg_nid); - break; - } - case NAL_CMD_PUSH_CONNECTION: { - rc = ksocknal_push (pcfg->pcfg_nid); - break; - } - default: - rc = -EINVAL; - break; + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer (ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = (__u32)(conn->ksnc_scheduler - + ksocknal_data.ksnd_schedulers); + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; } - return rc; -} + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns (id, + data->ioc_u32[0]); -void -ksocknal_free_fmbs (ksock_fmb_pool_t *p) -{ - int npages = p->fmp_buff_pages; - ksock_fmb_t *fmb; - int i; + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; - LASSERT (list_empty(&p->fmp_blocked_conns)); - LASSERT (p->fmp_nactive_fmbs == 0); - - while (!list_empty(&p->fmp_idle_fmbs)) { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; - fmb = list_entry(p->fmp_idle_fmbs.next, - ksock_fmb_t, 
fmb_list); - - for (i = 0; i < npages; i++) - if (fmb->fmb_kiov[i].kiov_page != NULL) - __free_page(fmb->fmb_kiov[i].kiov_page); + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); - list_del(&fmb->fmb_list); - PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages])); + default: + return -EINVAL; } + /* not reached */ } void ksocknal_free_buffers (void) { - ksocknal_free_fmbs(&ksocknal_data.ksnd_small_fmp); - ksocknal_free_fmbs(&ksocknal_data.ksnd_large_fmp); - - LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_ltxs) == 0); + LASSERT (cfs_atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); if (ksocknal_data.ksnd_schedulers != NULL) - PORTAL_FREE (ksocknal_data.ksnd_schedulers, + LIBCFS_FREE (ksocknal_data.ksnd_schedulers, sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers); - PORTAL_FREE (ksocknal_data.ksnd_peers, - sizeof (struct list_head) * + LIBCFS_FREE (ksocknal_data.ksnd_peers, + sizeof (cfs_list_t) * ksocknal_data.ksnd_peer_hash_size); + + cfs_spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!cfs_list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + cfs_list_t zlist; + ksock_tx_t *tx; + + cfs_list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + cfs_list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + cfs_spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while(!cfs_list_empty(&zlist)) { + tx = cfs_list_entry(zlist.next, ksock_tx_t, tx_list); + cfs_list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + cfs_spin_unlock(&ksocknal_data.ksnd_tx_lock); + } } void -ksocknal_api_shutdown (nal_t *nal) +ksocknal_base_shutdown (void) { ksock_sched_t *sched; int i; - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &ksocknal_api); + cfs_atomic_read (&libcfs_kmemory)); + LASSERT (ksocknal_data.ksnd_nnets == 0); switch 
(ksocknal_data.ksnd_init) { default: LASSERT (0); case SOCKNAL_INIT_ALL: - libcfs_nal_cmd_unregister(SOCKNAL); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; - /* fall through */ - - case SOCKNAL_INIT_LIB: - /* No more calls to ksocknal_cmd() to create new - * autoroutes/connections since we're being unloaded. */ - - /* Delete all peers */ - ksocknal_del_peer(PTL_NID_ANY, 0, 0); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect\n", - atomic_read (&ksocknal_data.ksnd_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - - /* Tell lib we've stopped calling into her. */ - lib_fini(&ksocknal_lib); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; - /* fall through */ - case SOCKNAL_INIT_DATA: - LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0); LASSERT (ksocknal_data.ksnd_peers != NULL); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); + LASSERT (cfs_list_empty (&ksocknal_data.ksnd_peers[i])); } - LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); - LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); + LASSERT (cfs_list_empty (&ksocknal_data.ksnd_enomem_conns)); + LASSERT (cfs_list_empty (&ksocknal_data.ksnd_zombie_conns)); + LASSERT (cfs_list_empty (&ksocknal_data.ksnd_connd_connreqs)); + LASSERT (cfs_list_empty (&ksocknal_data.ksnd_connd_routes)); if (ksocknal_data.ksnd_schedulers != NULL) for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; - LASSERT (list_empty (&kss->kss_tx_conns)); - LASSERT (list_empty 
(&kss->kss_rx_conns)); + LASSERT (cfs_list_empty (&kss->kss_tx_conns)); + LASSERT (cfs_list_empty (&kss->kss_rx_conns)); + LASSERT (cfs_list_empty (&kss-> \ + kss_zombie_noop_txs)); LASSERT (kss->kss_nconns == 0); } - /* stop router calling me */ - kpr_shutdown (&ksocknal_data.ksnd_router); - /* flag threads to terminate; wake and wait for them to die */ ksocknal_data.ksnd_shuttingdown = 1; - wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); - wake_up_all (&ksocknal_data.ksnd_reaper_waitq); + cfs_waitq_broadcast (&ksocknal_data.ksnd_connd_waitq); + cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq); - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { - sched = &ksocknal_data.ksnd_schedulers[i]; - wake_up_all(&sched->kss_waitq); - } + if (ksocknal_data.ksnd_schedulers != NULL) + for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { + sched = &ksocknal_data.ksnd_schedulers[i]; + cfs_waitq_broadcast(&sched->kss_waitq); + } i = 4; - read_lock(&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); while (ksocknal_data.ksnd_nthreads != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ "waiting for %d threads to terminate\n", ksocknal_data.ksnd_nthreads); - read_unlock(&ksocknal_data.ksnd_global_lock); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - read_lock(&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_pause(cfs_time_seconds(1)); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); } - read_unlock(&ksocknal_data.ksnd_global_lock); - - kpr_deregister (&ksocknal_data.ksnd_router); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); ksocknal_free_buffers(); ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; - /* fall through */ - - case SOCKNAL_INIT_NOTHING: break; } CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); + cfs_atomic_read (&libcfs_kmemory)); - printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + PORTAL_MODULE_UNUSE; } - -void -ksocknal_init_incarnation (void) +__u64 +ksocknal_new_incarnation (void) { struct timeval tv; @@ -2250,209 +2341,354 @@ ksocknal_init_incarnation (void) * identifies this particular instance of the socknal. 
Hopefully * we won't be able to reboot more frequently than 1MHz for the * forseeable future :) */ - - do_gettimeofday(&tv); - - ksocknal_data.ksnd_incarnation = - (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + cfs_gettimeofday(&tv); + + return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; } int -ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +ksocknal_base_startup (void) { - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); int rc; int i; - int j; - - LASSERT (nal == &ksocknal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); - } LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT (ksocknal_data.ksnd_nnets == 0); memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ - ksocknal_init_incarnation(); - ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (ksocknal_data.ksnd_peers, - sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); + LIBCFS_ALLOC (ksocknal_data.ksnd_peers, + sizeof (cfs_list_t) * + ksocknal_data.ksnd_peer_hash_size); if (ksocknal_data.ksnd_peers == NULL) - return (-ENOMEM); + return -ENOMEM; for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) - INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); - rwlock_init(&ksocknal_data.ksnd_global_lock); + cfs_rwlock_init(&ksocknal_data.ksnd_global_lock); - spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); - INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); - ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES; + cfs_spin_lock_init (&ksocknal_data.ksnd_reaper_lock); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); + 
CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); + cfs_waitq_init(&ksocknal_data.ksnd_reaper_waitq); - spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); - INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); - ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES; + cfs_spin_lock_init (&ksocknal_data.ksnd_connd_lock); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes); + cfs_waitq_init(&ksocknal_data.ksnd_connd_waitq); - spin_lock_init (&ksocknal_data.ksnd_reaper_lock); - INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); - INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); - INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); - init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); - - spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock); - INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes); - init_waitqueue_head(&ksocknal_data.ksnd_autoconnectd_waitq); + cfs_spin_lock_init (&ksocknal_data.ksnd_tx_lock); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs); /* NB memset above zeros whole of ksocknal_data, including * ksocknal_data.ksnd_irqinfo[all].ksni_valid */ /* flag lists/ptrs/locks initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + PORTAL_MODULE_USE; ksocknal_data.ksnd_nschedulers = ksocknal_nsched(); - PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, + LIBCFS_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers); - if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_api_shutdown (nal); - return (-ENOMEM); - } + if (ksocknal_data.ksnd_schedulers == NULL) + goto failed; for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; - spin_lock_init (&kss->kss_lock); - INIT_LIST_HEAD (&kss->kss_rx_conns); - INIT_LIST_HEAD 
(&kss->kss_tx_conns); -#if SOCKNAL_ZC - INIT_LIST_HEAD (&kss->kss_zctxdone_list); -#endif - init_waitqueue_head (&kss->kss_waitq); + cfs_spin_lock_init (&kss->kss_lock); + CFS_INIT_LIST_HEAD (&kss->kss_rx_conns); + CFS_INIT_LIST_HEAD (&kss->kss_tx_conns); + CFS_INIT_LIST_HEAD (&kss->kss_zombie_noop_txs); + cfs_waitq_init (&kss->kss_waitq); } - /* NB we have to wait to be told our true NID... */ - process_id.pid = requested_pid; - process_id.nid = 0; - - rc = lib_init(&ksocknal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } - - ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { rc = ksocknal_thread_start (ksocknal_scheduler, &ksocknal_data.ksnd_schedulers[i]); if (rc != 0) { CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); - ksocknal_api_shutdown (nal); - return (rc); + goto failed; } } - for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) { - rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < 2) + *ksocknal_tunables.ksnd_nconnds = 2; + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + rc = ksocknal_thread_start (ksocknal_connd, + (void *)((ulong_ptr_t)i)); if (rc != 0) { - CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; } } rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); + goto failed; } - rc = kpr_register(&ksocknal_data.ksnd_router, - &ksocknal_router_interface); - if (rc != 0) { - CDEBUG(D_NET, "Can't initialise routing interface " - "(rc = %d): not routing\n", rc); + /* flag 
everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +void +ksocknal_debug_peerhash (lnet_ni_t *ni) +{ + ksock_peer_t *peer = NULL; + cfs_list_t *tmp; + int i; + + cfs_read_lock (&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + cfs_list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + peer = cfs_list_entry (tmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni == ni) break; + + peer = NULL; + } + } + + if (peer != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, " + "closing %d, accepting %d, err %d, zcookie "LPU64", " + "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id), + cfs_atomic_read(&peer->ksnp_refcount), + peer->ksnp_sharecount, peer->ksnp_closing, + peer->ksnp_accepting, peer->ksnp_error, + peer->ksnp_zc_next_cookie, + !cfs_list_empty(&peer->ksnp_tx_queue), + !cfs_list_empty(&peer->ksnp_zc_req_list)); + + cfs_list_for_each (tmp, &peer->ksnp_routes) { + route = cfs_list_entry(tmp, ksock_route_t, ksnr_list); + CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " + "del %d\n", cfs_atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + cfs_list_for_each (tmp, &peer->ksnp_conns) { + conn = cfs_list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", + cfs_atomic_read(&conn->ksnc_conn_refcount), + cfs_atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown (lnet_ni_t *ni) +{ + ksock_net_t *net = ni->ni_data; + int i; + lnet_process_id_t anyid = {0}; + + anyid.nid = LNET_NID_ANY; + anyid.pid = LNET_PID_ANY; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + 
cfs_spin_lock_bh (&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + cfs_spin_unlock_bh (&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer state to clean up */ + i = 2; + cfs_spin_lock_bh (&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + cfs_spin_unlock_bh (&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + cfs_pause(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + cfs_spin_lock_bh (&net->ksnn_lock); + } + cfs_spin_unlock_bh (&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + j++; + } + + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +int +ksocknal_startup (lnet_ni_t *ni) +{ + ksock_net_t *net; + int rc; + int i; + + LASSERT 
(ni->ni_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + memset(net, 0, sizeof(*net)); + cfs_spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ksocknal_new_incarnation(); + ni->ni_data = net; + ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; + ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; } else { - /* Only allocate forwarding buffers if there's a router */ - - for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + - SOCKNAL_LARGE_FWD_NMSGS); i++) { - ksock_fmb_t *fmb; - ksock_fmb_pool_t *pool; - - - if (i < SOCKNAL_SMALL_FWD_NMSGS) - pool = &ksocknal_data.ksnd_small_fmp; - else - pool = &ksocknal_data.ksnd_large_fmp; - - PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, - fmb_kiov[pool->fmp_buff_pages])); - if (fmb == NULL) { - ksocknal_api_shutdown(nal); - return (-ENOMEM); - } + for (i = 0; i < LNET_MAX_INTERFACES; i++) { + int up; - fmb->fmb_pool = pool; - - for (j = 0; j < pool->fmp_buff_pages; j++) { - fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); + if (ni->ni_interfaces[i] == NULL) + break; - if (fmb->fmb_kiov[j].kiov_page == NULL) { - ksocknal_api_shutdown (nal); - return (-ENOMEM); - } + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); - LASSERT(page_address(fmb->fmb_kiov[j].kiov_page) != NULL); + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; } - list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs); + if (!up) { + CERROR("Interface %s is down\n", + 
ni->ni_interfaces[i]); + goto fail_1; + } } + net->ksnn_ninterfaces = i; } - rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); - /* flag everything initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + ksocknal_data.ksnd_nnets++; - printk(KERN_INFO "Lustre: Routing socket NAL loaded " - "(Routing %s, initial mem %d, incarnation "LPX64")\n", - kpr_routing (&ksocknal_data.ksnd_router) ? - "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation); + return 0; - return (0); + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; } + void __exit ksocknal_module_fini (void) { -#ifdef CONFIG_SYSCTL - if (ksocknal_tunables.ksnd_sysctl != NULL) - unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl); -#endif - PtlNIFini(ksocknal_ni); - - ptl_unregister_nal(SOCKNAL); + lnet_unregister_lnd(&the_ksocklnd); + ksocknal_tunables_fini(); } int __init @@ -2460,72 +2696,32 @@ ksocknal_module_init (void) { int rc; - /* packet descriptor must fit in a router descriptor's scratchpad */ - LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); - /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof 
(int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int)); -#if CPU_AFFINITY - LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int)); -#endif -#if SOCKNAL_ZC - LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int)); -#endif /* check ksnr_connected/connecting field large enough */ - LASSERT(SOCKNAL_CONN_NTYPES <= 4); - - ksocknal_api.nal_ni_init = ksocknal_api_startup; - ksocknal_api.nal_ni_fini = ksocknal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; - ksocknal_tunables.ksnd_eager_ack = SOCKNAL_EAGER_ACK; - ksocknal_tunables.ksnd_typed_conns = SOCKNAL_TYPED_CONNS; - ksocknal_tunables.ksnd_min_bulk = SOCKNAL_MIN_BULK; - ksocknal_tunables.ksnd_buffer_size = SOCKNAL_BUFFER_SIZE; - ksocknal_tunables.ksnd_nagle = SOCKNAL_NAGLE; - ksocknal_tunables.ksnd_keepalive_idle = SOCKNAL_KEEPALIVE_IDLE; - ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT; - ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL; -#if CPU_AFFINITY - ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY; -#endif -#if SOCKNAL_ZC - ksocknal_tunables.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif + CLASSERT (SOCKLND_CONN_NTYPES <= 4); + CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; - rc = ptl_register_nal(SOCKNAL, &ksocknal_api); - if (rc != PTL_OK) { - CERROR("Can't register SOCKNAL: %d\n", rc); - return (-ENOMEM); /* or something... 
*/ - } + lnet_register_lnd(&the_ksocklnd); - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(SOCKNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - ksocknal_tunables.ksnd_sysctl = - register_sysctl_table (ksocknal_top_ctl_table, 0); -#endif - return (0); + return 0; } -MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0"); MODULE_LICENSE("GPL"); -module_init(ksocknal_module_init); -module_exit(ksocknal_module_fini); - +cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);