1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
5 * Author: Zach Brown <zab@zabbo.net>
6 * Author: Peter J. Braam <braam@clusterfs.com>
7 * Author: Phil Schwan <phil@clusterfs.com>
8 * Author: Eric Barton <eric@bartonsoftware.com>
10 * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
12 * Portals is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Portals is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Portals; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Global socknal state: the NAL-wide data blob, the network-interface
 * handle, and the module tunables. NOTE(review): source is elided here;
 * surrounding declarations are not visible. */
29 ksock_nal_data_t ksocknal_data;
30 ptl_handle_ni_t ksocknal_ni;
31 ksock_tunables_t ksocknal_tunables;
/* Interface table handed to the portals router (kpr): callback arg,
 * packet-forwarding hook and gateway-liveness notification hook.
 * Uses the old GCC "field:" designated-initializer syntax. */
33 kpr_nal_interface_t ksocknal_router_interface = {
35 kprni_arg: &ksocknal_data,
36 kprni_fwd: ksocknal_fwd_packet,
37 kprni_notify: ksocknal_notify,
/* Late-set this node's NID on the lib NI, because lib_init() ran at module
 * insertion before 'mynid' was known (see FIXME below).
 * NOTE(review): body elided in this view — the actual assignment to
 * ni->ni_pid.nid is not visible. */
41 ksocknal_set_mynid(ptl_nid_t nid)
43 lib_ni_t *ni = &ksocknal_lib.libnal_ni;
45 /* FIXME: we have to do this because we call lib_init() at module
46 * insertion time, which is before we have 'mynid' available. lib_init
47 * sets the NAL's nid, which it uses to tell other nodes where packets
48 * are coming from. This is not a very graceful solution to this
51 CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
/* Linear search of the local interface table for the interface bound to
 * IP address 'ip'. Presumably returns the matching ksock_interface_t*
 * or NULL — return statements are elided in this view. */
59 ksocknal_ip2iface(__u32 ip)
62 ksock_interface_t *iface;
64 for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) {
65 LASSERT(i < SOCKNAL_MAX_INTERFACES);
66 iface = &ksocknal_data.ksnd_interfaces[i];
68 if (iface->ksni_ipaddr == ip)
/* Allocate and initialise an autoconnect route to ipaddr:port.
 * Caller receives the single (refcount==1) reference; the route is not
 * yet attached to any peer (ksnr_peer == NULL). */
76 ksocknal_create_route (__u32 ipaddr, int port)
80 PORTAL_ALLOC (route, sizeof (*route));
/* Allocation-failure branch elided in this view. */
84 atomic_set (&route->ksnr_refcount, 1);
85 route->ksnr_peer = NULL;
/* timeout = "now", so a first connect attempt may proceed immediately */
86 route->ksnr_timeout = cfs_time_current();
87 route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
88 route->ksnr_ipaddr = ipaddr;
89 route->ksnr_port = port;
90 route->ksnr_connecting = 0;
91 route->ksnr_connected = 0;
92 route->ksnr_deleted = 0;
93 route->ksnr_conn_count = 0;
94 route->ksnr_share_count = 0;
/* Final teardown of a route once its refcount hits zero: drop the ref it
 * held on its peer (if attached) and free the memory. Called only from
 * ksocknal_put_route(). */
100 ksocknal_destroy_route (ksock_route_t *route)
102 if (route->ksnr_peer != NULL)
103 ksocknal_put_peer (route->ksnr_peer);
105 PORTAL_FREE (route, sizeof (*route));
/* Release one reference on 'route'; destroy it when the last ref drops.
 * The early-return on a nonzero remaining count is elided in this view. */
109 ksocknal_put_route (ksock_route_t *route)
111 CDEBUG (D_OTHER, "putting route[%p] (%d)\n",
112 route, atomic_read (&route->ksnr_refcount));
114 LASSERT (atomic_read (&route->ksnr_refcount) > 0);
115 if (!atomic_dec_and_test (&route->ksnr_refcount))
118 ksocknal_destroy_route (route);
/* Allocate and initialise a peer structure for 'nid'.
 * Caller gets the initial reference; conn/route/tx lists start empty.
 * Also bumps the global peer count (paired with ksocknal_destroy_peer). */
122 ksocknal_create_peer (ptl_nid_t nid)
126 LASSERT (nid != PTL_NID_ANY);
128 PORTAL_ALLOC (peer, sizeof (*peer));
/* Allocation-failure branch elided in this view. */
132 memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */
134 peer->ksnp_nid = nid;
135 atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */
136 peer->ksnp_closing = 0;
137 CFS_INIT_LIST_HEAD (&peer->ksnp_conns);
138 CFS_INIT_LIST_HEAD (&peer->ksnp_routes);
139 CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue);
141 atomic_inc (&ksocknal_data.ksnd_npeers);
/* Final teardown of a peer once its refcount hits zero. Asserts all
 * per-peer lists are already empty, frees the struct, and decrements the
 * global peer count (used at shutdown to wait for cleanup to finish). */
146 ksocknal_destroy_peer (ksock_peer_t *peer)
148 CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer);
150 LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
151 LASSERT (list_empty (&peer->ksnp_conns));
152 LASSERT (list_empty (&peer->ksnp_routes));
153 LASSERT (list_empty (&peer->ksnp_tx_queue));
155 PORTAL_FREE (peer, sizeof (*peer));
157 /* NB a peer's connections and autoconnect routes keep a reference
158 * on their peer until they are destroyed, so we can be assured
159 * that _all_ state to do with this peer has been cleaned up when
160 * its refcount drops to zero. */
161 atomic_dec (&ksocknal_data.ksnd_npeers);
/* Release one reference on 'peer'; destroy it when the last ref drops.
 * The early-return on a nonzero remaining count is elided in this view. */
165 ksocknal_put_peer (ksock_peer_t *peer)
167 CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
168 peer, peer->ksnp_nid,
169 atomic_read (&peer->ksnp_refcount));
171 LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
172 if (!atomic_dec_and_test (&peer->ksnp_refcount))
175 ksocknal_destroy_peer (peer);
/* Look up 'nid' in its peer hash chain; caller must hold ksnd_global_lock.
 * Does NOT take a reference (the *_locked convention) — presumably returns
 * the peer or NULL; return statements are elided in this view. */
179 ksocknal_find_peer_locked (ptl_nid_t nid)
181 struct list_head *peer_list = ksocknal_nid2peerlist (nid);
182 struct list_head *tmp;
185 list_for_each (tmp, peer_list) {
187 peer = list_entry (tmp, ksock_peer_t, ksnp_list);
/* closing peers are unlinked from the table, so can't appear here */
189 LASSERT (!peer->ksnp_closing);
191 if (peer->ksnp_nid != nid)
194 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
195 peer, nid, atomic_read (&peer->ksnp_refcount));
/* Locked wrapper around ksocknal_find_peer_locked(): takes the global
 * read lock, and adds a reference for the caller if the peer exists. */
202 ksocknal_get_peer (ptl_nid_t nid)
206 read_lock (&ksocknal_data.ksnd_global_lock);
207 peer = ksocknal_find_peer_locked (nid);
208 if (peer != NULL) /* +1 ref for caller? */
209 atomic_inc (&peer->ksnp_refcount);
210 read_unlock (&ksocknal_data.ksnd_global_lock);
/* Remove 'peer' from the peer table (caller holds the global lock
 * exclusively). Releases the per-interface passive-IP counts the peer
 * held, marks the peer closing, unhashes it, and drops the table's ref. */
216 ksocknal_unlink_peer_locked (ksock_peer_t *peer)
221 for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
222 LASSERT (i < SOCKNAL_MAX_INTERFACES);
223 ip = peer->ksnp_passive_ips[i];
/* peer passive IPs always reference live interfaces, so ip2iface
 * cannot return NULL here */
225 ksocknal_ip2iface(ip)->ksni_npeers--;
228 LASSERT (list_empty(&peer->ksnp_conns));
229 LASSERT (list_empty(&peer->ksnp_routes));
230 LASSERT (!peer->ksnp_closing);
231 peer->ksnp_closing = 1;
232 list_del (&peer->ksnp_list);
233 /* lose peerlist's ref */
234 ksocknal_put_peer (peer);
/* Ioctl helper: walk the whole peer table and report the 'index'-th entry
 * through the out parameters. Each peer contributes one entry per passive
 * IP and one per route (plus one bare entry if it has neither).
 * NOTE(review): the index-countdown and early-exit logic is elided here. */
238 ksocknal_get_peer_info (int index, ptl_nid_t *nid,
239 __u32 *myip, __u32 *peer_ip, int *port,
240 int *conn_count, int *share_count)
243 struct list_head *ptmp;
244 ksock_route_t *route;
245 struct list_head *rtmp;
250 read_lock (&ksocknal_data.ksnd_global_lock);
252 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
254 list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
255 peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
/* peer with no passive IPs and no routes: report NID only */
257 if (peer->ksnp_n_passive_ips == 0 &&
258 list_empty(&peer->ksnp_routes)) {
262 *nid = peer->ksnp_nid;
/* one entry per passive (accepted-connection) IP */
272 for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
276 *nid = peer->ksnp_nid;
277 *myip = peer->ksnp_passive_ips[j];
/* one entry per autoconnect route */
286 list_for_each (rtmp, &peer->ksnp_routes) {
290 route = list_entry(rtmp, ksock_route_t,
293 *nid = peer->ksnp_nid;
294 *myip = route->ksnr_myipaddr;
295 *peer_ip = route->ksnr_ipaddr;
296 *port = route->ksnr_port;
297 *conn_count = route->ksnr_conn_count;
298 *share_count = route->ksnr_share_count;
305 read_unlock (&ksocknal_data.ksnd_global_lock);
/* Tie a newly-established conn to its route (caller holds the global lock
 * exclusively). The conn takes a ref on the route; if the conn bound to a
 * different local IP than the route expected, rebind the route and fix up
 * the per-interface route counts. Finally mark this connection type
 * established and reset the reconnection backoff. */
310 ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
312 ksock_peer_t *peer = route->ksnr_peer;
313 int type = conn->ksnc_type;
314 ksock_interface_t *iface;
316 conn->ksnc_route = route;
317 atomic_inc (&route->ksnr_refcount);
319 if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
320 if (route->ksnr_myipaddr == 0) {
321 /* route wasn't bound locally yet (the initial route) */
322 CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n",
324 HIPQUAD(route->ksnr_ipaddr),
325 HIPQUAD(conn->ksnc_myipaddr));
327 CWARN("Rebinding "LPX64" %u.%u.%u.%u from "
328 "%u.%u.%u.%u to %u.%u.%u.%u\n",
330 HIPQUAD(route->ksnr_ipaddr),
331 HIPQUAD(route->ksnr_myipaddr),
332 HIPQUAD(conn->ksnc_myipaddr));
/* move the route's count from the old local interface... */
334 iface = ksocknal_ip2iface(route->ksnr_myipaddr);
336 iface->ksni_nroutes--;
/* ...to the interface the conn actually bound to */
338 route->ksnr_myipaddr = conn->ksnc_myipaddr;
339 iface = ksocknal_ip2iface(route->ksnr_myipaddr);
341 iface->ksni_nroutes++;
/* per-type bitmask bookkeeping: this type is now connected */
344 route->ksnr_connected |= (1<<type);
345 route->ksnr_connecting &= ~(1<<type);
346 route->ksnr_conn_count++;
348 /* Successful connection => further attempts can
349 * proceed immediately */
350 route->ksnr_timeout = cfs_time_current();
351 route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
/* Attach an unowned route to 'peer' (caller holds the global lock
 * exclusively). Asserts uniqueness by IP, transfers the caller's route
 * ref to the peer's route list, and associates any existing conns that
 * already match the route's IP. */
355 ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
357 struct list_head *tmp;
360 ksock_route_t *route2;
362 LASSERT (route->ksnr_peer == NULL);
363 LASSERT (route->ksnr_connecting == 0);
364 LASSERT (route->ksnr_connected == 0);
366 /* LASSERT(unique) */
367 list_for_each(tmp, &peer->ksnp_routes) {
368 route2 = list_entry(tmp, ksock_route_t, ksnr_list);
370 if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
371 CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n",
372 peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr));
377 route->ksnr_peer = peer;
378 atomic_inc (&peer->ksnp_refcount);
379 /* peer's routelist takes over my ref on 'route' */
380 list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
/* bind pre-existing conns with the same peer IP to this new route */
382 list_for_each(tmp, &peer->ksnp_conns) {
383 conn = list_entry(tmp, ksock_conn_t, ksnc_list);
384 type = conn->ksnc_type;
386 if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
389 ksocknal_associate_route_conn_locked(route, conn);
390 /* keep going (typed routes) */
/* Remove a route from its peer (caller holds the global lock exclusively).
 * Closes every conn the route established, releases the local-interface
 * route count, drops the peer-list ref on the route, and unlinks the peer
 * itself if this left it with no routes and no conns. */
395 ksocknal_del_route_locked (ksock_route_t *route)
397 ksock_peer_t *peer = route->ksnr_peer;
398 ksock_interface_t *iface;
400 struct list_head *ctmp;
401 struct list_head *cnxt;
403 LASSERT (!route->ksnr_deleted);
405 /* Close associated conns */
406 list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
407 conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
409 if (conn->ksnc_route != route)
412 ksocknal_close_conn_locked (conn, 0);
415 if (route->ksnr_myipaddr != 0) {
416 iface = ksocknal_ip2iface(route->ksnr_myipaddr);
418 iface->ksni_nroutes--;
421 route->ksnr_deleted = 1;
422 list_del (&route->ksnr_list);
423 ksocknal_put_route (route); /* drop peer's ref */
425 if (list_empty (&peer->ksnp_routes) &&
426 list_empty (&peer->ksnp_conns)) {
427 /* I've just removed the last autoconnect route of a peer
428 * with no active connections */
429 ksocknal_unlink_peer_locked (peer);
/* Ioctl entry: add (or share) an explicit route nid -> ipaddr:port.
 * Optimistically creates peer and route before taking the write lock,
 * then discards whichever already exists. An existing route for the same
 * IP just gets its share count bumped. */
434 ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port)
437 struct list_head *tmp;
440 ksock_route_t *route;
441 ksock_route_t *route2;
443 if (nid == PTL_NID_ANY)
446 /* Have a brand new peer ready... */
447 peer = ksocknal_create_peer (nid);
451 route = ksocknal_create_route (ipaddr, port);
/* route allocation failed: release the speculative peer.
 * NOTE(review): failure test elided in this view. */
453 ksocknal_put_peer (peer);
457 write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
459 peer2 = ksocknal_find_peer_locked (nid);
/* peer already existed: use it and drop my speculative one */
461 ksocknal_put_peer (peer);
464 /* peer table takes my ref on peer */
465 list_add_tail (&peer->ksnp_list,
466 ksocknal_nid2peerlist (nid));
/* scan for an existing route to the same IP */
470 list_for_each (tmp, &peer->ksnp_routes) {
471 route2 = list_entry(tmp, ksock_route_t, ksnr_list);
473 if (route2->ksnr_ipaddr == ipaddr)
478 if (route2 == NULL) {
479 ksocknal_add_route_locked(peer, route);
480 route->ksnr_share_count++;
482 ksocknal_put_route(route);
483 route2->ksnr_share_count++;
486 write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
/* Delete routes (and thereby conns) from 'peer' (caller holds the global
 * lock exclusively). ip==0 matches all routes; single_share decrements a
 * share count instead of force-deleting. If no shared routes remain,
 * sweep away the remaining auto-created routes and close all conns.
 * NB: peer unlinks itself when its last conn/route is removed. */
492 ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share)
495 ksock_route_t *route;
496 struct list_head *tmp;
497 struct list_head *nxt;
500 LASSERT (!peer->ksnp_closing);
502 /* Extra ref prevents peer disappearing until I'm done with it */
503 atomic_inc(&peer->ksnp_refcount);
505 list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
506 route = list_entry(tmp, ksock_route_t, ksnr_list);
/* skip auto-routes when only un-sharing */
508 if (single_share && route->ksnr_share_count == 0)
512 if (!(ip == 0 || route->ksnr_ipaddr == ip))
/* NOTE(review): the !single_share branch header is elided here */
516 route->ksnr_share_count = 0;
517 else if (route->ksnr_share_count > 0)
518 route->ksnr_share_count--;
520 if (route->ksnr_share_count == 0) {
521 /* This deletes associated conns too */
522 ksocknal_del_route_locked (route);
/* count shares still outstanding on the surviving routes */
530 list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
531 route = list_entry(tmp, ksock_route_t, ksnr_list);
532 nshared += route->ksnr_share_count;
536 /* remove everything else if there are no explicit entries
539 list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
540 route = list_entry(tmp, ksock_route_t, ksnr_list);
542 /* we should only be removing auto-entries */
543 LASSERT(route->ksnr_share_count == 0);
544 ksocknal_del_route_locked (route);
547 list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
548 conn = list_entry(tmp, ksock_conn_t, ksnc_list);
550 ksocknal_close_conn_locked(conn, 0);
/* drop the ref taken at entry */
554 ksocknal_put_peer(peer);
555 /* NB peer unlinks itself when last conn/route is removed */
/* Ioctl entry: delete routes for one NID, or for all peers when
 * nid == PTL_NID_ANY (then lo..hi spans the whole hash table).
 * Returns 0 if any peer matched, presumably -ENOENT otherwise
 * (the rc init/return is elided in this view). */
559 ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share)
562 struct list_head *ptmp;
563 struct list_head *pnxt;
570 write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
572 if (nid != PTL_NID_ANY)
573 lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers;
576 hi = ksocknal_data.ksnd_peer_hash_size - 1;
579 for (i = lo; i <= hi; i++) {
/* _safe: del_peer_locked may unlink 'peer' from this chain */
580 list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
581 peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
583 if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid))
586 ksocknal_del_peer_locked (peer, ip, single_share);
587 rc = 0; /* matched! */
594 write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
/* Ioctl helper: return the 'index'-th connection in the peer table, with
 * a reference for the caller (who must put it). The index countdown is
 * elided in this view; a miss falls through to the final unlock. */
600 ksocknal_get_conn_by_idx (int index)
603 struct list_head *ptmp;
605 struct list_head *ctmp;
608 read_lock (&ksocknal_data.ksnd_global_lock);
610 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
611 list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
612 peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
614 LASSERT (!peer->ksnp_closing);
616 list_for_each (ctmp, &peer->ksnp_conns) {
620 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
/* +1 ref for caller before dropping the lock */
621 atomic_inc (&conn->ksnc_refcount);
622 read_unlock (&ksocknal_data.ksnd_global_lock);
628 read_unlock (&ksocknal_data.ksnd_global_lock);
/* Pick the scheduler (CPU) for a new conn, caller holds the global lock.
 * A hardware NIC (irq != 0) sticks with the scheduler recorded for its
 * IRQ; otherwise choose the scheduler with the fewest conns, and record
 * that choice for the IRQ so later conns on the same NIC share it. */
633 ksocknal_choose_scheduler_locked (unsigned int irq)
635 ksock_sched_t *sched;
636 ksock_irqinfo_t *info;
639 LASSERT (irq < NR_IRQS);
640 info = &ksocknal_data.ksnd_irqinfo[irq];
642 if (irq != 0 && /* hardware NIC */
643 info->ksni_valid) { /* already set up */
644 return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]);
647 /* software NIC (irq == 0) || not associated with a scheduler yet.
648 * Choose the CPU with the fewest connections... */
649 sched = &ksocknal_data.ksnd_schedulers[0];
650 for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++)
651 if (sched->kss_nconns >
652 ksocknal_data.ksnd_schedulers[i].kss_nconns)
653 sched = &ksocknal_data.ksnd_schedulers[i];
655 if (irq != 0) { /* Hardware NIC */
656 info->ksni_valid = 1;
657 info->ksni_sched = sched - ksocknal_data.ksnd_schedulers;
/* NOTE(review): this assert's guarding context is elided here */
660 LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers);
/* Snapshot all local interface IP addresses into 'ipaddrs' under the
 * global read lock; presumably returns the count 'nip' (return elided). */
667 ksocknal_local_ipvec (__u32 *ipaddrs)
672 read_lock (&ksocknal_data.ksnd_global_lock);
674 nip = ksocknal_data.ksnd_ninterfaces;
675 for (i = 0; i < nip; i++) {
676 LASSERT (i < SOCKNAL_MAX_INTERFACES);
678 ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr;
679 LASSERT (ipaddrs[i] != 0);
682 read_unlock (&ksocknal_data.ksnd_global_lock);
/* Choose, among 'ips[0..nips)', the peer IP that best matches 'iface':
 * prefer same-subnet (netmask match), break ties by smallest XOR distance.
 * Presumably returns the winning index (return and 'best' bookkeeping
 * partially elided in this view). */
687 ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
689 int best_netmatch = 0;
696 for (i = 0; i < nips; i++) {
700 this_xor = (ips[i] ^ iface->ksni_ipaddr);
701 this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
/* keep the current best unless this candidate strictly improves it */
704 best_netmatch < this_netmatch ||
705 (best_netmatch == this_netmatch &&
706 best_xor > this_xor)))
710 best_netmatch = this_netmatch;
/* Negotiate which local interfaces to pair with the peer's advertised IPs.
 * Keeps previously-chosen passive IPs stable, then greedily assigns new
 * interfaces (best netmask match, then least-loaded) until n_ips are
 * chosen. Overwrites 'peerips' with the chosen set; result count is n_ips
 * (return elided). Runs under an exclusive global lock. */
719 ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
721 rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
723 ksock_interface_t *iface;
724 ksock_interface_t *best_iface;
735 /* CAVEAT EMPTOR: We do all our interface matching with an
736 * exclusive hold of global lock at IRQ priority. We're only
737 * expecting to be dealing with small numbers of interfaces, so the
738 * O(n**3)-ness shouldn't matter */
740 /* Also note that I'm not going to return more than n_peerips
741 * interfaces, even if I have more myself */
743 write_lock_irqsave(global_lock, flags);
745 LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES);
746 LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES);
748 n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces);
750 for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
751 /* ^ yes really... */
753 /* If we have any new interfaces, first tick off all the
754 * peer IPs that match old interfaces, then choose new
755 * interfaces to match the remaining peer IPS.
756 * We don't forget interfaces we've stopped using; we might
757 * start using them again... */
759 if (i < peer->ksnp_n_passive_ips) {
/* slot i already decided in an earlier negotiation: reuse it */
761 ip = peer->ksnp_passive_ips[i];
762 best_iface = ksocknal_ip2iface(ip);
764 /* peer passive ips are kept up to date */
765 LASSERT(best_iface != NULL);
767 /* choose a new interface */
768 LASSERT (i == peer->ksnp_n_passive_ips);
774 for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) {
775 iface = &ksocknal_data.ksnd_interfaces[j];
776 ip = iface->ksni_ipaddr;
778 for (k = 0; k < peer->ksnp_n_passive_ips; k++)
779 if (peer->ksnp_passive_ips[k] == ip)
782 if (k < peer->ksnp_n_passive_ips) /* using it already */
/* score this interface against the best remaining peer IP */
785 k = ksocknal_match_peerip(iface, peerips, n_peerips);
786 xor = (ip ^ peerips[k]);
787 this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
789 if (!(best_iface == NULL ||
790 best_netmatch < this_netmatch ||
791 (best_netmatch == this_netmatch &&
792 best_npeers > iface->ksni_npeers)))
796 best_netmatch = this_netmatch;
797 best_npeers = iface->ksni_npeers;
/* commit the winner as passive IP slot i */
800 best_iface->ksni_npeers++;
801 ip = best_iface->ksni_ipaddr;
802 peer->ksnp_passive_ips[i] = ip;
803 peer->ksnp_n_passive_ips = i+1;
806 LASSERT (best_iface != NULL);
808 /* mark the best matching peer IP used */
809 j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
813 /* Overwrite input peer IP addresses */
814 memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
816 write_unlock_irqrestore(global_lock, flags);
/* For each peer IP we don't already have a route to, create one and bind
 * it to the best local interface (best netmask match, then least-loaded),
 * skipping interfaces already used by another route to this peer. Holds
 * the global write lock, dropping it only to allocate each new route. */
822 ksocknal_create_routes(ksock_peer_t *peer, int port,
823 __u32 *peer_ipaddrs, int npeer_ipaddrs)
825 ksock_route_t *newroute = NULL;
826 rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
828 struct list_head *rtmp;
829 ksock_route_t *route;
830 ksock_interface_t *iface;
831 ksock_interface_t *best_iface;
838 /* CAVEAT EMPTOR: We do all our interface matching with an
839 * exclusive hold of global lock at IRQ priority. We're only
840 * expecting to be dealing with small numbers of interfaces, so the
841 * O(n**3)-ness here shouldn't matter */
843 write_lock_irqsave(global_lock, flags);
845 LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES);
847 for (i = 0; i < npeer_ipaddrs; i++) {
848 if (newroute != NULL) {
/* recycle the unused route from the previous iteration */
849 newroute->ksnr_ipaddr = peer_ipaddrs[i];
/* else: allocate outside the IRQ-disabled lock */
851 write_unlock_irqrestore(global_lock, flags);
853 newroute = ksocknal_create_route(peer_ipaddrs[i], port);
854 if (newroute == NULL)
857 write_lock_irqsave(global_lock, flags);
860 /* Already got a route? */
862 list_for_each(rtmp, &peer->ksnp_routes) {
863 route = list_entry(rtmp, ksock_route_t, ksnr_list);
865 if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
877 LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES);
879 /* Select interface to connect from */
880 for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) {
881 iface = &ksocknal_data.ksnd_interfaces[j];
883 /* Using this interface already? */
884 list_for_each(rtmp, &peer->ksnp_routes) {
885 route = list_entry(rtmp, ksock_route_t, ksnr_list);
887 if (route->ksnr_myipaddr == iface->ksni_ipaddr)
895 this_netmatch = (((iface->ksni_ipaddr ^
896 newroute->ksnr_ipaddr) &
897 iface->ksni_netmask) == 0) ? 1 : 0;
899 if (!(best_iface == NULL ||
900 best_netmatch < this_netmatch ||
901 (best_netmatch == this_netmatch &&
902 best_nroutes > iface->ksni_nroutes)))
906 best_netmatch = this_netmatch;
907 best_nroutes = iface->ksni_nroutes;
910 if (best_iface == NULL)
/* bind the route to the chosen interface and hand it to the peer */
913 newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
914 best_iface->ksni_nroutes++;
916 ksocknal_add_route_locked(peer, newroute);
920 write_unlock_irqrestore(global_lock, flags);
921 if (newroute != NULL)
922 ksocknal_put_route(newroute);
/* Establish a new connection on 'sock'. route==NULL means a passive
 * (accepted) connection; otherwise an active connect for that route.
 * Performs the HELLO handshake, finds/creates the peer, refuses
 * duplicate conns, picks a scheduler, adopts the peer's blocked tx
 * queue, and closes any stale conns from an older peer incarnation.
 * NOTE(review): this body is heavily elided (error-path unwinding and
 * several branch headers are missing from this view). */
926 ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
928 int passive = (type == SOCKNAL_CONN_NONE);
929 rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
930 __u32 ipaddrs[SOCKNAL_MAX_INTERFACES];
933 struct list_head *tmp;
938 ksock_peer_t *peer = NULL;
940 ksock_sched_t *sched;
945 /* NB, sock has an associated file since (a) this connection might
946 * have been created in userland and (b) we need to refcount the
947 * socket so that we don't close it while I/O is being done on
948 * it, and sock->file has that pre-cooked... */
949 LASSERT (KSN_SOCK2FILE(sock) != NULL);
950 LASSERT (cfs_file_count(KSN_SOCK2FILE(sock)) > 0);
951 LASSERT (route == NULL || !passive);
953 rc = ksocknal_lib_setup_sock (sock);
957 irq = ksocknal_lib_sock_irq (sock);
959 PORTAL_ALLOC(conn, sizeof(*conn));
963 memset (conn, 0, sizeof (*conn));
964 conn->ksnc_peer = NULL;
965 conn->ksnc_route = NULL;
966 conn->ksnc_sock = sock;
967 conn->ksnc_type = type;
968 ksocknal_lib_save_callback(sock, conn);
969 atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */
971 conn->ksnc_rx_ready = 0;
972 conn->ksnc_rx_scheduled = 0;
973 ksocknal_new_packet (conn, 0);
975 CFS_INIT_LIST_HEAD (&conn->ksnc_tx_queue);
976 conn->ksnc_tx_ready = 0;
977 conn->ksnc_tx_scheduled = 0;
978 atomic_set (&conn->ksnc_tx_nob, 0);
980 /* stash conn's local and remote addrs */
981 rc = ksocknal_lib_get_conn_addrs (conn);
986 /* Active connection sends HELLO eagerly */
987 rc = ksocknal_local_ipvec(ipaddrs);
992 rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs);
997 /* Find out/confirm peer's NID and connection type and get the
998 * vector of interfaces she's willing to let me connect to */
999 nid = (route == NULL) ? PTL_NID_ANY : route->ksnr_peer->ksnp_nid;
1000 rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs);
1004 LASSERT (nid != PTL_NID_ANY);
1006 if (route != NULL) {
/* active conn: peer already known via the route */
1007 peer = route->ksnr_peer;
1008 atomic_inc(&peer->ksnp_refcount);
/* passive conn: create-or-find the peer for the learned NID */
1010 peer = ksocknal_create_peer(nid);
1016 write_lock_irqsave(global_lock, flags);
1018 peer2 = ksocknal_find_peer_locked(nid);
1019 if (peer2 == NULL) {
1020 /* NB this puts an "empty" peer in the peer
1021 * table (which takes my ref) */
1022 list_add_tail(&peer->ksnp_list,
1023 ksocknal_nid2peerlist(nid));
/* lost the race: use peer2 and drop my speculative peer */
1025 ksocknal_put_peer(peer);
1029 atomic_inc(&peer->ksnp_refcount);
1031 write_unlock_irqrestore(global_lock, flags);
1035 ksocknal_create_routes(peer, conn->ksnc_port,
/* passive side: choose which local IPs to offer and re-send HELLO */
1039 rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs);
1041 rc = ksocknal_send_hello (conn, ipaddrs, rc);
1046 write_lock_irqsave (global_lock, flags);
1048 if (peer->ksnp_closing ||
1049 (route != NULL && route->ksnr_deleted)) {
1050 /* route/peer got closed under me */
1055 /* Refuse to duplicate an existing connection (both sides might
1056 * autoconnect at once), unless this is a loopback connection */
1057 if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
1058 list_for_each(tmp, &peer->ksnp_conns) {
1059 conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
1061 if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
1062 conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
1063 conn2->ksnc_type != conn->ksnc_type ||
1064 conn2->ksnc_incarnation != incarnation)
1067 CWARN("Not creating duplicate connection to "
1068 "%u.%u.%u.%u type %d\n",
1069 HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
1075 /* If the connection created by this route didn't bind to the IP
1076 * address the route connected to, the connection/route matching
1077 * code below probably isn't going to work. */
1078 if (route != NULL &&
1079 route->ksnr_ipaddr != conn->ksnc_ipaddr) {
1080 CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n",
1082 HIPQUAD(route->ksnr_ipaddr),
1083 HIPQUAD(conn->ksnc_ipaddr));
1086 /* Search for a route corresponding to the new connection and
1087 * create an association. This allows incoming connections created
1088 * by routes in my peer to match my own route entries so I don't
1089 * continually create duplicate routes. */
1090 list_for_each (tmp, &peer->ksnp_routes) {
1091 route = list_entry(tmp, ksock_route_t, ksnr_list);
1093 if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
1096 ksocknal_associate_route_conn_locked(route, conn);
1100 /* Give conn a ref on sock->file since we're going to return success */
1101 cfs_get_file(KSN_SOCK2FILE(sock));
1103 conn->ksnc_peer = peer; /* conn takes my ref on peer */
1104 conn->ksnc_incarnation = incarnation;
1105 peer->ksnp_last_alive = cfs_time_current();
1106 peer->ksnp_error = 0;
1108 sched = ksocknal_choose_scheduler_locked (irq);
1109 sched->kss_nconns++;
1110 conn->ksnc_scheduler = sched;
1112 /* Set the deadline for the outgoing HELLO to drain */
1113 conn->ksnc_tx_bufnob = SOCK_WMEM_QUEUED(sock);
1114 conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout);
1115 mb(); /* order with adding to peer's conn list */
1117 list_add (&conn->ksnc_list, &peer->ksnp_conns);
1118 atomic_inc (&conn->ksnc_refcount);
1120 /* NB my callbacks block while I hold ksnd_global_lock */
1121 ksocknal_lib_set_callback(sock, conn);
1123 /* Take all the packets blocking for a connection.
1124 * NB, it might be nicer to share these blocked packets among any
1125 * other connections that are becoming established. */
1126 while (!list_empty (&peer->ksnp_tx_queue)) {
1127 tx = list_entry (peer->ksnp_tx_queue.next,
1128 ksock_tx_t, tx_list);
1130 list_del (&tx->tx_list);
1131 ksocknal_queue_tx_locked (tx, conn);
1134 rc = ksocknal_close_stale_conns_locked(peer, incarnation);
1136 CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n",
1137 rc, conn->ksnc_peer->ksnp_nid,
1138 HIPQUAD(conn->ksnc_ipaddr));
1140 write_unlock_irqrestore (global_lock, flags);
1142 ksocknal_lib_bind_irq (irq);
1144 /* Call the callbacks right now to get things going. */
1145 if (ksocknal_getconnsock(conn) == 0) {
1146 ksocknal_lib_act_callback(sock, conn);
1147 ksocknal_putconnsock(conn);
1150 CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d"
1151 " incarnation:"LPX64" sched[%d]/%d\n",
1152 nid, HIPQUAD(conn->ksnc_myipaddr),
1153 HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
1154 (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
/* drop my initial ref; the peer's conn list holds its own */
1156 ksocknal_put_conn (conn);
/* error unwind (labels elided): undo the speculative peer entry... */
1160 if (!peer->ksnp_closing &&
1161 list_empty (&peer->ksnp_conns) &&
1162 list_empty (&peer->ksnp_routes))
1163 ksocknal_unlink_peer_locked(peer);
1164 write_unlock_irqrestore(global_lock, flags);
1167 ksocknal_put_peer (peer);
/* ...and free the never-installed conn */
1170 PORTAL_FREE (conn, sizeof(*conn));
/* Begin closing 'conn' with close reason 'error'. */
1177 ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
1179 /* This just does the immmediate housekeeping, and queues the
1180 * connection for the reaper to terminate.
1181 * Caller holds ksnd_global_lock exclusively in irq context */
1182 ksock_peer_t *peer = conn->ksnc_peer;
1183 ksock_route_t *route;
1184 ksock_conn_t *conn2;
1185 struct list_head *tmp;
1187 LASSERT (peer->ksnp_error == 0);
1188 LASSERT (!conn->ksnc_closing);
1189 conn->ksnc_closing = 1;
1190 atomic_inc (&ksocknal_data.ksnd_nclosing_conns);
1192 /* ksnd_deathrow_conns takes over peer's ref */
1193 list_del (&conn->ksnc_list);
1195 route = conn->ksnc_route;
1196 if (route != NULL) {
1197 /* dissociate conn from route... */
1198 LASSERT (!route->ksnr_deleted);
1199 LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0);
1200 LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
/* is another conn of the same type still using this route? */
1203 list_for_each(tmp, &peer->ksnp_conns) {
1204 conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
1206 if (conn2->ksnc_route == route &&
1207 conn2->ksnc_type == conn->ksnc_type)
/* no sibling of this type left: clear the connected bit.
 * NOTE(review): the guarding condition is elided here. */
1213 route->ksnr_connected &= ~(1 << conn->ksnc_type);
1215 conn->ksnc_route = NULL;
1217 #if 0 /* irrelevent with only eager routes */
1218 list_del (&route->ksnr_list); /* make route least favourite */
1219 list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
/* NOTE(review): the #endif closing this block is elided here */
1221 ksocknal_put_route (route); /* drop conn's ref on route */
1224 if (list_empty (&peer->ksnp_conns)) {
1225 /* No more connections to this peer */
1227 peer->ksnp_error = error; /* stash last conn close reason */
1229 if (list_empty (&peer->ksnp_routes)) {
1230 /* I've just closed last conn belonging to a
1231 * non-autoconnecting peer */
1232 ksocknal_unlink_peer_locked (peer);
/* hand the conn to the reaper thread for actual termination */
1236 spin_lock (&ksocknal_data.ksnd_reaper_lock);
1238 list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns);
1239 cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
1241 spin_unlock (&ksocknal_data.ksnd_reaper_lock);
/* Second phase of connection close, run in the reaper thread. */
1245 ksocknal_terminate_conn (ksock_conn_t *conn)
1247 /* This gets called by the reaper (guaranteed thread context) to
1248 * disengage the socket from its callbacks and close it.
1249 * ksnc_refcount will eventually hit zero, and then the reaper will
1251 unsigned long flags;
1252 ksock_peer_t *peer = conn->ksnc_peer;
1253 ksock_sched_t *sched = conn->ksnc_scheduler;
1258 LASSERT(conn->ksnc_closing);
1260 /* wake up the scheduler to "send" all remaining packets to /dev/null */
1261 spin_lock_irqsave(&sched->kss_lock, flags);
1263 if (!conn->ksnc_tx_scheduled &&
1264 !list_empty(&conn->ksnc_tx_queue)){
1265 list_add_tail (&conn->ksnc_tx_list,
1266 &sched->kss_tx_conns);
1267 /* a closing conn is always ready to tx */
1268 conn->ksnc_tx_ready = 1;
1269 conn->ksnc_tx_scheduled = 1;
1270 /* extra ref for scheduler */
1271 atomic_inc (&conn->ksnc_refcount);
1273 cfs_waitq_signal (&sched->kss_waitq);
1276 spin_unlock_irqrestore (&sched->kss_lock, flags);
1278 /* serialise with callbacks */
1279 write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
1281 ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
1283 /* OK, so this conn may not be completely disengaged from its
1284 * scheduler yet, but it _has_ committed to terminate... */
1285 conn->ksnc_scheduler->kss_nconns--;
1287 if (peer->ksnp_error != 0) {
1288 /* peer's last conn closed in error */
1289 LASSERT (list_empty (&peer->ksnp_conns));
1291 /* convert peer's last-known-alive timestamp from jiffies */
1292 do_gettimeofday (&now);
1293 then = now.tv_sec - cfs_duration_sec(cfs_time_sub(cfs_time_current(),
1294 peer->ksnp_last_alive));
1298 write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
1300 /* The socket is closed on the final put; either here, or in
1301 * ksocknal_{send,recv}msg(). Since we set up the linger2 option
1302 * when the connection was established, this will close the socket
1303 * immediately, aborting anything buffered in it. Any hung
1304 * zero-copy transmits will therefore complete in finite time. */
1305 ksocknal_putconnsock (conn);
/* tell the router this gateway went down (error path; guard elided) */
1308 kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid,
/* Final destruction of a conn (refcount zero, reaper context): abort any
 * partial receive, drop the conn's ref on its peer, free the struct, and
 * decrement the closing-conns count started in close_conn_locked(). */
1313 ksocknal_destroy_conn (ksock_conn_t *conn)
1315 /* Final coup-de-grace of the reaper */
1316 CDEBUG (D_NET, "connection %p\n", conn);
1318 LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
1319 LASSERT (conn->ksnc_route == NULL);
1320 LASSERT (!conn->ksnc_tx_scheduled);
1321 LASSERT (!conn->ksnc_rx_scheduled);
1322 LASSERT (list_empty(&conn->ksnc_tx_queue));
1324 /* complete current receive if any */
1325 switch (conn->ksnc_rx_state) {
1326 case SOCKNAL_RX_BODY:
1327 CERROR("Completing partial receive from "LPX64
1328 ", ip %d.%d.%d.%d:%d, with error\n",
1329 conn->ksnc_peer->ksnp_nid,
1330 HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
1331 lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL);
/* break statements elided between cases in this view */
1333 case SOCKNAL_RX_BODY_FWD:
1334 ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED);
1336 case SOCKNAL_RX_HEADER:
1337 case SOCKNAL_RX_SLOP:
1344 ksocknal_put_peer (conn->ksnc_peer);
1346 PORTAL_FREE (conn, sizeof (*conn));
1347 atomic_dec (&ksocknal_data.ksnd_nclosing_conns);
/* Release one reference on 'conn'. The last put does not free directly:
 * it queues the conn on the zombie list and wakes the reaper, which does
 * the actual destroy in thread context. */
1351 ksocknal_put_conn (ksock_conn_t *conn)
1353 unsigned long flags;
1355 CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n",
1356 conn, conn->ksnc_peer->ksnp_nid,
1357 atomic_read (&conn->ksnc_refcount));
1359 LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
1360 if (!atomic_dec_and_test (&conn->ksnc_refcount))
1363 spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
1365 list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
1366 cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
1368 spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
/* Close every conn of 'peer' whose remote address matches 'ipaddr'
 * (ipaddr==0 presumably matches all — condition partially elided here).
 * Caller holds the global lock exclusively; returns a count (elided). */
1372 ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
1375 struct list_head *ctmp;
1376 struct list_head *cnxt;
1379 list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
1380 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
1383 conn->ksnc_ipaddr == ipaddr) {
1385 ksocknal_close_conn_locked (conn, why);
/* Close every conn of 'peer' whose incarnation differs from the given
 * one, i.e. conns belonging to a previous boot of the peer.  Caller
 * must hold ksnd_global_lock for writing. */
1393 ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation)
1396         struct list_head *ctmp;
1397         struct list_head *cnxt;
1400         list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
1401                 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
/* Current-incarnation conns are still valid; skip them. */
1403                 if (conn->ksnc_incarnation == incarnation)
1406                 CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d "
1407                       "incarnation:"LPX64"("LPX64")\n",
1408                       peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port,
1409                       conn->ksnc_incarnation, incarnation);
1412                 ksocknal_close_conn_locked (conn, -ESTALE);
/* Close 'conn' and every other conn to the same peer on the same local
 * IP, taking the global lock itself (unlocked entry point wrapping
 * ksocknal_close_peer_conns_locked). */
1419 ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
1421         ksock_peer_t     *peer = conn->ksnc_peer;
1422         __u32             ipaddr = conn->ksnc_ipaddr;
1423         unsigned long     flags;
1426         write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
1428         count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
1430         write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
/* Close all conns matching (nid, ipaddr); PTL_NID_ANY / ipaddr==0 act
 * as wildcards.  Returns 0 on success (wildcards always "succeed"),
 * -ENOENT if a specific target matched nothing. */
1436 ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr)
1438         unsigned long       flags;
1440         struct list_head   *ptmp;
1441         struct list_head   *pnxt;
1447         write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
/* Specific nid: only its hash chain needs scanning; otherwise sweep
 * the whole peer hash table (lo = 0 assignment elided in this view). */
1449         if (nid != PTL_NID_ANY)
1450                 lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers;
1453                 hi = ksocknal_data.ksnd_peer_hash_size - 1;
1456         for (i = lo; i <= hi; i++) {
1457                 list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
1459                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
1461                         if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid))
1464                         count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
1468         write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
1470         /* wildcards always succeed */
1471         if (nid == PTL_NID_ANY || ipaddr == 0)
1474         return (count == 0 ? -ENOENT : 0);
/* Router callback (kprni_notify): a gateway's liveness changed.  On
 * "down" we drop all conns to it; on "up" we do nothing, since
 * autoroutes reconnect on demand. */
1478 ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive)
1480         /* The router is telling me she's been notified of a change in
1481          * gateway state.... */
1483         CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down");
1486         /* If the gateway crashed, close all open connections... */
1487         ksocknal_close_matching_conns (gw_nid, 0);
1491         /* ...otherwise do nothing.  We can only establish new connections
1492          * if we have autoroutes, and these connect on demand. */
/* "Push" (poke via ksocknal_lib_push_conn) every conn of 'peer'.
 * Re-takes the global read lock on each iteration and indexes by
 * 'index' so the list may change between passes; a conn ref is held
 * across the unlocked push.  (Index-matching and loop-exit lines are
 * elided from this view.) */
1496 ksocknal_push_peer (ksock_peer_t *peer)
1500         struct list_head *tmp;
1503         for (index = 0; ; index++) {
1504                 read_lock (&ksocknal_data.ksnd_global_lock);
1509                 list_for_each (tmp, &peer->ksnp_conns) {
1511                         conn = list_entry (tmp, ksock_conn_t, ksnc_list);
/* Take a ref so the conn survives after we drop the lock. */
1512                         atomic_inc (&conn->ksnc_refcount);
1517                 read_unlock (&ksocknal_data.ksnd_global_lock);
1522                 ksocknal_lib_push_conn (conn);
1523                 ksocknal_put_conn (conn);
/* Push one peer (specific nid) or every peer in the hash table
 * (PTL_NID_ANY).  Same re-lock/index idiom as ksocknal_push_peer:
 * a peer ref is held across the unlocked push. */
1528 ksocknal_push (ptl_nid_t nid)
1531         struct list_head *tmp;
1537         if (nid != PTL_NID_ANY) {
1538                 peer = ksocknal_get_peer (nid);
1542                         ksocknal_push_peer (peer);
1543                         ksocknal_put_peer (peer);
/* Wildcard: walk every hash chain, one peer per lock/unlock cycle. */
1548         for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
1549                 for (j = 0; ; j++) {
1550                         read_lock (&ksocknal_data.ksnd_global_lock);
1555                         list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
1557                                 peer = list_entry(tmp, ksock_peer_t,
1559                                 atomic_inc (&peer->ksnp_refcount);
1564                         read_unlock (&ksocknal_data.ksnd_global_lock);
1568                                 ksocknal_push_peer (peer);
1569                                 ksocknal_put_peer (peer);
/* Register a local interface (ipaddress/netmask) with the socknal.
 * Duplicate registrations are silently ignored; at most
 * SOCKNAL_MAX_INTERFACES may be registered.  On success the new
 * interface's peer/route usage counters are initialised by scanning
 * the existing peer table.  Holds ksnd_global_lock for writing.
 * NOTE(review): this listing is elided — error-return paths and the
 * final return are missing from this view.
 *
 * BUGFIX: the passive-IP scan below iterated with 'j' but tested 'i'
 * ("for (j = 0; i < ...)"), so ksni_npeers was computed against the
 * outer hash index instead of the passive-IP count — either skipping
 * the scan or running it with the wrong bound.  Condition corrected
 * to 'j < peer->ksnp_n_passive_ips'. */
1579 ksocknal_add_interface(__u32 ipaddress, __u32 netmask)
1581         unsigned long      flags;
1582         ksock_interface_t *iface;
1586         struct list_head  *ptmp;
1588         struct list_head  *rtmp;
1589         ksock_route_t     *route;
/* Reject the zero address (netmask check elided in this view). */
1591         if (ipaddress == 0 ||
1595         write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
1597         iface = ksocknal_ip2iface(ipaddress);
1598         if (iface != NULL) {
1599                 /* silently ignore dups */
1601         } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) {
1604                 iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++];
1606                 iface->ksni_ipaddr = ipaddress;
1607                 iface->ksni_netmask = netmask;
1608                 iface->ksni_nroutes = 0;
1609                 iface->ksni_npeers = 0;
/* Count existing peers/routes already using this IP so the new
 * interface's usage counters start out accurate. */
1611                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
1612                         list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
1613                                 peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
1615                                 for (j = 0; j < peer->ksnp_n_passive_ips; j++)
1616                                         if (peer->ksnp_passive_ips[j] == ipaddress)
1617                                                 iface->ksni_npeers++;
1619                                 list_for_each(rtmp, &peer->ksnp_routes) {
1620                                         route = list_entry(rtmp, ksock_route_t, ksnr_list);
1622                                         if (route->ksnr_myipaddr == ipaddress)
1623                                                 iface->ksni_nroutes++;
1629                 /* NB only new connections will pay attention to the new interface! */
1632         write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
/* Remove all of 'peer''s state bound to local address 'ipaddr':
 * drop it from the passive-IP list, delete/unbind routes using it and
 * close conns bound to it.  Caller holds ksnd_global_lock for writing. */
1638 ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
1640         struct list_head *tmp;
1641         struct list_head *nxt;
1642         ksock_route_t    *route;
/* Compact the passive-IP array over the deleted entry. */
1647         for (i = 0; i < peer->ksnp_n_passive_ips; i++)
1648                 if (peer->ksnp_passive_ips[i] == ipaddr) {
1649                         for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
1650                                 peer->ksnp_passive_ips[j-1] =
1651                                         peer->ksnp_passive_ips[j];
1652                         peer->ksnp_n_passive_ips--;
1656         list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
1657                 route = list_entry (tmp, ksock_route_t, ksnr_list);
1659                 if (route->ksnr_myipaddr != ipaddr)
1662                 if (route->ksnr_share_count != 0) {
1663                         /* Manually created; keep, but unbind */
1664                         route->ksnr_myipaddr = 0;
/* Autocreated route: remove it outright (else branch elided). */
1666                         ksocknal_del_route_locked(route);
/* Finally close any conn whose local address is the deleted one. */
1670         list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
1671                 conn = list_entry(tmp, ksock_conn_t, ksnc_list);
1673                 if (conn->ksnc_myipaddr == ipaddr)
1674                         ksocknal_close_conn_locked (conn, 0);
/* Unregister a local interface (ipaddress==0 deletes all), compacting
 * the interface array and scrubbing per-peer state for each deleted
 * address via ksocknal_peer_del_interface_locked(). */
1679 ksocknal_del_interface(__u32 ipaddress)
1682         unsigned long      flags;
1683         struct list_head  *tmp;
1684         struct list_head  *nxt;
1690         write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
1692         for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) {
1693                 this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr;
1695                 if (!(ipaddress == 0 ||
1696                       ipaddress == this_ip))
/* Shift the remaining interfaces down over the deleted slot.
 * NOTE(review): 'i' is not re-examined after the shift here — whether
 * the loop compensates is on an elided line; verify against full source. */
1701                 for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++)
1702                         ksocknal_data.ksnd_interfaces[j-1] =
1703                                 ksocknal_data.ksnd_interfaces[j];
1705                 ksocknal_data.ksnd_ninterfaces--;
1707                 for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
1708                         list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) {
1709                                 peer = list_entry(tmp, ksock_peer_t, ksnp_list);
1711                                 ksocknal_peer_del_interface_locked(peer, this_ip);
1716         write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
/* ioctl-style command dispatcher registered with libcfs_nal_cmd_register().
 * Each NAL_CMD_* case unpacks fields from 'pcfg' and calls the matching
 * ksocknal_* operation, packing results back into 'pcfg'.
 * NOTE(review): this listing is heavily elided — braces, break/return
 * statements and several declarations are missing from this view. */
1722 ksocknal_cmd(struct portals_cfg *pcfg, void * private)
1726         switch(pcfg->pcfg_command) {
/* Report interface #pcfg_count: IP, netmask and usage counters. */
1727         case NAL_CMD_GET_INTERFACE: {
1728                 ksock_interface_t *iface;
1730                 read_lock (&ksocknal_data.ksnd_global_lock);
1732                 if (pcfg->pcfg_count < 0 ||
1733                     pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) {
1737                         iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count];
1739                         pcfg->pcfg_id    = iface->ksni_ipaddr;
1740                         pcfg->pcfg_misc  = iface->ksni_netmask;
1741                         pcfg->pcfg_fd    = iface->ksni_npeers;
1742                         pcfg->pcfg_count = iface->ksni_nroutes;
1745                 read_unlock (&ksocknal_data.ksnd_global_lock);
1748         case NAL_CMD_ADD_INTERFACE: {
1749                 rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */
1750                                             pcfg->pcfg_misc); /* net mask */
1753         case NAL_CMD_DEL_INTERFACE: {
1754                 rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */
/* Enumerate peer #pcfg_count; declarations for nid/myip/port etc.
 * are on elided lines. */
1757         case NAL_CMD_GET_PEER: {
1763                 int                share_count = 0;
1765                 rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid,
1767                                             &conn_count,  &share_count);
1768                 pcfg->pcfg_nid   = nid;
1769                 pcfg->pcfg_size  = myip;
1771                 pcfg->pcfg_misc  = port;
1772                 pcfg->pcfg_count = conn_count;
1773                 pcfg->pcfg_wait  = share_count;
1776         case NAL_CMD_ADD_PEER: {
1777                 rc = ksocknal_add_peer (pcfg->pcfg_nid,
1778                                         pcfg->pcfg_id, /* IP */
1779                                         pcfg->pcfg_misc); /* port */
1782         case NAL_CMD_DEL_PEER: {
1783                 rc = ksocknal_del_peer (pcfg->pcfg_nid,
1784                                         pcfg->pcfg_id, /* IP */
1785                                         pcfg->pcfg_flags); /* single_share? */
/* Enumerate conn #pcfg_count; holds a ref via get_conn_by_idx, dropped
 * after packing the results. */
1788         case NAL_CMD_GET_CONN: {
1789                 ksock_conn_t *conn = ksocknal_get_conn_by_idx (pcfg->pcfg_count);
1798                         ksocknal_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
1801                         pcfg->pcfg_nid    = conn->ksnc_peer->ksnp_nid;
1802                         pcfg->pcfg_id     = conn->ksnc_ipaddr;
1803                         pcfg->pcfg_misc   = conn->ksnc_port;
1804                         pcfg->pcfg_fd     = conn->ksnc_myipaddr;
1805                         pcfg->pcfg_flags  = conn->ksnc_type;
/* Scheduler index = pointer difference into the scheduler array. */
1806                         pcfg->pcfg_gw_nal = conn->ksnc_scheduler -
1807                                             ksocknal_data.ksnd_schedulers;
1808                         pcfg->pcfg_count  = txmem;
1809                         pcfg->pcfg_size   = rxmem;
1810                         pcfg->pcfg_wait   = nagle;
1811                         ksocknal_put_conn (conn);
/* Adopt an externally-created socket fd as a conn of the given type. */
1815         case NAL_CMD_REGISTER_PEER_FD: {
1816                 struct socket *sock = sockfd_lookup (pcfg->pcfg_fd, &rc);
1817                 int            type = pcfg->pcfg_misc;
1823                 case SOCKNAL_CONN_NONE:
1824                 case SOCKNAL_CONN_ANY:
1825                 case SOCKNAL_CONN_CONTROL:
1826                 case SOCKNAL_CONN_BULK_IN:
1827                 case SOCKNAL_CONN_BULK_OUT:
1828                         rc = ksocknal_create_conn(NULL, sock, type);
/* Drop the file ref taken by sockfd_lookup. */
1834                         cfs_put_file (KSN_SOCK2FILE(sock));
1837         case NAL_CMD_CLOSE_CONNECTION: {
1838                 rc = ksocknal_close_matching_conns (pcfg->pcfg_nid,
1842         case NAL_CMD_REGISTER_MYNID: {
1843                 rc = ksocknal_set_mynid (pcfg->pcfg_nid);
1846         case NAL_CMD_PUSH_CONNECTION: {
1847                 rc = ksocknal_push (pcfg->pcfg_nid);
/* Free every idle forwarding message buffer (fmb) in pool 'p',
 * including each fmb's page array.  Only legal once nothing is blocked
 * on or actively using the pool (asserted below). */
1859 ksocknal_free_fmbs (ksock_fmb_pool_t *p)
1861         int          npages = p->fmp_buff_pages;
1865         LASSERT (list_empty(&p->fmp_blocked_conns));
1866         LASSERT (p->fmp_nactive_fmbs == 0);
1868         while (!list_empty(&p->fmp_idle_fmbs)) {
1870                 fmb = list_entry(p->fmp_idle_fmbs.next,
1871                                  ksock_fmb_t, fmb_list);
/* Pages may be partially allocated if startup failed mid-way. */
1873                 for (i = 0; i < npages; i++)
1874                         if (fmb->fmb_kiov[i].kiov_page != NULL)
1875                                 cfs_free_page(fmb->fmb_kiov[i].kiov_page);
1877                 list_del(&fmb->fmb_list);
/* fmb is allocated with a flexible kiov tail; free the same size. */
1878                 PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages]));
/* Release all global socknal allocations: both fmb pools, the
 * scheduler array (if allocated) and the peer hash table. */
1883 ksocknal_free_buffers (void)
1885         ksocknal_free_fmbs(&ksocknal_data.ksnd_small_fmp);
1886         ksocknal_free_fmbs(&ksocknal_data.ksnd_large_fmp);
1888         LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_ltxs) == 0);
1890         if (ksocknal_data.ksnd_schedulers != NULL)
1891                 PORTAL_FREE (ksocknal_data.ksnd_schedulers,
1892                              sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
1894         PORTAL_FREE (ksocknal_data.ksnd_peers,
1895                      sizeof (struct list_head) *
1896                      ksocknal_data.ksnd_peer_hash_size);
/* Tear the NAL down, unwinding whatever startup reached.  The switch
 * on ksnd_init deliberately falls through from the most-initialised
 * state to the least: ALL -> LIB -> DATA -> NOTHING, undoing one
 * startup stage per case.  Do not reorder the cases. */
1900 ksocknal_api_shutdown (nal_t *nal)
1902         ksock_sched_t *sched;
/* Other refs remain: just drop the module use count and return. */
1905         if (nal->nal_refct != 0) {
1906                 /* This module got the first ref */
1907                 PORTAL_MODULE_UNUSE;
1911         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1912                atomic_read (&portal_kmemory));
1914         LASSERT(nal == &ksocknal_api);
1916         switch (ksocknal_data.ksnd_init) {
1920         case SOCKNAL_INIT_ALL:
/* Stop accepting configuration commands first. */
1921                 libcfs_nal_cmd_unregister(SOCKNAL);
1923                 ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB;
/* fall through */
1926         case SOCKNAL_INIT_LIB:
1927                 /* No more calls to ksocknal_cmd() to create new
1928                  * autoroutes/connections since we're being unloaded. */
1930                 /* Delete all peers */
1931                 ksocknal_del_peer(PTL_NID_ANY, 0, 0);
1933                 /* Wait for all peer state to clean up */
1935                 while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) {
/* Log at WARNING only on power-of-2 iterations to avoid log spam. */
1937                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1938                                "waiting for %d peers to disconnect\n",
1939                                atomic_read (&ksocknal_data.ksnd_npeers));
1940                         set_current_state (TASK_UNINTERRUPTIBLE);
1941                         schedule_timeout (cfs_time_seconds(1));
1944                 /* Tell lib we've stopped calling into her. */
1945                 lib_fini(&ksocknal_lib);
1947                 ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
/* fall through */
1950         case SOCKNAL_INIT_DATA:
/* Everything must be quiescent before structures are freed. */
1951                 LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0);
1952                 LASSERT (ksocknal_data.ksnd_peers != NULL);
1953                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
1954                         LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
1956                 LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
1957                 LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
1958                 LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes));
1959                 LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns));
1960                 LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
1962                 if (ksocknal_data.ksnd_schedulers != NULL)
1963                         for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
1964                                 ksock_sched_t *kss =
1965                                         &ksocknal_data.ksnd_schedulers[i];
1967                                 LASSERT (list_empty (&kss->kss_tx_conns));
1968                                 LASSERT (list_empty (&kss->kss_rx_conns));
1969                                 LASSERT (kss->kss_nconns == 0);
1972                 /* stop router calling me */
1973                 kpr_shutdown (&ksocknal_data.ksnd_router);
1975                 /* flag threads to terminate; wake and wait for them to die */
1976                 ksocknal_data.ksnd_shuttingdown = 1;
1977                 cfs_waitq_broadcast (&ksocknal_data.ksnd_autoconnectd_waitq);
1978                 cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq);
1980                 for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
1981                         sched = &ksocknal_data.ksnd_schedulers[i];
1982                         cfs_waitq_broadcast(&sched->kss_waitq);
/* Poll ksnd_nthreads under the lock, dropping it while sleeping. */
1986                 read_lock(&ksocknal_data.ksnd_global_lock);
1987                 while (ksocknal_data.ksnd_nthreads != 0) {
1989                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1990                                "waiting for %d threads to terminate\n",
1991                                ksocknal_data.ksnd_nthreads);
1992                         read_unlock(&ksocknal_data.ksnd_global_lock);
1993                         set_current_state (TASK_UNINTERRUPTIBLE);
1994                         schedule_timeout (cfs_time_seconds(1));
1995                         read_lock(&ksocknal_data.ksnd_global_lock);
1997                 read_unlock(&ksocknal_data.ksnd_global_lock);
1999                 kpr_deregister (&ksocknal_data.ksnd_router);
2001                 ksocknal_free_buffers();
2003                 ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
/* fall through */
2006         case SOCKNAL_INIT_NOTHING:
2010         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
2011                atomic_read (&portal_kmemory));
2013         printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n",
2014                atomic_read(&portal_kmemory));
/* Stamp this socknal instance with a unique incarnation number
 * (microseconds since the epoch at module load), used to detect stale
 * conns from a peer's previous boot (see ksocknal_close_stale_conns_locked). */
2019 ksocknal_init_incarnation (void)
2023         /* The incarnation number is the time this module loaded and it
2024          * identifies this particular instance of the socknal.  Hopefully
2025          * we won't be able to reboot more frequently than 1MHz for the
2026          * forseeable future :) */
2028         do_gettimeofday(&tv);
2030         ksocknal_data.ksnd_incarnation =
2031                 (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
/* Bring the NAL up: allocate and init all global state, start lib,
 * spawn scheduler/autoconnect/reaper threads, register with the router
 * and (if routing) allocate forwarding buffers, then register the
 * command interface.  ksnd_init tracks progress so that
 * ksocknal_api_shutdown() can unwind a partial startup on any failure.
 * NOTE(review): this listing is elided — error returns and some
 * declarations are missing from this view. */
2035 ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
2036                       ptl_ni_limits_t *requested_limits,
2037                       ptl_ni_limits_t *actual_limits)
2039         ptl_process_id_t  process_id;
2040         int               pkmem = atomic_read(&portal_kmemory);
2045         LASSERT (nal == &ksocknal_api);
/* Already initialised: just report current limits. */
2047         if (nal->nal_refct != 0) {
2048                 if (actual_limits != NULL)
2049                         *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits;
2050                 /* This module got the first ref */
2055         LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
2057         memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
2059         ksocknal_init_incarnation();
2061         ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
2062         PORTAL_ALLOC (ksocknal_data.ksnd_peers,
2063                       sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size);
2064         if (ksocknal_data.ksnd_peers == NULL)
2067         for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
2068                 CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
2070         rwlock_init(&ksocknal_data.ksnd_global_lock);
/* Small and large forwarding-buffer pools. */
2072         spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
2073         CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
2074         CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
2075         ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES;
2077         spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock);
2078         CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs);
2079         CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns);
2080         ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES;
2082         spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
2083         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
2084         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
2085         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
2086         cfs_waitq_init(&ksocknal_data.ksnd_reaper_waitq);
2088         spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock);
2089         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes);
2090         cfs_waitq_init(&ksocknal_data.ksnd_autoconnectd_waitq);
2092         /* NB memset above zeros whole of ksocknal_data, including
2093          * ksocknal_data.ksnd_irqinfo[all].ksni_valid */
2095         /* flag lists/ptrs/locks initialised */
2096         ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
2098         ksocknal_data.ksnd_nschedulers = ksocknal_nsched();
2099         PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
2100                      sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
2101         if (ksocknal_data.ksnd_schedulers == NULL) {
2102                 ksocknal_api_shutdown (nal);
2106         for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
2107                 ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
2109                 spin_lock_init (&kss->kss_lock);
2110                 CFS_INIT_LIST_HEAD (&kss->kss_rx_conns);
2111                 CFS_INIT_LIST_HEAD (&kss->kss_tx_conns);
2113                 CFS_INIT_LIST_HEAD (&kss->kss_zctxdone_list);
2115                 cfs_waitq_init (&kss->kss_waitq);
2118         /* NB we have to wait to be told our true NID... */
2119         process_id.pid = requested_pid;
2122         rc = lib_init(&ksocknal_lib, nal, process_id,
2123                       requested_limits, actual_limits);
2125                 CERROR("lib_init failed: error %d\n", rc);
2126                 ksocknal_api_shutdown (nal);
2130         ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called
/* One scheduler thread per scheduler slot. */
2132         for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
2133                 rc = ksocknal_thread_start (ksocknal_scheduler,
2134                                             &ksocknal_data.ksnd_schedulers[i]);
2136                         CERROR("Can't spawn socknal scheduler[%d]: %d\n",
2138                         ksocknal_api_shutdown (nal);
2143         for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) {
2144                 rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i));
2146                         CERROR("Can't spawn socknal autoconnectd: %d\n", rc);
2147                         ksocknal_api_shutdown (nal);
2152         rc = ksocknal_thread_start (ksocknal_reaper, NULL);
2154                 CERROR ("Can't spawn socknal reaper: %d\n", rc);
2155                 ksocknal_api_shutdown (nal);
2159         rc = kpr_register(&ksocknal_data.ksnd_router,
2160                           &ksocknal_router_interface);
2162                 CDEBUG(D_NET, "Can't initialise routing interface "
2163                        "(rc = %d): not routing\n", rc);
2165                 /* Only allocate forwarding buffers if there's a router */
2167                 for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
2168                                  SOCKNAL_LARGE_FWD_NMSGS); i++) {
2170                         ksock_fmb_pool_t *pool;
/* First SOCKNAL_SMALL_FWD_NMSGS iterations fill the small pool. */
2173                         if (i < SOCKNAL_SMALL_FWD_NMSGS)
2174                                 pool = &ksocknal_data.ksnd_small_fmp;
2176                                 pool = &ksocknal_data.ksnd_large_fmp;
2178                         PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t,
2179                                                    fmb_kiov[pool->fmp_buff_pages]));
2181                                 ksocknal_api_shutdown(nal);
2185                         fmb->fmb_pool = pool;
2187                         for (j = 0; j < pool->fmp_buff_pages; j++) {
2188                                 fmb->fmb_kiov[j].kiov_page = cfs_alloc_page(CFS_ALLOC_STD);
2190                                 if (fmb->fmb_kiov[j].kiov_page == NULL) {
2191                                         ksocknal_api_shutdown (nal);
2195                                 LASSERT(cfs_page_address(fmb->fmb_kiov[j].kiov_page) != NULL);
2198                         list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs);
2202         rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL);
2204                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
2205                 ksocknal_api_shutdown (nal);
2209         /* flag everything initialised */
2210         ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
2212         printk(KERN_INFO "Lustre: Routing socket NAL loaded "
2213                "(Routing %s, initial mem %d, incarnation "LPX64")\n",
2214                kpr_routing (&ksocknal_data.ksnd_router) ?
2215                "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation);
/* Module unload hook: drop the sysctl table (if registered), shut the
 * network interface down and unregister the NAL.
 * NOTE(review): the #endif matching CONFIG_SYSCTL is on an elided line. */
2221 ksocknal_module_fini (void)
2223 #ifdef CONFIG_SYSCTL
2224         if (ksocknal_tunables.ksnd_sysctl != NULL)
2225                 unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);
2227         PtlNIFini(ksocknal_ni);
2229         ptl_unregister_nal(SOCKNAL);
2232 extern cfs_sysctl_table_t ksocknal_top_ctl_table[];
/* Module load hook: sanity-check compile-time assumptions, install the
 * startup/shutdown entry points, set tunable defaults, register the
 * NAL, bring the NI up (pure gateways need the NAL live at load time)
 * and register the sysctl table. */
2235 ksocknal_module_init (void)
2239         /* packet descriptor must fit in a router descriptor's scratchpad */
2240         LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
2241         /* the following must be sizeof(int) for proc_dointvec() */
2242         LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int));
2243         LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int));
2244         LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int));
2245         LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int));
2246         LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int));
2247         LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int));
2248         LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof (int));
2249         LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof (int));
2250         LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int));
2252         LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int));
2255         LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int));
2257         /* check ksnr_connected/connecting field large enough */
2258         LASSERT(SOCKNAL_CONN_NTYPES <= 4);
2260         ksocknal_api.nal_ni_init = ksocknal_api_startup;
2261         ksocknal_api.nal_ni_fini = ksocknal_api_shutdown;
2263         /* Initialise dynamic tunables to defaults once only */
2264         ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT;
2265         ksocknal_tunables.ksnd_eager_ack  = SOCKNAL_EAGER_ACK;
2266         ksocknal_tunables.ksnd_typed_conns = SOCKNAL_TYPED_CONNS;
2267         ksocknal_tunables.ksnd_min_bulk   = SOCKNAL_MIN_BULK;
2268         ksocknal_tunables.ksnd_buffer_size = SOCKNAL_BUFFER_SIZE;
2269         ksocknal_tunables.ksnd_nagle      = SOCKNAL_NAGLE;
2270         ksocknal_tunables.ksnd_keepalive_idle = SOCKNAL_KEEPALIVE_IDLE;
2271         ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT;
2272         ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL;
2274         ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY;
2277         ksocknal_tunables.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
2280         rc = ptl_register_nal(SOCKNAL, &ksocknal_api);
2282                 CERROR("Can't register SOCKNAL: %d\n", rc);
2283                 return (-ENOMEM); /* or something... */
2286         /* Pure gateways want the NAL started up at module load time... */
2287         rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni);
/* PTL_IFACE_DUP just means the NI already exists — not an error here. */
2288         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
2289                 ptl_unregister_nal(SOCKNAL);
2293 #ifdef CONFIG_SYSCTL
2294         /* Press on regardless even if registering sysctl doesn't work */
2295         ksocknal_tunables.ksnd_sysctl =
2296                 register_sysctl_table (ksocknal_top_ctl_table, 0);
/* Standard kernel module metadata and entry-point registration. */
2301 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
2302 MODULE_DESCRIPTION("Kernel TCP Socket NAL v1.0.0");
2303 MODULE_LICENSE("GPL");
2305 cfs_module(ksocknal, "1.0.0", ksocknal_module_init, ksocknal_module_fini);