1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002 Cluster File Systems, Inc.
6 * This file is part of Portals
7 * http://sourceforge.net/projects/sandiaportals/
9 * Portals is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Portals is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Portals; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Routing table state: list heads for the route entries and for the
 * gateway entries those routes point at. */
26 LIST_HEAD(kpr_routes);
27 LIST_HEAD(kpr_gateways);
/* Forwarding statistics.  kpr_forward_packet() bumps the packet and byte
 * counters without taking any lock, so they are only approximate. */
30 unsigned long long kpr_fwd_bytes;
31 unsigned long kpr_fwd_packets;
32 unsigned long kpr_fwd_errors;
/* Number of forwards in flight: incremented on entry to
 * kpr_forward_packet(), decremented when the forward fails or completes. */
33 atomic_t kpr_queue_depth;
35 /* Mostly the tables are read-only (thread and interrupt context)
37 * Once in a blue moon we register/deregister NALs and add/remove routing
38 * entries (thread context only)... */
/* Usage in this file: table updates (register/deregister, shutdown, notify,
 * add/del route) take the lock with write_lock_irqsave(); route lookups and
 * route listing take it with read_lock(). */
39 rwlock_t kpr_rwlock = RW_LOCK_UNLOCKED;
/* Method table offered to NALs: binds each kprri_* slot of the router
 * interface to the kpr_* function in this file that implements it.  It is
 * registered in kpr_initialise() and exported at the end of the file.
 *
 * NOTE(review): 'field: value' is the old GCC initializer spelling; the C99
 * form is '.field = value'. */
41 kpr_router_interface_t kpr_router_interface = {
42 kprri_register: kpr_register_nal,
43 kprri_lookup: kpr_lookup_target,
44 kprri_fwd_start: kpr_forward_packet,
45 kprri_fwd_done: kpr_complete_packet,
46 kprri_notify: kpr_nal_notify,
47 kprri_shutdown: kpr_shutdown_nal,
48 kprri_deregister: kpr_deregister_nal,
/* Method table for route administration: binds each kprci_* slot to the
 * function in this file that adds, deletes or lists routes, or that accepts
 * a gateway up/down notification (kpr_sys_notify).  It is registered in
 * kpr_initialise() and exported at the end of the file. */
51 kpr_control_interface_t kpr_control_interface = {
52 kprci_add_route: kpr_add_route,
53 kprci_del_route: kpr_del_route,
54 kprci_get_route: kpr_get_route,
55 kprci_notify: kpr_sys_notify,
/* kprri_register entry point: make a NAL known to the router.
 *
 * Allocates a kpr_nal_entry_t, zeroes it, copies the caller's interface
 * description (*nalif) into it and links it onto kpr_nals under the write
 * lock.  If an entry with the same kprni_nalid is already on the list the
 * attempt is logged and the new entry is freed again.
 *
 * Must not be called from interrupt context (asserted).
 *
 * NOTE(review): 'argp' is presumably where the new entry is handed back as
 * the NAL's router handle -- the other entry points cast their 'arg' to
 * kpr_nal_entry_t * -- confirm. */
59 kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
65 CDEBUG (D_NET, "Registering NAL %d\n", nalif->kprni_nalid);
67 PORTAL_ALLOC (ne, sizeof (*ne));
/* Start from an all-zero entry, then snapshot the caller's interface. */
71 memset (ne, 0, sizeof (*ne));
72 memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif));
74 LASSERT (!in_interrupt());
75 write_lock_irqsave (&kpr_rwlock, flags);
/* Scan the registered NALs for one with the same id. */
77 for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
79 kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
81 if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
/* Duplicate: drop the lock before logging and freeing the unused entry. */
83 write_unlock_irqrestore (&kpr_rwlock, flags);
85 CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid);
87 PORTAL_FREE (ne, sizeof (*ne));
92 list_add (&ne->kpne_list, &kpr_nals);
94 write_unlock_irqrestore (&kpr_rwlock, flags);
/* Deferred-work handler for a gateway state change.
 *
 * 'arg' is the kpr_upcall_t queued by kpr_upcall().  Its NAL id, NID and
 * timestamp are formatted into strings, its state is rendered as "up" or
 * "down", and the resulting argument vector is passed to
 * portals_run_upcall(). */
102 kpr_do_upcall (void *arg)
104 kpr_upcall_t *u = (kpr_upcall_t *)arg;
113 u->kpru_alive ? "up" : "down",
/* Bounded formatting of the numeric arguments into local buffers. */
117 snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id);
118 snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid);
119 snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when);
121 portals_run_upcall (argv);
/* Queue an upcall reporting that gateway gw_nid on NAL gw_nalid is now up
 * (alive != 0) or down.
 *
 * The request record is allocated with GFP_ATOMIC and the upcall itself is
 * deferred to kpr_do_upcall() through the work queue.  If the allocation
 * fails the event is only logged. */
127 kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when)
129 char str[PTL_NALFMT_SIZE];
131 /* May be in arbitrary context */
132 kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC);
135 CERROR ("Upcall out of memory: nal %d nid "LPX64" (%s) %s\n",
137 portals_nid2str(gw_nalid, gw_nid, str),
138 alive ? "up" : "down");
/* Fill in the record that kpr_do_upcall() will read. */
142 u->kpru_nal_id = gw_nalid;
143 u->kpru_nid = gw_nid;
144 u->kpru_alive = alive;
/* Hand the record to kpr_do_upcall() via the work queue. */
147 prepare_work (&u->kpru_tq, kpr_do_upcall, u);
148 schedule_work (&u->kpru_tq);
/* Record a change in a gateway's liveness and propagate it.
 *
 * byNal          non-zero when a NAL reported the change (kpr_nal_notify
 *                passes 1), zero when userspace did (kpr_sys_notify passes 0)
 * gateway_nalid  NAL the gateway is on; 0 matches the gateway on any NAL
 * gateway_nid    NID of the gateway
 * alive          non-zero for "up", zero for "down"
 * when           time of the observation, compared against tv_sec
 *
 * A notification is dropped (after logging) when it is timestamped in the
 * future, when no matching gateway exists, when it is older than the
 * gateway's recorded timestamp, or when it does not change the recorded
 * state.  Otherwise the new state is stored under the write lock and, going
 * by the comments on the two tail branches, either the gateway's NAL is told
 * (userspace-originated) or an upcall is issued (NAL-originated). */
152 kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid,
153 int alive, time_t when)
157 kpr_nal_entry_t *ne = NULL;
158 kpr_gateway_entry_t *ge = NULL;
162 char str[PTL_NALFMT_SIZE];
164 CDEBUG (D_NET, "%s notifying [%d] "LPX64": %s\n",
165 byNal ? "NAL" : "userspace",
166 gateway_nalid, gateway_nid, alive ? "up" : "down");
168 /* can't do predictions... */
169 do_gettimeofday (&now);
170 if (when > now.tv_sec) {
171 CWARN ("Ignoring prediction from %s of [%d] "LPX64" %s "
172 "%ld seconds in the future\n",
173 byNal ? "NAL" : "userspace",
174 gateway_nalid, gateway_nid,
175 alive ? "up" : "down",
180 LASSERT (when <= now.tv_sec);
182 /* Serialise with lookups (i.e. write lock) */
183 write_lock_irqsave(&kpr_rwlock, flags);
/* Find the gateway entry this notification refers to. */
186 list_for_each_safe (e, n, &kpr_gateways) {
188 ge = list_entry(e, kpr_gateway_entry_t, kpge_list);
/* gateway_nalid == 0 acts as a wildcard: match on NID alone. */
189 if ((gateway_nalid != 0 &&
190 ge->kpge_nalid != gateway_nalid) ||
191 ge->kpge_nid != gateway_nid)
199 /* gateway not found */
200 write_unlock_irqrestore(&kpr_rwlock, flags);
201 CDEBUG (D_NET, "Gateway not found\n");
205 if (when < ge->kpge_timestamp) {
206 /* out of date information */
207 write_unlock_irqrestore (&kpr_rwlock, flags);
208 CDEBUG (D_NET, "Out of date\n");
212 /* update timestamp */
213 ge->kpge_timestamp = when;
/* Compare as booleans, so any non-zero 'alive' counts as up. */
215 if ((!ge->kpge_alive) == (!alive)) {
216 /* new date for old news */
217 write_unlock_irqrestore (&kpr_rwlock, flags);
218 CDEBUG (D_NET, "Old news\n");
222 ge->kpge_alive = alive;
223 CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive);
226 /* Reset all gateway weights so the newly-enabled gateway
227 * doesn't have to play catch-up */
228 list_for_each_safe (e, n, &kpr_gateways) {
229 kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t,
231 atomic_set (&ge->kpge_weight, 0);
237 /* userland notified me: notify NAL? */
238 ne = kpr_find_nal_entry_locked (ge->kpge_nalid);
240 if (!ne->kpne_shutdown &&
241 ne->kpne_interface.kprni_notify != NULL) {
242 /* take a ref on this NAL until notifying
243 * it has completed... */
244 atomic_inc (&ne->kpne_refcount);
250 write_unlock_irqrestore(&kpr_rwlock, flags);
/* Call the NAL's notify hook; the reference taken above is what keeps 'ne'
 * valid until the atomic_dec that follows. */
253 ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg,
255 /* 'ne' can disappear now... */
256 atomic_dec (&ne->kpne_refcount);
260 /* It wasn't userland that notified me... */
261 CWARN ("Upcall: NAL %d NID "LPX64" (%s) is %s\n",
262 gateway_nalid, gateway_nid,
263 portals_nid2str(gateway_nalid, gateway_nid, str),
264 alive ? "alive" : "dead");
265 kpr_upcall (gateway_nalid, gateway_nid, alive, when);
267 CDEBUG (D_NET, " NOT Doing upcall\n");
/* kprri_notify entry point: a NAL reports that 'peer' was alive or dead at
 * time 'when'.  'arg' is that NAL's kpr_nal_entry_t; the report is passed to
 * kpr_do_notify() with byNal set to 1 and the NAL's own id. */
274 kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when)
276 kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
278 kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when);
/* kprri_shutdown entry point: mark the NAL in 'arg' as shutting down and
 * wait until nothing references it any more.
 *
 * kpne_shutdown is set under the write lock; the function then polls
 * kpne_refcount, sleeping uninterruptibly for HZ jiffies between checks,
 * until it reads zero.
 *
 * Must not be called from interrupt context, and must not be called on a
 * NAL that is already shut down (both asserted). */
282 kpr_shutdown_nal (void *arg)
285 kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
287 CDEBUG (D_NET, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
289 LASSERT (!ne->kpne_shutdown);
290 LASSERT (!in_interrupt());
292 write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */
293 ne->kpne_shutdown = 1;
294 write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */
/* Wait for in-flight forwards and notifications to drop their references. */
296 while (atomic_read (&ne->kpne_refcount) != 0)
298 CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
299 ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
301 set_current_state (TASK_UNINTERRUPTIBLE);
302 schedule_timeout (HZ);
/* kprri_deregister entry point: remove the NAL in 'arg' from the router and
 * free its entry.
 *
 * Preconditions (all asserted): kpr_shutdown_nal() has already been run on
 * this NAL, its refcount is zero, and the caller is not in interrupt
 * context.  'arg' must not be used after this returns. */
307 kpr_deregister_nal (void *arg)
310 kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
312 CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
314 LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */
315 LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
316 LASSERT (!in_interrupt());
/* Unlink under the write lock so concurrent list walkers never see it. */
318 write_lock_irqsave (&kpr_rwlock, flags);
320 list_del (&ne->kpne_list);
322 write_unlock_irqrestore (&kpr_rwlock, flags);
324 PORTAL_FREE (ne, sizeof (*ne));
/* Compare the load-balancing weights of two gateways.
 *
 * Computes rc as true when (weight(ge1) - weight(ge2)), masked to
 * 'significant_bits', falls in the upper half of the masked range -- that
 * is, when ge1's weight is behind ge2's allowing for wrap-around.  The route
 * searches switch to the candidate passed as ge1 when this test succeeds.
 *
 * NOTE(review): the return statement is not visible here; presumably rc is
 * the result -- confirm. */
329 kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2)
331 const int significant_bits = 0x00ffffff;
332 /* We use atomic_t to record/compare route weights for
333 * load-balancing. Here we limit ourselves to only using
334 * 'significant_bits' when we do an 'after' comparison */
336 int diff = (atomic_read (&ge1->kpge_weight) -
337 atomic_read (&ge2->kpge_weight)) & significant_bits;
/* More than half the masked range means the true difference is negative. */
338 int rc = (diff > (significant_bits >> 1));
340 CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n",
341 ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight),
343 ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight));
/* Charge gateway 'ge' for carrying a payload of 'nob' bytes.
 *
 * The charge is 1 plus nob divided by sizeof(ptl_hdr_t), rounded to the
 * nearest whole unit, and is added atomically to ge->kpge_weight. */
349 kpr_update_weight (kpr_gateway_entry_t *ge, int nob)
/* Adding half the divisor before dividing rounds to nearest. */
351 int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t);
353 /* We've chosen this route entry (i.e. gateway) to forward payload
354 * of length 'nob'; update the route's weight to make it less
355 * favoured. Note that the weight is 1 plus the payload size
356 * rounded and scaled to the portals header size, so we get better
357 * use of the significant bits in kpge_weight. */
359 CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge,
360 ge->kpge_nid, weight);
362 atomic_add (weight, &ge->kpge_weight);
/* kprri_lookup entry point: find a gateway on the calling NAL's own network
 * through which 'target_nid' can be reached.
 *
 * arg           the calling NAL's kpr_nal_entry_t
 * target_nid    destination NID
 * nob           payload size, charged to the chosen gateway's weight
 * gateway_nidp  receives the chosen gateway's NID
 *
 * Routes are scanned under the read lock.  A route qualifies when its
 * [kpre_lo_nid, kpre_hi_nid] range contains target_nid and its gateway is
 * alive and on the caller's NAL; kpr_ge_isbetter() decides between
 * qualifying gateways.  The final debug message treats rc == 0 as the case
 * in which *gateway_nidp is valid. */
366 kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
367 ptl_nid_t *gateway_nidp)
369 kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
371 kpr_route_entry_t *re;
372 kpr_gateway_entry_t *ge = NULL;
375 /* Caller wants to know if 'target_nid' can be reached via a gateway
376 * ON HER OWN NETWORK */
378 CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid,
379 ne->kpne_interface.kprni_nalid);
381 if (ne->kpne_shutdown) /* caller is shutting down */
384 read_lock (&kpr_rwlock);
386 /* Search routes for one that has a gateway to target_nid on the callers network */
388 list_for_each (e, &kpr_routes) {
389 re = list_entry (e, kpr_route_entry_t, kpre_list);
/* Test whether target_nid lies outside this route's NID range. */
391 if (re->kpre_lo_nid > target_nid ||
392 re->kpre_hi_nid < target_nid)
395 /* found table entry */
397 if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid ||
398 !re->kpre_gateway->kpge_alive) {
399 /* different NAL or gateway down */
405 kpr_ge_isbetter (re->kpre_gateway, ge))
406 ge = re->kpre_gateway;
/* Charge the chosen gateway and report its NID to the caller. */
410 kpr_update_weight (ge, nob);
411 *gateway_nidp = ge->kpge_nid;
415 read_unlock (&kpr_rwlock);
417 /* NB can't deref 're' now; it might have been removed! */
419 CDEBUG (D_NET, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
420 target_nid, ne->kpne_interface.kprni_nalid, rc,
421 (rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
/* Find the registered NAL entry whose kprni_nalid equals 'nal_id' by walking
 * kpr_nals.  The caller must already hold kpr_rwlock.  kpr_forward_packet()
 * tests the result against NULL, so NULL is presumably the not-found
 * result. */
426 kpr_find_nal_entry_locked (int nal_id)
430 /* Called with kpr_rwlock held */
432 list_for_each (e, &kpr_nals) {
433 kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list);
435 if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */
/* kprri_fwd_start entry point: forward the packet described by 'fwd' from
 * the NAL in 'arg' towards fwd->kprfd_target_nid via another NAL.
 *
 * On entry the global queue depth and the source NAL's refcount are raised
 * and the (unlocked) statistics are updated.  The source NAL entry is stored
 * in fwd->kprfd_router_arg, where kpr_complete_packet() reads it back.
 *
 * Routes are scanned under the read lock for one whose NID range contains
 * the target and whose gateway is alive, is on a different NAL from the
 * source, and belongs to a NAL that is registered and not shutting down;
 * kpr_ge_isbetter() decides between candidates.  On success the gateway is
 * charged, fwd->kprfd_gateway_nid is set, the destination NAL's refcount is
 * raised, the lock is dropped and the destination NAL's kprni_fwd method is
 * called.
 *
 * On failure fwd's callback is invoked with -EHOSTUNREACH and the queue
 * depth and source refcount are dropped again. */
445 kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
447 kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg;
448 ptl_nid_t target_nid = fwd->kprfd_target_nid;
449 int nob = fwd->kprfd_nob;
450 kpr_gateway_entry_t *ge = NULL;
451 kpr_nal_entry_t *dst_ne = NULL;
453 kpr_route_entry_t *re;
454 kpr_nal_entry_t *tmp_ne;
456 CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd,
457 target_nid, src_ne->kpne_interface.kprni_nalid);
/* The declared payload size must agree with the fragment list. */
459 LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov));
461 atomic_inc (&kpr_queue_depth);
462 atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */
464 kpr_fwd_packets++; /* (loose) stats accounting */
465 kpr_fwd_bytes += nob + sizeof(ptl_hdr_t);
467 if (src_ne->kpne_shutdown) /* caller is shutting down */
470 fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */
472 read_lock (&kpr_rwlock);
474 /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
476 list_for_each (e, &kpr_routes) {
477 re = list_entry (e, kpr_route_entry_t, kpre_list);
479 if (re->kpre_lo_nid > target_nid || /* no match */
480 re->kpre_hi_nid < target_nid)
483 if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid)
484 continue; /* don't route to same NAL */
486 if (!re->kpre_gateway->kpge_alive)
487 continue; /* gateway is dead */
/* The lock is already held for reading, hence the _locked lookup. */
489 tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid);
491 if (tmp_ne == NULL ||
492 tmp_ne->kpne_shutdown) {
493 /* NAL must be registered and not shutting down */
498 kpr_ge_isbetter (re->kpre_gateway, ge)) {
499 ge = re->kpre_gateway;
505 LASSERT (dst_ne != NULL);
507 kpr_update_weight (ge, nob);
509 fwd->kprfd_gateway_nid = ge->kpge_nid;
510 atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
512 read_unlock (&kpr_rwlock);
514 CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d: "
515 "to "LPX64" on NAL %d\n",
516 fwd, target_nid, src_ne->kpne_interface.kprni_nalid,
517 fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
/* Pass the packet to the destination NAL after the lock has been dropped. */
519 dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
523 read_unlock (&kpr_rwlock);
527 CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
528 target_nid, src_ne->kpne_interface.kprni_nalid);
530 /* Can't find anywhere to forward to */
531 (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH);
/* Undo the accounting taken on entry. */
533 atomic_dec (&kpr_queue_depth);
534 atomic_dec (&src_ne->kpne_refcount);
/* kprri_fwd_done entry point: the destination NAL has finished with 'fwd'.
 *
 * arg    the destination NAL's kpr_nal_entry_t
 * fwd    the forwarding descriptor; the source NAL entry is read back from
 *        fwd->kprfd_router_arg, where kpr_forward_packet() stored it
 * error  completion status passed on to fwd's callback
 *
 * Order matters: the destination reference is dropped first, then the
 * callback runs, then the queue depth and the source reference are dropped.
 * Neither NAL entry may be touched after its reference has gone. */
538 kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
540 kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
541 kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
543 CDEBUG (D_NET, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
544 src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
546 atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */
548 (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
550 CDEBUG (D_NET, "complete(2) [%p] from NAL %d: %d\n", fwd,
551 src_ne->kpne_interface.kprni_nalid, error);
553 atomic_dec (&kpr_queue_depth);
554 atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! */
/* kprci_add_route entry point: add a route for the NID range
 * [lo_nid, hi_nid] through gateway 'gateway_nid' on NAL 'gateway_nalid'.
 *
 * A gateway entry and a route entry are both allocated before the write
 * lock is taken.  Under the lock the gateway list is searched for an entry
 * with the same NAL id and NID; when one is found the freshly allocated
 * gateway entry is freed.  The code that follows the "Adding a new gateway"
 * comment links the new entry onto kpr_gateways and zeroes every gateway's
 * weight.  Finally the route is pointed at its gateway and added to
 * kpr_routes.
 *
 * Arguments are first checked against PTL_NID_ANY; presumably such requests
 * are rejected -- the outcome of the check is not visible here.
 *
 * Must not be called from interrupt context (asserted). */
558 kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid,
559 ptl_nid_t lo_nid, ptl_nid_t hi_nid)
563 kpr_route_entry_t *re;
564 kpr_gateway_entry_t *ge;
567 CDEBUG(D_NET, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
568 gateway_nalid, gateway_nid, lo_nid, hi_nid);
570 if (gateway_nalid == PTL_NID_ANY ||
571 lo_nid == PTL_NID_ANY ||
572 hi_nid == PTL_NID_ANY ||
/* Allocate and initialise the candidate gateway entry before locking. */
576 PORTAL_ALLOC (ge, sizeof (*ge));
580 ge->kpge_nalid = gateway_nalid;
581 ge->kpge_nid = gateway_nid;
583 ge->kpge_timestamp = 0;
584 ge->kpge_refcount = 0;
585 atomic_set (&ge->kpge_weight, 0);
587 PORTAL_ALLOC (re, sizeof (*re));
/* Presumably the route-allocation failure path: give the gateway entry
 * back as well. */
589 PORTAL_FREE (ge, sizeof (*ge));
593 re->kpre_lo_nid = lo_nid;
594 re->kpre_hi_nid = hi_nid;
596 LASSERT(!in_interrupt());
597 write_lock_irqsave (&kpr_rwlock, flags);
/* Look for an existing entry for this gateway. */
599 list_for_each (e, &kpr_gateways) {
600 kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
603 if (ge2->kpge_nalid == gateway_nalid &&
604 ge2->kpge_nid == gateway_nid) {
605 PORTAL_FREE (ge, sizeof (*ge));
613 /* Adding a new gateway... */
615 list_add (&ge->kpge_list, &kpr_gateways);
617 /* ...zero all gateway weights so this one doesn't have to
620 list_for_each (e, &kpr_gateways) {
621 kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
623 atomic_set (&ge2->kpge_weight, 0);
628 re->kpre_gateway = ge;
630 list_add (&re->kpre_list, &kpr_routes);
632 write_unlock_irqrestore (&kpr_rwlock, flags);
/* kprci_notify entry point: a gateway up/down report arriving through the
 * control interface.  Returns the result of kpr_do_notify() called with
 * byNal set to 0. */
637 kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid,
638 int alive, time_t when)
640 return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when));
/* kprci_del_route entry point: delete routes through gateway 'gw_nid' on
 * NAL 'gw_nalid'.
 *
 * lo == PTL_NID_ANY selects every route through the gateway; the argument
 * check then requires hi to be PTL_NID_ANY too.  Otherwise a single route is
 * selected by its exact range, and the check requires hi != PTL_NID_ANY and
 * hi >= lo.
 *
 * Under the write lock each selected route drops one reference on its
 * gateway entry (which is unlinked and freed when the count reaches zero)
 * and is then itself unlinked and freed.
 *
 * Must not be called from interrupt context (asserted). */
644 kpr_del_route (int gw_nalid, ptl_nid_t gw_nid,
645 ptl_nid_t lo, ptl_nid_t hi)
/* 'specific' is set when the caller names one route rather than all. */
647 int specific = (lo != PTL_NID_ANY);
653 CDEBUG(D_NET, "Del route [%d] "LPX64" : "LPX64" - "LPX64"\n",
654 gw_nalid, gw_nid, lo, hi);
656 LASSERT(!in_interrupt());
658 /* NB Caller may specify either all routes via the given gateway
659 * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are
662 if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY))
665 write_lock_irqsave(&kpr_rwlock, flags);
667 list_for_each_safe (e, n, &kpr_routes) {
668 kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
670 kpr_gateway_entry_t *ge = re->kpre_gateway;
672 if (ge->kpge_nalid != gw_nalid ||
673 ge->kpge_nid != gw_nid ||
675 (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid)))
680 if (--ge->kpge_refcount == 0) {
681 list_del (&ge->kpge_list);
682 PORTAL_FREE (ge, sizeof (*ge));
685 list_del (&re->kpre_list);
686 PORTAL_FREE(re, sizeof (*re));
692 write_unlock_irqrestore(&kpr_rwlock, flags);
/* kprci_get_route entry point: report one entry of the routing table.
 *
 * Walks kpr_routes under the read lock and, for the selected entry, copies
 * the gateway's NAL id, NID and liveness and the route's NID range into the
 * out parameters before dropping the lock.
 *
 * NOTE(review): the test that selects the entry is not visible here;
 * presumably 'idx' counts entries in list order -- confirm. */
697 kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
698 ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive)
702 read_lock(&kpr_rwlock);
704 for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
705 kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
707 kpr_gateway_entry_t *ge = re->kpre_gateway;
/* Copy everything out while the lock still protects 're' and 'ge'. */
710 *gateway_nalid = ge->kpge_nalid;
711 *gateway_nid = ge->kpge_nid;
712 *alive = ge->kpge_alive;
713 *lo_nid = re->kpre_lo_nid;
714 *hi_nid = re->kpre_hi_nid;
716 read_unlock(&kpr_rwlock);
/* Fall-through exit once the walk has ended. */
721 read_unlock (&kpr_rwlock);
/* Module exit handler (installed with module_exit below).
 *
 * Asserts that every NAL has deregistered, frees all remaining route
 * entries, unregisters the router and control interfaces, and logs the
 * kernel memory counter.
 *
 * NOTE(review): the visible loop frees route entries only; nothing visible
 * here releases the gateway entries on kpr_gateways that those routes point
 * at -- confirm they are freed elsewhere or this leaks them. */
725 static void /*__exit*/
728 LASSERT (list_empty (&kpr_nals));
/* Pop and free routes from the head of the list until it is empty. */
730 while (!list_empty (&kpr_routes)) {
731 kpr_route_entry_t *re = list_entry(kpr_routes.next,
735 list_del(&re->kpre_list);
736 PORTAL_FREE(re, sizeof (*re));
741 PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
742 PORTAL_SYMBOL_UNREGISTER(kpr_control_interface);
744 CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
745 atomic_read(&portal_kmemory));
/* Module init handler (installed with module_init below): logs the kernel
 * memory counter and registers the router and control interfaces so other
 * modules can look them up. */
749 kpr_initialise (void)
751 CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
752 atomic_read(&portal_kmemory));
756 PORTAL_SYMBOL_REGISTER(kpr_router_interface);
757 PORTAL_SYMBOL_REGISTER(kpr_control_interface);
/* Module metadata. */
761 MODULE_AUTHOR("Eric Barton");
762 MODULE_DESCRIPTION("Kernel Portals Router v0.01");
763 MODULE_LICENSE("GPL");
/* Load/unload hooks. */
765 module_init (kpr_initialise);
766 module_exit (kpr_finalise);
/* The two interface tables are the module's exported symbols. */
768 EXPORT_SYMBOL (kpr_control_interface);
769 EXPORT_SYMBOL (kpr_router_interface);