Whamcloud - gitweb
- debug info to investigate a source of small writes
[fs/lustre-release.git] / lnet / router / router.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2002 Cluster File Systems, Inc.
5  *
6  *   This file is part of Portals
7  *   http://sourceforge.net/projects/sandiaportals/
8  *
9  *   Portals is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Portals is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Portals; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "router.h"
25
26 LIST_HEAD(kpr_routes);
27 LIST_HEAD(kpr_gateways);
28 LIST_HEAD(kpr_nals);
29
30 unsigned int       kpr_routes_generation;
31 unsigned long long kpr_fwd_bytes;
32 unsigned long      kpr_fwd_packets;
33 unsigned long      kpr_fwd_errors;
34 atomic_t           kpr_queue_depth;
35
36 /* Mostly the tables are read-only (thread and interrupt context)
37  *
38  * Once in a blue moon we register/deregister NALs and add/remove routing
39  * entries (thread context only)... */
40 rwlock_t         kpr_rwlock = RW_LOCK_UNLOCKED;
41
42 kpr_router_interface_t kpr_router_interface = {
43         kprri_register:         kpr_register_nal,
44         kprri_lookup:           kpr_lookup_target,
45         kprri_fwd_start:        kpr_forward_packet,
46         kprri_fwd_done:         kpr_complete_packet,
47         kprri_notify:           kpr_nal_notify,
48         kprri_shutdown:         kpr_shutdown_nal,
49         kprri_deregister:       kpr_deregister_nal,
50 };
51
52 int
53 kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
54 {
55         unsigned long      flags;
56         struct list_head  *e;
57         kpr_nal_entry_t   *ne;
58
59         CDEBUG (D_NET, "Registering NAL %x\n", nalif->kprni_nalid);
60
61         PORTAL_ALLOC (ne, sizeof (*ne));
62         if (ne == NULL)
63                 return (-ENOMEM);
64
65         memset (ne, 0, sizeof (*ne));
66         memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif));
67
68         LASSERT (!in_interrupt());
69         write_lock_irqsave (&kpr_rwlock, flags);
70
71         for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
72         {
73                 kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
74
75                 if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
76                 {
77                         write_unlock_irqrestore (&kpr_rwlock, flags);
78
79                         CERROR ("Attempt to register same NAL %x twice\n", ne->kpne_interface.kprni_nalid);
80
81                         PORTAL_FREE (ne, sizeof (*ne));
82                         return (-EEXIST);
83                 }
84         }
85
86         list_add (&ne->kpne_list, &kpr_nals);
87
88         write_unlock_irqrestore (&kpr_rwlock, flags);
89
90         *argp = ne;
91         PORTAL_MODULE_USE;
92         return (0);
93 }
94
95 void
96 kpr_do_upcall (void *arg)
97 {
98         kpr_upcall_t *u = (kpr_upcall_t *)arg;
99         char          nalstr[10];
100         char          nidstr[36];
101         char          whenstr[36];
102         char         *argv[] = {
103                 NULL,
104                 "ROUTER_NOTIFY",
105                 nalstr,
106                 nidstr,
107                 u->kpru_alive ? "up" : "down",
108                 whenstr,
109                 NULL};
110         
111         snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id);
112         snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid);
113         snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when);
114
115         portals_run_upcall (argv);
116
117         kfree (u);
118 }
119
120 void
121 kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when)
122 {
123         char str[PTL_NALFMT_SIZE];
124         
125         /* May be in arbitrary context */
126         kpr_upcall_t  *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC);
127
128         if (u == NULL) {
129                 CERROR ("Upcall out of memory: nal %x nid "LPX64" (%s) %s\n",
130                         gw_nalid, gw_nid,
131                         portals_nid2str(gw_nalid, gw_nid, str),
132                         alive ? "up" : "down");
133                 return;
134         }
135
136         u->kpru_nal_id     = gw_nalid;
137         u->kpru_nid        = gw_nid;
138         u->kpru_alive      = alive;
139         u->kpru_when       = when;
140
141         prepare_work (&u->kpru_tq, kpr_do_upcall, u);
142         schedule_work (&u->kpru_tq);
143 }
144
145 int
146 kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid,
147                int alive, time_t when)
148 {
149         unsigned long        flags;
150         int                  found;
151         kpr_nal_entry_t     *ne = NULL;
152         kpr_gateway_entry_t *ge = NULL;
153         struct timeval       now;
154         struct list_head    *e;
155         struct list_head    *n;
156         char                 str[PTL_NALFMT_SIZE];
157
158         CDEBUG (D_NET, "%s notifying [%x] "LPX64": %s\n", 
159                 byNal ? "NAL" : "userspace", 
160                 gateway_nalid, gateway_nid, alive ? "up" : "down");
161
162         /* can't do predictions... */
163         do_gettimeofday (&now);
164         if (when > now.tv_sec) {
165                 CWARN ("Ignoring prediction from %s of [%x] "LPX64" %s "
166                        "%ld seconds in the future\n", 
167                        byNal ? "NAL" : "userspace", 
168                        gateway_nalid, gateway_nid, 
169                        alive ? "up" : "down",
170                        when - now.tv_sec);
171                 return (EINVAL);
172         }
173
174         LASSERT (when <= now.tv_sec);
175
176         /* Serialise with lookups (i.e. write lock) */
177         write_lock_irqsave(&kpr_rwlock, flags);
178
179         found = 0;
180         list_for_each_safe (e, n, &kpr_gateways) {
181
182                 ge = list_entry(e, kpr_gateway_entry_t, kpge_list);
183                 if ((gateway_nalid != 0 &&
184                      ge->kpge_nalid != gateway_nalid) ||
185                     ge->kpge_nid != gateway_nid)
186                         continue;
187
188                 found = 1;
189                 break;
190         }
191
192         if (!found) {
193                 /* gateway not found */
194                 write_unlock_irqrestore(&kpr_rwlock, flags);
195                 CDEBUG (D_NET, "Gateway not found\n");
196                 return (0);
197         }
198         
199         if (when < ge->kpge_timestamp) {
200                 /* out of date information */
201                 write_unlock_irqrestore (&kpr_rwlock, flags);
202                 CDEBUG (D_NET, "Out of date\n");
203                 return (0);
204         }
205
206         /* update timestamp */
207         ge->kpge_timestamp = when;
208
209         if ((!ge->kpge_alive) == (!alive)) {
210                 /* new date for old news */
211                 write_unlock_irqrestore (&kpr_rwlock, flags);
212                 CDEBUG (D_NET, "Old news\n");
213                 return (0);
214         }
215
216         ge->kpge_alive = alive;
217         CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive);
218
219         if (alive) {
220                 /* Reset all gateway weights so the newly-enabled gateway
221                  * doesn't have to play catch-up */
222                 list_for_each_safe (e, n, &kpr_gateways) {
223                         kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t,
224                                                              kpge_list);
225                         atomic_set (&ge->kpge_weight, 0);
226                 }
227         }
228
229         found = 0;
230         if (!byNal) {
231                 /* userland notified me: notify NAL? */
232                 ne = kpr_find_nal_entry_locked (ge->kpge_nalid);
233                 if (ne != NULL) {
234                         if (!ne->kpne_shutdown &&
235                             ne->kpne_interface.kprni_notify != NULL) {
236                                 /* take a ref on this NAL until notifying
237                                  * it has completed... */
238                                 atomic_inc (&ne->kpne_refcount);
239                                 found = 1;
240                         }
241                 }
242         }
243
244         write_unlock_irqrestore(&kpr_rwlock, flags);
245
246         if (found) {
247                 ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg,
248                                                  gateway_nid, alive);
249                 /* 'ne' can disappear now... */
250                 atomic_dec (&ne->kpne_refcount);
251         }
252         
253         if (byNal) {
254                 /* It wasn't userland that notified me... */
255                 CWARN ("Upcall: NAL %x NID "LPX64" (%s) is %s\n",
256                        gateway_nalid, gateway_nid,
257                        portals_nid2str(gateway_nalid, gateway_nid, str),
258                        alive ? "alive" : "dead");
259                 kpr_upcall (gateway_nalid, gateway_nid, alive, when);
260         } else {
261                 CDEBUG (D_NET, " NOT Doing upcall\n");
262         }
263         
264         return (0);
265 }
266
267 void
268 kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when)
269 {
270         kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
271         
272         kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when);
273 }
274
275 void
276 kpr_shutdown_nal (void *arg)
277 {
278         unsigned long    flags;
279         kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
280
281         CDEBUG (D_NET, "Shutting down NAL %x\n", ne->kpne_interface.kprni_nalid);
282
283         LASSERT (!ne->kpne_shutdown);
284         LASSERT (!in_interrupt());
285
286         write_lock_irqsave (&kpr_rwlock, flags);
287         ne->kpne_shutdown = 1;
288         write_unlock_irqrestore (&kpr_rwlock, flags);
289 }
290
291 void
292 kpr_deregister_nal (void *arg)
293 {
294         unsigned long     flags;
295         kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
296
297         CDEBUG (D_NET, "Deregister NAL %x\n", ne->kpne_interface.kprni_nalid);
298
299         LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
300         LASSERT (!in_interrupt());
301
302         write_lock_irqsave (&kpr_rwlock, flags);
303         list_del (&ne->kpne_list);
304         write_unlock_irqrestore (&kpr_rwlock, flags);
305
306         /* Wait until all outstanding messages/notifications have completed */
307         while (atomic_read (&ne->kpne_refcount) != 0)
308         {
309                 CDEBUG (D_NET, "Waiting for refcount on NAL %x to reach zero (%d)\n",
310                         ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
311
312                 set_current_state (TASK_UNINTERRUPTIBLE);
313                 schedule_timeout (HZ);
314         }
315
316         PORTAL_FREE (ne, sizeof (*ne));
317         PORTAL_MODULE_UNUSE;
318 }
319
320 int
321 kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2)
322 {
323         const int significant_bits = 0x00ffffff;
324         /* We use atomic_t to record/compare route weights for
325          * load-balancing.  Here we limit ourselves to only using
326          * 'significant_bits' when we do an 'after' comparison */
327
328         int    diff = (atomic_read (&ge1->kpge_weight) -
329                        atomic_read (&ge2->kpge_weight)) & significant_bits;
330         int    rc = (diff > (significant_bits >> 1));
331
332         CDEBUG(D_INFO, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n",
333                ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight),
334                rc ? ">" : "<",
335                ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight));
336
337         return (rc);
338 }
339
340 void
341 kpr_update_weight (kpr_gateway_entry_t *ge, int nob)
342 {
343         int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t);
344
345         /* We've chosen this route entry (i.e. gateway) to forward payload
346          * of length 'nob'; update the route's weight to make it less
347          * favoured.  Note that the weight is 1 plus the payload size
348          * rounded and scaled to the portals header size, so we get better
349          * use of the significant bits in kpge_weight. */
350
351         CDEBUG(D_INFO, "gateway [%p]"LPX64" += %d\n", ge,
352                ge->kpge_nid, weight);
353         
354         atomic_add (weight, &ge->kpge_weight);
355 }
356
357 int
358 kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
359                    ptl_nid_t *gateway_nidp)
360 {
361         kpr_nal_entry_t     *ne = (kpr_nal_entry_t *)arg;
362         struct list_head    *e;
363         kpr_route_entry_t   *re;
364         kpr_gateway_entry_t *ge = NULL;
365         int                  rc = -ENOENT;
366
367         /* Caller wants to know if 'target_nid' can be reached via a gateway
368          * ON HER OWN NETWORK */
369
370         CDEBUG (D_INFO, "lookup "LPX64" from NAL %x\n", target_nid, 
371                 ne->kpne_interface.kprni_nalid);
372         LASSERT (!in_interrupt());
373
374         read_lock (&kpr_rwlock);
375
376         if (ne->kpne_shutdown) {        /* caller is shutting down */
377                 read_unlock (&kpr_rwlock);
378                 return (-ENOENT);
379         }
380
381         /* Search routes for one that has a gateway to target_nid on the callers network */
382
383         list_for_each (e, &kpr_routes) {
384                 re = list_entry (e, kpr_route_entry_t, kpre_list);
385
386                 if (re->kpre_lo_nid > target_nid ||
387                     re->kpre_hi_nid < target_nid)
388                         continue;
389
390                 /* found table entry */
391
392                 if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid ||
393                     !re->kpre_gateway->kpge_alive) {
394                         /* different NAL or gateway down */
395                         rc = -EHOSTUNREACH;
396                         continue;
397                 }
398                 
399                 if (ge == NULL ||
400                     kpr_ge_isbetter (re->kpre_gateway, ge))
401                     ge = re->kpre_gateway;
402         }
403
404         if (ge != NULL) {
405                 kpr_update_weight (ge, nob);
406                 *gateway_nidp = ge->kpge_nid;
407                 rc = 0;
408         }
409         
410         read_unlock (&kpr_rwlock);
411
412         /* NB can't deref 're' now; it might have been removed! */
413
414         CDEBUG (D_NET, "lookup "LPX64" from NAL %x: %d ("LPX64")\n",
415                 target_nid, ne->kpne_interface.kprni_nalid, rc,
416                 (rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
417         return (rc);
418 }
419
420 kpr_nal_entry_t *
421 kpr_find_nal_entry_locked (int nal_id)
422 {
423         struct list_head    *e;
424         
425         /* Called with kpr_rwlock held */
426
427         list_for_each (e, &kpr_nals) {
428                 kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list);
429
430                 if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */
431                         continue;
432
433                 return (ne);
434         }
435         
436         return (NULL);
437 }
438
439 void
440 kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
441 {
442         kpr_nal_entry_t     *src_ne = (kpr_nal_entry_t *)arg;
443         ptl_nid_t            target_nid = fwd->kprfd_target_nid;
444         int                  nob = fwd->kprfd_nob;
445         kpr_gateway_entry_t *ge = NULL;
446         kpr_nal_entry_t     *dst_ne = NULL;
447         struct list_head    *e;
448         kpr_route_entry_t   *re;
449         kpr_nal_entry_t     *tmp_ne;
450         int                  rc;
451
452         CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x\n", fwd,
453                 target_nid, src_ne->kpne_interface.kprni_nalid);
454
455         LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov));
456         LASSERT (!in_interrupt());
457
458         read_lock (&kpr_rwlock);
459
460         kpr_fwd_packets++;                   /* (loose) stats accounting */
461         kpr_fwd_bytes += nob + sizeof(ptl_hdr_t);
462
463         if (src_ne->kpne_shutdown) {         /* caller is shutting down */
464                 rc = -ESHUTDOWN;
465                 goto out;
466         }
467
468         fwd->kprfd_router_arg = src_ne;      /* stash caller's nal entry */
469
470         /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
471
472         list_for_each (e, &kpr_routes) {
473                 re = list_entry (e, kpr_route_entry_t, kpre_list);
474
475                 if (re->kpre_lo_nid > target_nid || /* no match */
476                     re->kpre_hi_nid < target_nid)
477                         continue;
478
479                 if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid)
480                         continue;               /* don't route to same NAL */
481
482                 if (!re->kpre_gateway->kpge_alive)
483                         continue;               /* gateway is dead */
484                 
485                 tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid);
486
487                 if (tmp_ne == NULL ||
488                     tmp_ne->kpne_shutdown) {
489                         /* NAL must be registered and not shutting down */
490                         continue;
491                 }
492
493                 if (ge == NULL ||
494                     kpr_ge_isbetter (re->kpre_gateway, ge)) {
495                         ge = re->kpre_gateway;
496                         dst_ne = tmp_ne;
497                 }
498         }
499         
500         if (ge != NULL) {
501                 LASSERT (dst_ne != NULL);
502                 
503                 kpr_update_weight (ge, nob);
504
505                 fwd->kprfd_gateway_nid = ge->kpge_nid;
506                 atomic_inc (&src_ne->kpne_refcount); /* source and dest nals are */
507                 atomic_inc (&dst_ne->kpne_refcount); /* busy until fwd completes */
508                 atomic_inc (&kpr_queue_depth);
509
510                 read_unlock (&kpr_rwlock);
511
512                 CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x: "
513                         "to "LPX64" on NAL %x\n", 
514                         fwd, target_nid, src_ne->kpne_interface.kprni_nalid,
515                         fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
516
517                 dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
518                 return;
519         }
520
521         rc = -EHOSTUNREACH;
522  out:
523         kpr_fwd_errors++;
524
525         CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %x: %d\n", 
526                 fwd, target_nid, src_ne->kpne_interface.kprni_nalid, rc);
527
528         (fwd->kprfd_callback)(fwd->kprfd_callback_arg, rc);
529
530         read_unlock (&kpr_rwlock);
531 }
532
533 void
534 kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
535 {
536         kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
537         kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
538
539         CDEBUG (D_NET, "complete(1) [%p] from NAL %x to NAL %x: %d\n", fwd,
540                 src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
541
542         atomic_dec (&dst_ne->kpne_refcount);    /* CAVEAT EMPTOR dst_ne can disappear now!!! */
543
544         (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
545
546         CDEBUG (D_NET, "complete(2) [%p] from NAL %x: %d\n", fwd,
547                 src_ne->kpne_interface.kprni_nalid, error);
548
549         atomic_dec (&kpr_queue_depth);
550         atomic_dec (&src_ne->kpne_refcount);    /* CAVEAT EMPTOR src_ne can disappear now!!! */
551 }
552
553 int
554 kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, 
555                ptl_nid_t lo_nid, ptl_nid_t hi_nid)
556 {
557         unsigned long        flags;
558         struct list_head    *e;
559         kpr_route_entry_t   *re;
560         kpr_gateway_entry_t *ge;
561         int                  dup = 0;
562
563         CDEBUG(D_NET, "Add route: %x "LPX64" : "LPX64" - "LPX64"\n",
564                gateway_nalid, gateway_nid, lo_nid, hi_nid);
565
566         if (gateway_nalid == PTL_NID_ANY ||
567             lo_nid == PTL_NID_ANY ||
568             hi_nid == PTL_NID_ANY ||
569             lo_nid > hi_nid)
570                 return (-EINVAL);
571
572         PORTAL_ALLOC (ge, sizeof (*ge));
573         if (ge == NULL)
574                 return (-ENOMEM);
575
576         ge->kpge_nalid = gateway_nalid;
577         ge->kpge_nid   = gateway_nid;
578         ge->kpge_alive = 1;
579         ge->kpge_timestamp = 0;
580         ge->kpge_refcount = 0;
581         atomic_set (&ge->kpge_weight, 0);
582
583         PORTAL_ALLOC (re, sizeof (*re));
584         if (re == NULL) {
585                 PORTAL_FREE (ge, sizeof (*ge));
586                 return (-ENOMEM);
587         }
588
589         re->kpre_lo_nid = lo_nid;
590         re->kpre_hi_nid = hi_nid;
591
592         LASSERT(!in_interrupt());
593         write_lock_irqsave (&kpr_rwlock, flags);
594
595         list_for_each (e, &kpr_gateways) {
596                 kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
597                                                       kpge_list);
598
599                 if (ge2->kpge_nalid == gateway_nalid &&
600                     ge2->kpge_nid == gateway_nid) {
601                         PORTAL_FREE (ge, sizeof (*ge));
602                         ge = ge2;
603                         dup = 1;
604                         break;
605                 }
606         }
607
608         if (!dup) {
609                 /* Adding a new gateway... */
610                 list_add (&ge->kpge_list, &kpr_gateways);
611
612                 /* ...zero all gateway weights so this one doesn't have to
613                  * play catch-up */
614
615                 list_for_each (e, &kpr_gateways) {
616                         kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
617                                                               kpge_list);
618                         atomic_set (&ge2->kpge_weight, 0);
619                 }
620         }
621
622         re->kpre_gateway = ge;
623         ge->kpge_refcount++;
624         list_add (&re->kpre_list, &kpr_routes);
625         kpr_routes_generation++;
626
627         write_unlock_irqrestore (&kpr_rwlock, flags);
628         return (0);
629 }
630
631 int
632 kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid,
633                 int alive, time_t when)
634 {
635         return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when));
636 }
637
638 int
639 kpr_del_route (int gw_nalid, ptl_nid_t gw_nid,
640                ptl_nid_t lo, ptl_nid_t hi)
641 {
642         int                specific = (lo != PTL_NID_ANY);
643         unsigned long      flags;
644         int                rc = -ENOENT;
645         struct list_head  *e;
646         struct list_head  *n;
647
648         CDEBUG(D_NET, "Del route [%x] "LPX64" : "LPX64" - "LPX64"\n",
649                gw_nalid, gw_nid, lo, hi);
650
651         LASSERT(!in_interrupt());
652
653         /* NB Caller may specify either all routes via the given gateway
654          * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are
655          * actual NIDs) */
656         if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY))
657                 return (-EINVAL);
658
659         write_lock_irqsave(&kpr_rwlock, flags);
660
661         list_for_each_safe (e, n, &kpr_routes) {
662                 kpr_route_entry_t   *re = list_entry(e, kpr_route_entry_t,
663                                                    kpre_list);
664                 kpr_gateway_entry_t *ge = re->kpre_gateway;
665
666                 if (ge->kpge_nalid != gw_nalid ||
667                     ge->kpge_nid != gw_nid ||
668                     (specific &&
669                      (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid)))
670                         continue;
671
672                 rc = 0;
673
674                 if (--ge->kpge_refcount == 0) {
675                         list_del (&ge->kpge_list);
676                         PORTAL_FREE (ge, sizeof (*ge));
677                 }
678
679                 list_del (&re->kpre_list);
680                 PORTAL_FREE(re, sizeof (*re));
681
682                 if (specific)
683                         break;
684         }
685
686         kpr_routes_generation++;
687         write_unlock_irqrestore(&kpr_rwlock, flags);
688
689         return (rc);
690 }
691
692 int
693 kpr_get_route (int idx, __u32 *gateway_nalid, ptl_nid_t *gateway_nid,
694                ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, __u32 *alive)
695 {
696         struct list_head  *e;
697
698         LASSERT (!in_interrupt());
699         read_lock(&kpr_rwlock);
700
701         for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
702                 kpr_route_entry_t   *re = list_entry(e, kpr_route_entry_t,
703                                                      kpre_list);
704                 kpr_gateway_entry_t *ge = re->kpre_gateway;
705                 
706                 if (idx-- == 0) {
707                         *gateway_nalid = ge->kpge_nalid;
708                         *gateway_nid = ge->kpge_nid;
709                         *alive = ge->kpge_alive;
710                         *lo_nid = re->kpre_lo_nid;
711                         *hi_nid = re->kpre_hi_nid;
712
713                         read_unlock(&kpr_rwlock);
714                         return (0);
715                 }
716         }
717
718         read_unlock (&kpr_rwlock);
719         return (-ENOENT);
720 }
721
722 static int 
723 kpr_nal_cmd(struct portals_cfg *pcfg, void * private)
724 {
725         int err = -EINVAL;
726         ENTRY;
727
728         switch(pcfg->pcfg_command) {
729         default:
730                 CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command);
731                 break;
732                 
733         case NAL_CMD_ADD_ROUTE:
734                 CDEBUG(D_IOCTL, "Adding route: [%x] "LPU64" : "LPU64" - "LPU64"\n",
735                        pcfg->pcfg_nal, pcfg->pcfg_nid, 
736                        pcfg->pcfg_nid2, pcfg->pcfg_nid3);
737                 err = kpr_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
738                                     pcfg->pcfg_nid2, pcfg->pcfg_nid3);
739                 break;
740
741         case NAL_CMD_DEL_ROUTE:
742                 CDEBUG (D_IOCTL, "Removing routes via [%x] "LPU64" : "LPU64" - "LPU64"\n",
743                         pcfg->pcfg_gw_nal, pcfg->pcfg_nid, 
744                         pcfg->pcfg_nid2, pcfg->pcfg_nid3);
745                 err = kpr_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
746                                      pcfg->pcfg_nid2, pcfg->pcfg_nid3);
747                 break;
748
749         case NAL_CMD_NOTIFY_ROUTER: {
750                 CDEBUG (D_IOCTL, "Notifying peer [%x] "LPU64" %s @ %ld\n",
751                         pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
752                         pcfg->pcfg_flags ? "Enabling" : "Disabling",
753                         (time_t)pcfg->pcfg_nid3);
754                 
755                 err = kpr_sys_notify (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
756                                       pcfg->pcfg_flags, (time_t)pcfg->pcfg_nid3);
757                 break;
758         }
759                 
760         case NAL_CMD_GET_ROUTE:
761                 CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count);
762                 err = kpr_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal,
763                                     &pcfg->pcfg_nid, 
764                                     &pcfg->pcfg_nid2, &pcfg->pcfg_nid3,
765                                     &pcfg->pcfg_flags);
766                 break;
767         }
768         RETURN(err);
769 }
770
771
772 static void /*__exit*/
773 kpr_finalise (void)
774 {
775         LASSERT (list_empty (&kpr_nals));
776
777         libcfs_nal_cmd_unregister(ROUTER);
778
779         PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
780
781         kpr_proc_fini();
782
783         while (!list_empty (&kpr_routes)) {
784                 kpr_route_entry_t *re = list_entry(kpr_routes.next,
785                                                    kpr_route_entry_t,
786                                                    kpre_list);
787
788                 list_del(&re->kpre_list);
789                 PORTAL_FREE(re, sizeof (*re));
790         }
791
792         CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
793                atomic_read(&portal_kmemory));
794 }
795
796 static int __init
797 kpr_initialise (void)
798 {
799         int     rc;
800         
801         CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
802                atomic_read(&portal_kmemory));
803
804         kpr_routes_generation = 0;
805         kpr_proc_init();
806
807         rc = libcfs_nal_cmd_register(ROUTER, kpr_nal_cmd, NULL);
808         if (rc != 0) {
809                 CERROR("Can't register nal cmd handler\n");
810                 return (rc);
811         }
812         
813         PORTAL_SYMBOL_REGISTER(kpr_router_interface);
814         return (0);
815 }
816
/* Module metadata */
MODULE_AUTHOR("Eric Barton");
MODULE_DESCRIPTION("Kernel Portals Router v0.01");
MODULE_LICENSE("GPL");

/* Module load and unload entry points */
module_init (kpr_initialise);
module_exit (kpr_finalise);

/* Export the router's function table symbol */
EXPORT_SYMBOL (kpr_router_interface);