LU-1346 gnilnd: remove libcfs abstractions
[fs/lustre-release.git] lnet/klnds/gnilnd/gnilnd_stack.c
/*
 * Copyright (C) 2012 Cray, Inc.
 *
 *   Author: Nic Henke <nic@cray.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
#include "gnilnd.h"
#include <rsms/rs_sm_states.h>

/* Advance all timeouts by nap_time seconds. */
void
kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
{
        int                     i, j;
        kgn_peer_t             *peer;
        kgn_conn_t             *conn;
        kgn_tx_t               *tx;
        kgn_device_t           *dev;
        kgn_dgram_t            *dgram;

        LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);

        LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
                 atomic_read(&kgnilnd_data.kgn_nquiesce),
                 atomic_read(&kgnilnd_data.kgn_nthreads));

        /* requiring that the threads are paused ensures a couple of things:
         * - combined code paths for stack reset and quiesce event as stack reset
         *   runs with the threads paused
         * - prevents traffic to the Gemini during a quiesce period
         * - reduces the locking requirements
         */

        for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {

                        /* we can reconnect again at any time */
                        peer->gnp_reconnect_time = jiffies;
                        /* reset now that network is healthy */
                        peer->gnp_reconnect_interval = 0;
                        /* tell LNet dude is still alive */
                        kgnilnd_peer_alive(peer);

                        list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
                                tx->tx_qtime = jiffies;
                        }

                        list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
                                unsigned long           timeout;

                                timeout = cfs_time_seconds(conn->gnc_timeout);

                                /* bump last_rx/last_rx_cq on all conns - including
                                 * closed ones, this will have the effect of
                                 * bumping the purgatory timers for those */
                                conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;

                                /* we don't timeout based on old gnc_last_tx, so
                                 * we'll back it up and schedule the conn to trigger
                                 * a NOOP */
                                conn->gnc_last_tx = jiffies - timeout;
                                if (conn->gnc_state != GNILND_CONN_DONE)
                                        kgnilnd_schedule_conn(conn);
                        }
                }
        }

        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                dev = &kgnilnd_data.kgn_devices[i];
                for (j = 0; j < (*kgnilnd_tunables.kgn_peer_hash_size - 1); j++) {
                        list_for_each_entry(dgram, &dev->gnd_dgrams[j], gndg_list) {
                                dgram->gndg_post_time = jiffies;
                        }
                }
        }
}

/* Quiesce or wake up the stack.  The caller must hold the kgn_quiesce_sem
 * semaphore on entry, which holds off any pending stack shutdown. */
void
kgnilnd_quiesce_wait(char *reason)
{
        int             i;

        if (kgnilnd_data.kgn_quiesce_trigger) {
                unsigned long   quiesce_deadline, quiesce_to;
                /* FREEZE TAG!!!! */

                /* morning sunshine */
                spin_lock(&kgnilnd_data.kgn_reaper_lock);
                wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
                spin_unlock(&kgnilnd_data.kgn_reaper_lock);

                for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                        kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];

                        wake_up_all(&dev->gnd_waitq);
                        wake_up_all(&dev->gnd_dgram_waitq);
                        wake_up_all(&dev->gnd_dgping_waitq);
                }

                kgnilnd_wakeup_rca_thread();

                /* we'll wait for 10x the timeout for the threads to pause */
                quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
                quiesce_deadline = (long) jiffies + quiesce_to;

                /* wait for everyone to check-in as quiesced */
                i = 1;
                while (!GNILND_IS_QUIESCED) {
                        i++;
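                        /* (i & -i) == i only when i is a power of two, so the
                         * message below is promoted to D_WARNING on an
                         * exponentially backing-off cadence and otherwise
                         * logged at the quieter D_NET level */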
                        LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
                                 "%s: Waiting for %d threads to pause\n",
                                 reason,
                                 atomic_read(&kgnilnd_data.kgn_nthreads) -
                                 atomic_read(&kgnilnd_data.kgn_nquiesce));
                        CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
                        cfs_pause(cfs_time_seconds(1 * i));

                        LASSERTF(quiesce_deadline > jiffies,
                                 "couldn't quiesce threads in %lu seconds, falling over now\n",
                                 cfs_duration_sec(quiesce_to));
                }

                LCONSOLE_WARN("%s: All threads paused!\n", reason);
                /* XXX Nic: Is there a set of counters we can grab here to
                 * ensure that there is no traffic until quiesce is over? */
        } else {
                /* GO! GO! GO! */

                for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                        kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
                        kgnilnd_schedule_dgram(dev);
                }

                /* wait for everyone to check-in as running - they will be spinning
                 * and looking, so no need to poke any waitq */
                i = 1;
                while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
                        i++;
                        LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
                                 "%s: Waiting for %d threads to wake up\n",
                                 reason,
                                 atomic_read(&kgnilnd_data.kgn_nquiesce));
                        cfs_pause(cfs_time_seconds(1 * i));
                }

                LCONSOLE_WARN("%s: All threads awake!\n", reason);
        }
}

/* Reset the stack. */
void
kgnilnd_reset_stack(void)
{
        int              i, rc = 0;
        kgn_net_t       *net;
        kgn_peer_t      *peer, *peerN;
        LIST_HEAD        (souls);
        char            *reason = "critical hardware error";
        __u32            seconds;
        unsigned long    start, end;
        ENTRY;

        /* Race with del_peer and its atomics */
        CFS_RACE(CFS_FAIL_GNI_RACE_RESET);

        if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
                CERROR("can't reset the stack, gnilnd is not initialized\n");
                RETURN_EXIT;
        }

        /* First make sure we are not already quiesced - we panic if so,
         * as that could leave software in a bad state */
        LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
                 "can't reset the stack, already doing so: trigger %d\n",
                 kgnilnd_data.kgn_quiesce_trigger);
        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);

        /* wake up the dgram waitq thread - but after trigger set to make sure it
         * goes into quiesce */
        CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
        /* same for scheduler that is dropping state transitions */
        CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
        CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);

        kgnilnd_quiesce_wait(reason);

        start = jiffies;

        kgnilnd_data.kgn_in_reset = 1;
        kgnilnd_data.kgn_nresets++;
        LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
                      reason, kgnilnd_data.kgn_nresets);

        for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
                list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
                        rc = kgnilnd_cancel_net_dgrams(net);
                        LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
                }
        }

        /* error -ENOTRECOVERABLE is stack reset */
        kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);

        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
                kgnilnd_cancel_wc_dgrams(dev);
                kgnilnd_wait_for_canceled_dgrams(dev);
        }

        /* manually do some conn processing ala kgnilnd_process_conns */
        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
                kgn_conn_t      *conn;
                int              conn_sched;

                /* go find all the closed conns that need to be nuked - the
                 * scheduler thread isn't running to do this for us */

                CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
                        kgnilnd_count_list(&dev->gnd_ready_conns));

                /* use while/list_first_entry loop to ensure we can handle any
                 * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
                while (!list_empty(&dev->gnd_ready_conns)) {
                        conn = list_first_entry(&dev->gnd_ready_conns,
                                                kgn_conn_t, gnc_schedlist);
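                        /* atomically claim the conn for processing; xchg returns
                         * the previous scheduling state for the sanity check
                         * just below */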
                        conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);

                        LASSERTF(conn_sched != GNILND_CONN_IDLE &&
                                 conn_sched != GNILND_CONN_PROCESS,
                                 "conn %p on ready list but in bad state: %d\n",
                                 conn, conn_sched);

                        list_del_init(&conn->gnc_schedlist);

                        if (conn->gnc_state == GNILND_CONN_CLOSING) {
                                /* bump to CLOSED to fake out send of CLOSE */
                                conn->gnc_state = GNILND_CONN_CLOSED;
                                conn->gnc_close_sent = 1;
                        }

                        if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
                                kgnilnd_destroy_conn_ep(conn);
                        } else {
                                kgnilnd_complete_closed_conn(conn);
                        }

                        /* there really shouldn't be any other states here -
                         * they would have been cleared out in the del_peer_or_conn or the dgram
                         * aborts above.
                         * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
                         * care of catching anything else for us */

                        kgnilnd_schedule_process_conn(conn, -1);

                        kgnilnd_conn_decref(conn);
                }
        }

        /* don't let the little weaselly purgatory conns hide from us */
        for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
                        kgn_conn_t       *conn, *connN;

                        list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
                                kgnilnd_detach_purgatory_locked(conn, &souls);
                        }
                }
        }

        CDEBUG(D_NET, "about to release %d purgatory entries\n",
                kgnilnd_count_list(&souls));

        kgnilnd_release_purgatory_list(&souls);

        /* validate we are now clean */
        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];

                /* now all the conns/mboxes should be cleaned up, including purgatory
                 * so go through and release the MDDs for our persistent PHYS fma_blks
                 */
                kgnilnd_unmap_phys_fmablk(dev);

                LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
                        "reset failed: fma blocks still live %d\n",
                        atomic_read(&dev->gnd_nfmablk));

                LASSERTF(atomic_read(&dev->gnd_neps) == 0,
                        "reset failed: EP handles still live %d\n",
                        atomic_read(&dev->gnd_neps));
        }

        LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
                "reset failed: conns left %d\n",
                atomic_read(&kgnilnd_data.kgn_nconns));

        /* fine to have peers left - they are waiting for new conns
         * but should not be holding any open HW resources */

        /* like the last part of kgnilnd_base_shutdown() */

        CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);

        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
        }

        /* no need to free and recreate the TX descriptors
         * we nuked all the ones that could be using HW resources in
         * kgnilnd_close_matching_conns and asserted it worked in
         * kgnilnd_dev_fini */

        /* At this point, all HW is torn down, start to reset */

        /* only reset our known devs */
        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
                rc = kgnilnd_dev_init(dev);
                LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
                rc = kgnilnd_map_phys_fmablk(dev);
                LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
                rc = kgnilnd_setup_wildcard_dgram(dev);
                LASSERTF(rc == 0, "couldn't setup datagrams on dev %d: %d\n",
                        i, rc);
        }

        /* Now the fun restarts... - release the hounds! */

        end = jiffies;
        seconds = cfs_duration_sec((long)end - start);
        kgnilnd_bump_timeouts(seconds, reason);

        kgnilnd_data.kgn_in_reset = 0;
        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
        kgnilnd_quiesce_wait(reason);
        LCONSOLE_WARN("%s reset of all hardware resources\n",
                rc ? "failed" : "successful");

        RETURN_EXIT;
}

/* A thread that handles quiesce and reset hardware events.
 * We do the same thing regardless of which device reported the event. */
int
kgnilnd_ruhroh_thread(void *arg)
{
        int                i = 1;
        DEFINE_WAIT(wait);

        cfs_block_allsigs();
        set_user_nice(current, *kgnilnd_tunables.kgn_nice);
        kgnilnd_data.kgn_ruhroh_running = 1;

        while (1) {

                /* Block until there's a request.  A reset request could come in
                 * while we're handling a quiesce one, or vice versa.
                 * Keep processing requests until there are none. */
                prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
                while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
                                kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
                        schedule();
                finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);

                /* Exit if the driver is shutting down. */
                if (kgnilnd_data.kgn_ruhroh_shutdown)
                        break;

                /* Serialize with driver startup and shutdown. */
                down(&kgnilnd_data.kgn_quiesce_sem);

                CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
                        kgnilnd_data.kgn_quiesce_trigger,
                        kgnilnd_data.kgn_needs_reset,
                        kgnilnd_data.kgn_bump_info_rdy,
                        kgnilnd_data.kgn_needs_pause);

                /* Do we need to do a pause/quiesce? */
                if (kgnilnd_data.kgn_needs_pause) {

                        /* Pause all other kgnilnd threads. */
                        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
                        kgnilnd_quiesce_wait("hardware quiesce flag");

                        /* If the hardware quiesce flag is set, wait for it to clear.
                         * This should happen relatively quickly, so we wait for it.
                         * This will hold up the eventd thread, but on everything but
                         * the simulator, this is ok-- there is one thread per core.
                         *
                         * Handle (possibly multiple) quiesce events while we wait. The
                         * memory barrier ensures that the core doesn't start fetching
                         * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
                         * matches the second mb in kgnilnd_quiesce_end_callback(). */
                        smp_rmb();
                        while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {

                                i++;
                                LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
                                                "Waiting for hardware quiesce flag to clear\n");
                                cfs_pause(cfs_time_seconds(1 * i));

                                /* If we got a quiesce event with bump info, DO THE BUMP! */
                                if (kgnilnd_data.kgn_bump_info_rdy) {
                                        /* reset console rate limiting for each event */
                                        i = 1;

                                        /* Make sure the core doesn't start fetching
                                         * kgn_quiesce_secs until after it sees
                                         * kgn_bump_info_rdy set.  This is the match to the
                                         * first mb in kgnilnd_quiesce_end_callback(). */
                                        smp_rmb();
                                        kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
                                                              "hardware quiesce callback");
                                        set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
                                        set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
                                }
                        }

                        /* Reset the kgn_needs_pause flag before coming out of
                         * the pause.  This ordering avoids a race with the
                         * setting of this flag in kgnilnd_pause_threads(). */
                        set_mb(kgnilnd_data.kgn_needs_pause, 0);

                        /* ok, let the kids back into the pool */
                        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
                        kgnilnd_quiesce_wait("hardware quiesce");
                }

                /* Do a stack reset if needed. */
                if (kgnilnd_data.kgn_needs_reset) {
                        kgnilnd_reset_stack();
                        set_mb(kgnilnd_data.kgn_needs_reset, 0);
                }

                up(&kgnilnd_data.kgn_quiesce_sem);
        }

        kgnilnd_data.kgn_ruhroh_running = 0;
        return 0;
}

/* Set pause request flag.  Any functions that
 * call this one are responsible for ensuring that
 * variables they set up are visible on other cores before
 * this flag setting.  This executes in interrupt or kernel
 * thread context. */
void
kgnilnd_pause_threads(void)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
        LASSERTF(dev != NULL, "dev 0 is NULL\n");

        /* If we're currently in a pause triggered by the pause flag,
         * there's no need to set it again.  We clear the kgn_needs_pause
         * flag before we reset kgn_quiesce_trigger to avoid a race.  The
         * read memory barrier matches the set_mb() on the trigger in
         * kgnilnd_ruhroh_thread(). */
        smp_rmb();
        if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
                        GNILND_IS_QUIESCED)) {
                CDEBUG(D_NET, "requesting thread pause\n");

                kgnilnd_data.kgn_needs_pause = 1;

                wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
        } else {
                CDEBUG(D_NET, "thread pause already underway\n");
        }
}

/* Return non-zero if the GNI hardware quiesce flag is set */
int
kgnilnd_hw_in_quiesce(void)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t      *dev0 = &kgnilnd_data.kgn_devices[0];

        LASSERTF(dev0 != NULL, "dev 0 is NULL\n");

        smp_rmb();
        return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
}

/* If the GNI hardware quiesce flag is set, initiate our pause and
 * return non-zero.  Also return non-zero if the stack is shutting down. */
int
kgnilnd_check_hw_quiesce(void)
{
        if (likely(!kgnilnd_hw_in_quiesce()))
                return 0;

        if (!kgnilnd_data.kgn_ruhroh_shutdown) {
                CDEBUG(D_NET, "initiating thread pause\n");
                kgnilnd_pause_threads();
        } else {
                CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
        }

        return 1;
}

/* Callback from kgni with the quiesce duration.  This executes
 * in interrupt context. */
void
kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
        LASSERTF(dev != NULL, "dev 0 is NULL\n");

        if (!kgnilnd_data.kgn_ruhroh_shutdown) {

                CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);

                /* Save the bump interval and request the bump.
                 * The memory barrier ensures that the interval is in place before
                 * the bump flag can be seen (in case a core is already running the
                 * ruhroh task), and that the bump request flag is in place before
                 * the pause request can be seen (to ensure a core doesn't miss the
                 * bump request flag). */
                /* If another callback occurred before the ruhroh task
                 * finished processing the first bump request, we'd over-write its info.
                 * Nic says that callbacks occur so slowly that this isn't an issue. */
                set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
                set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
                kgnilnd_pause_threads();
        } else {
                CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
        }
}

void
kgnilnd_critical_error(struct gni_err *err_handle)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
        LASSERTF(dev != NULL, "dev 0 is NULL\n");

        if (!kgnilnd_data.kgn_ruhroh_shutdown) {
                CDEBUG(D_NET, "requesting stack reset\n");
                kgnilnd_data.kgn_needs_reset = 1;
                wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
        } else {
                CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
        }
}

#if defined(GNILND_USE_RCA)
#include <krca_lib.h>
#define RCA_EVENTS 3
/* RCA ticket is needed for krca_wakeup_wait_event() */
static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
struct rcadata {
        rca_ticket_t ticket;
        int subscribed;
        rs_event_code_t ec;
};
static struct rcadata rd[RCA_EVENTS] = {
        {0, 0, ec_node_unavailable},
        {0, 0, ec_node_available},
        {0, 0, ec_node_failed}
};

/* thread for receiving rca events */
int
kgnilnd_rca(void *arg)
{
        int        i, rc;
        int        retry_count;
        rs_event_t event;
        lnet_nid_t nid;

        cfs_block_allsigs();

        /* all gnilnd threads need to run fairly urgently */
        set_user_nice(current, *kgnilnd_tunables.kgn_nice);

        /*
         * Register our service with RCA and subscribe to events
         * of interest.
         */
        rca_krt = KRCA_NULL_TICKET;
        rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
        if (rc < 0) {
                CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
                goto done;
        }

        for (i = 0; i < RCA_EVENTS; i++) {
                retry_count = 0;
subscribe_retry:
                rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
                                    &rd[i].ticket);

                if ((rc == -EINTR) && !retry_count) {
                        retry_count++;
                        CNETERR("krca_subscribe returned %d - retrying\n", rc);
                        goto subscribe_retry;
                }

                if (rc < 0) {
                        CNETERR("rca subscription failed (%d)\n", rc);
                        goto done;
                }

                rd[i].subscribed = 1;
        }

        while (!kgnilnd_data.kgn_shutdown) {
                if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
                        KGNILND_SPIN_QUIESCE;
                }
                /* wait here for a subscribed event */
                rc = krca_wait_event(&rca_krt);

                /* RCA return values:
                 * 0 indicates krca_wakeup_wait_event caused krca_wait_event
                 *   return.
                 * -ERESTARTSYS indicates krca_wait_event returned because of a
                 *   signal.
                 * -ENOSPC indicates no space available to create an rcad_reg_t
                 * 1 indicates a message is waiting.
                 */
                if (rc <= 0) {
                        continue;
                }

                if (krca_get_message(&rca_krt, &event) == 0) {
                        int node_down = GNILND_RCA_NODE_UNKNOWN;
                        rs_state_t state;
                        LIST_HEAD(zombies);

                        /* Compute nodes don't care about other compute nodes
                         * so we don't need to create a peer.
                         */
                        if (GNILND_COMPUTE &&
                            !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                        IS_SVC)) {
                                continue;
                        }

                        /* Only care about compute and service nodes not GPUs */
                        if (RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                        TYPE) != rt_node) {
                                continue;
                        }

                        switch (event.ev_id) {
                        case ec_node_available:
                                CDEBUG(D_INFO, "ec_node_available\n");
                                node_down = GNILND_RCA_NODE_UP;
                                break;
                        case ec_node_failed:
                                CDEBUG(D_INFO, "ec_node_failed\n");
                                if (event.ev_len > 0) {
                                        CDEBUG(D_ERROR,
                                                "ec_node_failed ignored\n");
                                        break;
                                }
                                node_down = GNILND_RCA_NODE_DOWN;
                                break;
                        case ec_node_unavailable:
                                state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);

                                CDEBUG(D_INFO, "ec_node_unavailable\n");

                                /*
                                 * Ignore overloaded ec_node_unavailable events
                                 * generated by 'xtcli set_reserve'.
                                 */
                                if (RS_GET_CS_STATE(state) == RS_CS_READY) {
                                        CDEBUG(D_INFO, "ignoring "
                                                "ec_node_unavailable event with"
                                                " RS_CS_READY state\n");
                                        break;
                                }
                                node_down = GNILND_RCA_NODE_DOWN;
                                break;
                        default:
                                CDEBUG(D_INFO, "unknown event\n");
                                break;
                        }

                        /* if we get an event we don't know about, just go ahead
                         * and wait for another event */
                        if (node_down == GNILND_RCA_NODE_UNKNOWN) {
                                continue;
                        }

                        nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                          NID);
                        CDEBUG(D_INFO, "kgnilnd_rca() reporting nid %d %s\n",
                               (int)nid, node_down ? "down" : "up");
                        kgnilnd_report_node_state(nid, node_down);

                } else {
                        CNETERR("krca_get_message failed\n");
                }
        }

done:
        CDEBUG(D_INFO, "done\n");

        for (i = 0; i < RCA_EVENTS; i++) {
                if (rd[i].subscribed) {
                        rc = krca_unsubscribe(&rca_krt, rd[i].ticket);

                        if (rc) {
                                CNETERR("rca unsubscribe failed (%d)\n", rc);
                        }

                        rd[i].subscribed = 0;
                }
        }

        krca_unregister(&rca_krt);
        kgnilnd_thread_fini();
        return 0;
}

int
kgnilnd_start_rca_thread(void)
{
        return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
}

void
kgnilnd_wakeup_rca_thread(void)
{
        int ret;

        ret = krca_wakeup_wait_event(&rca_krt);

        if (ret) {
                CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
        }
}

#else /* GNILND_USE_RCA */

int
kgnilnd_start_rca_thread(void)
{
        return 0;
}

void
kgnilnd_wakeup_rca_thread(void)
{
}

#endif /* GNILND_USE_RCA */