Whamcloud - gitweb
LU-6210 lnet: Change positional struct initializers to C99
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_stack.c
1 /*
2  * Copyright (C) 2012 Cray, Inc.
3  *
4  * Copyright (c) 2014, Intel Corporation.
5  *
6  *   Author: Nic Henke <nic@cray.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  *
23  */
24 #include "gnilnd.h"
25 #if defined(GNILND_USE_RCA)
26 #include <rsms/rs_sm_states.h>
27 #endif
28 /* Advance all timeouts by nap_time seconds. */
29 void
30 kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
31 {
32         int                     i;
33         kgn_peer_t             *peer;
34         kgn_conn_t             *conn;
35         kgn_tx_t               *tx;
36         kgn_device_t           *dev;
37         kgn_dgram_t            *dgram;
38
39         CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time);
40
41         LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
42                  atomic_read(&kgnilnd_data.kgn_nquiesce),
43                  atomic_read(&kgnilnd_data.kgn_nthreads));
44
45         /* requiring that the threads are paused ensures a couple of things:
46          * - combined code paths for stack reset and quiesce event as stack reset
47          *   runs with the threads paused
48          * - prevents traffic to the Gemini during a quiesce period
49          * - reduces the locking requirements
50         */
51
52         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
53                 list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
54
55                         /* we can reconnect again at any time */
56                         peer->gnp_reconnect_time = jiffies;
57                         /* reset now that network is healthy */
58                         peer->gnp_reconnect_interval = 0;
59                         /* tell LNet dude is still alive */
60                         kgnilnd_peer_alive(peer);
61                         kgnilnd_peer_notify(peer, 0, 1);
62
63                         list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
64                                 tx->tx_qtime = jiffies;
65                         }
66
67                         list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
68                                 unsigned long           timeout;
69
70                                 timeout = cfs_time_seconds(conn->gnc_timeout);
71
72                                 /* bump last_rx/last_rx_cq on all conns - including
73                                  * closed ones, this will have the effect of
74                                  * bumping the purgatory timers for those */
75                                 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
76
77                                 /* we don't timeout based on old gnc_last_tx, so
78                                  * we'll back it up and schedule the conn to trigger
79                                  * a NOOP */
80                                 conn->gnc_last_tx = jiffies - timeout;
81                                 if (conn->gnc_state != GNILND_CONN_DONE)
82                                 kgnilnd_schedule_conn(conn);
83                         }
84                 }
85         }
86
87         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
88                 dev = &kgnilnd_data.kgn_devices[i];
89                 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
90                         list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) {
91                                 dgram->gndg_post_time = jiffies;
92                         }
93                 }
94         }
95 }
96
97 /* Quiesce or wake up the stack.  The caller must hold the kgn_quiesce_sem semaphore
98  * on entry, which holds off any pending stack shutdown.   */
99 void
100 kgnilnd_quiesce_wait(char *reason)
101 {
102         int             i;
103
104         if (kgnilnd_data.kgn_quiesce_trigger) {
105                 unsigned long   quiesce_deadline, quiesce_to;
106                 /* FREEZE TAG!!!! */
107
108                 /* morning sunshine */
109                 spin_lock(&kgnilnd_data.kgn_reaper_lock);
110                 wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
111                 spin_unlock(&kgnilnd_data.kgn_reaper_lock);
112
113                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
114                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
115
116                         wake_up_all(&dev->gnd_waitq);
117                         wake_up_all(&dev->gnd_dgram_waitq);
118                         wake_up_all(&dev->gnd_dgping_waitq);
119                 }
120
121                 kgnilnd_wakeup_rca_thread();
122
123                 /* we'll wait for 10x the timeout for the threads to pause */
124                 quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
125                 quiesce_deadline = (long) jiffies + quiesce_to;
126
127                 LCONSOLE_INFO("Quiesce start: %s\n", reason);
128                 /* wait for everyone to check-in as quiesced */
129                 while (!GNILND_IS_QUIESCED) {
130                         CDEBUG(D_INFO,
131                                  "%s: Waiting for %d threads to pause\n",
132                                  reason,
133                                  atomic_read(&kgnilnd_data.kgn_nthreads) -
134                                  atomic_read(&kgnilnd_data.kgn_nquiesce));
135                         CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
136                         set_current_state(TASK_UNINTERRUPTIBLE);
137                         schedule_timeout(cfs_time_seconds(1 * i));
138
139                         LASSERTF(quiesce_deadline > jiffies,
140                                  "couldn't quiesce threads in %lu seconds, falling over now\n",
141                                  cfs_duration_sec(quiesce_to));
142                 }
143
144                 CDEBUG(D_INFO, "%s: All threads paused!\n", reason);
145                 /* XXX Nic: Is there a set of counters we can grab here to
146                  * ensure that there is no traffic until quiesce is over ?*/
147         } else {
148                 LCONSOLE_INFO("Quiesce complete: %s\n", reason);
149
150                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
151                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
152                         kgnilnd_schedule_dgram(dev);
153                 }
154
155                 /* wait for everyone to check-in as running - they will be spinning
156                  * and looking, so no need to poke any waitq */
157                 while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
158                         CDEBUG(D_INFO,
159                                  "%s: Waiting for %d threads to wake up\n",
160                                   reason,
161                                   atomic_read(&kgnilnd_data.kgn_nquiesce));
162                         set_current_state(TASK_UNINTERRUPTIBLE);
163                         schedule_timeout(cfs_time_seconds(1 * i));
164                 }
165
166                 CDEBUG(D_INFO, "%s: All threads awake!\n", reason);
167         }
168 }
169
170 /* Reset the stack.  */
171 void
172 kgnilnd_reset_stack(void)
173 {
174         int              i, rc = 0;
175         kgn_net_t       *net;
176         kgn_peer_t      *peer, *peerN;
177         LIST_HEAD        (souls);
178         char            *reason = "critical hardware error";
179         __u32            seconds;
180         unsigned long    start, end;
181         ENTRY;
182
183         /* Race with del_peer and its atomics */
184         CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
185
186         if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
187                 CERROR("can't reset the stack, gnilnd is not initialized\n");
188                 RETURN_EXIT;
189         }
190
191         /* First make sure we are not already quiesced - we panic if so,
192          * as that could leave software in a bad state */
193         LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
194                 "can't reset the stack, already doing so: trigger %d\n",
195                  kgnilnd_data.kgn_quiesce_trigger);
196
197         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
198
199         /* wake up the dgram waitq thread - but after trigger set to make sure it
200          * goes into quiesce */
201         CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
202         /* same for scheduler that is dropping state transitiosn */
203         CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
204         CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
205
206         kgnilnd_quiesce_wait(reason);
207
208         start = jiffies;
209
210         kgnilnd_data.kgn_in_reset = 1;
211         kgnilnd_data.kgn_nresets++;
212         LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
213                       reason, kgnilnd_data.kgn_nresets);
214
215         for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
216                 list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
217                         rc = kgnilnd_cancel_net_dgrams(net);
218                         LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
219                 }
220         }
221
222         /* error -ENOTRECOVERABLE is stack reset */
223         kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
224
225         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
226                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
227                 kgnilnd_cancel_wc_dgrams(dev);
228                 kgnilnd_wait_for_canceled_dgrams(dev);
229         }
230
231         /* manually do some conn processing ala kgnilnd_process_conns */
232         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
233                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
234                 kgn_conn_t      *conn;
235                 int              conn_sched;
236
237                 /* go find all the closed conns that need to be nuked - the
238                  * scheduler thread isn't running to do this for us */
239
240                 CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
241                         kgnilnd_count_list(&dev->gnd_ready_conns));
242
243                 /* use while/list_first_entry loop to ensure we can handle any
244                  * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
245                 while (!list_empty(&dev->gnd_ready_conns)) {
246                         conn = list_first_entry(&dev->gnd_ready_conns,
247                                                 kgn_conn_t, gnc_schedlist);
248                         conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
249
250                         LASSERTF(conn_sched != GNILND_CONN_IDLE &&
251                                  conn_sched != GNILND_CONN_PROCESS,
252                                  "conn %p on ready list but in bad state: %d\n",
253                                  conn, conn_sched);
254
255                         list_del_init(&conn->gnc_schedlist);
256
257                         if (conn->gnc_state == GNILND_CONN_CLOSING) {
258                                 /* bump to CLOSED to fake out send of CLOSE */
259                                 conn->gnc_state = GNILND_CONN_CLOSED;
260                                 conn->gnc_close_sent = 1;
261                         }
262
263                         if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
264                                 kgnilnd_destroy_conn_ep(conn);
265                         } else {
266                                 kgnilnd_complete_closed_conn(conn);
267                         }
268
269                         /* there really shouldn't be any other states here -
270                          * they would have been cleared out in the del_peer_or_conn or the dgram
271                          * aborts above.
272                          * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
273                          * care of catching anything else for us */
274
275                         kgnilnd_schedule_process_conn(conn, -1);
276
277                         kgnilnd_conn_decref(conn);
278                 }
279         }
280
281         /* don't let the little weasily purgatory conns hide from us */
282         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
283                 list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
284                         kgn_conn_t       *conn, *connN;
285
286                         list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
287                                 kgnilnd_detach_purgatory_locked(conn, &souls);
288                         }
289                 }
290         }
291
292         CDEBUG(D_NET, "about to release %d purgatory entries\n",
293                 kgnilnd_count_list(&souls));
294
295         kgnilnd_release_purgatory_list(&souls);
296
297         /* validate we are now clean */
298         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
299                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
300
301                 /* now all the cons/mboxes should be cleaned up, including purgatory
302                  * so go through and release the MDDs for our persistent PHYS fma_blks
303                  */
304                 kgnilnd_unmap_fma_blocks(dev);
305
306                 LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
307                         "reset failed: fma blocks still live %d\n",
308                         atomic_read(&dev->gnd_nfmablk));
309
310                 LASSERTF(atomic_read(&dev->gnd_neps) == 0,
311                         "reset failed: EP handles still live %d\n",
312                         atomic_read(&dev->gnd_neps));
313         }
314
315         LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
316                 "reset failed: conns left %d\n",
317                 atomic_read(&kgnilnd_data.kgn_nconns));
318
319         /* fine to have peers left - they are waiting for new conns
320          * but should not be holding any open HW resources */
321
322         /* like the last part of kgnilnd_base_shutdown() */
323
324         CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
325
326         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
327                 kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
328         }
329
330         /* no need to free and recreate the TX descriptors
331          * we nuked all the ones that could be using HW resources in
332          * kgnilnd_close_matching_conns and asserted it worked in
333          * kgnilnd_dev_fini */
334
335         /* At this point, all HW is torn down, start to reset */
336
337         /* only reset our known devs */
338         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
339                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
340                 rc = kgnilnd_dev_init(dev);
341                 LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
342                 kgnilnd_map_phys_fmablk(dev);
343                 LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
344                 rc = kgnilnd_setup_wildcard_dgram(dev);
345                 LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n",
346                         i, rc);
347         }
348
349         /* Now the fun restarts... - release the hounds! */
350
351         end = jiffies;
352         seconds = cfs_duration_sec((long)end - start);
353         kgnilnd_bump_timeouts(seconds, reason);
354
355         kgnilnd_data.kgn_in_reset = 0;
356         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
357         kgnilnd_quiesce_wait(reason);
358         LCONSOLE_WARN("%s reset of all hardware resources\n",
359                 rc ? "failed" : "successful");
360
361         RETURN_EXIT;
362 }
363
364 /* A thread that handles quiece and reset hardware events.
365  * We do the same thing regardless of which device reported the event. */
366 int
367 kgnilnd_ruhroh_thread(void *arg)
368 {
369         int                i = 1;
370         DEFINE_WAIT(wait);
371
372         cfs_block_allsigs();
373         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
374         kgnilnd_data.kgn_ruhroh_running = 1;
375
376         while (1) {
377
378                 /* Block until there's a request..  A reset request could come in
379                  * while we're handling a quiesce one, or vice versa.
380                  * Keep processing requests until there are none.*/
381                 prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
382                 while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
383                                 kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
384                         schedule();
385                 finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
386
387                /* Exit if the driver is shutting down. */
388                 if (kgnilnd_data.kgn_ruhroh_shutdown)
389                         break;
390
391                 /* Serialize with driver startup and shutdown. */
392                 mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
393
394                CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
395                         kgnilnd_data.kgn_quiesce_trigger,
396                         kgnilnd_data.kgn_needs_reset,
397                         kgnilnd_data.kgn_bump_info_rdy,
398                         kgnilnd_data.kgn_needs_pause);
399
400                 /* Do we need to do a pause/quiesce? */
401                 if (kgnilnd_data.kgn_needs_pause) {
402
403                         /* Pause all other kgnilnd threads. */
404                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
405                         kgnilnd_quiesce_wait("hardware quiesce");
406
407                         /* If the hardware quiesce flag is set, wait for it to clear.
408                          * This should happen relatively quickly, so we wait for it.
409                          * This will hold up the eventd thread, but on everything but
410                          * the simulator, this is ok-- there is one thread per core.
411                          *
412                          * Handle (possibly multiple) quiesce events while we wait. The
413                          * memory barrier ensures that the core doesn't start fetching
414                          * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
415                          * matches the second mb in kgnilnd_quiesce_end_callback(). */
416                         smp_rmb();
417                         while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
418
419                                 i++;
420                                 CDEBUG(D_INFO, "Waiting for hardware quiesce "
421                                                "flag to clear\n");
422                                 set_current_state(TASK_UNINTERRUPTIBLE);
423                                 schedule_timeout(cfs_time_seconds(1 * i));
424
425                                 /* If we got a quiesce event with bump info, DO THE BUMP!. */
426                                 if (kgnilnd_data.kgn_bump_info_rdy) {
427                                         /* reset console rate limiting for each event */
428                                         i = 1;
429
430                                         /* Make sure the core doesn't start fetching
431                                          * kgni_quiesce_seconds until after it sees
432                                          * kgn_bump_info_rdy set.  This is the match to the
433                                          * first mb in kgnilnd_quiesce_end_callback(). */
434                                         smp_rmb();
435                                         (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
436                                                                "hardware quiesce callback");
437                                         set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
438                                         set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
439                                 }
440                       }
441
442                         /* Reset the kgn_needs_pause flag before coming out of
443                          * the pause.  This ordering avoids a race with the
444                          * setting of this flag in kgnilnd_pause_threads().  */
445                         set_mb(kgnilnd_data.kgn_needs_pause, 0);
446
447                         /* ok, let the kids back into the pool */
448                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
449                         kgnilnd_quiesce_wait("hardware quiesce");
450                 }
451
452                 /* Do a stack reset if needed. */
453                 if (kgnilnd_data.kgn_needs_reset) {
454                         kgnilnd_reset_stack();
455                         set_mb(kgnilnd_data.kgn_needs_reset, 0);
456                 }
457
458                 mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
459         }
460
461         kgnilnd_data.kgn_ruhroh_running = 0;
462         return 0;
463 }
464
465 /* Set pause request flag.  Any functions that
466  * call this one are responsible for ensuring that
467  * variables they set up are visible on other cores before
468  * this flag setting.  This executes in interrupt or kernel
469  * thread context.  */
470 void
471 kgnilnd_pause_threads(void)
472 {
473         /* only device 0 gets the handle, see kgnilnd_dev_init */
474         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
475         LASSERTF(dev != NULL, "dev 0 is NULL\n");
476
477         /* If we're currently in a pause triggered by the pause flag,
478          * there's no need to set it again.  We clear the kgn_needs_pause
479          * flag before we reset kgn_quiesce_trigger to avoid a race.  The
480          * read memory barrier matches the setmb() on the trigger in
481          * kgnilnd_ruhroh_task().                                       */
482         smp_rmb();
483         if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
484                         GNILND_IS_QUIESCED)) {
485                  CDEBUG(D_NET, "requesting thread pause\n");
486
487                 kgnilnd_data.kgn_needs_pause = 1;
488
489                 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
490         } else {
491             CDEBUG(D_NET, "thread pause already underway\n");
492         }
493 }
494
495 /* Return non-zero if the GNI hardware quiesce flag is set */
496 int
497 kgnilnd_hw_in_quiesce(void)
498 {
499         /* only device 0 gets the handle, see kgnilnd_dev_init */
500         kgn_device_t      *dev0 = &kgnilnd_data.kgn_devices[0];
501
502         LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
503
504         smp_rmb();
505         return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
506 }
507
508
509 /* If the GNI hardware quiesce flag is set, initiate our pause and
510  * return non-zero.  Also return non-zero if the stack is shutting down. */
511 int
512 kgnilnd_check_hw_quiesce(void)
513 {
514         if (likely(!kgnilnd_hw_in_quiesce()))
515                 return 0;
516
517         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
518                 CDEBUG(D_NET, "initiating thread pause\n");
519                 kgnilnd_pause_threads();
520         } else {
521                 CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
522         }
523
524         return 1;
525 }
526
527 /* Callback from kngi with the quiesce duration.  This executes
528  * in interrupt context.                                        */
529 void
530 kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
531 {
532         /* only device 0 gets the handle, see kgnilnd_dev_init */
533         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
534         LASSERTF(dev != NULL, "dev 0 is NULL\n");
535
536         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
537
538                 CDEBUG(D_NET, "requesting timeout bump by %lld msecs\n", msecs);
539
540                 /* Save the bump interval and request the bump.
541                  * The memory barrier ensures that the interval is in place before
542                  * the bump flag can be seen (in case a core is already running the
543                  * ruhroh task), and that the bump request flag in place before
544                  * the pause request can be seen (to ensure a core doesn't miss the bump
545                  * request flag).       */
546                 /* If another callback occurred before the ruhroh task
547                  * finished processing the first bump request, we'd over-write its info.
548                  * Nic says that callbacks occur so slowly that this isn't an issue.    */
549                 set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
550                 set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
551                 kgnilnd_pause_threads();
552         } else {
553                 CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
554         }
555 }
556
557 void
558 kgnilnd_critical_error(struct gni_err *err_handle)
559 {
560         /* only device 0 gets the handle, see kgnilnd_dev_init */
561         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
562         LASSERTF(dev != NULL, "dev 0 is NULL\n");
563
564         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
565                 CDEBUG(D_NET, "requesting stack reset\n");
566                 kgnilnd_data.kgn_needs_reset = 1;
567                 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
568         } else {
569                 CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
570         }
571 }
572
573 #if defined(GNILND_USE_RCA)
574 #include <krca_lib.h>
575 #define RCA_EVENTS 3
576 /* RCA ticket is needed for krca_wakeup_wait_event() */
577 static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
578 struct rcadata {
579         rca_ticket_t ticket;
580         int subscribed;
581         rs_event_code_t ec;
582 };
583 static struct rcadata rd[RCA_EVENTS] = {
584         { .ec = ec_node_unavailable },
585         { .ec = ec_node_available },
586         { .ec = ec_node_failed } };
587
588 /* thread for receiving rca events */
589 int
590 kgnilnd_rca(void *arg)
591 {
592         int        i, rc;
593         int        retry_count;
594         rs_event_t event;
595         lnet_nid_t nid;
596
597         cfs_block_allsigs();
598
599         /* all gnilnd threads need to run fairly urgently */
600         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
601
602         /*
603          * Register our service with RCA and subscribe to events
604          * of interest.
605          */
606         rca_krt = KRCA_NULL_TICKET;
607         rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
608         if (rc < 0) {
609                 CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
610                 goto done;
611         }
612
613         for (i = 0; i < RCA_EVENTS; i++) {
614                 retry_count = 0;
615 subscribe_retry:
616                 rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
617                                     &rd[i].ticket);
618
619                 if ((rc == -EINTR) && !retry_count) {
620                         retry_count++;
621                         CNETERR("krca_subscribe returned %d - retrying\n", rc);
622                         goto subscribe_retry;
623                 }
624
625                 if (rc < 0) {
626                         CNETERR("rca subscription failed (%d)\n", rc);
627                         goto done;
628                 }
629
630                 rd[i].subscribed = 1;
631         }
632
633         while (!kgnilnd_data.kgn_shutdown) {
634                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
635                         KGNILND_SPIN_QUIESCE;
636                 }
637                 /* wait here for a subscribed event */
638                 rc = krca_wait_event(&rca_krt);
639
640                 /* RCA return values:
641                  * 0 indicates krca_wakeup_wait_event caused krca_wait_event
642                  *   return.
643                  * -ERESTARTSYS indicates krca_wait_event returned because of a
644                  *   signal.
645                  * -ENOSPC indicates no space available to create an rcad_reg_t
646                  * 1 indicates a message is waiting.
647                  */
648                 if (rc <= 0) {
649                         continue;
650                 }
651
652                 if (krca_get_message(&rca_krt, &event) == 0) {
653                         int node_down = GNILND_PEER_UNKNOWN;
654                         rs_state_t state;
655                         LIST_HEAD(zombies);
656
657                         /* Compute nodes don't care about other compute nodes
658                          * so we don't need to create a peer.
659                          */
660                         if (GNILND_COMPUTE &&
661                             !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
662                                         IS_SVC)) {
663                                 continue;
664                         }
665
666                         /* Only care about compute and service nodes not GPUs */
667                         if (!(RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
668                                         TYPE) == rt_node ||
669                              RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
670                                         TYPE) == rt_accel)) {
671                                                 continue;
672                         }
673
674                         switch (event.ev_id) {
675                         case ec_node_available:
676                                 CDEBUG(D_INFO, "ec_node_available\n");
677                                 node_down = GNILND_PEER_UP;
678                                 break;
679                         case ec_node_failed:
680                                 CDEBUG(D_INFO, "ec_node_failed\n");
681                                 if (event.ev_len > 0) {
682                                         CDEBUG(D_ERROR,
683                                                 "ec_node_failed ignored\n");
684                                         break;
685                                 }
686                                 node_down = GNILND_PEER_DOWN;
687                                 break;
688                         case ec_node_unavailable:
689                                 state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);
690
691                                 CDEBUG(D_INFO, "ec_node_unavailable\n");
692
693                                 /*
694                                  * Ignore overloaded ec_node_unavailable events
695                                  * generated by 'xtcli set_reserve'.
696                                  */
697                                 if (RS_GET_CS_STATE(state) == RS_CS_READY) {
698                                         CDEBUG(D_INFO, "ignoring "
699                                                 "ec_node_unavailable event with"
700                                                 " RS_CS_READY state\n");
701                                         break;
702                                 }
703                                 node_down = GNILND_PEER_DOWN;
704                                 break;
705                         default:
706                                 CDEBUG(D_INFO, "unknown event\n");
707                                 break;
708                         }
709
710                         /* if we get an event we don't know about, just go ahead
711                          * and wait for another event */
712                         if (node_down == GNILND_PEER_UNKNOWN)
713                                 continue;
714
715                         nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
716                                           NID);
717                         CDEBUG(D_INFO,"kgnilnd_rca() reporting nid %d %s\n",
718                                (int)nid, node_down ? "down" : "up");
719                         kgnilnd_report_node_state(nid, node_down);
720
721                 } else {
722                         CNETERR("krca_get_message failed\n");
723                 }
724         }
725
726 done:
727         CDEBUG(D_INFO, "done\n");
728
729         for (i = 0; i < RCA_EVENTS; i++) {
730                 if (rd[i].subscribed) {
731                         rc = krca_unsubscribe(&rca_krt, rd[i].ticket);
732
733                         if (rc) {
734                                 CNETERR("rca unsubscribe failed (%d)\n", rc);
735                         }
736
737                         rd[i].subscribed = 0;
738                 }
739         }
740
741         krca_unregister(&rca_krt);
742         kgnilnd_thread_fini();
743         return 0;
744
745 }
746
747 int
748 kgnilnd_start_rca_thread(void)
749 {
750         return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
751 }
752
753 void
754 kgnilnd_wakeup_rca_thread(void)
755 {
756         int ret;
757
758         ret = krca_wakeup_wait_event(&rca_krt);
759
760         if (ret) {
761                 CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
762         }
763 }
764
765 int
766 kgnilnd_get_node_state(__u32 nid)
767 {
768         int i;
769         int rc = GNILND_PEER_UNKNOWN;
770         int ret;
771         rs_node_array_t nlist;
772         rs_node_t       *na = NULL;
773
774         if ((ret = krca_get_sysnodes(&nlist)) < 0) {
775                 CDEBUG(D_NETERROR, "krca_get_sysnodes failed %d\n", ret);
776                 goto ns_done;
777         }
778
779         na = nlist.na_ids;
780
781         for (i = 0; i < nlist.na_len; i++) {
782                 if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) {
783                         rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ?
784                                 GNILND_PEER_UP : GNILND_PEER_DOWN;
785                         break;
786                 }
787         }
788
789 ns_done:
790         kfree(na);
791         CDEBUG(D_NET, "nid %d rc %d (0=up)\n", nid, rc);
792         return rc;
793 }
794
795 #else /* GNILND_USE_RCA */
796
/* Build without GNILND_USE_RCA: there is no RCA thread to start, so this
 * always reports success (0). */
int
kgnilnd_start_rca_thread(void)
{
        /* nothing to start */
        return 0;
}
802
/* Build without GNILND_USE_RCA: there is no RCA thread to wake, so this is
 * a no-op. */
void
kgnilnd_wakeup_rca_thread(void)
{
        /* intentionally empty */
}
807
808 int
809 kgnilnd_get_node_state(__u32 nid)
810 {
811         return GNILND_PEER_UP;
812 }
813 #endif /* GNILND_USE_RCA */