2 * Copyright (C) 2012 Cray, Inc.
4 * Author: Nic Henke <nic@cray.com>
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 /* Advance all timeouts by nap_time seconds. */
/* NOTE(review): this listing is decimated -- the return type, braces and
 * local declarations (i, peer, tx, conn, dev, dgram) are not visible here,
 * so the comments below describe only the lines that are shown. */
26 kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
/* announce the bump so admins can correlate it with the quiesce event */
35 LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
/* touching shared peer/conn state is only safe while every kgnilnd
 * thread has checked in as quiesced -- assert that first */
37 LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
38 atomic_read(&kgnilnd_data.kgn_nquiesce),
39 atomic_read(&kgnilnd_data.kgn_nthreads));
41 /* requiring that the threads are paused ensures a couple of things:
42 * - combined code paths for stack reset and quiesce event as stack reset
43 * runs with the threads paused
44 * - prevents traffic to the Gemini during a quiesce period
45 * - reduces the locking requirements
/* walk every peer hash chain and refresh the per-peer timestamps;
 * note the "bump" is done by resetting timestamps to the current
 * jiffies rather than adding nap_time to the stored values */
48 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
49 list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
51 /* we can reconnect again at any time */
52 peer->gnp_reconnect_time = jiffies;
53 /* reset now that network is healthy */
54 peer->gnp_reconnect_interval = 0;
55 /* tell LNet dude is still alive */
56 kgnilnd_peer_alive(peer);
/* restart the queue-time clock for every TX still waiting on this peer */
58 list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
59 tx->tx_qtime = jiffies;
62 list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
63 unsigned long timeout;
65 timeout = cfs_time_seconds(conn->gnc_timeout);
67 /* bump last_rx/last_rx_cq on all conns - including
68 * closed ones, this will have the effect of
69 * bumping the purgatory timers for those */
70 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
72 /* we don't timeout based on old gnc_last_tx, so
73 * we'll back it up and schedule the conn to trigger
/* back-dating last_tx by a full timeout makes the scheduled conn send
 * a keepalive promptly instead of waiting out a fresh interval */
75 conn->gnc_last_tx = jiffies - timeout;
76 kgnilnd_schedule_conn(conn);
/* NOTE(review): the inner loop below reuses index 'i' from the outer
 * device loop, clobbering the device counter -- with kgn_ndevs > 1 the
 * outer loop will misbehave; it needs its own index. Also the bound is
 * kgn_peer_hash_size - 1 here while the peer walk above iterates the
 * full kgn_peer_hash_size -- confirm which is intended. */
81 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
82 dev = &kgnilnd_data.kgn_devices[i];
83 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
84 list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) {
85 dgram->gndg_post_time = jiffies;
91 /* Quiesce or wake up the stack. The caller must hold the kgn_quiesce_sem semaphore
92 * on entry, which holds off any pending stack shutdown. */
/* NOTE(review): decimated listing -- return type, braces, and the
 * declaration/increment of the loop counter 'i' are not visible. */
94 kgnilnd_quiesce_wait(char *reason)
/* non-zero trigger means we are pausing; zero means we are waking up */
98 if (kgnilnd_data.kgn_quiesce_trigger) {
99 unsigned long quiesce_deadline, quiesce_to;
102 /* morning sunshine */
/* kick the reaper out of its wait so it notices the trigger */
103 spin_lock(&kgnilnd_data.kgn_reaper_lock);
104 wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
105 spin_unlock(&kgnilnd_data.kgn_reaper_lock);
/* wake all per-device threads (scheduler, dgram, dgram-ping) so each
 * can see the trigger and park itself */
107 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
108 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
110 wake_up_all(&dev->gnd_waitq);
111 wake_up_all(&dev->gnd_dgram_waitq);
112 wake_up_all(&dev->gnd_dgping_waitq);
115 /* we'll wait for 10x the timeout for the threads to pause */
116 quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
117 quiesce_deadline = (long) jiffies + quiesce_to;
119 /* wait for everyone to check-in as quiesced */
121 while (!GNILND_IS_QUIESCED) {
/* ((i) & (-i)) == i is true only when i is a power of two (or 0):
 * escalate to console-visible D_WARNING on those passes, D_NET otherwise */
123 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
124 "%s: Waiting for %d threads to pause\n",
126 atomic_read(&kgnilnd_data.kgn_nthreads) -
127 atomic_read(&kgnilnd_data.kgn_nquiesce));
128 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
/* back off progressively longer each pass (i presumably increments
 * on a line not shown in this listing) */
129 cfs_pause(cfs_time_seconds(1 * i));
/* NOTE(review): raw '>' comparison against jiffies is wrap-unsafe;
 * kernel convention is time_after()/time_before() -- confirm */
131 LASSERTF(quiesce_deadline > jiffies,
132 "couldn't quiesce threads in %lu seconds, falling over now\n",
133 cfs_duration_sec(quiesce_to));
136 LCONSOLE_WARN("%s: All threads paused!\n", reason);
137 /* XXX Nic: Is there a set of counters we can grab here to
138 * ensure that there is no traffic until quiesce is over ?*/
/* wake-up path: trigger is clear, prod the dgram machinery back to life */
142 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
143 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
144 kgnilnd_schedule_dgram(dev);
147 /* wait for everyone to check-in as running - they will be spinning
148 * and looking, so no need to poke any waitq */
150 while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
152 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
153 "%s: Waiting for %d threads to wake up\n",
155 atomic_read(&kgnilnd_data.kgn_nquiesce));
156 cfs_pause(cfs_time_seconds(1 * i));
159 LCONSOLE_WARN("%s: All threads awake!\n", reason);
163 /* Reset the stack. */
/* Tears down and re-initializes all gnilnd HW resources after a critical
 * hardware error: quiesces threads, cancels dgrams, nukes conns (including
 * purgatory), runs dev_fini/dev_init, then bumps timeouts and resumes.
 * NOTE(review): decimated listing -- return type, braces, and several
 * declarations (i, rc, net, conn, conn_sched, dgram, souls, seconds) and
 * statements (e.g. the 'start'/'end' jiffies assignments) are missing. */
165 kgnilnd_reset_stack(void)
169 kgn_peer_t *peer, *peerN;
171 char *reason = "critical hardware error";
173 unsigned long start, end;
176 /* Race with del_peer and its atomics */
177 CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
/* refuse to reset a stack that never finished initializing */
179 if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
180 CERROR("can't reset the stack, gnilnd is not initialized\n");
184 /* First make sure we are not already quiesced - we panic if so,
185 * as that could leave software in a bad state */
186 LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
187 "can't reset the stack, already doing so: trigger %d\n",
188 kgnilnd_data.kgn_quiesce_trigger);
190 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
192 /* wake up the dgram waitq thread - but after trigger set to make sure it
193 * goes into quiesce */
194 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
195 /* same for scheduler that is dropping state transitions */
196 CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
197 CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
/* block here until every thread has parked itself */
199 kgnilnd_quiesce_wait(reason);
203 kgnilnd_data.kgn_in_reset = 1;
204 kgnilnd_data.kgn_nresets++;
205 LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
206 reason, kgnilnd_data.kgn_nresets);
/* cancel outstanding datagrams on every net in the net hash */
208 for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
209 list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
210 rc = kgnilnd_cancel_net_dgrams(net);
211 LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
215 /* error -ENOTRECOVERABLE is stack reset */
216 kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
/* flush wildcard dgrams per device and wait for their cancellation */
218 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
219 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
220 kgnilnd_cancel_wc_dgrams(dev);
221 kgnilnd_wait_for_canceled_dgrams(dev);
224 /* manually do some conn processing ala kgnilnd_process_conns */
225 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
226 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
230 /* go find all the closed conns that need to be nuked - the
231 * scheduler thread isn't running to do this for us */
233 CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
234 kgnilnd_count_list(&dev->gnd_ready_conns));
236 /* use while/list_first_entry loop to ensure we can handle any
237 * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
238 while (!list_empty(&dev->gnd_ready_conns)) {
239 conn = list_first_entry(&dev->gnd_ready_conns,
240 kgn_conn_t, gnc_schedlist);
/* atomically claim the conn for processing, like the scheduler would */
241 conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
243 LASSERTF(conn_sched != GNILND_CONN_IDLE &&
244 conn_sched != GNILND_CONN_PROCESS,
245 "conn %p on ready list but in bad state: %d\n",
248 list_del_init(&conn->gnc_schedlist);
250 if (conn->gnc_state == GNILND_CONN_CLOSING) {
251 /* bump to CLOSED to fake out send of CLOSE */
252 conn->gnc_state = GNILND_CONN_CLOSED;
253 conn->gnc_close_sent = 1;
256 if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
257 kgnilnd_destroy_conn_ep(conn);
259 kgnilnd_complete_closed_conn(conn);
262 /* there really shouldn't be any other states here -
263 * they would have been cleared out in the del_peer_or_conn or the dgram
265 * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
266 * care of catching anything else for us */
268 kgnilnd_schedule_process_conn(conn, -1);
270 kgnilnd_conn_decref(conn);
274 /* don't let the little weasily purgatory conns hide from us */
275 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
276 list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
277 kgn_conn_t *conn, *connN;
279 list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
280 kgnilnd_detach_purgatory_locked(conn, &souls);
285 CDEBUG(D_NET, "about to release %d purgatory entries\n",
286 kgnilnd_count_list(&souls));
288 kgnilnd_release_purgatory_list(&souls);
290 /* validate we are now clean */
291 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
292 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
294 /* now all the cons/mboxes should be cleaned up, including purgatory
295 * so go through and release the MDDs for our persistent PHYS fma_blks
297 kgnilnd_unmap_phys_fmablk(dev);
299 LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
300 "reset failed: fma blocks still live %d\n",
301 atomic_read(&dev->gnd_nfmablk));
303 LASSERTF(atomic_read(&dev->gnd_neps) == 0,
304 "reset failed: EP handles still live %d\n",
305 atomic_read(&dev->gnd_neps));
308 LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
309 "reset failed: conns left %d\n",
310 atomic_read(&kgnilnd_data.kgn_nconns));
312 /* fine to have peers left - they are waiting for new conns
313 * but should not be holding any open HW resources */
315 /* like the last part of kgnilnd_base_shutdown() */
317 CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
319 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
320 kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
323 /* no need to free and recreate the TX descriptors
324 * we nuked all the ones that could be using HW resources in
325 * kgnilnd_close_matching_conns and asserted it worked in
326 * kgnilnd_dev_fini */
328 /* At this point, all HW is torn down, start to reset */
330 /* only reset our known devs */
331 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
332 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
333 rc = kgnilnd_dev_init(dev);
334 LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
335 kgnilnd_map_phys_fmablk(dev);
/* NOTE(review): 'rc' is not assigned from kgnilnd_map_phys_fmablk above,
 * so this LASSERTF re-checks the stale dev_init result -- either capture
 * the map call's return in rc or drop this assert */
336 LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
337 rc = kgnilnd_setup_wildcard_dgram(dev);
338 LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n",
342 /* Now the fun restarts... - release the hounds! */
/* compensate all timeouts for the time spent in reset; 'start'/'end'
 * are presumably set to jiffies around the reset window on lines not
 * shown in this listing -- confirm against the full source */
345 seconds = cfs_duration_sec((long)end - start);
346 kgnilnd_bump_timeouts(seconds, reason);
348 kgnilnd_data.kgn_in_reset = 0;
/* drop the trigger and let kgnilnd_quiesce_wait release the threads */
349 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
350 kgnilnd_quiesce_wait(reason);
351 LCONSOLE_WARN("%s reset of all hardware resources\n",
352 rc ? "failed" : "successful");
357 /* A thread that handles quiesce and reset hardware events.
358 * We do the same thing regardless of which device reported the event. */
/* NOTE(review): decimated listing -- return type, braces, the 'wait'
 * queue entry declaration, the schedule() call inside the wait loop, and
 * the outer forever-loop construct are not visible here. */
360 kgnilnd_ruhroh_thread(void *arg)
365 cfs_daemonize("kgnilnd_rr");
/* run at the configured niceness like the other gnilnd threads */
367 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
368 kgnilnd_data.kgn_ruhroh_running = 1;
372 /* Block until there's a request.. A reset request could come in
373 * while we're handling a quiesce one, or vice versa.
374 * Keep processing requests until there are none.*/
375 prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
376 while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
377 kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
379 finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
381 /* Exit if the driver is shutting down. */
382 if (kgnilnd_data.kgn_ruhroh_shutdown)
385 /* Serialize with driver startup and shutdown. */
386 down(&kgnilnd_data.kgn_quiesce_sem);
388 CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
389 kgnilnd_data.kgn_quiesce_trigger,
390 kgnilnd_data.kgn_needs_reset,
391 kgnilnd_data.kgn_bump_info_rdy,
392 kgnilnd_data.kgn_needs_pause);
394 /* Do we need to do a pause/quiesce? */
395 if (kgnilnd_data.kgn_needs_pause) {
397 /* Pause all other kgnilnd threads. */
398 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
399 kgnilnd_quiesce_wait("hardware quiesce flag");
401 /* If the hardware quiesce flag is set, wait for it to clear.
402 * This should happen relatively quickly, so we wait for it.
403 * This will hold up the eventd thread, but on everything but
404 * the simulator, this is ok-- there is one thread per core.
406 * Handle (possibly multiple) quiesce events while we wait. The
407 * memory barrier ensures that the core doesn't start fetching
408 * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
409 * matches the second mb in kgnilnd_quiesce_end_callback(). */
411 while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
/* power-of-two passes of 'i' get console-level warnings, rest D_NET */
414 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
415 "Waiting for hardware quiesce flag to clear\n");
416 cfs_pause(cfs_time_seconds(1 * i));
418 /* If we got a quiesce event with bump info, DO THE BUMP!. */
419 if (kgnilnd_data.kgn_bump_info_rdy) {
420 /* reset console rate limiting for each event */
423 /* Make sure the core doesn't start fetching
424 * kgni_quiesce_seconds until after it sees
425 * kgn_bump_info_rdy set. This is the match to the
426 * first mb in kgnilnd_quiesce_end_callback(). */
428 (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
429 "hardware quiesce callback");
/* clear interval before the ready flag so a new callback can't
 * see a stale-seconds/ready combination */
430 set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
431 set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
435 /* Reset the kgn_needs_pause flag before coming out of
436 * the pause. This ordering avoids a race with the
437 * setting of this flag in kgnilnd_pause_threads(). */
438 set_mb(kgnilnd_data.kgn_needs_pause, 0);
440 /* ok, let the kids back into the pool */
441 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
442 kgnilnd_quiesce_wait("hardware quiesce");
445 /* Do a stack reset if needed. */
446 if (kgnilnd_data.kgn_needs_reset) {
447 kgnilnd_reset_stack();
448 set_mb(kgnilnd_data.kgn_needs_reset, 0);
451 up(&kgnilnd_data.kgn_quiesce_sem);
/* shutdown path: mark the thread as no longer running */
454 kgnilnd_data.kgn_ruhroh_running = 0;
458 /* Set pause request flag. Any functions that
459 * call this one are responsible for ensuring that
460 * variables they set up are visible on other cores before
461 * this flag setting. This executes in interrupt or kernel
/* NOTE(review): decimated listing -- return type, braces, and the read
 * memory barrier referenced by the comment below are not visible. */
464 kgnilnd_pause_threads(void)
466 /* only device 0 gets the handle, see kgnilnd_dev_init */
467 kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
468 LASSERTF(dev != NULL, "dev 0 is NULL\n");
470 /* If we're currently in a pause triggered by the pause flag,
471 * there's no need to set it again. We clear the kgn_needs_pause
472 * flag before we reset kgn_quiesce_trigger to avoid a race. The
473 * read memory barrier matches the setmb() on the trigger in
474 * kgnilnd_ruhroh_task(). */
476 if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
477 GNILND_IS_QUIESCED)) {
478 CDEBUG(D_NET, "requesting thread pause\n");
/* flag the request, then wake the ruhroh thread to act on it */
480 kgnilnd_data.kgn_needs_pause = 1;
482 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
484 CDEBUG(D_NET, "thread pause already underway\n");
488 /* Return non-zero if the GNI hardware quiesce flag is set */
/* NOTE(review): decimated listing -- return type and braces not visible. */
490 kgnilnd_hw_in_quiesce(void)
492 /* only device 0 gets the handle, see kgnilnd_dev_init */
493 kgn_device_t *dev0 = &kgnilnd_data.kgn_devices[0];
495 LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
/* normalize the GNI status to a strict 0/non-zero boolean */
498 return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
502 /* If the GNI hardware quiesce flag is set, initiate our pause and
503 * return non-zero. Also return non-zero if the stack is shutting down. */
/* NOTE(review): decimated listing -- return type, braces, and the actual
 * return statements (presumably 0 on the fast path, 1 otherwise) are not
 * visible here; confirm against the full source. */
505 kgnilnd_check_hw_quiesce(void)
/* fast path: hardware flag clear, nothing to do */
507 if (likely(!kgnilnd_hw_in_quiesce()))
510 if (!kgnilnd_data.kgn_ruhroh_shutdown) {
511 CDEBUG(D_NET, "initiating thread pause\n");
512 kgnilnd_pause_threads();
514 CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
520 /* Callback from kngi with the quiesce duration. This executes
521 * in interrupt context. */
/* NOTE(review): decimated listing -- return type and braces not visible;
 * nic_handle is unused in the lines shown. */
523 kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
525 /* only device 0 gets the handle, see kgnilnd_dev_init */
526 kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
527 LASSERTF(dev != NULL, "dev 0 is NULL\n");
529 if (!kgnilnd_data.kgn_ruhroh_shutdown) {
531 CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);
533 /* Save the bump interval and request the bump.
534 * The memory barrier ensures that the interval is in place before
535 * the bump flag can be seen (in case a core is already running the
536 * ruhroh task), and that the bump request flag in place before
537 * the pause request can be seen (to ensure a core doesn't miss the bump
539 /* If another callback occurred before the ruhroh task
540 * finished processing the first bump request, we'd over-write its info.
541 * Nic says that callbacks occur so slowly that this isn't an issue. */
/* order matters: seconds first, then ready flag, then the pause request */
542 set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
543 set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
544 kgnilnd_pause_threads();
546 CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
/* Callback for a critical GNI hardware error: request a full stack reset
 * by flagging kgn_needs_reset and waking the ruhroh thread, unless the
 * driver is already shutting down.
 * NOTE(review): decimated listing -- return type and braces not visible;
 * err_handle is unused in the lines shown. */
551 kgnilnd_critical_error(struct gni_err *err_handle)
553 /* only device 0 gets the handle, see kgnilnd_dev_init */
554 kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
555 LASSERTF(dev != NULL, "dev 0 is NULL\n");
557 if (!kgnilnd_data.kgn_ruhroh_shutdown) {
558 CDEBUG(D_NET, "requesting stack reset\n");
/* the ruhroh thread performs the actual reset under kgn_quiesce_sem */
559 kgnilnd_data.kgn_needs_reset = 1;
560 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
562 CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");