lnet/klnds/gnilnd/gnilnd_stack.c

   1 /*
   2  * Copyright (C) 2012 Cray, Inc.
   3  *
   4  *   Author: Nic Henke <nic@cray.com>
   5  *
   6  *   This file is part of Lustre, http://www.lustre.org.
   7  *
   8  *   Lustre is free software; you can redistribute it and/or
   9  *   modify it under the terms of version 2 of the GNU General Public
  10  *   License as published by the Free Software Foundation.
  11  *
  12  *   Lustre is distributed in the hope that it will be useful,
  13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *   GNU General Public License for more details.
  16  *
  17  *   You should have received a copy of the GNU General Public License
  18  *   along with Lustre; if not, write to the Free Software
  19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  *
  21  */
  22 #include "gnilnd.h"
  23 #include <rsms/rs_sm_states.h>
  24
  25 /* Advance all timeouts by nap_time seconds. */
  26 void
  27 kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
  28 {
  29         int                     i;
  30         kgn_peer_t             *peer;
  31         kgn_conn_t             *conn;
  32         kgn_tx_t               *tx;
  33         kgn_device_t           *dev;
  34         kgn_dgram_t            *dgram;
  35
  36         LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
  37
  38         LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
  39                  atomic_read(&kgnilnd_data.kgn_nquiesce),
  40                  atomic_read(&kgnilnd_data.kgn_nthreads));
  41
  42         /* requiring that the threads are paused ensures a couple of things:
  43          * - combined code paths for stack reset and quiesce event as stack reset
  44          *   runs with the threads paused
  45          * - prevents traffic to the Gemini during a quiesce period
  46          * - reduces the locking requirements
  47         */
  48
  49         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
  50                 list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
  51
  52                         /* we can reconnect again at any time */
  53                         peer->gnp_reconnect_time = jiffies;
  54                         /* reset now that network is healthy */
  55                         peer->gnp_reconnect_interval = 0;
  56                         /* tell LNet dude is still alive */
  57                         kgnilnd_peer_alive(peer);
  58
  59                         list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
  60                                 tx->tx_qtime = jiffies;
  61                         }
  62
  63                         list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
  64                                 unsigned long           timeout;
  65
  66                                 timeout = cfs_time_seconds(conn->gnc_timeout);
  67
  68                                 /* bump last_rx/last_rx_cq on all conns - including
  69                                  * closed ones, this will have the effect of
  70                                  * bumping the purgatory timers for those */
  71                                 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
  72
  73                                 /* we don't timeout based on old gnc_last_tx, so
  74                                  * we'll back it up and schedule the conn to trigger
  75                                  * a NOOP */
  76                                 conn->gnc_last_tx = jiffies - timeout;
  77                                 if (conn->gnc_state != GNILND_CONN_DONE)
  78                                 kgnilnd_schedule_conn(conn);
  79                         }
  80                 }
  81         }
  82
  83         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
  84                 dev = &kgnilnd_data.kgn_devices[i];
  85                 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
  86                         list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) {
  87                                 dgram->gndg_post_time = jiffies;
  88                         }
  89                 }
  90         }
  91 }
  92
  93 /* Quiesce or wake up the stack.  The caller must hold the kgn_quiesce_sem semaphore
  94  * on entry, which holds off any pending stack shutdown.   */
  95 void
  96 kgnilnd_quiesce_wait(char *reason)
  97 {
  98         int             i;
  99
 100         if (kgnilnd_data.kgn_quiesce_trigger) {
 101                 unsigned long   quiesce_deadline, quiesce_to;
 102                 /* FREEZE TAG!!!! */
 103
 104                 /* morning sunshine */
 105                 spin_lock(&kgnilnd_data.kgn_reaper_lock);
 106                 wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
 107                 spin_unlock(&kgnilnd_data.kgn_reaper_lock);
 108
 109                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 110                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
 111
 112                         wake_up_all(&dev->gnd_waitq);
 113                         wake_up_all(&dev->gnd_dgram_waitq);
 114                         wake_up_all(&dev->gnd_dgping_waitq);
 115                 }
 116
 117                 kgnilnd_wakeup_rca_thread();
 118
 119                 /* we'll wait for 10x the timeout for the threads to pause */
 120                 quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
 121                 quiesce_deadline = (long) jiffies + quiesce_to;
 122
 123                 /* wait for everyone to check-in as quiesced */
 124                 i = 1;
 125                 while (!GNILND_IS_QUIESCED) {
 126                         i++;
 127                         LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
 128                                  "%s: Waiting for %d threads to pause\n",
 129                                  reason,
 130                                  atomic_read(&kgnilnd_data.kgn_nthreads) -
 131                                  atomic_read(&kgnilnd_data.kgn_nquiesce));
 132                         CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
 133                         cfs_pause(cfs_time_seconds(1 * i));
 134
 135                         LASSERTF(quiesce_deadline > jiffies,
 136                                  "couldn't quiesce threads in %lu seconds, falling over now\n",
 137                                  cfs_duration_sec(quiesce_to));
 138                 }
 139
 140                 LCONSOLE_WARN("%s: All threads paused!\n", reason);
 141                 /* XXX Nic: Is there a set of counters we can grab here to
 142                  * ensure that there is no traffic until quiesce is over ?*/
 143         } else {
 144                 /* GO! GO! GO! */
 145
 146                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 147                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
 148                         kgnilnd_schedule_dgram(dev);
 149                 }
 150
 151                 /* wait for everyone to check-in as running - they will be spinning
 152                  * and looking, so no need to poke any waitq */
 153                 i = 1;
 154                 while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
 155                         i++;
 156                         LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
 157                                  "%s: Waiting for %d threads to wake up\n",
 158                                   reason,
 159                                   atomic_read(&kgnilnd_data.kgn_nquiesce));
 160                         cfs_pause(cfs_time_seconds(1 * i));
 161                 }
 162
 163                 LCONSOLE_WARN("%s: All threads awake!\n", reason);
 164         }
 165 }
 166
 167 /* Reset the stack.  */
 168 void
 169 kgnilnd_reset_stack(void)
 170 {
 171         int              i, rc = 0;
 172         kgn_net_t       *net;
 173         kgn_peer_t      *peer, *peerN;
 174         LIST_HEAD        (souls);
 175         char            *reason = "critical hardware error";
 176         __u32            seconds;
 177         unsigned long    start, end;
 178         ENTRY;
 179
 180         /* Race with del_peer and its atomics */
 181         CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
 182
 183         if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
 184                 CERROR("can't reset the stack, gnilnd is not initialized\n");
 185                 RETURN_EXIT;
 186         }
 187
 188         /* First make sure we are not already quiesced - we panic if so,
 189          * as that could leave software in a bad state */
 190         LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
 191                 "can't reset the stack, already doing so: trigger %d\n",
 192                  kgnilnd_data.kgn_quiesce_trigger);
 193
 194         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
 195
 196         /* wake up the dgram waitq thread - but after trigger set to make sure it
 197          * goes into quiesce */
 198         CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
 199         /* same for scheduler that is dropping state transitiosn */
 200         CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
 201         CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
 202
 203         kgnilnd_quiesce_wait(reason);
 204
 205         start = jiffies;
 206
 207         kgnilnd_data.kgn_in_reset = 1;
 208         kgnilnd_data.kgn_nresets++;
 209         LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
 210                       reason, kgnilnd_data.kgn_nresets);
 211
 212         for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
 213                 list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
 214                         rc = kgnilnd_cancel_net_dgrams(net);
 215                         LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
 216                 }
 217         }
 218
 219         /* error -ENOTRECOVERABLE is stack reset */
 220         kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
 221
 222         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 223                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 224                 kgnilnd_cancel_wc_dgrams(dev);
 225                 kgnilnd_wait_for_canceled_dgrams(dev);
 226         }
 227
 228         /* manually do some conn processing ala kgnilnd_process_conns */
 229         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 230                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 231                 kgn_conn_t      *conn;
 232                 int              conn_sched;
 233
 234                 /* go find all the closed conns that need to be nuked - the
 235                  * scheduler thread isn't running to do this for us */
 236
 237                 CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
 238                         kgnilnd_count_list(&dev->gnd_ready_conns));
 239
 240                 /* use while/list_first_entry loop to ensure we can handle any
 241                  * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
 242                 while (!list_empty(&dev->gnd_ready_conns)) {
 243                         conn = list_first_entry(&dev->gnd_ready_conns,
 244                                                 kgn_conn_t, gnc_schedlist);
 245                         conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
 246
 247                         LASSERTF(conn_sched != GNILND_CONN_IDLE &&
 248                                  conn_sched != GNILND_CONN_PROCESS,
 249                                  "conn %p on ready list but in bad state: %d\n",
 250                                  conn, conn_sched);
 251
 252                         list_del_init(&conn->gnc_schedlist);
 253
 254                         if (conn->gnc_state == GNILND_CONN_CLOSING) {
 255                                 /* bump to CLOSED to fake out send of CLOSE */
 256                                 conn->gnc_state = GNILND_CONN_CLOSED;
 257                                 conn->gnc_close_sent = 1;
 258                         }
 259
 260                         if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
 261                                 kgnilnd_destroy_conn_ep(conn);
 262                         } else {
 263                                 kgnilnd_complete_closed_conn(conn);
 264                         }
 265
 266                         /* there really shouldn't be any other states here -
 267                          * they would have been cleared out in the del_peer_or_conn or the dgram
 268                          * aborts above.
 269                          * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
 270                          * care of catching anything else for us */
 271
 272                         kgnilnd_schedule_process_conn(conn, -1);
 273
 274                         kgnilnd_conn_decref(conn);
 275                 }
 276         }
 277
 278         /* don't let the little weasily purgatory conns hide from us */
 279         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
 280                 list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
 281                         kgn_conn_t       *conn, *connN;
 282
 283                         list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
 284                                 kgnilnd_detach_purgatory_locked(conn, &souls);
 285                         }
 286                 }
 287         }
 288
 289         CDEBUG(D_NET, "about to release %d purgatory entries\n",
 290                 kgnilnd_count_list(&souls));
 291
 292         kgnilnd_release_purgatory_list(&souls);
 293
 294         /* validate we are now clean */
 295         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 296                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 297
 298                 /* now all the cons/mboxes should be cleaned up, including purgatory
 299                  * so go through and release the MDDs for our persistent PHYS fma_blks
 300                  */
 301                 kgnilnd_unmap_phys_fmablk(dev);
 302
 303                 LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
 304                         "reset failed: fma blocks still live %d\n",
 305                         atomic_read(&dev->gnd_nfmablk));
 306
 307                 LASSERTF(atomic_read(&dev->gnd_neps) == 0,
 308                         "reset failed: EP handles still live %d\n",
 309                         atomic_read(&dev->gnd_neps));
 310         }
 311
 312         LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
 313                 "reset failed: conns left %d\n",
 314                 atomic_read(&kgnilnd_data.kgn_nconns));
 315
 316         /* fine to have peers left - they are waiting for new conns
 317          * but should not be holding any open HW resources */
 318
 319         /* like the last part of kgnilnd_base_shutdown() */
 320
 321         CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
 322
 323         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 324                 kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
 325         }
 326
 327         /* no need to free and recreate the TX descriptors
 328          * we nuked all the ones that could be using HW resources in
 329          * kgnilnd_close_matching_conns and asserted it worked in
 330          * kgnilnd_dev_fini */
 331
 332         /* At this point, all HW is torn down, start to reset */
 333
 334         /* only reset our known devs */
 335         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 336                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 337                 rc = kgnilnd_dev_init(dev);
 338                 LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
 339                 kgnilnd_map_phys_fmablk(dev);
 340                 LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
 341                 rc = kgnilnd_setup_wildcard_dgram(dev);
 342                 LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n",
 343                         i, rc);
 344         }
 345
 346         /* Now the fun restarts... - release the hounds! */
 347
 348         end = jiffies;
 349         seconds = cfs_duration_sec((long)end - start);
 350         kgnilnd_bump_timeouts(seconds, reason);
 351
 352         kgnilnd_data.kgn_in_reset = 0;
 353         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
 354         kgnilnd_quiesce_wait(reason);
 355         LCONSOLE_WARN("%s reset of all hardware resources\n",
 356                 rc ? "failed" : "successful");
 357
 358         RETURN_EXIT;
 359 }
 360
 361 /* A thread that handles quiece and reset hardware events.
 362  * We do the same thing regardless of which device reported the event. */
 363 int
 364 kgnilnd_ruhroh_thread(void *arg)
 365 {
 366         int                i = 1;
 367         DEFINE_WAIT(wait);
 368
 369         cfs_daemonize("kgnilnd_rr");
 370         cfs_block_allsigs();
 371         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
 372         kgnilnd_data.kgn_ruhroh_running = 1;
 373
 374         while (1) {
 375
 376                 /* Block until there's a request..  A reset request could come in
 377                  * while we're handling a quiesce one, or vice versa.
 378                  * Keep processing requests until there are none.*/
 379                 prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
 380                 while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
 381                                 kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
 382                         schedule();
 383                 finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
 384
 385                /* Exit if the driver is shutting down. */
 386                 if (kgnilnd_data.kgn_ruhroh_shutdown)
 387                         break;
 388
 389                 /* Serialize with driver startup and shutdown. */
 390                 down(&kgnilnd_data.kgn_quiesce_sem);
 391
 392                CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
 393                         kgnilnd_data.kgn_quiesce_trigger,
 394                         kgnilnd_data.kgn_needs_reset,
 395                         kgnilnd_data.kgn_bump_info_rdy,
 396                         kgnilnd_data.kgn_needs_pause);
 397
 398                 /* Do we need to do a pause/quiesce? */
 399                 if (kgnilnd_data.kgn_needs_pause) {
 400
 401                         /* Pause all other kgnilnd threads. */
 402                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
 403                         kgnilnd_quiesce_wait("hardware quiesce flag");
 404
 405                         /* If the hardware quiesce flag is set, wait for it to clear.
 406                          * This should happen relatively quickly, so we wait for it.
 407                          * This will hold up the eventd thread, but on everything but
 408                          * the simulator, this is ok-- there is one thread per core.
 409                          *
 410                          * Handle (possibly multiple) quiesce events while we wait. The
 411                          * memory barrier ensures that the core doesn't start fetching
 412                          * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
 413                          * matches the second mb in kgnilnd_quiesce_end_callback(). */
 414                         smp_rmb();
 415                         while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
 416
 417                                 i++;
 418                                 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
 419                                                 "Waiting for hardware quiesce flag to clear\n");
 420                                 cfs_pause(cfs_time_seconds(1 * i));
 421
 422                                 /* If we got a quiesce event with bump info, DO THE BUMP!. */
 423                                 if (kgnilnd_data.kgn_bump_info_rdy) {
 424                                         /* reset console rate limiting for each event */
 425                                         i = 1;
 426
 427                                         /* Make sure the core doesn't start fetching
 428                                          * kgni_quiesce_seconds until after it sees
 429                                          * kgn_bump_info_rdy set.  This is the match to the
 430                                          * first mb in kgnilnd_quiesce_end_callback(). */
 431                                         smp_rmb();
 432                                         (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
 433                                                                "hardware quiesce callback");
 434                                         set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
 435                                         set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
 436                                 }
 437                       }
 438
 439                         /* Reset the kgn_needs_pause flag before coming out of
 440                          * the pause.  This ordering avoids a race with the
 441                          * setting of this flag in kgnilnd_pause_threads().  */
 442                         set_mb(kgnilnd_data.kgn_needs_pause, 0);
 443
 444                         /* ok, let the kids back into the pool */
 445                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
 446                         kgnilnd_quiesce_wait("hardware quiesce");
 447                 }
 448
 449                 /* Do a stack reset if needed. */
 450                 if (kgnilnd_data.kgn_needs_reset) {
 451                         kgnilnd_reset_stack();
 452                         set_mb(kgnilnd_data.kgn_needs_reset, 0);
 453                 }
 454
 455                 up(&kgnilnd_data.kgn_quiesce_sem);
 456         }
 457
 458         kgnilnd_data.kgn_ruhroh_running = 0;
 459         return 0;
 460 }
 461
 462 /* Set pause request flag.  Any functions that
 463  * call this one are responsible for ensuring that
 464  * variables they set up are visible on other cores before
 465  * this flag setting.  This executes in interrupt or kernel
 466  * thread context.  */
 467 void
 468 kgnilnd_pause_threads(void)
 469 {
 470         /* only device 0 gets the handle, see kgnilnd_dev_init */
 471         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
 472         LASSERTF(dev != NULL, "dev 0 is NULL\n");
 473
 474         /* If we're currently in a pause triggered by the pause flag,
 475          * there's no need to set it again.  We clear the kgn_needs_pause
 476          * flag before we reset kgn_quiesce_trigger to avoid a race.  The
 477          * read memory barrier matches the setmb() on the trigger in
 478          * kgnilnd_ruhroh_task().                                       */
 479         smp_rmb();
 480         if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
 481                         GNILND_IS_QUIESCED)) {
 482                  CDEBUG(D_NET, "requesting thread pause\n");
 483
 484                 kgnilnd_data.kgn_needs_pause = 1;
 485
 486                 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
 487         } else {
 488             CDEBUG(D_NET, "thread pause already underway\n");
 489         }
 490 }
 491
 492 /* Return non-zero if the GNI hardware quiesce flag is set */
 493 int
 494 kgnilnd_hw_in_quiesce(void)
 495 {
 496         /* only device 0 gets the handle, see kgnilnd_dev_init */
 497         kgn_device_t      *dev0 = &kgnilnd_data.kgn_devices[0];
 498
 499         LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
 500
 501         smp_rmb();
 502         return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
 503 }
 504
 505
 506 /* If the GNI hardware quiesce flag is set, initiate our pause and
 507  * return non-zero.  Also return non-zero if the stack is shutting down. */
 508 int
 509 kgnilnd_check_hw_quiesce(void)
 510 {
 511         if (likely(!kgnilnd_hw_in_quiesce()))
 512                 return 0;
 513
 514         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 515                 CDEBUG(D_NET, "initiating thread pause\n");
 516                 kgnilnd_pause_threads();
 517         } else {
 518                 CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
 519         }
 520
 521         return 1;
 522 }
 523
 524 /* Callback from kngi with the quiesce duration.  This executes
 525  * in interrupt context.                                        */
 526 void
 527 kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
 528 {
 529         /* only device 0 gets the handle, see kgnilnd_dev_init */
 530         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
 531         LASSERTF(dev != NULL, "dev 0 is NULL\n");
 532
 533         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 534
 535                 CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);
 536
 537                 /* Save the bump interval and request the bump.
 538                  * The memory barrier ensures that the interval is in place before
 539                  * the bump flag can be seen (in case a core is already running the
 540                  * ruhroh task), and that the bump request flag in place before
 541                  * the pause request can be seen (to ensure a core doesn't miss the bump
 542                  * request flag).       */
 543                 /* If another callback occurred before the ruhroh task
 544                  * finished processing the first bump request, we'd over-write its info.
 545                  * Nic says that callbacks occur so slowly that this isn't an issue.    */
 546                 set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
 547                 set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
 548                 kgnilnd_pause_threads();
 549         } else {
 550                 CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
 551         }
 552 }
 553
 554 void
 555 kgnilnd_critical_error(struct gni_err *err_handle)
 556 {
 557         /* only device 0 gets the handle, see kgnilnd_dev_init */
 558         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
 559         LASSERTF(dev != NULL, "dev 0 is NULL\n");
 560
 561         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 562                 CDEBUG(D_NET, "requesting stack reset\n");
 563                 kgnilnd_data.kgn_needs_reset = 1;
 564                 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
 565         } else {
 566                 CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
 567         }
 568 }
 569
 570 #if defined(GNILND_USE_RCA)
 571 #include <krca_lib.h>
 572 #define RCA_EVENTS 3
 573 /* RCA ticket is needed for krca_wakeup_wait_event() */
 574 static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
 575 struct rcadata {
 576         rca_ticket_t ticket;
 577         int subscribed;
 578         rs_event_code_t ec;
 579 };
 580 static struct rcadata rd[RCA_EVENTS] = {
 581         {0, 0, ec_node_unavailable},
 582         {0, 0, ec_node_available},
 583         {0, 0, ec_node_failed}
 584 };
 585
 586 /* thread for receiving rca events */
 587 int
 588 kgnilnd_rca(void *arg)
 589 {
 590         int        i, rc;
 591         int        retry_count;
 592         rs_event_t event;
 593         lnet_nid_t nid;
 594
 595         cfs_daemonize("kgnilnd_rca");
 596         cfs_block_allsigs();
 597
 598         /* all gnilnd threads need to run fairly urgently */
 599         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
 600
 601         /*
 602          * Register our service with RCA and subscribe to events
 603          * of interest.
 604          */
 605         rca_krt = KRCA_NULL_TICKET;
 606         rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
 607         if (rc < 0) {
 608                 CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
 609                 goto done;
 610         }
 611
 612         for (i = 0; i < RCA_EVENTS; i++) {
 613                 retry_count = 0;
 614 subscribe_retry:
 615                 rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
 616                                     &rd[i].ticket);
 617
 618                 if ((rc == -EINTR) && !retry_count) {
 619                         retry_count++;
 620                         CNETERR("krca_subscribe returned %d - retrying\n", rc);
 621                         goto subscribe_retry;
 622                 }
 623
 624                 if (rc < 0) {
 625                         CNETERR("rca subscription failed (%d)\n", rc);
 626                         goto done;
 627                 }
 628
 629                 rd[i].subscribed = 1;
 630         }
 631
 632         while (!kgnilnd_data.kgn_shutdown) {
 633                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
 634                         KGNILND_SPIN_QUIESCE;
 635                 }
 636                 /* wait here for a subscribed event */
 637                 rc = krca_wait_event(&rca_krt);
 638
 639                 /* RCA return values:
 640                  * 0 indicates krca_wakeup_wait_event caused krca_wait_event
 641                  *   return.
 642                  * -ERESTARTSYS indicates krca_wait_event returned because of a
 643                  *   signal.
 644                  * -ENOSPC indicates no space available to create an rcad_reg_t
 645                  * 1 indicates a message is waiting.
 646                  */
 647                 if (rc <= 0) {
 648                         continue;
 649                 }
 650
 651                 if (krca_get_message(&rca_krt, &event) == 0) {
 652                         int node_down = GNILND_RCA_NODE_UNKNOWN;
 653                         rs_state_t state;
 654                         CFS_LIST_HEAD(zombies);
 655
 656                         /* Compute nodes don't care about other compute nodes
 657                          * so we don't need to create a peer.
 658                          */
 659                         if (GNILND_COMPUTE &&
 660                             !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 661                                         IS_SVC)) {
 662                                 continue;
 663                         }
 664
 665                         /* Only care about compute and service nodes not GPUs */
 666                         if (RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 667                                         TYPE) != rt_node) {
 668                                 continue;
 669                         }
 670
 671                         switch (event.ev_id) {
 672                         case ec_node_available:
 673                                 CDEBUG(D_INFO, "ec_node_available\n");
 674                                 node_down = GNILND_RCA_NODE_UP;
 675                                 break;
 676                         case ec_node_failed:
 677                                 CDEBUG(D_INFO, "ec_node_failed\n");
 678                                 if (event.ev_len > 0) {
 679                                         CDEBUG(D_ERROR,
 680                                                 "ec_node_failed ignored\n");
 681                                         break;
 682                                 }
 683                                 node_down = GNILND_RCA_NODE_DOWN;
 684                                 break;
 685                         case ec_node_unavailable:
 686                                 state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);
 687
 688                                 CDEBUG(D_INFO, "ec_node_unavailable\n");
 689
 690                                 /*
 691                                  * Ignore overloaded ec_node_unavailable events
 692                                  * generated by 'xtcli set_reserve'.
 693                                  */
 694                                 if (RS_GET_CS_STATE(state) == RS_CS_READY) {
 695                                         CDEBUG(D_INFO, "ignoring "
 696                                                 "ec_node_unavailable event with"
 697                                                 " RS_CS_READY state\n");
 698                                         break;
 699                                 }
 700                                 node_down = GNILND_RCA_NODE_DOWN;
 701                                 break;
 702                         default:
 703                                 CDEBUG(D_INFO, "unknown event\n");
 704                                 break;
 705                         }
 706
 707                         /* if we get an event we don't know about, just go ahead
 708                          * and wait for another event */
 709                         if (node_down == GNILND_RCA_NODE_UNKNOWN) {
 710                                 continue;
 711                         }
 712
 713                         nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 714                                           NID);
 715                         CDEBUG(D_INFO,"kgnilnd_rca() reporting nid %d %s\n",
 716                                (int)nid, node_down ? "down" : "up");
 717                         kgnilnd_report_node_state(nid, node_down);
 718
 719                 } else {
 720                         CNETERR("krca_get_message failed\n");
 721                 }
 722         }
 723
 724 done:
 725         CDEBUG(D_INFO, "done\n");
 726
 727         for (i = 0; i < RCA_EVENTS; i++) {
 728                 if (rd[i].subscribed) {
 729                         rc = krca_unsubscribe(&rca_krt, rd[i].ticket);
 730
 731                         if (rc) {
 732                                 CNETERR("rca unsubscribe failed (%d)\n", rc);
 733                         }
 734
 735                         rd[i].subscribed = 0;
 736                 }
 737         }
 738
 739         krca_unregister(&rca_krt);
 740         kgnilnd_thread_fini();
 741         return 0;
 742
 743 }
 744
 745 int
 746 kgnilnd_start_rca_thread(void)
 747 {
 748         return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
 749 }
 750
 751 void
 752 kgnilnd_wakeup_rca_thread(void)
 753 {
 754         int ret;
 755
 756         ret = krca_wakeup_wait_event(&rca_krt);
 757
 758         if (ret) {
 759                 CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
 760         }
 761 }
 762
 763 #else /* GNILND_USE_RCA */
 764
 765 int
 766 kgnilnd_start_rca_thread(void)
 767 {
 768         return 0;
 769 }
 770
 771 void
 772 kgnilnd_wakeup_rca_thread(void)
 773 {
 774 }
 775
 776 #endif /* GNILND_USE_RCA */