lnet/klnds/gnilnd/gnilnd_stack.c

   1 /*
   2  * Copyright (C) 2012 Cray, Inc.
   3  *
   4  * Copyright (c) 2014, Intel Corporation.
   5  *
   6  *   Author: Nic Henke <nic@cray.com>
   7  *
   8  *   This file is part of Lustre, http://www.lustre.org.
   9  *
  10  *   Lustre is free software; you can redistribute it and/or
  11  *   modify it under the terms of version 2 of the GNU General Public
  12  *   License as published by the Free Software Foundation.
  13  *
  14  *   Lustre is distributed in the hope that it will be useful,
  15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  *   GNU General Public License for more details.
  18  *
  19  *   You should have received a copy of the GNU General Public License
  20  *   along with Lustre; if not, write to the Free Software
  21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22  *
  23  */
  24 #include "gnilnd.h"
  25 #if defined(GNILND_USE_RCA)
  26 #include <rsms/rs_sm_states.h>
  27 #endif
  28 /* Advance all timeouts by nap_time seconds. */
  29 void
  30 kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
  31 {
  32         int                     i;
  33         kgn_peer_t             *peer;
  34         kgn_conn_t             *conn;
  35         kgn_tx_t               *tx;
  36         kgn_device_t           *dev;
  37         kgn_dgram_t            *dgram;
  38
  39         CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time);
  40
  41         LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
  42                  atomic_read(&kgnilnd_data.kgn_nquiesce),
  43                  atomic_read(&kgnilnd_data.kgn_nthreads));
  44
  45         /* requiring that the threads are paused ensures a couple of things:
  46          * - combined code paths for stack reset and quiesce event as stack reset
  47          *   runs with the threads paused
  48          * - prevents traffic to the Gemini during a quiesce period
  49          * - reduces the locking requirements
  50         */
  51
  52         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
  53                 list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
  54
  55                         /* we can reconnect again at any time */
  56                         peer->gnp_reconnect_time = jiffies;
  57                         /* reset now that network is healthy */
  58                         peer->gnp_reconnect_interval = 0;
  59                         /* tell LNet dude is still alive */
  60                         kgnilnd_peer_alive(peer);
  61                         kgnilnd_peer_notify(peer, 0, 1);
  62
  63                         list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
  64                                 tx->tx_qtime = jiffies;
  65                         }
  66
  67                         list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
  68                                 unsigned long           timeout;
  69
  70                                 timeout = cfs_time_seconds(conn->gnc_timeout);
  71
  72                                 /* bump last_rx/last_rx_cq on all conns - including
  73                                  * closed ones, this will have the effect of
  74                                  * bumping the purgatory timers for those */
  75                                 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
  76
  77                                 /* we don't timeout based on old gnc_last_tx, so
  78                                  * we'll back it up and schedule the conn to trigger
  79                                  * a NOOP */
  80                                 conn->gnc_last_tx = jiffies - timeout;
  81                                 if (conn->gnc_state != GNILND_CONN_DONE)
  82                                 kgnilnd_schedule_conn(conn);
  83                         }
  84                 }
  85         }
  86
  87         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
  88                 dev = &kgnilnd_data.kgn_devices[i];
  89                 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
  90                         list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) {
  91                                 dgram->gndg_post_time = jiffies;
  92                         }
  93                 }
  94         }
  95 }
  96
  97 /* Quiesce or wake up the stack.  The caller must hold the kgn_quiesce_sem semaphore
  98  * on entry, which holds off any pending stack shutdown.   */
  99 void
 100 kgnilnd_quiesce_wait(char *reason)
 101 {
 102         int             i;
 103
 104         if (kgnilnd_data.kgn_quiesce_trigger) {
 105                 unsigned long   quiesce_deadline, quiesce_to;
 106                 /* FREEZE TAG!!!! */
 107
 108                 /* morning sunshine */
 109                 spin_lock(&kgnilnd_data.kgn_reaper_lock);
 110                 wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
 111                 spin_unlock(&kgnilnd_data.kgn_reaper_lock);
 112
 113                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 114                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
 115
 116                         wake_up_all(&dev->gnd_waitq);
 117                         wake_up_all(&dev->gnd_dgram_waitq);
 118                         wake_up_all(&dev->gnd_dgping_waitq);
 119                 }
 120
 121                 kgnilnd_wakeup_rca_thread();
 122
 123                 /* we'll wait for 10x the timeout for the threads to pause */
 124                 quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
 125                 quiesce_deadline = (long) jiffies + quiesce_to;
 126
 127                 LCONSOLE_INFO("Quiesce start: %s\n", reason);
 128                 /* wait for everyone to check-in as quiesced */
 129                 while (!GNILND_IS_QUIESCED) {
 130                         CDEBUG(D_INFO,
 131                                  "%s: Waiting for %d threads to pause\n",
 132                                  reason,
 133                                  atomic_read(&kgnilnd_data.kgn_nthreads) -
 134                                  atomic_read(&kgnilnd_data.kgn_nquiesce));
 135                         CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
 136                         set_current_state(TASK_UNINTERRUPTIBLE);
 137                         schedule_timeout(cfs_time_seconds(1 * i));
 138
 139                         LASSERTF(quiesce_deadline > jiffies,
 140                                  "couldn't quiesce threads in %lu seconds, falling over now\n",
 141                                  cfs_duration_sec(quiesce_to));
 142                 }
 143
 144                 CDEBUG(D_INFO, "%s: All threads paused!\n", reason);
 145                 /* XXX Nic: Is there a set of counters we can grab here to
 146                  * ensure that there is no traffic until quiesce is over ?*/
 147         } else {
 148                 LCONSOLE_INFO("Quiesce complete: %s\n", reason);
 149
 150                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 151                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
 152                         kgnilnd_schedule_dgram(dev);
 153                 }
 154
 155                 /* wait for everyone to check-in as running - they will be spinning
 156                  * and looking, so no need to poke any waitq */
 157                 while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
 158                         CDEBUG(D_INFO,
 159                                  "%s: Waiting for %d threads to wake up\n",
 160                                   reason,
 161                                   atomic_read(&kgnilnd_data.kgn_nquiesce));
 162                         set_current_state(TASK_UNINTERRUPTIBLE);
 163                         schedule_timeout(cfs_time_seconds(1 * i));
 164                 }
 165
 166                 CDEBUG(D_INFO, "%s: All threads awake!\n", reason);
 167         }
 168 }
 169
 170 /* Reset the stack.  */
 171 void
 172 kgnilnd_reset_stack(void)
 173 {
 174         int              i, rc = 0;
 175         kgn_net_t       *net;
 176         kgn_peer_t      *peer, *peerN;
 177         LIST_HEAD        (souls);
 178         char            *reason = "critical hardware error";
 179         __u32            seconds;
 180         unsigned long    start, end;
 181         ENTRY;
 182
 183         /* Race with del_peer and its atomics */
 184         CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
 185
 186         if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
 187                 CERROR("can't reset the stack, gnilnd is not initialized\n");
 188                 RETURN_EXIT;
 189         }
 190
 191         /* First make sure we are not already quiesced - we panic if so,
 192          * as that could leave software in a bad state */
 193         LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
 194                 "can't reset the stack, already doing so: trigger %d\n",
 195                  kgnilnd_data.kgn_quiesce_trigger);
 196
 197         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
 198
 199         /* wake up the dgram waitq thread - but after trigger set to make sure it
 200          * goes into quiesce */
 201         CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
 202         /* same for scheduler that is dropping state transitiosn */
 203         CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
 204         CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
 205
 206         kgnilnd_quiesce_wait(reason);
 207
 208         start = jiffies;
 209
 210         kgnilnd_data.kgn_in_reset = 1;
 211         kgnilnd_data.kgn_nresets++;
 212         LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
 213                       reason, kgnilnd_data.kgn_nresets);
 214
 215         for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
 216                 list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
 217                         rc = kgnilnd_cancel_net_dgrams(net);
 218                         LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
 219                 }
 220         }
 221
 222         /* error -ENOTRECOVERABLE is stack reset */
 223         kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
 224
 225         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 226                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 227                 kgnilnd_cancel_wc_dgrams(dev);
 228                 kgnilnd_wait_for_canceled_dgrams(dev);
 229         }
 230
 231         /* manually do some conn processing ala kgnilnd_process_conns */
 232         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 233                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 234                 kgn_conn_t      *conn;
 235                 int              conn_sched;
 236
 237                 /* go find all the closed conns that need to be nuked - the
 238                  * scheduler thread isn't running to do this for us */
 239
 240                 CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
 241                         kgnilnd_count_list(&dev->gnd_ready_conns));
 242
 243                 /* use while/list_first_entry loop to ensure we can handle any
 244                  * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
 245                 while (!list_empty(&dev->gnd_ready_conns)) {
 246                         conn = list_first_entry(&dev->gnd_ready_conns,
 247                                                 kgn_conn_t, gnc_schedlist);
 248                         conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
 249
 250                         LASSERTF(conn_sched != GNILND_CONN_IDLE &&
 251                                  conn_sched != GNILND_CONN_PROCESS,
 252                                  "conn %p on ready list but in bad state: %d\n",
 253                                  conn, conn_sched);
 254
 255                         list_del_init(&conn->gnc_schedlist);
 256
 257                         if (conn->gnc_state == GNILND_CONN_CLOSING) {
 258                                 /* bump to CLOSED to fake out send of CLOSE */
 259                                 conn->gnc_state = GNILND_CONN_CLOSED;
 260                                 conn->gnc_close_sent = 1;
 261                         }
 262
 263                         if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
 264                                 kgnilnd_destroy_conn_ep(conn);
 265                         } else {
 266                                 kgnilnd_complete_closed_conn(conn);
 267                         }
 268
 269                         /* there really shouldn't be any other states here -
 270                          * they would have been cleared out in the del_peer_or_conn or the dgram
 271                          * aborts above.
 272                          * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
 273                          * care of catching anything else for us */
 274
 275                         kgnilnd_schedule_process_conn(conn, -1);
 276
 277                         kgnilnd_conn_decref(conn);
 278                 }
 279         }
 280
 281         /* don't let the little weasily purgatory conns hide from us */
 282         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
 283                 list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
 284                         kgn_conn_t       *conn, *connN;
 285
 286                         list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
 287                                 kgnilnd_detach_purgatory_locked(conn, &souls);
 288                         }
 289                 }
 290         }
 291
 292         CDEBUG(D_NET, "about to release %d purgatory entries\n",
 293                 kgnilnd_count_list(&souls));
 294
 295         kgnilnd_release_purgatory_list(&souls);
 296
 297         /* validate we are now clean */
 298         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 299                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 300
 301                 /* now all the cons/mboxes should be cleaned up, including purgatory
 302                  * so go through and release the MDDs for our persistent PHYS fma_blks
 303                  */
 304                 kgnilnd_unmap_fma_blocks(dev);
 305
 306                 LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
 307                         "reset failed: fma blocks still live %d\n",
 308                         atomic_read(&dev->gnd_nfmablk));
 309
 310                 LASSERTF(atomic_read(&dev->gnd_neps) == 0,
 311                         "reset failed: EP handles still live %d\n",
 312                         atomic_read(&dev->gnd_neps));
 313         }
 314
 315         LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
 316                 "reset failed: conns left %d\n",
 317                 atomic_read(&kgnilnd_data.kgn_nconns));
 318
 319         /* fine to have peers left - they are waiting for new conns
 320          * but should not be holding any open HW resources */
 321
 322         /* like the last part of kgnilnd_base_shutdown() */
 323
 324         CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
 325
 326         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 327                 kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
 328         }
 329
 330         /* no need to free and recreate the TX descriptors
 331          * we nuked all the ones that could be using HW resources in
 332          * kgnilnd_close_matching_conns and asserted it worked in
 333          * kgnilnd_dev_fini */
 334
 335         /* At this point, all HW is torn down, start to reset */
 336
 337         /* only reset our known devs */
 338         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
 339                 kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
 340                 rc = kgnilnd_dev_init(dev);
 341                 LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
 342                 kgnilnd_map_phys_fmablk(dev);
 343                 LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
 344                 rc = kgnilnd_setup_wildcard_dgram(dev);
 345                 LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n",
 346                         i, rc);
 347         }
 348
 349         /* Now the fun restarts... - release the hounds! */
 350
 351         end = jiffies;
 352         seconds = cfs_duration_sec((long)end - start);
 353         kgnilnd_bump_timeouts(seconds, reason);
 354
 355         kgnilnd_data.kgn_in_reset = 0;
 356         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
 357         kgnilnd_quiesce_wait(reason);
 358         LCONSOLE_WARN("%s reset of all hardware resources\n",
 359                 rc ? "failed" : "successful");
 360
 361         RETURN_EXIT;
 362 }
 363
 364 /* A thread that handles quiece and reset hardware events.
 365  * We do the same thing regardless of which device reported the event. */
 366 int
 367 kgnilnd_ruhroh_thread(void *arg)
 368 {
 369         int                i = 1;
 370         DEFINE_WAIT(wait);
 371
 372         cfs_block_allsigs();
 373         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
 374         kgnilnd_data.kgn_ruhroh_running = 1;
 375
 376         while (1) {
 377
 378                 /* Block until there's a request..  A reset request could come in
 379                  * while we're handling a quiesce one, or vice versa.
 380                  * Keep processing requests until there are none.*/
 381                 prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
 382                 while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
 383                                 kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
 384                         schedule();
 385                 finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
 386
 387                /* Exit if the driver is shutting down. */
 388                 if (kgnilnd_data.kgn_ruhroh_shutdown)
 389                         break;
 390
 391                 /* Serialize with driver startup and shutdown. */
 392                 mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
 393
 394                CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
 395                         kgnilnd_data.kgn_quiesce_trigger,
 396                         kgnilnd_data.kgn_needs_reset,
 397                         kgnilnd_data.kgn_bump_info_rdy,
 398                         kgnilnd_data.kgn_needs_pause);
 399
 400                 /* Do we need to do a pause/quiesce? */
 401                 if (kgnilnd_data.kgn_needs_pause) {
 402
 403                         /* Pause all other kgnilnd threads. */
 404                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
 405                         kgnilnd_quiesce_wait("hardware quiesce");
 406
 407                         /* If the hardware quiesce flag is set, wait for it to clear.
 408                          * This should happen relatively quickly, so we wait for it.
 409                          * This will hold up the eventd thread, but on everything but
 410                          * the simulator, this is ok-- there is one thread per core.
 411                          *
 412                          * Handle (possibly multiple) quiesce events while we wait. The
 413                          * memory barrier ensures that the core doesn't start fetching
 414                          * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
 415                          * matches the second mb in kgnilnd_quiesce_end_callback(). */
 416                         smp_rmb();
 417                         while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
 418
 419                                 i++;
 420                                 CDEBUG(D_INFO, "Waiting for hardware quiesce "
 421                                                "flag to clear\n");
 422                                 set_current_state(TASK_UNINTERRUPTIBLE);
 423                                 schedule_timeout(cfs_time_seconds(1 * i));
 424
 425                                 /* If we got a quiesce event with bump info, DO THE BUMP!. */
 426                                 if (kgnilnd_data.kgn_bump_info_rdy) {
 427                                         /* reset console rate limiting for each event */
 428                                         i = 1;
 429
 430                                         /* Make sure the core doesn't start fetching
 431                                          * kgni_quiesce_seconds until after it sees
 432                                          * kgn_bump_info_rdy set.  This is the match to the
 433                                          * first mb in kgnilnd_quiesce_end_callback(). */
 434                                         smp_rmb();
 435                                         (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
 436                                                                "hardware quiesce callback");
 437                                         set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
 438                                         set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
 439                                 }
 440                       }
 441
 442                         /* Reset the kgn_needs_pause flag before coming out of
 443                          * the pause.  This ordering avoids a race with the
 444                          * setting of this flag in kgnilnd_pause_threads().  */
 445                         set_mb(kgnilnd_data.kgn_needs_pause, 0);
 446
 447                         /* ok, let the kids back into the pool */
 448                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
 449                         kgnilnd_quiesce_wait("hardware quiesce");
 450                 }
 451
 452                 /* Do a stack reset if needed. */
 453                 if (kgnilnd_data.kgn_needs_reset) {
 454                         kgnilnd_reset_stack();
 455                         set_mb(kgnilnd_data.kgn_needs_reset, 0);
 456                 }
 457
 458                 mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
 459         }
 460
 461         kgnilnd_data.kgn_ruhroh_running = 0;
 462         return 0;
 463 }
 464
 465 /* Set pause request flag.  Any functions that
 466  * call this one are responsible for ensuring that
 467  * variables they set up are visible on other cores before
 468  * this flag setting.  This executes in interrupt or kernel
 469  * thread context.  */
 470 void
 471 kgnilnd_pause_threads(void)
 472 {
 473         /* only device 0 gets the handle, see kgnilnd_dev_init */
 474         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
 475         LASSERTF(dev != NULL, "dev 0 is NULL\n");
 476
 477         /* If we're currently in a pause triggered by the pause flag,
 478          * there's no need to set it again.  We clear the kgn_needs_pause
 479          * flag before we reset kgn_quiesce_trigger to avoid a race.  The
 480          * read memory barrier matches the setmb() on the trigger in
 481          * kgnilnd_ruhroh_task().                                       */
 482         smp_rmb();
 483         if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
 484                         GNILND_IS_QUIESCED)) {
 485                  CDEBUG(D_NET, "requesting thread pause\n");
 486
 487                 kgnilnd_data.kgn_needs_pause = 1;
 488
 489                 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
 490         } else {
 491             CDEBUG(D_NET, "thread pause already underway\n");
 492         }
 493 }
 494
 495 /* Return non-zero if the GNI hardware quiesce flag is set */
 496 int
 497 kgnilnd_hw_in_quiesce(void)
 498 {
 499         /* only device 0 gets the handle, see kgnilnd_dev_init */
 500         kgn_device_t      *dev0 = &kgnilnd_data.kgn_devices[0];
 501
 502         LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
 503
 504         smp_rmb();
 505         return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
 506 }
 507
 508
 509 /* If the GNI hardware quiesce flag is set, initiate our pause and
 510  * return non-zero.  Also return non-zero if the stack is shutting down. */
 511 int
 512 kgnilnd_check_hw_quiesce(void)
 513 {
 514         if (likely(!kgnilnd_hw_in_quiesce()))
 515                 return 0;
 516
 517         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 518                 CDEBUG(D_NET, "initiating thread pause\n");
 519                 kgnilnd_pause_threads();
 520         } else {
 521                 CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
 522         }
 523
 524         return 1;
 525 }
 526
 527 /* Callback from kngi with the quiesce duration.  This executes
 528  * in interrupt context.                                        */
 529 void
 530 kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
 531 {
 532         /* only device 0 gets the handle, see kgnilnd_dev_init */
 533         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
 534         LASSERTF(dev != NULL, "dev 0 is NULL\n");
 535
 536         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 537
 538                 CDEBUG(D_NET, "requesting timeout bump by %lld msecs\n", msecs);
 539
 540                 /* Save the bump interval and request the bump.
 541                  * The memory barrier ensures that the interval is in place before
 542                  * the bump flag can be seen (in case a core is already running the
 543                  * ruhroh task), and that the bump request flag in place before
 544                  * the pause request can be seen (to ensure a core doesn't miss the bump
 545                  * request flag).       */
 546                 /* If another callback occurred before the ruhroh task
 547                  * finished processing the first bump request, we'd over-write its info.
 548                  * Nic says that callbacks occur so slowly that this isn't an issue.    */
 549                 set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
 550                 set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
 551                 kgnilnd_pause_threads();
 552         } else {
 553                 CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
 554         }
 555 }
 556
 557 void
 558 kgnilnd_critical_error(struct gni_err *err_handle)
 559 {
 560         /* only device 0 gets the handle, see kgnilnd_dev_init */
 561         kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
 562         LASSERTF(dev != NULL, "dev 0 is NULL\n");
 563
 564         if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 565                 CDEBUG(D_NET, "requesting stack reset\n");
 566                 kgnilnd_data.kgn_needs_reset = 1;
 567                 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
 568         } else {
 569                 CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
 570         }
 571 }
 572
 573 #if defined(GNILND_USE_RCA)
 574 #include <krca_lib.h>
 575 #define RCA_EVENTS 3
 576 /* RCA ticket is needed for krca_wakeup_wait_event() */
 577 static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
 578 struct rcadata {
 579         rca_ticket_t ticket;
 580         int subscribed;
 581         rs_event_code_t ec;
 582 };
 583 static struct rcadata rd[RCA_EVENTS] = {
 584         {0, 0, ec_node_unavailable},
 585         {0, 0, ec_node_available},
 586         {0, 0, ec_node_failed}
 587 };
 588
 589 /* thread for receiving rca events */
 590 int
 591 kgnilnd_rca(void *arg)
 592 {
 593         int        i, rc;
 594         int        retry_count;
 595         rs_event_t event;
 596         lnet_nid_t nid;
 597
 598         cfs_block_allsigs();
 599
 600         /* all gnilnd threads need to run fairly urgently */
 601         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
 602
 603         /*
 604          * Register our service with RCA and subscribe to events
 605          * of interest.
 606          */
 607         rca_krt = KRCA_NULL_TICKET;
 608         rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
 609         if (rc < 0) {
 610                 CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
 611                 goto done;
 612         }
 613
 614         for (i = 0; i < RCA_EVENTS; i++) {
 615                 retry_count = 0;
 616 subscribe_retry:
 617                 rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
 618                                     &rd[i].ticket);
 619
 620                 if ((rc == -EINTR) && !retry_count) {
 621                         retry_count++;
 622                         CNETERR("krca_subscribe returned %d - retrying\n", rc);
 623                         goto subscribe_retry;
 624                 }
 625
 626                 if (rc < 0) {
 627                         CNETERR("rca subscription failed (%d)\n", rc);
 628                         goto done;
 629                 }
 630
 631                 rd[i].subscribed = 1;
 632         }
 633
 634         while (!kgnilnd_data.kgn_shutdown) {
 635                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
 636                         KGNILND_SPIN_QUIESCE;
 637                 }
 638                 /* wait here for a subscribed event */
 639                 rc = krca_wait_event(&rca_krt);
 640
 641                 /* RCA return values:
 642                  * 0 indicates krca_wakeup_wait_event caused krca_wait_event
 643                  *   return.
 644                  * -ERESTARTSYS indicates krca_wait_event returned because of a
 645                  *   signal.
 646                  * -ENOSPC indicates no space available to create an rcad_reg_t
 647                  * 1 indicates a message is waiting.
 648                  */
 649                 if (rc <= 0) {
 650                         continue;
 651                 }
 652
 653                 if (krca_get_message(&rca_krt, &event) == 0) {
 654                         int node_down = GNILND_PEER_UNKNOWN;
 655                         rs_state_t state;
 656                         LIST_HEAD(zombies);
 657
 658                         /* Compute nodes don't care about other compute nodes
 659                          * so we don't need to create a peer.
 660                          */
 661                         if (GNILND_COMPUTE &&
 662                             !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 663                                         IS_SVC)) {
 664                                 continue;
 665                         }
 666
 667                         /* Only care about compute and service nodes not GPUs */
 668                         if (!(RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 669                                         TYPE) == rt_node ||
 670                              RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 671                                         TYPE) == rt_accel)) {
 672                                                 continue;
 673                         }
 674
 675                         switch (event.ev_id) {
 676                         case ec_node_available:
 677                                 CDEBUG(D_INFO, "ec_node_available\n");
 678                                 node_down = GNILND_PEER_UP;
 679                                 break;
 680                         case ec_node_failed:
 681                                 CDEBUG(D_INFO, "ec_node_failed\n");
 682                                 if (event.ev_len > 0) {
 683                                         CDEBUG(D_ERROR,
 684                                                 "ec_node_failed ignored\n");
 685                                         break;
 686                                 }
 687                                 node_down = GNILND_PEER_DOWN;
 688                                 break;
 689                         case ec_node_unavailable:
 690                                 state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);
 691
 692                                 CDEBUG(D_INFO, "ec_node_unavailable\n");
 693
 694                                 /*
 695                                  * Ignore overloaded ec_node_unavailable events
 696                                  * generated by 'xtcli set_reserve'.
 697                                  */
 698                                 if (RS_GET_CS_STATE(state) == RS_CS_READY) {
 699                                         CDEBUG(D_INFO, "ignoring "
 700                                                 "ec_node_unavailable event with"
 701                                                 " RS_CS_READY state\n");
 702                                         break;
 703                                 }
 704                                 node_down = GNILND_PEER_DOWN;
 705                                 break;
 706                         default:
 707                                 CDEBUG(D_INFO, "unknown event\n");
 708                                 break;
 709                         }
 710
 711                         /* if we get an event we don't know about, just go ahead
 712                          * and wait for another event */
 713                         if (node_down == GNILND_PEER_UNKNOWN)
 714                                 continue;
 715
 716                         nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
 717                                           NID);
 718                         CDEBUG(D_INFO,"kgnilnd_rca() reporting nid %d %s\n",
 719                                (int)nid, node_down ? "down" : "up");
 720                         kgnilnd_report_node_state(nid, node_down);
 721
 722                 } else {
 723                         CNETERR("krca_get_message failed\n");
 724                 }
 725         }
 726
 727 done:
 728         CDEBUG(D_INFO, "done\n");
 729
 730         for (i = 0; i < RCA_EVENTS; i++) {
 731                 if (rd[i].subscribed) {
 732                         rc = krca_unsubscribe(&rca_krt, rd[i].ticket);
 733
 734                         if (rc) {
 735                                 CNETERR("rca unsubscribe failed (%d)\n", rc);
 736                         }
 737
 738                         rd[i].subscribed = 0;
 739                 }
 740         }
 741
 742         krca_unregister(&rca_krt);
 743         kgnilnd_thread_fini();
 744         return 0;
 745
 746 }
 747
 748 int
 749 kgnilnd_start_rca_thread(void)
 750 {
 751         return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
 752 }
 753
 754 void
 755 kgnilnd_wakeup_rca_thread(void)
 756 {
 757         int ret;
 758
 759         ret = krca_wakeup_wait_event(&rca_krt);
 760
 761         if (ret) {
 762                 CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
 763         }
 764 }
 765
 766 int
 767 kgnilnd_get_node_state(__u32 nid)
 768 {
 769         int i;
 770         int rc = GNILND_PEER_UNKNOWN;
 771         int ret;
 772         rs_node_array_t nlist;
 773         rs_node_t       *na = NULL;
 774
 775         if ((ret = krca_get_sysnodes(&nlist)) < 0) {
 776                 CDEBUG(D_NETERROR, "krca_get_sysnodes failed %d\n", ret);
 777                 goto ns_done;
 778         }
 779
 780         na = nlist.na_ids;
 781
 782         for (i = 0; i < nlist.na_len; i++) {
 783                 if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) {
 784                         rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ?
 785                                 GNILND_PEER_UP : GNILND_PEER_DOWN;
 786                         break;
 787                 }
 788         }
 789
 790 ns_done:
 791         kfree(na);
 792         CDEBUG(D_NET, "nid %d rc %d (0=up)\n", nid, rc);
 793         return rc;
 794 }
 795
 796 #else /* GNILND_USE_RCA */
 797
/*
 * Variant compiled when GNILND_USE_RCA is not defined: no thread is
 * started and 0 is always returned.
 */
int
kgnilnd_start_rca_thread(void)
{
        return 0;
}
 803
/*
 * Variant compiled when GNILND_USE_RCA is not defined: intentionally a
 * no-op.
 */
void
kgnilnd_wakeup_rca_thread(void)
{
}
 808
/*
 * Variant compiled when GNILND_USE_RCA is not defined: @nid is not
 * consulted and every node is reported as GNILND_PEER_UP.
 */
int
kgnilnd_get_node_state(__u32 nid)
{
        return GNILND_PEER_UP;
}
 814 #endif /* GNILND_USE_RCA */