lnet/klnds/gnilnd/gnilnd_conn.c

   1 /*
   2  * Copyright (C) 2012 Cray, Inc.
   3  *
   4  * Copyright (c) 2014, Intel Corporation.
   5  *
   6  *   Author: Nic Henke <nic@cray.com>
   7  *   Author: James Shimek <jshimek@cray.com>
   8  *
   9  *   This file is part of Lustre, http://www.lustre.org.
  10  *
  11  *   Lustre is free software; you can redistribute it and/or
  12  *   modify it under the terms of version 2 of the GNU General Public
  13  *   License as published by the Free Software Foundation.
  14  *
  15  *   Lustre is distributed in the hope that it will be useful,
  16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  *   GNU General Public License for more details.
  19  *
  20  *   You should have received a copy of the GNU General Public License
  21  *   along with Lustre; if not, write to the Free Software
  22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  *
  24  */
  25
  26 #include "gnilnd.h"
  27
  28 void
  29 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
  30 {
  31         smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
  32         smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
  33         smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
  34 }
  35
  36 int
  37 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
  38 {
  39         gni_return_t            rrc;
  40         __u32                   flags = GNI_MEM_READWRITE;
  41
  42         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
  43                 flags |= GNI_MEM_PHYS_CONT;
  44         }
  45
  46         fma_blk->gnm_hold_timeout = 0;
  47
  48         /* make sure we are mapping a clean block */
  49         LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
  50
  51         rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
  52                                    fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
  53                                    flags, &fma_blk->gnm_hndl);
  54         if (rrc != GNI_RC_SUCCESS) {
  55                 /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
  56                  * -- like when under MDD or GART pressure on big systems
  57                  */
  58                 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
  59                         fma_blk, fma_blk->gnm_mbox_size, flags);
  60                 RETURN(-ENOMEM);
  61         }
  62
  63         /* PHYS_CONT memory isn't really mapped, at least not in GART -
  64          *  but all mappings chew up a MDD
  65          */
  66         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
  67                 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
  68         }
  69
  70         atomic_inc(&device->gnd_n_mdd);
  71         /* nfmablk is live (mapped) blocks */
  72         atomic_inc(&device->gnd_nfmablk);
  73
  74         RETURN(0);
  75 }
  76
  77 int
  78 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
  79 {
  80         int                     rc = 0;
  81         int                     num_mbox;
  82         kgn_fma_memblock_t     *fma_blk;
  83         gni_smsg_attr_t         smsg_attr;
  84         unsigned long           fmablk_vers;
  85
  86 #if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
  87         /* We allocate large blocks of memory here potentially leading
  88          * to memory exhaustion during massive reconnects during a network
  89          * outage. Limit the amount of fma blocks to use by always keeping
  90          * a percent of pages free initially set to 25% of total memory. */
  91         if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
  92                 LCONSOLE_INFO("Exceeding free page limit of %ld. "
  93                               "Free pages available %ld\n",
  94                               kgnilnd_data.free_pages_limit,
  95                               global_page_state(NR_FREE_PAGES));
  96                 return -ENOMEM;
  97         }
  98 #endif
  99         /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
 100          * to this allocation code. Everyone will sample the version
 101          * before and after getting the mutex. If it has changed,
 102          * we'll bail out to check the lists again - this indicates that
 103          * some sort of change was made to the lists and it is possible
 104          * that there is a mailbox for us to find now. This should prevent
 105          * a ton of spinning in the case where there are lots of threads
 106          * that need a yet-to-be-allocated mailbox for a connection. */
 107
 108         fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
 109         mutex_lock(&device->gnd_fmablk_mutex);
 110
 111         if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
 112                 /* version changed while we were waiting for semaphore,
 113                  * we'll recheck the lists assuming something nice happened */
 114                 mutex_unlock(&device->gnd_fmablk_mutex);
 115                 return 0;
 116         }
 117
 118         LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
 119         if (fma_blk == NULL) {
 120                 CNETERR("could not allocate fma block descriptor\n");
 121                 rc = -ENOMEM;
 122                 GOTO(out, rc);
 123         }
 124
 125         INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
 126
 127         kgnilnd_setup_smsg_attr(&smsg_attr);
 128
 129         gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
 130
 131         LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
 132
 133         /* gni_smsg_buff_size_needed calculates the base mailbox size and since
 134          * we want to hold kgn_peer_credits worth of messages in both directions,
 135          * we add PAYLOAD to grow the mailbox size
 136          */
 137
 138         fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
 139
 140         /* we'll only use physical during preallocate at startup -- this keeps it nice and
 141          * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
 142          * as reallocating them is tough if there is memory fragmentation */
 143
 144         if (use_phys) {
 145                 fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
 146                 if (fma_blk->gnm_block == NULL) {
 147                         CNETERR("could not allocate physical SMSG mailbox memory\n");
 148                         rc = -ENOMEM;
 149                         GOTO(free_desc, rc);
 150                 }
 151                 fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
 152                 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
 153
 154                 LASSERTF(num_mbox >= 1,
 155                          "num_mbox %d blk_size %u mbox_size %d\n",
 156                           num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
 157
 158                 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
 159
 160         } else {
 161                 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
 162                 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
 163
 164                 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
 165                          "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
 166                          num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
 167                          *kgnilnd_tunables.kgn_mbox_per_block);
 168
 169                 fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
 170                 if (fma_blk->gnm_block == NULL) {
 171                         CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
 172                         rc = -ENOMEM;
 173                         GOTO(free_desc, rc);
 174                 }
 175
 176                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
 177         }
 178
 179         /* allocate just enough space for the bits to track the mailboxes */
 180         LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
 181         if (fma_blk->gnm_bit_array == NULL) {
 182                 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
 183                        sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
 184                 rc = -ENOMEM;
 185                 GOTO(free_blk, rc);
 186         }
 187         bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
 188
 189         /* now that the num_mbox is set based on allocation type, get debug info setup */
 190         LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
 191         if (fma_blk->gnm_mbox_info == NULL) {
 192                 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
 193                        sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
 194                 rc = -ENOMEM;
 195                 GOTO(free_bit, rc);
 196         }
 197
 198         rc = kgnilnd_map_fmablk(device, fma_blk);
 199         if (rc) {
 200                 GOTO(free_info, rc);
 201         }
 202
 203         fma_blk->gnm_next_avail_mbox = 0;
 204         fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
 205
 206         CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
 207                 "mbox_size %d MDD "LPX64"."LPX64"\n",
 208                 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
 209                 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
 210                 fma_blk->gnm_hndl.qword2);
 211
 212         /* lock Is protecting data structures, not semaphore */
 213
 214         spin_lock(&device->gnd_fmablk_lock);
 215         list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
 216
 217         /* toggle under the lock so once they change the list is also
 218          * ready for others to traverse */
 219         atomic_inc(&device->gnd_fmablk_vers);
 220
 221         spin_unlock(&device->gnd_fmablk_lock);
 222
 223         mutex_unlock(&device->gnd_fmablk_mutex);
 224
 225         return 0;
 226
 227 free_info:
 228         LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
 229 free_bit:
 230         LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
 231 free_blk:
 232         if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
 233                 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
 234         } else {
 235                 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
 236         }
 237 free_desc:
 238         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
 239 out:
 240         mutex_unlock(&device->gnd_fmablk_mutex);
 241         return rc;
 242 }
 243
 244 void
 245 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 246 {
 247         gni_return_t            rrc;
 248
 249         /* if some held, set hold_timeout from conn timeouts used in this block
 250          * but not during shutdown, then just nuke and pave
 251          * During a stack reset, we need to deregister with a hold timeout
 252          * set so we don't use the same mdd after reset is complete */
 253         if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
 254             kgnilnd_data.kgn_in_reset) {
 255                 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
 256         }
 257
 258         /* we are changing the state of a block, tickle version to tell
 259          * proc code list is stale now */
 260         atomic_inc(&dev->gnd_fmablk_vers);
 261
 262         rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
 263
 264         CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
 265                "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
 266                 "hold_timeout %d\n",
 267                fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
 268                fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
 269                fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
 270                fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
 271
 272         LASSERTF(rrc == GNI_RC_SUCCESS,
 273                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
 274                 fma_blk, rrc);
 275
 276         if (fma_blk->gnm_hold_timeout &&
 277             !(kgnilnd_data.kgn_in_reset &&
 278               fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
 279                 atomic_inc(&dev->gnd_n_mdd_held);
 280         } else {
 281                 atomic_dec(&dev->gnd_n_mdd);
 282         }
 283
 284         /* PHYS blocks don't get mapped */
 285         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
 286                 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
 287                 fma_blk->gnm_state = GNILND_FMABLK_IDLE;
 288         } else if (kgnilnd_data.kgn_in_reset) {
 289                 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
 290                  * re-use the fma_blk after reset so we don't have to drop/allocate
 291                  * all of those physical blocks */
 292                 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
 293         }
 294
 295         /* Decrement here as this is the # of mapped blocks */
 296         atomic_dec(&dev->gnd_nfmablk);
 297 }
 298
 299
 300 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
 301 void
 302 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 303 {
 304         LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
 305                  "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
 306                  fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
 307                 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
 308
 309         atomic_inc(&dev->gnd_fmablk_vers);
 310
 311         if (fma_blk->gnm_hold_timeout) {
 312                 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
 313                         "mbox_size %d\n",
 314                         fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
 315                         fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
 316
 317                 /* We leave MDD dangling over stack reset */
 318                 if (!kgnilnd_data.kgn_in_reset) {
 319                         kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
 320                 }
 321                 /* ignoring the return code - if kgni/ghal can't find it
 322                  * it must be released already */
 323                 atomic_dec(&dev->gnd_n_mdd_held);
 324                 atomic_dec(&dev->gnd_n_mdd);
 325         }
 326
 327         /* we cant' free the gnm_block until all the conns have released their
 328          * purgatory holds. While we have purgatory holds, we might check the conn
 329          * RX mailbox during the CLOSING process. It is possible that kgni might
 330          * try to look into the RX side for credits when sending the CLOSE msg too */
 331         CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
 332                 fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
 333
 334         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
 335                 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
 336         } else {
 337                 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
 338         }
 339         fma_blk->gnm_state = GNILND_FMABLK_FREED;
 340
 341         list_del(&fma_blk->gnm_bufflist);
 342
 343         LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
 344         LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
 345         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
 346 }
 347
 348 void
 349 kgnilnd_find_free_mbox(kgn_conn_t *conn)
 350 {
 351         kgn_device_t            *dev = conn->gnc_device;
 352         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
 353         kgn_fma_memblock_t      *fma_blk;
 354         kgn_mbox_info_t         *mbox = NULL;
 355         int                     id;
 356
 357         spin_lock(&dev->gnd_fmablk_lock);
 358
 359         list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
 360                             gnm_bufflist) {
 361                 if (fma_blk->gnm_avail_mboxs <= 0 ||
 362                     fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
 363                         continue;
 364                 }
 365                 /* look in bitarray for available mailbox */
 366                 do {
 367                         id = find_next_zero_bit(
 368                                 fma_blk->gnm_bit_array,
 369                                 fma_blk->gnm_num_mboxs,
 370                                 fma_blk->gnm_next_avail_mbox);
 371                       if (id == fma_blk->gnm_num_mboxs &&
 372                           fma_blk->gnm_next_avail_mbox != 0) {
 373                                 /* wrap around */
 374                                 fma_blk->gnm_next_avail_mbox = 0;
 375                         } else {
 376                                 break;
 377                         }
 378                 } while (1);
 379
 380                 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
 381                          id, fma_blk->gnm_num_mboxs);
 382                 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
 383                 conn->gnc_mbox_id = id;
 384
 385                 fma_blk->gnm_next_avail_mbox =
 386                         (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
 387                 fma_blk->gnm_avail_mboxs--;
 388                 conn->gnc_fma_blk = fma_blk;
 389
 390                 kgnilnd_setup_smsg_attr(smsg_attr);
 391
 392                 smsg_attr->msg_buffer = fma_blk->gnm_block;
 393                 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
 394                 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
 395                 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
 396
 397                 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
 398                  * reset and re-use the same fma_blk after stack reset. This ensures we've
 399                  * properly mapped it before we use it */
 400                 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
 401                          fma_blk, fma_blk->gnm_state);
 402
 403                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
 404                         "allocating SMSG mbox %d buf %p "
 405                         "offset %u hndl "LPX64"."LPX64"\n",
 406                         conn, smsg_attr, fma_blk, id,
 407                         smsg_attr->msg_buffer, smsg_attr->mbox_offset,
 408                         fma_blk->gnm_hndl.qword1,
 409                         fma_blk->gnm_hndl.qword2);
 410
 411                 mbox = &fma_blk->gnm_mbox_info[id];
 412                 mbox->mbx_create_conn_memset = jiffies;
 413                 mbox->mbx_nallocs++;
 414                 mbox->mbx_nallocs_total++;
 415
 416                 /* zero mbox to remove any old data from our last use.
 417                  * this better be safe, if not our purgatory timers
 418                  * are too short or a peer really is misbehaving */
 419                 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
 420                        0, smsg_attr->buff_size);
 421                 break;
 422         }
 423
 424         spin_unlock(&dev->gnd_fmablk_lock);
 425 }
 426
 427 int
 428 kgnilnd_setup_mbox(kgn_conn_t *conn)
 429 {
 430         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
 431         int                      err = 0;
 432
 433         smsg_attr->msg_buffer = NULL;
 434         /* Look for available mbox */
 435         do {
 436                 kgnilnd_find_free_mbox(conn);
 437
 438                 /* nothing in the existing buffers, make a new one */
 439                 if (smsg_attr->msg_buffer == NULL) {
 440                         /* for runtime allocations, we only want vmalloc */
 441                         err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
 442                         if (err) {
 443                                 break;
 444                         }
 445                 }
 446         } while (smsg_attr->msg_buffer == NULL);
 447
 448         if (err)
 449                 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
 450                         conn, err);
 451         return err;
 452 }
 453
 454 void
 455 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
 456 {
 457         kgn_device_t           *dev = conn->gnc_device;
 458         gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
 459         kgn_fma_memblock_t     *fma_blk = NULL;
 460         kgn_mbox_info_t        *mbox = NULL;
 461         int                     found = 0;
 462         int                     id;
 463
 464         /* if we failed to setup mbox and now destroying conn */
 465         if (smsg_attr->msg_buffer == NULL) {
 466                 return;
 467         }
 468
 469         id = conn->gnc_mbox_id;
 470
 471         spin_lock(&dev->gnd_fmablk_lock);
 472         /* make sure our conn points at a valid fma_blk
 473          * We use this instead of a mem block search out of smsg_attr
 474          * because we could have freed a block for fma_blk #1 but the fma_blk
 475          * is still in the list for a purgatory hold. This would induce a false
 476          * match if that same block gets reallocated to fma_blk #2 */
 477         list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
 478                 if (fma_blk == conn->gnc_fma_blk) {
 479                         found = 1;
 480                         break;
 481                 }
 482         }
 483         LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
 484                  "anywhere in the world\n", conn, conn->gnc_fma_blk);
 485
 486         LASSERTF(id < fma_blk->gnm_num_mboxs,
 487                 "bad id %d max %d\n",
 488                 id, fma_blk->gnm_num_mboxs);
 489
 490         /* < 0 - was held, now free it
 491          * == 0 - just free it
 492          * > 0 - hold it for now */
 493         if (purgatory_hold == 0) {
 494                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
 495                         "hndl "LPX64"."LPX64"\n",
 496                         conn, smsg_attr, fma_blk, id,
 497                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
 498                 fma_blk->gnm_avail_mboxs++;
 499
 500         } else if (purgatory_hold > 0) {
 501                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
 502                         "hndl "LPX64"."LPX64"\n",
 503                         conn, smsg_attr, fma_blk, id,
 504                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
 505
 506                 fma_blk->gnm_held_mboxs++;
 507                 fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
 508                                                 conn->gnc_timeout);
 509         } else {
 510                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
 511                         "hndl "LPX64"."LPX64"\n",
 512                         conn, smsg_attr, fma_blk, id,
 513                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
 514
 515                 fma_blk->gnm_held_mboxs--;
 516                 fma_blk->gnm_avail_mboxs++;
 517         }
 518
 519         if (purgatory_hold <= 0) {
 520                 /* if kgni is retransmitting, freeing the smsg block before the EP
 521                  * is destroyed gets messy. Bug 768295. */
 522                 LASSERTF(conn->gnc_ephandle == NULL,
 523                          "can't release mbox before EP is nuked. conn 0x%p\n", conn);
 524
 525                 mbox = &fma_blk->gnm_mbox_info[id];
 526                 mbox->mbx_release_from_purgatory = jiffies;
 527
 528                 /* clear conn gnc_fmablk if it is gone - this allows us to
 529                  * not worry about state so much in kgnilnd_destroy_conn
 530                  * and makes the guaranteed cleanup of the resources easier */
 531                 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
 532                         "conn %p bit %d already cleared in fma_blk %p\n",
 533                          conn, id, fma_blk);
 534                 conn->gnc_fma_blk = NULL;
 535                 mbox->mbx_nallocs--;
 536         }
 537
 538         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
 539                 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
 540                        "as mapped\n", fma_blk);
 541                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
 542         }
 543
 544         /* we don't release or unmap PHYS blocks as part of the normal cycle --
 545          * those are controlled manually from startup/shutdown */
 546         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
 547                 /* we can unmap once all are unused (held or avail)
 548                  * but check hold_timeout to make sure we are not trying to double
 549                  * unmap this buffer. If there was no hold_timeout set due to
 550                  * held_mboxs, we'll free the mobx here shortly and won't have to
 551                  * worry about catching a double free for a 'clean' fma_blk */
 552                 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
 553                     (!fma_blk->gnm_hold_timeout)) {
 554                         kgnilnd_unmap_fmablk(dev, fma_blk);
 555                 }
 556
 557                 /* But we can only free once they are all avail */
 558                 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
 559                     fma_blk->gnm_held_mboxs == 0) {
 560                         /* all mailboxes are released, free fma_blk */
 561                         kgnilnd_free_fmablk_locked(dev, fma_blk);
 562                 }
 563         }
 564
 565         spin_unlock(&dev->gnd_fmablk_lock);
 566 }
 567
 568 int
 569 kgnilnd_count_phys_mbox(kgn_device_t *device)
 570 {
 571         int                     i = 0;
 572         kgn_fma_memblock_t     *fma_blk;
 573
 574         spin_lock(&device->gnd_fmablk_lock);
 575
 576         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
 577                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
 578                         i += fma_blk->gnm_num_mboxs;
 579         }
 580         spin_unlock(&device->gnd_fmablk_lock);
 581
 582         RETURN(i);
 583 }
 584
 585 int
 586 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
 587 {
 588         int     rc;
 589
 590         while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
 591
 592                 rc = kgnilnd_alloc_fmablk(device, 1);
 593                 if (rc) {
 594                         CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
 595                                 kgnilnd_count_phys_mbox(device), rc);
 596                         RETURN(rc);
 597                 }
 598         }
 599         RETURN(0);
 600 }
 601
 602 int
 603 kgnilnd_map_phys_fmablk(kgn_device_t *device)
 604 {
 605
 606         int                     rc = 0;
 607         kgn_fma_memblock_t     *fma_blk;
 608
 609         /* use mutex to gate access to single thread, just in case */
 610         mutex_lock(&device->gnd_fmablk_mutex);
 611
 612         spin_lock(&device->gnd_fmablk_lock);
 613
 614         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
 615                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
 616                         rc = kgnilnd_map_fmablk(device, fma_blk);
 617                         if (rc)
 618                                 break;
 619                 }
 620         }
 621         spin_unlock(&device->gnd_fmablk_lock);
 622
 623         mutex_unlock(&device->gnd_fmablk_mutex);
 624
 625         RETURN(rc);
 626 }
 627
 628 void
 629 kgnilnd_unmap_fma_blocks(kgn_device_t *device)
 630 {
 631
 632         kgn_fma_memblock_t      *fma_blk;
 633
 634         /* use mutex to gate access to single thread, just in case */
 635         mutex_lock(&device->gnd_fmablk_mutex);
 636
 637         spin_lock(&device->gnd_fmablk_lock);
 638
 639         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
 640                 kgnilnd_unmap_fmablk(device, fma_blk);
 641         }
 642         spin_unlock(&device->gnd_fmablk_lock);
 643
 644         mutex_unlock(&device->gnd_fmablk_mutex);
 645 }
 646
 647 void
 648 kgnilnd_free_phys_fmablk(kgn_device_t *device)
 649 {
 650
 651         kgn_fma_memblock_t      *fma_blk, *fma_blkN;
 652
 653         /* use mutex to gate access to single thread, just in case */
 654         mutex_lock(&device->gnd_fmablk_mutex);
 655
 656         spin_lock(&device->gnd_fmablk_lock);
 657
 658         list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
 659                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
 660                         kgnilnd_free_fmablk_locked(device, fma_blk);
 661         }
 662         spin_unlock(&device->gnd_fmablk_lock);
 663
 664         mutex_unlock(&device->gnd_fmablk_mutex);
 665 }
 666
  667 /* kgnilnd dgram nid->struct management */
 668
 669 static inline struct list_head *
 670 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
 671 {
 672         unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
 673
 674         RETURN(&dev->gnd_dgrams[hash]);
 675 }
 676
 677
 678 /* needs dev->gnd_dgram_lock held */
 679 kgn_dgram_t *
 680 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
 681 {
 682         struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
 683         kgn_dgram_t      *dgram;
 684
 685         list_for_each_entry(dgram, dgram_list, gndg_list) {
 686
 687                 /* if state > POSTED, we are already handling cancel/completion */
 688                 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
 689                      dgram->gndg_state > GNILND_DGRAM_POSTED)
 690                         continue;
 691
 692                 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
 693                        dgram, libcfs_nid2str(dst_nid));
 694                 return dgram;
 695         }
 696         return NULL;
 697 }
 698
 699 int
 700 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
 701 {
 702         kgn_dgram_t     *dgram;
 703
 704         spin_lock(&dev->gnd_dgram_lock);
 705         dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
 706
 707         if (dgram) {
 708                 kgnilnd_cancel_dgram_locked(dgram);
 709         }
 710         spin_unlock(&dev->gnd_dgram_lock);
 711
 712         RETURN(!!(dgram == NULL));
 713 }
 714
 715 int
 716 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
 717                      lnet_nid_t srcnid, lnet_nid_t dstnid,
 718                      kgn_connreq_type_t type)
 719 {
 720         int err = 0;
 721
 722         /* ensure we haven't violated max datagram size */
 723         CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
 724
 725         /* no need to zero out, we do that when allocating dgram */
 726         connreq->gncr_magic     = GNILND_MSG_MAGIC;
 727
 728         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
 729                 srcnid = 0xABADBABE;
 730         } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
 731                 dstnid = 0xDEFEC8ED;
 732         }
 733
 734         connreq->gncr_srcnid    = srcnid;
 735         connreq->gncr_dstnid    = dstnid;
 736
 737         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 738                 connreq->gncr_version = 99;
 739         } else {
 740                 connreq->gncr_version   = GNILND_CONNREQ_VERSION;
 741         }
 742         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 743                 connreq->gncr_type = 99;
 744         } else {
 745                 connreq->gncr_type      = type;
 746         }
 747         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 748                 connreq->gncr_peerstamp = 0;
 749         } else {
 750                 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
 751         }
 752         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 753                 connreq->gncr_connstamp = 0;
 754         } else {
 755                 connreq->gncr_connstamp = conn->gnc_my_connstamp;
 756         }
 757         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 758                 connreq->gncr_timeout = 0;
 759         } else {
 760                 connreq->gncr_timeout   = conn->gnc_timeout;
 761         }
 762
 763         /* the rest pack the data into the payload in other places */
 764         if (type == GNILND_CONNREQ_REQ) {
 765                 kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
 766                 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
 767                 req_params->gnpr_cqid = conn->gnc_cqid;
 768
 769                 /* allocate mailbox for this connection */
 770                 err = kgnilnd_setup_mbox(conn);
 771                 if (err != 0) {
 772                         CERROR("Failed to setup FMA mailbox (%d)\n", err);
 773                 }
 774                 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
 775         }
 776
 777         /* XXX Nic: TBD - checksum computation */
 778
 779         return err;
 780 }
 781
 782 int
 783 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
 784 {
 785         kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
 786         int                      swab, rc = 0;
 787         kgn_net_t               *net;
 788
 789         /* the following fields must be handled in a backwards compatible
 790          * manner to ensure we can always send and interpret NAKs */
 791
 792         if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
 793             connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
 794                 /* Unexpected magic! */
 795                 CERROR("Unexpected magic %08x\n",
 796                        connreq->gncr_magic);
 797                 return -EBADF;
 798         }
 799
 800         swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
 801         if (swab) {
 802                 __swab32s(&connreq->gncr_magic);
 803                 __swab32s(&connreq->gncr_cksum);
 804                 __swab16s(&connreq->gncr_type);
 805                 __swab16s(&connreq->gncr_version);
 806                 __swab32s(&connreq->gncr_timeout);
 807                 __swab64s(&connreq->gncr_srcnid);
 808                 __swab64s(&connreq->gncr_dstnid);
 809                 __swab64s(&connreq->gncr_peerstamp);
 810                 __swab64s(&connreq->gncr_connstamp);
 811         }
 812
 813         /* Do NOT return anything but -EBADF before we munge
 814          * connreq->gncr_srcnid - we need that to send the nak */
 815
 816         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
 817                 lnet_nid_t      incoming = connreq->gncr_srcnid;
 818
 819                 /* even if the incoming packet is hosed, we know who we sent
 820                  * the original and can set the srcnid so that we can properly
 821                  * look up our peer to close the loop on this connreq. We still use
 822                  * -EBADF to prevent a NAK - just in case there are issues with
 823                  * the payload coming from a random spot, etc. */
 824                 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
 825
 826                 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
 827                                 LNET_NIDADDR(incoming)) {
 828                         /* we got a datagram match for the wrong nid... */
 829                         CERROR("matched datagram 0x%p with srcnid %s "
 830                                 "(%x), expecting %s (%x)\n",
 831                                 dgram,
 832                                 libcfs_nid2str(incoming),
 833                                 LNET_NIDADDR(incoming),
 834                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
 835                                 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
 836                         return -EBADF;
 837                 }
 838         } else {
 839                 /* if we have a wildcard datagram it should match an
 840                  * incoming "active" datagram that should have a fully formed
 841                  * srcnid and dstnid. If we couldn't unpack it, we drop as
 842                  * corrupted packet, otherwise we'll just verify that the dstnid
 843                  * matches the NID for the NET that the dgram was posted */
 844
 845                 /* make sure their wildcard didn't match ours, that is unpossible */
 846                 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
 847                          "dgram 0x%p from %s, connreq 0x%p; "
 848                          "wildcard matched wildcard \n", dgram,
 849                          libcfs_nid2str(connreq->gncr_srcnid), connreq);
 850
 851                 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
 852
 853                 if (rc == -ESHUTDOWN) {
 854                         CERROR("Looking up network: device is in shutdown");
 855                         return rc;
 856                 } else if (rc == -ENONET) {
 857                         CERROR("Connection data from %s: she sent "
 858                         "dst_nid %s, but net lookup failed on "
 859                         "dgram 0x%p@%s\n",
 860                         libcfs_nid2str(connreq->gncr_srcnid),
 861                         libcfs_nid2str(connreq->gncr_dstnid),
 862                         dgram, kgnilnd_dgram_type2str(dgram));
 863                         return rc;
 864                 }
 865
 866                 if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
 867                         CERROR("Bad connection data from %s: she sent "
 868                                "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
 869                                libcfs_nid2str(connreq->gncr_srcnid),
 870                                libcfs_nid2str(connreq->gncr_dstnid),
 871                                libcfs_nid2str(net->gnn_ni->ni_nid),
 872                                dgram, kgnilnd_dgram_type2str(dgram));
 873                         kgnilnd_net_decref(net);
 874                         return -EBADSLT;
 875                 }
 876
 877                 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
 878                 kgnilnd_net_decref(net);
 879         }
 880
 881         if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
 882                 CERROR("Unexpected version %d\n", connreq->gncr_version);
 883                 return -EPROTO;
 884         }
 885
 886         /* XXX Nic: TBD - checksum validation */
 887         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
 888                 return -EBADF;
 889         }
 890
 891         if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
 892                 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
 893
 894                 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
 895                 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
 896                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
 897                 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
 898                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
 899                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
 900                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
 901                 __swab64s(&msg_addr);
 902                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
 903                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
 904         } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
 905                 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
 906         }
 907
 908         /* since we use a unique instance ID for each network, the driver
 909          * will take care of dropping datagrams if we don't have that network.
 910          */
 911
 912         /* few more idiot software or configuration checks */
 913
 914         switch (connreq->gncr_type) {
 915         case GNILND_CONNREQ_REQ:
 916                 /* wire up EP and SMSG block - this will check the incoming data
 917                  * and barf a NAK back if need to */
 918                 rc = kgnilnd_set_conn_params(dgram);
 919                 if (rc)
 920                         return rc;
 921                 break;
 922         case GNILND_CONNREQ_NAK:
 923         case GNILND_CONNREQ_CLOSE:
 924                 break;
 925         default:
 926                 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
 927                 return -EPROTO;
 928         }
 929
 930         if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
 931                 CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
 932                 connreq->gncr_peerstamp, connreq->gncr_connstamp);
 933                 return -EPROTO;
 934         }
 935
 936         if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
 937                 CERROR("Received timeout %d < MIN %d\n",
 938                        connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
 939                 return -EPROTO;
 940         }
 941
 942         return 0;
 943 }
 944
 945 int
 946 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
 947 {
 948         kgn_dgram_t         *dgram;
 949
 950         dgram = kmem_cache_alloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
 951         if (dgram == NULL)
 952                 return -ENOMEM;
 953
 954         /* cache alloc'd memory is not zeroed */
 955         memset((void *)dgram, 0, sizeof(*dgram)) ;
 956
 957         INIT_LIST_HEAD(&dgram->gndg_list);
 958         dgram->gndg_state = GNILND_DGRAM_USED;
 959         dgram->gndg_type = type;
 960         dgram->gndg_magic = GNILND_DGRAM_MAGIC;
 961
 962         atomic_inc(&dev->gnd_ndgrams);
 963
 964         CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
 965                 " %d\n",
 966                 sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
 967                 atomic_read(&dev->gnd_ndgrams));
 968
 969         *dgramp = dgram;
 970         return 0;
 971 }
 972
 973 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
 974  * returns < 0 on dgram to be cleaned up
 975  * > 0 on dgram that isn't done yet
 976  * == 0 on dgram that is ok and needs connreq processing */
 977 int
 978 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
 979 {
 980         int rc = 0;
 981
 982         switch (post_state) {
 983         case GNI_POST_COMPLETED:
 984                 /* normal state for dgrams that need actual processing */
 985                 /* GOTO to avoid processing dgram as canceled/done */
 986                 GOTO(process_out, rc);
 987
 988         case GNI_POST_PENDING:
 989                 /* we should only see this if we are testing a WC dgram after a
 990                  * cancel - it means that it needs a full cycle of waiting
 991                  * for kgni_sm_task to finish moving it to TERMINATED */
 992                 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
 993                           (dgram->gndg_state == GNILND_DGRAM_CANCELED),
 994                          "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
 995                          dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
 996                          dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
 997
 998                 /* positive RC as this dgram isn't done yet */
 999                 rc = EINPROGRESS;
1000
1001                 /* GOTO as this isn't done yet */
1002                 GOTO(process_out, rc);
1003                 break;
1004
1005         case GNI_POST_TERMINATED:
1006                 /* we've called cancel and it is done or remote guy called cancel and
1007                  * we've receved it on a WC dgram */
1008 #if 0
1009                 /* we are seeing weird terminations on non WC dgrams when we have not
1010                  * canceled them */
1011
1012                 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
1013                          dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
1014                         "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
1015                         dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
1016                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
1017 #endif
1018
1019                 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
1020                        dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");
1021
1022                 rc =  -ECANCELED;
1023                 break;
1024
1025         case GNI_POST_TIMEOUT:
1026                 /* we could have a timeout on a wildcard dgram too - if
1027                  * we got the incoming request but the remote node beefed
1028                  * before kgni could send the match data back. We'll just error
1029                  * on the active case and bail out gracefully */
1030                 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1031                         CNETERR("hardware timeout for connect to "
1032                                "%s after %lu seconds. Is node dead?\n",
1033                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1034                                cfs_duration_sec(jiffies - dgram->gndg_post_time));
1035                 }
1036
1037                 rc = -ETIMEDOUT;
1038                 break;
1039
1040         default:
1041                 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1042                 LBUG();
1043         }
1044
1045         /* now finish cleaning up a dgram that is canceled/terminated and needs to
1046          * go away */
1047
1048         /* If this was actively canceled, drop the count now that we are processing */
1049         if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1050                 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1051                 /* caller responsible for gndg_list removal */
1052         }
1053
1054 process_out:
1055
1056         RETURN(rc);
1057 }
1058
1059 /* needs dev->gnd_dgram_lock held */
1060 void
1061 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1062 {
1063         gni_return_t            grc;
1064
1065         if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1066                 return;
1067         }
1068
1069         LASSERTF(dgram->gndg_conn != NULL,
1070                  "dgram 0x%p with NULL conn\n", dgram);
1071
1072         /* C.E - WC dgrams could be canceled immediately but
1073          * if there was some match pending, we need to call
1074          * test_by_id to clear it out. If that test returns
1075          * POST_PENDING, it is half done and needs to go along
1076          * with the rest of dgrams and go through a kgni_sm_task cycle
1077          * and deliver a GNI_POST_TERMINATED event before they
1078          * are actually canceled */
1079
1080         dgram->gndg_state = GNILND_DGRAM_CANCELED;
1081
1082         if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1083                 /* we don't need to cancel_by_id if the datagram was good */
1084                 return;
1085         }
1086
1087         /* let folks know there are outstanding cancels */
1088         atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1089         /* leave on nid list until cancel is done for debugging fun */
1090         grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1091
1092         /* if we don't get success here, we have hosed up the dgram tracking
1093          * code and need to bail out */
1094         LASSERTF(grc == GNI_RC_SUCCESS,
1095                  "postdata_cancel returned %d for conn 0x%p to %s\n",
1096                  grc, dgram->gndg_conn,
1097                  dgram->gndg_conn->gnc_peer ?
1098                   libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1099                   : "<?>");
1100
1101         CDEBUG(D_NETTRACE,
1102                 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1103                 dgram, dgram->gndg_conn,
1104                 dgram->gndg_conn->gnc_ephandle);
1105
1106         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1107                 gni_post_state_t         post_state;
1108                 int                      rc = 0;
1109                 __u32                    remote_addr = 0, remote_id = 0;
1110
1111                 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1112                                                      (__u64)dgram, &post_state,
1113                                                      &remote_addr, &remote_id);
1114
1115                 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1116                          "bad grc %d from test_by_id on dgram 0x%p\n",
1117                         grc, dgram);
1118
1119                 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1120                  * through full cycle, we get SUCCESS and need to parse post_state */
1121
1122                 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1123                         "remote_addr %u remote_id %u\n", grc, dgram,
1124                         kgnilnd_dgram_type2str(dgram),
1125                         post_state, remote_addr, remote_id);
1126
1127                 if (grc == GNI_RC_NO_MATCH) {
1128                         /* she's gone, reduce count and move along */
1129                         dgram->gndg_state = GNILND_DGRAM_DONE;
1130                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1131                         RETURN_EXIT;
1132                 }
1133
1134                 rc = kgnilnd_process_dgram(dgram, post_state);
1135
1136                 if (rc <= 0) {
1137                         /* if for some weird reason we get a valid dgram back, just mark as done
1138                          * so we can drop it and move along.
1139                          * C.E - if it was completed, we'll just release the conn/mbox
1140                          * back into the pool and it'll get reused. That said, we should only
1141                          * be canceling a WC dgram on stack rest or shutdown, so that is moot */
1142                         dgram->gndg_state = GNILND_DGRAM_DONE;
1143                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1144
1145                         /* caller context responsible for calling kgnilnd_release_dgram() */
1146                 } else {
1147                         /* still pending, let it simmer until golden brown and delicious */
1148                 }
1149         }
1150
1151         /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1152          * for kgni to return their ID to us via probe - that is when we'll complete their
1153          * cancel processing */
1154 }
1155
1156 void
1157 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1158 {
1159         /* release the dgram ref on conn */
1160         if (dgram->gndg_conn) {
1161                 kgnilnd_conn_decref(dgram->gndg_conn);
1162                 dgram->gndg_conn = NULL;
1163         }
1164 }
1165
1166 void
1167 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1168 {
1169         LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1170                  dgram->gndg_state == GNILND_DGRAM_DONE,
1171                  "dgram 0x%p with bad state %s\n",
1172                  dgram, kgnilnd_dgram_state2str(dgram));
1173
1174         /* bit of poisoning to help detect bad driver data */
1175         dgram->gndg_magic = 0x6f5a6b5f;
1176         atomic_dec(&dev->gnd_ndgrams);
1177
1178         kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1179         CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
1180                " ndgrams %d\n",
1181                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
1182                atomic_read(&dev->gnd_ndgrams));
1183 }
1184
1185 int
1186 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1187                    int data_rc)
1188 {
1189         int              rc = 0;
1190         kgn_dgram_t     *dgram = NULL;
1191         kgn_dgram_t     *tmpdgram;
1192         kgn_dgram_type_t dgtype;
1193         gni_return_t     grc;
1194         __u64            srcnid;
1195         ENTRY;
1196
1197         switch (type) {
1198         case GNILND_CONNREQ_REQ:
1199                 if (dstnid == LNET_NID_ANY)
1200                         dgtype = GNILND_DGRAM_WC_REQ;
1201                 else
1202                         dgtype = GNILND_DGRAM_REQ;
1203                 break;
1204         case GNILND_CONNREQ_NAK:
1205                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1206                 dgtype = GNILND_DGRAM_NAK;
1207                 break;
1208         default:
1209                 CERROR("unknown connreq type %d\n", type);
1210                 LBUG();
1211         }
1212
1213         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1214         if (rc < 0) {
1215                 rc = -ENOMEM;
1216                 GOTO(post_failed, rc);
1217         }
1218
1219         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1220         if (rc) {
1221                 GOTO(post_failed, rc);
1222         }
1223
1224         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1225                 /* clear buffer for sanity on reuse of wildcard */
1226                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1227         }
1228
1229         if (dstnid == LNET_NID_ANY) {
1230                 /* set here to reset any dgram re-use */
1231                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1232         } else {
1233                 __u32            host_id;
1234
1235                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1236                 if (rc <= 0) {
1237                         rc = -ESRCH;
1238                         GOTO(post_failed, rc);
1239                 }
1240
1241                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1242
1243                 /* don't need to serialize, there are no CQs for the dgram
1244                  * EP on the kgn_net_t */
1245                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1246
1247                 if (grc != GNI_RC_SUCCESS) {
1248                         rc = -ECONNABORTED;
1249                         GOTO(post_failed, rc);
1250                 }
1251
1252         }
1253
1254         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1255          * net of the destination node.
1256          */
1257
1258         if (dstnid == LNET_NID_ANY) {
1259                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1260         } else {
1261                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1262         }
1263
1264         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1265                                   srcnid, dstnid, type);
1266         if (rc) {
1267                 GOTO(post_failed, rc);
1268         }
1269
1270         if (type == GNILND_CONNREQ_NAK)
1271                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1272
1273         dgram->gndg_post_time = jiffies;
1274
1275         /* XXX Nic: here is where we'd add in logical network multiplexing */
1276
1277         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1278                dgram, kgnilnd_dgram_type2str(dgram),
1279                libcfs_nid2str(srcnid),
1280                libcfs_nid2str(dstnid), dev->gnd_id);
1281
1282         /* this allocates memory, can't hold locks across */
1283         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1284                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1285                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1286                                    (__u64)dgram);
1287
1288         if (grc != GNI_RC_SUCCESS) {
1289                 CNETERR("dropping failed dgram post id 0x%p type %s"
1290                         " reqtype %s to %s: rc %d\n",
1291                         dgram, kgnilnd_dgram_type2str(dgram),
1292                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1293                         libcfs_nid2str(dstnid), grc);
1294                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1295                 GOTO(post_failed, rc);
1296         }
1297
1298         /* we don't need to add earlier - if someone does del_peer during post,
1299          * that peer will get marked as unlinked and the callers wil take care of it.
1300          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1301          * the completed dgram later when we cant find a peer to stuff it into */
1302
1303         spin_lock(&dev->gnd_dgram_lock);
1304
1305         /* make sure we are not double posting targeted dgrams
1306          * - we can multiple post WC dgrams to help with processing speed */
1307         if (dstnid != LNET_NID_ANY) {
1308                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1309
1310                 LASSERTF(tmpdgram == NULL,
1311                         "dgram 0x%p->%s already posted\n",
1312                          dgram, libcfs_nid2str(dstnid));
1313         }
1314
1315         /* unmunge dstnid to help processing code cope... */
1316         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1317                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1318         }
1319
1320         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1321         dgram->gndg_state = GNILND_DGRAM_POSTED;
1322         spin_unlock(&dev->gnd_dgram_lock);
1323
1324 post_failed:
1325         if (rc < 0 && dgram != NULL) {
1326                 kgnilnd_cleanup_dgram(dgram);
1327                 kgnilnd_free_dgram(dev, dgram);
1328         }
1329
1330         RETURN(rc);
1331 }
1332
1333 /* The shutdown flag is set from the shutdown and stack reset threads. */
1334 void
1335 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1336 {
1337         /* The conns of canceled active dgrams need to be put in purgatory so
1338          * we don't reuse the mailbox */
1339         if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1340                 kgn_peer_t *peer;
1341                 kgn_conn_t *conn = dgram->gndg_conn;
1342                 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1343
1344                 dgram->gndg_state = GNILND_DGRAM_DONE;
1345
1346                 /* During shutdown we've already removed the peer so we don't
1347                  * need to add a peer. During stack reset we don't care about
1348                  * MDDs since they are all released. */
1349                 if (!shutdown) {
1350                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1351                         peer = kgnilnd_find_peer_locked(nid);
1352
1353                         if (peer != NULL) {
1354                                 CDEBUG(D_NET, "adding peer's conn with nid %s "
1355                                         "to purgatory\n", libcfs_nid2str(nid));
1356                                 kgnilnd_conn_addref(conn);
1357                                 conn->gnc_peer = peer;
1358                                 kgnilnd_peer_addref(peer);
1359                                 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1360                                 conn->gnc_state = GNILND_CONN_CLOSED;
1361                                 list_add_tail(&conn->gnc_list,
1362                                               &peer->gnp_conns);
1363                                 kgnilnd_add_purgatory_locked(conn,
1364                                                              conn->gnc_peer);
1365                                 kgnilnd_schedule_conn(conn);
1366                         }
1367                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1368                 }
1369         }
1370
1371         spin_lock(&dev->gnd_dgram_lock);
1372         kgnilnd_cancel_dgram_locked(dgram);
1373         spin_unlock(&dev->gnd_dgram_lock);
1374
1375         kgnilnd_cleanup_dgram(dgram);
1376
1377         /* if the dgram is 'canceled' it needs to be wait until the event
1378          * comes up from kgni that tells us it is safe to release */
1379         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1380                 dgram->gndg_state = GNILND_DGRAM_DONE;
1381
1382                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1383
1384                 /* if it is a wildcard and we are in an appropriate state, repost
1385                  * the wildcard */
1386
1387                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1388                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1389                         int     rerc;
1390
1391                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1392                         if (rerc != 0) {
1393                                 /* We failed to repost the WC dgram for some reason
1394                                  * mark it so the repost system attempts to repost */
1395                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1396                         }
1397                 }
1398
1399                 /* always free the old dgram */
1400                 kgnilnd_free_dgram(dev, dgram);
1401         }
1402 }
1403
1404
1405 int
1406 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1407 {
1408         kgn_dgram_t             *dgram = NULL;
1409         gni_post_state_t         post_state;
1410         gni_return_t             grc;
1411         int                      rc = 0;
1412         __u64                    readyid;
1413         __u32                    remote_addr = 0, remote_id = 0;
1414         ENTRY;
1415
1416         /* Probe with the lock held. That way if we get a dgram we dont have it canceled
1417          * between finding the ready dgram and grabbing the lock to remove it from the
1418          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1419          * once its off the list so we don't need to worry about others changing it at
1420          * that point. */
1421         spin_lock(&dev->gnd_dgram_lock);
1422         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1423         if (grc != GNI_RC_SUCCESS) {
1424                 spin_unlock(&dev->gnd_dgram_lock);
1425                 /* return 0 to indicate nothing happened */
1426                 RETURN(0);
1427         }
1428
1429         CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
1430                 readyid, dev);
1431
1432         dgram = (kgn_dgram_t *)readyid;
1433
1434         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1435                  "dgram 0x%p from id "LPX64" with bad magic %x\n",
1436                  dgram, readyid, dgram->gndg_magic);
1437
1438         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1439                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1440                  "dgram 0x%p with bad state %s\n",
1441                  dgram, kgnilnd_dgram_state2str(dgram));
1442
1443         LASSERTF(!list_empty(&dgram->gndg_list),
1444                  "dgram 0x%p with bad list state %s type %s\n",
1445                  dgram, kgnilnd_dgram_state2str(dgram),
1446                  kgnilnd_dgram_type2str(dgram));
1447
1448         /* now we know that the datagram structure is ok, so pull off list */
1449         list_del_init(&dgram->gndg_list);
1450
1451         /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
1452          * change the state from POSTED to PROCESSING to ensure that
1453          * nobody cancels it after we've pulled it from the wire */
1454         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1455                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1456         }
1457
1458         LASSERTF(dgram->gndg_conn != NULL,
1459                 "dgram 0x%p with NULL conn\n", dgram);
1460
1461         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1462                                              (__u64)dgram, &post_state,
1463                                              &remote_addr, &remote_id);
1464
1465         /* we now "own" this datagram */
1466         spin_unlock(&dev->gnd_dgram_lock);
1467
1468         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1469                  " id "LPU64" was ready\n", readyid);
1470
1471         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1472                 "remote_addr %u remote_id %u\n", grc, dgram,
1473                 kgnilnd_dgram_type2str(dgram),
1474                 post_state, remote_addr, remote_id);
1475
1476         if (unlikely(grc != GNI_RC_SUCCESS)) {
1477                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1478                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1479                         grc);
1480                 rc = -EINVAL;
1481                 GOTO(probe_for_out, rc);
1482         }
1483
1484         rc = kgnilnd_process_dgram(dgram, post_state);
1485
1486         /* we should never get probe finding a dgram for us and then it
1487          * being a WC dgram that is still in the middle of processing */
1488         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1489                  rc, dgram, post_state);
1490
1491         if (rc == 0) {
1492                 /* dgram is good enough for the data to be used */
1493                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1494                 /* fake rc to mark that we've done something */
1495                 rc = 1;
1496         } else {
1497                 /* let kgnilnd_release_dgram take care of canceled dgrams */
1498                 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1499                         dgram->gndg_state = GNILND_DGRAM_DONE;
1500                 }
1501         }
1502
1503         *dgramp = dgram;
1504         RETURN(rc);
1505
1506 probe_for_out:
1507
1508         kgnilnd_release_dgram(dev, dgram, 0);
1509         RETURN(rc);
1510 }
1511
1512 int
1513 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1514 {
1515         /* if kgn_wildcard is zero, return error */
1516         int     rc = -ENOENT, i;
1517         ENTRY;
1518
1519         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1520                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1521                 if (rc < 0) {
1522                         CERROR("error %d: could not post wildcard datagram # %d\n",
1523                                 rc, i);
1524                         rc = -EINVAL;
1525                         GOTO(failed, rc);
1526                 }
1527         }
1528
1529 failed:
1530         RETURN(rc);
1531 }
1532
1533 int
1534 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1535 {
1536         kgn_dgram_t            *dg, *dgN;
1537         struct list_head        zombies;
1538         int                     i;
1539         ENTRY;
1540
1541         /* we want to cancel any outstanding dgrams - we don't want to rely
1542          * on del_peer_or_conn catching all of them. This helps protect us in cases
1543          * where we don't quite keep the peer->dgram mapping in sync due to some
1544          * race conditions */
1545
1546         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1547                  "called with LND invalid state: net shutdown %d "
1548                  "in reset %d\n", net->gnn_shutdown,
1549                  kgnilnd_data.kgn_in_reset);
1550
1551         INIT_LIST_HEAD(&zombies);
1552
1553         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1554
1555         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1556                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1557
1558                         /* skip nids not on our net or are wildcards */
1559
1560
1561                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1562                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1563                                 continue;
1564
1565                         kgnilnd_cancel_dgram_locked(dg);
1566                 }
1567         }
1568
1569         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1570
1571         RETURN(0);
1572 }
1573
1574 int
1575 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1576 {
1577         kgn_dgram_t *dg, *dgN;
1578         struct list_head zombies;
1579         ENTRY;
1580
1581         /* Time to kill the outstanding WC's
1582          * WC's exist on net 0 only but match on any net...
1583          */
1584
1585         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1586                 "called with LND invalid state: WC shutdown %d "
1587                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1588                 kgnilnd_data.kgn_in_reset);
1589
1590         INIT_LIST_HEAD(&zombies);
1591         spin_lock(&dev->gnd_dgram_lock);
1592
1593         do {
1594                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1595                 if (dg != NULL) {
1596                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1597                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1598                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1599                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1600
1601                         kgnilnd_cancel_dgram_locked(dg);
1602
1603                         /* WC could be DONE already, check and if so add to list to be released */
1604                         if (dg->gndg_state == GNILND_DGRAM_DONE) {
1605                                 list_del_init(&dg->gndg_list);
1606                                 list_add_tail(&dg->gndg_list, &zombies);
1607                         }
1608                 }
1609         } while (dg != NULL);
1610
1611         spin_unlock(&dev->gnd_dgram_lock);
1612
1613         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1614                 list_del_init(&dg->gndg_list);
1615                 kgnilnd_release_dgram(dev, dg, 1);
1616         }
1617         RETURN(0);
1618
1619 }
1620
1621 int
1622 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1623 {
1624         kgn_dgram_t *dg, *dgN;
1625         int i;
1626         ENTRY;
1627
1628         /* Cancel any outstanding non wildcard datagrams regardless
1629          * of which net they are on as we are in base shutdown and
1630          * dont care about connecting anymore.
1631          */
1632
1633         LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");
1634
1635         spin_lock(&dev->gnd_dgram_lock);
1636
1637         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size -1); i++) {
1638                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1639                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1640                                 kgnilnd_cancel_dgram_locked(dg);
1641                 }
1642         }
1643
1644         spin_unlock(&dev->gnd_dgram_lock);
1645
1646         RETURN(0);
1647 }
1648
1649
1650 void
1651 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1652 {
1653         int             i = 4;
1654         int             rc;
1655         gni_return_t    grc;
1656         __u64           readyid;
1657         kgn_dgram_t    *dgram;
1658
1659         /* use do while to get at least one check run to allow
1660          * regression test for 762072 to hit bug if there */
1661
1662         /* This function races with the dgram mover during shutdown so it is possible for
1663          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1664          * dgram mover thread instead of inside of this function.
1665          */
1666
1667         /* This should only be called from within shutdown, baseshutdown, or stack reset.
1668          * there are no assertions here to verify since base_shutdown has nothing in it we can check
1669          * the net is gone by then.
1670          */
1671
1672         do {
1673                 i++;
1674                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1675                         "Waiting for %d canceled datagrams to clear on device %d\n",
1676                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1677
1678                 /* check once a second */
1679                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1680                        250, &readyid);
1681
1682                 if (grc != GNI_RC_SUCCESS)
1683                         continue;
1684
1685                 CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
1686                         readyid, dev->gnd_id, dev);
1687
1688                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1689                 if (rc != 0) {
1690                         /* if we got a valid dgram or one that is now done, clean up */
1691                         kgnilnd_release_dgram(dev, dgram, 1);
1692                 }
1693         } while (atomic_read(&dev->gnd_canceled_dgrams));
1694 }
1695
1696 int
1697 kgnilnd_start_connect(kgn_peer_t *peer)
1698 {
1699         int              rc = 0;
1700         /* sync point for kgnilnd_del_peer_locked - do an early check to
1701          * catch the most common hits where del_peer is done by the
1702          * time we get here */
1703         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1704                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1705         }
1706
1707         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1708         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1709                 /* raced with peer getting unlinked */
1710                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1711                 rc = ESTALE;
1712                 GOTO(out, rc);
1713         }
1714         peer->gnp_connecting = GNILND_PEER_POSTING;
1715         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1716
1717         set_mb(peer->gnp_last_dgram_time, jiffies);
1718         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1719                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1720         }
1721
1722         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1723                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1724                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1725         } else {
1726                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1727                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1728         }
1729         if (rc < 0) {
1730                 set_mb(peer->gnp_last_dgram_errno, rc);
1731                 GOTO(failed, rc);
1732         }
1733
1734         /* while we're posting someone could have decided this peer/dgram needed to
1735          * die a quick death, so we check for state change and process accordingly */
1736
1737         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1738         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1739                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1740                         peer->gnp_connecting = GNILND_PEER_KILL;
1741                 }
1742                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1743                 /* positive RC to avoid dgram cleanup - we'll have to
1744                  * wait for the kgni GNI_POST_TERMINATED event to
1745                  * finish cleaning up */
1746                 rc = ESTALE;
1747                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1748                 GOTO(out, rc);
1749         }
1750         peer->gnp_connecting = GNILND_PEER_POSTED;
1751         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1752         /* reaper thread will take care of any timeouts */
1753         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1754                libcfs_nid2str(peer->gnp_nid), rc);
1755
1756         RETURN(rc);
1757
1758 failed:
1759         CDEBUG(D_NET, "connect to %s failed: rc %d \n",
1760                libcfs_nid2str(peer->gnp_nid), rc);
1761 out:
1762         RETURN(rc);
1763 }
1764
1765 int
1766 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1767 {
1768         kgn_conn_t        *conn = dgram->gndg_conn;
1769         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1770         kgn_peer_t        *new_peer, *peer = NULL;
1771         kgn_tx_t          *tx;
1772         kgn_tx_t          *txn;
1773         kgn_mbox_info_t   *mbox;
1774         int                rc;
1775         int                nstale;
1776
1777         /* try to find a peer that matches the nid we got in the connreq
1778          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1779          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1780
1781         /* assume this is a new peer  - it makes locking cleaner when it isn't */
1782         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1783
1784         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_RCA_NODE_UP);
1785         if (rc != 0) {
1786                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1787                 return rc;
1788         }
1789
1790         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1791
1792         /* this transfers ref from create_peer to the kgn_peer table */
1793         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1794
1795         /* if we found an existing peer, is it really ready for a new conn ? */
1796         if (peer != new_peer) {
1797                 /* if this was an active connect attempt but we can't find a peer waiting for it
1798                  * we will dump in the trash */
1799
1800                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1801                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1802                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1803                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1804                         rc = ECANCELED;
1805                         GOTO(out, rc);
1806                 }
1807
1808                 /* check to see if we can catch a connecting peer before it is
1809                  * removed from the connd_peers list - if not, we need to
1810                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1811                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1812                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1813                         if (!list_empty(&peer->gnp_connd_list)) {
1814                                 list_del_init(&peer->gnp_connd_list);
1815                                 /* drop connd ref */
1816                                 kgnilnd_peer_decref(peer);
1817                         }
1818                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1819                         /* clear rc to make sure we don't have fake error */
1820                         rc = 0;
1821                 }
1822
1823                 /* no matter what, we are no longer waiting to connect this peer now */
1824                 peer->gnp_connecting = GNILND_PEER_IDLE;
1825
1826                 /* Refuse to duplicate an existing connection (both sides might try to
1827                  * connect at once).  NB we return success!  We _are_ connected so we
1828                  * _don't_ have any blocked txs to complete with failure. */
1829                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1830                 if (rc != 0) {
1831                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1832                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1833                               libcfs_nid2str(her_nid), rc);
1834                         rc = EALREADY;
1835                         GOTO(out, rc);
1836                 }
1837         }
1838
1839         if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
1840                 CNETERR("Received connection request from down nid %s\n",
1841                         libcfs_nid2str(her_nid));
1842                 peer->gnp_down = GNILND_RCA_NODE_UP;
1843         }
1844
1845         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1846
1847         /* either way with peer (new or existing), we are ok with ref counts here as the
1848          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1849          * ref for the peer table. */
1850
1851         /* at this point, the connection request is a winner */
1852
1853         /* mark 'DONE' to avoid cancel being called from release */
1854         dgram->gndg_state = GNILND_DGRAM_DONE;
1855
1856         /* initialise timestamps before reaper looks at them */
1857         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1858
1859         /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1860          * immediatly send a NOOP in the reaper thread during the call to
1861          * kgnilnd_check_conn_timeouts_locked
1862          */
1863         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1864         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1865
1866         /* save the dgram type used to establish this connection */
1867         conn->gnc_dgram_type = dgram->gndg_type;
1868
1869         /* refs are not transferred from dgram to tables, so increment to
1870          * take ownership */
1871         kgnilnd_conn_addref(conn);
1872         kgnilnd_peer_addref(peer);
1873         conn->gnc_peer = peer;
1874         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1875
1876         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1877         list_add_tail(&conn->gnc_hashlist,
1878                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1879         kgnilnd_data.kgn_conn_version++;
1880
1881         /* Dont send NOOP if fail_loc is set
1882          */
1883         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1884                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1885                 if (tx == NULL) {
1886                         CNETERR("can't get TX to initiate NOOP to %s\n",
1887                                 libcfs_nid2str(peer->gnp_nid));
1888                 } else {
1889                         kgnilnd_queue_tx(conn, tx);
1890                 }
1891         }
1892
1893         /* Schedule all packets blocking for a connection */
1894         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1895                 /* lock held here is the peer_conn lock */
1896                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1897                 kgnilnd_queue_tx(conn, tx);
1898         }
1899
1900         /* If this is an active connection lets mark its timestamp on the MBoX */
1901         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1902                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1903                 /* conn->gnc_last_rx is jiffies it better exist as it was just set */
1904                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1905         }
1906
1907         /* Bug 765042: wake up scheduler for a race with finish_connect and
1908          * complete_conn_closed with a conn in purgatory
1909          * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1910          * we just check for set and then clear */
1911         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1912                 cfs_fail_loc = 0x0;
1913                 /* get scheduler thread moving again */
1914                 kgnilnd_schedule_device(conn->gnc_device);
1915         }
1916
1917         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1918                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1919
1920         /* make sure we reset peer reconnect interval now that we have a good conn */
1921         kgnilnd_peer_alive(peer);
1922         peer->gnp_reconnect_interval = 0;
1923
1924         /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
1925          * on the atomic forever
1926          */
1927         if (peer->gnp_pending_unlink) {
1928                 peer->gnp_pending_unlink = 0;
1929                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1930                 CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
1931         }
1932
1933         /* add ref to make it hang around until after we drop the lock */
1934         kgnilnd_conn_addref(conn);
1935
1936         /* Once the peer_conn lock is dropped, the conn could actually move into
1937          * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1938          * lock until we are really done */
1939         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1940
1941         /* Notify LNET that we now have a working connection to this peer.
1942          * This is a Cray extension to the "standard" LND behavior. */
1943         lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
1944                      1, cfs_time_current());
1945
1946         /* drop our 'hold' ref */
1947         kgnilnd_conn_decref(conn);
1948
1949 out:
1950         RETURN(rc);
1951 }
1952
1953 void
1954 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1955 {
1956         int              rc = 0;
1957         ENTRY;
1958
1959         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1960
1961         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1962
1963         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1964
1965         if (rc < 0) {
1966                 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
1967         }
1968         EXIT;
1969 }
1970
1971 int
1972 kgnilnd_process_nak(kgn_dgram_t *dgram)
1973 {
1974         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1975         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1976         int                errno = connreq->gncr_nakdata.gnnd_errno;
1977         kgn_peer_t        *peer;
1978         int                rc = 0;
1979
1980         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1981
1982         peer = kgnilnd_find_peer_locked(src_nid);
1983         if (peer == NULL) {
1984                 /* we likely dropped him from bad data when we processed
1985                  * the original REQ */
1986                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1987                 return -EBADSLT;
1988         }
1989
1990         /* need to check peerstamp/connstamp against the ones we find
1991          * to make sure we don't close new (and good?) conns that we
1992          * formed after this connreq failed */
1993         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1994                 kgn_conn_t        conn;
1995
1996                 if (list_empty(&peer->gnp_conns)) {
1997                         /* assume already procced datagram and it barfed up
1998                          * on this side too */
1999                         CDEBUG(D_NET, "dropping NAK from %s; "
2000                                "peer %s is already not connected\n",
2001                                 libcfs_nid2str(connreq->gncr_srcnid),
2002                                 libcfs_nid2str(connreq->gncr_dstnid));
2003                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2004                         return 0;
2005                 }
2006
2007                 /* stub up a connection with the connreq XXX_stamps to allow
2008                  * use to use close_stale_conns_locked */
2009                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
2010                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
2011                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
2012                 conn.gnc_device = peer->gnp_net->gnn_dev;
2013
2014                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
2015
2016                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2017                         "closed %d connections\n",
2018                         libcfs_nid2str(connreq->gncr_srcnid),
2019                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
2020         } else {
2021                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2022
2023                 if (list_empty(&peer->gnp_connd_list)) {
2024                         /* if peer isn't on waiting list, try to find one to nuke */
2025                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2026                                                            peer->gnp_nid);
2027
2028                         if (rc) {
2029                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2030                                         "canceled pending connect request\n",
2031                                         libcfs_nid2str(connreq->gncr_srcnid),
2032                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
2033                         }
2034
2035                         /* if we can't find a waiting dgram, we just drop the nak - the conn
2036                          * connect must have failed (didn't find conn above and clear connecting
2037                          * -- so nothing to do besides drop */
2038                 } else {
2039                         /* peer is on list, meaning it is a new connect attempt from the one
2040                          * we started that generated the NAK - so just drop NAK */
2041
2042                         /* use negative to prevent error message */
2043                         rc = -EAGAIN;
2044                 }
2045                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2046         }
2047
2048         /* success! we found a peer and at least marked pending_nak */
2049         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2050
2051         return rc;
2052 }
2053
2054 int
2055 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2056 {
2057         int                      rc;
2058
2059         rc = kgnilnd_unpack_connreq(dgram);
2060         if (rc < 0) {
2061                 if (rc != -EBADF) {
2062                         /* only NAK if we have good srcnid to use */
2063                         *needs_nak = 1;
2064                 }
2065                 goto connreq_out;
2066         }
2067
2068         switch (dgram->gndg_conn_in.gncr_type) {
2069         case GNILND_CONNREQ_REQ:
2070                 /* wire up peer & conn, send queued TX */
2071                 rc = kgnilnd_finish_connect(dgram);
2072
2073                 /* don't nak when the nid is hosed */
2074                 if ((rc < 0)) {
2075                         *needs_nak = 1;
2076                 }
2077
2078                 break;
2079         case GNILND_CONNREQ_NAK:
2080                 rc = kgnilnd_process_nak(dgram);
2081                 /* return early to prevent reconnect bump */
2082                 return rc;
2083         default:
2084                 CERROR("unexpected connreq type %s (%d) from %s\n",
2085                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2086                         dgram->gndg_conn_in.gncr_type,
2087                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2088                 rc = -EINVAL;
2089                 *needs_nak = 1;
2090                 break;
2091         }
2092
2093 connreq_out:
2094         RETURN(rc);
2095 }
2096
2097 int
2098 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2099 {
2100         int                      rc;
2101         int                      needs_nak = 0;
2102         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2103         lnet_nid_t               orig_dstnid;
2104         kgn_dgram_t             *dgram = NULL;
2105         kgn_peer_t              *peer;
2106         ENTRY;
2107
2108         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2109                 rc = 0;
2110         } else {
2111                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2112         }
2113
2114         if (rc == 0) {
2115                 RETURN(0);
2116         } else if (rc < 0) {
2117                 GOTO(inform_peer, rc);
2118         } else {
2119                 /* rc > 1 means it did something, reset for this func  */
2120                 rc = 0;
2121         }
2122
2123         switch (dgram->gndg_type) {
2124         case GNILND_DGRAM_WC_REQ:
2125         case GNILND_DGRAM_REQ:
2126                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2127                 break;
2128         case GNILND_DGRAM_NAK:
2129                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2130                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2131                 break;
2132         default:
2133                 CERROR("unknown datagram type %s (%d)\n",
2134                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2135                 break;
2136         }
2137
2138         /* stash data to use after releasing current datagram */
2139         /* don't stash net - we are operating on a net already,
2140          * so the lock on rw_net_lock is sufficient */
2141
2142         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2143
2144 inform_peer:
2145         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2146
2147         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2148
2149         kgnilnd_release_dgram(dev, dgram, 0);
2150
2151         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2152                libcfs_nid2str(orig_dstnid), rc);
2153
2154         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2155          * in kgnilnd_finish_connect - if errors are from before we get to there,
2156          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2157         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2158                 /* if we have a negative rc, we want to find a peer to inform about
2159                  * the bad connection attempt. Sorry buddy, better luck next time! */
2160
2161                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2162                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2163
2164                 if (peer != NULL) {
2165                         /* add ref to make sure he stays around past the possible unlink
2166                          * so we can tell LNet about him */
2167                         kgnilnd_peer_addref(peer);
2168
2169                         /* if he still cares about the outstanding connect */
2170                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2171                                 /* check if he is on the connd list and remove.. */
2172                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2173                                 if (!list_empty(&peer->gnp_connd_list)) {
2174                                         list_del_init(&peer->gnp_connd_list);
2175                                         /* drop connd ref */
2176                                         kgnilnd_peer_decref(peer);
2177                                 }
2178                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2179
2180                                 /* clear gnp_connecting so we don't have a non-connecting peer
2181                                  * on gnd_connd_list */
2182                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2183
2184                                 set_mb(peer->gnp_last_dgram_errno, rc);
2185
2186                                 kgnilnd_peer_increase_reconnect_locked(peer);
2187                         }
2188                 }
2189                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2190
2191                 /* now that we are outside the lock, tell Mommy */
2192                 if (peer != NULL) {
2193                         kgnilnd_peer_notify(peer, rc, 0);
2194                         kgnilnd_peer_decref(peer);
2195                 }
2196         }
2197
2198         if (needs_nak) {
2199                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2200         }
2201
2202         RETURN(1);
2203 }
2204
2205 void
2206 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2207 {
2208         kgn_dgram_t    *dgram, *tmp;
2209         int             i;
2210
2211         spin_lock(&dev->gnd_dgram_lock);
2212
2213         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2214                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2215                         unsigned long            now = jiffies;
2216                         unsigned long            timeout;
2217
2218                         /* don't timeout stuff if the network is mucked or shutting down */
2219                         if (kgnilnd_check_hw_quiesce()) {
2220                                 break;
2221                         }
2222
2223                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2224                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2225                                 continue;
2226                         }
2227                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2228                                 "state %s conn 0x%p to %s age %lus\n",
2229                                 dgram, kgnilnd_dgram_type2str(dgram),
2230                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2231                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2232                                 cfs_duration_sec(now - dgram->gndg_post_time));
2233
2234                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2235
2236                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2237                                 continue;
2238
2239                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2240                                 "0x%p state %s conn 0x%p\n",
2241                                 kgnilnd_dgram_type2str(dgram),
2242                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2243                                 cfs_duration_sec(now - dgram->gndg_post_time),
2244                                 dgram, kgnilnd_dgram_state2str(dgram),
2245                                 dgram->gndg_conn);
2246
2247                         kgnilnd_cancel_dgram_locked(dgram);
2248                 }
2249         }
2250         spin_unlock(&dev->gnd_dgram_lock);
2251 }
2252
2253
2254 /* use a thread for the possibly long-blocking wait_by_id to prevent
2255  * stalling the global workqueues */
2256 int
2257 kgnilnd_dgram_waitq(void *arg)
2258 {
2259         kgn_device_t     *dev = (kgn_device_t *) arg;
2260         char              name[16];
2261         gni_return_t      grc;
2262         __u64             readyid;
2263         DEFINE_WAIT(mover_done);
2264
2265         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2266         cfs_block_allsigs();
2267
2268         /* all gnilnd threads need to run fairly urgently */
2269         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2270
2271         /* we dont shut down until the device shuts down ... */
2272         while (!kgnilnd_data.kgn_shutdown) {
2273                 /* to quiesce or to not quiesce, that is the question */
2274                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2275                         KGNILND_SPIN_QUIESCE;
2276                 }
2277
2278                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2279
2280                 /* check once a second */
2281                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2282                                                        1000, &readyid);
2283
2284                 if (grc == GNI_RC_SUCCESS) {
2285                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2286                         kgnilnd_schedule_dgram(dev);
2287
2288                         /* wait for dgram thread to ping us before spinning again */
2289                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2290                                         TASK_INTERRUPTIBLE);
2291
2292                         /* don't sleep if we need to quiesce */
2293                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2294                                 schedule();
2295                         }
2296                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2297                 }
2298         }
2299
2300         kgnilnd_thread_fini();
2301         return 0;
2302 }
2303
2304 int
2305 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2306 {
2307         int                      did_something = 0, rc;
2308         kgn_peer_t              *peer = NULL;
2309
2310         spin_lock(&dev->gnd_connd_lock);
2311
2312         /* Active connect - we added this in kgnilnd_launch_tx */
2313         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2314                 peer = list_first_entry(&dev->gnd_connd_peers,
2315                                         kgn_peer_t, gnp_connd_list);
2316
2317                 /* ref for connd removed in if/else below */
2318                list_del_init(&peer->gnp_connd_list);
2319
2320                 /* gnp_connecting and membership on gnd_connd_peers should be
2321                  * done coherently to avoid double adding, etc */
2322                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2323                  * to get the peer to gnp_connecting in the first place. We just need to
2324                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2325                  * BEFORE clearing gnp_connecting */
2326                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2327                          peer, libcfs_nid2str(peer->gnp_nid));
2328
2329                 spin_unlock(&dev->gnd_connd_lock);
2330
2331                 CDEBUG(D_NET, "processing connect to %s\n",
2332                        libcfs_nid2str(peer->gnp_nid));
2333
2334                 did_something += 1;
2335                 rc = kgnilnd_start_connect(peer);
2336
2337                 if (likely(rc >= 0)) {
2338                         /* 0 on success, positive on 'just drop peer' errors */
2339                         kgnilnd_peer_decref(peer);
2340                 } else if (rc == -ENOMEM) {
2341                         /* if we are out of wildcards, add back to
2342                          * connd_list - then break out and we'll try later
2343                          * if other errors, we'll bail & cancel pending tx */
2344                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2345                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2346                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2347                                 spin_lock(&dev->gnd_connd_lock);
2348                                 list_add_tail(&peer->gnp_connd_list,
2349                                               &dev->gnd_connd_peers);
2350                         } else {
2351                                 /* connecting changed while we were posting */
2352
2353                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2354                                         " state 0x%p->%s, connecting %d\n",
2355                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2356                                 peer->gnp_connecting = GNILND_PEER_KILL;
2357                                 spin_lock(&dev->gnd_connd_lock);
2358                                 /* remove the peer ref frrom the cond list */
2359                                 kgnilnd_peer_decref(peer);
2360                                 /* let the system handle itself */
2361                         }
2362                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2363                         /* the datagrams are a global pool,
2364                          * so break out of trying and hope some free
2365                          * up soon */
2366                         did_something -= 1;
2367                         break;
2368                 } else {
2369                         /* something bad happened, you lose */
2370                         CNETERR("could not start connecting to %s "
2371                                 "rc %d: Will retry until TX timeout\n",
2372                                libcfs_nid2str(peer->gnp_nid), rc);
2373                         /* It didnt post so just set connecting back to zero now.
2374                          * The reaper will reattempt the connection if it needs too.
2375                          * If the peer needs death set it so the reaper will cleanup.
2376                          */
2377                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2378                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2379                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2380                                 kgnilnd_peer_increase_reconnect_locked(peer);
2381                         } else {
2382                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2383                                         " state 0x%p->%s, connecting %d\n",
2384                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2385                                 peer->gnp_connecting = GNILND_PEER_KILL;
2386                         }
2387                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2388
2389                         /* hold onto ref until we are really done - if it was
2390                          * unlinked this could result in a destroy */
2391                         kgnilnd_peer_decref(peer);
2392                 }
2393                 spin_lock(&dev->gnd_connd_lock);
2394         }
2395
2396         spin_unlock(&dev->gnd_connd_lock);
2397         RETURN(did_something);
2398 }
2399
2400 int
2401 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2402 {
2403         int did_something = 0, to_repost, i;
2404         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2405         ENTRY;
2406
2407         for (i = 0; i < to_repost; ++i) {
2408                 int     rerc;
2409                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2410                 if (rerc == 0) {
2411                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2412                         did_something += 1;
2413                 } else {
2414                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2415                                 rerc, dev->gnd_id);
2416                         break;
2417                 }
2418         }
2419
2420         RETURN(did_something);
2421 }
2422
2423 static void
2424 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2425 {
2426         int             dev_id = arg;
2427         kgn_device_t    *dev = &kgnilnd_data.kgn_devices[dev_id];
2428
2429         wake_up(&dev->gnd_dgram_waitq);
2430 }
2431
2432 /* use single thread for dgrams - should be sufficient for performance */
2433 int
2434 kgnilnd_dgram_mover(void *arg)
2435 {
2436         kgn_device_t            *dev = (kgn_device_t *)arg;
2437         char                     name[16];
2438         int                      rc, did_something;
2439         unsigned long            next_purge_check = jiffies - 1;
2440         unsigned long            timeout;
2441         struct timer_list        timer;
2442         unsigned long            deadline = 0;
2443         DEFINE_WAIT(wait);
2444
2445         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2446         cfs_block_allsigs();
2447         /* all gnilnd threads need to run fairly urgently */
2448         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2449
2450         /* we are ok not locking for these variables as the dgram waitq threads
2451          * will block both due to tying up net (kgn_shutdown) and the completion
2452          * event for the dgram_waitq (kgn_quiesce_trigger) */
2453         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2454         while (!kgnilnd_data.kgn_shutdown) {
2455                 /* Safe: kgn_shutdown only set when quiescent */
2456
2457                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2458                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2459                  * quiesce check so that it'll go right into that and not do any
2460                  * dgram mucking */
2461                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2462
2463                 /* to quiesce or to not quiesce, that is the question */
2464                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2465                         KGNILND_SPIN_QUIESCE;
2466                 }
2467                 did_something = 0;
2468
2469                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2470
2471                 /* process any newly completed dgrams */
2472                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2473
2474                 rc = kgnilnd_probe_and_process_dgram(dev);
2475                 if (rc > 0) {
2476                         did_something += rc;
2477                 }
2478
2479                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2480
2481                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2482                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2483                 /* start new outbound dgrams */
2484                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2485
2486                 /* find dead dgrams */
2487                 if (time_after_eq(jiffies, next_purge_check)) {
2488                         /* these don't need to be checked that often */
2489                         kgnilnd_reaper_dgram_check(dev);
2490
2491                         next_purge_check = (long) jiffies +
2492                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2493                 }
2494
2495                 did_something += kgnilnd_repost_wc_dgrams(dev);
2496
2497                 /* careful with the jiffy wrap... */
2498                 timeout = (long)(next_purge_check - jiffies);
2499
2500                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2501                        did_something, timeout, next_purge_check, jiffies);
2502
2503                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2504                         did_something = 0;
2505                         continue;
2506                 }
2507
2508                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2509
2510                 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2511                 mod_timer(&timer, (long) jiffies + timeout);
2512
2513                 /* last second chance for others to poke us */
2514                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2515
2516                 /* check flag variables before comittingi even if we did something;
2517                  * if we are after the deadline call schedule */
2518                 if ((!did_something || time_after(jiffies, deadline)) &&
2519                     !kgnilnd_data.kgn_shutdown &&
2520                     !kgnilnd_data.kgn_quiesce_trigger) {
2521                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2522                                timeout, cfs_duration_sec(timeout));
2523                         wake_up_all(&dev->gnd_dgping_waitq);
2524                         schedule();
2525                         CDEBUG(D_INFO, "awake after schedule\n");
2526                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2527                 }
2528
2529                 del_singleshot_timer_sync(&timer);
2530                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2531         }
2532
2533         kgnilnd_thread_fini();
2534         return 0;
2535 }