lnet/klnds/gnilnd/gnilnd_conn.c

   1 /*
   2  * Copyright (C) 2012 Cray, Inc.
   3  *
   4  * Copyright (c) 2014, Intel Corporation.
   5  *
   6  *   Author: Nic Henke <nic@cray.com>
   7  *   Author: James Shimek <jshimek@cray.com>
   8  *
   9  *   This file is part of Lustre, http://www.lustre.org.
  10  *
  11  *   Lustre is free software; you can redistribute it and/or
  12  *   modify it under the terms of version 2 of the GNU General Public
  13  *   License as published by the Free Software Foundation.
  14  *
  15  *   Lustre is distributed in the hope that it will be useful,
  16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  *   GNU General Public License for more details.
  19  *
  20  *   You should have received a copy of the GNU General Public License
  21  *   along with Lustre; if not, write to the Free Software
  22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  *
  24  */
  25
  26 #include "gnilnd.h"
  27
  28 void
  29 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
  30 {
  31         smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
  32         smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
  33         smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
  34 }
  35
  36 int
  37 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
  38 {
  39         gni_return_t            rrc;
  40         __u32                   flags = GNI_MEM_READWRITE;
  41
  42         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
  43                 flags |= GNI_MEM_PHYS_CONT;
  44         }
  45
  46         /* make sure we are mapping a clean block */
  47         LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
  48
  49         rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
  50                                    fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
  51                                    flags, &fma_blk->gnm_hndl);
  52         if (rrc != GNI_RC_SUCCESS) {
  53                 /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
  54                  * -- like when under MDD or GART pressure on big systems
  55                  */
  56                 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
  57                         fma_blk, fma_blk->gnm_mbox_size, flags);
  58                 RETURN(-ENOMEM);
  59         }
  60
  61         /* PHYS_CONT memory isn't really mapped, at least not in GART -
  62          *  but all mappings chew up a MDD
  63          */
  64         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
  65                 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
  66         }
  67
  68         atomic_inc(&device->gnd_n_mdd);
  69         /* nfmablk is live (mapped) blocks */
  70         atomic_inc(&device->gnd_nfmablk);
  71
  72         RETURN(0);
  73 }
  74
  75 int
  76 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
  77 {
  78         int                     rc = 0;
  79         int                     num_mbox;
  80         kgn_fma_memblock_t     *fma_blk;
  81         gni_smsg_attr_t         smsg_attr;
  82         unsigned long           fmablk_vers;
  83
  84         /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
  85          * to this allocation code. Everyone will sample the version
  86          * before and after getting the mutex. If it has changed,
  87          * we'll bail out to check the lists again - this indicates that
  88          * some sort of change was made to the lists and it is possible
  89          * that there is a mailbox for us to find now. This should prevent
  90          * a ton of spinning in the case where there are lots of threads
  91          * that need a yet-to-be-allocated mailbox for a connection. */
  92
  93         fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
  94         mutex_lock(&device->gnd_fmablk_mutex);
  95
  96         if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
  97                 /* version changed while we were waiting for semaphore,
  98                  * we'll recheck the lists assuming something nice happened */
  99                 mutex_unlock(&device->gnd_fmablk_mutex);
 100                 return 0;
 101         }
 102
 103         LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
 104         if (fma_blk == NULL) {
 105                 CNETERR("could not allocate fma block descriptor\n");
 106                 rc = -ENOMEM;
 107                 GOTO(out, rc);
 108         }
 109
 110         INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
 111
 112         kgnilnd_setup_smsg_attr(&smsg_attr);
 113
 114         gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
 115
 116         LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
 117
 118         /* gni_smsg_buff_size_needed calculates the base mailbox size and since
 119          * we want to hold kgn_peer_credits worth of messages in both directions,
 120          * we add PAYLOAD to grow the mailbox size
 121          */
 122
 123         fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
 124
 125         /* we'll only use physical during preallocate at startup -- this keeps it nice and
 126          * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
 127          * as reallocating them is tough if there is memory fragmentation */
 128
 129         if (use_phys) {
 130                 fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
 131                 if (fma_blk->gnm_block == NULL) {
 132                         CNETERR("could not allocate physical SMSG mailbox memory\n");
 133                         rc = -ENOMEM;
 134                         GOTO(free_desc, rc);
 135                 }
 136                 fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
 137                 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
 138
 139                 LASSERTF(num_mbox >= 1,
 140                          "num_mbox %d blk_size %u mbox_size %d\n",
 141                           num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
 142
 143                 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
 144
 145         } else {
 146                 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
 147                 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
 148
 149                 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
 150                          "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
 151                          num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
 152                          *kgnilnd_tunables.kgn_mbox_per_block);
 153
 154                 LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
 155                 if (fma_blk->gnm_block == NULL) {
 156                         CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
 157                         rc = -ENOMEM;
 158                         GOTO(free_desc, rc);
 159                 }
 160
 161                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
 162         }
 163
 164         /* allocate just enough space for the bits to track the mailboxes */
 165         LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
 166         if (fma_blk->gnm_bit_array == NULL) {
 167                 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
 168                        sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
 169                 rc = -ENOMEM;
 170                 GOTO(free_blk, rc);
 171         }
 172         bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
 173
 174         /* now that the num_mbox is set based on allocation type, get debug info setup */
 175         LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
 176         if (fma_blk->gnm_mbox_info == NULL) {
 177                 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
 178                        sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
 179                 rc = -ENOMEM;
 180                 GOTO(free_bit, rc);
 181         }
 182
 183         rc = kgnilnd_map_fmablk(device, fma_blk);
 184         if (rc) {
 185                 GOTO(free_info, rc);
 186         }
 187
 188         fma_blk->gnm_next_avail_mbox = 0;
 189         fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
 190
 191         CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
 192                 "mbox_size %d MDD "LPX64"."LPX64"\n",
 193                 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
 194                 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
 195                 fma_blk->gnm_hndl.qword2);
 196
 197         /* lock Is protecting data structures, not semaphore */
 198
 199         spin_lock(&device->gnd_fmablk_lock);
 200         list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
 201
 202         /* toggle under the lock so once they change the list is also
 203          * ready for others to traverse */
 204         atomic_inc(&device->gnd_fmablk_vers);
 205
 206         spin_unlock(&device->gnd_fmablk_lock);
 207
 208         mutex_unlock(&device->gnd_fmablk_mutex);
 209
 210         return 0;
 211
 212 free_info:
 213         LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
 214 free_bit:
 215         LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
 216 free_blk:
 217         if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
 218                 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
 219         } else {
 220                 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
 221         }
 222 free_desc:
 223         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
 224 out:
 225         mutex_unlock(&device->gnd_fmablk_mutex);
 226         return rc;
 227 }
 228
 229 void
 230 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 231 {
 232         gni_return_t            rrc;
 233
 234         /* if some held, set hold_timeout from conn timeouts used in this block
 235          * but not during shutdown, then just nuke and pave */
 236         if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
 237                 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
 238         }
 239
 240         /* we are changing the state of a block, tickle version to tell
 241          * proc code list is stale now */
 242         atomic_inc(&dev->gnd_fmablk_vers);
 243
 244         rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
 245
 246         CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
 247                "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
 248                 "hold_timeout %d\n",
 249                fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
 250                fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
 251                fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
 252                fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
 253
 254         LASSERTF(rrc == GNI_RC_SUCCESS,
 255                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
 256                 fma_blk, rrc);
 257
 258         if (fma_blk->gnm_hold_timeout) {
 259                 atomic_inc(&dev->gnd_n_mdd_held);
 260         } else {
 261                 atomic_dec(&dev->gnd_n_mdd);
 262         }
 263
 264         /* PHYS blocks don't get mapped */
 265         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
 266                 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
 267                 fma_blk->gnm_state = GNILND_FMABLK_IDLE;
 268         } else if (kgnilnd_data.kgn_in_reset) {
 269                 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
 270                  * re-use the fma_blk after reset so we don't have to drop/allocate
 271                  * all of those physical blocks */
 272                 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
 273         }
 274
 275         /* Decrement here as this is the # of mapped blocks */
 276         atomic_dec(&dev->gnd_nfmablk);
 277 }
 278
 279
 280 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
 281 void
 282 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 283 {
 284         LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
 285                  "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
 286                  fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
 287                 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
 288
 289         atomic_inc(&dev->gnd_fmablk_vers);
 290
 291         if (fma_blk->gnm_hold_timeout) {
 292                 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
 293                         "mbox_size %d\n",
 294                         fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
 295                         fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
 296
 297                 /* We leave MDD dangling over stack reset */
 298                 if (!kgnilnd_data.kgn_in_reset) {
 299                         kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
 300                 }
 301                 /* ignoring the return code - if kgni/ghal can't find it
 302                  * it must be released already */
 303                 atomic_dec(&dev->gnd_n_mdd_held);
 304                 atomic_dec(&dev->gnd_n_mdd);
 305         }
 306
 307         /* we cant' free the gnm_block until all the conns have released their
 308          * purgatory holds. While we have purgatory holds, we might check the conn
 309          * RX mailbox during the CLOSING process. It is possible that kgni might
 310          * try to look into the RX side for credits when sending the CLOSE msg too */
 311         CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
 312                 fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
 313
 314         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
 315                 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
 316         } else {
 317                 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
 318         }
 319         fma_blk->gnm_state = GNILND_FMABLK_FREED;
 320
 321         list_del(&fma_blk->gnm_bufflist);
 322
 323         LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
 324         LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
 325         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
 326 }
 327
 328 void
 329 kgnilnd_find_free_mbox(kgn_conn_t *conn)
 330 {
 331         kgn_device_t            *dev = conn->gnc_device;
 332         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
 333         kgn_fma_memblock_t      *fma_blk;
 334         kgn_mbox_info_t         *mbox = NULL;
 335         int                     id;
 336
 337         spin_lock(&dev->gnd_fmablk_lock);
 338
 339         list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
 340                             gnm_bufflist) {
 341                 if (fma_blk->gnm_avail_mboxs <= 0 ||
 342                     fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
 343                         continue;
 344                 }
 345                 /* look in bitarray for available mailbox */
 346                 do {
 347                         id = find_next_zero_bit(
 348                                 fma_blk->gnm_bit_array,
 349                                 fma_blk->gnm_num_mboxs,
 350                                 fma_blk->gnm_next_avail_mbox);
 351                       if (id == fma_blk->gnm_num_mboxs &&
 352                           fma_blk->gnm_next_avail_mbox != 0) {
 353                                 /* wrap around */
 354                                 fma_blk->gnm_next_avail_mbox = 0;
 355                         } else {
 356                                 break;
 357                         }
 358                 } while (1);
 359
 360                 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
 361                          id, fma_blk->gnm_num_mboxs);
 362                 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
 363                 conn->gnc_mbox_id = id;
 364
 365                 fma_blk->gnm_next_avail_mbox =
 366                         (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
 367                 fma_blk->gnm_avail_mboxs--;
 368                 conn->gnc_fma_blk = fma_blk;
 369
 370                 kgnilnd_setup_smsg_attr(smsg_attr);
 371
 372                 smsg_attr->msg_buffer = fma_blk->gnm_block;
 373                 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
 374                 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
 375                 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
 376
 377                 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
 378                  * reset and re-use the same fma_blk after stack reset. This ensures we've
 379                  * properly mapped it before we use it */
 380                 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
 381                          fma_blk, fma_blk->gnm_state);
 382
 383                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
 384                         "allocating SMSG mbox %d buf %p "
 385                         "offset %u hndl "LPX64"."LPX64"\n",
 386                         conn, smsg_attr, fma_blk, id,
 387                         smsg_attr->msg_buffer, smsg_attr->mbox_offset,
 388                         fma_blk->gnm_hndl.qword1,
 389                         fma_blk->gnm_hndl.qword2);
 390
 391                 mbox = &fma_blk->gnm_mbox_info[id];
 392                 mbox->mbx_create_conn_memset = jiffies;
 393                 mbox->mbx_nallocs++;
 394                 mbox->mbx_nallocs_total++;
 395
 396                 /* zero mbox to remove any old data from our last use.
 397                  * this better be safe, if not our purgatory timers
 398                  * are too short or a peer really is misbehaving */
 399                 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
 400                        0, smsg_attr->buff_size);
 401                 break;
 402         }
 403
 404         spin_unlock(&dev->gnd_fmablk_lock);
 405 }
 406
 407 int
 408 kgnilnd_setup_mbox(kgn_conn_t *conn)
 409 {
 410         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
 411         int                      err = 0;
 412
 413         smsg_attr->msg_buffer = NULL;
 414         /* Look for available mbox */
 415         do {
 416                 kgnilnd_find_free_mbox(conn);
 417
 418                 /* nothing in the existing buffers, make a new one */
 419                 if (smsg_attr->msg_buffer == NULL) {
 420                         /* for runtime allocations, we only want vmalloc */
 421                         err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
 422                         if (err) {
 423                                 break;
 424                         }
 425                 }
 426         } while (smsg_attr->msg_buffer == NULL);
 427
 428         if (err)
 429                 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
 430                         conn, err);
 431         return err;
 432 }
 433
 434 void
 435 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
 436 {
 437         kgn_device_t           *dev = conn->gnc_device;
 438         gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
 439         kgn_fma_memblock_t     *fma_blk = NULL;
 440         kgn_mbox_info_t        *mbox = NULL;
 441         int                     found = 0;
 442         int                     id;
 443
 444         /* if we failed to setup mbox and now destroying conn */
 445         if (smsg_attr->msg_buffer == NULL) {
 446                 return;
 447         }
 448
 449         id = conn->gnc_mbox_id;
 450
 451         spin_lock(&dev->gnd_fmablk_lock);
 452         /* make sure our conn points at a valid fma_blk
 453          * We use this instead of a mem block search out of smsg_attr
 454          * because we could have freed a block for fma_blk #1 but the fma_blk
 455          * is still in the list for a purgatory hold. This would induce a false
 456          * match if that same block gets reallocated to fma_blk #2 */
 457         list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
 458                 if (fma_blk == conn->gnc_fma_blk) {
 459                         found = 1;
 460                         break;
 461                 }
 462         }
 463         LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
 464                  "anywhere in the world\n", conn, conn->gnc_fma_blk);
 465
 466         LASSERTF(id < fma_blk->gnm_num_mboxs,
 467                 "bad id %d max %d\n",
 468                 id, fma_blk->gnm_num_mboxs);
 469
 470         /* < 0 - was held, now free it
 471          * == 0 - just free it
 472          * > 0 - hold it for now */
 473         if (purgatory_hold == 0) {
 474                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
 475                         "hndl "LPX64"."LPX64"\n",
 476                         conn, smsg_attr, fma_blk, id,
 477                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
 478                 fma_blk->gnm_avail_mboxs++;
 479
 480         } else if (purgatory_hold > 0) {
 481                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
 482                         "hndl "LPX64"."LPX64"\n",
 483                         conn, smsg_attr, fma_blk, id,
 484                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
 485
 486                 fma_blk->gnm_held_mboxs++;
 487                 fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
 488                                                 conn->gnc_timeout);
 489         } else {
 490                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
 491                         "hndl "LPX64"."LPX64"\n",
 492                         conn, smsg_attr, fma_blk, id,
 493                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
 494
 495                 fma_blk->gnm_held_mboxs--;
 496                 fma_blk->gnm_avail_mboxs++;
 497         }
 498
 499         if (purgatory_hold <= 0) {
 500                 /* if kgni is retransmitting, freeing the smsg block before the EP
 501                  * is destroyed gets messy. Bug 768295. */
 502                 LASSERTF(conn->gnc_ephandle == NULL,
 503                          "can't release mbox before EP is nuked. conn 0x%p\n", conn);
 504
 505                 mbox = &fma_blk->gnm_mbox_info[id];
 506                 mbox->mbx_release_from_purgatory = jiffies;
 507
 508                 /* clear conn gnc_fmablk if it is gone - this allows us to
 509                  * not worry about state so much in kgnilnd_destroy_conn
 510                  * and makes the guaranteed cleanup of the resources easier */
 511                 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
 512                         "conn %p bit %d already cleared in fma_blk %p\n",
 513                          conn, id, fma_blk);
 514                 conn->gnc_fma_blk = NULL;
 515                 mbox->mbx_nallocs--;
 516         }
 517
 518         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
 519                 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
 520                        "as mapped\n", fma_blk);
 521                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
 522         }
 523
 524         /* we don't release or unmap PHYS blocks as part of the normal cycle --
 525          * those are controlled manually from startup/shutdown */
 526         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
 527                 /* we can unmap once all are unused (held or avail)
 528                  * but check hold_timeout to make sure we are not trying to double
 529                  * unmap this buffer. If there was no hold_timeout set due to
 530                  * held_mboxs, we'll free the mobx here shortly and won't have to
 531                  * worry about catching a double free for a 'clean' fma_blk */
 532                 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
 533                     (!fma_blk->gnm_hold_timeout)) {
 534                         kgnilnd_unmap_fmablk(dev, fma_blk);
 535                 }
 536
 537                 /* But we can only free once they are all avail */
 538                 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
 539                     fma_blk->gnm_held_mboxs == 0) {
 540                         /* all mailboxes are released, free fma_blk */
 541                         kgnilnd_free_fmablk_locked(dev, fma_blk);
 542                 }
 543         }
 544
 545         spin_unlock(&dev->gnd_fmablk_lock);
 546 }
 547
 548 int
 549 kgnilnd_count_phys_mbox(kgn_device_t *device)
 550 {
 551         int                     i = 0;
 552         kgn_fma_memblock_t     *fma_blk;
 553
 554         spin_lock(&device->gnd_fmablk_lock);
 555
 556         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
 557                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
 558                         i += fma_blk->gnm_num_mboxs;
 559         }
 560         spin_unlock(&device->gnd_fmablk_lock);
 561
 562         RETURN(i);
 563 }
 564
 565 int
 566 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
 567 {
 568         int     rc;
 569
 570         while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
 571
 572                 rc = kgnilnd_alloc_fmablk(device, 1);
 573                 if (rc) {
 574                         CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
 575                                 kgnilnd_count_phys_mbox(device), rc);
 576                         RETURN(rc);
 577                 }
 578         }
 579         RETURN(0);
 580 }
 581
 582 int
 583 kgnilnd_map_phys_fmablk(kgn_device_t *device)
 584 {
 585
 586         int                     rc = 0;
 587         kgn_fma_memblock_t     *fma_blk;
 588
 589         /* use mutex to gate access to single thread, just in case */
 590         mutex_lock(&device->gnd_fmablk_mutex);
 591
 592         spin_lock(&device->gnd_fmablk_lock);
 593
 594         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
 595                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
 596                         rc = kgnilnd_map_fmablk(device, fma_blk);
 597                         if (rc)
 598                                 break;
 599                 }
 600         }
 601         spin_unlock(&device->gnd_fmablk_lock);
 602
 603         mutex_unlock(&device->gnd_fmablk_mutex);
 604
 605         RETURN(rc);
 606 }
 607
 608 void
 609 kgnilnd_unmap_fma_blocks(kgn_device_t *device)
 610 {
 611
 612         kgn_fma_memblock_t      *fma_blk;
 613
 614         /* use mutex to gate access to single thread, just in case */
 615         mutex_lock(&device->gnd_fmablk_mutex);
 616
 617         spin_lock(&device->gnd_fmablk_lock);
 618
 619         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
 620                 kgnilnd_unmap_fmablk(device, fma_blk);
 621         }
 622         spin_unlock(&device->gnd_fmablk_lock);
 623
 624         mutex_unlock(&device->gnd_fmablk_mutex);
 625 }
 626
 627 void
 628 kgnilnd_free_phys_fmablk(kgn_device_t *device)
 629 {
 630
 631         kgn_fma_memblock_t      *fma_blk, *fma_blkN;
 632
 633         /* use mutex to gate access to single thread, just in case */
 634         mutex_lock(&device->gnd_fmablk_mutex);
 635
 636         spin_lock(&device->gnd_fmablk_lock);
 637
 638         list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
 639                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
 640                         kgnilnd_free_fmablk_locked(device, fma_blk);
 641         }
 642         spin_unlock(&device->gnd_fmablk_lock);
 643
 644         mutex_unlock(&device->gnd_fmablk_mutex);
 645 }
 646
  647 /* kgnilnd dgram nid->struct management */
 648
 649 static inline struct list_head *
 650 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
 651 {
 652         unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
 653
 654         RETURN(&dev->gnd_dgrams[hash]);
 655 }
 656
 657
 658 /* needs dev->gnd_dgram_lock held */
 659 kgn_dgram_t *
 660 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
 661 {
 662         struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
 663         kgn_dgram_t      *dgram;
 664
 665         list_for_each_entry(dgram, dgram_list, gndg_list) {
 666
 667                 /* if state > POSTED, we are already handling cancel/completion */
 668                 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
 669                      dgram->gndg_state > GNILND_DGRAM_POSTED)
 670                         continue;
 671
 672                 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
 673                        dgram, libcfs_nid2str(dst_nid));
 674                 return dgram;
 675         }
 676         return NULL;
 677 }
 678
 679 int
 680 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
 681 {
 682         kgn_dgram_t     *dgram;
 683
 684         spin_lock(&dev->gnd_dgram_lock);
 685         dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
 686
 687         if (dgram) {
 688                 kgnilnd_cancel_dgram_locked(dgram);
 689         }
 690         spin_unlock(&dev->gnd_dgram_lock);
 691
 692         RETURN(!!(dgram == NULL));
 693 }
 694
 695 int
 696 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
 697                      lnet_nid_t srcnid, lnet_nid_t dstnid,
 698                      kgn_connreq_type_t type)
 699 {
 700         int err = 0;
 701
 702         /* ensure we haven't violated max datagram size */
 703         CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
 704
 705         /* no need to zero out, we do that when allocating dgram */
 706         connreq->gncr_magic     = GNILND_MSG_MAGIC;
 707
 708         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
 709                 srcnid = 0xABADBABE;
 710         } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
 711                 dstnid = 0xDEFEC8ED;
 712         }
 713
 714         connreq->gncr_srcnid    = srcnid;
 715         connreq->gncr_dstnid    = dstnid;
 716
 717         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 718                 connreq->gncr_version = 99;
 719         } else {
 720                 connreq->gncr_version   = GNILND_CONNREQ_VERSION;
 721         }
 722         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 723                 connreq->gncr_type = 99;
 724         } else {
 725                 connreq->gncr_type      = type;
 726         }
 727         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 728                 connreq->gncr_peerstamp = 0;
 729         } else {
 730                 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
 731         }
 732         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 733                 connreq->gncr_connstamp = 0;
 734         } else {
 735                 connreq->gncr_connstamp = conn->gnc_my_connstamp;
 736         }
 737         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
 738                 connreq->gncr_timeout = 0;
 739         } else {
 740                 connreq->gncr_timeout   = conn->gnc_timeout;
 741         }
 742
 743         /* the rest pack the data into the payload in other places */
 744         if (type == GNILND_CONNREQ_REQ) {
 745                 kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
 746                 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
 747                 req_params->gnpr_cqid = conn->gnc_cqid;
 748
 749                 /* allocate mailbox for this connection */
 750                 err = kgnilnd_setup_mbox(conn);
 751                 if (err != 0) {
 752                         CERROR("Failed to setup FMA mailbox (%d)\n", err);
 753                 }
 754                 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
 755         }
 756
 757         /* XXX Nic: TBD - checksum computation */
 758
 759         return err;
 760 }
 761
 762 int
 763 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
 764 {
 765         kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
 766         int                      swab, rc = 0;
 767         kgn_net_t               *net;
 768
 769         /* the following fields must be handled in a backwards compatible
 770          * manner to ensure we can always send and interpret NAKs */
 771
 772         if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
 773             connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
 774                 /* Unexpected magic! */
 775                 CERROR("Unexpected magic %08x\n",
 776                        connreq->gncr_magic);
 777                 return -EBADF;
 778         }
 779
 780         swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
 781         if (swab) {
 782                 __swab32s(&connreq->gncr_magic);
 783                 __swab32s(&connreq->gncr_cksum);
 784                 __swab16s(&connreq->gncr_type);
 785                 __swab16s(&connreq->gncr_version);
 786                 __swab32s(&connreq->gncr_timeout);
 787                 __swab64s(&connreq->gncr_srcnid);
 788                 __swab64s(&connreq->gncr_dstnid);
 789                 __swab64s(&connreq->gncr_peerstamp);
 790                 __swab64s(&connreq->gncr_connstamp);
 791         }
 792
 793         /* Do NOT return anything but -EBADF before we munge
 794          * connreq->gncr_srcnid - we need that to send the nak */
 795
 796         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
 797                 lnet_nid_t      incoming = connreq->gncr_srcnid;
 798
 799                 /* even if the incoming packet is hosed, we know who we sent
 800                  * the original and can set the srcnid so that we can properly
 801                  * look up our peer to close the loop on this connreq. We still use
 802                  * -EBADF to prevent a NAK - just in case there are issues with
 803                  * the payload coming from a random spot, etc. */
 804                 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
 805
 806                 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
 807                                 LNET_NIDADDR(incoming)) {
 808                         /* we got a datagram match for the wrong nid... */
 809                         CERROR("matched datagram 0x%p with srcnid %s "
 810                                 "(%x), expecting %s (%x)\n",
 811                                 dgram,
 812                                 libcfs_nid2str(incoming),
 813                                 LNET_NIDADDR(incoming),
 814                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
 815                                 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
 816                         return -EBADF;
 817                 }
 818         } else {
 819                 /* if we have a wildcard datagram it should match an
 820                  * incoming "active" datagram that should have a fully formed
 821                  * srcnid and dstnid. If we couldn't unpack it, we drop as
 822                  * corrupted packet, otherwise we'll just verify that the dstnid
 823                  * matches the NID for the NET that the dgram was posted */
 824
 825                 /* make sure their wildcard didn't match ours, that is unpossible */
 826                 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
 827                          "dgram 0x%p from %s, connreq 0x%p; "
 828                          "wildcard matched wildcard \n", dgram,
 829                          libcfs_nid2str(connreq->gncr_srcnid), connreq);
 830
 831                 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
 832
 833                 if (rc == -ESHUTDOWN) {
 834                         CERROR("Looking up network: device is in shutdown");
 835                         return rc;
 836                 } else if (rc == -ENONET) {
 837                         CERROR("Connection data from %s: she sent "
 838                         "dst_nid %s, but net lookup failed on "
 839                         "dgram 0x%p@%s\n",
 840                         libcfs_nid2str(connreq->gncr_srcnid),
 841                         libcfs_nid2str(connreq->gncr_dstnid),
 842                         dgram, kgnilnd_dgram_type2str(dgram));
 843                         return rc;
 844                 }
 845
 846                 if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
 847                         CERROR("Bad connection data from %s: she sent "
 848                                "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
 849                                libcfs_nid2str(connreq->gncr_srcnid),
 850                                libcfs_nid2str(connreq->gncr_dstnid),
 851                                libcfs_nid2str(net->gnn_ni->ni_nid),
 852                                dgram, kgnilnd_dgram_type2str(dgram));
 853                         kgnilnd_net_decref(net);
 854                         return -EBADSLT;
 855                 }
 856
 857                 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
 858                 kgnilnd_net_decref(net);
 859         }
 860
 861         if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
 862                 CERROR("Unexpected version %d\n", connreq->gncr_version);
 863                 return -EPROTO;
 864         }
 865
 866         /* XXX Nic: TBD - checksum validation */
 867         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
 868                 return -EBADF;
 869         }
 870
 871         if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
 872                 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
 873
 874                 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
 875                 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
 876                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
 877                 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
 878                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
 879                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
 880                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
 881                 __swab64s(&msg_addr);
 882                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
 883                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
 884         } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
 885                 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
 886         }
 887
 888         /* since we use a unique instance ID for each network, the driver
 889          * will take care of dropping datagrams if we don't have that network.
 890          */
 891
 892         /* few more idiot software or configuration checks */
 893
 894         switch (connreq->gncr_type) {
 895         case GNILND_CONNREQ_REQ:
 896                 /* wire up EP and SMSG block - this will check the incoming data
 897                  * and barf a NAK back if need to */
 898                 rc = kgnilnd_set_conn_params(dgram);
 899                 if (rc)
 900                         return rc;
 901                 break;
 902         case GNILND_CONNREQ_NAK:
 903         case GNILND_CONNREQ_CLOSE:
 904                 break;
 905         default:
 906                 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
 907                 return -EPROTO;
 908         }
 909
 910         if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
 911                 CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
 912                 connreq->gncr_peerstamp, connreq->gncr_connstamp);
 913                 return -EPROTO;
 914         }
 915
 916         if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
 917                 CERROR("Received timeout %d < MIN %d\n",
 918                        connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
 919                 return -EPROTO;
 920         }
 921
 922         return 0;
 923 }
 924
 925 int
 926 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
 927 {
 928         kgn_dgram_t         *dgram;
 929
 930         dgram = kmem_cache_alloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
 931         if (dgram == NULL)
 932                 return -ENOMEM;
 933
 934         /* cache alloc'd memory is not zeroed */
 935         memset((void *)dgram, 0, sizeof(*dgram)) ;
 936
 937         INIT_LIST_HEAD(&dgram->gndg_list);
 938         dgram->gndg_state = GNILND_DGRAM_USED;
 939         dgram->gndg_type = type;
 940         dgram->gndg_magic = GNILND_DGRAM_MAGIC;
 941
 942         atomic_inc(&dev->gnd_ndgrams);
 943
 944         CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
 945                 " %d\n",
 946                 sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
 947                 atomic_read(&dev->gnd_ndgrams));
 948
 949         *dgramp = dgram;
 950         return 0;
 951 }
 952
 953 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
 954  * returns < 0 on dgram to be cleaned up
 955  * > 0 on dgram that isn't done yet
 956  * == 0 on dgram that is ok and needs connreq processing */
 957 int
 958 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
 959 {
 960         int rc = 0;
 961
 962         switch (post_state) {
 963         case GNI_POST_COMPLETED:
 964                 /* normal state for dgrams that need actual processing */
 965                 /* GOTO to avoid processing dgram as canceled/done */
 966                 GOTO(process_out, rc);
 967
 968         case GNI_POST_PENDING:
 969                 /* we should only see this if we are testing a WC dgram after a
 970                  * cancel - it means that it needs a full cycle of waiting
 971                  * for kgni_sm_task to finish moving it to TERMINATED */
 972                 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
 973                           (dgram->gndg_state == GNILND_DGRAM_CANCELED),
 974                          "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
 975                          dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
 976                          dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
 977
 978                 /* positive RC as this dgram isn't done yet */
 979                 rc = EINPROGRESS;
 980
 981                 /* GOTO as this isn't done yet */
 982                 GOTO(process_out, rc);
 983                 break;
 984
 985         case GNI_POST_TERMINATED:
 986                 /* we've called cancel and it is done or remote guy called cancel and
 987                  * we've receved it on a WC dgram */
 988 #if 0
 989                 /* we are seeing weird terminations on non WC dgrams when we have not
 990                  * canceled them */
 991
 992                 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
 993                          dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
 994                         "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
 995                         dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
 996                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
 997 #endif
 998
 999                 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
1000                        dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");
1001
1002                 rc =  -ECANCELED;
1003                 break;
1004
1005         case GNI_POST_TIMEOUT:
1006                 /* we could have a timeout on a wildcard dgram too - if
1007                  * we got the incoming request but the remote node beefed
1008                  * before kgni could send the match data back. We'll just error
1009                  * on the active case and bail out gracefully */
1010                 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1011                         CNETERR("hardware timeout for connect to "
1012                                "%s after %lu seconds. Is node dead?\n",
1013                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1014                                cfs_duration_sec(jiffies - dgram->gndg_post_time));
1015                 }
1016
1017                 rc = -ETIMEDOUT;
1018                 break;
1019
1020         default:
1021                 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1022                 LBUG();
1023         }
1024
1025         /* now finish cleaning up a dgram that is canceled/terminated and needs to
1026          * go away */
1027
1028         /* If this was actively canceled, drop the count now that we are processing */
1029         if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1030                 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1031                 /* caller responsible for gndg_list removal */
1032         }
1033
1034 process_out:
1035
1036         RETURN(rc);
1037 }
1038
1039 /* needs dev->gnd_dgram_lock held */
1040 void
1041 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1042 {
1043         gni_return_t            grc;
1044
1045         if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1046                 return;
1047         }
1048
1049         LASSERTF(dgram->gndg_conn != NULL,
1050                  "dgram 0x%p with NULL conn\n", dgram);
1051
1052         /* C.E - WC dgrams could be canceled immediately but
1053          * if there was some match pending, we need to call
1054          * test_by_id to clear it out. If that test returns
1055          * POST_PENDING, it is half done and needs to go along
1056          * with the rest of dgrams and go through a kgni_sm_task cycle
1057          * and deliver a GNI_POST_TERMINATED event before they
1058          * are actually canceled */
1059
1060         dgram->gndg_state = GNILND_DGRAM_CANCELED;
1061
1062         if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1063                 /* we don't need to cancel_by_id if the datagram was good */
1064                 return;
1065         }
1066
1067         /* let folks know there are outstanding cancels */
1068         atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1069         /* leave on nid list until cancel is done for debugging fun */
1070         grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1071
1072         /* if we don't get success here, we have hosed up the dgram tracking
1073          * code and need to bail out */
1074         LASSERTF(grc == GNI_RC_SUCCESS,
1075                  "postdata_cancel returned %d for conn 0x%p to %s\n",
1076                  grc, dgram->gndg_conn,
1077                  dgram->gndg_conn->gnc_peer ?
1078                   libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1079                   : "<?>");
1080
1081         CDEBUG(D_NETTRACE,
1082                 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1083                 dgram, dgram->gndg_conn,
1084                 dgram->gndg_conn->gnc_ephandle);
1085
1086         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1087                 gni_post_state_t         post_state;
1088                 int                      rc = 0;
1089                 __u32                    remote_addr = 0, remote_id = 0;
1090
1091                 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1092                                                      (__u64)dgram, &post_state,
1093                                                      &remote_addr, &remote_id);
1094
1095                 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1096                          "bad grc %d from test_by_id on dgram 0x%p\n",
1097                         grc, dgram);
1098
1099                 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1100                  * through full cycle, we get SUCCESS and need to parse post_state */
1101
1102                 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1103                         "remote_addr %u remote_id %u\n", grc, dgram,
1104                         kgnilnd_dgram_type2str(dgram),
1105                         post_state, remote_addr, remote_id);
1106
1107                 if (grc == GNI_RC_NO_MATCH) {
1108                         /* she's gone, reduce count and move along */
1109                         dgram->gndg_state = GNILND_DGRAM_DONE;
1110                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1111                         RETURN_EXIT;
1112                 }
1113
1114                 rc = kgnilnd_process_dgram(dgram, post_state);
1115
1116                 if (rc <= 0) {
1117                         /* if for some weird reason we get a valid dgram back, just mark as done
1118                          * so we can drop it and move along.
1119                          * C.E - if it was completed, we'll just release the conn/mbox
1120                          * back into the pool and it'll get reused. That said, we should only
1121                          * be canceling a WC dgram on stack rest or shutdown, so that is moot */
1122                         dgram->gndg_state = GNILND_DGRAM_DONE;
1123                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1124
1125                         /* caller context responsible for calling kgnilnd_release_dgram() */
1126                 } else {
1127                         /* still pending, let it simmer until golden brown and delicious */
1128                 }
1129         }
1130
1131         /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1132          * for kgni to return their ID to us via probe - that is when we'll complete their
1133          * cancel processing */
1134 }
1135
1136 void
1137 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1138 {
1139         /* release the dgram ref on conn */
1140         if (dgram->gndg_conn) {
1141                 kgnilnd_conn_decref(dgram->gndg_conn);
1142                 dgram->gndg_conn = NULL;
1143         }
1144 }
1145
1146 void
1147 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1148 {
1149         LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1150                  dgram->gndg_state == GNILND_DGRAM_DONE,
1151                  "dgram 0x%p with bad state %s\n",
1152                  dgram, kgnilnd_dgram_state2str(dgram));
1153
1154         /* bit of poisoning to help detect bad driver data */
1155         dgram->gndg_magic = 0x6f5a6b5f;
1156         atomic_dec(&dev->gnd_ndgrams);
1157
1158         kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1159         CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
1160                " ndgrams %d\n",
1161                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
1162                atomic_read(&dev->gnd_ndgrams));
1163 }
1164
1165 int
1166 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1167                    int data_rc)
1168 {
1169         int              rc = 0;
1170         kgn_dgram_t     *dgram = NULL;
1171         kgn_dgram_t     *tmpdgram;
1172         kgn_dgram_type_t dgtype;
1173         gni_return_t     grc;
1174         __u64            srcnid;
1175         ENTRY;
1176
1177         switch (type) {
1178         case GNILND_CONNREQ_REQ:
1179                 if (dstnid == LNET_NID_ANY)
1180                         dgtype = GNILND_DGRAM_WC_REQ;
1181                 else
1182                         dgtype = GNILND_DGRAM_REQ;
1183                 break;
1184         case GNILND_CONNREQ_NAK:
1185                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1186                 dgtype = GNILND_DGRAM_NAK;
1187                 break;
1188         default:
1189                 CERROR("unknown connreq type %d\n", type);
1190                 LBUG();
1191         }
1192
1193         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1194         if (rc < 0) {
1195                 rc = -ENOMEM;
1196                 GOTO(post_failed, rc);
1197         }
1198
1199         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1200         if (rc) {
1201                 GOTO(post_failed, rc);
1202         }
1203
1204         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1205                 /* clear buffer for sanity on reuse of wildcard */
1206                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1207         }
1208
1209         if (dstnid == LNET_NID_ANY) {
1210                 /* set here to reset any dgram re-use */
1211                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1212         } else {
1213                 __u32            host_id;
1214
1215                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1216                 if (rc <= 0) {
1217                         rc = -ESRCH;
1218                         GOTO(post_failed, rc);
1219                 }
1220
1221                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1222
1223                 /* don't need to serialize, there are no CQs for the dgram
1224                  * EP on the kgn_net_t */
1225                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1226
1227                 if (grc != GNI_RC_SUCCESS) {
1228                         rc = -ECONNABORTED;
1229                         GOTO(post_failed, rc);
1230                 }
1231
1232         }
1233
1234         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1235          * net of the destination node.
1236          */
1237
1238         if (dstnid == LNET_NID_ANY) {
1239                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1240         } else {
1241                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1242         }
1243
1244         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1245                                   srcnid, dstnid, type);
1246         if (rc) {
1247                 GOTO(post_failed, rc);
1248         }
1249
1250         if (type == GNILND_CONNREQ_NAK)
1251                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1252
1253         dgram->gndg_post_time = jiffies;
1254
1255         /* XXX Nic: here is where we'd add in logical network multiplexing */
1256
1257         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1258                dgram, kgnilnd_dgram_type2str(dgram),
1259                libcfs_nid2str(srcnid),
1260                libcfs_nid2str(dstnid), dev->gnd_id);
1261
1262         /* this allocates memory, can't hold locks across */
1263         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1264                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1265                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1266                                    (__u64)dgram);
1267
1268         if (grc != GNI_RC_SUCCESS) {
1269                 CNETERR("dropping failed dgram post id 0x%p type %s"
1270                         " reqtype %s to %s: rc %d\n",
1271                         dgram, kgnilnd_dgram_type2str(dgram),
1272                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1273                         libcfs_nid2str(dstnid), grc);
1274                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1275                 GOTO(post_failed, rc);
1276         }
1277
1278         /* we don't need to add earlier - if someone does del_peer during post,
1279          * that peer will get marked as unlinked and the callers wil take care of it.
1280          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1281          * the completed dgram later when we cant find a peer to stuff it into */
1282
1283         spin_lock(&dev->gnd_dgram_lock);
1284
1285         /* make sure we are not double posting targeted dgrams
1286          * - we can multiple post WC dgrams to help with processing speed */
1287         if (dstnid != LNET_NID_ANY) {
1288                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1289
1290                 LASSERTF(tmpdgram == NULL,
1291                         "dgram 0x%p->%s already posted\n",
1292                          dgram, libcfs_nid2str(dstnid));
1293         }
1294
1295         /* unmunge dstnid to help processing code cope... */
1296         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1297                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1298         }
1299
1300         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1301         dgram->gndg_state = GNILND_DGRAM_POSTED;
1302         spin_unlock(&dev->gnd_dgram_lock);
1303
1304 post_failed:
1305         if (rc < 0 && dgram != NULL) {
1306                 kgnilnd_cleanup_dgram(dgram);
1307                 kgnilnd_free_dgram(dev, dgram);
1308         }
1309
1310         RETURN(rc);
1311 }
1312
1313 /* The shutdown flag is set from the shutdown and stack reset threads. */
1314 void
1315 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1316 {
1317         /* The conns of canceled active dgrams need to be put in purgatory so
1318          * we don't reuse the mailbox */
1319         if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1320                 kgn_peer_t *peer;
1321                 kgn_conn_t *conn = dgram->gndg_conn;
1322                 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1323
1324                 dgram->gndg_state = GNILND_DGRAM_DONE;
1325
1326                 /* During shutdown we've already removed the peer so we don't
1327                  * need to add a peer. During stack reset we don't care about
1328                  * MDDs since they are all released. */
1329                 if (!shutdown) {
1330                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1331                         peer = kgnilnd_find_peer_locked(nid);
1332
1333                         if (peer != NULL) {
1334                                 CDEBUG(D_NET, "adding peer's conn with nid %s "
1335                                         "to purgatory\n", libcfs_nid2str(nid));
1336                                 kgnilnd_conn_addref(conn);
1337                                 conn->gnc_peer = peer;
1338                                 kgnilnd_peer_addref(peer);
1339                                 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1340                                 conn->gnc_state = GNILND_CONN_CLOSED;
1341                                 list_add_tail(&conn->gnc_list,
1342                                               &peer->gnp_conns);
1343                                 kgnilnd_add_purgatory_locked(conn,
1344                                                              conn->gnc_peer);
1345                                 kgnilnd_schedule_conn(conn);
1346                         }
1347                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1348                 }
1349         }
1350
1351         spin_lock(&dev->gnd_dgram_lock);
1352         kgnilnd_cancel_dgram_locked(dgram);
1353         spin_unlock(&dev->gnd_dgram_lock);
1354
1355         kgnilnd_cleanup_dgram(dgram);
1356
1357         /* if the dgram is 'canceled' it needs to be wait until the event
1358          * comes up from kgni that tells us it is safe to release */
1359         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1360                 dgram->gndg_state = GNILND_DGRAM_DONE;
1361
1362                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1363
1364                 /* if it is a wildcard and we are in an appropriate state, repost
1365                  * the wildcard */
1366
1367                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1368                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1369                         int     rerc;
1370
1371                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1372                         if (rerc != 0) {
1373                                 /* We failed to repost the WC dgram for some reason
1374                                  * mark it so the repost system attempts to repost */
1375                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1376                         }
1377                 }
1378
1379                 /* always free the old dgram */
1380                 kgnilnd_free_dgram(dev, dgram);
1381         }
1382 }
1383
1384
1385 int
1386 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1387 {
1388         kgn_dgram_t             *dgram = NULL;
1389         gni_post_state_t         post_state;
1390         gni_return_t             grc;
1391         int                      rc = 0;
1392         __u64                    readyid;
1393         __u32                    remote_addr = 0, remote_id = 0;
1394         ENTRY;
1395
1396         /* Probe with the lock held. That way if we get a dgram we dont have it canceled
1397          * between finding the ready dgram and grabbing the lock to remove it from the
1398          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1399          * once its off the list so we don't need to worry about others changing it at
1400          * that point. */
1401         spin_lock(&dev->gnd_dgram_lock);
1402         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1403         if (grc != GNI_RC_SUCCESS) {
1404                 spin_unlock(&dev->gnd_dgram_lock);
1405                 /* return 0 to indicate nothing happened */
1406                 RETURN(0);
1407         }
1408
1409         CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
1410                 readyid, dev);
1411
1412         dgram = (kgn_dgram_t *)readyid;
1413
1414         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1415                  "dgram 0x%p from id "LPX64" with bad magic %x\n",
1416                  dgram, readyid, dgram->gndg_magic);
1417
1418         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1419                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1420                  "dgram 0x%p with bad state %s\n",
1421                  dgram, kgnilnd_dgram_state2str(dgram));
1422
1423         LASSERTF(!list_empty(&dgram->gndg_list),
1424                  "dgram 0x%p with bad list state %s type %s\n",
1425                  dgram, kgnilnd_dgram_state2str(dgram),
1426                  kgnilnd_dgram_type2str(dgram));
1427
1428         /* now we know that the datagram structure is ok, so pull off list */
1429         list_del_init(&dgram->gndg_list);
1430
1431         /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
1432          * change the state from POSTED to PROCESSING to ensure that
1433          * nobody cancels it after we've pulled it from the wire */
1434         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1435                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1436         }
1437
1438         LASSERTF(dgram->gndg_conn != NULL,
1439                 "dgram 0x%p with NULL conn\n", dgram);
1440
1441         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1442                                              (__u64)dgram, &post_state,
1443                                              &remote_addr, &remote_id);
1444
1445         /* we now "own" this datagram */
1446         spin_unlock(&dev->gnd_dgram_lock);
1447
1448         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1449                  " id "LPU64" was ready\n", readyid);
1450
1451         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1452                 "remote_addr %u remote_id %u\n", grc, dgram,
1453                 kgnilnd_dgram_type2str(dgram),
1454                 post_state, remote_addr, remote_id);
1455
1456         if (unlikely(grc != GNI_RC_SUCCESS)) {
1457                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1458                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1459                         grc);
1460                 rc = -EINVAL;
1461                 GOTO(probe_for_out, rc);
1462         }
1463
1464         rc = kgnilnd_process_dgram(dgram, post_state);
1465
1466         /* we should never get probe finding a dgram for us and then it
1467          * being a WC dgram that is still in the middle of processing */
1468         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1469                  rc, dgram, post_state);
1470
1471         if (rc == 0) {
1472                 /* dgram is good enough for the data to be used */
1473                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1474                 /* fake rc to mark that we've done something */
1475                 rc = 1;
1476         } else {
1477                 /* let kgnilnd_release_dgram take care of canceled dgrams */
1478                 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1479                         dgram->gndg_state = GNILND_DGRAM_DONE;
1480                 }
1481         }
1482
1483         *dgramp = dgram;
1484         RETURN(rc);
1485
1486 probe_for_out:
1487
1488         kgnilnd_release_dgram(dev, dgram, 0);
1489         RETURN(rc);
1490 }
1491
1492 int
1493 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1494 {
1495         /* if kgn_wildcard is zero, return error */
1496         int     rc = -ENOENT, i;
1497         ENTRY;
1498
1499         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1500                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1501                 if (rc < 0) {
1502                         CERROR("error %d: could not post wildcard datagram # %d\n",
1503                                 rc, i);
1504                         rc = -EINVAL;
1505                         GOTO(failed, rc);
1506                 }
1507         }
1508
1509 failed:
1510         RETURN(rc);
1511 }
1512
1513 int
1514 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1515 {
1516         kgn_dgram_t            *dg, *dgN;
1517         struct list_head        zombies;
1518         int                     i;
1519         ENTRY;
1520
1521         /* we want to cancel any outstanding dgrams - we don't want to rely
1522          * on del_peer_or_conn catching all of them. This helps protect us in cases
1523          * where we don't quite keep the peer->dgram mapping in sync due to some
1524          * race conditions */
1525
1526         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1527                  "called with LND invalid state: net shutdown %d "
1528                  "in reset %d\n", net->gnn_shutdown,
1529                  kgnilnd_data.kgn_in_reset);
1530
1531         INIT_LIST_HEAD(&zombies);
1532
1533         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1534
1535         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1536                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1537
1538                         /* skip nids not on our net or are wildcards */
1539
1540
1541                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1542                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1543                                 continue;
1544
1545                         kgnilnd_cancel_dgram_locked(dg);
1546                 }
1547         }
1548
1549         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1550
1551         RETURN(0);
1552 }
1553
1554 int
1555 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1556 {
1557         kgn_dgram_t *dg, *dgN;
1558         struct list_head zombies;
1559         ENTRY;
1560
1561         /* Time to kill the outstanding WC's
1562          * WC's exist on net 0 only but match on any net...
1563          */
1564
1565         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1566                 "called with LND invalid state: WC shutdown %d "
1567                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1568                 kgnilnd_data.kgn_in_reset);
1569
1570         INIT_LIST_HEAD(&zombies);
1571         spin_lock(&dev->gnd_dgram_lock);
1572
1573         do {
1574                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1575                 if (dg != NULL) {
1576                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1577                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1578                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1579                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1580
1581                         kgnilnd_cancel_dgram_locked(dg);
1582
1583                         /* WC could be DONE already, check and if so add to list to be released */
1584                         if (dg->gndg_state == GNILND_DGRAM_DONE) {
1585                                 list_del_init(&dg->gndg_list);
1586                                 list_add_tail(&dg->gndg_list, &zombies);
1587                         }
1588                 }
1589         } while (dg != NULL);
1590
1591         spin_unlock(&dev->gnd_dgram_lock);
1592
1593         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1594                 list_del_init(&dg->gndg_list);
1595                 kgnilnd_release_dgram(dev, dg, 1);
1596         }
1597         RETURN(0);
1598
1599 }
1600
1601 int
1602 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1603 {
1604         kgn_dgram_t *dg, *dgN;
1605         int i;
1606         ENTRY;
1607
1608         /* Cancel any outstanding non wildcard datagrams regardless
1609          * of which net they are on as we are in base shutdown and
1610          * dont care about connecting anymore.
1611          */
1612
1613         LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");
1614
1615         spin_lock(&dev->gnd_dgram_lock);
1616
1617         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size -1); i++) {
1618                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1619                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1620                                 kgnilnd_cancel_dgram_locked(dg);
1621                 }
1622         }
1623
1624         spin_unlock(&dev->gnd_dgram_lock);
1625
1626         RETURN(0);
1627 }
1628
1629
1630 void
1631 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1632 {
1633         int             i = 4;
1634         int             rc;
1635         gni_return_t    grc;
1636         __u64           readyid;
1637         kgn_dgram_t    *dgram;
1638
1639         /* use do while to get at least one check run to allow
1640          * regression test for 762072 to hit bug if there */
1641
1642         /* This function races with the dgram mover during shutdown so it is possible for
1643          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1644          * dgram mover thread instead of inside of this function.
1645          */
1646
1647         /* This should only be called from within shutdown, baseshutdown, or stack reset.
1648          * there are no assertions here to verify since base_shutdown has nothing in it we can check
1649          * the net is gone by then.
1650          */
1651
1652         do {
1653                 i++;
1654                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1655                         "Waiting for %d canceled datagrams to clear on device %d\n",
1656                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1657
1658                 /* check once a second */
1659                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1660                        250, &readyid);
1661
1662                 if (grc != GNI_RC_SUCCESS)
1663                         continue;
1664
1665                 CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
1666                         readyid, dev->gnd_id, dev);
1667
1668                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1669                 if (rc != 0) {
1670                         /* if we got a valid dgram or one that is now done, clean up */
1671                         kgnilnd_release_dgram(dev, dgram, 1);
1672                 }
1673         } while (atomic_read(&dev->gnd_canceled_dgrams));
1674 }
1675
1676 int
1677 kgnilnd_start_connect(kgn_peer_t *peer)
1678 {
1679         int              rc = 0;
1680         /* sync point for kgnilnd_del_peer_locked - do an early check to
1681          * catch the most common hits where del_peer is done by the
1682          * time we get here */
1683         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1684                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1685         }
1686
1687         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1688         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1689                 /* raced with peer getting unlinked */
1690                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1691                 rc = ESTALE;
1692                 GOTO(out, rc);
1693         }
1694         peer->gnp_connecting = GNILND_PEER_POSTING;
1695         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1696
1697         set_mb(peer->gnp_last_dgram_time, jiffies);
1698         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1699                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1700         }
1701
1702         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1703                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1704                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1705         } else {
1706                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1707                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1708         }
1709         if (rc < 0) {
1710                 set_mb(peer->gnp_last_dgram_errno, rc);
1711                 GOTO(failed, rc);
1712         }
1713
1714         /* while we're posting someone could have decided this peer/dgram needed to
1715          * die a quick death, so we check for state change and process accordingly */
1716
1717         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1718         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1719                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1720                         peer->gnp_connecting = GNILND_PEER_KILL;
1721                 }
1722                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1723                 /* positive RC to avoid dgram cleanup - we'll have to
1724                  * wait for the kgni GNI_POST_TERMINATED event to
1725                  * finish cleaning up */
1726                 rc = ESTALE;
1727                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1728                 GOTO(out, rc);
1729         }
1730         peer->gnp_connecting = GNILND_PEER_POSTED;
1731         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1732         /* reaper thread will take care of any timeouts */
1733         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1734                libcfs_nid2str(peer->gnp_nid), rc);
1735
1736         RETURN(rc);
1737
1738 failed:
1739         CDEBUG(D_NET, "connect to %s failed: rc %d \n",
1740                libcfs_nid2str(peer->gnp_nid), rc);
1741 out:
1742         RETURN(rc);
1743 }
1744
1745 int
1746 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1747 {
1748         kgn_conn_t        *conn = dgram->gndg_conn;
1749         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1750         kgn_peer_t        *new_peer, *peer = NULL;
1751         kgn_tx_t          *tx;
1752         kgn_tx_t          *txn;
1753         kgn_mbox_info_t   *mbox;
1754         int                rc;
1755         int                nstale;
1756
1757         /* try to find a peer that matches the nid we got in the connreq
1758          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1759          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1760
1761         /* assume this is a new peer  - it makes locking cleaner when it isn't */
1762         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1763
1764         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_RCA_NODE_UP);
1765         if (rc != 0) {
1766                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1767                 return rc;
1768         }
1769
1770         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1771
1772         /* this transfers ref from create_peer to the kgn_peer table */
1773         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1774
1775         /* if we found an existing peer, is it really ready for a new conn ? */
1776         if (peer != new_peer) {
1777                 /* if this was an active connect attempt but we can't find a peer waiting for it
1778                  * we will dump in the trash */
1779
1780                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1781                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1782                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1783                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1784                         rc = ECANCELED;
1785                         GOTO(out, rc);
1786                 }
1787
1788                 /* check to see if we can catch a connecting peer before it is
1789                  * removed from the connd_peers list - if not, we need to
1790                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1791                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1792                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1793                         if (!list_empty(&peer->gnp_connd_list)) {
1794                                 list_del_init(&peer->gnp_connd_list);
1795                                 /* drop connd ref */
1796                                 kgnilnd_peer_decref(peer);
1797                         }
1798                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1799                         /* clear rc to make sure we don't have fake error */
1800                         rc = 0;
1801                 }
1802
1803                 /* no matter what, we are no longer waiting to connect this peer now */
1804                 peer->gnp_connecting = GNILND_PEER_IDLE;
1805
1806                 /* Refuse to duplicate an existing connection (both sides might try to
1807                  * connect at once).  NB we return success!  We _are_ connected so we
1808                  * _don't_ have any blocked txs to complete with failure. */
1809                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1810                 if (rc != 0) {
1811                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1812                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1813                               libcfs_nid2str(her_nid), rc);
1814                         rc = EALREADY;
1815                         GOTO(out, rc);
1816                 }
1817         }
1818
1819         if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
1820                 CNETERR("Received connection request from %s that RCA thinks is"
1821                         " down.\n", libcfs_nid2str(her_nid));
1822                 peer->gnp_down = GNILND_RCA_NODE_UP;
1823         }
1824
1825         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1826
1827         /* either way with peer (new or existing), we are ok with ref counts here as the
1828          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1829          * ref for the peer table. */
1830
1831         /* at this point, the connection request is a winner */
1832
1833         /* mark 'DONE' to avoid cancel being called from release */
1834         dgram->gndg_state = GNILND_DGRAM_DONE;
1835
1836         /* initialise timestamps before reaper looks at them */
1837         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1838
1839         /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1840          * immediatly send a NOOP in the reaper thread during the call to
1841          * kgnilnd_check_conn_timeouts_locked
1842          */
1843         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1844         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1845
1846         /* save the dgram type used to establish this connection */
1847         conn->gnc_dgram_type = dgram->gndg_type;
1848
1849         /* refs are not transferred from dgram to tables, so increment to
1850          * take ownership */
1851         kgnilnd_conn_addref(conn);
1852         kgnilnd_peer_addref(peer);
1853         conn->gnc_peer = peer;
1854         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1855
1856         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1857         list_add_tail(&conn->gnc_hashlist,
1858                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1859         kgnilnd_data.kgn_conn_version++;
1860
1861         /* Dont send NOOP if fail_loc is set
1862          */
1863         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1864                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1865                 if (tx == NULL) {
1866                         CNETERR("can't get TX to initiate NOOP to %s\n",
1867                                 libcfs_nid2str(peer->gnp_nid));
1868                 } else {
1869                         kgnilnd_queue_tx(conn, tx);
1870                 }
1871         }
1872
1873         /* Schedule all packets blocking for a connection */
1874         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1875                 /* lock held here is the peer_conn lock */
1876                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1877                 kgnilnd_queue_tx(conn, tx);
1878         }
1879
1880         /* If this is an active connection lets mark its timestamp on the MBoX */
1881         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1882                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1883                 /* conn->gnc_last_rx is jiffies it better exist as it was just set */
1884                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1885         }
1886
1887         /* Bug 765042: wake up scheduler for a race with finish_connect and
1888          * complete_conn_closed with a conn in purgatory
1889          * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1890          * we just check for set and then clear */
1891         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1892                 cfs_fail_loc = 0x0;
1893                 /* get scheduler thread moving again */
1894                 kgnilnd_schedule_device(conn->gnc_device);
1895         }
1896
1897         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1898                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1899
1900         /* make sure we reset peer reconnect interval now that we have a good conn */
1901         kgnilnd_peer_alive(peer);
1902         peer->gnp_reconnect_interval = 0;
1903
1904         /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
1905          * on the atomic forever
1906          */
1907         if (peer->gnp_pending_unlink) {
1908                 peer->gnp_pending_unlink = 0;
1909                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1910                 CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
1911         }
1912
1913         /* add ref to make it hang around until after we drop the lock */
1914         kgnilnd_conn_addref(conn);
1915
1916         /* Once the peer_conn lock is dropped, the conn could actually move into
1917          * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1918          * lock until we are really done */
1919         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1920
1921         /* Notify LNET that we now have a working connection to this peer.
1922          * This is a Cray extension to the "standard" LND behavior. */
1923         lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
1924                      1, cfs_time_current());
1925
1926         /* drop our 'hold' ref */
1927         kgnilnd_conn_decref(conn);
1928
1929 out:
1930         RETURN(rc);
1931 }
1932
1933 void
1934 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1935 {
1936         int              rc = 0;
1937         ENTRY;
1938
1939         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1940
1941         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1942
1943         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1944
1945         if (rc < 0) {
1946                 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
1947         }
1948         EXIT;
1949 }
1950
1951 int
1952 kgnilnd_process_nak(kgn_dgram_t *dgram)
1953 {
1954         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1955         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1956         int                errno = connreq->gncr_nakdata.gnnd_errno;
1957         kgn_peer_t        *peer;
1958         int                rc = 0;
1959
1960         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1961
1962         peer = kgnilnd_find_peer_locked(src_nid);
1963         if (peer == NULL) {
1964                 /* we likely dropped him from bad data when we processed
1965                  * the original REQ */
1966                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1967                 return -EBADSLT;
1968         }
1969
1970         /* need to check peerstamp/connstamp against the ones we find
1971          * to make sure we don't close new (and good?) conns that we
1972          * formed after this connreq failed */
1973         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1974                 kgn_conn_t        conn;
1975
1976                 if (list_empty(&peer->gnp_conns)) {
1977                         /* assume already procced datagram and it barfed up
1978                          * on this side too */
1979                         CDEBUG(D_NET, "dropping NAK from %s; "
1980                                "peer %s is already not connected\n",
1981                                 libcfs_nid2str(connreq->gncr_srcnid),
1982                                 libcfs_nid2str(connreq->gncr_dstnid));
1983                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1984                         return 0;
1985                 }
1986
1987                 /* stub up a connection with the connreq XXX_stamps to allow
1988                  * use to use close_stale_conns_locked */
1989                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
1990                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
1991                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
1992                 conn.gnc_device = peer->gnp_net->gnn_dev;
1993
1994                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
1995
1996                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1997                         "closed %d connections\n",
1998                         libcfs_nid2str(connreq->gncr_srcnid),
1999                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
2000         } else {
2001                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2002
2003                 if (list_empty(&peer->gnp_connd_list)) {
2004                         /* if peer isn't on waiting list, try to find one to nuke */
2005                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2006                                                            peer->gnp_nid);
2007
2008                         if (rc) {
2009                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2010                                         "canceled pending connect request\n",
2011                                         libcfs_nid2str(connreq->gncr_srcnid),
2012                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
2013                         }
2014
2015                         /* if we can't find a waiting dgram, we just drop the nak - the conn
2016                          * connect must have failed (didn't find conn above and clear connecting
2017                          * -- so nothing to do besides drop */
2018                 } else {
2019                         /* peer is on list, meaning it is a new connect attempt from the one
2020                          * we started that generated the NAK - so just drop NAK */
2021
2022                         /* use negative to prevent error message */
2023                         rc = -EAGAIN;
2024                 }
2025                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2026         }
2027
2028         /* success! we found a peer and at least marked pending_nak */
2029         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2030
2031         return rc;
2032 }
2033
2034 int
2035 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2036 {
2037         int                      rc;
2038
2039         rc = kgnilnd_unpack_connreq(dgram);
2040         if (rc < 0) {
2041                 if (rc != -EBADF) {
2042                         /* only NAK if we have good srcnid to use */
2043                         *needs_nak = 1;
2044                 }
2045                 goto connreq_out;
2046         }
2047
2048         switch (dgram->gndg_conn_in.gncr_type) {
2049         case GNILND_CONNREQ_REQ:
2050                 /* wire up peer & conn, send queued TX */
2051                 rc = kgnilnd_finish_connect(dgram);
2052
2053                 /* don't nak when the nid is hosed */
2054                 if ((rc < 0)) {
2055                         *needs_nak = 1;
2056                 }
2057
2058                 break;
2059         case GNILND_CONNREQ_NAK:
2060                 rc = kgnilnd_process_nak(dgram);
2061                 /* return early to prevent reconnect bump */
2062                 return rc;
2063         default:
2064                 CERROR("unexpected connreq type %s (%d) from %s\n",
2065                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2066                         dgram->gndg_conn_in.gncr_type,
2067                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2068                 rc = -EINVAL;
2069                 *needs_nak = 1;
2070                 break;
2071         }
2072
2073 connreq_out:
2074         RETURN(rc);
2075 }
2076
2077 int
2078 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2079 {
2080         int                      rc;
2081         int                      needs_nak = 0;
2082         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2083         lnet_nid_t               orig_dstnid;
2084         kgn_dgram_t             *dgram = NULL;
2085         kgn_peer_t              *peer;
2086         ENTRY;
2087
2088         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2089                 rc = 0;
2090         } else {
2091                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2092         }
2093
2094         if (rc == 0) {
2095                 RETURN(0);
2096         } else if (rc < 0) {
2097                 GOTO(inform_peer, rc);
2098         } else {
2099                 /* rc > 1 means it did something, reset for this func  */
2100                 rc = 0;
2101         }
2102
2103         switch (dgram->gndg_type) {
2104         case GNILND_DGRAM_WC_REQ:
2105         case GNILND_DGRAM_REQ:
2106                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2107                 break;
2108         case GNILND_DGRAM_NAK:
2109                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2110                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2111                 break;
2112         default:
2113                 CERROR("unknown datagram type %s (%d)\n",
2114                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2115                 break;
2116         }
2117
2118         /* stash data to use after releasing current datagram */
2119         /* don't stash net - we are operating on a net already,
2120          * so the lock on rw_net_lock is sufficient */
2121
2122         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2123
2124 inform_peer:
2125         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2126
2127         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2128
2129         kgnilnd_release_dgram(dev, dgram, 0);
2130
2131         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2132                libcfs_nid2str(orig_dstnid), rc);
2133
2134         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2135          * in kgnilnd_finish_connect - if errors are from before we get to there,
2136          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2137         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2138                 /* if we have a negative rc, we want to find a peer to inform about
2139                  * the bad connection attempt. Sorry buddy, better luck next time! */
2140
2141                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2142                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2143
2144                 if (peer != NULL) {
2145                         /* add ref to make sure he stays around past the possible unlink
2146                          * so we can tell LNet about him */
2147                         kgnilnd_peer_addref(peer);
2148
2149                         /* if he still cares about the outstanding connect */
2150                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2151                                 /* check if he is on the connd list and remove.. */
2152                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2153                                 if (!list_empty(&peer->gnp_connd_list)) {
2154                                         list_del_init(&peer->gnp_connd_list);
2155                                         /* drop connd ref */
2156                                         kgnilnd_peer_decref(peer);
2157                                 }
2158                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2159
2160                                 /* clear gnp_connecting so we don't have a non-connecting peer
2161                                  * on gnd_connd_list */
2162                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2163
2164                                 set_mb(peer->gnp_last_dgram_errno, rc);
2165
2166                                 kgnilnd_peer_increase_reconnect_locked(peer);
2167                         }
2168                 }
2169                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2170
2171                 /* now that we are outside the lock, tell Mommy */
2172                 if (peer != NULL) {
2173                         kgnilnd_peer_notify(peer, rc);
2174                         kgnilnd_peer_decref(peer);
2175                 }
2176         }
2177
2178         if (needs_nak) {
2179                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2180         }
2181
2182         RETURN(1);
2183 }
2184
2185 void
2186 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2187 {
2188         kgn_dgram_t    *dgram, *tmp;
2189         int             i;
2190
2191         spin_lock(&dev->gnd_dgram_lock);
2192
2193         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2194                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2195                         unsigned long            now = jiffies;
2196                         unsigned long            timeout;
2197
2198                         /* don't timeout stuff if the network is mucked or shutting down */
2199                         if (kgnilnd_check_hw_quiesce()) {
2200                                 break;
2201                         }
2202
2203                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2204                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2205                                 continue;
2206                         }
2207                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2208                                 "state %s conn 0x%p to %s age %lus\n",
2209                                 dgram, kgnilnd_dgram_type2str(dgram),
2210                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2211                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2212                                 cfs_duration_sec(now - dgram->gndg_post_time));
2213
2214                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2215
2216                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2217                                 continue;
2218
2219                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2220                                 "0x%p state %s conn 0x%p\n",
2221                                 kgnilnd_dgram_type2str(dgram),
2222                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2223                                 cfs_duration_sec(now - dgram->gndg_post_time),
2224                                 dgram, kgnilnd_dgram_state2str(dgram),
2225                                 dgram->gndg_conn);
2226
2227                         kgnilnd_cancel_dgram_locked(dgram);
2228                 }
2229         }
2230         spin_unlock(&dev->gnd_dgram_lock);
2231 }
2232
2233
2234 /* use a thread for the possibly long-blocking wait_by_id to prevent
2235  * stalling the global workqueues */
2236 int
2237 kgnilnd_dgram_waitq(void *arg)
2238 {
2239         kgn_device_t     *dev = (kgn_device_t *) arg;
2240         char              name[16];
2241         gni_return_t      grc;
2242         __u64             readyid;
2243         DEFINE_WAIT(mover_done);
2244
2245         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2246         cfs_block_allsigs();
2247
2248         /* all gnilnd threads need to run fairly urgently */
2249         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2250
2251         /* we dont shut down until the device shuts down ... */
2252         while (!kgnilnd_data.kgn_shutdown) {
2253                 /* to quiesce or to not quiesce, that is the question */
2254                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2255                         KGNILND_SPIN_QUIESCE;
2256                 }
2257
2258                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2259
2260                 /* check once a second */
2261                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2262                                                        1000, &readyid);
2263
2264                 if (grc == GNI_RC_SUCCESS) {
2265                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2266                         kgnilnd_schedule_dgram(dev);
2267
2268                         /* wait for dgram thread to ping us before spinning again */
2269                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2270                                         TASK_INTERRUPTIBLE);
2271
2272                         /* don't sleep if we need to quiesce */
2273                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2274                                 schedule();
2275                         }
2276                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2277                 }
2278         }
2279
2280         kgnilnd_thread_fini();
2281         return 0;
2282 }
2283
2284 int
2285 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2286 {
2287         int                      did_something = 0, rc;
2288         kgn_peer_t              *peer = NULL;
2289
2290         spin_lock(&dev->gnd_connd_lock);
2291
2292         /* Active connect - we added this in kgnilnd_launch_tx */
2293         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2294                 peer = list_first_entry(&dev->gnd_connd_peers,
2295                                         kgn_peer_t, gnp_connd_list);
2296
2297                 /* ref for connd removed in if/else below */
2298                list_del_init(&peer->gnp_connd_list);
2299
2300                 /* gnp_connecting and membership on gnd_connd_peers should be
2301                  * done coherently to avoid double adding, etc */
2302                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2303                  * to get the peer to gnp_connecting in the first place. We just need to
2304                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2305                  * BEFORE clearing gnp_connecting */
2306                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2307                          peer, libcfs_nid2str(peer->gnp_nid));
2308
2309                 spin_unlock(&dev->gnd_connd_lock);
2310
2311                 CDEBUG(D_NET, "processing connect to %s\n",
2312                        libcfs_nid2str(peer->gnp_nid));
2313
2314                 did_something += 1;
2315                 rc = kgnilnd_start_connect(peer);
2316
2317                 if (likely(rc >= 0)) {
2318                         /* 0 on success, positive on 'just drop peer' errors */
2319                         kgnilnd_peer_decref(peer);
2320                 } else if (rc == -ENOMEM) {
2321                         /* if we are out of wildcards, add back to
2322                          * connd_list - then break out and we'll try later
2323                          * if other errors, we'll bail & cancel pending tx */
2324                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2325                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2326                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2327                                 spin_lock(&dev->gnd_connd_lock);
2328                                 list_add_tail(&peer->gnp_connd_list,
2329                                               &dev->gnd_connd_peers);
2330                         } else {
2331                                 /* connecting changed while we were posting */
2332
2333                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2334                                         " state 0x%p->%s, connecting %d\n",
2335                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2336                                 peer->gnp_connecting = GNILND_PEER_KILL;
2337                                 spin_lock(&dev->gnd_connd_lock);
2338                                 /* remove the peer ref frrom the cond list */
2339                                 kgnilnd_peer_decref(peer);
2340                                 /* let the system handle itself */
2341                         }
2342                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2343                         /* the datagrams are a global pool,
2344                          * so break out of trying and hope some free
2345                          * up soon */
2346                         did_something -= 1;
2347                         break;
2348                 } else {
2349                         /* something bad happened, you lose */
2350                         CNETERR("could not start connecting to %s "
2351                                 "rc %d: Will retry until TX timeout\n",
2352                                libcfs_nid2str(peer->gnp_nid), rc);
2353                         /* It didnt post so just set connecting back to zero now.
2354                          * The reaper will reattempt the connection if it needs too.
2355                          * If the peer needs death set it so the reaper will cleanup.
2356                          */
2357                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2358                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2359                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2360                                 kgnilnd_peer_increase_reconnect_locked(peer);
2361                         } else {
2362                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2363                                         " state 0x%p->%s, connecting %d\n",
2364                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2365                                 peer->gnp_connecting = GNILND_PEER_KILL;
2366                         }
2367                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2368
2369                         /* hold onto ref until we are really done - if it was
2370                          * unlinked this could result in a destroy */
2371                         kgnilnd_peer_decref(peer);
2372                 }
2373                 spin_lock(&dev->gnd_connd_lock);
2374         }
2375
2376         spin_unlock(&dev->gnd_connd_lock);
2377         RETURN(did_something);
2378 }
2379
2380 int
2381 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2382 {
2383         int did_something = 0, to_repost, i;
2384         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2385         ENTRY;
2386
2387         for (i = 0; i < to_repost; ++i) {
2388                 int     rerc;
2389                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2390                 if (rerc == 0) {
2391                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2392                         did_something += 1;
2393                 } else {
2394                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2395                                 rerc, dev->gnd_id);
2396                         break;
2397                 }
2398         }
2399
2400         RETURN(did_something);
2401 }
2402
2403 static void
2404 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2405 {
2406         int             dev_id = arg;
2407         kgn_device_t    *dev = &kgnilnd_data.kgn_devices[dev_id];
2408
2409         wake_up(&dev->gnd_dgram_waitq);
2410 }
2411
2412 /* use single thread for dgrams - should be sufficient for performance */
2413 int
2414 kgnilnd_dgram_mover(void *arg)
2415 {
2416         kgn_device_t            *dev = (kgn_device_t *)arg;
2417         char                     name[16];
2418         int                      rc, did_something;
2419         unsigned long            next_purge_check = jiffies - 1;
2420         unsigned long            timeout;
2421         struct timer_list        timer;
2422         unsigned long            deadline = 0;
2423         DEFINE_WAIT(wait);
2424
2425         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2426         cfs_block_allsigs();
2427         /* all gnilnd threads need to run fairly urgently */
2428         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2429
2430         /* we are ok not locking for these variables as the dgram waitq threads
2431          * will block both due to tying up net (kgn_shutdown) and the completion
2432          * event for the dgram_waitq (kgn_quiesce_trigger) */
2433         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2434         while (!kgnilnd_data.kgn_shutdown) {
2435                 /* Safe: kgn_shutdown only set when quiescent */
2436
2437                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2438                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2439                  * quiesce check so that it'll go right into that and not do any
2440                  * dgram mucking */
2441                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2442
2443                 /* to quiesce or to not quiesce, that is the question */
2444                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2445                         KGNILND_SPIN_QUIESCE;
2446                 }
2447                 did_something = 0;
2448
2449                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2450
2451                 /* process any newly completed dgrams */
2452                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2453
2454                 rc = kgnilnd_probe_and_process_dgram(dev);
2455                 if (rc > 0) {
2456                         did_something += rc;
2457                 }
2458
2459                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2460
2461                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2462                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2463                 /* start new outbound dgrams */
2464                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2465
2466                 /* find dead dgrams */
2467                 if (time_after_eq(jiffies, next_purge_check)) {
2468                         /* these don't need to be checked that often */
2469                         kgnilnd_reaper_dgram_check(dev);
2470
2471                         next_purge_check = (long) jiffies +
2472                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2473                 }
2474
2475                 did_something += kgnilnd_repost_wc_dgrams(dev);
2476
2477                 /* careful with the jiffy wrap... */
2478                 timeout = (long)(next_purge_check - jiffies);
2479
2480                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2481                        did_something, timeout, next_purge_check, jiffies);
2482
2483                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2484                         did_something = 0;
2485                         continue;
2486                 }
2487
2488                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2489
2490                 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2491                 mod_timer(&timer, (long) jiffies + timeout);
2492
2493                 /* last second chance for others to poke us */
2494                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2495
2496                 /* check flag variables before comittingi even if we did something;
2497                  * if we are after the deadline call schedule */
2498                 if ((!did_something || time_after(jiffies, deadline)) &&
2499                     !kgnilnd_data.kgn_shutdown &&
2500                     !kgnilnd_data.kgn_quiesce_trigger) {
2501                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2502                                timeout, cfs_duration_sec(timeout));
2503                         wake_up_all(&dev->gnd_dgping_waitq);
2504                         schedule();
2505                         CDEBUG(D_INFO, "awake after schedule\n");
2506                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2507                 }
2508
2509                 del_singleshot_timer_sync(&timer);
2510                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2511         }
2512
2513         kgnilnd_thread_fini();
2514         return 0;
2515 }