LU-4431 lnet: 1/3/2014 update for Cray interconnects
lnet/klnds/gnilnd/gnilnd_conn.c (fs/lustre-release.git)
/*
 * Copyright (C) 2012 Cray, Inc.
 *
 *   Author: Nic Henke <nic@cray.com>
 *   Author: James Shimek <jshimek@cray.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "gnilnd.h"

void
kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
{
        smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
        smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
        smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
}

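/* Note: the credit and size attributes set above are the inputs
 * gni_smsg_buff_size_needed() uses when sizing FMA blocks below. A minimal
 * caller-side sketch (illustrative only; local names are assumed):
 *
 *      gni_smsg_attr_t attr;
 *      int             bytes;
 *
 *      kgnilnd_setup_smsg_attr(&attr);
 *      gni_smsg_buff_size_needed(&attr, &bytes);
 *      // bytes now holds the base per-mailbox buffer size, before
 *      // GNILND_MBOX_PAYLOAD is added in kgnilnd_alloc_fmablk()
 */
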
int
kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
{
        gni_return_t            rrc;
        __u32                   flags = GNI_MEM_READWRITE;

        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                flags |= GNI_MEM_PHYS_CONT;
        }

        /* make sure we are mapping a clean block */
        LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);

        rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
                                   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
                                   flags, &fma_blk->gnm_hndl);
        if (rrc != GNI_RC_SUCCESS) {
                /* XXX Nic: need a way to silence this for runtime registrations
                 * that are OK to fail -- like when under MDD or GART pressure
                 * on big systems
                 */
                CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
                        fma_blk, fma_blk->gnm_mbox_size, flags);
                RETURN(-ENOMEM);
        }

        /* PHYS_CONT memory isn't really mapped, at least not in GART -
         * but all mappings chew up an MDD
         */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
        }

        atomic_inc(&device->gnd_n_mdd);
        /* nfmablk is the count of live (mapped) blocks */
        atomic_inc(&device->gnd_nfmablk);

        RETURN(0);
}

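/* Accounting summary (restating the code above, nothing new): gnd_n_mdd
 * counts MDDs in use, gnd_nbytes_map counts GART-mapped bytes (virtual
 * blocks only, since PHYS_CONT blocks aren't GART mapped) and gnd_nfmablk
 * counts live mapped blocks. kgnilnd_unmap_fmablk() reverses each of
 * these, so the two paths must stay symmetric. */
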
int
kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
{
        int                     rc = 0;
        int                     num_mbox;
        kgn_fma_memblock_t     *fma_blk;
        gni_smsg_attr_t         smsg_attr;
        unsigned long           fmablk_vers;

        /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
         * to this allocation code. Everyone will sample the version
         * before and after getting the semaphore. If it has changed,
         * we'll bail out to check the lists again - this indicates that
         * some sort of change was made to the lists and it is possible
         * that there is a mailbox for us to find now. This should prevent
         * a ton of spinning in the case where there are lots of threads
         * that need a yet-to-be-allocated mailbox for a connection. */

        fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
        down(&device->gnd_fmablk_sem);

        if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
                /* version changed while we were waiting for the semaphore,
                 * we'll recheck the lists assuming something nice happened */
                up(&device->gnd_fmablk_sem);
                return 0;
        }

        LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
        if (fma_blk == NULL) {
                CNETERR("could not allocate fma block descriptor\n");
                rc = -ENOMEM;
                GOTO(out, rc);
        }

        INIT_LIST_HEAD(&fma_blk->gnm_bufflist);

        kgnilnd_setup_smsg_attr(&smsg_attr);

        gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);

        LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);

        /* gni_smsg_buff_size_needed calculates the base mailbox size and since
         * we want to hold kgn_peer_credits worth of messages in both directions,
         * we add PAYLOAD to grow the mailbox size
         */

        fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;

        /* we'll only use physical during preallocate at startup -- this keeps it nice and
         * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
         * as reallocating them is tough if there is memory fragmentation */

        if (use_phys) {
                fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
                if (fma_blk->gnm_block == NULL) {
                        CNETERR("could not allocate physical SMSG mailbox memory\n");
                        rc = -ENOMEM;
                        GOTO(free_desc, rc);
                }
                fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
                num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;

                LASSERTF(num_mbox >= 1,
                         "num_mbox %d blk_size %u mbox_size %d\n",
                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);

                fma_blk->gnm_state = GNILND_FMABLK_PHYS;

        } else {
                num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
                fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;

                LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
                         "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
                         *kgnilnd_tunables.kgn_mbox_per_block);

                LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
                if (fma_blk->gnm_block == NULL) {
                        CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n",
                                fma_blk->gnm_blk_size);
                        rc = -ENOMEM;
                        GOTO(free_desc, rc);
                }

                fma_blk->gnm_state = GNILND_FMABLK_VIRT;
        }

        /* allocate just enough space for the bits to track the mailboxes */
        LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
        if (fma_blk->gnm_bit_array == NULL) {
                CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
                        sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
                rc = -ENOMEM;
                GOTO(free_blk, rc);
        }
        bitmap_zero(fma_blk->gnm_bit_array, num_mbox);

        /* now that num_mbox is set based on allocation type, get debug info setup */
        LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
        if (fma_blk->gnm_mbox_info == NULL) {
                CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
                        sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
                rc = -ENOMEM;
                GOTO(free_bit, rc);
        }

        rc = kgnilnd_map_fmablk(device, fma_blk);
        if (rc) {
                GOTO(free_info, rc);
        }

        fma_blk->gnm_next_avail_mbox = 0;
        fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;

        CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
                "mbox_size %d MDD "LPX64"."LPX64"\n",
                fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
                fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
                fma_blk->gnm_hndl.qword2);

        /* the lock is protecting the data structures, not the semaphore */

        spin_lock(&device->gnd_fmablk_lock);
        list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);

        /* bump the version under the lock so that once readers see the new
         * version, the list is ready for them to traverse */
        atomic_inc(&device->gnd_fmablk_vers);

        spin_unlock(&device->gnd_fmablk_lock);

        up(&device->gnd_fmablk_sem);

        return 0;

free_info:
        LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
free_bit:
        LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
free_blk:
        if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
                LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
        } else {
                kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
        }
free_desc:
        LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
        up(&device->gnd_fmablk_sem);
        return rc;
}

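/* Usage sketch (illustrative, both call sites exist in this file): startup
 * preallocation uses physical blocks, runtime growth uses virtual ones:
 *
 *      rc = kgnilnd_alloc_fmablk(dev, 1);   // PHYS, from kgn_mbox_cache
 *      rc = kgnilnd_alloc_fmablk(dev, 0);   // VIRT, LIBCFS_ALLOC'd
 *
 * A return of 0 can mean either "allocated a block" or "version changed,
 * recheck the lists", so callers loop until a mailbox turns up. */
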
void
kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
        gni_return_t            rrc;

        /* if some mailboxes are held, set hold_timeout from the conn timeouts
         * used in this block; during shutdown skip that and just nuke and pave */
        if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
                fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
        }

        /* we are changing the state of a block, bump the version to tell
         * the proc code the list is stale now */
        atomic_inc(&dev->gnd_fmablk_vers);

        rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);

        CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
               "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
               "hold_timeout %d\n",
               fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
               fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
               fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
               fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);

        LASSERTF(rrc == GNI_RC_SUCCESS,
                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
                 fma_blk, rrc);

        if (fma_blk->gnm_hold_timeout) {
                atomic_inc(&dev->gnd_n_mdd_held);
        } else {
                atomic_dec(&dev->gnd_n_mdd);
        }

        /* PHYS blocks don't get mapped */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
                fma_blk->gnm_state = GNILND_FMABLK_IDLE;
        } else if (kgnilnd_data.kgn_in_reset) {
                /* in stack reset, clear the MDD handle for PHYS blocks, as we'll
                 * re-use the fma_blk after reset so we don't have to drop/allocate
                 * all of those physical blocks */
                fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
        }

        /* decrement here as this is the # of mapped blocks */
        atomic_dec(&dev->gnd_nfmablk);
}

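/* Note on hold_timeout (derived from the code above): a nonzero
 * gnm_hold_timeout passed to kgnilnd_mem_deregister() asks kgni to keep
 * the MDD alive for conns still in purgatory; such blocks move to the
 * gnd_n_mdd_held count and are only fully released later in
 * kgnilnd_free_fmablk_locked(). */
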
/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
void
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
        LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
                 "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
                 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
                 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);

        atomic_inc(&dev->gnd_fmablk_vers);

        if (fma_blk->gnm_hold_timeout) {
                CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
                        "mbox_size %d\n",
                        fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
                        fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);

                /* we leave the MDD dangling over stack reset */
                if (!kgnilnd_data.kgn_in_reset) {
                        kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
                }
                /* ignoring the return code - if kgni/ghal can't find it
                 * it must be released already */
                atomic_dec(&dev->gnd_n_mdd_held);
                atomic_dec(&dev->gnd_n_mdd);
        }

        /* we can't free the gnm_block until all the conns have released their
         * purgatory holds. While we have purgatory holds, we might check the conn
         * RX mailbox during the CLOSING process. It is possible that kgni might
         * try to look into the RX side for credits when sending the CLOSE msg too */
        CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
                fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);

        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
        } else {
                LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
        }
        fma_blk->gnm_state = GNILND_FMABLK_FREED;

        list_del(&fma_blk->gnm_bufflist);

        LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
        LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof(unsigned long));
        LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}

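/* Caller sketch (lock discipline implied by the _locked suffix):
 *
 *      spin_lock(&dev->gnd_fmablk_lock);
 *      kgnilnd_free_fmablk_locked(dev, fma_blk);
 *      spin_unlock(&dev->gnd_fmablk_lock);
 */
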
void
kgnilnd_find_free_mbox(kgn_conn_t *conn)
{
        kgn_device_t            *dev = conn->gnc_device;
        gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
        kgn_fma_memblock_t      *fma_blk;
        kgn_mbox_info_t         *mbox = NULL;
        int                     id;

        spin_lock(&dev->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
                            gnm_bufflist) {
                if (fma_blk->gnm_avail_mboxs <= 0 ||
                    fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
                        continue;
                }
                /* look in the bitarray for an available mailbox */
                do {
                        id = find_next_zero_bit(
                                fma_blk->gnm_bit_array,
                                fma_blk->gnm_num_mboxs,
                                fma_blk->gnm_next_avail_mbox);
                        if (id == fma_blk->gnm_num_mboxs &&
                            fma_blk->gnm_next_avail_mbox != 0) {
                                /* wrap around */
                                fma_blk->gnm_next_avail_mbox = 0;
                        } else {
                                break;
                        }
                } while (1);

                LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
                         id, fma_blk->gnm_num_mboxs);
                set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
                conn->gnc_mbox_id = id;

                fma_blk->gnm_next_avail_mbox =
                        (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
                fma_blk->gnm_avail_mboxs--;
                conn->gnc_fma_blk = fma_blk;

                kgnilnd_setup_smsg_attr(smsg_attr);

                smsg_attr->msg_buffer = fma_blk->gnm_block;
                smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
                smsg_attr->mem_hndl = fma_blk->gnm_hndl;
                smsg_attr->buff_size = fma_blk->gnm_mbox_size;

                /* We'll set the hndl to zero for PHYS blocks unmapped during stack
                 * reset and re-use the same fma_blk after stack reset. This check
                 * ensures we've properly re-mapped it before we use it */
                LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
                         fma_blk, fma_blk->gnm_state);

                CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
                        "allocating SMSG mbox %d buf %p "
                        "offset %u hndl "LPX64"."LPX64"\n",
                        conn, smsg_attr, fma_blk, id,
                        smsg_attr->msg_buffer, smsg_attr->mbox_offset,
                        fma_blk->gnm_hndl.qword1,
                        fma_blk->gnm_hndl.qword2);

                mbox = &fma_blk->gnm_mbox_info[id];
                mbox->mbx_create_conn_memset = jiffies;
                mbox->mbx_nallocs++;
                mbox->mbx_nallocs_total++;

                /* zero the mbox to remove any old data from our last use.
                 * this better be safe, if not our purgatory timers
                 * are too short or a peer really is misbehaving */
                memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
                       0, smsg_attr->buff_size);
                break;
        }

        spin_unlock(&dev->gnd_fmablk_lock);
}

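/* The scan above is a round-robin search over the per-block bitmap; a
 * condensed equivalent (illustrative only):
 *
 *      id = find_next_zero_bit(map, nmbox, next);
 *      if (id == nmbox && next != 0) {
 *              // ran off the end, wrap and rescan from 0
 *              id = find_next_zero_bit(map, nmbox, 0);
 *      }
 *
 * Starting at gnm_next_avail_mbox spreads allocations across the block
 * instead of always reusing the lowest free slot. */
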
int
kgnilnd_setup_mbox(kgn_conn_t *conn)
{
        gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
        int                      err = 0;

        smsg_attr->msg_buffer = NULL;
        /* look for an available mbox */
        do {
                kgnilnd_find_free_mbox(conn);

                /* nothing in the existing buffers, make a new one */
                if (smsg_attr->msg_buffer == NULL) {
                        /* for runtime allocations, we only want virtual blocks */
                        err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
                        if (err) {
                                break;
                        }
                }
        } while (smsg_attr->msg_buffer == NULL);

        if (err)
                CNETERR("couldn't allocate SMSG mbox for conn %p, error %d\n",
                        conn, err);
        return err;
}

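/* kgnilnd_setup_mbox() loops: if no free mailbox exists, it allocates a
 * new virtual FMA block and retries. kgnilnd_alloc_fmablk() returning 0
 * after a version change is what makes the loop safe - the caller simply
 * rechecks the (possibly grown) list instead of double allocating. */
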
void
kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
{
        kgn_device_t           *dev = conn->gnc_device;
        gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
        kgn_fma_memblock_t     *fma_blk = NULL;
        kgn_mbox_info_t        *mbox = NULL;
        int                     found = 0;
        int                     id;

        /* if we failed to set up the mbox and are now destroying the conn */
        if (smsg_attr->msg_buffer == NULL) {
                return;
        }

        id = conn->gnc_mbox_id;

        spin_lock(&dev->gnd_fmablk_lock);
        /* make sure our conn points at a valid fma_blk.
         * We use this instead of a mem block search out of smsg_attr
         * because we could have freed a block for fma_blk #1 but the fma_blk
         * is still in the list for a purgatory hold. This would induce a false
         * match if that same block gets reallocated to fma_blk #2 */
        list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk == conn->gnc_fma_blk) {
                        found = 1;
                        break;
                }
        }
        LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
                 "anywhere in the world\n", conn, conn->gnc_fma_blk);

        LASSERTF(id < fma_blk->gnm_num_mboxs,
                 "bad id %d max %d\n",
                 id, fma_blk->gnm_num_mboxs);

        /* < 0 - was held, now free it
         * == 0 - just free it
         * > 0 - hold it for now */
        if (purgatory_hold == 0) {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
                        "hndl "LPX64"."LPX64"\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
                fma_blk->gnm_avail_mboxs++;

        } else if (purgatory_hold > 0) {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
                        "hndl "LPX64"."LPX64"\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

                fma_blk->gnm_held_mboxs++;
                fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
                                               conn->gnc_timeout);
        } else {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
                        "hndl "LPX64"."LPX64"\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

                fma_blk->gnm_held_mboxs--;
                fma_blk->gnm_avail_mboxs++;
        }

        if (purgatory_hold <= 0) {
                /* if kgni is retransmitting, freeing the smsg block before the EP
                 * is destroyed gets messy. Bug 768295. */
                LASSERTF(conn->gnc_ephandle == NULL,
                         "can't release mbox before EP is nuked. conn 0x%p\n", conn);

                mbox = &fma_blk->gnm_mbox_info[id];
                mbox->mbx_release_from_purgatory = jiffies;

                /* clear conn gnc_fmablk if it is gone - this allows us to
                 * not worry about state so much in kgnilnd_destroy_conn
                 * and makes the guaranteed cleanup of the resources easier */
                LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
                         "conn %p bit %d already cleared in fma_blk %p\n",
                         conn, id, fma_blk);
                conn->gnc_fma_blk = NULL;
                mbox->mbx_nallocs--;
        }

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
                CERROR("LBUGs in your future: forcibly marking fma_blk %p "
                       "as mapped\n", fma_blk);
                fma_blk->gnm_state = GNILND_FMABLK_VIRT;
        }

        /* we don't release or unmap PHYS blocks as part of the normal cycle --
         * those are controlled manually from startup/shutdown */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                /* we can unmap once all are unused (held or avail)
                 * but check hold_timeout to make sure we are not trying to double
                 * unmap this buffer. If there was no hold_timeout set due to
                 * held_mboxs, we'll free the mbox here shortly and won't have to
                 * worry about catching a double free for a 'clean' fma_blk */
                if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
                    (!fma_blk->gnm_hold_timeout)) {
                        kgnilnd_unmap_fmablk(dev, fma_blk);
                }

                /* but we can only free once they are all avail */
                if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
                    fma_blk->gnm_held_mboxs == 0) {
                        /* all mailboxes are released, free fma_blk */
                        kgnilnd_free_fmablk_locked(dev, fma_blk);
                }
        }

        spin_unlock(&dev->gnd_fmablk_lock);
}

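/* Call pattern (summarizing the purgatory_hold contract above): normal
 * teardown passes 0, taking a purgatory hold passes a positive value, and
 * the eventual release from purgatory passes a negative one, which moves
 * the mailbox from held back to avail. */
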
int
kgnilnd_count_phys_mbox(kgn_device_t *device)
{
        int                     i = 0;
        kgn_fma_memblock_t     *fma_blk;

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
                        i += fma_blk->gnm_num_mboxs;
        }
        spin_unlock(&device->gnd_fmablk_lock);

        RETURN(i);
}

int
kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
{
        int     rc;

        while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
                rc = kgnilnd_alloc_fmablk(device, 1);
                if (rc) {
                        CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
                                kgnilnd_count_phys_mbox(device), rc);
                        RETURN(rc);
                }
        }
        RETURN(0);
}

int
kgnilnd_map_phys_fmablk(kgn_device_t *device)
{
        int                     rc = 0;
        kgn_fma_memblock_t     *fma_blk;

        /* use sem to gate access to single thread, just in case */
        down(&device->gnd_fmablk_sem);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                        rc = kgnilnd_map_fmablk(device, fma_blk);
                        if (rc)
                                break;
                }
        }
        spin_unlock(&device->gnd_fmablk_lock);

        up(&device->gnd_fmablk_sem);

        RETURN(rc);
}

void
kgnilnd_unmap_fma_blocks(kgn_device_t *device)
{
        kgn_fma_memblock_t      *fma_blk;

        /* use sem to gate access to single thread, just in case */
        down(&device->gnd_fmablk_sem);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                kgnilnd_unmap_fmablk(device, fma_blk);
        }
        spin_unlock(&device->gnd_fmablk_lock);

        up(&device->gnd_fmablk_sem);
}

void
kgnilnd_free_phys_fmablk(kgn_device_t *device)
{
        kgn_fma_memblock_t      *fma_blk, *fma_blkN;

        /* use sem to gate access to single thread, just in case */
        down(&device->gnd_fmablk_sem);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
                        kgnilnd_free_fmablk_locked(device, fma_blk);
        }
        spin_unlock(&device->gnd_fmablk_lock);

        up(&device->gnd_fmablk_sem);
}

/* kgnilnd dgram nid->struct management */

static inline struct list_head *
kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
{
        unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;

        RETURN(&dev->gnd_dgrams[hash]);
}

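/* Hash sketch (matches the function above): the bucket for a NID is just
 *
 *      hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
 *      list = &dev->gnd_dgrams[hash];
 *
 * so a lookup only finds a dgram if it was posted with the same dstnid. */
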
/* needs dev->gnd_dgram_lock held */
kgn_dgram_t *
kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
{
        struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
        kgn_dgram_t      *dgram;

        list_for_each_entry(dgram, dgram_list, gndg_list) {

                /* if state > POSTED, we are already handling cancel/completion */
                if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
                     dgram->gndg_state > GNILND_DGRAM_POSTED)
                        continue;

                CDEBUG(D_NET, "got dgram [%p] -> %s\n",
                       dgram, libcfs_nid2str(dst_nid));
                return dgram;
        }
        return NULL;
}

int
kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
{
        kgn_dgram_t     *dgram;

        spin_lock(&dev->gnd_dgram_lock);
        dgram = kgnilnd_find_dgram_locked(dev, dst_nid);

        if (dgram) {
                kgnilnd_cancel_dgram_locked(dgram);
        }
        spin_unlock(&dev->gnd_dgram_lock);

        RETURN(!!(dgram == NULL));
}

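/* Note (behavior as written): kgnilnd_find_and_cancel_dgram() returns
 * nonzero when no dgram was found for dst_nid and 0 when one was found and
 * canceled, i.e. it reads as a "nothing to cancel" indicator. */
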
int
kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
                     lnet_nid_t srcnid, lnet_nid_t dstnid,
                     kgn_connreq_type_t type)
{
        int err = 0;

        /* ensure we haven't violated the max datagram size */
        CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);

        /* no need to zero out, we do that when allocating the dgram */
        connreq->gncr_magic     = GNILND_MSG_MAGIC;

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
                srcnid = 0xABADBABE;
        } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
                dstnid = 0xDEFEC8ED;
        }

        connreq->gncr_srcnid    = srcnid;
        connreq->gncr_dstnid    = dstnid;

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_version = 99;
        } else {
                connreq->gncr_version   = GNILND_CONNREQ_VERSION;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_type = 99;
        } else {
                connreq->gncr_type      = type;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_peerstamp = 0;
        } else {
                connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_connstamp = 0;
        } else {
                connreq->gncr_connstamp = conn->gnc_my_connstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_timeout = 0;
        } else {
                connreq->gncr_timeout   = conn->gnc_timeout;
        }

        /* the remaining types pack their data into the payload elsewhere */
        if (type == GNILND_CONNREQ_REQ) {
                kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
                req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
                req_params->gnpr_cqid = conn->gnc_cqid;

                /* allocate a mailbox for this connection */
                err = kgnilnd_setup_mbox(conn);
                if (err != 0) {
                        CERROR("Failed to setup FMA mailbox (%d)\n", err);
                }
                req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
        }

        /* XXX Nic: TBD - checksum computation */

        return err;
}

int
kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
{
        kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
        int                      swab, rc = 0;
        kgn_net_t               *net;

        /* the following fields must be handled in a backwards compatible
         * manner to ensure we can always send and interpret NAKs */

        if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
            connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
                /* Unexpected magic! */
                CERROR("Unexpected magic %08x\n",
                       connreq->gncr_magic);
                return -EBADF;
        }

        swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
        if (swab) {
                __swab32s(&connreq->gncr_magic);
                __swab32s(&connreq->gncr_cksum);
                __swab16s(&connreq->gncr_type);
                __swab16s(&connreq->gncr_version);
                __swab32s(&connreq->gncr_timeout);
                __swab64s(&connreq->gncr_srcnid);
                __swab64s(&connreq->gncr_dstnid);
                __swab64s(&connreq->gncr_peerstamp);
                __swab64s(&connreq->gncr_connstamp);
        }

        /* Do NOT return anything but -EBADF before we munge
         * connreq->gncr_srcnid - we need that to send the NAK */

        if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                lnet_nid_t      incoming = connreq->gncr_srcnid;

                /* even if the incoming packet is hosed, we know who we sent
                 * the original to and can set the srcnid so that we can properly
                 * look up our peer to close the loop on this connreq. We still use
                 * -EBADF to prevent a NAK - just in case there are issues with
                 * the payload coming from a random spot, etc. */
                connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;

                if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
                                LNET_NIDADDR(incoming)) {
                        /* we got a datagram match for the wrong nid... */
                        CERROR("matched datagram 0x%p with srcnid %s "
                                "(%x), expecting %s (%x)\n",
                                dgram,
                                libcfs_nid2str(incoming),
                                LNET_NIDADDR(incoming),
                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                                LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
                        return -EBADF;
                }
        } else {
                /* if we have a wildcard datagram it should match an
                 * incoming "active" datagram that should have a fully formed
                 * srcnid and dstnid. If we couldn't unpack it, we drop it as a
                 * corrupted packet; otherwise we'll just verify that the dstnid
                 * matches the NID for the NET that the dgram was posted on */

                /* make sure their wildcard didn't match ours, that is unpossible */
                LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
                         "dgram 0x%p from %s, connreq 0x%p; "
                         "wildcard matched wildcard\n", dgram,
                         libcfs_nid2str(connreq->gncr_srcnid), connreq);

                rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);

                if (rc == -ESHUTDOWN) {
                        CERROR("Looking up network: device is in shutdown\n");
                        return rc;
                } else if (rc == -ENONET) {
                        CERROR("Connection data from %s: she sent "
                               "dst_nid %s, but net lookup failed on "
                               "dgram 0x%p@%s\n",
                               libcfs_nid2str(connreq->gncr_srcnid),
                               libcfs_nid2str(connreq->gncr_dstnid),
                               dgram, kgnilnd_dgram_type2str(dgram));
                        return rc;
                }

                if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
                        CERROR("Bad connection data from %s: she sent "
                               "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
                               libcfs_nid2str(connreq->gncr_srcnid),
                               libcfs_nid2str(connreq->gncr_dstnid),
                               libcfs_nid2str(net->gnn_ni->ni_nid),
                               dgram, kgnilnd_dgram_type2str(dgram));
                        kgnilnd_net_decref(net);
                        return -EBADSLT;
                }

                /* kgnilnd_find_net takes a ref on the net it finds; we need to
                 * decref it when we are done with it */
                kgnilnd_net_decref(net);
        }

        if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
                CERROR("Unexpected version %d\n", connreq->gncr_version);
                return -EPROTO;
        }

        /* XXX Nic: TBD - checksum validation */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
                return -EBADF;
        }

        if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
                __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;

                __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
                __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
                __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
                __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
                __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
                __swab64s(&msg_addr);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
        } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
                __swab32s(&connreq->gncr_nakdata.gnnd_errno);
        }

        /* since we use a unique instance ID for each network, the driver
         * will take care of dropping datagrams if we don't have that network.
         */

        /* a few more idiot software or configuration checks */

        switch (connreq->gncr_type) {
        case GNILND_CONNREQ_REQ:
                /* wire up EP and SMSG block - this will check the incoming data
                 * and barf a NAK back if needed */
                rc = kgnilnd_set_conn_params(dgram);
                if (rc)
                        return rc;
                break;
        case GNILND_CONNREQ_NAK:
        case GNILND_CONNREQ_CLOSE:
                break;
        default:
                CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
                return -EPROTO;
        }

        if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
                CERROR("Received bad timestamps peer "LPU64" conn "LPU64"\n",
                       connreq->gncr_peerstamp, connreq->gncr_connstamp);
                return -EPROTO;
        }

        if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
                CERROR("Received timeout %d < MIN %d\n",
                       connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
                return -EPROTO;
        }

        return 0;
}

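/* Ordering note (restating the comments above): nothing but -EBADF may be
 * returned before gncr_srcnid is fixed up, since the srcnid is what lets
 * the caller route a NAK back to the sender; -EBADF is the one errno that
 * suppresses the NAK entirely. */
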
int
kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
{
        kgn_dgram_t         *dgram;

        dgram = kmem_cache_alloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
        if (dgram == NULL)
                return -ENOMEM;

        /* cache alloc'd memory is not zeroed */
        memset((void *)dgram, 0, sizeof(*dgram));

        INIT_LIST_HEAD(&dgram->gndg_list);
        dgram->gndg_state = GNILND_DGRAM_USED;
        dgram->gndg_type = type;
        dgram->gndg_magic = GNILND_DGRAM_MAGIC;

        atomic_inc(&dev->gnd_ndgrams);

        CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
                " %d\n",
                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
                atomic_read(&dev->gnd_ndgrams));

        *dgramp = dgram;
        return 0;
}

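/* Usage sketch (illustrative): allocate, then free on the failure path:
 *
 *      kgn_dgram_t *dgram;
 *
 *      rc = kgnilnd_alloc_dgram(&dgram, dev, GNILND_DGRAM_REQ);
 *      if (rc == 0) {
 *              // ... post it; on failure:
 *              kgnilnd_cleanup_dgram(dgram);
 *              kgnilnd_free_dgram(dev, dgram);
 *      }
 */
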
/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id.
 * returns < 0 on a dgram to be cleaned up
 * > 0 on a dgram that isn't done yet
 * == 0 on a dgram that is ok and needs connreq processing */
int
kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
{
        int rc = 0;

        switch (post_state) {
        case GNI_POST_COMPLETED:
                /* normal state for dgrams that need actual processing */
                /* GOTO to avoid processing dgram as canceled/done */
                GOTO(process_out, rc);

        case GNI_POST_PENDING:
                /* we should only see this if we are testing a WC dgram after a
                 * cancel - it means that it needs a full cycle of waiting
                 * for kgni_sm_task to finish moving it to TERMINATED */
                LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
                          (dgram->gndg_state == GNILND_DGRAM_CANCELED),
                         "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
                         dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
                         dgram->gndg_state, kgnilnd_dgram_state2str(dgram));

                /* positive RC as this dgram isn't done yet */
                rc = EINPROGRESS;

                /* GOTO as this isn't done yet */
                GOTO(process_out, rc);
                break;

        case GNI_POST_TERMINATED:
                /* we've called cancel and it is done, or the remote guy called
                 * cancel and we've received it on a WC dgram */
#if 0
                /* we are seeing weird terminations on non WC dgrams when we have not
                 * canceled them */

                LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
                         dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
                        "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
                        dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
                        libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
#endif

                CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
                       dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");

                rc = -ECANCELED;
                break;

        case GNI_POST_TIMEOUT:
                /* we could have a timeout on a wildcard dgram too - if
                 * we got the incoming request but the remote node beefed
                 * before kgni could send the match data back. We'll just error
                 * on the active case and bail out gracefully */
                if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                        CNETERR("hardware timeout for connect to "
                               "%s after %lu seconds. Is node dead?\n",
                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                               cfs_duration_sec(jiffies - dgram->gndg_post_time));
                }

                rc = -ETIMEDOUT;
                break;

        default:
                CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
                LBUG();
        }

        /* now finish cleaning up a dgram that is canceled/terminated and needs to
         * go away */

        /* if this was actively canceled, drop the count now that we are processing */
        if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
                atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                /* caller responsible for gndg_list removal */
        }

process_out:

        RETURN(rc);
}

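/* Return convention recap for kgnilnd_process_dgram(): < 0 means clean the
 * dgram up (-ECANCELED or -ETIMEDOUT), 0 means process its connreq, and
 * positive EINPROGRESS means test it again later - callers must not free a
 * dgram that is still pending. */
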
/* needs dev->gnd_dgram_lock held */
void
kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
{
        gni_return_t            grc;

        if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
                return;
        }

        LASSERTF(dgram->gndg_conn != NULL,
                 "dgram 0x%p with NULL conn\n", dgram);

        /* C.E - WC dgrams could be canceled immediately but
         * if there was some match pending, we need to call
         * test_by_id to clear it out. If that test returns
         * POST_PENDING, it is half done and needs to go along
         * with the rest of the dgrams through a kgni_sm_task cycle
         * and deliver a GNI_POST_TERMINATED event before it
         * is actually canceled */

        dgram->gndg_state = GNILND_DGRAM_CANCELED;

        if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
                /* we don't need to cancel_by_id if the datagram was good */
                return;
        }

        /* let folks know there are outstanding cancels */
        atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
        /* leave on the nid list until the cancel is done, for debugging fun */
        grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);

        /* if we don't get success here, we have hosed up the dgram tracking
         * code and need to bail out */
        LASSERTF(grc == GNI_RC_SUCCESS,
                 "postdata_cancel returned %d for conn 0x%p to %s\n",
                 grc, dgram->gndg_conn,
                 dgram->gndg_conn->gnc_peer ?
                  libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
                  : "<?>");

        CDEBUG(D_NETTRACE,
                "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
                dgram, dgram->gndg_conn,
                dgram->gndg_conn->gnc_ephandle);

        if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
                gni_post_state_t         post_state;
                int                      rc = 0;
                __u32                    remote_addr = 0, remote_id = 0;

                grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
                                                     (__u64)dgram, &post_state,
                                                     &remote_addr, &remote_id);

                LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
                         "bad grc %d from test_by_id on dgram 0x%p\n",
                         grc, dgram);

                /* if the WC was canceled immediately, we get NO_MATCH; if it needs
                 * to go through the full cycle, we get SUCCESS and need to parse
                 * post_state */

                CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
                        "remote_addr %u remote_id %u\n", grc, dgram,
                        kgnilnd_dgram_type2str(dgram),
                        post_state, remote_addr, remote_id);

                if (grc == GNI_RC_NO_MATCH) {
                        /* she's gone, reduce count and move along */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                        RETURN_EXIT;
                }

                rc = kgnilnd_process_dgram(dgram, post_state);

                if (rc <= 0) {
                        /* if for some weird reason we get a valid dgram back, just mark it
                         * done so we can drop it and move along.
                         * C.E - if it was completed, we'll just release the conn/mbox
                         * back into the pool and it'll get reused. That said, we should only
                         * be canceling a WC dgram on stack reset or shutdown, so that is moot */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);

                        /* caller context responsible for calling kgnilnd_release_dgram() */
                } else {
                        /* still pending, let it simmer until golden brown and delicious */
                }
        }

        /* non WC dgrams are still on the nid list but marked canceled, waiting
         * for kgni to return their ID to us via probe - that is when we'll complete
         * their cancel processing */
}

void
kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
{
        /* release the dgram ref on the conn */
        if (dgram->gndg_conn) {
                kgnilnd_conn_decref(dgram->gndg_conn);
                dgram->gndg_conn = NULL;
        }
}

void
kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
{
        LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
                 dgram->gndg_state == GNILND_DGRAM_DONE,
                 "dgram 0x%p with bad state %s\n",
                 dgram, kgnilnd_dgram_state2str(dgram));

        /* bit of poisoning to help detect bad driver data */
        dgram->gndg_magic = 0x6f5a6b5f;
        atomic_dec(&dev->gnd_ndgrams);

        /* log before freeing - the type2str lookup must not touch freed memory */
        CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
               " ndgrams %d\n",
               sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
               atomic_read(&dev->gnd_ndgrams));
        kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
}

1163 int
1164 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1165                    int data_rc)
1166 {
1167         int              rc = 0;
1168         kgn_dgram_t     *dgram = NULL;
1169         kgn_dgram_t     *tmpdgram;
1170         kgn_dgram_type_t dgtype;
1171         gni_return_t     grc;
1172         __u64            srcnid;
1173         ENTRY;
1174
1175         switch (type) {
1176         case GNILND_CONNREQ_REQ:
1177                 if (dstnid == LNET_NID_ANY)
1178                         dgtype = GNILND_DGRAM_WC_REQ;
1179                 else
1180                         dgtype = GNILND_DGRAM_REQ;
1181                 break;
1182         case GNILND_CONNREQ_NAK:
1183                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1184                 dgtype = GNILND_DGRAM_NAK;
1185                 break;
1186         default:
1187                 CERROR("unknown connreq type %d\n", type);
1188                 LBUG();
1189         }
1190
1191         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1192         if (rc < 0) {
1193                 rc = -ENOMEM;
1194                 GOTO(post_failed, rc);
1195         }
1196
1197         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1198         if (rc) {
1199                 GOTO(post_failed, rc);
1200         }
1201
1202         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1203                 /* clear buffer for sanity on reuse of wildcard */
1204                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1205         }
1206
1207         if (dstnid == LNET_NID_ANY) {
1208                 /* set here to reset any dgram re-use */
1209                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1210         } else {
1211                 __u32            host_id;
1212
1213                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1214                 if (rc <= 0) {
1215                         rc = -ESRCH;
1216                         GOTO(post_failed, rc);
1217                 }
1218
1219                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1220
1221                 /* don't need to serialize, there are no CQs for the dgram
1222                  * EP on the kgn_net_t */
1223                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1224
1225                 if (grc != GNI_RC_SUCCESS) {
1226                         rc = -ECONNABORTED;
1227                         GOTO(post_failed, rc);
1228                 }
1229
1230         }
1231
1232         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1233          * net of the destination node.
1234          */
1235
1236         if (dstnid == LNET_NID_ANY) {
1237                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1238         } else {
1239                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1240         }
1241
1242         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1243                                   srcnid, dstnid, type);
1244         if (rc) {
1245                 GOTO(post_failed, rc);
1246         }
1247
1248         if (type == GNILND_CONNREQ_NAK)
1249                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1250
1251         dgram->gndg_post_time = jiffies;
1252
1253         /* XXX Nic: here is where we'd add in logical network multiplexing */
1254
1255         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1256                dgram, kgnilnd_dgram_type2str(dgram),
1257                libcfs_nid2str(srcnid),
1258                libcfs_nid2str(dstnid), dev->gnd_id);
1259
1260         /* this allocates memory, can't hold locks across */
1261         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1262                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1263                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1264                                    (__u64)dgram);
1265
1266         if (grc != GNI_RC_SUCCESS) {
1267                 CNETERR("dropping failed dgram post id 0x%p type %s"
1268                         " reqtype %s to %s: rc %d\n",
1269                         dgram, kgnilnd_dgram_type2str(dgram),
1270                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1271                         libcfs_nid2str(dstnid), grc);
1272                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1273                 GOTO(post_failed, rc);
1274         }
1275
1276         /* we don't need to add earlier - if someone does del_peer during post,
1277          * that peer will get marked as unlinked and the callers wil take care of it.
1278          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1279          * the completed dgram later when we cant find a peer to stuff it into */
1280
1281         spin_lock(&dev->gnd_dgram_lock);
1282
1283         /* make sure we are not double posting targeted dgrams
1284          * - we can multiple post WC dgrams to help with processing speed */
1285         if (dstnid != LNET_NID_ANY) {
1286                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1287
1288                 LASSERTF(tmpdgram == NULL,
1289                         "dgram 0x%p->%s already posted\n",
1290                          dgram, libcfs_nid2str(dstnid));
1291         }
1292
1293         /* unmunge dstnid to help processing code cope... */
1294         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1295                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1296         }
1297
1298         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1299         dgram->gndg_state = GNILND_DGRAM_POSTED;
1300         spin_unlock(&dev->gnd_dgram_lock);
1301
1302 post_failed:
1303         if (rc < 0 && dgram != NULL) {
1304                 kgnilnd_cleanup_dgram(dgram);
1305                 kgnilnd_free_dgram(dev, dgram);
1306         }
1307
1308         RETURN(rc);
1309 }
1310
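/* Release a datagram back to the system: cancel it if it is still posted,
 * move the conns of canceled active dgrams to purgatory so their mailboxes
 * are not reused, and repost wildcards while the LND is still accepting
 * connections. */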
1311 /* The shutdown flag is set from the shutdown and stack reset threads. */
1312 void
1313 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1314 {
1315         /* The conns of canceled active dgrams need to be put in purgatory so
1316          * we don't reuse the mailbox */
1317         if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1318                 kgn_peer_t *peer;
1319                 kgn_conn_t *conn = dgram->gndg_conn;
1320                 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1321
1322                 dgram->gndg_state = GNILND_DGRAM_DONE;
1323
1324                 /* During shutdown we've already removed the peer so we don't
1325                  * need to add a peer. During stack reset we don't care about
1326                  * MDDs since they are all released. */
1327                 if (!shutdown) {
1328                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1329                         peer = kgnilnd_find_peer_locked(nid);
1330
1331                         if (peer != NULL) {
1332                                 CDEBUG(D_NET, "adding peer's conn with nid %s "
1333                                         "to purgatory\n", libcfs_nid2str(nid));
1334                                 kgnilnd_conn_addref(conn);
1335                                 conn->gnc_peer = peer;
1336                                 kgnilnd_peer_addref(peer);
1337                                 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1338                                 conn->gnc_state = GNILND_CONN_CLOSED;
1339                                 list_add_tail(&conn->gnc_list,
1340                                               &peer->gnp_conns);
1341                                 kgnilnd_add_purgatory_locked(conn,
1342                                                              conn->gnc_peer);
1343                                 kgnilnd_schedule_conn(conn);
1344                         }
1345                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1346                 }
1347         }
1348
1349         spin_lock(&dev->gnd_dgram_lock);
1350         kgnilnd_cancel_dgram_locked(dgram);
1351         spin_unlock(&dev->gnd_dgram_lock);
1352
1353         kgnilnd_cleanup_dgram(dgram);
1354
1355         /* if the dgram is 'canceled' it needs to wait until the event
1356          * comes up from kgni that tells us it is safe to release it */
1357         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1358                 dgram->gndg_state = GNILND_DGRAM_DONE;
1359
1360                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1361
1362                 /* if it is a wildcard and we are in an appropriate state, repost
1363                  * the wildcard */
1364
1365                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1366                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1367                         int     rerc;
1368
1369                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1370                         if (rerc != 0) {
1371                                 /* We failed to repost the WC dgram for some reason;
1372                                  * mark it so the repost system attempts a repost */
1373                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1374                         }
1375                 }
1376
1377                 /* always free the old dgram */
1378                 kgnilnd_free_dgram(dev, dgram);
1379         }
1380 }
1381
1382
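/* Probe kgni for a completed datagram on this device and pull it off the
 * wire. Returns 0 if nothing was ready, 1 if a dgram was received and is
 * ready for further processing, or a negative errno if the completed dgram
 * could not be recovered. On any nonzero return, *dgramp holds the dgram
 * and the caller is responsible for releasing it. */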
1383 int
1384 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1385 {
1386         kgn_dgram_t             *dgram = NULL;
1387         gni_post_state_t         post_state;
1388         gni_return_t             grc;
1389         int                      rc = 0;
1390         __u64                    readyid;
1391         __u32                    remote_addr = 0, remote_id = 0;
1392         ENTRY;
1393
1394         /* Probe with the lock held. That way if we get a dgram we don't have it canceled
1395          * between finding the ready dgram and grabbing the lock to remove it from the
1396          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1397          * once it's off the list so we don't need to worry about others changing it at
1398          * that point. */
1399         spin_lock(&dev->gnd_dgram_lock);
1400         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1401         if (grc != GNI_RC_SUCCESS) {
1402                 spin_unlock(&dev->gnd_dgram_lock);
1403                 /* return 0 to indicate nothing happened */
1404                 RETURN(0);
1405         }
1406
1407         CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
1408                 readyid, dev);
1409
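        /* the post id we handed to kgni in ep_postdata_w_id is the dgram
         * pointer itself, so the ready id converts straight back */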
1410         dgram = (kgn_dgram_t *)readyid;
1411
1412         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1413                  "dgram 0x%p from id "LPX64" with bad magic %x\n",
1414                  dgram, readyid, dgram->gndg_magic);
1415
1416         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1417                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1418                  "dgram 0x%p with bad state %s\n",
1419                  dgram, kgnilnd_dgram_state2str(dgram));
1420
1421         LASSERTF(!list_empty(&dgram->gndg_list),
1422                  "dgram 0x%p with bad list state %s type %s\n",
1423                  dgram, kgnilnd_dgram_state2str(dgram),
1424                  kgnilnd_dgram_type2str(dgram));
1425
1426         /* now we know that the datagram structure is ok, so pull off list */
1427         list_del_init(&dgram->gndg_list);
1428
1429         /* while we have the gnd_dgram_lock and BEFORE we call test_by_id
1430          * change the state from POSTED to PROCESSING to ensure that
1431          * nobody cancels it after we've pulled it from the wire */
1432         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1433                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1434         }
1435
1436         LASSERTF(dgram->gndg_conn != NULL,
1437                 "dgram 0x%p with NULL conn\n", dgram);
1438
1439         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1440                                              (__u64)dgram, &post_state,
1441                                              &remote_addr, &remote_id);
1442
1443         /* we now "own" this datagram */
1444         spin_unlock(&dev->gnd_dgram_lock);
1445
1446         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1447                  " id "LPU64" was ready\n", readyid);
1448
1449         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1450                 "remote_addr %u remote_id %u\n", grc, dgram,
1451                 kgnilnd_dgram_type2str(dgram),
1452                 post_state, remote_addr, remote_id);
1453
1454         if (unlikely(grc != GNI_RC_SUCCESS)) {
1455                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1456                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1457                         grc);
1458                 rc = -EINVAL;
1459                 GOTO(probe_for_out, rc);
1460         }
1461
1462         rc = kgnilnd_process_dgram(dgram, post_state);
1463
1464         /* probe should never find a dgram for us that turns out to be a WC
1465          * dgram still in the middle of processing */
1466         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1467                  rc, dgram, post_state);
1468
1469         if (rc == 0) {
1470                 /* dgram is good enough for the data to be used */
1471                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1472                 /* fake rc to mark that we've done something */
1473                 rc = 1;
1474         } else {
1475                 /* let kgnilnd_release_dgram take care of canceled dgrams */
1476                 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1477                         dgram->gndg_state = GNILND_DGRAM_DONE;
1478                 }
1479         }
1480
1481         *dgramp = dgram;
1482         RETURN(rc);
1483
1484 probe_for_out:
1485         /* hand the dgram back even on error so the caller releases it exactly
1486          * once, with the shutdown flag appropriate to its context */
1487         *dgramp = dgram;
1488         RETURN(rc);
1488 }
1489
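/* Post one wildcard datagram per configured wildcard so peers can connect
 * to us. Returns 0 on success, -EINVAL if a post fails, or -ENOENT when no
 * wildcards are configured. */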
1490 int
1491 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1492 {
1493         /* if kgn_nwildcard is zero, return -ENOENT */
1494         int     rc = -ENOENT, i;
1495         ENTRY;
1496
1497         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1498                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1499                 if (rc < 0) {
1500                         CERROR("error %d: could not post wildcard datagram # %d\n",
1501                                 rc, i);
1502                         rc = -EINVAL;
1503                         GOTO(failed, rc);
1504                 }
1505         }
1506
1507 failed:
1508         RETURN(rc);
1509 }
1510
1511 int
1512 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1513 {
1514         kgn_dgram_t            *dg, *dgN;
1515         struct list_head        zombies;
1516         int                     i;
1517         ENTRY;
1518
1519         /* we want to cancel any outstanding dgrams - we don't want to rely
1520          * on del_peer_or_conn catching all of them. This helps protect us in cases
1521          * where we don't quite keep the peer->dgram mapping in sync due to some
1522          * race conditions */
1523
1524         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1525                  "called with LND invalid state: net shutdown %d "
1526                  "in reset %d\n", net->gnn_shutdown,
1527                  kgnilnd_data.kgn_in_reset);
1528
1529         INIT_LIST_HEAD(&zombies);
1530
1531         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1532
1533         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1534                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1535
1536                         /* skip wildcard dgrams and dstnids that are not on our net */
1539                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1540                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1541                                 continue;
1542
1543                         kgnilnd_cancel_dgram_locked(dg);
1544                 }
1545         }
1546
1547         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1548
1549         RETURN(0);
1550 }
1551
1552 int
1553 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1554 {
1555         kgn_dgram_t *dg, *dgN;
1556         struct list_head zombies;
1557         ENTRY;
1558
1559         /* Time to kill the outstanding WCs -
1560          * WCs exist on net 0 only but match on any net...
1561          */
1562
1563         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1564                 "called with LND invalid state: WC shutdown %d "
1565                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1566                 kgnilnd_data.kgn_in_reset);
1567
1568         INIT_LIST_HEAD(&zombies);
1569         spin_lock(&dev->gnd_dgram_lock);
1570
1571         do {
1572                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1573                 if (dg != NULL) {
1574                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1575                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1576                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1577                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1578
1579                         kgnilnd_cancel_dgram_locked(dg);
1580
1581                         /* WC could be DONE already, check and if so add to list to be released */
1582                         if (dg->gndg_state == GNILND_DGRAM_DONE) {
1583                                 list_del_init(&dg->gndg_list);
1584                                 list_add_tail(&dg->gndg_list, &zombies);
1585                         }
1586                 }
1587         } while (dg != NULL);
1588
1589         spin_unlock(&dev->gnd_dgram_lock);
1590
1591         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1592                 list_del_init(&dg->gndg_list);
1593                 kgnilnd_release_dgram(dev, dg, 1);
1594         }
1595         RETURN(0);
1596
1597 }
1598
1599 int
1600 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1601 {
1602         kgn_dgram_t *dg, *dgN;
1603         int i;
1604         ENTRY;
1605
1606         /* Cancel any outstanding non-wildcard datagrams regardless
1607          * of which net they are on, as we are in base shutdown and
1608          * don't care about connecting anymore.
1609          */
1610
1611         LASSERTF(kgnilnd_data.kgn_wc_kill == 1, "We didn't get called from base shutdown\n");
1612
1613         spin_lock(&dev->gnd_dgram_lock);
1614
1615         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1616                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1617                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1618                                 kgnilnd_cancel_dgram_locked(dg);
1619                 }
1620         }
1621
1622         spin_unlock(&dev->gnd_dgram_lock);
1623
1624         RETURN(0);
1625 }
1626
1627
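/* Spin until all canceled datagrams on the device have completed, draining
 * and releasing any that become ready while we wait. */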
1628 void
1629 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1630 {
1631         int             i = 4;
1632         int             rc;
1633         gni_return_t    grc;
1634         __u64           readyid;
1635         kgn_dgram_t    *dgram;
1636
1637         /* use do/while to get at least one check run, to allow the
1638          * regression test for bug 762072 to hit the bug if it is there */
1639
1640         /* This function races with the dgram mover during shutdown so it is possible for
1641          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1642          * dgram mover thread instead of inside of this function.
1643          */
1644
1645         /* This should only be called from within shutdown, base shutdown, or stack reset.
1646          * There are no assertions here to verify that, since by base shutdown there is
1647          * nothing left we can check - the net is gone by then.
1648          */
1649
1650         do {
1651                 i++;
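                /* log to the console only when i is a power of two, to
                 * throttle the warning while we spin */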
1652                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1653                         "Waiting for %d canceled datagrams to clear on device %d\n",
1654                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1655
1656                 /* check four times a second */
1657                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1658                        250, &readyid);
1659
1660                 if (grc != GNI_RC_SUCCESS)
1661                         continue;
1662
1663                 CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
1664                         readyid, dev->gnd_id, dev);
1665
1666                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1667                 if (rc != 0) {
1668                         /* if we got a valid dgram or one that is now done, clean up */
1669                         kgnilnd_release_dgram(dev, dgram, 1);
1670                 }
1671         } while (atomic_read(&dev->gnd_canceled_dgrams));
1672 }
1673
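/* Post an active connreq datagram to a peer. Returns 0 on success, positive
 * ESTALE when the peer went away or needs death (no dgram cleanup wanted),
 * or a negative errno when the post itself failed. */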
1674 int
1675 kgnilnd_start_connect(kgn_peer_t *peer)
1676 {
1677         int              rc = 0;
1678         /* sync point for kgnilnd_del_peer_locked - do an early check to
1679          * catch the most common hits where del_peer is done by the
1680          * time we get here */
1681         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1682                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1683         }
1684
1685         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1686         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1687                 /* raced with peer getting unlinked */
1688                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1689                 rc = ESTALE;
1690                 GOTO(out, rc);
1691         }
1692         peer->gnp_connecting = GNILND_PEER_POSTING;
1693         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1694
1695         set_mb(peer->gnp_last_dgram_time, jiffies);
1696         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1697                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1698         }
1699
1700         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1701                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1702                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1703         } else {
1704                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1705                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1706         }
1707         if (rc < 0) {
1708                 set_mb(peer->gnp_last_dgram_errno, rc);
1709                 GOTO(failed, rc);
1710         }
1711
1712         /* while we're posting someone could have decided this peer/dgram needed to
1713          * die a quick death, so we check for state change and process accordingly */
1714
1715         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1716         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1717                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1718                         peer->gnp_connecting = GNILND_PEER_KILL;
1719                 }
1720                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1721                 /* positive RC to avoid dgram cleanup - we'll have to
1722                  * wait for the kgni GNI_POST_TERMINATED event to
1723                  * finish cleaning up */
1724                 rc = ESTALE;
1725                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1726                 GOTO(out, rc);
1727         }
1728         peer->gnp_connecting = GNILND_PEER_POSTED;
1729         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1730         /* reaper thread will take care of any timeouts */
1731         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1732                libcfs_nid2str(peer->gnp_nid), rc);
1733
1734         RETURN(rc);
1735
1736 failed:
1737         CDEBUG(D_NET, "connect to %s failed: rc %d\n",
1738                libcfs_nid2str(peer->gnp_nid), rc);
1739 out:
1740         RETURN(rc);
1741 }
1742
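/* Wire up the peer and conn for a completed connreq and queue any TX that
 * were blocked waiting for a connection. Returns 0 on success, a negative
 * errno if no peer could be created, or positive ECANCELED/EALREADY when
 * this connreq loses the race with an existing conn - we are already
 * connected, so there are no blocked TX to fail. */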
1743 int
1744 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1745 {
1746         kgn_conn_t        *conn = dgram->gndg_conn;
1747         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1748         kgn_peer_t        *new_peer, *peer = NULL;
1749         kgn_tx_t          *tx;
1750         kgn_tx_t          *txn;
1751         kgn_mbox_info_t   *mbox;
1752         int                rc;
1753         int                nstale;
1754
1755         /* try to find a peer that matches the nid we got in the connreq
1756          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1757          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1758
1759         /* assume this is a new peer - it makes the locking cleaner even when it isn't */
1760         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1761
1762         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_RCA_NODE_UP);
1763         if (rc != 0) {
1764                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1765                 return rc;
1766         }
1767
1768         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1769
1770         /* this transfers ref from create_peer to the kgn_peer table */
1771         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1772
1773         /* if we found an existing peer, is it really ready for a new conn ? */
1774         if (peer != new_peer) {
1775                 /* if this was an active connect attempt but we can't find a peer waiting for it
1776                  * we will dump it in the trash */
1777
1778                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1779                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1780                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1781                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1782                         rc = ECANCELED;
1783                         GOTO(out, rc);
1784                 }
1785
1786                 /* check to see if we can catch a connecting peer before it is
1787                  * removed from the connd_peers list - if not, we need to
1788                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1789                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1790                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1791                         if (!list_empty(&peer->gnp_connd_list)) {
1792                                 list_del_init(&peer->gnp_connd_list);
1793                                 /* drop connd ref */
1794                                 kgnilnd_peer_decref(peer);
1795                         }
1796                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1797                         /* clear rc to make sure we don't have a fake error */
1798                         rc = 0;
1799                 }
1800
1801                 /* no matter what, we are no longer waiting to connect this peer now */
1802                 peer->gnp_connecting = GNILND_PEER_IDLE;
1803
1804                 /* Refuse to duplicate an existing connection (both sides might try to
1805                  * connect at once).  NB we return success!  We _are_ connected so we
1806                  * _don't_ have any blocked txs to complete with failure. */
1807                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1808                 if (rc != 0) {
1809                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1810                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1811                               libcfs_nid2str(her_nid), rc);
1812                         rc = EALREADY;
1813                         GOTO(out, rc);
1814                 }
1815         }
1816
1817         if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
1818                 CNETERR("Received connection request from %s that RCA thinks is"
1819                         " down.\n", libcfs_nid2str(her_nid));
1820                 peer->gnp_down = GNILND_RCA_NODE_UP;
1821         }
1822
1823         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1824
1825         /* either way with peer (new or existing), we are ok with ref counts here as the
1826          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1827          * ref for the peer table. */
1828
1829         /* at this point, the connection request is a winner */
1830
1831         /* mark 'DONE' to avoid cancel being called from release */
1832         dgram->gndg_state = GNILND_DGRAM_DONE;
1833
1834         /* initialise timestamps before reaper looks at them */
1835         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1836
1837         /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1838          * immediately send a NOOP in the reaper thread during the call to
1839          * kgnilnd_check_conn_timeouts_locked
1840          */
1841         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1842         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1843
1844         /* save the dgram type used to establish this connection */
1845         conn->gnc_dgram_type = dgram->gndg_type;
1846
1847         /* refs are not transferred from dgram to tables, so increment to
1848          * take ownership */
1849         kgnilnd_conn_addref(conn);
1850         kgnilnd_peer_addref(peer);
1851         conn->gnc_peer = peer;
1852         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1853
1854         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1855         list_add_tail(&conn->gnc_hashlist,
1856                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1857         kgnilnd_data.kgn_conn_version++;
1858
1859         /* Don't send a NOOP if fail_loc is set */
1861         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1862                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1863                 if (tx == NULL) {
1864                         CNETERR("can't get TX to initiate NOOP to %s\n",
1865                                 libcfs_nid2str(peer->gnp_nid));
1866                 } else {
1867                         kgnilnd_queue_tx(conn, tx);
1868                 }
1869         }
1870
1871         /* Schedule all packets blocking for a connection */
1872         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1873                 /* lock held here is the peer_conn lock */
1874                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1875                 kgnilnd_queue_tx(conn, tx);
1876         }
1877
1878         /* If this is an active connection, let's mark its timestamp on the MBoX */
1879         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1880                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1881                 /* conn->gnc_last_rx is jiffies; it had better exist, as it was just set */
1882                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1883         }
1884
1885         /* Bug 765042: wake up the scheduler for a race between finish_connect and
1886          * complete_conn_closed with a conn in purgatory.
1887          * Since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1888          * we just check for the flag being set and then clear it */
1889         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1890                 cfs_fail_loc = 0x0;
1891                 /* get scheduler thread moving again */
1892                 kgnilnd_schedule_device(conn->gnc_device);
1893         }
1894
1895         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1896                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1897
1898         /* make sure we reset peer reconnect interval now that we have a good conn */
1899         kgnilnd_peer_alive(peer);
1900         peer->gnp_reconnect_interval = 0;
1901
1902         /* clear the unlink attribute; if we don't, kgnilnd_del_conn_or_peer
1903          * will wait on the atomic forever
1904          */
1905         if (peer->gnp_pending_unlink) {
1906                 peer->gnp_pending_unlink = 0;
1907                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1908                 CDEBUG(D_NET, "Clearing peer unlink %p\n", peer);
1909         }
1910
1911         /* add ref to make it hang around until after we drop the lock */
1912         kgnilnd_conn_addref(conn);
1913
1914         /* Once the peer_conn lock is dropped, the conn could actually move into
1915          * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1916          * lock until we are really done */
1917         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1918
1919         /* Notify LNET that we now have a working connection to this peer.
1920          * This is a Cray extension to the "standard" LND behavior. */
1921         lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
1922                      1, cfs_time_current());
1923
1924         /* drop our 'hold' ref */
1925         kgnilnd_conn_decref(conn);
1926
1927 out:
1928         RETURN(rc);
1929 }
1930
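/* Post a NAK datagram telling dst_nid why its connreq was rejected. */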
1931 void
1932 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1933 {
1934         int              rc = 0;
1935         ENTRY;
1936
1937         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1938
1939         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1940
1941         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1942
1943         if (rc < 0) {
1944                 CDEBUG(D_NET, "NAK to %s failed: rc %d\n", libcfs_nid2str(dst_nid), rc);
1945         }
1946         EXIT;
1947 }
1948
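/* Handle a NAK received for one of our connreqs: close any stale conns it
 * identifies, or cancel the pending connect attempt that triggered it. */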
1949 int
1950 kgnilnd_process_nak(kgn_dgram_t *dgram)
1951 {
1952         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1953         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1954         int                errno = connreq->gncr_nakdata.gnnd_errno;
1955         kgn_peer_t        *peer;
1956         int                rc = 0;
1957
1958         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1959
1960         peer = kgnilnd_find_peer_locked(src_nid);
1961         if (peer == NULL) {
1962                 /* we likely dropped him from bad data when we processed
1963                  * the original REQ */
1964                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1965                 return -EBADSLT;
1966         }
1967
1968         /* need to check peerstamp/connstamp against the ones we find
1969          * to make sure we don't close new (and good?) conns that we
1970          * formed after this connreq failed */
1971         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1972                 kgn_conn_t        conn;
1973
1974                 if (list_empty(&peer->gnp_conns)) {
1975                         /* assume we already processed the datagram and it
1976                          * barfed up on this side too */
1977                         CDEBUG(D_NET, "dropping NAK from %s; "
1978                                "peer %s is already not connected\n",
1979                                 libcfs_nid2str(connreq->gncr_srcnid),
1980                                 libcfs_nid2str(connreq->gncr_dstnid));
1981                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1982                         return 0;
1983                 }
1984
1985                 /* stub up a connection with the connreq XXX_stamps to allow
1986                  * us to use close_stale_conns_locked */
1987                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
1988                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
1989                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
1990                 conn.gnc_device = peer->gnp_net->gnn_dev;
1991
1992                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
1993
1994                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1995                         "closed %d connections\n",
1996                         libcfs_nid2str(connreq->gncr_srcnid),
1997                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
1998         } else {
1999                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2000
2001                 if (list_empty(&peer->gnp_connd_list)) {
2002                         /* if peer isn't on waiting list, try to find one to nuke */
2003                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2004                                                            peer->gnp_nid);
2005
2006                         if (rc) {
2007                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2008                                         "canceled pending connect request\n",
2009                                         libcfs_nid2str(connreq->gncr_srcnid),
2010                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
2011                         }
2012
2013                         /* if we can't find a waiting dgram, we just drop the NAK - the
2014                          * connect must have failed (we didn't find a conn above and
2015                          * connecting is already clear) - so nothing to do besides drop */
2016                 } else {
2017                         /* peer is on the list, meaning this is a new connect attempt beyond
2018                          * the one we started that generated the NAK - so just drop the NAK */
2019
2020                         /* use negative to prevent error message */
2021                         rc = -EAGAIN;
2022                 }
2023                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2024         }
2025
2026         /* success! we found a peer and at least marked pending_nak */
2027         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2028
2029         return rc;
2030 }
2031
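/* Unpack a completed connreq and dispatch on its type, setting *needs_nak
 * when the sender should be told why the request was rejected. */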
2032 int
2033 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2034 {
2035         int                      rc;
2036
2037         rc = kgnilnd_unpack_connreq(dgram);
2038         if (rc < 0) {
2039                 if (rc != -EBADF) {
2040                         /* only NAK if we have a good srcnid to use */
2041                         *needs_nak = 1;
2042                 }
2043                 goto connreq_out;
2044         }
2045
2046         switch (dgram->gndg_conn_in.gncr_type) {
2047         case GNILND_CONNREQ_REQ:
2048                 /* wire up peer & conn, send queued TX */
2049                 rc = kgnilnd_finish_connect(dgram);
2050
2051                 /* don't nak when the nid is hosed */
2052                 if (rc < 0) {
2053                         *needs_nak = 1;
2054                 }
2055
2056                 break;
2057         case GNILND_CONNREQ_NAK:
2058                 rc = kgnilnd_process_nak(dgram);
2059                 /* return early to prevent reconnect bump */
2060                 return rc;
2061         default:
2062                 CERROR("unexpected connreq type %s (%d) from %s\n",
2063                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2064                         dgram->gndg_conn_in.gncr_type,
2065                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2066                 rc = -EINVAL;
2067                 *needs_nak = 1;
2068                 break;
2069         }
2070
2071 connreq_out:
2072         RETURN(rc);
2073 }
2074
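/* Pull one completed datagram off the device and process it, notifying the
 * waiting peer if the connection attempt failed. Returns 0 if nothing was
 * ready, 1 otherwise. */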
2075 int
2076 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2077 {
2078         int                      rc;
2079         int                      needs_nak = 0;
2080         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2081         lnet_nid_t               orig_dstnid;
2082         kgn_dgram_t             *dgram = NULL;
2083         kgn_peer_t              *peer;
2084         ENTRY;
2085
2086         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2087                 rc = 0;
2088         } else {
2089                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2090         }
2091
2092         if (rc == 0) {
2093                 RETURN(0);
2094         } else if (rc < 0) {
2095                 GOTO(inform_peer, rc);
2096         } else {
2097                 /* rc > 0 means it did something; reset for this func */
2098                 rc = 0;
2099         }
2100
2101         switch (dgram->gndg_type) {
2102         case GNILND_DGRAM_WC_REQ:
2103         case GNILND_DGRAM_REQ:
2104                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2105                 break;
2106         case GNILND_DGRAM_NAK:
2107                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2108                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2109                 break;
2110         default:
2111                 CERROR("unknown datagram type %s (%d)\n",
2112                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2113                 break;
2114         }
2115
2116         /* stash data to use after releasing current datagram */
2117         /* don't stash net - we are operating on a net already,
2118          * so holding kgn_net_rw_sem is sufficient */
2119
2120         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2121
2122 inform_peer:
2123         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2124
2125         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2126
2127         kgnilnd_release_dgram(dev, dgram, 0);
2128
2129         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2130                libcfs_nid2str(orig_dstnid), rc);
2131
2132         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2133          * in kgnilnd_finish_connect - if errors are from before we get to there,
2134          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2135         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2136                 /* if we have a negative rc, we want to find a peer to inform about
2137                  * the bad connection attempt. Sorry buddy, better luck next time! */
2138
2139                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2140                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2141
2142                 if (peer != NULL) {
2143                         /* add ref to make sure he stays around past the possible unlink
2144                          * so we can tell LNet about him */
2145                         kgnilnd_peer_addref(peer);
2146
2147                         /* if he still cares about the outstanding connect */
2148                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2149                                 /* check if he is on the connd list and remove.. */
2150                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2151                                 if (!list_empty(&peer->gnp_connd_list)) {
2152                                         list_del_init(&peer->gnp_connd_list);
2153                                         /* drop connd ref */
2154                                         kgnilnd_peer_decref(peer);
2155                                 }
2156                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2157
2158                                 /* clear gnp_connecting so we don't have a non-connecting peer
2159                                  * on gnd_connd_list */
2160                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2161
2162                                 set_mb(peer->gnp_last_dgram_errno, rc);
2163
2164                                 kgnilnd_peer_increase_reconnect_locked(peer);
2165                         }
2166                 }
2167                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2168
2169                 /* now that we are outside the lock, tell Mommy */
2170                 if (peer != NULL) {
2171                         kgnilnd_peer_notify(peer, rc);
2172                         kgnilnd_peer_decref(peer);
2173                 }
2174         }
2175
2176         if (needs_nak) {
2177                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2178         }
2179
2180         RETURN(1);
2181 }
2182
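/* Walk the posted dgrams and cancel any targeted connreq that has been
 * outstanding longer than the LND timeout; wildcards never time out. */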
2183 void
2184 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2185 {
2186         kgn_dgram_t    *dgram, *tmp;
2187         int             i;
2188
2189         spin_lock(&dev->gnd_dgram_lock);
2190
2191         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
2192                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2193                         unsigned long            now = jiffies;
2194                         unsigned long            timeout;
2195
2196                         /* don't timeout stuff if the network is mucked or shutting down */
2197                         if (kgnilnd_check_hw_quiesce()) {
2198                                 break;
2199                         }
2200
2201                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2202                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2203                                 continue;
2204                         }
2205                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2206                                 "state %s conn 0x%p to %s age %lus\n",
2207                                 dgram, kgnilnd_dgram_type2str(dgram),
2208                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2209                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2210                                 cfs_duration_sec(now - dgram->gndg_post_time));
2211
2212                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2213
2214                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2215                                 continue;
2216
2217                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2218                                 "0x%p state %s conn 0x%p\n",
2219                                 kgnilnd_dgram_type2str(dgram),
2220                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2221                                 cfs_duration_sec(now - dgram->gndg_post_time),
2222                                 dgram, kgnilnd_dgram_state2str(dgram),
2223                                 dgram->gndg_conn);
2224
2225                         kgnilnd_cancel_dgram_locked(dgram);
2226                 }
2227         }
2228         spin_unlock(&dev->gnd_dgram_lock);
2229 }
2230
2231
2232 /* use a thread for the possibly long-blocking wait_by_id to prevent
2233  * stalling the global workqueues */
2234 int
2235 kgnilnd_dgram_waitq(void *arg)
2236 {
2237         kgn_device_t     *dev = (kgn_device_t *) arg;
2238         char              name[16];
2239         gni_return_t      grc;
2240         __u64             readyid;
2241         DEFINE_WAIT(mover_done);
2242
2243         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2244         cfs_block_allsigs();
2245
2246         /* all gnilnd threads need to run fairly urgently */
2247         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2248
2249         /* we don't shut down until the device shuts down ... */
2250         while (!kgnilnd_data.kgn_shutdown) {
2251                 /* to quiesce or to not quiesce, that is the question */
2252                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2253                         KGNILND_SPIN_QUIESCE;
2254                 }
2255
2256                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2257
2258                 /* check once a second */
2259                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2260                                                        1000, &readyid);
2261
2262                 if (grc == GNI_RC_SUCCESS) {
2263                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2264                         kgnilnd_schedule_dgram(dev);
2265
2266                         /* wait for dgram thread to ping us before spinning again */
2267                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2268                                         TASK_INTERRUPTIBLE);
2269
2270                         /* don't sleep if we need to quiesce */
2271                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2272                                 schedule();
2273                         }
2274                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2275                 }
2276         }
2277
2278         kgnilnd_thread_fini();
2279         return 0;
2280 }
2281
2282 int
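/* Drain gnd_connd_peers, posting an active connect for each waiting peer
 * until the deadline hits or we run out of datagrams. Returns the number
 * of connects we made progress on. */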
2283 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2284 {
2285         int                      did_something = 0, rc;
2286         kgn_peer_t              *peer = NULL;
2287
2288         spin_lock(&dev->gnd_connd_lock);
2289
2290         /* Active connect - we added this in kgnilnd_launch_tx */
2291         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2292                 peer = list_first_entry(&dev->gnd_connd_peers,
2293                                         kgn_peer_t, gnp_connd_list);
2294
2295                 /* ref for connd removed in if/else below */
2296                 list_del_init(&peer->gnp_connd_list);
2297
2298                 /* gnp_connecting and membership on gnd_connd_peers should be
2299                  * done coherently to avoid double adding, etc */
2300                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2301                  * to get the peer to gnp_connecting in the first place. We just need to
2302                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2303                  * BEFORE clearing gnp_connecting */
2304                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2305                          peer, libcfs_nid2str(peer->gnp_nid));
2306
2307                 spin_unlock(&dev->gnd_connd_lock);
2308
2309                 CDEBUG(D_NET, "processing connect to %s\n",
2310                        libcfs_nid2str(peer->gnp_nid));
2311
2312                 did_something += 1;
2313                 rc = kgnilnd_start_connect(peer);
2314
2315                 if (likely(rc >= 0)) {
2316                         /* 0 on success, positive on 'just drop peer' errors */
2317                         kgnilnd_peer_decref(peer);
2318                 } else if (rc == -ENOMEM) {
2319                         /* if we are out of wildcards, add back to
2320                          * connd_list - then break out and we'll try later;
2321                          * if other errors, we'll bail & cancel pending tx */
2322                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2323                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2324                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2325                                 spin_lock(&dev->gnd_connd_lock);
2326                                 list_add_tail(&peer->gnp_connd_list,
2327                                               &dev->gnd_connd_peers);
2328                         } else {
2329                                 /* connecting changed while we were posting */
2330
2331                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2332                                         " state 0x%p->%s, connecting %d\n",
2333                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2334                                 peer->gnp_connecting = GNILND_PEER_KILL;
2335                                 spin_lock(&dev->gnd_connd_lock);
2336                                 /* remove the peer ref from the connd list */
2337                                 kgnilnd_peer_decref(peer);
2338                                 /* let the system handle itself */
2339                         }
2340                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2341                         /* the datagrams are a global pool,
2342                          * so break out of trying and hope some free
2343                          * up soon */
2344                         did_something -= 1;
2345                         break;
2346                 } else {
2347                         /* something bad happened, you lose */
2348                         CNETERR("could not start connecting to %s "
2349                                 "rc %d: Will retry until TX timeout\n",
2350                                libcfs_nid2str(peer->gnp_nid), rc);
2351                         /* It didn't post, so just set connecting back to zero now.
2352                          * The reaper will reattempt the connection if it needs to.
2353                          * If the peer needs death, set it so the reaper will clean up.
2354                          */
2355                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2356                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2357                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2358                                 kgnilnd_peer_increase_reconnect_locked(peer);
2359                         } else {
2360                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2361                                         " state 0x%p->%s, connecting %d\n",
2362                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2363                                 peer->gnp_connecting = GNILND_PEER_KILL;
2364                         }
2365                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2366
2367                         /* hold onto ref until we are really done - if it was
2368                          * unlinked this could result in a destroy */
2369                         kgnilnd_peer_decref(peer);
2370                 }
2371                 spin_lock(&dev->gnd_connd_lock);
2372         }
2373
2374         spin_unlock(&dev->gnd_connd_lock);
2375         RETURN(did_something);
2376 }
2377
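/* Repost wildcard datagrams whose earlier repost failed, as tracked by
 * gnd_nwcdgrams; returns the number of successful reposts. */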
2378 int
2379 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2380 {
2381         int did_something = 0, to_repost, i;
2382         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2383         ENTRY;
2384
2385         for (i = 0; i < to_repost; ++i) {
2386                 int     rerc;
2387                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2388                 if (rerc == 0) {
2389                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2390                         did_something += 1;
2391                 } else {
2392                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2393                                 rerc, dev->gnd_id);
2394                         break;
2395                 }
2396         }
2397
2398         RETURN(did_something);
2399 }
2400
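/* timer callback to wake the dgram mover if nobody else has poked it */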
2401 static void
2402 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2403 {
2404         int             dev_id = (int)arg;
2405         kgn_device_t    *dev = &kgnilnd_data.kgn_devices[dev_id];
2406
2407         wake_up(&dev->gnd_dgram_waitq);
2408 }
2409
2410 /* use single thread for dgrams - should be sufficient for performance */
2411 int
2412 kgnilnd_dgram_mover(void *arg)
2413 {
2414         kgn_device_t            *dev = (kgn_device_t *)arg;
2415         char                     name[16];
2416         int                      rc, did_something;
2417         unsigned long            next_purge_check = jiffies - 1;
2418         unsigned long            timeout;
2419         struct timer_list        timer;
2420         unsigned long            deadline = 0;
2421         DEFINE_WAIT(wait);
2422
2423         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2424         cfs_block_allsigs();
2425         /* all gnilnd threads need to run fairly urgently */
2426         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2427
2428         /* we are ok not locking for these variables as the dgram waitq threads
2429          * will block both due to tying up net (kgn_shutdown) and the completion
2430          * event for the dgram_waitq (kgn_quiesce_trigger) */
2431         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2432         while (!kgnilnd_data.kgn_shutdown) {
2433                 /* Safe: kgn_shutdown only set when quiescent */
2434
2435                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2436                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2437                  * quiesce check so that it'll go right into that and not do any
2438                  * dgram mucking */
2439                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2440
2441                 /* to quiesce or to not quiesce, that is the question */
2442                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2443                         KGNILND_SPIN_QUIESCE;
2444                 }
2445                 did_something = 0;
2446
2447                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2448
2449                 /* process any newly completed dgrams */
2450                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2451
2452                 rc = kgnilnd_probe_and_process_dgram(dev);
2453                 if (rc > 0) {
2454                         did_something += rc;
2455                 }
2456
2457                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2458
2459                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2460                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2461                 /* start new outbound dgrams */
2462                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2463
2464                 /* find dead dgrams */
2465                 if (time_after_eq(jiffies, next_purge_check)) {
2466                         /* these don't need to be checked that often */
2467                         kgnilnd_reaper_dgram_check(dev);
2468
2469                         next_purge_check = (long) jiffies +
2470                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2471                 }
2472
2473                 did_something += kgnilnd_repost_wc_dgrams(dev);
2474
2475                 /* careful with the jiffy wrap... */
2476                 timeout = (long)(next_purge_check - jiffies);
2477
2478                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2479                        did_something, timeout, next_purge_check, jiffies);
2480
2481                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2482                         did_something = 0;
2483                         continue;
2484                 }
2485
2486                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2487
2488                 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2489                 mod_timer(&timer, (long) jiffies + timeout);
2490
2491                 /* last second chance for others to poke us */
2492                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2493
2494                 /* check flag variables before committing, even if we did something;
2495                  * if we are after the deadline, call schedule */
2496                 if ((!did_something || time_after(jiffies, deadline)) &&
2497                     !kgnilnd_data.kgn_shutdown &&
2498                     !kgnilnd_data.kgn_quiesce_trigger) {
2499                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2500                                timeout, cfs_duration_sec(timeout));
2501                         wake_up_all(&dev->gnd_dgping_waitq);
2502                         schedule();
2503                         CDEBUG(D_INFO, "awake after schedule\n");
2504                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2505                 }
2506
2507                 del_singleshot_timer_sync(&timer);
2508                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2509         }
2510
2511         kgnilnd_thread_fini();
2512         return 0;
2513 }