2 * Copyright (C) 2012 Cray, Inc.
4 * Author: Igor Gorodetsky <iogordet@cray.com>
5 * Author: Nic Henke <nic@cray.com>
6 * Author: James Shimek <jshimek@cray.com>
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Fill in the SMSG mailbox attributes common to every connection: the
 * per-mailbox credit count (from the kgn_mbox_credits tunable), the maximum
 * message size, and the auto-retransmit mailbox type.  The per-connection
 * fields (msg_buffer, mbox_offset, mem_hndl, buff_size) are filled in later
 * by the callers (see kgnilnd_find_free_mbox).
 * NOTE(review): this view is missing lines (return type, braces) - the
 * code below is only what survived extraction. */
28 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
30 smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
31 smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
32 smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
/* Register an FMA memory block with the GNI device so it can back SMSG
 * mailboxes.  PHYS blocks are registered with GNI_MEM_PHYS_CONT; on success
 * the device accounting (bytes mapped, MDD count, live-block count) is
 * bumped.  Returns an error on registration failure (the failure return
 * itself is elided from this view).
 * NOTE(review): declarations of rrc and several lines are missing here. */
36 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
39 __u32 flags = GNI_MEM_READWRITE;
41 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
42 flags |= GNI_MEM_PHYS_CONT;
/* a non-zero handle would mean this block is already registered */
45 /* make sure we are mapping a clean block */
46 LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
48 rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
49 fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
50 flags, &fma_blk->gnm_hndl);
51 if (rrc != GNI_RC_SUCCESS) {
52 /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
53 * -- like when under MDD or GART pressure on big systems
55 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
56 fma_blk, fma_blk->gnm_mbox_size, flags);
60 /* PHYS_CONT memory isn't really mapped, at least not in GART -
61 * but all mappings chew up an MDD
63 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
64 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
/* every registration consumes a Memory Domain Descriptor */
67 atomic_inc(&device->gnd_n_mdd);
68 /* nfmablk is live (mapped) blocks */
69 atomic_inc(&device->gnd_nfmablk);
/* Allocate a new FMA memory block full of SMSG mailboxes for @device.
 * use_phys selects physically-contiguous memory from the mbox slab cache
 * (startup preallocation only) versus a virtual LIBCFS allocation for
 * runtime growth.  Access is serialized with gnd_fmablk_sem plus a
 * version counter so racing threads re-check the lists instead of all
 * allocating blocks.
 * NOTE(review): local declarations (rc, num_mbox), returns, the
 * if (use_phys) branch lines, and the error-path labels/frees between the
 * success path and the cleanup code at the bottom are elided in this view. */
75 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
79 kgn_fma_memblock_t *fma_blk;
80 gni_smsg_attr_t smsg_attr;
81 unsigned long fmablk_vers;
83 /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
84 * to this allocation code. Everyone will sample the version
85 * before and after getting the semaphore. If it has changed,
86 * we'll bail out to check the lists again - this indicates that
87 * some sort of change was made to the lists and it is possible
88 * that there is a mailbox for us to find now. This should prevent
89 * a ton of spinning in the case where there are lots of threads
90 * that need a yet-to-be-allocated mailbox for a connection. */
92 fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
93 down(&device->gnd_fmablk_sem);
95 if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
96 /* version changed while we were waiting for semaphore,
97 * we'll recheck the lists assuming something nice happened */
98 up(&device->gnd_fmablk_sem);
102 LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
103 if (fma_blk == NULL) {
104 CNETERR("could not allocate fma block descriptor\n");
109 INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
/* base SMSG attributes determine how big each mailbox must be */
111 kgnilnd_setup_smsg_attr(&smsg_attr);
113 gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
115 LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
117 /* gni_smsg_buff_size_needed calculates the base mailbox size and since
118 * we want to hold kgn_peer_credits worth of messages in both directions,
119 * we add PAYLOAD to grow the mailbox size
122 fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
124 /* we'll only use physical during preallocate at startup -- this keeps it nice and
125 * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
126 * as reallocating them is tough if there is memory fragmentation */
/* PHYS branch: slab-cache block of fixed KMALLOC_MAX_SIZE; mailbox count
 * is derived from block size / mailbox size */
129 fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
130 if (fma_blk->gnm_block == NULL) {
131 CNETERR("could not allocate physical SMSG mailbox memory\n");
135 fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
136 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
138 LASSERTF(num_mbox >= 1,
139 "num_mbox %d blk_size %u mbox_size %d\n",
140 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
142 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
/* VIRT branch: mailbox count comes from the tunable and block size is
 * derived from it */
145 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
146 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
148 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
149 "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
150 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
151 *kgnilnd_tunables.kgn_mbox_per_block);
153 LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
154 if (fma_blk->gnm_block == NULL) {
155 CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
160 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
163 /* allocate just enough space for the bits to track the mailboxes */
164 LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
165 if (fma_blk->gnm_bit_array == NULL) {
166 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
167 sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
171 bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
173 /* now that the num_mbox is set based on allocation type, get debug info setup */
174 LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
175 if (fma_blk->gnm_mbox_info == NULL) {
176 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
177 sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
/* register the block with GNI before publishing it on the device list */
182 rc = kgnilnd_map_fmablk(device, fma_blk);
187 fma_blk->gnm_next_avail_mbox = 0;
188 fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
190 CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
191 "mbox_size %d MDD "LPX64"."LPX64"\n",
192 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
193 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
194 fma_blk->gnm_hndl.qword2);
196 /* lock is protecting data structures, not semaphore */
198 spin_lock(&device->gnd_fmablk_lock);
199 list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
201 /* toggle under the lock so once they change the list is also
202 * ready for others to traverse */
203 atomic_inc(&device->gnd_fmablk_vers);
205 spin_unlock(&device->gnd_fmablk_lock);
207 up(&device->gnd_fmablk_sem);
/* error unwind: free in reverse order of allocation; which frees run
 * depends on the (elided) goto labels above */
212 LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
214 LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
216 if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
217 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
/* PHYS blocks go back to the slab cache, not LIBCFS_FREE */
219 cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
222 LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
224 up(&device->gnd_fmablk_sem);
/* Deregister an FMA block's memory from GNI, passing a hold timeout when
 * connections still hold mailboxes in purgatory (outside shutdown).  Adjusts
 * MDD and mapped-bytes accounting; during stack reset a PHYS block's handle
 * is zeroed so the same block can be re-registered afterwards.
 * NOTE(review): the rrc declaration and some braces are elided here. */
229 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
233 /* if some held, set hold_timeout from conn timeouts used in this block
234 * but not during shutdown, then just nuke and pave */
235 if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
236 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
239 /* we are changing the state of a block, tickle version to tell
240 * proc code list is stale now */
241 atomic_inc(&dev->gnd_fmablk_vers);
243 rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
/* only escalate to console+neterror if the deregister failed */
245 CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
246 "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
248 fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
249 fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
250 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
251 fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
253 LASSERTF(rrc == GNI_RC_SUCCESS,
254 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
/* with a hold timeout the MDD lingers as "held"; otherwise it is gone */
257 if (fma_blk->gnm_hold_timeout) {
258 atomic_inc(&dev->gnd_n_mdd_held);
260 atomic_dec(&dev->gnd_n_mdd);
263 /* PHYS blocks don't get mapped */
264 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
265 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
266 } else if (kgnilnd_data.kgn_in_reset) {
267 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
268 * re-use the fma_blk after reset so we don't have to drop/allocate
269 * all of those physical blocks */
270 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
273 /* Decrement here as this is the # of mapped blocks */
274 atomic_dec(&dev->gnd_nfmablk);
278 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
/* Free an FMA block whose mailboxes are all available again: release any
 * held MDD (except across stack reset, where it deliberately dangles),
 * free the mailbox memory (slab cache for PHYS, LIBCFS for VIRT), unlink
 * from the device list, and free the tracking structures. */
280 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
282 LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
283 "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
284 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
285 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
/* list is changing - let /proc readers know their snapshot is stale */
287 atomic_inc(&dev->gnd_fmablk_vers);
289 if (fma_blk->gnm_hold_timeout) {
290 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
292 fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
293 fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
295 /* We leave MDD dangling over stack reset */
296 if (!kgnilnd_data.kgn_in_reset) {
297 kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
299 /* ignoring the return code - if kgni/ghal can't find it
300 * it must be released already */
301 atomic_dec(&dev->gnd_n_mdd_held);
302 atomic_dec(&dev->gnd_n_mdd);
305 /* we can't free the gnm_block until all the conns have released their
306 * purgatory holds. While we have purgatory holds, we might check the conn
307 * RX mailbox during the CLOSING process. It is possible that kgni might
308 * try to look into the RX side for credits when sending the CLOSE msg too */
309 CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
310 fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
312 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
313 cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
315 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
/* mark FREED before unlinking, for anyone holding a stale pointer */
317 fma_blk->gnm_state = GNILND_FMABLK_FREED;
319 list_del(&fma_blk->gnm_bufflist);
321 LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
322 LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
323 LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
/* Find and claim a free SMSG mailbox for @conn by scanning the device's FMA
 * block list under gnd_fmablk_lock.  On success the conn's smsg attributes
 * are pointed at the mailbox, the mailbox is zeroed for re-use, and the
 * block's bookkeeping (bitmap, avail count, next-avail hint) is updated.
 * NOTE(review): the bare 'id' declaration, loop braces, and some continue/
 * break statements are elided in this view. */
327 kgnilnd_find_free_mbox(kgn_conn_t *conn)
329 kgn_device_t *dev = conn->gnc_device;
330 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
331 kgn_fma_memblock_t *fma_blk;
332 kgn_mbox_info_t *mbox = NULL;
335 spin_lock(&dev->gnd_fmablk_lock);
337 list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
/* skip blocks with nothing free or not in a usable (mapped) state */
339 if (fma_blk->gnm_avail_mboxs <= 0 ||
340 fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
343 /* look in bitarray for available mailbox */
345 id = find_next_zero_bit(
346 fma_blk->gnm_bit_array,
347 fma_blk->gnm_num_mboxs,
348 fma_blk->gnm_next_avail_mbox);
/* search started mid-array and wrapped without a hit: retry from 0 */
349 if (id == fma_blk->gnm_num_mboxs &&
350 fma_blk->gnm_next_avail_mbox != 0) {
352 fma_blk->gnm_next_avail_mbox = 0;
358 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
359 id, fma_blk->gnm_num_mboxs);
360 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
361 conn->gnc_mbox_id = id;
363 fma_blk->gnm_next_avail_mbox =
364 (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
365 fma_blk->gnm_avail_mboxs--;
366 conn->gnc_fma_blk = fma_blk;
368 kgnilnd_setup_smsg_attr(smsg_attr);
370 smsg_attr->msg_buffer = fma_blk->gnm_block;
371 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
372 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
373 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
375 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
376 * reset and re-use the same fma_blk after stack reset. This ensures we've
377 * properly mapped it before we use it */
378 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
379 fma_blk, fma_blk->gnm_state);
381 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
382 "allocating SMSG mbox %d buf %p "
383 "offset %u hndl "LPX64"."LPX64"\n",
384 conn, smsg_attr, fma_blk, id,
385 smsg_attr->msg_buffer, smsg_attr->mbox_offset,
386 fma_blk->gnm_hndl.qword1,
387 fma_blk->gnm_hndl.qword2);
/* stamp the debug info so we can see when this mailbox was last claimed */
389 mbox = &fma_blk->gnm_mbox_info[id];
390 mbox->mbx_create_conn_memset = jiffies;
392 /* zero mbox to remove any old data from our last use.
393 * this better be safe, if not our purgatory timers
394 * are too short or a peer really is misbehaving */
395 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
396 0, smsg_attr->buff_size);
400 spin_unlock(&dev->gnd_fmablk_lock);
/* Attach an SMSG mailbox to @conn: repeatedly try to find a free mailbox in
 * the existing FMA blocks and, when none is available, allocate a new
 * (virtual) block and retry.  msg_buffer == NULL is used as the "no mailbox
 * yet" sentinel for the loop.
 * NOTE(review): the err declaration, loop keyword, the error break on a
 * failed allocation, and the return are elided from this view. */
404 kgnilnd_setup_mbox(kgn_conn_t *conn)
406 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
409 smsg_attr->msg_buffer = NULL;
410 /* Look for available mbox */
412 kgnilnd_find_free_mbox(conn);
414 /* nothing in the existing buffers, make a new one */
415 if (smsg_attr->msg_buffer == NULL) {
416 /* for runtime allocations, we only want vmalloc */
417 err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
422 } while (smsg_attr->msg_buffer == NULL);
425 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
/* Release (or purgatory-hold) the SMSG mailbox attached to @conn.
 * purgatory_hold semantics (see inline comment): >0 holds the mailbox for a
 * purgatory period, 0 frees it outright, <0 releases a previous hold.  When
 * the block empties out it is unmapped, and freed once every mailbox is
 * available again.  PHYS blocks are exempt from the unmap/free cycle; those
 * are managed only at startup/shutdown.
 * NOTE(review): declarations of id/found, early-return braces, and several
 * closing braces are elided in this view. */
431 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
433 kgn_device_t *dev = conn->gnc_device;
434 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
435 kgn_fma_memblock_t *fma_blk = NULL;
436 kgn_mbox_info_t *mbox = NULL;
440 /* if we failed to setup mbox and now destroying conn */
441 if (smsg_attr->msg_buffer == NULL) {
445 id = conn->gnc_mbox_id;
447 spin_lock(&dev->gnd_fmablk_lock);
448 /* make sure our conn points at a valid fma_blk
449 * We use this instead of a mem block search out of smsg_attr
450 * because we could have freed a block for fma_blk #1 but the fma_blk
451 * is still in the list for a purgatory hold. This would induce a false
452 * match if that same block gets reallocated to fma_blk #2 */
453 list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
454 if (fma_blk == conn->gnc_fma_blk) {
459 LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
460 "anywhere in the world\n", conn, conn->gnc_fma_blk);
462 LASSERTF(id < fma_blk->gnm_num_mboxs,
463 "bad id %d max %d\n",
464 id, fma_blk->gnm_num_mboxs);
466 /* < 0 - was held, now free it
467 * == 0 - just free it
468 * > 0 - hold it for now */
469 if (purgatory_hold == 0) {
470 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
471 "hndl "LPX64"."LPX64"\n",
472 conn, smsg_attr, fma_blk, id,
473 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
474 fma_blk->gnm_avail_mboxs++;
476 } else if (purgatory_hold > 0) {
477 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
478 "hndl "LPX64"."LPX64"\n",
479 conn, smsg_attr, fma_blk, id,
480 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
482 fma_blk->gnm_held_mboxs++;
/* remember the longest conn timeout seen in this block; it seeds the
 * hold_timeout used when the block is later unmapped */
483 fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
486 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
487 "hndl "LPX64"."LPX64"\n",
488 conn, smsg_attr, fma_blk, id,
489 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
/* purgatory_hold < 0: a previously held mailbox becomes available */
491 fma_blk->gnm_held_mboxs--;
492 fma_blk->gnm_avail_mboxs++;
495 if (purgatory_hold <= 0) {
496 /* if kgni is retransmitting, freeing the smsg block before the EP
497 * is destroyed gets messy. Bug 768295. */
498 LASSERTF(conn->gnc_ephandle == NULL,
499 "can't release mbox before EP is nuked. conn 0x%p\n", conn);
501 mbox = &fma_blk->gnm_mbox_info[id];
502 mbox->mbx_release_from_purgatory = jiffies;
504 /* clear conn gnc_fmablk if it is gone - this allows us to
505 * not worry about state so much in kgnilnd_destroy_conn
506 * and makes the guaranteed cleanup of the resources easier */
507 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
508 "conn %p bit %d already cleared in fma_blk %p\n",
510 conn->gnc_fma_blk = NULL;
/* fault injection: pretend an unmapped block is still mapped */
513 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
514 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
515 "as mapped\n", fma_blk);
516 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
519 /* we don't release or unmap PHYS blocks as part of the normal cycle --
520 * those are controlled manually from startup/shutdown */
521 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
522 /* we can unmap once all are unused (held or avail)
523 * but check hold_timeout to make sure we are not trying to double
524 * unmap this buffer. If there was no hold_timeout set due to
525 * held_mboxs, we'll free the mbox here shortly and won't have to
526 * worry about catching a double free for a 'clean' fma_blk */
527 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
528 (!fma_blk->gnm_hold_timeout)) {
529 kgnilnd_unmap_fmablk(dev, fma_blk);
532 /* But we can only free once they are all avail */
533 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
534 fma_blk->gnm_held_mboxs == 0) {
535 /* all mailboxes are released, free fma_blk */
536 kgnilnd_free_fmablk_locked(dev, fma_blk);
540 spin_unlock(&dev->gnd_fmablk_lock);
/* Count the total number of mailboxes in PHYS FMA blocks on @device,
 * scanning the block list under gnd_fmablk_lock.
 * NOTE(review): the declaration/init of the accumulator 'i' and the return
 * are elided in this view. */
544 kgnilnd_count_phys_mbox(kgn_device_t *device)
547 kgn_fma_memblock_t *fma_blk;
549 spin_lock(&device->gnd_fmablk_lock);
551 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
552 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
553 i += fma_blk->gnm_num_mboxs;
555 spin_unlock(&device->gnd_fmablk_lock);
/* Startup preallocation: keep allocating PHYS FMA blocks until the device
 * has at least kgn_nphys_mbox physical mailboxes, stopping early (with an
 * error report) if an allocation fails.
 * NOTE(review): the rc declaration and loop/return tail are elided here. */
561 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
565 while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
567 rc = kgnilnd_alloc_fmablk(device, 1);
569 CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
570 kgnilnd_count_phys_mbox(device), rc);
/* (Re-)register every PHYS FMA block with GNI - used around stack reset,
 * where PHYS blocks are kept but deregistered.  Serialized by the fmablk
 * semaphore and the list walked under gnd_fmablk_lock.
 * NOTE(review): the rc declaration, an early break on map failure, and the
 * return are elided in this view. */
578 kgnilnd_map_phys_fmablk(kgn_device_t *device)
582 kgn_fma_memblock_t *fma_blk;
584 /* use sem to gate access to single thread, just in case */
585 down(&device->gnd_fmablk_sem);
587 spin_lock(&device->gnd_fmablk_lock);
589 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
590 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
591 rc = kgnilnd_map_fmablk(device, fma_blk);
595 spin_unlock(&device->gnd_fmablk_lock);
597 up(&device->gnd_fmablk_sem);
/* Deregister every PHYS FMA block from GNI (counterpart of
 * kgnilnd_map_phys_fmablk) - used at shutdown / stack reset.  Serialized by
 * the fmablk semaphore; the list is walked under gnd_fmablk_lock. */
603 kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
606 kgn_fma_memblock_t *fma_blk;
608 /* use sem to gate access to single thread, just in case */
609 down(&device->gnd_fmablk_sem);
611 spin_lock(&device->gnd_fmablk_lock);
613 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
614 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
615 kgnilnd_unmap_fmablk(device, fma_blk);
617 spin_unlock(&device->gnd_fmablk_lock);
619 up(&device->gnd_fmablk_sem);
/* Free every PHYS FMA block at shutdown.  Uses the _safe list iterator
 * because kgnilnd_free_fmablk_locked deletes the entry from the list.
 * Serialized by the fmablk semaphore; list walked under gnd_fmablk_lock. */
623 kgnilnd_free_phys_fmablk(kgn_device_t *device)
626 kgn_fma_memblock_t *fma_blk, *fma_blkN;
628 /* use sem to gate access to single thread, just in case */
629 down(&device->gnd_fmablk_sem);
631 spin_lock(&device->gnd_fmablk_lock);
633 list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
634 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
635 kgnilnd_free_fmablk_locked(device, fma_blk);
637 spin_unlock(&device->gnd_fmablk_lock);
639 up(&device->gnd_fmablk_sem);
642 /* kgnilnd dgram nid->struct management */
/* Hash a NID to its bucket in the device's datagram hash table.  Uses the
 * low 32 bits of the NID modulo the peer hash size tunable.
 * NOTE(review): the body braces are elided in this view. */
644 static inline struct list_head *
645 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
647 unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
649 RETURN(&dev->gnd_dgrams[hash]);
653 /* needs dev->gnd_dgram_lock held */
/* Look up an outstanding (still-POSTED) datagram to @dst_nid in its hash
 * bucket.  Datagrams past POSTED are skipped since they are already in
 * cancel/completion handling.
 * NOTE(review): the dgram declaration, the skip 'continue', and the return
 * statements are elided in this view. */
655 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
657 struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
660 list_for_each_entry(dgram, dgram_list, gndg_list) {
662 /* if state > POSTED, we are already handling cancel/completion */
663 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
664 dgram->gndg_state > GNILND_DGRAM_POSTED)
667 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
668 dgram, libcfs_nid2str(dst_nid));
/* Find any posted datagram for @dst_nid and cancel it under the dgram lock.
 * The RETURN expression yields truth of (dgram == NULL), i.e. nonzero when
 * nothing was found to cancel.
 * NOTE(review): the dgram declaration and the NULL check around the cancel
 * call are elided in this view. */
675 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
679 spin_lock(&dev->gnd_dgram_lock);
680 dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
683 kgnilnd_cancel_dgram_locked(dgram);
685 spin_unlock(&dev->gnd_dgram_lock);
687 RETURN(!!(dgram == NULL));
/* Build the wire-format connection request @connreq for @conn of the given
 * @type.  Several CFS_FAIL_CHECK hooks deliberately corrupt individual
 * fields (version, type, stamps, timeout) for protocol-error testing.  For
 * a REQ, this also allocates the SMSG mailbox and copies its attributes
 * into the payload.
 * NOTE(review): the err declaration, the fault-injection assignments under
 * the first two CFS_FAIL_CHECKs, else branches, and the error return after
 * a failed mailbox setup are elided in this view. */
691 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
692 lnet_nid_t srcnid, lnet_nid_t dstnid,
693 kgn_connreq_type_t type)
697 /* ensure we haven't violated max datagram size */
698 CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
700 /* no need to zero out, we do that when allocating dgram */
701 connreq->gncr_magic = GNILND_MSG_MAGIC;
703 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
705 } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
709 connreq->gncr_srcnid = srcnid;
710 connreq->gncr_dstnid = dstnid;
/* each CONNREQ_PROTO failure hook below poisons one field to exercise the
 * peer's validation path */
712 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
713 connreq->gncr_version = 99;
715 connreq->gncr_version = GNILND_CONNREQ_VERSION;
717 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
718 connreq->gncr_type = 99;
720 connreq->gncr_type = type;
722 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
723 connreq->gncr_peerstamp = 0;
725 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
727 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
728 connreq->gncr_connstamp = 0;
730 connreq->gncr_connstamp = conn->gnc_my_connstamp;
732 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
733 connreq->gncr_timeout = 0;
735 connreq->gncr_timeout = conn->gnc_timeout;
738 /* the rest pack the data into the payload in other places */
739 if (type == GNILND_CONNREQ_REQ) {
740 kgn_gniparams_t *req_params = &connreq->gncr_gnparams;
741 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
742 req_params->gnpr_cqid = conn->gnc_cqid;
744 /* allocate mailbox for this connection */
745 err = kgnilnd_setup_mbox(conn);
747 CERROR("Failed to setup FMA mailbox (%d)\n", err);
749 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
752 /* XXX Nic: TBD - checksum computation */
/* Validate and byte-swap an incoming connection request held in
 * dgram->gndg_conn_in.  Handles both byte orders via the magic value,
 * repairs srcnid on active (non-wildcard) datagrams so a proper peer lookup
 * is possible even for corrupt packets, validates dstnid against our nets,
 * checks version/type/timestamps/timeout, and for a REQ wires up connection
 * parameters via kgnilnd_set_conn_params.
 * NOTE(review): declarations (swab, rc, net), most error returns, and
 * closing braces are elided in this view. */
758 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
760 kgn_connreq_t *connreq = &dgram->gndg_conn_in;
764 /* the following fields must be handled in a backwards compatible
765 * manner to ensure we can always send and interpret NAKs */
767 if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
768 connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
769 /* Unexpected magic! */
770 CERROR("Unexpected magic %08x\n",
771 connreq->gncr_magic);
/* peer has opposite endianness: swap the fixed header fields */
775 swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
777 __swab32s(&connreq->gncr_magic);
778 __swab32s(&connreq->gncr_cksum);
779 __swab16s(&connreq->gncr_type);
780 __swab16s(&connreq->gncr_version);
781 __swab32s(&connreq->gncr_timeout);
782 __swab64s(&connreq->gncr_srcnid);
783 __swab64s(&connreq->gncr_dstnid);
784 __swab64s(&connreq->gncr_peerstamp);
785 __swab64s(&connreq->gncr_connstamp);
788 /* Do NOT return anything but -EBADF before we munge
789 * connreq->gncr_srcnid - we need that to send the nak */
791 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
792 lnet_nid_t incoming = connreq->gncr_srcnid;
794 /* even if the incoming packet is hosed, we know who we sent
795 * the original and can set the srcnid so that we can properly
796 * look up our peer to close the loop on this connreq. We still use
797 * -EBADF to prevent a NAK - just in case there are issues with
798 * the payload coming from a random spot, etc. */
799 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
801 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
802 LNET_NIDADDR(incoming)) {
803 /* we got a datagram match for the wrong nid... */
804 CERROR("matched datagram 0x%p with srcnid %s "
805 "(%x), expecting %s (%x)\n",
807 libcfs_nid2str(incoming),
808 LNET_NIDADDR(incoming),
809 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
810 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
814 /* if we have a wildcard datagram it should match an
815 * incoming "active" datagram that should have a fully formed
816 * srcnid and dstnid. If we couldn't unpack it, we drop as
817 * corrupted packet, otherwise we'll just verify that the dstnid
818 * matches the NID for the NET that the dgram was posted */
820 /* make sure their wildcard didn't match ours, that is impossible */
821 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
822 "dgram 0x%p from %s, connreq 0x%p; "
823 "wildcard matched wildcard \n", dgram,
824 libcfs_nid2str(connreq->gncr_srcnid), connreq);
826 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
828 if (rc == -ESHUTDOWN) {
829 CERROR("Looking up network: device is in shutdown");
831 } else if (rc == -ENONET) {
832 CERROR("Connection data from %s: she sent "
833 "dst_nid %s, but net lookup failed on "
835 libcfs_nid2str(connreq->gncr_srcnid),
836 libcfs_nid2str(connreq->gncr_dstnid),
837 dgram, kgnilnd_dgram_type2str(dgram));
841 if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
842 CERROR("Bad connection data from %s: she sent "
843 "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
844 libcfs_nid2str(connreq->gncr_srcnid),
845 libcfs_nid2str(connreq->gncr_dstnid),
846 libcfs_nid2str(net->gnn_ni->ni_nid),
847 dgram, kgnilnd_dgram_type2str(dgram));
848 kgnilnd_net_decref(net);
852 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
853 kgnilnd_net_decref(net);
856 if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
857 CERROR("Unexpected version %d\n", connreq->gncr_version);
861 /* XXX Nic: TBD - checksum validation */
862 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
/* swap the type-specific payload; only REQ carries SMSG params, only NAK
 * carries an errno */
866 if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
867 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
869 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
870 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
871 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
872 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
873 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
874 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
875 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
876 __swab64s(&msg_addr);
877 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
878 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
879 } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
880 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
883 /* since we use a unique instance ID for each network, the driver
884 * will take care of dropping datagrams if we don't have that network.
887 /* few more idiot software or configuration checks */
889 switch (connreq->gncr_type) {
890 case GNILND_CONNREQ_REQ:
891 /* wire up EP and SMSG block - this will check the incoming data
892 * and barf a NAK back if need to */
893 rc = kgnilnd_set_conn_params(dgram);
897 case GNILND_CONNREQ_NAK:
898 case GNILND_CONNREQ_CLOSE:
901 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
905 if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
906 CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
907 connreq->gncr_peerstamp, connreq->gncr_connstamp);
911 if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
912 CERROR("Received timeout %d < MIN %d\n",
913 connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
/* Allocate and initialize a datagram descriptor of @type for @dev from the
 * dgram slab cache, returning it via @dgramp.  The descriptor is zeroed
 * (cache memory is not) and stamped with the dgram magic; the device's
 * outstanding-dgram count is bumped.
 * NOTE(review): the dgram declaration, the alloc flags argument, the NULL
 * check, and the returns are elided in this view. */
921 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
925 dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
930 /* cache alloc'd memory is not zeroed */
931 memset((void *)dgram, 0, sizeof(*dgram)) ;
933 INIT_LIST_HEAD(&dgram->gndg_list);
934 dgram->gndg_state = GNILND_DGRAM_USED;
935 dgram->gndg_type = type;
936 dgram->gndg_magic = GNILND_DGRAM_MAGIC;
938 atomic_inc(&dev->gnd_ndgrams);
940 CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
941 sizeof(*dgram), dgram);
947 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
948 * returns < 0 on dgram to be cleaned up
949 * > 0 on dgram that isn't done yet
950 * == 0 on dgram that is ok and needs connreq processing */
/* Classify a datagram by the GNI post_state returned from the test call
 * and translate it into the rc convention documented above.  TERMINATED
 * and TIMEOUT fall through to cleanup; CANCELED dgrams drop the device's
 * canceled-dgram count here.
 * NOTE(review): the rc declaration, the rc assignments in several cases,
 * break statements, and the returns are elided in this view. */
952 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
956 switch (post_state) {
957 case GNI_POST_COMPLETED:
958 /* normal state for dgrams that need actual processing */
959 /* GOTO to avoid processing dgram as canceled/done */
960 GOTO(process_out, rc);
962 case GNI_POST_PENDING:
963 /* we should only see this if we are testing a WC dgram after a
964 * cancel - it means that it needs a full cycle of waiting
965 * for kgni_sm_task to finish moving it to TERMINATED */
966 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
967 (dgram->gndg_state == GNILND_DGRAM_CANCELED),
968 "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
969 dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
970 dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
972 /* positive RC as this dgram isn't done yet */
975 /* GOTO as this isn't done yet */
976 GOTO(process_out, rc);
979 case GNI_POST_TERMINATED:
980 /* we've called cancel and it is done or remote guy called cancel and
981 * we've received it on a WC dgram */
983 /* we are seeing weird terminations on non WC dgrams when we have not
986 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
987 dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
988 "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
989 dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
990 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
993 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
994 dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");
999 case GNI_POST_TIMEOUT:
1000 /* we could have a timeout on a wildcard dgram too - if
1001 * we got the incoming request but the remote node beefed
1002 * before kgni could send the match data back. We'll just error
1003 * on the active case and bail out gracefully */
1004 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1005 CNETERR("hardware timeout for connect to "
1006 "%s after %lu seconds. Is node dead?\n",
1007 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1008 cfs_duration_sec(jiffies - dgram->gndg_post_time));
1015 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1019 /* now finish cleaning up a dgram that is canceled/terminated and needs to
1022 /* If this was actively canceled, drop the count now that we are processing */
1023 if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1024 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1025 /* caller responsible for gndg_list removal */
1033 /* needs dev->gnd_dgram_lock held */
/* Cancel a POSTED datagram.  Marks it CANCELED, issues the GNI cancel by
 * id, and for wildcard (WC_REQ) datagrams immediately probes with
 * test_by_id: NO_MATCH means the cancel completed at once, SUCCESS means
 * the post_state must be processed (possibly still pending).  Non-WC
 * datagrams stay on the nid list until kgni reports their termination via
 * probe.
 * NOTE(review): declarations (grc, rc), early return on non-POSTED state,
 * the skip of cancel_by_id for established conns, and several
 * returns/braces are elided in this view. */
1035 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1039 if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1043 LASSERTF(dgram->gndg_conn != NULL,
1044 "dgram 0x%p with NULL conn\n", dgram);
1046 /* C.E - WC dgrams could be canceled immediately but
1047 * if there was some match pending, we need to call
1048 * test_by_id to clear it out. If that test returns
1049 * POST_PENDING, it is half done and needs to go along
1050 * with the rest of dgrams and go through a kgni_sm_task cycle
1051 * and deliver a GNI_POST_TERMINATED event before they
1052 * are actually canceled */
1054 dgram->gndg_state = GNILND_DGRAM_CANCELED;
1056 if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1057 /* we don't need to cancel_by_id if the datagram was good */
1061 /* let folks know there are outstanding cancels */
1062 atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1063 /* leave on nid list until cancel is done for debugging fun */
1064 grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1066 /* if we don't get success here, we have hosed up the dgram tracking
1067 * code and need to bail out */
1068 LASSERTF(grc == GNI_RC_SUCCESS,
1069 "postdata_cancel returned %d for conn 0x%p to %s\n",
1070 grc, dgram->gndg_conn,
1071 dgram->gndg_conn->gnc_peer ?
1072 libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1076 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1077 dgram, dgram->gndg_conn,
1078 dgram->gndg_conn->gnc_ephandle);
1080 if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1081 gni_post_state_t post_state;
1083 __u32 remote_addr = 0, remote_id = 0;
1085 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1086 (__u64)dgram, &post_state,
1087 &remote_addr, &remote_id);
1089 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1090 "bad grc %d from test_by_id on dgram 0x%p\n",
1093 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1094 * through full cycle, we get SUCCESS and need to parse post_state */
1096 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1097 "remote_addr %u remote_id %u\n", grc, dgram,
1098 kgnilnd_dgram_type2str(dgram),
1099 post_state, remote_addr, remote_id);
1101 if (grc == GNI_RC_NO_MATCH) {
1102 /* she's gone, reduce count and move along */
1103 dgram->gndg_state = GNILND_DGRAM_DONE;
1104 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1108 rc = kgnilnd_process_dgram(dgram, post_state);
1111 /* if for some weird reason we get a valid dgram back, just mark as done
1112 * so we can drop it and move along.
1113 * C.E - if it was completed, we'll just release the conn/mbox
1114 * back into the pool and it'll get reused. That said, we should only
1115 * be canceling a WC dgram on stack rest or shutdown, so that is moot */
1116 dgram->gndg_state = GNILND_DGRAM_DONE;
1117 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1119 /* caller context responsible for calling kgnilnd_release_dgram() */
1121 /* still pending, let it simmer until golden brown and delicious */
1125 /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1126 * for kgni to return their ID to us via probe - that is when we'll complete their
1127 * cancel processing */
/* Drop the dgram's reference on its conn (if any) and clear the pointer so a
 * later release/free cannot double-decref. Safe to call on a dgram whose conn
 * was never attached. */
1131 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1133 /* release the dgram ref on conn */
1134 if (dgram->gndg_conn) {
1135 kgnilnd_conn_decref(dgram->gndg_conn);
1136 dgram->gndg_conn = NULL;
/* Return a finished dgram to the slab cache. Only legal once the dgram has
 * reached USED or DONE (asserted); poisons the magic first so stale pointers
 * into freed memory are detectable, and drops the device's dgram count. */
1141 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1143 LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1144 dgram->gndg_state == GNILND_DGRAM_DONE,
1145 "dgram 0x%p with bad state %s\n",
1146 dgram, kgnilnd_dgram_state2str(dgram));
1148 /* bit of poisoning to help detect bad driver data */
1149 dgram->gndg_magic = 0x6f5a6b5f;
1150 atomic_dec(&dev->gnd_ndgrams);
1152 cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1153 CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
1154 sizeof(*dgram), dgram);
/* Allocate and post one datagram to dstnid (or a wildcard when dstnid ==
 * LNET_NID_ANY). Builds a conn + connreq, binds the EP for targeted posts,
 * posts via kgnilnd_ep_postdata_w_id, then links the dgram onto the per-nid
 * list under gnd_dgram_lock and marks it POSTED. On any failure the dgram is
 * cleaned up and freed before return. Targeted (non-WC) double-posts to the
 * same nid are asserted against; multiple WC posts are allowed.
 * NOTE(review): partial extract — error-label and return lines not all visible. */
1158 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1162 kgn_dgram_t *dgram = NULL;
1163 kgn_dgram_t *tmpdgram;
1164 kgn_dgram_type_t dgtype;
/* map connreq type -> dgram type; REQ to LNET_NID_ANY becomes a wildcard */
1170 case GNILND_CONNREQ_REQ:
1171 if (dstnid == LNET_NID_ANY)
1172 dgtype = GNILND_DGRAM_WC_REQ;
1174 dgtype = GNILND_DGRAM_REQ;
1176 case GNILND_CONNREQ_NAK:
1177 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1178 dgtype = GNILND_DGRAM_NAK;
1181 CERROR("unknown connreq type %d\n", type);
1185 rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1188 GOTO(post_failed, rc);
1191 rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1193 GOTO(post_failed, rc);
1196 if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1197 /* clear buffer for sanity on reuse of wildcard */
1198 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1201 if (dstnid == LNET_NID_ANY) {
1202 /* set here to reset any dgram re-use */
1203 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
/* targeted post: resolve the nid to a NIC address and bind the endpoint */
1207 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1210 GOTO(post_failed, rc);
1213 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1215 /* don't need to serialize, there are no CQs for the dgram
1216 * EP on the kgn_net_t */
1217 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1219 if (grc != GNI_RC_SUCCESS) {
1221 GOTO(post_failed, rc);
1226 /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1227 * net of the destination node.
1230 if (dstnid == LNET_NID_ANY) {
1231 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1233 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1236 rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1237 srcnid, dstnid, type);
1239 GOTO(post_failed, rc);
/* NAKs carry the originating errno so the peer can report why we refused */
1242 if (type == GNILND_CONNREQ_NAK)
1243 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1245 dgram->gndg_post_time = jiffies;
1247 /* XXX Nic: here is where we'd add in logical network multiplexing */
1249 CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1250 dgram, kgnilnd_dgram_type2str(dgram),
1251 libcfs_nid2str(srcnid),
1252 libcfs_nid2str(dstnid), dev->gnd_id);
1254 /* this allocates memory, can't hold locks across */
1255 grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1256 &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1257 &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1260 if (grc != GNI_RC_SUCCESS) {
1261 CNETERR("dropping failed dgram post id 0x%p type %s"
1262 " reqtype %s to %s: rc %d\n",
1263 dgram, kgnilnd_dgram_type2str(dgram),
1264 kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1265 libcfs_nid2str(dstnid), grc);
1266 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1267 GOTO(post_failed, rc);
1270 /* we don't need to add earlier - if someone does del_peer during post,
1271 * that peer will get marked as unlinked and the callers wil take care of it.
1272 * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1273 * the completed dgram later when we cant find a peer to stuff it into */
1275 spin_lock(&dev->gnd_dgram_lock);
1277 /* make sure we are not double posting targeted dgrams
1278 * - we can multiple post WC dgrams to help with processing speed */
1279 if (dstnid != LNET_NID_ANY) {
1280 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1282 LASSERTF(tmpdgram == NULL,
1283 "dgram 0x%p->%s already posted\n",
1284 dgram, libcfs_nid2str(dstnid));
1287 /* unmunge dstnid to help processing code cope... */
1288 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1289 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1292 list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1293 dgram->gndg_state = GNILND_DGRAM_POSTED;
1294 spin_unlock(&dev->gnd_dgram_lock);
/* post_failed: unwind a partially-built dgram on any error above */
1297 if (rc < 0 && dgram != NULL) {
1298 kgnilnd_cleanup_dgram(dgram);
1299 kgnilnd_free_dgram(dev, dgram);
/* Release a dgram back to the system: cancel it if still posted, drop its
 * conn ref, and free it unless it is mid-cancel (CANCELED dgrams must wait
 * for the kgni terminate event). If it was a wildcard and the stack is not
 * shutting down or in reset, immediately repost a replacement wildcard so
 * inbound connects keep a landing slot. */
1306 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1308 spin_lock(&dev->gnd_dgram_lock);
1309 kgnilnd_cancel_dgram_locked(dgram);
1310 spin_unlock(&dev->gnd_dgram_lock);
1312 kgnilnd_cleanup_dgram(dgram);
1314 /* if the dgram is 'canceled' it needs to be wait until the event
1315 * comes up from kgni that tells us it is safe to release */
1316 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1317 dgram->gndg_state = GNILND_DGRAM_DONE;
1319 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1321 /* if it is a wildcard and we are in an appropriate state, repost
1324 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1325 (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1328 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1330 "error %d: dev %d could not repost wildcard datagram id 0x%p\n",
1331 rerc, dev->gnd_id, dgram);
1334 /* always free the old dgram */
1335 kgnilnd_free_dgram(dev, dgram);
/* Poll kgni for one completed datagram. Under gnd_dgram_lock, probe for a
 * ready id, validate the dgram (magic / state / list membership), pull it
 * off the nid list and move POSTED -> PROCESSING so nobody can cancel it
 * after we own it. Then (unlocked) fetch the post state via test_by_id and
 * hand it to kgnilnd_process_dgram. Returns 0 when nothing was ready;
 * presumably >0 when a dgram was handled and <0 on error (partial view) —
 * *dgramp gives the caller the dgram to release on the error path. */
1341 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1343 kgn_dgram_t *dgram = NULL;
1344 gni_post_state_t post_state;
1348 __u32 remote_addr = 0, remote_id = 0;
1351 /* Probe with the lock held. That way if we get a dgram we dont have it canceled
1352 * between finding the ready dgram and grabbing the lock to remove it from the
1353 * list. Otherwise we could be left in an inconsistent state. We own the dgram
1354 * once its off the list so we don't need to worry about others changing it at
1356 spin_lock(&dev->gnd_dgram_lock);
1357 grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1358 if (grc != GNI_RC_SUCCESS) {
1359 spin_unlock(&dev->gnd_dgram_lock);
1360 /* return 0 to indicate nothing happened */
1364 CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
/* the id we posted with is the dgram pointer itself (see postdata_w_id) */
1367 dgram = (kgn_dgram_t *)readyid;
1369 LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1370 "dgram 0x%p from id "LPX64" with bad magic %x\n",
1371 dgram, readyid, dgram->gndg_magic);
1373 LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1374 dgram->gndg_state == GNILND_DGRAM_CANCELED,
1375 "dgram 0x%p with bad state %s\n",
1376 dgram, kgnilnd_dgram_state2str(dgram));
1378 LASSERTF(!list_empty(&dgram->gndg_list),
1379 "dgram 0x%p with bad list state %s\n",
1380 dgram, kgnilnd_dgram_state2str(dgram));
1382 /* now we know that the datagram structure is ok, so pull off list */
1383 list_del_init(&dgram->gndg_list);
1385 /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
1386 * change the state from POSTED to PROCESSING to ensure that
1387 * nobody cancels it after we've pulled it from the wire */
1388 if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1389 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1392 spin_unlock(&dev->gnd_dgram_lock);
1394 /* we now "own" this datagram */
1396 LASSERTF(dgram->gndg_conn != NULL,
1397 "dgram 0x%p with NULL conn\n", dgram);
1399 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1400 (__u64)dgram, &post_state,
1401 &remote_addr, &remote_id);
1403 LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1404 " id "LPU64" was ready\n", readyid);
1406 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1407 "remote_addr %u remote_id %u\n", grc, dgram,
1408 kgnilnd_dgram_type2str(dgram),
1409 post_state, remote_addr, remote_id);
1411 if (unlikely(grc != GNI_RC_SUCCESS)) {
1412 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1413 dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1416 GOTO(probe_for_out, rc);
1419 rc = kgnilnd_process_dgram(dgram, post_state);
1421 /* we should never get probe finding a dgram for us and then it
1422 * being a WC dgram that is still in the middle of processing */
1423 LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1424 rc, dgram, post_state);
1427 /* dgram is good enough for the data to be used */
1428 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1429 /* fake rc to mark that we've done something */
1432 /* bring out your dead! */
1433 dgram->gndg_state = GNILND_DGRAM_DONE;
/* probe_for_out: error path hands the dgram back for immediate release */
1441 kgnilnd_release_dgram(dev, dgram);
/* Post the configured number (*kgn_nwildcard) of wildcard datagrams so the
 * device can accept inbound connection requests. Returns -ENOENT when the
 * tunable is zero (loop never runs); otherwise the rc of the posts. */
1446 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1448 /* if kgn_wildcard is zero, return error */
1449 int rc = -ENOENT, i;
1452 for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1453 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1455 CERROR("error %d: could not post wildcard datagram # %d\n",
/* Cancel every outstanding targeted dgram belonging to this net. Walks the
 * whole per-device dgram hash under gnd_dgram_lock, skipping wildcards and
 * dgrams destined for other nets. Only valid during net shutdown or stack
 * reset (asserted). NOTE(review): the zombies list is initialized but its
 * draining is not visible in this extract. */
1467 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1469 kgn_dgram_t *dg, *dgN;
1470 struct list_head zombies;
1474 /* we want to cancel any outstanding dgrams - we don't want to rely
1475 * on del_peer_or_conn catching all of them. This helps protect us in cases
1476 * where we don't quite keep the peer->dgram mapping in sync due to some
1477 * race conditions */
1479 LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1480 "called with LND invalid state: net shutdown %d "
1481 "in reset %d\n", net->gnn_shutdown,
1482 kgnilnd_data.kgn_in_reset);
1484 INIT_LIST_HEAD(&zombies);
1486 spin_lock(&net->gnn_dev->gnd_dgram_lock);
1488 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1489 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1491 /* skip nids not on our net or are wildcards */
1494 if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1495 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1498 kgnilnd_cancel_dgram_locked(dg);
1502 spin_unlock(&net->gnn_dev->gnd_dgram_lock);
/* Cancel all outstanding wildcard dgrams on the device. Repeatedly looks up
 * the LNET_NID_ANY entry under gnd_dgram_lock, cancels it, and collects any
 * that completed (DONE) onto a local zombies list, which is then released
 * outside the lock. Only valid during WC shutdown or stack reset (asserted). */
1508 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1510 kgn_dgram_t *dg, *dgN;
1511 struct list_head zombies;
1514 /* Time to kill the outstanding WC's
1515 * WC's exist on net 0 only but match on any net...
1518 LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1519 "called with LND invalid state: WC shutdown %d "
1520 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1521 kgnilnd_data.kgn_in_reset);
1523 INIT_LIST_HEAD(&zombies);
1524 spin_lock(&dev->gnd_dgram_lock);
1527 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1529 LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1530 "dgram 0x%p->%s with bad type %d (%s)\n",
1531 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1532 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1534 kgnilnd_cancel_dgram_locked(dg);
1536 /* WC could be DONE already, check and if so add to list to be released */
1537 if (dg->gndg_state == GNILND_DGRAM_DONE) {
1538 list_del_init(&dg->gndg_list);
1539 list_add_tail(&dg->gndg_list, &zombies);
1542 } while (dg != NULL);
1544 spin_unlock(&dev->gnd_dgram_lock);
/* release completed WCs with the lock dropped - release may repost/free */
1546 list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1547 list_del_init(&dg->gndg_list);
1548 kgnilnd_release_dgram(dev, dg);
/* Block until every canceled dgram on the device has been terminated by
 * kgni and released. Loops while gnd_canceled_dgrams is non-zero, waiting
 * (about a second per iteration) via postdata_probe_wait_by_id and probing/
 * releasing whatever completes. Called only from shutdown / base shutdown /
 * stack reset; intentionally races with the dgram mover thread (see comment). */
1555 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1563 /* use do while to get at least one check run to allow
1564 * regression test for 762072 to hit bug if there */
1566 /* This function races with the dgram mover during shutdown so it is possible for
1567 * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1568 * dgram mover thread instead of inside of this function.
1571 /* This should only be called from within shutdown, baseshutdown, or stack reset.
1572 * there are no assertions here to verify since base_shutdown has nothing in it we can check
1573 * the net is gone by then.
/* (i & -i) == i is true only for powers of two - throttles the WARNING spam */
1578 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1579 "Waiting for %d canceled datagrams to clear on device %d\n",
1580 atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1582 /* check once a second */
1583 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1586 if (grc != GNI_RC_SUCCESS)
1589 CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
1590 readyid, dev->gnd_id, dev);
1592 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1594 /* if we got a valid dgram or one that is now done, clean up */
1595 kgnilnd_release_dgram(dev, dgram);
1597 } while (atomic_read(&dev->gnd_canceled_dgrams));
/* Start an active connect to a peer: move it CONNECT -> POSTING under the
 * peer_conn lock, post a targeted REQ dgram, then re-check for races (peer
 * unlinked or marked NEEDS_DEATH while we were posting) before settling into
 * POSTED. Several CFS_FAIL hooks provide race windows for testing. Bails out
 * early if the peer went inactive or stopped connecting before we posted. */
1601 kgnilnd_start_connect(kgn_peer_t *peer)
1604 /* sync point for kgnilnd_del_peer_locked - do an early check to
1605 * catch the most common hits where del_peer is done by the
1606 * time we get here */
1607 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1608 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1611 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1612 if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1613 /* raced with peer getting unlinked */
1614 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1618 peer->gnp_connecting = GNILND_PEER_POSTING;
1619 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1621 set_mb(peer->gnp_last_dgram_time, jiffies);
1622 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1623 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1626 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1627 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1628 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1630 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1631 peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1634 set_mb(peer->gnp_last_dgram_errno, rc);
1638 /* while we're posting someone could have decided this peer/dgram needed to
1639 * die a quick death, so we check for state change and process accordingly */
1641 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1642 if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1643 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1644 peer->gnp_connecting = GNILND_PEER_KILL;
1646 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1647 /* positive RC to avoid dgram cleanup - we'll have to
1648 * wait for the kgni GNI_POST_TERMINATED event to
1649 * finish cleaning up */
1651 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1654 peer->gnp_connecting = GNILND_PEER_POSTED;
1655 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1656 /* reaper thread will take care of any timeouts */
1657 CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1658 libcfs_nid2str(peer->gnp_nid), rc);
/* failure path: log and fall through to caller cleanup */
1663 CDEBUG(D_NET, "connect to %s failed: rc %d \n",
1664 libcfs_nid2str(peer->gnp_nid), rc);
/* Complete a matched connection request: find-or-create the peer for the
 * remote nid, reject duplicates, close stale conns, then wire the dgram's
 * conn into the peer and cqid tables, mark it ESTABLISHED, flush the peer's
 * queued TXs onto it, notify LNet, and kick the scheduler. Runs in the dgram
 * mover context (no kgn_net_rw_sem held); all table surgery happens under
 * kgn_peer_conn_lock. */
1670 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1672 kgn_conn_t *conn = dgram->gndg_conn;
1673 lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid;
1674 kgn_peer_t *new_peer, *peer = NULL;
1677 kgn_mbox_info_t *mbox;
1681 /* try to find a peer that matches the nid we got in the connreq
1682 * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1683 * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1685 /* assume this is a new peer - it makes locking cleaner when it isn't */
1686 /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1688 rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
1690 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1694 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1696 /* this transfers ref from create_peer to the kgn_peer table */
1697 kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1699 /* if we found an existing peer, is it really ready for a new conn ? */
1700 if (peer != new_peer) {
1701 /* if this was an active connect attempt but we can't find a peer waiting for it
1702 * we will dump in the trash */
1704 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1705 CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1706 libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1707 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1712 /* check to see if we can catch a connecting peer before it is
1713 * removed from the connd_peers list - if not, we need to
1714 * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1715 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1716 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1717 if (!list_empty(&peer->gnp_connd_list)) {
1718 list_del_init(&peer->gnp_connd_list);
1719 /* drop connd ref */
1720 kgnilnd_peer_decref(peer);
1722 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1723 /* clear rc to make sure we don't have fake error */
1727 /* no matter what, we are no longer waiting to connect this peer now */
1728 peer->gnp_connecting = GNILND_PEER_IDLE;
1730 /* Refuse to duplicate an existing connection (both sides might try to
1731 * connect at once). NB we return success! We _are_ connected so we
1732 * _don't_ have any blocked txs to complete with failure. */
1733 rc = kgnilnd_conn_isdup_locked(peer, conn);
1735 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1736 CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1737 libcfs_nid2str(her_nid), rc);
1743 nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1745 /* either way with peer (new or existing), we are ok with ref counts here as the
1746 * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1747 * ref for the peer table. */
1749 /* at this point, the connection request is a winner */
1751 /* mark 'DONE' to avoid cancel being called from release */
1752 dgram->gndg_state = GNILND_DGRAM_DONE;
1754 /* initialise timestamps before reaper looks at them */
1755 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1757 /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1758 * immediatly send a NOOP in the reaper thread during the call to
1759 * kgnilnd_check_conn_timeouts_locked
1761 conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1762 conn->gnc_state = GNILND_CONN_ESTABLISHED;
1764 /* refs are not transferred from dgram to tables, so increment to
1766 kgnilnd_conn_addref(conn);
1767 kgnilnd_peer_addref(peer);
1768 conn->gnc_peer = peer;
1769 list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1771 kgnilnd_conn_addref(conn); /* +1 ref for conn table */
1772 list_add_tail(&conn->gnc_hashlist,
1773 kgnilnd_cqid2connlist(conn->gnc_cqid));
1774 kgnilnd_data.kgn_conn_version++;
1776 /* Dont send NOOP if fail_loc is set
1778 if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1779 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1781 CNETERR("can't get TX to initiate NOOP to %s\n",
1782 libcfs_nid2str(peer->gnp_nid));
1784 kgnilnd_queue_tx(conn, tx);
1788 /* Schedule all packets blocking for a connection */
1789 list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1790 /* lock held here is the peer_conn lock */
1791 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1792 kgnilnd_queue_tx(conn, tx);
1795 /* If this is an active connection lets mark its timestamp on the MBoX */
1796 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1797 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1798 /* conn->gnc_last_rx is jiffies it better exist as it was just set */
1799 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1802 /* Bug 765042: wake up scheduler for a race with finish_connect and
1803 * complete_conn_closed with a conn in purgatory
1804 * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1805 * we just check for set and then clear */
1806 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1808 /* get scheduler thread moving again */
1809 kgnilnd_schedule_device(conn->gnc_device);
1812 CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1813 conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1815 /* make sure we reset peer reconnect interval now that we have a good conn */
1816 kgnilnd_peer_alive(peer);
1817 peer->gnp_reconnect_interval = 0;
1819 /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
1820 * on the atomic forever
1822 if (peer->gnp_pending_unlink) {
1823 peer->gnp_pending_unlink = 0;
1824 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1825 CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
1828 /* add ref to make it hang around until after we drop the lock */
1829 kgnilnd_conn_addref(conn);
1831 /* Once the peer_conn lock is dropped, the conn could actually move into
1832 * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1833 * lock until we are really done */
1834 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1836 /* Notify LNET that we now have a working connection to this peer.
1837 * This is a Cray extension to the "standard" LND behavior. */
1838 lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
1839 1, cfs_time_current());
1841 /* schedule the conn to pick up any SMSG sent by peer before we could
1842 * process this dgram */
1843 kgnilnd_schedule_conn(conn);
1845 /* drop our 'hold' ref */
1846 kgnilnd_conn_decref(conn);
/* Send a NAK datagram carrying 'error' to dst_nid so the peer learns why its
 * connreq was refused. dst_nid must be a real nid (asserted - NAKs cannot be
 * wildcarded); delegates the actual post to kgnilnd_post_dgram. */
1853 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1858 LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1860 CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1862 rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1865 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
/* Handle an inbound NAK: look up the peer by the NAK's src nid and either
 * (a) peer idle - close any conns matching the NAK's peerstamp/connstamp via
 * a stack conn stub, or (b) peer mid-connect - cancel its pending dgram (or
 * drop the NAK if the peer is already re-queued on the connd list, since that
 * means a newer connect attempt superseded the one that was NAKed). */
1871 kgnilnd_process_nak(kgn_dgram_t *dgram)
1873 kgn_connreq_t *connreq = &dgram->gndg_conn_in;
1874 lnet_nid_t src_nid = connreq->gncr_srcnid;
1875 int errno = connreq->gncr_nakdata.gnnd_errno;
1879 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1881 peer = kgnilnd_find_peer_locked(src_nid);
1883 /* we likely dropped him from bad data when we processed
1884 * the original REQ */
1885 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1889 /* need to check peerstamp/connstamp against the ones we find
1890 * to make sure we don't close new (and good?) conns that we
1891 * formed after this connreq failed */
1892 if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1895 if (list_empty(&peer->gnp_conns)) {
1896 /* assume already procced datagram and it barfed up
1897 * on this side too */
1898 CDEBUG(D_NET, "dropping NAK from %s; "
1899 "peer %s is already not connected\n",
1900 libcfs_nid2str(connreq->gncr_srcnid),
1901 libcfs_nid2str(connreq->gncr_dstnid));
1902 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1906 /* stub up a connection with the connreq XXX_stamps to allow
1907 * use to use close_stale_conns_locked */
1908 conn.gnc_peerstamp = connreq->gncr_peerstamp;
1909 conn.gnc_my_connstamp = connreq->gncr_connstamp;
1910 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
1911 conn.gnc_device = peer->gnp_net->gnn_dev;
1913 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
1915 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1916 "closed %d connections\n",
1917 libcfs_nid2str(connreq->gncr_srcnid),
1918 libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
/* peer not IDLE: still (re)connecting - decide against the connd list */
1921 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
1923 if (list_empty(&peer->gnp_connd_list)) {
1924 /* if peer isn't on waiting list, try to find one to nuke */
1925 rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
1929 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1930 "canceled pending connect request\n",
1931 libcfs_nid2str(connreq->gncr_srcnid),
1932 libcfs_nid2str(connreq->gncr_dstnid), errno);
1935 /* if we can't find a waiting dgram, we just drop the nak - the conn
1936 * connect must have failed (didn't find conn above and clear connecting
1937 * -- so nothing to do besides drop */
1939 /* peer is on list, meaning it is a new connect attempt from the one
1940 * we started that generated the NAK - so just drop NAK */
1942 /* use negative to prevent error message */
1945 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
1948 /* success! we found a peer and at least marked pending_nak */
1949 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
/* Dispatch a received connreq by type: REQ -> kgnilnd_finish_connect (wire up
 * peer/conn, flush queued TXs), NAK -> kgnilnd_process_nak. Unpacks and
 * validates the connreq first; *needs_nak tells the caller whether a failure
 * should be answered with a NAK (only when we have a usable srcnid). */
1955 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
1959 rc = kgnilnd_unpack_connreq(dgram);
1962 /* only NAK if we have good srcnid to use */
1968 switch (dgram->gndg_conn_in.gncr_type) {
1969 case GNILND_CONNREQ_REQ:
1970 /* wire up peer & conn, send queued TX */
1971 rc = kgnilnd_finish_connect(dgram);
1973 /* don't nak when the nid is hosed */
1979 case GNILND_CONNREQ_NAK:
1980 rc = kgnilnd_process_nak(dgram);
1981 /* return early to prevent reconnect bump */
1984 CERROR("unexpected connreq type %s (%d) from %s\n",
1985 kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
1986 dgram->gndg_conn_in.gncr_type,
1987 libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
/* Top-level dgram mover step: probe for one completed datagram, process it
 * (connreq / NAK), release it, and on error inform the affected peer - pull
 * it off the connd list, reset gnp_connecting, record the errno, bump its
 * reconnect backoff, and notify LNet. Sends a NAK afterwards when processing
 * decided one is owed (needs_nak). The target nids are stashed before the
 * release since the dgram is gone by then. */
1998 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2002 lnet_nid_t nak_dstnid = LNET_NID_ANY;
2003 lnet_nid_t orig_dstnid;
2004 kgn_dgram_t *dgram = NULL;
2008 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2011 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2016 } else if (rc < 0) {
2017 GOTO(inform_peer, rc);
2019 /* rc > 1 means it did something, reset for this func */
2023 switch (dgram->gndg_type) {
2024 case GNILND_DGRAM_WC_REQ:
2025 case GNILND_DGRAM_REQ:
2026 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2028 case GNILND_DGRAM_NAK:
2029 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2030 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2033 CERROR("unknown datagram type %s (%d)\n",
2034 kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2038 /* stash data to use after releasing current datagram */
2039 /* don't stash net - we are operating on a net already,
2040 * so the lock on rw_net_lock is sufficient */
2042 nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
/* inform_peer: dgram must still be valid here (asserted) */
2045 LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2047 orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2049 kgnilnd_release_dgram(dev, dgram);
2051 CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2052 libcfs_nid2str(orig_dstnid), rc);
2054 /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2055 * in kgnilnd_finish_connect - if errors are from before we get to there,
2056 * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2057 if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2058 /* if we have a negative rc, we want to find a peer to inform about
2059 * the bad connection attempt. Sorry buddy, better luck next time! */
2061 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2062 peer = kgnilnd_find_peer_locked(orig_dstnid);
2065 /* add ref to make sure he stays around past the possible unlink
2066 * so we can tell LNet about him */
2067 kgnilnd_peer_addref(peer);
2069 /* if he still cares about the outstanding connect */
2070 if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2071 /* check if he is on the connd list and remove.. */
2072 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2073 if (!list_empty(&peer->gnp_connd_list)) {
2074 list_del_init(&peer->gnp_connd_list);
2075 /* drop connd ref */
2076 kgnilnd_peer_decref(peer);
2078 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2080 /* clear gnp_connecting so we don't have a non-connecting peer
2081 * on gnd_connd_list */
2082 peer->gnp_connecting = GNILND_PEER_IDLE;
2084 set_mb(peer->gnp_last_dgram_errno, rc);
2086 kgnilnd_peer_increase_reconnect_locked(peer);
2089 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2091 /* now that we are outside the lock, tell Mommy */
2093 kgnilnd_peer_notify(peer, rc);
2094 kgnilnd_peer_decref(peer);
/* NAK answered outside all locks - it posts a new dgram */
2099 kgnilnd_send_nak(dev, nak_dstnid, rc);
/* Reaper pass over the device's dgram hash: cancel any POSTED, non-wildcard
 * datagram older than *kgn_timeout seconds. Skips everything while the HW is
 * quiescing/shutting down. Runs entirely under gnd_dgram_lock.
 * NOTE(review): loop bound is hash_size - 1, unlike the other walkers which
 * use the full hash_size - looks like the last bucket is skipped; confirm. */
2106 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2108 kgn_dgram_t *dgram, *tmp;
2111 spin_lock(&dev->gnd_dgram_lock);
2113 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2114 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2115 unsigned long now = jiffies;
2116 unsigned long timeout;
2118 /* don't timeout stuff if the network is mucked or shutting down */
2119 if (kgnilnd_check_hw_quiesce()) {
2123 if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2124 (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2127 CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2128 "state %s conn 0x%p to %s age %lus\n",
2129 dgram, kgnilnd_dgram_type2str(dgram),
2130 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2131 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2132 cfs_duration_sec(now - dgram->gndg_post_time));
2134 timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2136 if (time_before(now, (dgram->gndg_post_time + timeout)))
2139 CNETERR("%s datagram to %s timed out @ %lus dgram "
2140 "0x%p state %s conn 0x%p\n",
2141 kgnilnd_dgram_type2str(dgram),
2142 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2143 cfs_duration_sec(now - dgram->gndg_post_time),
2144 dgram, kgnilnd_dgram_state2str(dgram),
2147 kgnilnd_cancel_dgram_locked(dgram);
2150 spin_unlock(&dev->gnd_dgram_lock);
2154 /* use a thread for the possibly long-blocking wait_by_id to prevent
2155 * stalling the global workqueues */
/* Dedicated kernel thread per device: blocks (~1s at a time) in
 * postdata_probe_wait_by_id and wakes the dgram mover when a datagram is
 * ready, then sleeps on gnd_dgping_waitq until the mover pings back. Honors
 * the quiesce trigger and exits when kgn_shutdown is set. */
2157 kgnilnd_dgram_waitq(void *arg)
2159 kgn_device_t *dev = (kgn_device_t *) arg;
2163 DEFINE_WAIT(mover_done);
2165 snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2166 cfs_daemonize(name);
2167 cfs_block_allsigs();
2169 /* all gnilnd threads need to run fairly urgently */
2170 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2172 /* we dont shut down until the device shuts down ... */
2173 while (!kgnilnd_data.kgn_shutdown) {
2174 /* to quiesce or to not quiesce, that is the question */
2175 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2176 KGNILND_SPIN_QUIESCE;
2179 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2181 /* check once a second */
2182 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2185 if (grc == GNI_RC_SUCCESS) {
2186 CDEBUG(D_INFO, "waking up dgram mover thread\n");
2187 kgnilnd_schedule_dgram(dev);
2189 /* wait for dgram thread to ping us before spinning again */
2190 prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2191 TASK_INTERRUPTIBLE);
2193 /* don't sleep if we need to quiesce */
2194 if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2197 finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2201 kgnilnd_thread_fini();
/*
 * kgnilnd_start_outbound_dgrams - drain dev->gnd_connd_peers and post an
 * active connect (kgnilnd_start_connect) for each queued peer.
 *
 * Holds gnd_connd_lock while manipulating the list, dropping it around the
 * actual connect attempt. Error handling:
 *  - rc >= 0: success or 'just drop peer'; release the connd list ref.
 *  - rc == -ENOMEM: out of wildcard datagrams; requeue the peer (if still
 *    GNILND_PEER_POSTING) and break out — the global dgram pool may free up
 *    later.
 *  - other rc: log, reset gnp_connecting so the reaper retries (or marks the
 *    peer for kill if it needs death), and drop the ref.
 *
 * Returns did_something (via RETURN) so the caller can decide whether to
 * sleep. NOTE(review): the line that increments did_something is not visible
 * in this extract (gap around 2234-2235) — confirm against the full file.
 */
2206 kgnilnd_start_outbound_dgrams(kgn_device_t *dev)
2208 int did_something = 0, rc;
2209 kgn_peer_t *peer = NULL;
2211 spin_lock(&dev->gnd_connd_lock);
2213 /* Active connect - we added this in kgnilnd_launch_tx */
2214 while (!list_empty(&dev->gnd_connd_peers)) {
2215 peer = list_first_entry(&dev->gnd_connd_peers,
2216 kgn_peer_t, gnp_connd_list);
2218 /* ref for connd removed in if/else below */
2219 list_del_init(&peer->gnp_connd_list);
2221 /* gnp_connecting and membership on gnd_connd_peers should be
2222 * done coherently to avoid double adding, etc */
2223 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2224 * to get the peer to gnp_connecting in the first place. We just need to
2225 * rely on gnd_connd_lock to serialize someone pulling him from the list
2226 * BEFORE clearing gnp_connecting */
2227 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2228 peer, libcfs_nid2str(peer->gnp_nid));
2230 spin_unlock(&dev->gnd_connd_lock);
2232 CDEBUG(D_NET, "processing connect to %s\n",
2233 libcfs_nid2str(peer->gnp_nid));
2236 rc = kgnilnd_start_connect(peer);
2238 if (likely(rc >= 0)) {
2239 /* 0 on success, positive on 'just drop peer' errors */
2240 kgnilnd_peer_decref(peer);
2241 } else if (rc == -ENOMEM) {
2242 /* if we are out of wildcards, add back to
2243 * connd_list - then break out and we'll try later
2244 * if other errors, we'll bail & cancel pending tx */
2245 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2246 if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2247 peer->gnp_connecting = GNILND_PEER_CONNECT;
2248 spin_lock(&dev->gnd_connd_lock);
2249 list_add_tail(&peer->gnp_connd_list,
2250 &dev->gnd_connd_peers);
2252 /* connecting changed while we were posting */
2254 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2255 " state 0x%p->%s, connecting %d\n",
2256 peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2257 peer->gnp_connecting = GNILND_PEER_KILL;
2258 spin_lock(&dev->gnd_connd_lock);
2259 /* remove the peer ref from the connd list */
2260 kgnilnd_peer_decref(peer);
2261 /* let the system handle itself */
2263 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2264 /* the datagrams are a global pool,
2265 * so break out of trying and hope some free
2270 /* something bad happened, you lose */
2271 CNETERR("could not start connecting to %s "
2272 "rc %d: Will retry until TX timeout\n",
2273 libcfs_nid2str(peer->gnp_nid), rc);
2274 /* It didn't post so just set connecting back to zero now.
2275 * The reaper will reattempt the connection if it needs to.
2276 * If the peer needs death set it so the reaper will cleanup.
2278 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2279 if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2280 peer->gnp_connecting = GNILND_PEER_IDLE;
2281 kgnilnd_peer_increase_reconnect_locked(peer);
2283 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2284 " state 0x%p->%s, connecting %d\n",
2285 peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2286 peer->gnp_connecting = GNILND_PEER_KILL;
2288 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2290 /* hold onto ref until we are really done - if it was
2291 * unlinked this could result in a destroy */
2292 kgnilnd_peer_decref(peer);
2294 spin_lock(&dev->gnd_connd_lock);
2297 spin_unlock(&dev->gnd_connd_lock);
2298 RETURN(did_something);
/*
 * kgnilnd_dgram_poke_with_stick - timer callback that wakes the dgram mover
 * for the device identified by 'arg' (a device index into kgn_devices).
 * Armed by kgnilnd_dgram_mover via setup_timer()/mod_timer() so the mover's
 * sleep is bounded by its computed timeout.
 *
 * NOTE(review): the 'dev_id' declaration (presumably derived from 'arg') is
 * not visible in this extract — confirm against the full file.
 */
2302 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2305 kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id];
2307 wake_up(&dev->gnd_dgram_waitq);
2310 /* use single thread for dgrams - should be sufficient for performance */
/*
 * kgnilnd_dgram_mover - main per-device datagram worker thread.
 *
 * Each loop iteration (until kgnilnd_data.kgn_shutdown):
 *  1. honor quiesce (KGNILND_SPIN_QUIESCE) and fault-injection races;
 *  2. process newly completed dgrams under kgn_net_rw_sem
 *     (kgnilnd_probe_and_process_dgram);
 *  3. post new outbound dgrams (kgnilnd_start_outbound_dgrams);
 *  4. periodically reap timed-out dgrams (kgnilnd_reaper_dgram_check),
 *     rechecking every kgn_new_min_timeout/4 seconds;
 *  5. if nothing happened, sleep on gnd_dgram_waitq with a one-shot timer
 *     (kgnilnd_dgram_poke_with_stick) as a bounded wakeup, after a last
 *     xchg() check of gnd_dgram_ready.
 *
 * NOTE(review): 'timeout' is declared unsigned long (2318) but assigned a
 * possibly negative (long)(next_purge_check - jiffies) at 2372 and compared
 * 'timeout <= 0' at 2377 — for an unsigned value that test only matches
 * exactly 0, and a negative delta wraps to a huge value. Looks like it was
 * meant to be signed long; confirm against the full file.
 *
 * NOTE(review): some declaration/brace lines (e.g. 'name', the DEFINE_WAIT
 * for 'wait', schedule_timeout call) are not visible in this extract.
 */
2312 kgnilnd_dgram_mover(void *arg)
2314 kgn_device_t *dev = (kgn_device_t *)arg;
2316 int rc, did_something;
2317 unsigned long next_purge_check = jiffies - 1;
2318 unsigned long timeout;
2319 struct timer_list timer;
2322 snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2323 cfs_daemonize(name);
2324 cfs_block_allsigs();
2325 /* all gnilnd threads need to run fairly urgently */
2326 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2328 /* we are ok not locking for these variables as the dgram waitq threads
2329 * will block both due to tying up net (kgn_shutdown) and the completion
2330 * event for the dgram_waitq (kgn_quiesce_trigger) */
2332 while (!kgnilnd_data.kgn_shutdown) {
2333 /* Safe: kgn_shutdown only set when quiescent */
2335 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2336 * so we can force a dirty WC dgram for Bug 762072 - put right before
2337 * quiesce check so that it'll go right into that and not do any
2339 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2341 /* to quiesce or to not quiesce, that is the question */
2342 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2343 KGNILND_SPIN_QUIESCE;
2347 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2349 /* process any newly completed dgrams */
2350 down_read(&kgnilnd_data.kgn_net_rw_sem);
2352 rc = kgnilnd_probe_and_process_dgram(dev);
2354 did_something += rc;
2357 up_read(&kgnilnd_data.kgn_net_rw_sem);
2359 /* start new outbound dgrams */
2360 did_something += kgnilnd_start_outbound_dgrams(dev);
2362 /* find dead dgrams */
2363 if (time_after_eq(jiffies, next_purge_check)) {
2364 /* these don't need to be checked that often */
2365 kgnilnd_reaper_dgram_check(dev);
2367 next_purge_check = (long) jiffies +
2368 cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2371 /* careful with the jiffy wrap... */
2372 timeout = (long)(next_purge_check - jiffies);
2374 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2375 did_something, timeout, next_purge_check, jiffies);
2377 if (did_something || timeout <= 0) {
2382 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2384 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2385 mod_timer(&timer, (long) jiffies + timeout);
2387 /* last second chance for others to poke us */
2388 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2390 /* check flag variables before committing */
2391 if (!did_something &&
2392 !kgnilnd_data.kgn_shutdown &&
2393 !kgnilnd_data.kgn_quiesce_trigger) {
2394 CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2395 timeout, cfs_duration_sec(timeout));
2396 wake_up_all(&dev->gnd_dgping_waitq);
2398 CDEBUG(D_INFO, "awake after schedule\n");
2401 del_singleshot_timer_sync(&timer);
2402 finish_wait(&dev->gnd_dgram_waitq, &wait);
2405 kgnilnd_thread_fini();