1 /*
2  * Copyright (C) 2012 Cray, Inc.
3  *
4  * Copyright (c) 2014, Intel Corporation.
5  *
6  *   Author: Nic Henke <nic@cray.com>
7  *   Author: James Shimek <jshimek@cray.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  */
25
26 #include "gnilnd.h"
27 #include <linux/swap.h>
28
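/* Fill in the SMSG attributes shared by every mailbox: the credit count from
 * the kgn_mbox_credits tunable, the maximum message size, and automatic
 * retransmit mode. Callers fill in the per-mailbox buffer fields afterwards. */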
29 void
30 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
31 {
32         smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
33         smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
34         smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
35 }
36
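/* Register an fma block's memory with GNI so SMSG can use it. The static
 * reg_to tracks how long registration has been failing; once failures
 * persist past the reg_fail_timeout tunable we LBUG, since we can't make
 * progress without mailbox memory. */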
37 int
38 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
39 {
40         gni_return_t            rrc;
41         __u32                   flags = GNI_MEM_READWRITE;
42         static unsigned long    reg_to;
43         int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
44
45         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
46                 flags |= GNI_MEM_PHYS_CONT;
47         }
48
49         fma_blk->gnm_hold_timeout = 0;
50
51         /* make sure we are mapping a clean block */
52         LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
53
54         rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
55                                    fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
56                                    flags, &fma_blk->gnm_hndl);
57         if (rrc != GNI_RC_SUCCESS) {
58                 if (rfto != GNILND_REGFAILTO_DISABLE) {
59                         if (reg_to == 0) {
60                                 reg_to = jiffies + cfs_time_seconds(rfto);
61                         } else if (time_after(jiffies, reg_to)) {
62                 CERROR("FATAL: fmablk registration has failed "
63                                        "for %ld seconds.\n",
64                                        cfs_duration_sec(jiffies - reg_to) +
65                                                 rfto);
66                                 LBUG();
67                         }
68                 }
69
70                 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
71                         fma_blk, fma_blk->gnm_mbox_size, flags);
72                 RETURN(-ENOMEM);
73         }
74
75         reg_to = 0;
76
77         /* PHYS_CONT memory isn't really mapped, at least not in GART -
78          *  but all mappings chew up an MDD
79          */
80         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
81                 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
82         }
83
84         atomic_inc(&device->gnd_n_mdd);
85         /* nfmablk counts live (mapped) blocks */
86         atomic_inc(&device->gnd_nfmablk);
87
88         RETURN(0);
89 }
90
91 int
92 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
93 {
94         int                     rc = 0;
95         int                     num_mbox;
96         kgn_fma_memblock_t     *fma_blk;
97         gni_smsg_attr_t         smsg_attr;
98         unsigned long           fmablk_vers;
99
100 #if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
101         /* We allocate large blocks of memory here, potentially leading
102          * to memory exhaustion during massive reconnects in a network
103          * outage. Limit the number of fma blocks in use by always keeping
104          * a percentage of pages free, initially set to 25% of total memory. */
105         if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
106                 LCONSOLE_INFO("Exceeding free page limit of %ld. "
107                               "Free pages available %ld\n",
108                               kgnilnd_data.free_pages_limit,
109                               nr_free_pages());
110                 return -ENOMEM;
111         }
112 #endif
113         /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
114          * to this allocation code. Everyone will sample the version
115          * before and after getting the mutex. If it has changed,
116          * we'll bail out to check the lists again - this indicates that
117          * some sort of change was made to the lists and it is possible
118          * that there is a mailbox for us to find now. This should prevent
119          * a ton of spinning in the case where there are lots of threads
120          * that need a yet-to-be-allocated mailbox for a connection. */
121
122         fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
123         mutex_lock(&device->gnd_fmablk_mutex);
124
125         if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
126                 /* version changed while we were waiting for the mutex,
127                  * we'll recheck the lists assuming something nice happened */
128                 mutex_unlock(&device->gnd_fmablk_mutex);
129                 return 0;
130         }
131
132         LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
133         if (fma_blk == NULL) {
134                 CNETERR("could not allocate fma block descriptor\n");
135                 rc = -ENOMEM;
136                 GOTO(out, rc);
137         }
138
139         INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
140
141         kgnilnd_setup_smsg_attr(&smsg_attr);
142
143         gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
144
145         LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
146
147         /* gni_smsg_buff_size_needed calculates the base mailbox size and since
148          * we want to hold kgn_peer_credits worth of messages in both directions,
149          * we add PAYLOAD to grow the mailbox size
150          */
151
152         fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
153
154         /* we'll only use physical allocation during preallocation at startup -- this
155          * keeps it nice and clean for runtime decisions. We'll keep the PHYS ones around
156          * until shutdown as reallocating them is tough if there is memory fragmentation */
157
158         if (use_phys) {
159                 fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
160                 if (fma_blk->gnm_block == NULL) {
161                         CNETERR("could not allocate physical SMSG mailbox memory\n");
162                         rc = -ENOMEM;
163                         GOTO(free_desc, rc);
164                 }
165                 fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
166                 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
167
168                 LASSERTF(num_mbox >= 1,
169                          "num_mbox %d blk_size %u mbox_size %d\n",
170                           num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
171
172                 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
173
174         } else {
175                 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
176                 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
177
178                 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
179                          "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
180                          num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
181                          *kgnilnd_tunables.kgn_mbox_per_block);
182
183                 fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
184                 if (fma_blk->gnm_block == NULL) {
185                         CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
186                         rc = -ENOMEM;
187                         GOTO(free_desc, rc);
188                 }
189
190                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
191         }
192
193         /* allocate just enough space for the bits to track the mailboxes */
194         CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
195         if (fma_blk->gnm_bit_array == NULL) {
196                 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
197                        sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
198                 rc = -ENOMEM;
199                 GOTO(free_blk, rc);
200         }
201         bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
202
203         /* now that num_mbox is set based on allocation type, set up the
204          * debug info
205          */
206         CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
207         if (fma_blk->gnm_mbox_info == NULL) {
208                 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
209                        sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
210                 rc = -ENOMEM;
211                 GOTO(free_bit, rc);
212         }
213
214         rc = kgnilnd_map_fmablk(device, fma_blk);
215         if (rc) {
216                 GOTO(free_info, rc);
217         }
218
219         fma_blk->gnm_next_avail_mbox = 0;
220         fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
221
222         CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
223                 "mbox_size %d MDD %#llx.%#llx\n",
224                 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
225                 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
226                 fma_blk->gnm_hndl.qword2);
227
228         /* the lock protects the data structures; the mutex gates allocation */
229
230         spin_lock(&device->gnd_fmablk_lock);
231         list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
232
233         /* toggle under the lock so once they change the list is also
234          * ready for others to traverse */
235         atomic_inc(&device->gnd_fmablk_vers);
236
237         spin_unlock(&device->gnd_fmablk_lock);
238
239         mutex_unlock(&device->gnd_fmablk_mutex);
240
241         return 0;
242
243 free_info:
244         CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
245 free_bit:
246         CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
247 free_blk:
248         if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
249                 kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
250         } else {
251                 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
252         }
253 free_desc:
254         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
255 out:
256         mutex_unlock(&device->gnd_fmablk_mutex);
257         return rc;
258 }
259
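/* Deregister an fma block's memory. A non-zero gnm_hold_timeout asks kgni
 * to hold the MDD for that long instead of releasing it immediately, so a
 * peer that still thinks the mailbox is live can't scribble on reused
 * memory. */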
260 void
261 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
262 {
263         gni_return_t            rrc;
264
265         /* if some mailboxes are held, set hold_timeout from the conn timeouts used in
266          * this block - but not during shutdown, when we just nuke and pave.
267          * During a stack reset, we need to deregister with a hold timeout
268          * set so we don't use the same mdd after reset is complete */
269         if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
270             kgnilnd_data.kgn_in_reset) {
271                 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
272         }
273
274         /* we are changing the state of a block, tickle version to tell
275          * proc code list is stale now */
276         atomic_inc(&dev->gnd_fmablk_vers);
277
278         rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
279
280         CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
281                "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
282                 "hold_timeout %d\n",
283                fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
284                fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
285                fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
286                fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
287
288         LASSERTF(rrc == GNI_RC_SUCCESS,
289                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
290                 fma_blk, rrc);
291
292         if (fma_blk->gnm_hold_timeout &&
293             !(kgnilnd_data.kgn_in_reset &&
294               fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
295                 atomic_inc(&dev->gnd_n_mdd_held);
296         } else {
297                 atomic_dec(&dev->gnd_n_mdd);
298         }
299
300         /* PHYS blocks don't get mapped */
301         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
302                 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
303                 fma_blk->gnm_state = GNILND_FMABLK_IDLE;
304         } else if (kgnilnd_data.kgn_in_reset) {
305                 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
306                  * re-use the fma_blk after reset so we don't have to drop/allocate
307                  * all of those physical blocks */
308                 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
309         }
310
311         /* Decrement here as this is the # of mapped blocks */
312         atomic_dec(&dev->gnd_nfmablk);
313 }
314
315
316 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
317 void
318 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
319 {
320         LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
321                  "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
322                  fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
323                  fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
324
325         atomic_inc(&dev->gnd_fmablk_vers);
326
327         if (fma_blk->gnm_hold_timeout) {
328                 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
329                         "mbox_size %d\n",
330                         fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
331                         fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
332
333                 /* We leave MDD dangling over stack reset */
334                 if (!kgnilnd_data.kgn_in_reset) {
335                         kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
336                 }
337                 /* ignoring the return code - if kgni/ghal can't find it
338                  * it must be released already */
339                 atomic_dec(&dev->gnd_n_mdd_held);
340                 atomic_dec(&dev->gnd_n_mdd);
341         }
342
343         /* we can't free the gnm_block until all the conns have released their
344          * purgatory holds. While we have purgatory holds, we might check the conn
345          * RX mailbox during the CLOSING process. It is possible that kgni might
346          * try to look into the RX side for credits when sending the CLOSE msg too */
347         CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
348                 fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
349
350         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
351                 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
352         } else {
353                 kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
354         }
355         fma_blk->gnm_state = GNILND_FMABLK_FREED;
356
357         list_del(&fma_blk->gnm_bufflist);
358
359         CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
360         CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
361                            BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
362         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
363 }
364
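/* Find a free mailbox for a conn: scan the device's fma blocks under
 * gnd_fmablk_lock, claim the first free bit, and fill in gnpr_smsg_attr.
 * If every block is exhausted, smsg_attr->msg_buffer stays NULL and the
 * caller is expected to allocate a fresh block and retry. */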
365 void
366 kgnilnd_find_free_mbox(kgn_conn_t *conn)
367 {
368         kgn_device_t            *dev = conn->gnc_device;
369         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
370         kgn_fma_memblock_t      *fma_blk;
371         kgn_mbox_info_t         *mbox = NULL;
372         int                     id;
373
374         spin_lock(&dev->gnd_fmablk_lock);
375
376         list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
377                             gnm_bufflist) {
378                 if (fma_blk->gnm_avail_mboxs <= 0 ||
379                     fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
380                         continue;
381                 }
382                 /* look in bitarray for available mailbox */
383                 do {
384                         id = find_next_zero_bit(
385                                 fma_blk->gnm_bit_array,
386                                 fma_blk->gnm_num_mboxs,
387                                 fma_blk->gnm_next_avail_mbox);
388                         if (id == fma_blk->gnm_num_mboxs &&
389                             fma_blk->gnm_next_avail_mbox != 0) {
390                                 /* wrap around */
391                                 fma_blk->gnm_next_avail_mbox = 0;
392                         } else {
393                                 break;
394                         }
395                 } while (1);
396
397                 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
398                          id, fma_blk->gnm_num_mboxs);
399                 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
400                 conn->gnc_mbox_id = id;
401
402                 fma_blk->gnm_next_avail_mbox =
403                         (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
404                 fma_blk->gnm_avail_mboxs--;
405                 conn->gnc_fma_blk = fma_blk;
406
407                 kgnilnd_setup_smsg_attr(smsg_attr);
408
409                 smsg_attr->msg_buffer = fma_blk->gnm_block;
410                 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
411                 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
412                 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
413
414                 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
415                  * reset and re-use the same fma_blk after stack reset. This ensures we've
416                  * properly mapped it before we use it */
417                 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
418                          fma_blk, fma_blk->gnm_state);
419
420                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
421                         "allocating SMSG mbox %d buf %p "
422                         "offset %u hndl %#llx.%#llx\n",
423                         conn, smsg_attr, fma_blk, id,
424                         smsg_attr->msg_buffer, smsg_attr->mbox_offset,
425                         fma_blk->gnm_hndl.qword1,
426                         fma_blk->gnm_hndl.qword2);
427
428                 mbox = &fma_blk->gnm_mbox_info[id];
429                 mbox->mbx_create_conn_memset = jiffies;
430                 mbox->mbx_nallocs++;
431                 mbox->mbx_nallocs_total++;
432
433                 /* zero mbox to remove any old data from our last use.
434                  * this better be safe, if not our purgatory timers
435                  * are too short or a peer really is misbehaving */
436                 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
437                        0, smsg_attr->buff_size);
438                 break;
439         }
440
441         spin_unlock(&dev->gnd_fmablk_lock);
442 }
443
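/* Allocate an SMSG mailbox for a conn: loop looking for a free mailbox and
 * allocate a new (virtually mapped) fma block whenever none is found. */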
444 int
445 kgnilnd_setup_mbox(kgn_conn_t *conn)
446 {
447         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
448         int                      err = 0;
449
450         smsg_attr->msg_buffer = NULL;
451         /* Look for available mbox */
452         do {
453                 kgnilnd_find_free_mbox(conn);
454
455                 /* nothing in the existing buffers, make a new one */
456                 if (smsg_attr->msg_buffer == NULL) {
457                         /* for runtime allocations, we only want vmalloc */
458                         err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
459                         if (err) {
460                                 break;
461                         }
462                 }
463         } while (smsg_attr->msg_buffer == NULL);
464
465         if (err)
466                 CNETERR("couldn't allocate SMSG mbox for conn %p, error %d\n",
467                         conn, err);
468         return err;
469 }
470
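/* Give a conn's mailbox back to its fma block. purgatory_hold > 0 holds the
 * mailbox in purgatory, purgatory_hold == 0 frees it outright, and
 * purgatory_hold < 0 frees a mailbox that was previously held. An
 * illustrative pairing (not the only one): a conn entering purgatory calls
 *	kgnilnd_release_mbox(conn, 1);
 * and the later release from purgatory calls
 *	kgnilnd_release_mbox(conn, -1);
 */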
471 void
472 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
473 {
474         kgn_device_t           *dev = conn->gnc_device;
475         gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
476         kgn_fma_memblock_t     *fma_blk = NULL;
477         kgn_mbox_info_t        *mbox = NULL;
478         int                     found = 0;
479         int                     id;
480
481         /* if we failed to set up the mbox and are now destroying the conn */
482         if (smsg_attr->msg_buffer == NULL) {
483                 return;
484         }
485
486         id = conn->gnc_mbox_id;
487
488         spin_lock(&dev->gnd_fmablk_lock);
489         /* make sure our conn points at a valid fma_blk
490          * We use this instead of a mem block search out of smsg_attr
491          * because we could have freed a block for fma_blk #1 but the fma_blk
492          * is still in the list for a purgatory hold. This would induce a false
493          * match if that same block gets reallocated to fma_blk #2 */
494         list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
495                 if (fma_blk == conn->gnc_fma_blk) {
496                         found = 1;
497                         break;
498                 }
499         }
500         LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
501                  "anywhere in the world\n", conn, conn->gnc_fma_blk);
502
503         LASSERTF(id < fma_blk->gnm_num_mboxs,
504                 "bad id %d max %d\n",
505                 id, fma_blk->gnm_num_mboxs);
506
507         /* < 0 - was held, now free it
508          * == 0 - just free it
509          * > 0 - hold it for now */
510         if (purgatory_hold == 0) {
511                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
512                         "hndl %#llx.%#llx\n",
513                         conn, smsg_attr, fma_blk, id,
514                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
515                 fma_blk->gnm_avail_mboxs++;
516
517         } else if (purgatory_hold > 0) {
518                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
519                         "hndl %#llx.%#llx\n",
520                         conn, smsg_attr, fma_blk, id,
521                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
522
523                 fma_blk->gnm_held_mboxs++;
524                 fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
525                                                  conn->gnc_timeout);
526         } else {
527                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
528                         "hndl %#llx.%#llx\n",
529                         conn, smsg_attr, fma_blk, id,
530                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
531
532                 fma_blk->gnm_held_mboxs--;
533                 fma_blk->gnm_avail_mboxs++;
534         }
535
536         if (purgatory_hold <= 0) {
537                 /* if kgni is retransmitting, freeing the smsg block before the EP
538                  * is destroyed gets messy. Bug 768295. */
539                 LASSERTF(conn->gnc_ephandle == NULL,
540                          "can't release mbox before EP is nuked. conn 0x%p\n", conn);
541
542                 mbox = &fma_blk->gnm_mbox_info[id];
543                 mbox->mbx_release_from_purgatory = jiffies;
544
545                 /* clear conn gnc_fmablk if it is gone - this allows us to
546                  * not worry about state so much in kgnilnd_destroy_conn
547                  * and makes the guaranteed cleanup of the resources easier */
548                 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
549                         "conn %p bit %d already cleared in fma_blk %p\n",
550                          conn, id, fma_blk);
551                 conn->gnc_fma_blk = NULL;
552                 mbox->mbx_nallocs--;
553         }
554
555         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
556                 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
557                        "as mapped\n", fma_blk);
558                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
559         }
560
561         /* we don't release or unmap PHYS blocks as part of the normal cycle --
562          * those are controlled manually from startup/shutdown */
563         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
564                 /* we can unmap once all are unused (held or avail)
565                  * but check hold_timeout to make sure we are not trying to double
566                  * unmap this buffer. If there was no hold_timeout set due to
567                  * held_mboxs, we'll free the mbox here shortly and won't have to
568                  * worry about catching a double free for a 'clean' fma_blk */
569                 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
570                     (!fma_blk->gnm_hold_timeout)) {
571                         kgnilnd_unmap_fmablk(dev, fma_blk);
572                 }
573
574                 /* But we can only free once they are all avail */
575                 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
576                     fma_blk->gnm_held_mboxs == 0) {
577                         /* all mailboxes are released, free fma_blk */
578                         kgnilnd_free_fmablk_locked(dev, fma_blk);
579                 }
580         }
581
582         spin_unlock(&dev->gnd_fmablk_lock);
583 }
584
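/* Count the mailboxes sitting in PHYS blocks; the preallocation loop below
 * uses this to decide when it has met the kgn_nphys_mbox tunable. */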
585 int
586 kgnilnd_count_phys_mbox(kgn_device_t *device)
587 {
588         int                     i = 0;
589         kgn_fma_memblock_t     *fma_blk;
590
591         spin_lock(&device->gnd_fmablk_lock);
592
593         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
594                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
595                         i += fma_blk->gnm_num_mboxs;
596         }
597         spin_unlock(&device->gnd_fmablk_lock);
598
599         RETURN(i);
600 }
601
602 int
603 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
604 {
605         int     rc;
606
607         while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
608
609                 rc = kgnilnd_alloc_fmablk(device, 1);
610                 if (rc) {
611                         CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
612                                 kgnilnd_count_phys_mbox(device), rc);
613                         RETURN(rc);
614                 }
615         }
616         RETURN(0);
617 }
618
619 int
620 kgnilnd_map_phys_fmablk(kgn_device_t *device)
621 {
622
623         int                     rc = 0;
624         kgn_fma_memblock_t     *fma_blk;
625
626         /* use mutex to gate access to single thread, just in case */
627         mutex_lock(&device->gnd_fmablk_mutex);
628
629         spin_lock(&device->gnd_fmablk_lock);
630
631         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
632                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
633                         rc = kgnilnd_map_fmablk(device, fma_blk);
634                         if (rc)
635                                 break;
636                 }
637         }
638         spin_unlock(&device->gnd_fmablk_lock);
639
640         mutex_unlock(&device->gnd_fmablk_mutex);
641
642         RETURN(rc);
643 }
644
645 void
646 kgnilnd_unmap_fma_blocks(kgn_device_t *device)
647 {
648
649         kgn_fma_memblock_t      *fma_blk;
650
651         /* use mutex to gate access to single thread, just in case */
652         mutex_lock(&device->gnd_fmablk_mutex);
653
654         spin_lock(&device->gnd_fmablk_lock);
655
656         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
657                 kgnilnd_unmap_fmablk(device, fma_blk);
658         }
659         spin_unlock(&device->gnd_fmablk_lock);
660
661         mutex_unlock(&device->gnd_fmablk_mutex);
662 }
663
664 void
665 kgnilnd_free_phys_fmablk(kgn_device_t *device)
666 {
667
668         kgn_fma_memblock_t      *fma_blk, *fma_blkN;
669
670         /* use mutex to gate access to single thread, just in case */
671         mutex_lock(&device->gnd_fmablk_mutex);
672
673         spin_lock(&device->gnd_fmablk_lock);
674
675         list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
676                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
677                         kgnilnd_free_fmablk_locked(device, fma_blk);
678         }
679         spin_unlock(&device->gnd_fmablk_lock);
680
681         mutex_unlock(&device->gnd_fmablk_mutex);
682 }
683
684 /* kgnilnd dgram nid->struct management */
685
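/* Hash a NID to its dgram list bucket. The cast keeps only the low 32 bits
 * of the NID (the address part), so e.g. an address of 130 lands in bucket
 * 130 % *kgn_peer_hash_size. */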
686 static inline struct list_head *
687 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
688 {
689         unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
690
691         RETURN(&dev->gnd_dgrams[hash]);
692 }
693
694
695 /* needs dev->gnd_dgram_lock held */
696 kgn_dgram_t *
697 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
698 {
699         struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
700         kgn_dgram_t      *dgram;
701
702         list_for_each_entry(dgram, dgram_list, gndg_list) {
703
704                 /* if state > POSTED, we are already handling cancel/completion */
705                 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
706                      dgram->gndg_state > GNILND_DGRAM_POSTED)
707                         continue;
708
709                 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
710                        dgram, libcfs_nid2str(dst_nid));
711                 return dgram;
712         }
713         return NULL;
714 }
715
716 int
717 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
718 {
719         kgn_dgram_t     *dgram;
720
721         spin_lock(&dev->gnd_dgram_lock);
722         dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
723
724         if (dgram) {
725                 kgnilnd_cancel_dgram_locked(dgram);
726         }
727         spin_unlock(&dev->gnd_dgram_lock);
728
729         RETURN(!!(dgram == NULL));
730 }
731
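/* Pack the wire connreq for a dgram post. The CFS_FAIL_CHECK hooks below
 * deliberately corrupt individual fields under fault injection so the
 * error paths in kgnilnd_unpack_connreq can be exercised. */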
732 int
733 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
734                      lnet_nid_t srcnid, lnet_nid_t dstnid,
735                      kgn_connreq_type_t type)
736 {
737         int err = 0;
738
739         /* ensure we haven't violated max datagram size */
740         BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);
741
742         /* no need to zero out, we do that when allocating dgram */
743         connreq->gncr_magic     = GNILND_MSG_MAGIC;
744
745         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
746                 srcnid = 0xABADBABE;
747         } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
748                 dstnid = 0xDEFEC8ED;
749         }
750
751         connreq->gncr_srcnid    = srcnid;
752         connreq->gncr_dstnid    = dstnid;
753
754         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
755                 connreq->gncr_version = 99;
756         } else {
757                 connreq->gncr_version   = GNILND_CONNREQ_VERSION;
758         }
759         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
760                 connreq->gncr_type = 99;
761         } else {
762                 connreq->gncr_type      = type;
763         }
764         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
765                 connreq->gncr_peerstamp = 0;
766         } else {
767                 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
768         }
769         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
770                 connreq->gncr_connstamp = 0;
771         } else {
772                 connreq->gncr_connstamp = conn->gnc_my_connstamp;
773         }
774         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
775                 connreq->gncr_timeout = 0;
776         } else {
777                 connreq->gncr_timeout   = conn->gnc_timeout;
778         }
779
780         /* other connreq types have their payload packed elsewhere (e.g. NAK errno in kgnilnd_post_dgram) */
781         if (type == GNILND_CONNREQ_REQ) {
782                 kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
783                 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
784                 req_params->gnpr_cqid = conn->gnc_cqid;
785
786                 /* allocate mailbox for this connection */
787                 err = kgnilnd_setup_mbox(conn);
788                 if (err != 0) {
789                         CERROR("Failed to setup FMA mailbox (%d)\n", err);
790                 }
791                 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
792         }
793
794         /* XXX Nic: TBD - checksum computation */
795
796         return err;
797 }
798
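/* Validate an incoming connreq, byte-swapping first if the sender's magic
 * value shows opposite endianness. Note that failures before the srcnid
 * fixup must return -EBADF, which suppresses the NAK, since we can't yet
 * trust the addresses in the packet. */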
799 int
800 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
801 {
802         kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
803         int                      swab, rc = 0;
804         kgn_net_t               *net;
805
806         /* the following fields must be handled in a backwards compatible
807          * manner to ensure we can always send and interpret NAKs */
808
809         if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
810             connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
811                 /* Unexpected magic! */
812                 CERROR("Unexpected magic %08x\n",
813                        connreq->gncr_magic);
814                 return -EBADF;
815         }
816
817         swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
818         if (swab) {
819                 __swab32s(&connreq->gncr_magic);
820                 __swab32s(&connreq->gncr_cksum);
821                 __swab16s(&connreq->gncr_type);
822                 __swab16s(&connreq->gncr_version);
823                 __swab32s(&connreq->gncr_timeout);
824                 __swab64s(&connreq->gncr_srcnid);
825                 __swab64s(&connreq->gncr_dstnid);
826                 __swab64s(&connreq->gncr_peerstamp);
827                 __swab64s(&connreq->gncr_connstamp);
828         }
829
830         /* Do NOT return anything but -EBADF before we munge
831          * connreq->gncr_srcnid - we need that to send the nak */
832
833         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
834                 lnet_nid_t      incoming = connreq->gncr_srcnid;
835
836                 /* even if the incoming packet is hosed, we know who we sent
837                  * the original and can set the srcnid so that we can properly
838                  * look up our peer to close the loop on this connreq. We still use
839                  * -EBADF to prevent a NAK - just in case there are issues with
840                  * the payload coming from a random spot, etc. */
841                 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
842
843                 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
844                                 LNET_NIDADDR(incoming)) {
845                         /* we got a datagram match for the wrong nid... */
846                         CERROR("matched datagram 0x%p with srcnid %s "
847                                 "(%x), expecting %s (%x)\n",
848                                 dgram,
849                                 libcfs_nid2str(incoming),
850                                 LNET_NIDADDR(incoming),
851                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
852                                 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
853                         return -EBADF;
854                 }
855         } else {
856                 /* if we have a wildcard datagram it should match an
857                  * incoming "active" datagram that should have a fully formed
858                  * srcnid and dstnid. If we couldn't unpack it, we drop as
859                  * corrupted packet, otherwise we'll just verify that the dstnid
860                  * matches the NID for the NET that the dgram was posted on */
861
862                 /* make sure their wildcard didn't match ours, that is impossible */
863                 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
864                          "dgram 0x%p from %s, connreq 0x%p; "
865                          "wildcard matched wildcard\n", dgram,
866                          libcfs_nid2str(connreq->gncr_srcnid), connreq);
867
868                 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
869
870                 if (rc == -ESHUTDOWN) {
871                         CERROR("Looking up network: device is in shutdown\n");
872                         return rc;
873                 } else if (rc == -ENONET) {
874                         CERROR("Connection data from %s: she sent "
875                                "dst_nid %s, but net lookup failed on "
876                                "dgram 0x%p@%s\n",
877                                libcfs_nid2str(connreq->gncr_srcnid),
878                                libcfs_nid2str(connreq->gncr_dstnid),
879                                dgram, kgnilnd_dgram_type2str(dgram));
880                         return rc;
881                 }
882
883                 if (lnet_nid_to_nid4(&net->gnn_ni->ni_nid) !=
884                     connreq->gncr_dstnid) {
885                         CERROR("Bad connection data from %s: she sent "
886                                "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
887                                libcfs_nid2str(connreq->gncr_srcnid),
888                                libcfs_nid2str(connreq->gncr_dstnid),
889                                libcfs_nidstr(&net->gnn_ni->ni_nid),
890                                dgram, kgnilnd_dgram_type2str(dgram));
891                         kgnilnd_net_decref(net);
892                         return -EBADSLT;
893                 }
894
895                 /* kgnilnd_find_net takes a ref on the net it finds; decref it when no longer needed. */
896                 kgnilnd_net_decref(net);
897         }
898
899         if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
900                 CERROR("Unexpected version %d\n", connreq->gncr_version);
901                 return -EPROTO;
902         }
903
904         /* XXX Nic: TBD - checksum validation */
905         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
906                 return -EBADF;
907         }
908
909         if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
910                 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
911
912                 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
913                 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
914                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
915                 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
916                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
917                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
918                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
919                 __swab64s(&msg_addr);
920                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
921                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
922         } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
923                 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
924         }
925
926         /* since we use a unique instance ID for each network, the driver
927          * will take care of dropping datagrams if we don't have that network.
928          */
929
930         /* a few more idiot software or configuration checks */
931
932         switch (connreq->gncr_type) {
933         case GNILND_CONNREQ_REQ:
934                 /* wire up EP and SMSG block - this will check the incoming data
935                  * and barf a NAK back if we need to */
936                 rc = kgnilnd_set_conn_params(dgram);
937                 if (rc)
938                         return rc;
939                 break;
940         case GNILND_CONNREQ_NAK:
941         case GNILND_CONNREQ_CLOSE:
942                 break;
943         default:
944                 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
945                 return -EPROTO;
946         }
947
948         if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
949                 CERROR("Received bad timestamps peer %llu conn %llu\n",
950                        connreq->gncr_peerstamp, connreq->gncr_connstamp);
951                 return -EPROTO;
952         }
953
954         if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
955                 CERROR("Received timeout %d < MIN %d\n",
956                        connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
957                 return -EPROTO;
958         }
959
960         return 0;
961 }
962
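/* Allocate and initialize a dgram from the slab cache; gnd_ndgrams tracks
 * how many are outstanding on the device. */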
963 int
964 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
965 {
966         kgn_dgram_t         *dgram;
967
968         dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
969         if (dgram == NULL)
970                 return -ENOMEM;
971
972         INIT_LIST_HEAD(&dgram->gndg_list);
973         dgram->gndg_state = GNILND_DGRAM_USED;
974         dgram->gndg_type = type;
975         dgram->gndg_magic = GNILND_DGRAM_MAGIC;
976
977         atomic_inc(&dev->gnd_ndgrams);
978
979         CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
980                 " %d\n",
981                 sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
982                 atomic_read(&dev->gnd_ndgrams));
983
984         *dgramp = dgram;
985         return 0;
986 }
987
988 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
989  * returns < 0 on dgram to be cleaned up
990  * > 0 on dgram that isn't done yet
991  * == 0 on dgram that is ok and needs connreq processing */
992 int
993 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
994 {
995         int rc = 0;
996
997         switch (post_state) {
998         case GNI_POST_COMPLETED:
999                 /* normal state for dgrams that need actual processing */
1000                 /* GOTO to avoid processing dgram as canceled/done */
1001                 GOTO(process_out, rc);
1002
1003         case GNI_POST_PENDING:
1004                 /* we should only see this if we are testing a WC dgram after a
1005                  * cancel - it means that it needs a full cycle of waiting
1006                  * for kgni_sm_task to finish moving it to TERMINATED */
1007                 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1008                           (dgram->gndg_state == GNILND_DGRAM_CANCELED),
1009                          "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
1010                          dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
1011                          dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
1012
1013                 /* positive RC as this dgram isn't done yet */
1014                 rc = EINPROGRESS;
1015
1016                 /* GOTO as this isn't done yet */
1017                 GOTO(process_out, rc);
1018                 break;
1019
1020         case GNI_POST_TERMINATED:
1021                 /* we've called cancel and it is done or remote guy called cancel and
1022                  * we've received it on a WC dgram */
1023 #if 0
1024                 /* we are seeing weird terminations on non WC dgrams when we have not
1025                  * canceled them */
1026
1027                 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
1028                          dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
1029                         "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
1030                         dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
1031                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
1032 #endif
1033
1034                 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
1035                        dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");
1036
1037                 rc = -ECANCELED;
1038                 break;
1039
1040         case GNI_POST_TIMEOUT:
1041                 /* we could have a timeout on a wildcard dgram too - if
1042                  * we got the incoming request but the remote node beefed
1043                  * before kgni could send the match data back. We'll just error
1044                  * on the active case and bail out gracefully */
1045                 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1046                         CNETERR("hardware timeout for connect to "
1047                                "%s after %lu seconds. Is node dead?\n",
1048                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1049                                cfs_duration_sec(jiffies - dgram->gndg_post_time));
1050                 }
1051
1052                 rc = -ETIMEDOUT;
1053                 break;
1054
1055         default:
1056                 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1057                 LBUG();
1058         }
1059
1060         /* now finish cleaning up a dgram that is canceled/terminated and needs to
1061          * go away */
1062
1063         /* If this was actively canceled, drop the count now that we are processing */
1064         if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1065                 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1066                 /* caller responsible for gndg_list removal */
1067         }
1068
1069 process_out:
1070
1071         RETURN(rc);
1072 }
1073
1074 /* needs dev->gnd_dgram_lock held */
1075 void
1076 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1077 {
1078         gni_return_t            grc;
1079
1080         if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1081                 return;
1082         }
1083
1084         LASSERTF(dgram->gndg_conn != NULL,
1085                  "dgram 0x%p with NULL conn\n", dgram);
1086
1087         /* C.E - WC dgrams could be canceled immediately but
1088          * if there was some match pending, we need to call
1089          * test_by_id to clear it out. If that test returns
1090          * POST_PENDING, it is half done and needs to go along
1091          * with the rest of dgrams and go through a kgni_sm_task cycle
1092          * and deliver a GNI_POST_TERMINATED event before they
1093          * are actually canceled */
1094
1095         dgram->gndg_state = GNILND_DGRAM_CANCELED;
1096
1097         if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1098                 /* we don't need to cancel_by_id if the datagram was good */
1099                 return;
1100         }
1101
1102         /* let folks know there are outstanding cancels */
1103         atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1104         /* leave on nid list until cancel is done for debugging fun */
1105         grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1106
1107         /* if we don't get success here, we have hosed up the dgram tracking
1108          * code and need to bail out */
1109         LASSERTF(grc == GNI_RC_SUCCESS,
1110                  "postdata_cancel returned %d for conn 0x%p to %s\n",
1111                  grc, dgram->gndg_conn,
1112                  dgram->gndg_conn->gnc_peer ?
1113                   libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1114                   : "<?>");
1115
1116         CDEBUG(D_NETTRACE,
1117                 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1118                 dgram, dgram->gndg_conn,
1119                 dgram->gndg_conn->gnc_ephandle);
1120
1121         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1122                 gni_post_state_t         post_state;
1123                 int                      rc = 0;
1124                 __u32                    remote_addr = 0, remote_id = 0;
1125
1126                 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1127                                                      (__u64)dgram, &post_state,
1128                                                      &remote_addr, &remote_id);
1129
1130                 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1131                          "bad grc %d from test_by_id on dgram 0x%p\n",
1132                         grc, dgram);
1133
1134                 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1135                  * through full cycle, we get SUCCESS and need to parse post_state */
1136
1137                 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1138                         "remote_addr %u remote_id %u\n", grc, dgram,
1139                         kgnilnd_dgram_type2str(dgram),
1140                         post_state, remote_addr, remote_id);
1141
1142                 if (grc == GNI_RC_NO_MATCH) {
1143                         /* she's gone, reduce count and move along */
1144                         dgram->gndg_state = GNILND_DGRAM_DONE;
1145                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1146                         RETURN_EXIT;
1147                 }
1148
1149                 rc = kgnilnd_process_dgram(dgram, post_state);
1150
1151                 if (rc <= 0) {
1152                         /* if for some weird reason we get a valid dgram back, just mark as done
1153                          * so we can drop it and move along.
1154                          * C.E - if it was completed, we'll just release the conn/mbox
1155                          * back into the pool and it'll get reused. That said, we should only
1156                          * be canceling a WC dgram on stack reset or shutdown, so that is moot */
1157                         dgram->gndg_state = GNILND_DGRAM_DONE;
1158                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1159
1160                         /* caller context responsible for calling kgnilnd_release_dgram() */
1161                 } else {
1162                         /* still pending, let it simmer until golden brown and delicious */
1163                 }
1164         }
1165
1166         /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1167          * for kgni to return their ID to us via probe - that is when we'll complete their
1168          * cancel processing */
1169 }
1170
1171 void
1172 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1173 {
1174         /* release the dgram ref on conn */
1175         if (dgram->gndg_conn) {
1176                 kgnilnd_conn_decref(dgram->gndg_conn);
1177                 dgram->gndg_conn = NULL;
1178         }
1179 }
1180
1181 void
1182 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1183 {
1184         LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1185                  dgram->gndg_state == GNILND_DGRAM_DONE,
1186                  "dgram 0x%p with bad state %s\n",
1187                  dgram, kgnilnd_dgram_state2str(dgram));
1188
1189         /* bit of poisoning to help detect bad driver data */
1190         dgram->gndg_magic = 0x6f5a6b5f;
1191         atomic_dec(&dev->gnd_ndgrams);
1192
1193         kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1194         CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
1195                " ndgrams %d\n",
1196                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
1197                atomic_read(&dev->gnd_ndgrams));
1198 }
1199
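/* Post a datagram to dstnid, or a wildcard datagram when dstnid is
 * LNET_NID_ANY: allocate the dgram and its conn, bind the EP for targeted
 * posts, pack the connreq, hand it to kgni, and track it on the per-nid
 * dgram list. */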
1200 int
1201 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1202                    int data_rc)
1203 {
1204         int              rc = 0;
1205         kgn_dgram_t     *dgram = NULL;
1206         kgn_dgram_t     *tmpdgram;
1207         kgn_dgram_type_t dgtype;
1208         gni_return_t     grc;
1209         __u64            srcnid;
1210         ENTRY;
1211
1212         switch (type) {
1213         case GNILND_CONNREQ_REQ:
1214                 if (dstnid == LNET_NID_ANY)
1215                         dgtype = GNILND_DGRAM_WC_REQ;
1216                 else
1217                         dgtype = GNILND_DGRAM_REQ;
1218                 break;
1219         case GNILND_CONNREQ_NAK:
1220                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1221                 dgtype = GNILND_DGRAM_NAK;
1222                 break;
1223         default:
1224                 CERROR("unknown connreq type %d\n", type);
1225                 LBUG();
1226         }
1227
1228         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1229         if (rc < 0) {
1230                 rc = -ENOMEM;
1231                 GOTO(post_failed, rc);
1232         }
1233
1234         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1235         if (rc) {
1236                 GOTO(post_failed, rc);
1237         }
1238
1239         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1240                 /* clear buffer for sanity on reuse of wildcard */
1241                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1242         }
1243
1244         if (dstnid == LNET_NID_ANY) {
1245                 /* set here to reset any dgram re-use */
1246                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1247         } else {
1248                 __u32            host_id;
1249
1250                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1251                 if (rc <= 0) {
1252                         rc = -ESRCH;
1253                         GOTO(post_failed, rc);
1254                 }
1255
1256                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1257
1258                 /* don't need to serialize, there are no CQs for the dgram
1259                  * EP on the kgn_net_t */
1260                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1261
1262                 if (grc != GNI_RC_SUCCESS) {
1263                         rc = -ECONNABORTED;
1264                         GOTO(post_failed, rc);
1265                 }
1266
1267         }
1268
1269         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1270          * net of the destination node.
1271          */
1272
1273         if (dstnid == LNET_NID_ANY) {
1274                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1275         } else {
1276                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1277         }
1278
1279         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1280                                   srcnid, dstnid, type);
1281         if (rc) {
1282                 GOTO(post_failed, rc);
1283         }
1284
1285         if (type == GNILND_CONNREQ_NAK)
1286                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1287
1288         dgram->gndg_post_time = jiffies;
1289
1290         /* XXX Nic: here is where we'd add in logical network multiplexing */
1291
1292         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1293                dgram, kgnilnd_dgram_type2str(dgram),
1294                libcfs_nid2str(srcnid),
1295                libcfs_nid2str(dstnid), dev->gnd_id);
1296
1297         /* this allocates memory, can't hold locks across */
1298         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1299                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1300                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1301                                    (__u64)dgram);
1302
1303         if (grc != GNI_RC_SUCCESS) {
1304                 CNETERR("dropping failed dgram post id 0x%p type %s"
1305                         " reqtype %s to %s: rc %d\n",
1306                         dgram, kgnilnd_dgram_type2str(dgram),
1307                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1308                         libcfs_nid2str(dstnid), grc);
1309                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1310                 GOTO(post_failed, rc);
1311         }
1312
1313         /* we don't need to add earlier - if someone does del_peer during post,
1314          * that peer will get marked as unlinked and the callers will take care of it.
1315          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1316          * the completed dgram later when we can't find a peer to stuff it into */
1317
1318         spin_lock(&dev->gnd_dgram_lock);
1319
1320         /* make sure we are not double posting targeted dgrams
1321          * - we can post multiple WC dgrams to help with processing speed */
1322         if (dstnid != LNET_NID_ANY) {
1323                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1324
1325                 LASSERTF(tmpdgram == NULL,
1326                         "dgram 0x%p->%s already posted\n",
1327                          dgram, libcfs_nid2str(dstnid));
1328         }
1329
1330         /* unmunge dstnid to help processing code cope... */
1331         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1332                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1333         }
1334
1335         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1336         dgram->gndg_state = GNILND_DGRAM_POSTED;
1337         spin_unlock(&dev->gnd_dgram_lock);
1338
1339 post_failed:
1340         if (rc < 0 && dgram != NULL) {
1341                 kgnilnd_cleanup_dgram(dgram);
1342                 kgnilnd_free_dgram(dev, dgram);
1343         }
1344
1345         RETURN(rc);
1346 }
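
/* Illustrative usage sketch (not compiled; mirrors the callers later in this
 * file): a targeted connect posts to a specific nid, a wildcard listener
 * posts to LNET_NID_ANY:
 *
 *	rc = kgnilnd_post_dgram(dev, peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
 *	rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
 *
 * A negative return means the dgram was already cleaned up above; rc == 0
 * means it is posted and will complete through the dgram mover. */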
1347
1348 /* The shutdown flag is set from the shutdown and stack reset threads. */
1349 void
1350 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1351 {
1352         /* The conns of canceled active dgrams need to be put in purgatory so
1353          * we don't reuse the mailbox */
1354         if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1355                 kgn_peer_t *peer;
1356                 kgn_conn_t *conn = dgram->gndg_conn;
1357                 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1358
1359                 dgram->gndg_state = GNILND_DGRAM_DONE;
1360
1361                 /* During shutdown we've already removed the peer so we don't
1362                  * need to add a peer. During stack reset we don't care about
1363                  * MDDs since they are all released. */
1364                 if (!shutdown) {
1365                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1366                         peer = kgnilnd_find_peer_locked(nid);
1367
1368                         if (peer != NULL) {
1369                                 CDEBUG(D_NET, "adding peer's conn with nid %s "
1370                                         "to purgatory\n", libcfs_nid2str(nid));
1371                                 kgnilnd_conn_addref(conn);
1372                                 conn->gnc_peer = peer;
1373                                 kgnilnd_peer_addref(peer);
1374                                 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1375                                 conn->gnc_state = GNILND_CONN_CLOSED;
1376                                 list_add_tail(&conn->gnc_list,
1377                                               &peer->gnp_conns);
1378                                 kgnilnd_add_purgatory_locked(conn,
1379                                                              conn->gnc_peer);
1380                                 kgnilnd_schedule_conn(conn);
1381                         }
1382                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1383                 }
1384         }
1385
1386         spin_lock(&dev->gnd_dgram_lock);
1387         kgnilnd_cancel_dgram_locked(dgram);
1388         spin_unlock(&dev->gnd_dgram_lock);
1389
1390         kgnilnd_cleanup_dgram(dgram);
1391
1392         /* if the dgram is 'canceled' we need to wait until the event
1393          * comes up from kgni telling us it is safe to release it */
1394         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1395                 dgram->gndg_state = GNILND_DGRAM_DONE;
1396
1397                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1398
1399                 /* if it is a wildcard and we are in an appropriate state, repost
1400                  * the wildcard */
1401
1402                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1403                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1404                         int     rerc;
1405
1406                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1407                         if (rerc != 0) {
1408                                 /* We failed to repost the WC dgram for some reason;
1409                                  * mark it so the repost system attempts to repost it */
1410                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1411                         }
1412                 }
1413
1414                 /* always free the old dgram */
1415                 kgnilnd_free_dgram(dev, dgram);
1416         }
1417 }
1418
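/* Datagram lifecycle as reconstructed from this file (sketch):
 *
 *	POSTED --> PROCESSING --> DONE --> freed
 *	   \
 *	    +--> CANCELED --> DONE (only once kgni raises the
 *	                           GNI_POST_TERMINATED event)
 */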
1419
1420 int
1421 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1422 {
1423         kgn_dgram_t             *dgram = NULL;
1424         gni_post_state_t         post_state;
1425         gni_return_t             grc;
1426         int                      rc = 0;
1427         __u64                    readyid;
1428         __u32                    remote_addr = 0, remote_id = 0;
1429         ENTRY;
1430
1431         /* Probe with the lock held. That way if we get a dgram we don't have it canceled
1432          * between finding the ready dgram and grabbing the lock to remove it from the
1433          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1434          * once it's off the list so we don't need to worry about others changing it at
1435          * that point. */
1436         spin_lock(&dev->gnd_dgram_lock);
1437         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1438         if (grc != GNI_RC_SUCCESS) {
1439                 spin_unlock(&dev->gnd_dgram_lock);
1440                 /* return 0 to indicate nothing happened */
1441                 RETURN(0);
1442         }
1443
1444         CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
1445                 readyid, dev);
1446
1447         dgram = (kgn_dgram_t *)readyid;
1448
1449         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1450                  "dgram 0x%p from id %#llx with bad magic %x\n",
1451                  dgram, readyid, dgram->gndg_magic);
1452
1453         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1454                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1455                  "dgram 0x%p with bad state %s\n",
1456                  dgram, kgnilnd_dgram_state2str(dgram));
1457
1458         LASSERTF(!list_empty(&dgram->gndg_list),
1459                  "dgram 0x%p with bad list state %s type %s\n",
1460                  dgram, kgnilnd_dgram_state2str(dgram),
1461                  kgnilnd_dgram_type2str(dgram));
1462
1463         /* now we know that the datagram structure is ok, so pull off list */
1464         list_del_init(&dgram->gndg_list);
1465
1466         /* while we have the gnd_dgram_lock and BEFORE we call test_by_id
1467          * change the state from POSTED to PROCESSING to ensure that
1468          * nobody cancels it after we've pulled it from the wire */
1469         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1470                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1471         }
1472
1473         LASSERTF(dgram->gndg_conn != NULL,
1474                 "dgram 0x%p with NULL conn\n", dgram);
1475
1476         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1477                                              (__u64)dgram, &post_state,
1478                                              &remote_addr, &remote_id);
1479
1480         /* we now "own" this datagram */
1481         spin_unlock(&dev->gnd_dgram_lock);
1482
1483         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1484                  " id %llu was ready\n", readyid);
1485
1486         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1487                 "remote_addr %u remote_id %u\n", grc, dgram,
1488                 kgnilnd_dgram_type2str(dgram),
1489                 post_state, remote_addr, remote_id);
1490
1491         if (unlikely(grc != GNI_RC_SUCCESS)) {
1492                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1493                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1494                         grc);
1495                 rc = -EINVAL;
1496                 GOTO(probe_for_out, rc);
1497         }
1498
1499         rc = kgnilnd_process_dgram(dgram, post_state);
1500
1501         /* probe should never find a dgram for us that turns out to be a
1502          * WC dgram still in the middle of processing */
1503         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1504                  rc, dgram, post_state);
1505
1506         if (rc == 0) {
1507                 /* dgram is good enough for the data to be used */
1508                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1509                 /* fake rc to mark that we've done something */
1510                 rc = 1;
1511         } else {
1512                 /* let kgnilnd_release_dgram take care of canceled dgrams */
1513                 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1514                         dgram->gndg_state = GNILND_DGRAM_DONE;
1515                 }
1516         }
1517
1518         *dgramp = dgram;
1519         RETURN(rc);
1520
1521 probe_for_out:
1522
1523         kgnilnd_release_dgram(dev, dgram, 0);
1524         RETURN(rc);
1525 }
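
/* Illustrative caller pattern (a sketch; compare kgnilnd_wait_for_canceled_dgrams
 * below): rc == 0 means nothing was ready, rc != 0 means a dgram was pulled
 * and the caller cleans it up via kgnilnd_release_dgram():
 *
 *	rc = kgnilnd_probe_for_dgram(dev, &dgram);
 *	if (rc != 0)
 *		kgnilnd_release_dgram(dev, dgram, 1);
 */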
1526
1527 int
1528 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1529 {
1530         /* if kgn_nwildcard is zero, return error */
1531         int     rc = -ENOENT, i;
1532         ENTRY;
1533
1534         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1535                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1536                 if (rc < 0) {
1537                         CERROR("error %d: could not post wildcard datagram # %d\n",
1538                                 rc, i);
1539                         rc = -EINVAL;
1540                         GOTO(failed, rc);
1541                 }
1542         }
1543
1544 failed:
1545         RETURN(rc);
1546 }
1547
1548 int
1549 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1550 {
1551         kgn_dgram_t *dg, *dgN;
1552         LIST_HEAD(zombies);
1553         int i;
1554         ENTRY;
1555
1556         /* we want to cancel any outstanding dgrams - we don't want to rely
1557          * on del_peer_or_conn catching all of them. This helps protect us in cases
1558          * where we don't quite keep the peer->dgram mapping in sync due to some
1559          * race conditions */
1560
1561         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1562                  "called with LND invalid state: net shutdown %d "
1563                  "in reset %d\n", net->gnn_shutdown,
1564                  kgnilnd_data.kgn_in_reset);
1565
1566         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1567
1568         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1569                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1570
1571                         /* skip nids that are not on our net or that are wildcards */
1572
1573
1574                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1575                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1576                                 continue;
1577
1578                         kgnilnd_cancel_dgram_locked(dg);
1579                 }
1580         }
1581
1582         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1583
1584         RETURN(0);
1585 }
1586
1587 int
1588 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1589 {
1590         kgn_dgram_t *dg, *dgN;
1591         LIST_HEAD(zombies);
1592         ENTRY;
1593
1594         /* Time to kill the outstanding WCs.
1595          * WCs exist on net 0 only but match on any net...
1596          */
1597
1598         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1599                 "called with LND invalid state: WC shutdown %d "
1600                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1601                 kgnilnd_data.kgn_in_reset);
1602
1603         spin_lock(&dev->gnd_dgram_lock);
1604
1605         do {
1606                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1607                 if (dg != NULL) {
1608                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1609                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1610                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1611                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1612
1613                         kgnilnd_cancel_dgram_locked(dg);
1614
1615                         /* WC could be DONE already, check and if so add to list to be released */
1616                         if (dg->gndg_state == GNILND_DGRAM_DONE)
1617                                 list_move_tail(&dg->gndg_list, &zombies);
1618                 }
1619         } while (dg != NULL);
1620
1621         spin_unlock(&dev->gnd_dgram_lock);
1622
1623         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1624                 list_del_init(&dg->gndg_list);
1625                 kgnilnd_release_dgram(dev, dg, 1);
1626         }
1627         RETURN(0);
1628
1629 }
1630
1631 int
1632 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1633 {
1634         kgn_dgram_t *dg, *dgN;
1635         int i;
1636         ENTRY;
1637
1638         /* Cancel any outstanding non-wildcard datagrams regardless
1639          * of which net they are on, as we are in base shutdown and
1640          * don't care about connecting anymore.
1641          */
1642
1643         LASSERTF(kgnilnd_data.kgn_wc_kill == 1, "We didn't get called from base shutdown\n");
1644
1645         spin_lock(&dev->gnd_dgram_lock);
1646
1647         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
1648                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1649                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1650                                 kgnilnd_cancel_dgram_locked(dg);
1651                 }
1652         }
1653
1654         spin_unlock(&dev->gnd_dgram_lock);
1655
1656         RETURN(0);
1657 }
1658
1659
1660 void
1661 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1662 {
1663         int             i = 4;
1664         int             rc;
1665         gni_return_t    grc;
1666         __u64           readyid;
1667         kgn_dgram_t    *dgram;
1668
1669         /* use a do/while so at least one check runs, allowing the
1670          * regression test for bug 762072 to hit the bug if it is there */
1671
1672         /* This function races with the dgram mover during shutdown so it is possible for
1673          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1674          * dgram mover thread instead of inside of this function.
1675          */
1676
1677         /* This should only be called from within shutdown, baseshutdown, or stack reset.
1678          * there are no assertions here to verify since base_shutdown has nothing in it we can check
1679          * the net is gone by then.
1680          */
1681
1682         do {
1683                 i++;
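                /* (i & -i) isolates the lowest set bit, so the test below is
                 * true only when i is a power of two - the warning fires at
                 * exponentially spaced intervals rather than on every pass */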
1684                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1685                         "Waiting for %d canceled datagrams to clear on device %d\n",
1686                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1687
1688                 /* check every 250ms */
1689                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1690                        250, &readyid);
1691
1692                 if (grc != GNI_RC_SUCCESS)
1693                         continue;
1694
1695                 CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
1696                         readyid, dev->gnd_id, dev);
1697
1698                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1699                 if (rc != 0) {
1700                         /* if we got a valid dgram or one that is now done, clean up */
1701                         kgnilnd_release_dgram(dev, dgram, 1);
1702                 }
1703         } while (atomic_read(&dev->gnd_canceled_dgrams));
1704 }
1705
1706 int
1707 kgnilnd_start_connect(kgn_peer_t *peer)
1708 {
1709         int              rc = 0;
1710         /* sync point for kgnilnd_del_peer_locked - do an early check to
1711          * catch the most common hits where del_peer is done by the
1712          * time we get here */
1713         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1714                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1715         }
1716
1717         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1718         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1719                 /* raced with peer getting unlinked */
1720                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1721                 rc = ESTALE;
1722                 GOTO(out, rc);
1723         }
1724         peer->gnp_connecting = GNILND_PEER_POSTING;
1725         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1726
1727         set_mb(peer->gnp_last_dgram_time, jiffies);
1728         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1729                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1730         }
1731
1732         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1733                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1734                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1735         } else {
1736                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1737                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1738         }
1739         if (rc < 0) {
1740                 set_mb(peer->gnp_last_dgram_errno, rc);
1741                 GOTO(failed, rc);
1742         }
1743
1744         /* while we're posting someone could have decided this peer/dgram needed to
1745          * die a quick death, so we check for state change and process accordingly */
1746
1747         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1748         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1749                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1750                         peer->gnp_connecting = GNILND_PEER_KILL;
1751                 }
1752                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1753                 /* positive RC to avoid dgram cleanup - we'll have to
1754                  * wait for the kgni GNI_POST_TERMINATED event to
1755                  * finish cleaning up */
1756                 rc = ESTALE;
1757                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1758                 GOTO(out, rc);
1759         }
1760         peer->gnp_connecting = GNILND_PEER_POSTED;
1761         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1762         /* reaper thread will take care of any timeouts */
1763         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1764                libcfs_nid2str(peer->gnp_nid), rc);
1765
1766         RETURN(rc);
1767
1768 failed:
1769         CDEBUG(D_NET, "connect to %s failed: rc %d\n",
1770                libcfs_nid2str(peer->gnp_nid), rc);
1771 out:
1772         RETURN(rc);
1773 }
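
/* Peer connect state flow driven above (sketch): GNILND_PEER_CONNECT (queued
 * via kgnilnd_launch_tx) -> GNILND_PEER_POSTING (dgram being posted) ->
 * GNILND_PEER_POSTED (waiting for the dgram to complete). A del_peer racing
 * in moves the peer to GNILND_PEER_NEEDS_DEATH/GNILND_PEER_KILL and the
 * reaper finishes the cleanup. */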
1774
1775 int
1776 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1777 {
1778         kgn_conn_t        *conn = dgram->gndg_conn;
1779         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1780         kgn_peer_t        *new_peer, *peer = NULL;
1781         kgn_tx_t          *tx;
1782         kgn_tx_t          *txn;
1783         kgn_mbox_info_t   *mbox;
1784         int                rc;
1785         int                nstale;
1786
1787         /* try to find a peer that matches the nid we got in the connreq
1788          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1789          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1790
1791         /* assume this is a new peer - it makes locking cleaner when it isn't */
1792         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1793
1794         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
1795         if (rc != 0) {
1796                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1797                 return rc;
1798         }
1799
1800         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1801
1802         /* this transfers ref from create_peer to the kgn_peer table */
1803         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1804
1805         /* if we found an existing peer, is it really ready for a new conn ? */
1806         if (peer != new_peer) {
1807                 /* if this was an active connect attempt but we can't find a peer waiting for it
1808                  * we will dump in the trash */
1809
1810                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1811                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1812                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1813                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1814                         rc = ECANCELED;
1815                         GOTO(out, rc);
1816                 }
1817
1818                 /* check to see if we can catch a connecting peer before it is
1819                  * removed from the connd_peers list - if not, we need to
1820                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1821                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1822                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1823                         if (!list_empty(&peer->gnp_connd_list)) {
1824                                 list_del_init(&peer->gnp_connd_list);
1825                                 /* drop connd ref */
1826                                 kgnilnd_peer_decref(peer);
1827                         }
1828                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1829                         /* clear rc to make sure we don't have fake error */
1830                         rc = 0;
1831                 }
1832
1833                 /* no matter what, we are no longer waiting to connect this peer now */
1834                 peer->gnp_connecting = GNILND_PEER_IDLE;
1835
1836                 /* Refuse to duplicate an existing connection (both sides might try to
1837                  * connect at once).  NB we return success!  We _are_ connected so we
1838                  * _don't_ have any blocked txs to complete with failure. */
1839                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1840                 if (rc != 0) {
1841                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1842                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1843                               libcfs_nid2str(her_nid), rc);
1844                         rc = EALREADY;
1845                         GOTO(out, rc);
1846                 }
1847         }
1848
1849         if (peer->gnp_state == GNILND_PEER_DOWN) {
1850                 CNETERR("Received connection request from down nid %s\n",
1851                         libcfs_nid2str(her_nid));
1852         }
1853
1854         peer->gnp_state = GNILND_PEER_UP;
1855         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1856
1857         /* either way with peer (new or existing), we are ok with ref counts here as the
1858          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1859          * ref for the peer table. */
1860
1861         /* at this point, the connection request is a winner */
1862
1863         /* mark 'DONE' to avoid cancel being called from release */
1864         dgram->gndg_state = GNILND_DGRAM_DONE;
1865
1866         /* initialise timestamps before reaper looks at them */
1867         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1868
1869         /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails the
1870          * reaper thread will immediately send a NOOP during the call to
1871          * kgnilnd_check_conn_timeouts_locked
1872          */
1873         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1874         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1875
1876         /* save the dgram type used to establish this connection */
1877         conn->gnc_dgram_type = dgram->gndg_type;
1878
1879         /* refs are not transferred from dgram to tables, so increment to
1880          * take ownership */
1881         kgnilnd_conn_addref(conn);
1882         kgnilnd_peer_addref(peer);
1883         conn->gnc_peer = peer;
1884         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1885
1886         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1887         list_add_tail(&conn->gnc_hashlist,
1888                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1889         kgnilnd_data.kgn_conn_version++;
1890
1891         /* Don't send NOOP if fail_loc is set
1892          */
1893         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1894                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP,
1895                                         lnet_nid_to_nid4(&peer->gnp_net->gnn_ni->ni_nid));
1896                 if (tx == NULL) {
1897                         CNETERR("can't get TX to initiate NOOP to %s\n",
1898                                 libcfs_nid2str(peer->gnp_nid));
1899                 } else {
1900                         kgnilnd_queue_tx(conn, tx);
1901                 }
1902         }
1903
1904         /* Schedule all packets blocking for a connection */
1905         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1906                 /* lock held here is the peer_conn lock */
1907                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1908                 kgnilnd_queue_tx(conn, tx);
1909         }
1910
1911         /* If this is an active connection let's mark its timestamp on the MBoX */
1912         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1913                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1914                 /* conn->gnc_last_rx is jiffies; it had better exist, as it was just set */
1915                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1916         }
1917
1918         /* Bug 765042: wake up scheduler for a race with finish_connect and
1919          * complete_conn_closed with a conn in purgatory
1920          * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1921          * we just check for set and then clear */
1922         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1923                 cfs_fail_loc = 0x0;
1924                 /* get scheduler thread moving again */
1925                 kgnilnd_schedule_device(conn->gnc_device);
1926         }
1927
1928         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1929                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1930
1931         /* make sure we reset peer reconnect interval now that we have a good conn */
1932         kgnilnd_peer_alive(peer);
1933         peer->gnp_reconnect_interval = 0;
1934
1935         /* clear the unlink attribute; if we don't clear it, kgnilnd_del_conn_or_peer
1936          * will wait on the atomic forever
1937          */
1938         if (peer->gnp_pending_unlink) {
1939                 peer->gnp_pending_unlink = 0;
1940                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1941                 CDEBUG(D_NET, "Clearing peer unlink %p\n", peer);
1942         }
1943
1944         /* add ref to make it hang around until after we drop the lock */
1945         kgnilnd_conn_addref(conn);
1946
1947         /* Once the peer_conn lock is dropped, the conn could actually move into
1948          * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1949          * lock until we are really done */
1950         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1951
1952         /* Notify LNET that we now have a working connection to this peer.
1953          * This is a Cray extension to the "standard" LND behavior.
1954          */
1955         lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid, true, true,
1956                     ktime_get_seconds());
1957
1958         /* drop our 'hold' ref */
1959         kgnilnd_conn_decref(conn);
1960
1961 out:
1962         RETURN(rc);
1963 }
1964
1965 void
1966 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1967 {
1968         int              rc = 0;
1969         ENTRY;
1970
1971         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1972
1973         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1974
1975         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1976
1977         if (rc < 0) {
1978                 CDEBUG(D_NET, "NAK to %s failed: rc %d\n", libcfs_nid2str(dst_nid), rc);
1979         }
1980         EXIT;
1981 }
1982
1983 int
1984 kgnilnd_process_nak(kgn_dgram_t *dgram)
1985 {
1986         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1987         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1988         int                errno = connreq->gncr_nakdata.gnnd_errno;
1989         kgn_peer_t        *peer;
1990         int                rc = 0;
1991
1992         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1993
1994         peer = kgnilnd_find_peer_locked(src_nid);
1995         if (peer == NULL) {
1996                 /* we likely dropped him from bad data when we processed
1997                  * the original REQ */
1998                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1999                 return -EBADSLT;
2000         }
2001
2002         /* need to check peerstamp/connstamp against the ones we find
2003          * to make sure we don't close new (and good?) conns that we
2004          * formed after this connreq failed */
2005         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
2006                 kgn_conn_t        conn;
2007
2008                 if (list_empty(&peer->gnp_conns)) {
2009                         /* assume we already processed the datagram and it
2010                          * barfed up on this side too */
2011                         CDEBUG(D_NET, "dropping NAK from %s; "
2012                                "peer %s is already not connected\n",
2013                                 libcfs_nid2str(connreq->gncr_srcnid),
2014                                 libcfs_nid2str(connreq->gncr_dstnid));
2015                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2016                         return 0;
2017                 }
2018
2019                 /* stub up a connection with the connreq XXX_stamps to allow
2020                  * us to use close_stale_conns_locked */
2021                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
2022                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
2023                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
2024                 conn.gnc_device = peer->gnp_net->gnn_dev;
2025
2026                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
2027
2028                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2029                         "closed %d connections\n",
2030                         libcfs_nid2str(connreq->gncr_srcnid),
2031                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
2032         } else {
2033                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2034
2035                 if (list_empty(&peer->gnp_connd_list)) {
2036                         /* if peer isn't on waiting list, try to find one to nuke */
2037                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2038                                                            peer->gnp_nid);
2039
2040                         if (rc) {
2041                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2042                                         "canceled pending connect request\n",
2043                                         libcfs_nid2str(connreq->gncr_srcnid),
2044                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
2045                         }
2046
2047                         /* if we can't find a waiting dgram, we just drop the NAK - the
2048                          * connect must have failed (we didn't find a conn above and didn't
2049                          * clear connecting) -- so nothing to do besides drop */
2050                 } else {
2051                         /* peer is on list, meaning it is a new connect attempt from the one
2052                          * we started that generated the NAK - so just drop NAK */
2053
2054                         /* use negative to prevent error message */
2055                         rc = -EAGAIN;
2056                 }
2057                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2058         }
2059
2060         /* success! we found a peer and at least marked pending_nak */
2061         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2062
2063         return rc;
2064 }
2065
2066 int
2067 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2068 {
2069         int                      rc;
2070
2071         rc = kgnilnd_unpack_connreq(dgram);
2072         if (rc < 0) {
2073                 if (rc != -EBADF) {
2074                         /* only NAK if we have good srcnid to use */
2075                         *needs_nak = 1;
2076                 }
2077                 goto connreq_out;
2078         }
2079
2080         switch (dgram->gndg_conn_in.gncr_type) {
2081         case GNILND_CONNREQ_REQ:
2082                 /* wire up peer & conn, send queued TX */
2083                 rc = kgnilnd_finish_connect(dgram);
2084
2085                 /* don't nak when the nid is hosed */
2086                 if (rc < 0) {
2087                         *needs_nak = 1;
2088                 }
2089
2090                 break;
2091         case GNILND_CONNREQ_NAK:
2092                 rc = kgnilnd_process_nak(dgram);
2093                 /* return early to prevent reconnect bump */
2094                 return rc;
2095         default:
2096                 CERROR("unexpected connreq type %s (%d) from %s\n",
2097                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2098                         dgram->gndg_conn_in.gncr_type,
2099                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2100                 rc = -EINVAL;
2101                 *needs_nak = 1;
2102                 break;
2103         }
2104
2105 connreq_out:
2106         RETURN(rc);
2107 }
2108
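/* Return convention below (sketch of the contract with the dgram mover):
 * 0 - nothing was ready; 1 - a dgram was pulled and fully disposed of, even
 * when the individual connreq failed (the error is reported to the peer and,
 * if needed, NAKed instead of bubbling up). */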
2109 int
2110 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2111 {
2112         int                      rc;
2113         int                      needs_nak = 0;
2114         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2115         lnet_nid_t               orig_dstnid;
2116         kgn_dgram_t             *dgram = NULL;
2117         kgn_peer_t              *peer;
2118         ENTRY;
2119
2120         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2121                 rc = 0;
2122         } else {
2123                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2124         }
2125
2126         if (rc == 0) {
2127                 RETURN(0);
2128         } else if (rc < 0) {
2129                 GOTO(inform_peer, rc);
2130         } else {
2131                 /* rc > 0 means it did something; reset for this func */
2132                 rc = 0;
2133         }
2134
2135         switch (dgram->gndg_type) {
2136         case GNILND_DGRAM_WC_REQ:
2137         case GNILND_DGRAM_REQ:
2138                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2139                 break;
2140         case GNILND_DGRAM_NAK:
2141                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2142                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2143                 break;
2144         default:
2145                 CERROR("unknown datagram type %s (%d)\n",
2146                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2147                 break;
2148         }
2149
2150         /* stash data to use after releasing current datagram */
2151         /* don't stash net - we are operating on a net already,
2152          * so the lock on rw_net_lock is sufficient */
2153
2154         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2155
2156 inform_peer:
2157         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2158
2159         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2160
2161         kgnilnd_release_dgram(dev, dgram, 0);
2162
2163         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2164                libcfs_nid2str(orig_dstnid), rc);
2165
2166         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2167          * in kgnilnd_finish_connect - if errors are from before we get to there,
2168          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2169         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2170                 /* if we have a negative rc, we want to find a peer to inform about
2171                  * the bad connection attempt. Sorry buddy, better luck next time! */
2172
2173                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2174                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2175
2176                 if (peer != NULL) {
2177                         /* add ref to make sure he stays around past the possible unlink
2178                          * so we can tell LNet about him */
2179                         kgnilnd_peer_addref(peer);
2180
2181                         /* if he still cares about the outstanding connect */
2182                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2183                                 /* check if he is on the connd list and remove.. */
2184                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2185                                 if (!list_empty(&peer->gnp_connd_list)) {
2186                                         list_del_init(&peer->gnp_connd_list);
2187                                         /* drop connd ref */
2188                                         kgnilnd_peer_decref(peer);
2189                                 }
2190                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2191
2192                                 /* clear gnp_connecting so we don't have a non-connecting peer
2193                                  * on gnd_connd_list */
2194                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2195
2196                                 set_mb(peer->gnp_last_dgram_errno, rc);
2197
2198                                 kgnilnd_peer_increase_reconnect_locked(peer);
2199                         }
2200                 }
2201                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2202
2203                 /* now that we are outside the lock, tell Mommy */
2204                 if (peer != NULL) {
2205                         kgnilnd_peer_notify(peer, rc, 0);
2206                         kgnilnd_peer_decref(peer);
2207                 }
2208         }
2209
2210         if (needs_nak) {
2211                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2212         }
2213
2214         RETURN(1);
2215 }
2216
2217 void
2218 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2219 {
2220         kgn_dgram_t    *dgram, *tmp;
2221         int             i;
2222
2223         spin_lock(&dev->gnd_dgram_lock);
2224
2225         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2226                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2227                         unsigned long            now = jiffies;
2228                         unsigned long            timeout;
2229
2230                         /* don't timeout stuff if the network is mucked or shutting down */
2231                         if (kgnilnd_check_hw_quiesce()) {
2232                                 break;
2233                         }
2234
2235                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2236                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2237                                 continue;
2238                         }
2239                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2240                                 "state %s conn 0x%p to %s age %lus\n",
2241                                 dgram, kgnilnd_dgram_type2str(dgram),
2242                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2243                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2244                                 cfs_duration_sec(now - dgram->gndg_post_time));
2245
2246                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2247
2248                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2249                                 continue;
2250
2251                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2252                                 "0x%p state %s conn 0x%p\n",
2253                                 kgnilnd_dgram_type2str(dgram),
2254                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2255                                 cfs_duration_sec(now - dgram->gndg_post_time),
2256                                 dgram, kgnilnd_dgram_state2str(dgram),
2257                                 dgram->gndg_conn);
2258
2259                         kgnilnd_cancel_dgram_locked(dgram);
2260                 }
2261         }
2262         spin_unlock(&dev->gnd_dgram_lock);
2263 }
2264
2265
2266 /* use a thread for the possibly long-blocking wait_by_id to prevent
2267  * stalling the global workqueues */
2268 int
2269 kgnilnd_dgram_waitq(void *arg)
2270 {
2271         kgn_device_t     *dev = (kgn_device_t *) arg;
2272         char              name[16];
2273         gni_return_t      grc;
2274         __u64             readyid;
2275         DEFINE_WAIT(mover_done);
2276
2277         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2278
2279         /* all gnilnd threads need to run fairly urgently */
2280         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2281
2282         /* we don't shut down until the device shuts down ... */
2283         while (!kgnilnd_data.kgn_shutdown) {
2284                 /* to quiesce or to not quiesce, that is the question */
2285                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2286                         KGNILND_SPIN_QUIESCE;
2287                 }
2288
2289                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2290
2291                 /* check once a second */
2292                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2293                                                        1000, &readyid);
2294
2295                 if (grc == GNI_RC_SUCCESS) {
2296                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2297                         kgnilnd_schedule_dgram(dev);
2298
2299                         /* wait for dgram thread to ping us before spinning again */
2300                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2301                                         TASK_INTERRUPTIBLE);
2302
2303                         /* don't sleep if we need to quiesce */
2304                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2305                                 schedule();
2306                         }
2307                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2308                 }
2309         }
2310
2311         kgnilnd_thread_fini();
2312         return 0;
2313 }
2314
2315 int
2316 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2317 {
2318         int                      did_something = 0, rc;
2319         kgn_peer_t              *peer = NULL;
2320
2321         spin_lock(&dev->gnd_connd_lock);
2322
2323         /* Active connect - we added this in kgnilnd_launch_tx */
2324         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2325                 peer = list_first_entry(&dev->gnd_connd_peers,
2326                                         kgn_peer_t, gnp_connd_list);
2327
2328                 /* ref for connd removed in if/else below */
2329                 list_del_init(&peer->gnp_connd_list);
2330
2331                 /* gnp_connecting and membership on gnd_connd_peers should be
2332                  * done coherently to avoid double adding, etc */
2333                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2334                  * to get the peer to gnp_connecting in the first place. We just need to
2335                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2336                  * BEFORE clearing gnp_connecting */
2337                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2338                          peer, libcfs_nid2str(peer->gnp_nid));
2339
2340                 spin_unlock(&dev->gnd_connd_lock);
2341
2342                 CDEBUG(D_NET, "processing connect to %s\n",
2343                        libcfs_nid2str(peer->gnp_nid));
2344
2345                 did_something += 1;
2346                 rc = kgnilnd_start_connect(peer);
2347
2348                 if (likely(rc >= 0)) {
2349                         /* 0 on success, positive on 'just drop peer' errors */
2350                         kgnilnd_peer_decref(peer);
2351                 } else if (rc == -ENOMEM) {
2352                         /* if we are out of wildcards, add back to
2353                          * connd_list - then break out and we'll try later
2354                          * if other errors, we'll bail & cancel pending tx */
2355                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2356                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2357                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2358                                 spin_lock(&dev->gnd_connd_lock);
2359                                 list_add_tail(&peer->gnp_connd_list,
2360                                               &dev->gnd_connd_peers);
2361                         } else {
2362                                 /* connecting changed while we were posting */
2363
2364                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2365                                         " state 0x%p->%s, connecting %d\n",
2366                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2367                                 peer->gnp_connecting = GNILND_PEER_KILL;
2368                                 spin_lock(&dev->gnd_connd_lock);
2369                                 /* remove the peer ref from the connd list */
2370                                 kgnilnd_peer_decref(peer);
2371                                 /* let the system handle itself */
2372                         }
2373                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2374                         /* the datagrams are a global pool,
2375                          * so break out of trying and hope some free
2376                          * up soon */
2377                         did_something -= 1;
2378                         break;
2379                 } else {
2380                         /* something bad happened, you lose */
2381                         CNETERR("could not start connecting to %s "
2382                                 "rc %d: Will retry until TX timeout\n",
2383                                libcfs_nid2str(peer->gnp_nid), rc);
2384                         /* It didn't post, so just set connecting back to zero now.
2385                          * The reaper will reattempt the connection if it needs to.
2386                          * If the peer needs death, set it so the reaper will clean up.
2387                          */
2388                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2389                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2390                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2391                                 kgnilnd_peer_increase_reconnect_locked(peer);
2392                         } else {
2393                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2394                                         " state 0x%p->%s, connecting %d\n",
2395                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2396                                 peer->gnp_connecting = GNILND_PEER_KILL;
2397                         }
2398                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2399
2400                         /* hold onto ref until we are really done - if it was
2401                          * unlinked this could result in a destroy */
2402                         kgnilnd_peer_decref(peer);
2403                 }
2404                 spin_lock(&dev->gnd_connd_lock);
2405         }
2406
2407         spin_unlock(&dev->gnd_connd_lock);
2408         RETURN(did_something);
2409 }
2410
2411 int
2412 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2413 {
2414         int did_something = 0, to_repost, i;
2415         ENTRY;
2416         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2417
2418         for (i = 0; i < to_repost; ++i) {
2419                 int     rerc;
2420                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2421                 if (rerc == 0) {
2422                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2423                         did_something += 1;
2424                 } else {
2425                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2426                                 rerc, dev->gnd_id);
2427                         break;
2428                 }
2429         }
2430
2431         RETURN(did_something);
2432 }
2433
2434 struct kgnilnd_dgram_timer {
2435         struct timer_list timer;
2436         kgn_device_t *dev;
2437 };
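
/* The timer callback only receives the timer_list pointer, so the device
 * rides along in this wrapper and cfs_from_timer() (a container_of-style
 * helper) recovers it. Minimal sketch of the pattern, as used by the mover
 * below:
 *
 *	struct kgnilnd_dgram_timer t = { .dev = dev };
 *
 *	cfs_timer_setup(&t.timer, kgnilnd_dgram_poke_with_stick, dev, 0);
 *	mod_timer(&t.timer, jiffies + timeout);
 */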
2438
2439 static void
2440 kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
2441 {
2442         struct kgnilnd_dgram_timer *t = cfs_from_timer(t, arg, timer);
2443
2444         wake_up(&t->dev->gnd_dgram_waitq);
2445 }
2446
2447 /* use single thread for dgrams - should be sufficient for performance */
2448 int
2449 kgnilnd_dgram_mover(void *arg)
2450 {
2451         kgn_device_t            *dev = (kgn_device_t *)arg;
2452         char                     name[16];
2453         int                      rc, did_something;
2454         unsigned long            next_purge_check = jiffies - 1;
2455         unsigned long            timeout;
2456         struct kgnilnd_dgram_timer timer;
2457         unsigned long deadline = 0;
2458         DEFINE_WAIT(wait);
2459
2460         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2461
2462         /* all gnilnd threads need to run fairly urgently */
2463         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2464
2465         /* we are ok not locking these variables, as the dgram waitq threads
2466          * will block both on tying up the net (kgn_shutdown) and on the completion
2467          * event for the dgram_waitq (kgn_quiesce_trigger) */
2468         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2469         while (!kgnilnd_data.kgn_shutdown) {
2470                 /* Safe: kgn_shutdown only set when quiescent */
2471
2472                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2473                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2474                  * quiesce check so that it'll go right into that and not do any
2475                  * dgram mucking */
2476                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2477
2478                 /* to quiesce or to not quiesce, that is the question */
2479                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2480                         KGNILND_SPIN_QUIESCE;
2481                 }
2482                 did_something = 0;
2483
2484                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2485
2486                 /* process any newly completed dgrams */
2487                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2488
2489                 rc = kgnilnd_probe_and_process_dgram(dev);
2490                 if (rc > 0) {
2491                         did_something += rc;
2492                 }
2493
2494                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2495
2496                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2497                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2498                 /* start new outbound dgrams */
2499                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2500
2501                 /* find dead dgrams */
2502                 if (time_after_eq(jiffies, next_purge_check)) {
2503                         /* these don't need to be checked that often */
2504                         kgnilnd_reaper_dgram_check(dev);
2505
2506                         next_purge_check = (long) jiffies +
2507                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2508                 }
2509
2510                 did_something += kgnilnd_repost_wc_dgrams(dev);
2511
2512                 /* careful with the jiffy wrap... */
2513                 timeout = (long)(next_purge_check - jiffies);
2514
2515                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2516                        did_something, timeout, next_purge_check, jiffies);
2517
2518                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2519                         did_something = 0;
2520                         continue;
2521                 }
2522
2523                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2524
2525                 cfs_timer_setup(&timer.timer,
2526                                 kgnilnd_dgram_poke_with_stick,
2527                                 dev, 0);
2528                 timer.dev = dev;
2529                 mod_timer(&timer.timer, (long) jiffies + timeout);
2530
2531                 /* last second chance for others to poke us */
2532                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2533
2534                 /* check flag variables before committing even if we
2535                  * did something; if we are after the deadline call
2536                  * schedule */
2537                 if ((!did_something || time_after(jiffies, deadline)) &&
2538                     !kgnilnd_data.kgn_shutdown &&
2539                     !kgnilnd_data.kgn_quiesce_trigger) {
2540                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2541                                timeout, cfs_duration_sec(timeout));
2542                         wake_up(&dev->gnd_dgping_waitq);
2543                         schedule();
2544                         CDEBUG(D_INFO, "awake after schedule\n");
2545                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2546                 }
2547
2548                 del_singleshot_timer_sync(&timer.timer);
2549                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2550         }
2551
2552         kgnilnd_thread_fini();
2553         return 0;
2554 }