1 // SPDX-License-Identifier: GPL-2.0
3 /* Copyright (C) 2012 Cray, Inc.
5 * Copyright (c) 2014, Intel Corporation.
8 /* This file is part of Lustre, http://www.lustre.org.
10 * Author: Nic Henke <nic@cray.com>
11 * Author: James Shimek <jshimek@cray.com>
15 #include <linux/swap.h>
/* Fill in the SMSG attributes that do not depend on a particular mailbox:
 * the credit limit comes from the kgn_mbox_credits tunable, the maximum
 * message size is GNILND_MAX_MSG_SIZE and the mailbox type is
 * GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT.
 *
 * smsg_attr: attribute structure to update in place.
 *
 * Only the three fields below are written here; kgnilnd_find_free_mbox()
 * fills in msg_buffer, mbox_offset, mem_hndl and buff_size after calling
 * this helper. */
18 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
20 smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
21 smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
22 smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
/* Register the memory behind fma_blk with the GNI device so it can be used
 * for SMSG mailboxes.
 *
 * device:  device whose gnd_handle and gnd_rcv_fma_cqh are used for the
 *          registration and whose counters are updated.
 * fma_blk: block to map; its gnm_hndl must still be clean (qword1 == 0),
 *          which is asserted before registering.
 *
 * The registration is always GNI_MEM_READWRITE; blocks in state
 * GNILND_FMABLK_PHYS additionally request GNI_MEM_PHYS_CONT. The hold
 * timeout of the block is reset to 0 before mapping.
 *
 * If kgnilnd_mem_register() fails and the kgn_reg_fail_timeout tunable is
 * not GNILND_REGFAILTO_DISABLE, a deadline (reg_to) derived from that
 * tunable is checked and the LASSERTF fires once jiffies has passed it. The
 * failure is also reported with CNETERR.
 * NOTE(review): reg_to is a function-static shared by every caller; the
 * lines that arm and clear it, and the value returned on failure, are not
 * visible in this listing - confirm against the full file.
 *
 * After a successful registration gnd_nbytes_map grows by the block size
 * for non-PHYS blocks only, while gnd_n_mdd and gnd_nfmablk are incremented
 * for every block. */
26 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
29 __u32 flags = GNI_MEM_READWRITE;
30 static unsigned long reg_to;
31 int rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
33 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
34 flags |= GNI_MEM_PHYS_CONT;
37 fma_blk->gnm_hold_timeout = 0;
39 /* make sure we are mapping a clean block */
40 LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL,
41 "fma_blk %px dirty\n", fma_blk);
43 rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
44 fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
45 flags, &fma_blk->gnm_hndl);
46 if (rrc != GNI_RC_SUCCESS) {
47 if (rfto != GNILND_REGFAILTO_DISABLE) {
49 reg_to = jiffies + cfs_time_seconds(rfto);
51 LASSERTF(!time_after(jiffies, reg_to),
52 "FATAL:fmablk registration has failed for %ld seconds.\n",
53 cfs_duration_sec(jiffies - reg_to) + rfto);
56 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
57 fma_blk, fma_blk->gnm_mbox_size, flags);
63 /* PHYS_CONT memory isn't really mapped, at least not in GART -
64 * but all mappings chew up a MDD
66 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
67 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
70 atomic_inc(&device->gnd_n_mdd);
71 /* nfmablk is live (mapped) blocks */
72 atomic_inc(&device->gnd_nfmablk);
/* Allocate one FMA memory block (a slab of SMSG mailboxes), register it
 * with the device and append it to device->gnd_fma_buffs.
 *
 * device:   device that will own the new block.
 * use_phys: the callers in this file pass 1 from
 *           kgnilnd_allocate_phys_fmablk() and 0 from kgnilnd_setup_mbox().
 *           The line that branches on it is not visible in this listing;
 *           presumably it selects between the two backing stores below.
 *
 * Backing stores:
 *  - GNILND_FMABLK_PHYS: one object from kgnilnd_data.kgn_mbox_cache
 *    (GFP_ATOMIC); the block size is GNILND_MBOX_SIZE and the mailbox count
 *    is derived as blk_size / mbox_size.
 *  - GNILND_FMABLK_VIRT: kgnilnd_vzalloc() memory; the mailbox count comes
 *    from the kgn_mbox_per_block tunable and the block size is
 *    num_mbox * mbox_size.
 * The per-mailbox size is what gni_smsg_buff_size_needed() reports plus
 * GNILND_MBOX_PAYLOAD.
 *
 * Concurrency: gnd_fmablk_vers is sampled before and after taking
 * gnd_fmablk_mutex. If it changed, the mutex is dropped without allocating
 * so the caller can rescan the list. The finished block is linked in and
 * the version is bumped under gnd_fmablk_lock.
 *
 * Besides the mailbox memory, a bitmap (gnm_bit_array) and a per-mailbox
 * debug array (gnm_mbox_info) are allocated, and the block is mapped with
 * kgnilnd_map_fmablk(). The trailing frees release those resources in the
 * reverse order of allocation; the labels, checks and return codes that
 * select them are not visible in this listing. */
78 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
82 kgn_fma_memblock_t *fma_blk;
83 gni_smsg_attr_t smsg_attr;
84 unsigned long fmablk_vers;
86 #if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
87 /* We allocate large blocks of memory here potentially leading
88 * to memory exhaustion during massive reconnects during a network
89 * outage. Limit the amount of fma blocks to use by always keeping
90 * a percent of pages free initially set to 25% of total memory. */
91 if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
92 LCONSOLE_INFO("Exceeding free page limit of %ld. "
93 "Free pages available %ld\n",
94 kgnilnd_data.free_pages_limit,
99 /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
100 * to this allocation code. Everyone will sample the version
101 * before and after getting the mutex. If it has changed,
102 * we'll bail out to check the lists again - this indicates that
103 * some sort of change was made to the lists and it is possible
104 * that there is a mailbox for us to find now. This should prevent
105 * a ton of spinning in the case where there are lots of threads
106 * that need a yet-to-be-allocated mailbox for a connection. */
108 fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
109 mutex_lock(&device->gnd_fmablk_mutex);
111 if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
112 /* version changed while we were waiting for semaphore,
113 * we'll recheck the lists assuming something nice happened */
114 mutex_unlock(&device->gnd_fmablk_mutex);
118 LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
119 if (fma_blk == NULL) {
120 CNETERR("could not allocate fma block descriptor\n");
125 INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
127 kgnilnd_setup_smsg_attr(&smsg_attr);
129 gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
131 LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
133 /* gni_smsg_buff_size_needed calculates the base mailbox size and since
134 * we want to hold kgn_peer_credits worth of messages in both directions,
135 * we add PAYLOAD to grow the mailbox size
138 fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
140 /* we'll only use physical during preallocate at startup -- this keeps it nice and
141 * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
142 * as reallocating them is tough if there is memory fragmentation */
145 fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
146 if (fma_blk->gnm_block == NULL) {
147 CNETERR("could not allocate physical SMSG mailbox memory\n");
151 fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
152 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
154 LASSERTF(num_mbox >= 1,
155 "num_mbox %d blk_size %u mbox_size %d\n",
156 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
158 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
161 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
162 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
164 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
165 "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
166 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
167 *kgnilnd_tunables.kgn_mbox_per_block);
169 fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
170 if (fma_blk->gnm_block == NULL) {
171 CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
176 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
179 /* allocate just enough space for the bits to track the mailboxes */
180 CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
181 if (fma_blk->gnm_bit_array == NULL) {
182 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
183 sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
187 bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
189 /* now that the num_mbox is set based on allocation type, get debug
192 CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
193 if (fma_blk->gnm_mbox_info == NULL) {
194 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
195 sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
200 rc = kgnilnd_map_fmablk(device, fma_blk);
205 fma_blk->gnm_next_avail_mbox = 0;
206 fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
208 CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
209 "mbox_size %d MDD %#llx.%#llx\n",
210 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
211 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
212 fma_blk->gnm_hndl.qword2);
214 /* lock is protecting data structures, not semaphore */
216 spin_lock(&device->gnd_fmablk_lock);
217 list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
219 /* toggle under the lock so once they change the list is also
220 * ready for others to traverse */
221 atomic_inc(&device->gnd_fmablk_vers);
223 spin_unlock(&device->gnd_fmablk_lock);
225 mutex_unlock(&device->gnd_fmablk_mutex);
230 CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
232 CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
234 if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
235 kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
237 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
240 LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
242 mutex_unlock(&device->gnd_fmablk_mutex);
/* Deregister the memory handle of fma_blk and update the device accounting.
 *
 * dev:     device that owns the block (handle and counters).
 * fma_blk: block to unmap.
 *
 * A hold timeout of GNILND_TIMEOUT2DEADMAN is requested when the block still
 * has held mailboxes and the LND is not shutting down, or whenever a stack
 * reset is in progress. gnd_fmablk_vers is bumped before the deregister
 * call, and a failed deregister trips the LASSERTF.
 *
 * Accounting visible below: gnd_n_mdd_held is incremented when a hold
 * timeout is set, except for PHYS blocks during reset. gnd_n_mdd is
 * decremented on the following line, but the line between the two updates
 * is missing from this listing - presumably an else, TODO confirm.
 * Non-PHYS blocks give their size back to gnd_nbytes_map and move to
 * GNILND_FMABLK_IDLE; PHYS blocks keep their state and only have the handle
 * zeroed during reset so they can be mapped again. gnd_nfmablk is always
 * decremented.
 *
 * The callers in this file invoke this with gnd_fmablk_lock held. */
247 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
251 /* if some held, set hold_timeout from conn timeouts used in this block
252 * but not during shutdown, then just nuke and pave
253 * During a stack reset, we need to deregister with a hold timeout
254 * set so we don't use the same mdd after reset is complete */
255 if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
256 kgnilnd_data.kgn_in_reset) {
257 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
260 /* we are changing the state of a block, tickle version to tell
261 * proc code list is stale now */
262 atomic_inc(&dev->gnd_fmablk_vers);
264 rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
266 CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
267 "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
269 fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
270 fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
271 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
272 fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
274 LASSERTF(rrc == GNI_RC_SUCCESS,
275 "tried to double unmap or something bad, fma_blk %px (rrc %d)\n",
278 if (fma_blk->gnm_hold_timeout &&
279 !(kgnilnd_data.kgn_in_reset &&
280 fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
281 atomic_inc(&dev->gnd_n_mdd_held);
283 atomic_dec(&dev->gnd_n_mdd);
286 /* PHYS blocks don't get mapped */
287 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
288 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
289 fma_blk->gnm_state = GNILND_FMABLK_IDLE;
290 } else if (kgnilnd_data.kgn_in_reset) {
291 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
292 * re-use the fma_blk after reset so we don't have to drop/allocate
293 * all of those physical blocks */
294 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
297 /* Decrement here as this is the # of mapped blocks */
298 atomic_dec(&dev->gnd_nfmablk);
/* Release every resource owned by fma_blk and unlink it from the device
 * list.
 *
 * dev:     device that owns the block.
 * fma_blk: block to free; all of its mailboxes must be available again
 *          (gnm_avail_mboxs == gnm_num_mboxs), which is asserted.
 *
 * gnd_fmablk_vers is bumped first. If the block was unmapped with a hold
 * timeout, the MDD is released with kgnilnd_mem_mdd_release() unless a stack
 * reset is in progress, and gnd_n_mdd_held and gnd_n_mdd are decremented.
 * The mailbox memory goes back to kgn_mbox_cache for PHYS blocks and to
 * kgnilnd_vfree() otherwise, then the block is removed from the list and
 * its bookkeeping arrays and descriptor are freed.
 * NOTE(review): the LIBCFS_MEM_MSG for PHYS blocks reports gnm_mbox_size,
 * while the PHYS allocation sets gnm_blk_size to GNILND_MBOX_SIZE - confirm
 * which size is intended in the trace message. */
302 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
304 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
306 LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
307 "fma_blk %px@%d free in bad state (%d): blk total %d avail %d held %d\n",
308 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
309 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
311 atomic_inc(&dev->gnd_fmablk_vers);
313 if (fma_blk->gnm_hold_timeout) {
314 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
316 fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
317 fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
319 /* We leave MDD dangling over stack reset */
320 if (!kgnilnd_data.kgn_in_reset) {
321 kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
323 /* ignoring the return code - if kgni/ghal can't find it
324 * it must be released already */
325 atomic_dec(&dev->gnd_n_mdd_held);
326 atomic_dec(&dev->gnd_n_mdd);
329 /* we can't free the gnm_block until all the conns have released their
330 * purgatory holds. While we have purgatory holds, we might check the conn
331 * RX mailbox during the CLOSING process. It is possible that kgni might
332 * try to look into the RX side for credits when sending the CLOSE msg too */
333 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
334 LIBCFS_MEM_MSG(fma_blk->gnm_block, fma_blk->gnm_mbox_size, "free");
335 kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
337 kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
339 fma_blk->gnm_state = GNILND_FMABLK_FREED;
341 list_del(&fma_blk->gnm_bufflist);
343 CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
344 CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
345 BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
346 LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
/* Claim a free SMSG mailbox for conn from the blocks already on the device
 * list.
 *
 * conn: connection that needs a mailbox; on success gnc_mbox_id,
 *       gnc_fma_blk and gnpr_smsg_attr are filled in.
 *
 * The list is walked under gnd_fmablk_lock. Blocks with no available
 * mailboxes, or whose state is GNILND_FMABLK_IDLE or lower, are tested by
 * the first check in the loop (presumably skipped; the continue line is not
 * visible here). The bitmap search starts at gnm_next_avail_mbox and
 * restarts from bit 0 once if it runs off the end. The chosen bit is set,
 * the next-search hint advances (wrapping to 0 after the last mailbox) and
 * the available count drops by one.
 *
 * The SMSG attributes then point at the block memory, at offset
 * mbox_size * id, using the block's memory handle; the handle must be
 * non-zero, i.e. the block must be mapped. The debug record for the mailbox
 * is stamped and the mailbox memory is zeroed before use.
 *
 * No result is visible as a return value in this listing; the caller in
 * this file detects failure by smsg_attr->msg_buffer still being NULL. */
350 kgnilnd_find_free_mbox(kgn_conn_t *conn)
352 kgn_device_t *dev = conn->gnc_device;
353 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
354 kgn_fma_memblock_t *fma_blk;
355 kgn_mbox_info_t *mbox = NULL;
358 spin_lock(&dev->gnd_fmablk_lock);
360 list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
362 if (fma_blk->gnm_avail_mboxs <= 0 ||
363 fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
366 /* look in bitarray for available mailbox */
368 id = find_next_zero_bit(
369 fma_blk->gnm_bit_array,
370 fma_blk->gnm_num_mboxs,
371 fma_blk->gnm_next_avail_mbox);
372 if (id == fma_blk->gnm_num_mboxs &&
373 fma_blk->gnm_next_avail_mbox != 0) {
375 fma_blk->gnm_next_avail_mbox = 0;
381 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
382 id, fma_blk->gnm_num_mboxs);
383 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
384 conn->gnc_mbox_id = id;
386 fma_blk->gnm_next_avail_mbox =
387 (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
388 fma_blk->gnm_avail_mboxs--;
389 conn->gnc_fma_blk = fma_blk;
391 kgnilnd_setup_smsg_attr(smsg_attr);
393 smsg_attr->msg_buffer = fma_blk->gnm_block;
394 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
395 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
396 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
398 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
399 * reset and re-use the same fma_blk after stack reset. This ensures we've
400 * properly mapped it before we use it */
401 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL,
402 "unmapped fma_blk %px, state %d\n",
403 fma_blk, fma_blk->gnm_state);
405 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
406 "allocating SMSG mbox %d buf %p "
407 "offset %u hndl %#llx.%#llx\n",
408 conn, smsg_attr, fma_blk, id,
409 smsg_attr->msg_buffer, smsg_attr->mbox_offset,
410 fma_blk->gnm_hndl.qword1,
411 fma_blk->gnm_hndl.qword2);
413 mbox = &fma_blk->gnm_mbox_info[id];
414 mbox->mbx_create_conn_memset = jiffies;
416 mbox->mbx_nallocs_total++;
418 /* zero mbox to remove any old data from our last use.
419 * this better be safe, if not our purgatory timers
420 * are too short or a peer really is misbehaving */
421 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
422 0, smsg_attr->buff_size);
426 spin_unlock(&dev->gnd_fmablk_lock);
/* Obtain an SMSG mailbox for conn, growing the pool when necessary.
 *
 * conn: connection whose gnpr_smsg_attr is to be populated.
 *
 * msg_buffer is cleared first so it doubles as the "found one" flag. Each
 * pass tries kgnilnd_find_free_mbox(); if that leaves msg_buffer NULL, a new
 * block is requested with kgnilnd_alloc_fmablk(dev, 0). The loop repeats
 * while msg_buffer is still NULL. A failure is reported with CNETERR; the
 * lines that test err, leave the loop and return are not visible in this
 * listing. */
430 kgnilnd_setup_mbox(kgn_conn_t *conn)
432 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
435 smsg_attr->msg_buffer = NULL;
436 /* Look for available mbox */
438 kgnilnd_find_free_mbox(conn);
440 /* nothing in the existing buffers, make a new one */
441 if (smsg_attr->msg_buffer == NULL) {
442 /* for runtime allocations, we only want vmalloc */
443 err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
448 } while (smsg_attr->msg_buffer == NULL);
451 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
/* Give back, or place on purgatory hold, the mailbox owned by conn.
 *
 * conn:           connection whose mailbox (gnc_mbox_id in gnc_fma_blk) is
 *                 being released. A NULL msg_buffer is tested first, for a
 *                 conn whose mailbox setup failed; the early exit line is
 *                 not visible in this listing.
 * purgatory_hold: == 0 frees the mailbox (available count +1);
 *                 >  0 places it on hold (held count +1);
 *                 <  0 releases a previous hold (held -1, available +1).
 *
 * Everything runs under gnd_fmablk_lock. The owning block is located by
 * pointer identity on the device list rather than by buffer address; the
 * existing comment below explains the false match this avoids.
 *
 * For purgatory_hold <= 0 the endpoint must already be destroyed
 * (gnc_ephandle == NULL), the mailbox bit is cleared and conn->gnc_fma_blk
 * is reset to NULL.
 *
 * Non-PHYS blocks are unmapped once every mailbox is either available or
 * held and no hold timeout has been set yet, and are freed once all
 * mailboxes are available and none is held. PHYS blocks are not touched on
 * this path. */
457 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
459 kgn_device_t *dev = conn->gnc_device;
460 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
461 kgn_fma_memblock_t *fma_blk = NULL;
462 kgn_mbox_info_t *mbox = NULL;
466 /* if we failed to setup mbox and now destroying conn */
467 if (smsg_attr->msg_buffer == NULL) {
471 id = conn->gnc_mbox_id;
473 spin_lock(&dev->gnd_fmablk_lock);
474 /* make sure our conn points at a valid fma_blk
475 * We use this instead of a mem block search out of smsg_attr
476 * because we could have freed a block for fma_blk #1 but the fma_blk
477 * is still in the list for a purgatory hold. This would induce a false
478 * match if that same block gets reallocated to fma_blk #2 */
479 list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
480 if (fma_blk == conn->gnc_fma_blk) {
486 "unable to find conn 0x%p with gnc_fma_blk %px anywhere in the world\n",
487 conn, conn->gnc_fma_blk);
489 LASSERTF(id < fma_blk->gnm_num_mboxs,
490 "bad id %d max %d\n",
491 id, fma_blk->gnm_num_mboxs);
493 /* < 0 - was held, now free it
494 * == 0 - just free it
495 * > 0 - hold it for now */
496 if (purgatory_hold == 0) {
497 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
498 "hndl %#llx.%#llx\n",
499 conn, smsg_attr, fma_blk, id,
500 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
501 fma_blk->gnm_avail_mboxs++;
503 } else if (purgatory_hold > 0) {
504 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
505 "hndl %#llx.%#llx\n",
506 conn, smsg_attr, fma_blk, id,
507 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
509 fma_blk->gnm_held_mboxs++;
510 fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
513 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
514 "hndl %#llx.%#llx\n",
515 conn, smsg_attr, fma_blk, id,
516 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
518 fma_blk->gnm_held_mboxs--;
519 fma_blk->gnm_avail_mboxs++;
522 if (purgatory_hold <= 0) {
523 /* if kgni is retransmitting, freeing the smsg block before the EP
524 * is destroyed gets messy. Bug 768295. */
525 LASSERTF(conn->gnc_ephandle == NULL,
526 "can't release mbox before EP is nuked. conn 0x%p\n", conn);
528 mbox = &fma_blk->gnm_mbox_info[id];
529 mbox->mbx_release_from_purgatory = jiffies;
531 /* clear conn gnc_fmablk if it is gone - this allows us to
532 * not worry about state so much in kgnilnd_destroy_conn
533 * and makes the guaranteed cleanup of the resources easier */
534 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
535 "conn %px bit %d already cleared in fma_blk %px\n",
537 conn->gnc_fma_blk = NULL;
541 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
542 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
543 "as mapped\n", fma_blk);
544 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
547 /* we don't release or unmap PHYS blocks as part of the normal cycle --
548 * those are controlled manually from startup/shutdown */
549 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
550 /* we can unmap once all are unused (held or avail)
551 * but check hold_timeout to make sure we are not trying to double
552 * unmap this buffer. If there was no hold_timeout set due to
553 * held_mboxs, we'll free the mbox here shortly and won't have to
554 * worry about catching a double free for a 'clean' fma_blk */
555 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
556 (!fma_blk->gnm_hold_timeout)) {
557 kgnilnd_unmap_fmablk(dev, fma_blk);
560 /* But we can only free once they are all avail */
561 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
562 fma_blk->gnm_held_mboxs == 0) {
563 /* all mailboxes are released, free fma_blk */
564 kgnilnd_free_fmablk_locked(dev, fma_blk);
568 spin_unlock(&dev->gnd_fmablk_lock);
/* Sum gnm_num_mboxs over every block on the device list that is in state
 * GNILND_FMABLK_PHYS. The walk is done under gnd_fmablk_lock.
 *
 * device: device whose gnd_fma_buffs list is scanned.
 *
 * The accumulator i is presumably initialised to 0 and returned; neither
 * line is visible in this listing. */
572 kgnilnd_count_phys_mbox(kgn_device_t *device)
575 kgn_fma_memblock_t *fma_blk;
577 spin_lock(&device->gnd_fmablk_lock);
579 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
580 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
581 i += fma_blk->gnm_num_mboxs;
583 spin_unlock(&device->gnd_fmablk_lock);
/* Preallocate physical FMA blocks until the number of PHYS mailboxes
 * reported by kgnilnd_count_phys_mbox() reaches the kgn_nphys_mbox tunable.
 * Each pass requests one block with kgnilnd_alloc_fmablk(device, 1).
 *
 * device: device to populate.
 *
 * A failed allocation is logged with CERROR together with the count reached
 * so far; the test on rc and the return statements are not visible in this
 * listing. */
589 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
593 while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
595 rc = kgnilnd_alloc_fmablk(device, 1);
597 CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
598 kgnilnd_count_phys_mbox(device), rc);
/* Map every block in state GNILND_FMABLK_PHYS on the device list by calling
 * kgnilnd_map_fmablk() on it. Both gnd_fmablk_mutex and gnd_fmablk_lock are
 * held for the whole walk.
 *
 * device: device whose PHYS blocks are mapped.
 *
 * How a non-zero rc from kgnilnd_map_fmablk() is handled is not visible in
 * this listing.
 * NOTE(review): the registration runs with a spinlock held - confirm that
 * kgnilnd_mem_register() cannot sleep. */
606 kgnilnd_map_phys_fmablk(kgn_device_t *device)
610 kgn_fma_memblock_t *fma_blk;
612 /* use mutex to gate access to single thread, just in case */
613 mutex_lock(&device->gnd_fmablk_mutex);
615 spin_lock(&device->gnd_fmablk_lock);
617 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
618 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
619 rc = kgnilnd_map_fmablk(device, fma_blk);
624 spin_unlock(&device->gnd_fmablk_lock);
626 mutex_unlock(&device->gnd_fmablk_mutex);
/* Call kgnilnd_unmap_fmablk() on every block on the device list, whatever
 * its state, with gnd_fmablk_mutex and gnd_fmablk_lock held for the whole
 * walk.
 *
 * device: device whose blocks are unmapped. */
632 kgnilnd_unmap_fma_blocks(kgn_device_t *device)
635 kgn_fma_memblock_t *fma_blk;
637 /* use mutex to gate access to single thread, just in case */
638 mutex_lock(&device->gnd_fmablk_mutex);
640 spin_lock(&device->gnd_fmablk_lock);
642 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
643 kgnilnd_unmap_fmablk(device, fma_blk);
645 spin_unlock(&device->gnd_fmablk_lock);
647 mutex_unlock(&device->gnd_fmablk_mutex);
/* Free every block in state GNILND_FMABLK_PHYS on the device list with
 * kgnilnd_free_fmablk_locked(). The _safe list iterator is used because the
 * callee unlinks and frees the entry being visited. gnd_fmablk_mutex and
 * gnd_fmablk_lock are held for the whole walk.
 *
 * device: device whose PHYS blocks are released. */
651 kgnilnd_free_phys_fmablk(kgn_device_t *device)
654 kgn_fma_memblock_t *fma_blk, *fma_blkN;
656 /* use mutex to gate access to single thread, just in case */
657 mutex_lock(&device->gnd_fmablk_mutex);
659 spin_lock(&device->gnd_fmablk_lock);
661 list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
662 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
663 kgnilnd_free_fmablk_locked(device, fma_blk);
665 spin_unlock(&device->gnd_fmablk_lock);
667 mutex_unlock(&device->gnd_fmablk_mutex);
670 /* kgnilnd dgram nid->struct management */
/* Map a NID to its datagram hash bucket: the NID is truncated to
 * unsigned int and reduced modulo the kgn_peer_hash_size tunable, and the
 * matching entry of dev->gnd_dgrams is returned.
 *
 * dev: device that owns the hash table.
 * nid: NID to hash. */
672 static inline struct list_head *
673 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
675 unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
677 RETURN(&dev->gnd_dgrams[hash]);
/* Search the hash bucket for dst_nid for a datagram addressed to that NID.
 * Entries whose outgoing gncr_dstnid differs, or whose state is beyond
 * GNILND_DGRAM_POSTED, are tested by the check in the loop (presumably
 * skipped; the continue and return lines are not visible in this listing).
 *
 * dev:     device whose datagram table is searched.
 * dst_nid: destination NID to look for. */
681 /* needs dev->gnd_dgram_lock held */
683 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
685 struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
688 list_for_each_entry(dgram, dgram_list, gndg_list) {
690 /* if state > POSTED, we are already handling cancel/completion */
691 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
692 dgram->gndg_state > GNILND_DGRAM_POSTED)
695 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
696 dgram, libcfs_nid2str(dst_nid));
/* Look up the datagram posted to dst_nid and cancel it, all under
 * gnd_dgram_lock.
 *
 * dev:     device whose datagram table is searched.
 * dst_nid: destination NID of the datagram to cancel.
 *
 * Returns 1 when no datagram was found and 0 when one was found. The guard
 * around kgnilnd_cancel_dgram_locked() for the not-found case is not
 * visible in this listing - presumably the call is skipped when dgram is
 * NULL. */
703 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
707 spin_lock(&dev->gnd_dgram_lock);
708 dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
711 kgnilnd_cancel_dgram_locked(dgram);
713 spin_unlock(&dev->gnd_dgram_lock);
715 RETURN(!!(dgram == NULL));
/* Fill in a connection request for transmission in a datagram.
 *
 * connreq: request to populate; the allocator already zeroed it.
 * conn:    connection supplying the connstamp, timeout, device host id,
 *          CQ id and, for REQ packets, the SMSG mailbox.
 * srcnid:  NID placed in gncr_srcnid.
 * dstnid:  NID placed in gncr_dstnid.
 * type:    connreq type placed in gncr_type.
 *
 * A compile-time check guarantees the structure fits in
 * GNI_DATAGRAM_MAXSIZE. The header carries the magic, both NIDs, the
 * version, the type, the peer and connection stamps and the timeout. The
 * CFS_FAIL_CHECK hooks substitute bogus values (99 or 0) for fault
 * injection; the else lines pairing each hook with the normal assignment
 * are not visible in this listing.
 *
 * For GNILND_CONNREQ_REQ the GNI parameters are added as well: host id,
 * CQ id and the SMSG attributes of a mailbox obtained from
 * kgnilnd_setup_mbox(). A mailbox failure is logged with CERROR; the return
 * path is not visible here. */
719 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
720 lnet_nid_t srcnid, lnet_nid_t dstnid,
721 kgn_connreq_type_t type)
725 /* ensure we haven't violated max datagram size */
726 BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);
728 /* no need to zero out, we do that when allocating dgram */
729 connreq->gncr_magic = GNILND_MSG_MAGIC;
731 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
733 } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
737 connreq->gncr_srcnid = srcnid;
738 connreq->gncr_dstnid = dstnid;
740 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
741 connreq->gncr_version = 99;
743 connreq->gncr_version = GNILND_CONNREQ_VERSION;
745 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
746 connreq->gncr_type = 99;
748 connreq->gncr_type = type;
750 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
751 connreq->gncr_peerstamp = 0;
753 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
755 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
756 connreq->gncr_connstamp = 0;
758 connreq->gncr_connstamp = conn->gnc_my_connstamp;
760 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
761 connreq->gncr_timeout = 0;
763 connreq->gncr_timeout = conn->gnc_timeout;
766 /* the rest pack the data into the payload in other places */
767 if (type == GNILND_CONNREQ_REQ) {
768 kgn_gniparams_t *req_params = &connreq->gncr_gnparams;
769 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
770 req_params->gnpr_cqid = conn->gnc_cqid;
772 /* allocate mailbox for this connection */
773 err = kgnilnd_setup_mbox(conn);
775 CERROR("Failed to setup FMA mailbox (%d)\n", err);
777 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
780 /* XXX Nic: TBD - checksum computation */
/* Validate and byte-swap, in place, the connection request received in
 * dgram->gndg_conn_in.
 *
 * dgram: datagram whose gndg_conn_in is unpacked; gndg_conn_out identifies
 *        whether it was an active (specific dstnid) or wildcard
 *        (LNET_NID_ANY) post.
 *
 * Steps visible below:
 *  1. The magic must be GNILND_MSG_MAGIC in native or swapped byte order.
 *     The header swab calls follow the swab assignment, presumably guarded
 *     by it (the if line is not visible in this listing).
 *  2. Active datagram: srcnid is overwritten with the NID the request was
 *     originally sent to, and a mismatch between that address and the
 *     incoming one is logged.
 *     Wildcard datagram: dstnid must not be LNET_NID_ANY, must resolve
 *     through kgnilnd_find_net() and must equal the NID of that net. The
 *     net reference is dropped with kgnilnd_net_decref() on both the
 *     mismatch and the normal path.
 *  3. The version must equal GNILND_CONNREQ_VERSION.
 *  4. For swapped packets the REQ parameters, or the NAK errno, are swabbed
 *     too. Note that msg_addr is a local copy, so only the copy is swabbed.
 *  5. Per type: REQ calls kgnilnd_set_conn_params(); NAK and CLOSE are
 *     accepted; anything else is logged as unknown.
 *  6. Zero peer or conn stamps, or a timeout below GNILND_MIN_TIMEOUT, are
 *     logged as errors.
 *
 * The error codes returned at each step are not visible in this listing.
 * Per the existing comment, nothing but -EBADF may be returned before
 * gncr_srcnid is fixed up, since that field is needed to send a NAK. */
786 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
788 kgn_connreq_t *connreq = &dgram->gndg_conn_in;
792 /* the following fields must be handled in a backwards compatible
793 * manner to ensure we can always send and interpret NAKs */
795 if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
796 connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
797 /* Unexpected magic! */
798 CERROR("Unexpected magic %08x\n",
799 connreq->gncr_magic);
803 swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
805 __swab32s(&connreq->gncr_magic);
806 __swab32s(&connreq->gncr_cksum);
807 __swab16s(&connreq->gncr_type);
808 __swab16s(&connreq->gncr_version);
809 __swab32s(&connreq->gncr_timeout);
810 __swab64s(&connreq->gncr_srcnid);
811 __swab64s(&connreq->gncr_dstnid);
812 __swab64s(&connreq->gncr_peerstamp);
813 __swab64s(&connreq->gncr_connstamp);
816 /* Do NOT return anything but -EBADF before we munge
817 * connreq->gncr_srcnid - we need that to send the nak */
819 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
820 lnet_nid_t incoming = connreq->gncr_srcnid;
822 /* even if the incoming packet is hosed, we know who we sent
823 * the original and can set the srcnid so that we can properly
824 * look up our peer to close the loop on this connreq. We still use
825 * -EBADF to prevent a NAK - just in case there are issues with
826 * the payload coming from a random spot, etc. */
827 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
829 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
830 LNET_NIDADDR(incoming)) {
831 /* we got a datagram match for the wrong nid... */
832 CERROR("matched datagram 0x%p with srcnid %s "
833 "(%x), expecting %s (%x)\n",
835 libcfs_nid2str(incoming),
836 LNET_NIDADDR(incoming),
837 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
838 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
842 /* if we have a wildcard datagram it should match an
843 * incoming "active" datagram that should have a fully formed
844 * srcnid and dstnid. If we couldn't unpack it, we drop as
845 * corrupted packet, otherwise we'll just verify that the dstnid
846 * matches the NID for the NET that the dgram was posted */
848 /* make sure their wildcard didn't match ours, that is unpossible */
849 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
850 "dgram 0x%p from %s, connreq 0x%p; "
851 "wildcard matched wildcard \n", dgram,
852 libcfs_nid2str(connreq->gncr_srcnid), connreq);
854 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
856 if (rc == -ESHUTDOWN) {
857 CERROR("Looking up network: device is in shutdown\n");
859 } else if (rc == -ENONET) {
860 CERROR("Connection data from %s: she sent "
861 "dst_nid %s, but net lookup failed on "
863 libcfs_nid2str(connreq->gncr_srcnid),
864 libcfs_nid2str(connreq->gncr_dstnid),
865 dgram, kgnilnd_dgram_type2str(dgram));
869 if (lnet_nid_to_nid4(&net->gnn_ni->ni_nid) !=
870 connreq->gncr_dstnid) {
871 CERROR("Bad connection data from %s: she sent "
872 "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
873 libcfs_nid2str(connreq->gncr_srcnid),
874 libcfs_nid2str(connreq->gncr_dstnid),
875 libcfs_nidstr(&net->gnn_ni->ni_nid),
876 dgram, kgnilnd_dgram_type2str(dgram));
877 kgnilnd_net_decref(net);
881 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
882 kgnilnd_net_decref(net);
885 if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
886 CERROR("Unexpected version %d\n", connreq->gncr_version);
890 /* XXX Nic: TBD - checksum validation */
891 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
895 if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
896 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
898 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
899 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
900 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
901 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
902 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
903 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
904 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
905 __swab64s(&msg_addr);
906 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
907 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
908 } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
909 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
912 /* since we use a unique instance ID for each network, the driver
913 * will take care of dropping datagrams if we don't have that network.
916 /* few more idiot software or configuration checks */
918 switch (connreq->gncr_type) {
919 case GNILND_CONNREQ_REQ:
920 /* wire up EP and SMSG block - this will check the incoming data
921 * and barf a NAK back if need to */
922 rc = kgnilnd_set_conn_params(dgram);
926 case GNILND_CONNREQ_NAK:
927 case GNILND_CONNREQ_CLOSE:
930 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
934 if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
935 CERROR("Recived bad timestamps peer %llu conn %llu\n",
936 connreq->gncr_peerstamp, connreq->gncr_connstamp);
940 if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
941 CERROR("Received timeout %d < MIN %d\n",
942 connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
/* Allocate a zeroed datagram from kgnilnd_data.kgn_dgram_cache (GFP_ATOMIC)
 * and initialise it: empty list linkage, state GNILND_DGRAM_USED, the
 * requested type and the GNILND_DGRAM_MAGIC marker. The per-device
 * gnd_ndgrams counter is incremented.
 *
 * dgramp: output location for the new datagram; the store into it and the
 *         handling of an allocation failure are not visible in this
 *         listing.
 * dev:    device whose datagram counter is updated.
 * type:   datagram type to record. */
950 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
954 dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
958 INIT_LIST_HEAD(&dgram->gndg_list);
959 dgram->gndg_state = GNILND_DGRAM_USED;
960 dgram->gndg_type = type;
961 dgram->gndg_magic = GNILND_DGRAM_MAGIC;
963 atomic_inc(&dev->gnd_ndgrams);
965 CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
967 sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
968 atomic_read(&dev->gnd_ndgrams));
/* Classify a datagram by the post state reported for it.
 *
 * dgram:      datagram being examined.
 * post_state: GNI post state for that datagram.
 *
 * Handling visible below:
 *  - GNI_POST_COMPLETED: jumps straight to process_out, so the
 *    cancel/cleanup accounting at the end is not run.
 *  - GNI_POST_PENDING: only legal for a canceled wildcard request
 *    (asserted); also jumps to process_out as the datagram is not done.
 *  - GNI_POST_TERMINATED: the datagram must be canceled or be a wildcard
 *    (asserted); it is logged and falls through to cleanup.
 *  - GNI_POST_TIMEOUT: logged with CNETERR for active (non-wildcard)
 *    datagrams.
 *  - anything else: logged with CERROR.
 * The cleanup step decrements gnd_canceled_dgrams when the datagram state
 * is GNILND_DGRAM_CANCELED.
 *
 * The rc assignments in each case and the final state update are not
 * visible in this listing; the return-value contract is given in the
 * existing comment that follows. */
974 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
975 * returns < 0 on dgram to be cleaned up
976 * > 0 on dgram that isn't done yet
977 * == 0 on dgram that is ok and needs connreq processing */
979 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
983 switch (post_state) {
984 case GNI_POST_COMPLETED:
985 /* normal state for dgrams that need actual processing */
986 /* GOTO to avoid processing dgram as canceled/done */
987 GOTO(process_out, rc);
989 case GNI_POST_PENDING:
990 /* we should only see this if we are testing a WC dgram after a
991 * cancel - it means that it needs a full cycle of waiting
992 * for kgni_sm_task to finish moving it to TERMINATED */
993 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
994 (dgram->gndg_state == GNILND_DGRAM_CANCELED),
995 "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
996 dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
997 dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
999 /* positive RC as this dgram isn't done yet */
1002 /* GOTO as this isn't done yet */
1003 GOTO(process_out, rc);
1006 case GNI_POST_TERMINATED:
1007 /* we've called cancel and it is done or remote guy called cancel and
1008 * we've received it on a WC dgram */
1010 /* we are seeing weird terminations on non WC dgrams when we have not
1013 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
1014 dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
1015 "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
1016 dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
1017 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
1020 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
1021 dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");
1026 case GNI_POST_TIMEOUT:
1027 /* we could have a timeout on a wildcard dgram too - if
1028 * we got the incoming request but the remote node beefed
1029 * before kgni could send the match data back. We'll just error
1030 * on the active case and bail out gracefully */
1031 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1032 CNETERR("hardware timeout for connect to "
1033 "%s after %lu seconds. Is node dead?\n",
1034 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1035 cfs_duration_sec(jiffies - dgram->gndg_post_time));
1042 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1046 /* now finish cleaning up a dgram that is canceled/terminated and needs to
1049 /* If this was actively canceled, drop the count now that we are processing */
1050 if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1051 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1052 /* caller responsible for gndg_list removal */
/* Cancel a datagram that is currently posted to kgni.
 *
 * Only dgrams in GNILND_DGRAM_POSTED are acted on. The dgram is marked
 * GNILND_DGRAM_CANCELED; when its conn is already at or beyond
 * GNILND_CONN_ESTABLISHED no cancel_by_id is needed. Otherwise
 * gnd_canceled_dgrams is incremented and the post is canceled by id, with
 * anything but GNI_RC_SUCCESS treated as a fatal tracking error.
 *
 * Wildcard (GNILND_DGRAM_WC_REQ) dgrams are additionally tested by id:
 * GNI_RC_NO_MATCH means the cancel completed on the spot, so the dgram is
 * marked DONE and the canceled count is dropped; GNI_RC_SUCCESS means the
 * returned post_state is run through kgnilnd_process_dgram().
 *
 * dgram: datagram to cancel; its gndg_conn must not be NULL.
 */
1060 /* needs dev->gnd_dgram_lock held */
1062 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1066 if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1070 LASSERTF(dgram->gndg_conn != NULL,
1071 "dgram 0x%p with NULL conn\n", dgram);
1073 /* C.E - WC dgrams could be canceled immediately but
1074 * if there was some match pending, we need to call
1075 * test_by_id to clear it out. If that test returns
1076 * POST_PENDING, it is half done and needs to go along
1077 * with the rest of dgrams and go through a kgni_sm_task cycle
1078 * and deliver a GNI_POST_TERMINATED event before they
1079 * are actually canceled */
1081 dgram->gndg_state = GNILND_DGRAM_CANCELED;
1083 if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1084 /* we don't need to cancel_by_id if the datagram was good */
1088 /* let folks know there are outstanding cancels */
1089 atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1090 /* leave on nid list until cancel is done for debugging fun */
1091 grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1093 /* if we don't get success here, we have hosed up the dgram tracking
1094 * code and need to bail out */
1095 LASSERTF(grc == GNI_RC_SUCCESS,
1096 "postdata_cancel returned %d for conn 0x%p to %s\n",
1097 grc, dgram->gndg_conn,
1098 dgram->gndg_conn->gnc_peer ?
1099 libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1103 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1104 dgram, dgram->gndg_conn,
1105 dgram->gndg_conn->gnc_ephandle);
1107 if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1108 gni_post_state_t post_state;
1110 __u32 remote_addr = 0, remote_id = 0;
1112 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1113 (__u64)dgram, &post_state,
1114 &remote_addr, &remote_id);
1116 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1117 "bad grc %d from test_by_id on dgram 0x%p\n",
1120 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1121 * through full cycle, we get SUCCESS and need to parse post_state */
1123 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1124 "remote_addr %u remote_id %u\n", grc, dgram,
1125 kgnilnd_dgram_type2str(dgram),
1126 post_state, remote_addr, remote_id);
1128 if (grc == GNI_RC_NO_MATCH) {
1129 /* she's gone, reduce count and move along */
1130 dgram->gndg_state = GNILND_DGRAM_DONE;
1131 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1135 rc = kgnilnd_process_dgram(dgram, post_state);
1138 /* if for some weird reason we get a valid dgram back, just mark as done
1139 * so we can drop it and move along.
1140 * C.E - if it was completed, we'll just release the conn/mbox
1141 * back into the pool and it'll get reused. That said, we should only
1142 * be canceling a WC dgram on stack reset or shutdown, so that is moot */
1143 dgram->gndg_state = GNILND_DGRAM_DONE;
1144 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1146 /* caller context responsible for calling kgnilnd_release_dgram() */
1148 /* still pending, let it simmer until golden brown and delicious */
1152 /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1153 * for kgni to return their ID to us via probe - that is when we'll complete their
1154 * cancel processing */
/* Detach a datagram from its conn.
 *
 * If the dgram still points at a conn, the reference it holds is dropped
 * and gndg_conn is cleared, so calling this again on the same dgram does
 * not drop a second reference.
 *
 * dgram: datagram whose conn reference is to be released.
 */
1158 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1160 /* release the dgram ref on conn */
1161 if (dgram->gndg_conn) {
1162 kgnilnd_conn_decref(dgram->gndg_conn);
1163 dgram->gndg_conn = NULL;
/* Return a datagram to the dgram slab cache.
 *
 * The dgram must be in GNILND_DGRAM_USED or GNILND_DGRAM_DONE. Its magic is
 * overwritten with a poison value to help catch stale driver data, the
 * device's gnd_ndgrams count is decremented, the free is logged, and the
 * memory is then handed back to kgn_dgram_cache.
 *
 * dev:   device whose dgram count is decremented.
 * dgram: datagram to free; must not be touched by the caller afterwards.
 */
1168 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1170 LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1171 dgram->gndg_state == GNILND_DGRAM_DONE,
1172 "dgram 0x%p with bad state %s\n",
1173 dgram, kgnilnd_dgram_state2str(dgram));
1175 /* bit of poisoning to help detect bad driver data */
1176 dgram->gndg_magic = 0x6f5a6b5f;
1177 atomic_dec(&dev->gnd_ndgrams);
/* log before the free: kgnilnd_dgram_type2str() is handed the dgram
 * pointer, so calling it after kmem_cache_free() would read freed memory */
1180 CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
1182 sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
1183 atomic_read(&dev->gnd_ndgrams));
1179 kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
/* Build a connection datagram of the requested type and post it to kgni.
 *
 * GNILND_CONNREQ_REQ becomes a wildcard dgram when dstnid is LNET_NID_ANY
 * and a targeted REQ otherwise; GNILND_CONNREQ_NAK requires a real
 * destination. A dgram and a conn are allocated. Wildcard posts put the
 * conn in GNILND_CONN_LISTEN; targeted posts resolve the destination NIC
 * address, set GNILND_CONN_CONNECTING and bind the conn's endpoint to it.
 * The connreq is packed (for NAKs, data_rc is stored in gnnd_errno) and
 * posted with the dgram pointer as its id. On success the dgram is added
 * to the per-nid dgram list in state GNILND_DGRAM_POSTED while holding
 * gnd_dgram_lock.
 *
 * A failed kgni post maps GNI_RC_ERROR_NOMEM to -ENOMEM and anything else
 * to -EBADR. Failures jump to post_failed; when rc is negative and a dgram
 * was allocated, it is cleaned up and freed.
 */
1187 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1191 kgn_dgram_t *dgram = NULL;
1192 kgn_dgram_t *tmpdgram;
1193 kgn_dgram_type_t dgtype;
1199 case GNILND_CONNREQ_REQ:
1200 if (dstnid == LNET_NID_ANY)
1201 dgtype = GNILND_DGRAM_WC_REQ;
1203 dgtype = GNILND_DGRAM_REQ;
1205 case GNILND_CONNREQ_NAK:
1206 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1207 dgtype = GNILND_DGRAM_NAK;
1210 CERROR("unknown connreq type %d\n", type);
1214 rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1217 GOTO(post_failed, rc);
1220 rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1222 GOTO(post_failed, rc);
1225 if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1226 /* clear buffer for sanity on reuse of wildcard */
1227 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1230 if (dstnid == LNET_NID_ANY) {
1231 /* set here to reset any dgram re-use */
1232 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1236 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1239 GOTO(post_failed, rc);
1242 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1244 /* don't need to serialize, there are no CQs for the dgram
1245 * EP on the kgn_net_t */
1246 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1248 if (grc != GNI_RC_SUCCESS) {
1250 GOTO(post_failed, rc);
1255 /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1256 * net of the destination node.
1259 if (dstnid == LNET_NID_ANY) {
1260 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1262 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1265 rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1266 srcnid, dstnid, type);
1268 GOTO(post_failed, rc);
1271 if (type == GNILND_CONNREQ_NAK)
1272 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1274 dgram->gndg_post_time = jiffies;
1276 /* XXX Nic: here is where we'd add in logical network multiplexing */
1278 CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1279 dgram, kgnilnd_dgram_type2str(dgram),
1280 libcfs_nid2str(srcnid),
1281 libcfs_nid2str(dstnid), dev->gnd_id);
1283 /* this allocates memory, can't hold locks across */
1284 grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1285 &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1286 &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1289 if (grc != GNI_RC_SUCCESS) {
1290 CNETERR("dropping failed dgram post id 0x%p type %s"
1291 " reqtype %s to %s: rc %d\n",
1292 dgram, kgnilnd_dgram_type2str(dgram),
1293 kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1294 libcfs_nid2str(dstnid), grc);
1295 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1296 GOTO(post_failed, rc);
1299 /* we don't need to add earlier - if someone does del_peer during post,
1300 * that peer will get marked as unlinked and the callers will take care of it.
1301 * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1302 * the completed dgram later when we can't find a peer to stuff it into */
1304 spin_lock(&dev->gnd_dgram_lock);
1306 /* make sure we are not double posting targeted dgrams
1307 * - we can multiple post WC dgrams to help with processing speed */
1308 if (dstnid != LNET_NID_ANY) {
1309 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1311 LASSERTF(tmpdgram == NULL,
1312 "dgram 0x%p->%s already posted\n",
1313 dgram, libcfs_nid2str(dstnid));
1316 /* unmunge dstnid to help processing code cope... */
1317 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1318 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1321 list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1322 dgram->gndg_state = GNILND_DGRAM_POSTED;
1323 spin_unlock(&dev->gnd_dgram_lock);
1326 if (rc < 0 && dgram != NULL) {
1327 kgnilnd_cleanup_dgram(dgram);
1328 kgnilnd_free_dgram(dev, dgram);
/* Finish with a datagram and free it when it is safe to do so.
 *
 * A dgram that arrives CANCELED is marked DONE and its conn may be parked
 * in purgatory: the peer for the dgram's destination nid is looked up
 * under kgn_peer_conn_lock, the conn is tied to it, set to
 * GNILND_CONN_CLOSED and scheduled. The dgram is then canceled if it is
 * still posted (under gnd_dgram_lock) and its conn reference is dropped.
 *
 * If the dgram is not left in CANCELED it is marked DONE, asserted to be
 * off any list, and freed. A wildcard dgram is reposted first unless
 * kgn_wc_kill or kgn_in_reset is set; a failed repost is recorded in
 * gnd_nwcdgrams so the repost machinery tries again. A dgram still in
 * CANCELED is kept until kgni reports that the cancel has finished.
 *
 * dev:      device the dgram belongs to.
 * dgram:    datagram being released.
 * shutdown: see the note below on who sets this.
 */
1334 /* The shutdown flag is set from the shutdown and stack reset threads. */
1336 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1338 /* The conns of canceled active dgrams need to be put in purgatory so
1339 * we don't reuse the mailbox */
1340 if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1342 kgn_conn_t *conn = dgram->gndg_conn;
1343 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1345 dgram->gndg_state = GNILND_DGRAM_DONE;
1347 /* During shutdown we've already removed the peer so we don't
1348 * need to add a peer. During stack reset we don't care about
1349 * MDDs since they are all released. */
1351 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1352 peer = kgnilnd_find_peer_locked(nid);
1355 CDEBUG(D_NET, "adding peer's conn with nid %s "
1356 "to purgatory\n", libcfs_nid2str(nid));
1357 kgnilnd_conn_addref(conn);
1358 conn->gnc_peer = peer;
1359 kgnilnd_peer_addref(peer);
1360 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1361 conn->gnc_state = GNILND_CONN_CLOSED;
1362 list_add_tail(&conn->gnc_list,
1364 kgnilnd_add_purgatory_locked(conn,
1366 kgnilnd_schedule_conn(conn);
1368 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1372 spin_lock(&dev->gnd_dgram_lock);
1373 kgnilnd_cancel_dgram_locked(dgram);
1374 spin_unlock(&dev->gnd_dgram_lock);
1376 kgnilnd_cleanup_dgram(dgram);
1378 /* if the dgram is 'canceled' it needs to be wait until the event
1379 * comes up from kgni that tells us it is safe to release */
1380 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1381 dgram->gndg_state = GNILND_DGRAM_DONE;
1383 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1385 /* if it is a wildcard and we are in an appropriate state, repost
1388 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1389 (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1392 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1394 /* We failed to repost the WC dgram for some reason
1395 * mark it so the repost system attempts to repost */
1396 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1400 /* always free the old dgram */
1401 kgnilnd_free_dgram(dev, dgram);
/* Probe kgni for a completed datagram and take ownership of it.
 *
 * The probe runs under gnd_dgram_lock so the dgram cannot be canceled
 * between being reported ready and being removed from its list. The ready
 * id is the dgram pointer itself; its magic, state (POSTED or CANCELED)
 * and list membership are asserted before it is unlinked. A POSTED dgram
 * is moved to PROCESSING before test_by_id so nobody cancels it once its
 * data has been pulled. The resulting post_state is handed to
 * kgnilnd_process_dgram(), whose rc must not be positive here.
 *
 * When test_by_id fails the dgram is dropped via probe_for_out. A dgram
 * that is not usable and not CANCELED is marked DONE, and
 * kgnilnd_release_dgram() is called on the cleanup path.
 *
 * dev:    device to probe.
 * dgramp: out parameter for the dgram (NOTE(review): the assignment is not
 *         in the visible lines - confirm against the full function).
 */
1407 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1409 kgn_dgram_t *dgram = NULL;
1410 gni_post_state_t post_state;
1414 __u32 remote_addr = 0, remote_id = 0;
1417 /* Probe with the lock held. That way if we get a dgram we dont have it canceled
1418 * between finding the ready dgram and grabbing the lock to remove it from the
1419 * list. Otherwise we could be left in an inconsistent state. We own the dgram
1420 * once its off the list so we don't need to worry about others changing it at
1422 spin_lock(&dev->gnd_dgram_lock);
1423 grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1424 if (grc != GNI_RC_SUCCESS) {
1425 spin_unlock(&dev->gnd_dgram_lock);
1426 /* return 0 to indicate nothing happened */
1430 CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
1433 dgram = (kgn_dgram_t *)readyid;
1435 LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1436 "dgram 0x%p from id %#llx with bad magic %x\n",
1437 dgram, readyid, dgram->gndg_magic);
1439 LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1440 dgram->gndg_state == GNILND_DGRAM_CANCELED,
1441 "dgram 0x%p with bad state %s\n",
1442 dgram, kgnilnd_dgram_state2str(dgram));
1444 LASSERTF(!list_empty(&dgram->gndg_list),
1445 "dgram 0x%p with bad list state %s type %s\n",
1446 dgram, kgnilnd_dgram_state2str(dgram),
1447 kgnilnd_dgram_type2str(dgram));
1449 /* now we know that the datagram structure is ok, so pull off list */
1450 list_del_init(&dgram->gndg_list);
1452 /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
1453 * change the state from POSTED to PROCESSING to ensure that
1454 * nobody cancels it after we've pulled it from the wire */
1455 if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1456 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1459 LASSERTF(dgram->gndg_conn != NULL,
1460 "dgram 0x%p with NULL conn\n", dgram);
1462 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1463 (__u64)dgram, &post_state,
1464 &remote_addr, &remote_id);
1466 /* we now "own" this datagram */
1467 spin_unlock(&dev->gnd_dgram_lock);
1469 LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1470 " id %llu was ready\n", readyid);
1472 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1473 "remote_addr %u remote_id %u\n", grc, dgram,
1474 kgnilnd_dgram_type2str(dgram),
1475 post_state, remote_addr, remote_id);
1477 if (unlikely(grc != GNI_RC_SUCCESS)) {
1478 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1479 dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1482 GOTO(probe_for_out, rc);
1485 rc = kgnilnd_process_dgram(dgram, post_state);
1487 /* we should never get probe finding a dgram for us and then it
1488 * being a WC dgram that is still in the middle of processing */
1489 LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1490 rc, dgram, post_state);
1493 /* dgram is good enough for the data to be used */
1494 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1495 /* fake rc to mark that we've done something */
1498 /* let kgnilnd_release_dgram take care of canceled dgrams */
1499 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1500 dgram->gndg_state = GNILND_DGRAM_DONE;
1509 kgnilnd_release_dgram(dev, dgram, 0);
/* Post the configured number of wildcard connection-request datagrams.
 *
 * Posts one LNET_NID_ANY REQ dgram per iteration, kgn_nwildcard times in
 * total. rc starts out as -ENOENT so that a tunable of zero, where the
 * loop never runs, is reported as an error. A failed post is logged
 * together with the index of the dgram that could not be posted.
 *
 * dev: device to post the wildcards on.
 */
1514 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1516 /* if kgn_wildcard is zero, return error */
1517 int rc = -ENOENT, i;
1520 for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1521 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1523 CERROR("error %d: could not post wildcard datagram # %d\n",
/* Cancel every outstanding targeted datagram that belongs to one net.
 *
 * May only be called while the net is shutting down or the LND is in
 * reset; this is asserted. All kgn_peer_hash_size dgram buckets of the
 * net's device are walked under gnd_dgram_lock. Wildcard dgrams and
 * dgrams whose destination nid is on a different net number are skipped;
 * everything else goes to kgnilnd_cancel_dgram_locked().
 *
 * net: the net whose dgrams are to be canceled.
 */
1535 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1537 kgn_dgram_t *dg, *dgN;
1542 /* we want to cancel any outstanding dgrams - we don't want to rely
1543 * on del_peer_or_conn catching all of them. This helps protect us in cases
1544 * where we don't quite keep the peer->dgram mapping in sync due to some
1545 * race conditions */
1547 LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1548 "called with LND invalid state: net shutdown %d "
1549 "in reset %d\n", net->gnn_shutdown,
1550 kgnilnd_data.kgn_in_reset);
1552 spin_lock(&net->gnn_dev->gnd_dgram_lock);
1554 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1555 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1557 /* skip nids not on our net or are wildcards */
1560 if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1561 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1564 kgnilnd_cancel_dgram_locked(dg);
1568 spin_unlock(&net->gnn_dev->gnd_dgram_lock);
/* Cancel all outstanding wildcard datagrams on a device.
 *
 * May only be called while the LND is in reset or kgn_wc_kill is set;
 * this is asserted. Under gnd_dgram_lock the LNET_NID_ANY dgram lookup is
 * repeated until it returns nothing. Each hit is asserted to be a
 * WC_REQ and canceled. A wildcard that is already DONE after the cancel
 * is moved to a local zombie list. Once the lock is dropped the zombies
 * are unlinked and released with the shutdown flag set.
 *
 * dev: device whose wildcard dgrams are to be canceled.
 */
1574 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1576 kgn_dgram_t *dg, *dgN;
1580 /* Time to kill the outstanding WC's
1581 * WC's exist on net 0 only but match on any net...
1584 LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1585 "called with LND invalid state: WC shutdown %d "
1586 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1587 kgnilnd_data.kgn_in_reset);
1589 spin_lock(&dev->gnd_dgram_lock);
1592 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1594 LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1595 "dgram 0x%p->%s with bad type %d (%s)\n",
1596 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1597 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1599 kgnilnd_cancel_dgram_locked(dg);
1601 /* WC could be DONE already, check and if so add to list to be released */
1602 if (dg->gndg_state == GNILND_DGRAM_DONE)
1603 list_move_tail(&dg->gndg_list, &zombies);
1605 } while (dg != NULL);
1607 spin_unlock(&dev->gnd_dgram_lock);
1609 list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1610 list_del_init(&dg->gndg_list);
1611 kgnilnd_release_dgram(dev, dg, 1);
/* Cancel every non-wildcard datagram on a device during base shutdown.
 *
 * Asserts kgn_wc_kill == 1, then walks the device's dgram buckets under
 * gnd_dgram_lock and cancels each dgram that is not a WC_REQ, whatever
 * net it is on.
 *
 * NOTE(review): the bucket loop stops at kgn_peer_hash_size - 1, one short
 * of the bound used by kgnilnd_cancel_net_dgrams(), so the last bucket is
 * never visited here - confirm whether that is intentional.
 *
 * dev: device whose targeted dgrams are to be canceled.
 */
1618 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1620 kgn_dgram_t *dg, *dgN;
1624 /* Cancel any outstanding non wildcard datagrams regardless
1625 * of which net they are on as we are in base shutdown and
1626 * dont care about connecting anymore.
1629 LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");
1631 spin_lock(&dev->gnd_dgram_lock);
1633 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size -1); i++) {
1634 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1635 if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1636 kgnilnd_cancel_dgram_locked(dg);
1640 spin_unlock(&dev->gnd_dgram_lock);
/* Block until every canceled datagram on a device has been reaped.
 *
 * Runs as a do/while so that at least one pass happens, and repeats for
 * as long as gnd_canceled_dgrams is non-zero. Each pass logs the
 * outstanding count, waits in kgnilnd_postdata_probe_wait_by_id(), and on
 * success probes for the dgram and releases it with the shutdown flag
 * set. The progress message is raised to D_WARNING only when the pass
 * counter i satisfies (i & -i) == i, i.e. is zero or a power of two, so
 * warnings thin out the longer the wait lasts; other passes log at D_NET.
 *
 * dev: device to drain.
 */
1647 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1655 /* use do while to get at least one check run to allow
1656 * regression test for 762072 to hit bug if there */
1658 /* This function races with the dgram mover during shutdown so it is possible for
1659 * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1660 * dgram mover thread instead of inside of this function.
1663 /* This should only be called from within shutdown, baseshutdown, or stack reset.
1664 * there are no assertions here to verify since base_shutdown has nothing in it we can check
1665 * the net is gone by then.
1670 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1671 "Waiting for %d canceled datagrams to clear on device %d\n",
1672 atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1674 /* check once a second */
1675 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1678 if (grc != GNI_RC_SUCCESS)
1681 CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
1682 readyid, dev->gnd_id, dev);
1684 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1686 /* if we got a valid dgram or one that is now done, clean up */
1687 kgnilnd_release_dgram(dev, dgram, 1);
1689 } while (atomic_read(&dev->gnd_canceled_dgrams));
/* Start an active connection attempt to a peer.
 *
 * Under kgn_peer_conn_lock the peer must still be active and have
 * gnp_connecting == GNILND_PEER_CONNECT, otherwise the attempt is
 * abandoned as a lost race with an unlink. The peer is then moved to
 * GNILND_PEER_POSTING, gnp_last_dgram_time is stamped, and a REQ dgram is
 * posted to the peer's nid. The result is stored in gnp_last_dgram_errno.
 *
 * The peer state is re-checked under the lock afterwards. If the peer
 * went inactive or was flagged GNILND_PEER_NEEDS_DEATH while the post was
 * in flight, the flag becomes GNILND_PEER_KILL and the dgram just posted
 * is found and canceled. Otherwise the peer moves to GNILND_PEER_POSTED
 * and the reaper thread handles any timeout.
 *
 * The CFS_FAIL_GNI_GNP_CONNECTING1/2/3 checks are fault-injection sync
 * points; CONNECTING3 replaces the post with cfs_fail_val, or -ENOMEM
 * when that is zero.
 *
 * peer: peer to connect to.
 */
1693 kgnilnd_start_connect(kgn_peer_t *peer)
1696 /* sync point for kgnilnd_del_peer_locked - do an early check to
1697 * catch the most common hits where del_peer is done by the
1698 * time we get here */
1699 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1700 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1703 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1704 if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1705 /* raced with peer getting unlinked */
1706 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1710 peer->gnp_connecting = GNILND_PEER_POSTING;
1711 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1713 set_mb(peer->gnp_last_dgram_time, jiffies);
1714 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1715 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1718 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1719 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1720 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1722 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1723 peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1726 set_mb(peer->gnp_last_dgram_errno, rc);
1730 /* while we're posting someone could have decided this peer/dgram needed to
1731 * die a quick death, so we check for state change and process accordingly */
1733 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1734 if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1735 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1736 peer->gnp_connecting = GNILND_PEER_KILL;
1738 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1739 /* positive RC to avoid dgram cleanup - we'll have to
1740 * wait for the kgni GNI_POST_TERMINATED event to
1741 * finish cleaning up */
1743 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1746 peer->gnp_connecting = GNILND_PEER_POSTED;
1747 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1748 /* reaper thread will take care of any timeouts */
1749 CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1750 libcfs_nid2str(peer->gnp_nid), rc);
1755 CDEBUG(D_NET, "connect to %s failed: rc %d \n",
1756 libcfs_nid2str(peer->gnp_nid), rc);
/* Turn a completed connection-request datagram into an established conn.
 *
 * A peer for the requester's nid (gndg_conn_in.gncr_srcnid) is created and
 * offered to the peer table; kgnilnd_add_peer_locked() hands back either
 * that new peer or one that already existed. For an existing peer:
 *  - a completed active connect (dstnid != LNET_NID_ANY) is dropped when
 *    the peer is GNILND_PEER_IDLE, since nobody is waiting for it;
 *  - a peer that is still connecting is taken off gnd_connd_list and its
 *    connd reference dropped;
 *  - gnp_connecting is reset to GNILND_PEER_IDLE and duplicate conns are
 *    refused via kgnilnd_conn_isdup_locked().
 *
 * For the winning request the dgram is marked DONE, the conn's timestamps
 * are initialised, the conn becomes GNILND_CONN_ESTABLISHED and is linked
 * onto the peer's conn list and the cqid hash, taking a reference for
 * each. A NOOP is queued unless CFS_FAIL_GNI_ONLY_NOOP is set, and every
 * tx parked on peer->gnp_tx_queue is moved onto the conn. For active
 * connects the mailbox records gnc_last_rx as its release timestamp. A
 * pending unlink on the peer is cleared. An extra conn reference is held
 * across the unlock so the conn survives until LNet has been notified
 * that the peer is reachable.
 *
 * dgram: completed datagram carrying the conn and the unpacked connreq.
 */
1762 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1764 kgn_conn_t *conn = dgram->gndg_conn;
1765 lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid;
1766 struct lnet_nid peer_nid;
1767 kgn_peer_t *new_peer, *peer = NULL;
1770 kgn_mbox_info_t *mbox;
1774 /* try to find a peer that matches the nid we got in the connreq
1775 * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1776 * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1778 /* assume this is a new peer - it makes locking cleaner when it isn't */
1779 /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1781 rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
1783 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1787 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1789 /* this transfers ref from create_peer to the kgn_peer table */
1790 kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1792 /* if we found an existing peer, is it really ready for a new conn ? */
1793 if (peer != new_peer) {
1794 /* if this was an active connect attempt but we can't find a peer waiting for it
1795 * we will dump in the trash */
1797 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1798 CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1799 libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1800 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1805 /* check to see if we can catch a connecting peer before it is
1806 * removed from the connd_peers list - if not, we need to
1807 * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1808 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1809 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1810 if (!list_empty(&peer->gnp_connd_list)) {
1811 list_del_init(&peer->gnp_connd_list);
1812 /* drop connd ref */
1813 kgnilnd_peer_decref(peer);
1815 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1816 /* clear rc to make sure we don't have fake error */
1820 /* no matter what, we are no longer waiting to connect this peer now */
1821 peer->gnp_connecting = GNILND_PEER_IDLE;
1823 /* Refuse to duplicate an existing connection (both sides might try to
1824 * connect at once). NB we return success! We _are_ connected so we
1825 * _don't_ have any blocked txs to complete with failure. */
1826 rc = kgnilnd_conn_isdup_locked(peer, conn);
1828 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1829 CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1830 libcfs_nid2str(her_nid), rc);
1836 if (peer->gnp_state == GNILND_PEER_DOWN) {
1837 CNETERR("Received connection request from down nid %s\n",
1838 libcfs_nid2str(her_nid));
1841 peer->gnp_state = GNILND_PEER_UP;
1842 nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1844 /* either way with peer (new or existing), we are ok with ref counts here as the
1845 * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1846 * ref for the peer table. */
1848 /* at this point, the connection request is a winner */
1850 /* mark 'DONE' to avoid cancel being called from release */
1851 dgram->gndg_state = GNILND_DGRAM_DONE;
1853 /* initialise timestamps before reaper looks at them */
1854 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1856 /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1857 * immediately send a NOOP in the reaper thread during the call to
1858 * kgnilnd_check_conn_timeouts_locked
1860 conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1861 conn->gnc_state = GNILND_CONN_ESTABLISHED;
1863 /* save the dgram type used to establish this connection */
1864 conn->gnc_dgram_type = dgram->gndg_type;
1866 /* refs are not transferred from dgram to tables, so increment to
1868 kgnilnd_conn_addref(conn);
1869 kgnilnd_peer_addref(peer);
1870 conn->gnc_peer = peer;
1871 list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1873 kgnilnd_conn_addref(conn); /* +1 ref for conn table */
1874 list_add_tail(&conn->gnc_hashlist,
1875 kgnilnd_cqid2connlist(conn->gnc_cqid));
1876 kgnilnd_data.kgn_conn_version++;
1878 /* Dont send NOOP if fail_loc is set
1880 if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1881 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP,
1882 lnet_nid_to_nid4(&peer->gnp_net->gnn_ni->ni_nid));
1884 CNETERR("can't get TX to initiate NOOP to %s\n",
1885 libcfs_nid2str(peer->gnp_nid));
1887 kgnilnd_queue_tx(conn, tx);
1891 /* Schedule all packets blocking for a connection */
1892 list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1893 /* lock held here is the peer_conn lock */
1894 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1895 kgnilnd_queue_tx(conn, tx);
1898 /* If this is an active connection lets mark its timestamp on the MBoX */
1899 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1900 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1901 /* conn->gnc_last_rx is jiffies it better exist as it was just set */
1902 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1905 /* Bug 765042: wake up scheduler for a race with finish_connect and
1906 * complete_conn_closed with a conn in purgatory
1907 * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1908 * we just check for set and then clear */
1909 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1911 /* get scheduler thread moving again */
1912 kgnilnd_schedule_device(conn->gnc_device);
1915 CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1916 conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1918 /* make sure we reset peer reconnect interval now that we have a good conn */
1919 kgnilnd_peer_alive(peer);
1920 peer->gnp_reconnect_interval = 0;
1922 /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
1923 * on the atomic forever
1925 if (peer->gnp_pending_unlink) {
1926 peer->gnp_pending_unlink = 0;
1927 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1928 CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
1931 /* add ref to make it hang around until after we drop the lock */
1932 kgnilnd_conn_addref(conn);
1934 /* Once the peer_conn lock is dropped, the conn could actually move into
1935 * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1936 * lock until we are really done */
1937 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1939 /* Notify LNET that we now have a working connection to this peer.
1940 * This is a Cray extension to the "standard" LND behavior.
1942 lnet_nid4_to_nid(peer->gnp_nid, &peer_nid);
1943 lnet_notify(peer->gnp_net->gnn_ni, &peer_nid, true, true,
1944 ktime_get_seconds());
1946 /* drop our 'hold' ref */
1947 kgnilnd_conn_decref(conn);
/* Send a NAK datagram carrying an error code to a specific nid.
 *
 * Posts a GNILND_CONNREQ_NAK dgram to dst_nid with error as its payload
 * errno. The destination must not be LNET_NID_ANY; this is asserted. A
 * failed post is only logged at D_NET.
 *
 * dev:     device to post the NAK on.
 * dst_nid: nid the NAK is addressed to.
 * error:   errno value reported to the remote side.
 */
1954 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1959 LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1961 CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1963 rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1966 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
/* Act on a NAK datagram received from a remote node.
 *
 * The peer is looked up by the NAK's source nid under kgn_peer_conn_lock;
 * if there is none the NAK is dropped.
 *
 * When the peer is GNILND_PEER_IDLE: with no conns the NAK is dropped;
 * otherwise a stub conn is filled in with the connreq's peerstamp and
 * connstamp so kgnilnd_close_stale_conns_locked() closes only conns that
 * predate the failed request, and the number closed is logged.
 *
 * In the remaining path gnd_connd_lock is taken. If the peer is not on
 * the connd list, a pending dgram to it is looked up and canceled. If it
 * is on the list, the NAK belongs to an older attempt than the one now
 * queued and is dropped.
 *
 * dgram: datagram whose gndg_conn_in holds the unpacked NAK.
 */
1972 kgnilnd_process_nak(kgn_dgram_t *dgram)
1974 kgn_connreq_t *connreq = &dgram->gndg_conn_in;
1975 lnet_nid_t src_nid = connreq->gncr_srcnid;
1976 int errno = connreq->gncr_nakdata.gnnd_errno;
1980 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1982 peer = kgnilnd_find_peer_locked(src_nid);
1984 /* we likely dropped him from bad data when we processed
1985 * the original REQ */
1986 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1990 /* need to check peerstamp/connstamp against the ones we find
1991 * to make sure we don't close new (and good?) conns that we
1992 * formed after this connreq failed */
1993 if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1996 if (list_empty(&peer->gnp_conns)) {
1997 /* assume already procced datagram and it barfed up
1998 * on this side too */
1999 CDEBUG(D_NET, "dropping NAK from %s; "
2000 "peer %s is already not connected\n",
2001 libcfs_nid2str(connreq->gncr_srcnid),
2002 libcfs_nid2str(connreq->gncr_dstnid));
2003 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2007 /* stub up a connection with the connreq XXX_stamps to allow
2008 * use to use close_stale_conns_locked */
2009 conn.gnc_peerstamp = connreq->gncr_peerstamp;
2010 conn.gnc_my_connstamp = connreq->gncr_connstamp;
2011 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
2012 conn.gnc_device = peer->gnp_net->gnn_dev;
2014 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
2016 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2017 "closed %d connections\n",
2018 libcfs_nid2str(connreq->gncr_srcnid),
2019 libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
2021 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2023 if (list_empty(&peer->gnp_connd_list)) {
2024 /* if peer isn't on waiting list, try to find one to nuke */
2025 rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2029 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2030 "canceled pending connect request\n",
2031 libcfs_nid2str(connreq->gncr_srcnid),
2032 libcfs_nid2str(connreq->gncr_dstnid), errno);
2035 /* if we can't find a waiting dgram, we just drop the nak - the conn
2036 * connect must have failed (didn't find conn above and clear connecting
2037 * -- so nothing to do besides drop */
2039 /* peer is on list, meaning it is a new connect attempt from the one
2040 * we started that generated the NAK - so just drop NAK */
2042 /* use negative to prevent error message */
2045 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2048 /* success! we found a peer and at least marked pending_nak */
2049 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
/* Unpack an incoming connreq and dispatch it by type.
 *
 * After kgnilnd_unpack_connreq(), a GNILND_CONNREQ_REQ is handed to
 * kgnilnd_finish_connect() to wire up the peer and conn, and a
 * GNILND_CONNREQ_NAK is handed to kgnilnd_process_nak(). Any other type
 * is logged as unexpected together with the sender's nid.
 *
 * dgram:     datagram holding the received connreq in gndg_conn_in.
 * needs_nak: out flag telling the caller whether to NAK the sender; per
 *            the comments below, a NAK is only wanted when the source nid
 *            is usable.
 */
2055 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2059 rc = kgnilnd_unpack_connreq(dgram);
2062 /* only NAK if we have good srcnid to use */
2068 switch (dgram->gndg_conn_in.gncr_type) {
2069 case GNILND_CONNREQ_REQ:
2070 /* wire up peer & conn, send queued TX */
2071 rc = kgnilnd_finish_connect(dgram);
2073 /* don't nak when the nid is hosed */
2079 case GNILND_CONNREQ_NAK:
2080 rc = kgnilnd_process_nak(dgram);
2081 /* return early to prevent reconnect bump */
2084 CERROR("unexpected connreq type %s (%d) from %s\n",
2085 kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2086 dgram->gndg_conn_in.gncr_type,
2087 libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
/* Probe for one completed datagram, process it, and deal with the fallout.
 *
 * A dgram returned by kgnilnd_probe_for_dgram() is dispatched on its type:
 * REQ and WC_REQ go through kgnilnd_process_connreq(), a NAK dgram only
 * means our NAK was delivered, and anything else is logged as unknown.
 * The sender's nid is saved for a possible NAK, the original destination
 * nid is saved, and the dgram is released.
 *
 * For a targeted dgram (original dstnid != LNET_NID_ANY) that ended with a
 * negative rc, the peer is looked up under kgn_peer_conn_lock and held
 * with an extra reference. If it is still connecting it is taken off
 * gnd_connd_list and reset to GNILND_PEER_IDLE, the error is stored in
 * gnp_last_dgram_errno and the reconnect interval is increased. After the
 * lock is dropped the peer is notified of the failure and the extra
 * reference released. Finally a NAK is sent to the saved nid with rc.
 *
 * dev: device to probe.
 */
2098 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2102 lnet_nid_t nak_dstnid = LNET_NID_ANY;
2103 lnet_nid_t orig_dstnid;
2104 kgn_dgram_t *dgram = NULL;
2108 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2111 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2116 } else if (rc < 0) {
2117 GOTO(inform_peer, rc);
2119 /* rc > 1 means it did something, reset for this func */
2123 switch (dgram->gndg_type) {
2124 case GNILND_DGRAM_WC_REQ:
2125 case GNILND_DGRAM_REQ:
2126 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2128 case GNILND_DGRAM_NAK:
2129 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2130 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2133 CERROR("unknown datagram type %s (%d)\n",
2134 kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2138 /* stash data to use after releasing current datagram */
2139 /* don't stash net - we are operating on a net already,
2140 * so the lock on rw_net_lock is sufficient */
2142 nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2145 LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2147 orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2149 kgnilnd_release_dgram(dev, dgram, 0);
2151 CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2152 libcfs_nid2str(orig_dstnid), rc);
2154 /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2155 * in kgnilnd_finish_connect - if errors are from before we get to there,
2156 * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2157 if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2158 /* if we have a negative rc, we want to find a peer to inform about
2159 * the bad connection attempt. Sorry buddy, better luck next time! */
2161 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2162 peer = kgnilnd_find_peer_locked(orig_dstnid);
2165 /* add ref to make sure he stays around past the possible unlink
2166 * so we can tell LNet about him */
2167 kgnilnd_peer_addref(peer);
2169 /* if he still cares about the outstanding connect */
2170 if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2171 /* check if he is on the connd list and remove.. */
2172 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2173 if (!list_empty(&peer->gnp_connd_list)) {
2174 list_del_init(&peer->gnp_connd_list);
2175 /* drop connd ref */
2176 kgnilnd_peer_decref(peer);
2178 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2180 /* clear gnp_connecting so we don't have a non-connecting peer
2181 * on gnd_connd_list */
2182 peer->gnp_connecting = GNILND_PEER_IDLE;
2184 set_mb(peer->gnp_last_dgram_errno, rc);
2186 kgnilnd_peer_increase_reconnect_locked(peer);
2189 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2191 /* now that we are outside the lock, tell Mommy */
2193 kgnilnd_peer_notify(peer, rc, 0);
2194 kgnilnd_peer_decref(peer);
2199 kgnilnd_send_nak(dev, nak_dstnid, rc);
/*
 * Scan the datagrams hashed on @dev and cancel the ones that have been
 * posted for longer than the kgn_timeout tunable.
 *
 * The scan runs with dev->gnd_dgram_lock held and uses the _safe list
 * iterator.  Three guards precede the age test: a hardware quiesce check,
 * a state/type check (not GNILND_DGRAM_POSTED, or a GNILND_DGRAM_WC_REQ
 * wildcard) and the time_before() test itself.  Presumably each guard
 * skips the entry or stops the scan - TODO confirm against the guard
 * bodies.
 *
 * NOTE(review): the bucket loop bound is kgn_peer_hash_size - 1, so the
 * bucket at index kgn_peer_hash_size - 1 is never visited.  Confirm against
 * the allocation of dev->gnd_dgrams whether the last bucket should be
 * scanned as well.
 */
2206 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2208 kgn_dgram_t *dgram, *tmp;
2211 spin_lock(&dev->gnd_dgram_lock);
2213 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2214 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2215 unsigned long now = jiffies;
2216 unsigned long timeout;
2218 /* don't time out stuff if the network is mucked or shutting down */
2219 if (kgnilnd_check_hw_quiesce()) {
/* only posted, non-wildcard datagrams are candidates for the age test */
2223 if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2224 (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2227 CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2228 "state %s conn 0x%p to %s age %lus\n",
2229 dgram, kgnilnd_dgram_type2str(dgram),
2230 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2231 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2232 cfs_duration_sec(now - dgram->gndg_post_time));
/* age limit: the kgn_timeout tunable converted by cfs_time_seconds() */
2234 timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
/* still younger than post time + limit */
2236 if (time_before(now, (dgram->gndg_post_time + timeout)))
2239 CNETERR("%s datagram to %s timed out @ %lus dgram "
2240 "0x%p state %s conn 0x%p\n",
2241 kgnilnd_dgram_type2str(dgram),
2242 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2243 cfs_duration_sec(now - dgram->gndg_post_time),
2244 dgram, kgnilnd_dgram_state2str(dgram),
/* the _locked variant is called with gnd_dgram_lock taken above */
2247 kgnilnd_cancel_dgram_locked(dgram);
2250 spin_unlock(&dev->gnd_dgram_lock);
2254 /* use a thread for the possibly long-blocking wait_by_id to prevent
2255 * stalling the global workqueues.
 *
 * Thread entry point; @arg is the kgn_device_t to service.  Until
 * kgnilnd_data.kgn_shutdown is set, each pass:
 *   - honours a pending quiesce request (KGNILND_SPIN_QUIESCE),
 *   - probes the device handle with kgnilnd_postdata_probe_wait_by_id(),
 *   - on GNI_RC_SUCCESS wakes the mover via kgnilnd_schedule_dgram(),
 *   - then waits on dev->gnd_dgping_waitq (TASK_INTERRUPTIBLE).
 * kgnilnd_thread_fini() is called once the loop exits.
 */
2257 kgnilnd_dgram_waitq(void *arg)
2259 kgn_device_t *dev = (kgn_device_t *) arg;
2263 DEFINE_WAIT(mover_done);
/* thread name carries the two-digit device id */
2265 snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2267 /* all gnilnd threads need to run fairly urgently */
2268 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2270 /* we don't shut down until the device shuts down ... */
2271 while (!kgnilnd_data.kgn_shutdown) {
2272 /* to quiesce or to not quiesce, that is the question */
2273 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2274 KGNILND_SPIN_QUIESCE;
/* fault-injection hook: spin here while the fail-point keeps firing */
2277 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2279 /* check once a second */
2280 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2283 if (grc == GNI_RC_SUCCESS) {
2284 CDEBUG(D_INFO, "waking up dgram mover thread\n");
2285 kgnilnd_schedule_dgram(dev);
2287 /* wait for dgram thread to ping us before spinning again */
2288 prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2289 TASK_INTERRUPTIBLE);
2291 /* don't sleep if we need to quiesce */
2292 if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2295 finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2299 kgnilnd_thread_fini();
/*
 * Start outbound connects for the peers queued on dev->gnd_connd_peers.
 *
 * Peers are taken from the head of the list one at a time until the list
 * is empty or jiffies reaches @deadline.  dev->gnd_connd_lock protects the
 * list; it is dropped around kgnilnd_start_connect() and retaken before
 * the loop condition is evaluated again.
 *
 * Handling of the kgnilnd_start_connect() return code:
 *   rc >= 0    - drop the connd reference on the peer.
 *   -ENOMEM    - under the kgn_peer_conn_lock write lock, a peer still in
 *                GNILND_PEER_POSTING goes back to GNILND_PEER_CONNECT and
 *                onto the tail of the list; otherwise it must be in
 *                GNILND_PEER_NEEDS_DEATH, is moved to GNILND_PEER_KILL and
 *                loses its connd reference.  Both paths take
 *                gnd_connd_lock, which is released after the loop.
 *   other < 0  - log the failure; a GNILND_PEER_POSTING peer goes to
 *                GNILND_PEER_IDLE with its reconnect interval increased,
 *                otherwise NEEDS_DEATH becomes KILL; then the reference is
 *                dropped.
 *
 * Returns did_something through RETURN().
 */
2304 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2306 int did_something = 0, rc;
2307 kgn_peer_t *peer = NULL;
2309 spin_lock(&dev->gnd_connd_lock);
2311 /* Active connect - we added this in kgnilnd_launch_tx */
2312 while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2313 peer = list_first_entry(&dev->gnd_connd_peers,
2314 kgn_peer_t, gnp_connd_list);
2316 /* ref for connd removed in if/else below */
2317 list_del_init(&peer->gnp_connd_list);
2319 /* gnp_connecting and membership on gnd_connd_peers should be
2320 * done coherently to avoid double adding, etc */
2321 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2322 * to get the peer to gnp_connecting in the first place. We just need to
2323 * rely on gnd_connd_lock to serialize someone pulling him from the list
2324 * BEFORE clearing gnp_connecting */
2325 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2326 peer, libcfs_nid2str(peer->gnp_nid));
/* the list lock is not held across kgnilnd_start_connect() */
2328 spin_unlock(&dev->gnd_connd_lock);
2330 CDEBUG(D_NET, "processing connect to %s\n",
2331 libcfs_nid2str(peer->gnp_nid));
2334 rc = kgnilnd_start_connect(peer);
2336 if (likely(rc >= 0)) {
2337 /* 0 on success, positive on 'just drop peer' errors */
2338 kgnilnd_peer_decref(peer);
2339 } else if (rc == -ENOMEM) {
2340 /* if we are out of wildcards, add back to
2341 * connd_list - then break out and we'll try later
2342 * if other errors, we'll bail & cancel pending tx */
2343 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2344 if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2345 peer->gnp_connecting = GNILND_PEER_CONNECT;
2346 spin_lock(&dev->gnd_connd_lock);
2347 list_add_tail(&peer->gnp_connd_list,
2348 &dev->gnd_connd_peers);
2350 /* connecting changed while we were posting */
2352 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2353 " state 0x%p->%s, connecting %d\n",
2354 peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2355 peer->gnp_connecting = GNILND_PEER_KILL;
2356 spin_lock(&dev->gnd_connd_lock);
2357 /* remove the peer ref from the connd list */
2358 kgnilnd_peer_decref(peer);
2359 /* let the system handle itself */
2361 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2362 /* the datagrams are a global pool,
2363 * so break out of trying and hope some free
2368 /* something bad happened, you lose */
2369 CNETERR("could not start connecting to %s "
2370 "rc %d: Will retry until TX timeout\n",
2371 libcfs_nid2str(peer->gnp_nid), rc);
2372 /* It didn't post so just set connecting back to zero now.
2373 * The reaper will reattempt the connection if it needs to.
2374 * If the peer needs death set it so the reaper will cleanup.
2376 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2377 if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2378 peer->gnp_connecting = GNILND_PEER_IDLE;
2379 kgnilnd_peer_increase_reconnect_locked(peer);
2381 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2382 " state 0x%p->%s, connecting %d\n",
2383 peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2384 peer->gnp_connecting = GNILND_PEER_KILL;
2386 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2388 /* hold onto ref until we are really done - if it was
2389 * unlinked this could result in a destroy */
2390 kgnilnd_peer_decref(peer);
/* retake the list lock before the loop condition touches the list */
2392 spin_lock(&dev->gnd_connd_lock);
2395 spin_unlock(&dev->gnd_connd_lock);
2396 RETURN(did_something);
/*
 * Repost wildcard datagrams for @dev.
 *
 * The number of attempts is a single snapshot of dev->gnd_nwcdgrams taken
 * before the loop; each attempt posts a GNILND_CONNREQ_REQ datagram to
 * LNET_NID_ANY through kgnilnd_post_dgram().  The loop also decrements
 * gnd_nwcdgrams with kgnilnd_admin_decref() and logs post failures at
 * D_NETERROR; presumably the decrement happens on success and the log on
 * failure - TODO confirm against the branch conditions.
 *
 * Returns did_something through RETURN().
 */
2400 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2402 int did_something = 0, to_repost, i;
2403 to_repost = atomic_read(&dev->gnd_nwcdgrams);
2406 for (i = 0; i < to_repost; ++i) {
2408 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2410 kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2413 CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2419 RETURN(did_something);
/* Wraps a kernel timer together with the state its callback needs:
 * kgnilnd_dgram_poke_with_stick() recovers this wrapper from the timer
 * argument and follows its dev member to find the wait queue to wake. */
2422 struct kgnilnd_dgram_timer {
2423 struct timer_list timer;
/* Timer callback: map @arg back to the enclosing kgnilnd_dgram_timer with
 * cfs_from_timer() and wake whoever sleeps on that device's
 * gnd_dgram_waitq. */
2428 kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
2430 struct kgnilnd_dgram_timer *t = cfs_from_timer(t, arg, timer);
2432 wake_up(&t->dev->gnd_dgram_waitq);
2435 /* use single thread for dgrams - should be sufficient for performance.
 *
 * Thread entry point; @arg is the kgn_device_t to service.  Until
 * kgnilnd_data.kgn_shutdown is set, each pass:
 *   - honours a pending quiesce request (KGNILND_SPIN_QUIESCE),
 *   - processes completed datagrams under a read hold of kgn_net_rw_sem,
 *   - starts outbound datagrams, bounded by the current deadline,
 *   - runs kgnilnd_reaper_dgram_check() once next_purge_check is reached,
 *   - reposts wildcard datagrams,
 *   - then arms an on-stack timer and waits on dev->gnd_dgram_waitq.
 * The deadline is jiffies plus the kgn_dgram_timeout tunable and is
 * re-armed after the thread wakes.  kgnilnd_thread_fini() is called once
 * the loop exits.
 */
2437 kgnilnd_dgram_mover(void *arg)
2439 kgn_device_t *dev = (kgn_device_t *)arg;
2441 int rc, did_something;
/* starts one jiffy in the past so the first pass runs the purge check */
2442 unsigned long next_purge_check = jiffies - 1;
2443 unsigned long timeout;
2444 struct kgnilnd_dgram_timer timer;
2445 unsigned long deadline = 0;
/* thread name carries the two-digit device id */
2448 snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2450 /* all gnilnd threads need to run fairly urgently */
2451 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2453 /* we are ok not locking for these variables as the dgram waitq threads
2454 * will block both due to tying up net (kgn_shutdown) and the completion
2455 * event for the dgram_waitq (kgn_quiesce_trigger) */
2456 deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2457 while (!kgnilnd_data.kgn_shutdown) {
2458 /* Safe: kgn_shutdown only set when quiescent */
2460 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2461 * so we can force a dirty WC dgram for Bug 762072 - put right before
2462 * quiesce check so that it'll go right into that and not do any
2464 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2466 /* to quiesce or to not quiesce, that is the question */
2467 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2468 KGNILND_SPIN_QUIESCE;
2472 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2474 /* process any newly completed dgrams */
2475 down_read(&kgnilnd_data.kgn_net_rw_sem);
2477 rc = kgnilnd_probe_and_process_dgram(dev);
2479 did_something += rc;
2482 up_read(&kgnilnd_data.kgn_net_rw_sem);
/* fault-injection hook: stall for one second more than kgn_dgram_timeout */
2484 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2485 (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2486 /* start new outbound dgrams */
2487 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2489 /* find dead dgrams */
2490 if (time_after_eq(jiffies, next_purge_check)) {
2491 /* these don't need to be checked that often */
2492 kgnilnd_reaper_dgram_check(dev);
/* next purge is a quarter of kgn_new_min_timeout (seconds) from now */
2494 next_purge_check = (long) jiffies +
2495 cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2498 did_something += kgnilnd_repost_wc_dgrams(dev);
2500 /* careful with the jiffy wrap... */
2501 timeout = (long)(next_purge_check - jiffies);
/* NOTE(review): timeout is declared unsigned long, so the signed
 * difference above is converted back to unsigned: a purge time that is
 * already in the past becomes a very large value, and the timeout <= 0
 * test below can only match exactly 0.  It is also printed with %lu here
 * and %ld further down.  Confirm whether timeout was meant to be a
 * signed long. */
2503 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2504 did_something, timeout, next_purge_check, jiffies);
2506 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2511 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
/* on-stack timer whose callback wakes gnd_dgram_waitq */
2513 cfs_timer_setup(&timer.timer,
2514 kgnilnd_dgram_poke_with_stick,
2517 mod_timer(&timer.timer, (long) jiffies + timeout);
2519 /* last second chance for others to poke us */
2520 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2522 /* check flag variables before committing even if we
2523 * did something; if we are after the deadline call
2525 if ((!did_something || time_after(jiffies, deadline)) &&
2526 !kgnilnd_data.kgn_shutdown &&
2527 !kgnilnd_data.kgn_quiesce_trigger) {
2528 CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2529 timeout, cfs_duration_sec(timeout));
2530 wake_up(&dev->gnd_dgping_waitq);
2532 CDEBUG(D_INFO, "awake after schedule\n");
2533 deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2536 timer_delete_sync(&timer.timer);
2537 finish_wait(&dev->gnd_dgram_waitq, &wait);
2540 kgnilnd_thread_fini();