2 * Copyright (C) 2012 Cray, Inc.
4 * Author: Igor Gorodetsky <iogordet@cray.com>
5 * Author: Nic Henke <nic@cray.com>
6 * Author: James Shimek <jshimek@cray.com>
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Fill in the SMSG mailbox attributes common to every connection: the
 * per-mailbox credit count (from the kgn_mbox_credits tunable), the maximum
 * message size, and the auto-retransmit mailbox type.  The per-connection
 * fields (msg_buffer, mbox_offset, mem_hndl, buff_size) are filled in later
 * by the callers (see kgnilnd_find_free_mbox).
 * NOTE(review): this view is missing lines (return type, braces) - the
 * code below is only what survived extraction. */
28 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
30 smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
31 smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
32 smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
/* Register an FMA memory block with the GNI device so it can back SMSG
 * mailboxes.  PHYS blocks are registered with GNI_MEM_PHYS_CONT; on success
 * the device accounting (bytes mapped, MDD count, live-block count) is
 * bumped.  Returns an error on registration failure (the failure return
 * itself is elided from this view).
 * NOTE(review): declarations of rrc and several lines are missing here. */
36 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
39 __u32 flags = GNI_MEM_READWRITE;
41 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
42 flags |= GNI_MEM_PHYS_CONT;
/* a non-zero handle would mean this block is already registered */
45 /* make sure we are mapping a clean block */
46 LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
48 rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
49 fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
50 flags, &fma_blk->gnm_hndl);
51 if (rrc != GNI_RC_SUCCESS) {
52 /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
53 * -- like when under MDD or GART pressure on big systems
55 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
56 fma_blk, fma_blk->gnm_mbox_size, flags);
60 /* PHYS_CONT memory isn't really mapped, at least not in GART -
61 * but all mappings chew up an MDD
63 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
64 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
/* every registration consumes a Memory Domain Descriptor */
67 atomic_inc(&device->gnd_n_mdd);
68 /* nfmablk is live (mapped) blocks */
69 atomic_inc(&device->gnd_nfmablk);
/* Allocate a new FMA memory block full of SMSG mailboxes for @device.
 * use_phys selects physically-contiguous memory from the mbox slab cache
 * (startup preallocation only) versus a virtual LIBCFS allocation for
 * runtime growth.  Access is serialized with gnd_fmablk_sem plus a
 * version counter so racing threads re-check the lists instead of all
 * allocating blocks.
 * NOTE(review): local declarations (rc, num_mbox), returns, the
 * if (use_phys) branch lines, and the error-path labels/frees between the
 * success path and the cleanup code at the bottom are elided in this view. */
75 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
79 kgn_fma_memblock_t *fma_blk;
80 gni_smsg_attr_t smsg_attr;
81 unsigned long fmablk_vers;
83 /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
84 * to this allocation code. Everyone will sample the version
85 * before and after getting the semaphore. If it has changed,
86 * we'll bail out to check the lists again - this indicates that
87 * some sort of change was made to the lists and it is possible
88 * that there is a mailbox for us to find now. This should prevent
89 * a ton of spinning in the case where there are lots of threads
90 * that need a yet-to-be-allocated mailbox for a connection. */
92 fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
93 down(&device->gnd_fmablk_sem);
95 if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
96 /* version changed while we were waiting for semaphore,
97 * we'll recheck the lists assuming something nice happened */
98 up(&device->gnd_fmablk_sem);
102 LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
103 if (fma_blk == NULL) {
104 CNETERR("could not allocate fma block descriptor\n");
109 INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
/* base SMSG attributes determine how big each mailbox must be */
111 kgnilnd_setup_smsg_attr(&smsg_attr);
113 gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
115 LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
117 /* gni_smsg_buff_size_needed calculates the base mailbox size and since
118 * we want to hold kgn_peer_credits worth of messages in both directions,
119 * we add PAYLOAD to grow the mailbox size
122 fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
124 /* we'll only use physical during preallocate at startup -- this keeps it nice and
125 * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
126 * as reallocating them is tough if there is memory fragmentation */
/* PHYS branch: slab-cache block of fixed KMALLOC_MAX_SIZE; mailbox count
 * is derived from block size / mailbox size */
129 fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
130 if (fma_blk->gnm_block == NULL) {
131 CNETERR("could not allocate physical SMSG mailbox memory\n");
135 fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
136 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
138 LASSERTF(num_mbox >= 1,
139 "num_mbox %d blk_size %u mbox_size %d\n",
140 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
142 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
/* VIRT branch: mailbox count comes from the tunable and block size is
 * derived from it */
145 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
146 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
148 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
149 "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
150 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
151 *kgnilnd_tunables.kgn_mbox_per_block);
153 LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
154 if (fma_blk->gnm_block == NULL) {
155 CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
160 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
163 /* allocate just enough space for the bits to track the mailboxes */
164 LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
165 if (fma_blk->gnm_bit_array == NULL) {
166 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
167 sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
171 bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
173 /* now that the num_mbox is set based on allocation type, get debug info setup */
174 LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
175 if (fma_blk->gnm_mbox_info == NULL) {
176 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
177 sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
/* register the block with GNI before publishing it on the device list */
182 rc = kgnilnd_map_fmablk(device, fma_blk);
187 fma_blk->gnm_next_avail_mbox = 0;
188 fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
190 CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
191 "mbox_size %d MDD "LPX64"."LPX64"\n",
192 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
193 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
194 fma_blk->gnm_hndl.qword2);
196 /* lock is protecting data structures, not semaphore */
198 spin_lock(&device->gnd_fmablk_lock);
199 list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
201 /* toggle under the lock so once they change the list is also
202 * ready for others to traverse */
203 atomic_inc(&device->gnd_fmablk_vers);
205 spin_unlock(&device->gnd_fmablk_lock);
207 up(&device->gnd_fmablk_sem);
/* error unwind: free in reverse order of allocation; which frees run
 * depends on the (elided) goto labels above */
212 LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
214 LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
216 if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
217 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
/* PHYS blocks go back to the slab cache, not LIBCFS_FREE */
219 cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
222 LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
224 up(&device->gnd_fmablk_sem);
/* Deregister an FMA block's memory from GNI, passing a hold timeout when
 * connections still hold mailboxes in purgatory (outside shutdown).  Adjusts
 * MDD and mapped-bytes accounting; during stack reset a PHYS block's handle
 * is zeroed so the same block can be re-registered afterwards.
 * NOTE(review): the rrc declaration and some braces are elided here. */
229 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
233 /* if some held, set hold_timeout from conn timeouts used in this block
234 * but not during shutdown, then just nuke and pave */
235 if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
236 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
239 /* we are changing the state of a block, tickle version to tell
240 * proc code list is stale now */
241 atomic_inc(&dev->gnd_fmablk_vers);
243 rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
/* only escalate to console+neterror if the deregister failed */
245 CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
246 "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
248 fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
249 fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
250 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
251 fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
253 LASSERTF(rrc == GNI_RC_SUCCESS,
254 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
/* with a hold timeout the MDD lingers as "held"; otherwise it is gone */
257 if (fma_blk->gnm_hold_timeout) {
258 atomic_inc(&dev->gnd_n_mdd_held);
260 atomic_dec(&dev->gnd_n_mdd);
263 /* PHYS blocks don't get mapped */
264 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
265 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
266 } else if (kgnilnd_data.kgn_in_reset) {
267 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
268 * re-use the fma_blk after reset so we don't have to drop/allocate
269 * all of those physical blocks */
270 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
273 /* Decrement here as this is the # of mapped blocks */
274 atomic_dec(&dev->gnd_nfmablk);
278 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
/* Free an FMA block whose mailboxes are all available again: release any
 * held MDD (except across stack reset, where it deliberately dangles),
 * free the mailbox memory (slab cache for PHYS, LIBCFS for VIRT), unlink
 * from the device list, and free the tracking structures. */
280 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
282 LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
283 "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
284 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
285 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
/* list is changing - let /proc readers know their snapshot is stale */
287 atomic_inc(&dev->gnd_fmablk_vers);
289 if (fma_blk->gnm_hold_timeout) {
290 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
292 fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
293 fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
295 /* We leave MDD dangling over stack reset */
296 if (!kgnilnd_data.kgn_in_reset) {
297 kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
299 /* ignoring the return code - if kgni/ghal can't find it
300 * it must be released already */
301 atomic_dec(&dev->gnd_n_mdd_held);
302 atomic_dec(&dev->gnd_n_mdd);
305 /* we can't free the gnm_block until all the conns have released their
306 * purgatory holds. While we have purgatory holds, we might check the conn
307 * RX mailbox during the CLOSING process. It is possible that kgni might
308 * try to look into the RX side for credits when sending the CLOSE msg too */
309 CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
310 fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
312 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
313 cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
315 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
/* mark FREED before unlinking, for anyone holding a stale pointer */
317 fma_blk->gnm_state = GNILND_FMABLK_FREED;
319 list_del(&fma_blk->gnm_bufflist);
321 LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
322 LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
323 LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
/* Find and claim a free SMSG mailbox for @conn by scanning the device's FMA
 * block list under gnd_fmablk_lock.  On success the conn's smsg attributes
 * are pointed at the mailbox, the mailbox is zeroed for re-use, and the
 * block's bookkeeping (bitmap, avail count, next-avail hint) is updated.
 * NOTE(review): the bare 'id' declaration, loop braces, and some continue/
 * break statements are elided in this view. */
327 kgnilnd_find_free_mbox(kgn_conn_t *conn)
329 kgn_device_t *dev = conn->gnc_device;
330 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
331 kgn_fma_memblock_t *fma_blk;
332 kgn_mbox_info_t *mbox = NULL;
335 spin_lock(&dev->gnd_fmablk_lock);
337 list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
/* skip blocks with nothing free or not in a usable (mapped) state */
339 if (fma_blk->gnm_avail_mboxs <= 0 ||
340 fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
343 /* look in bitarray for available mailbox */
345 id = find_next_zero_bit(
346 fma_blk->gnm_bit_array,
347 fma_blk->gnm_num_mboxs,
348 fma_blk->gnm_next_avail_mbox);
/* search started mid-array and wrapped without a hit: retry from 0 */
349 if (id == fma_blk->gnm_num_mboxs &&
350 fma_blk->gnm_next_avail_mbox != 0) {
352 fma_blk->gnm_next_avail_mbox = 0;
358 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
359 id, fma_blk->gnm_num_mboxs);
360 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
361 conn->gnc_mbox_id = id;
363 fma_blk->gnm_next_avail_mbox =
364 (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
365 fma_blk->gnm_avail_mboxs--;
366 conn->gnc_fma_blk = fma_blk;
368 kgnilnd_setup_smsg_attr(smsg_attr);
370 smsg_attr->msg_buffer = fma_blk->gnm_block;
371 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
372 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
373 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
375 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
376 * reset and re-use the same fma_blk after stack reset. This ensures we've
377 * properly mapped it before we use it */
378 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
379 fma_blk, fma_blk->gnm_state);
381 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
382 "allocating SMSG mbox %d buf %p "
383 "offset %u hndl "LPX64"."LPX64"\n",
384 conn, smsg_attr, fma_blk, id,
385 smsg_attr->msg_buffer, smsg_attr->mbox_offset,
386 fma_blk->gnm_hndl.qword1,
387 fma_blk->gnm_hndl.qword2);
/* stamp the debug info so we can see when this mailbox was last claimed */
389 mbox = &fma_blk->gnm_mbox_info[id];
390 mbox->mbx_create_conn_memset = jiffies;
392 /* zero mbox to remove any old data from our last use.
393 * this better be safe, if not our purgatory timers
394 * are too short or a peer really is misbehaving */
395 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
396 0, smsg_attr->buff_size);
400 spin_unlock(&dev->gnd_fmablk_lock);
/* Attach an SMSG mailbox to @conn: repeatedly try to find a free mailbox in
 * the existing FMA blocks and, when none is available, allocate a new
 * (virtual) block and retry.  msg_buffer == NULL is used as the "no mailbox
 * yet" sentinel for the loop.
 * NOTE(review): the err declaration, loop keyword, the error break on a
 * failed allocation, and the return are elided from this view. */
404 kgnilnd_setup_mbox(kgn_conn_t *conn)
406 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
409 smsg_attr->msg_buffer = NULL;
410 /* Look for available mbox */
412 kgnilnd_find_free_mbox(conn);
414 /* nothing in the existing buffers, make a new one */
415 if (smsg_attr->msg_buffer == NULL) {
416 /* for runtime allocations, we only want vmalloc */
417 err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
422 } while (smsg_attr->msg_buffer == NULL);
425 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
/* Release (or purgatory-hold) the SMSG mailbox attached to @conn.
 * purgatory_hold semantics (see inline comment): >0 holds the mailbox for a
 * purgatory period, 0 frees it outright, <0 releases a previous hold.  When
 * the block empties out it is unmapped, and freed once every mailbox is
 * available again.  PHYS blocks are exempt from the unmap/free cycle; those
 * are managed only at startup/shutdown.
 * NOTE(review): declarations of id/found, early-return braces, and several
 * closing braces are elided in this view. */
431 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
433 kgn_device_t *dev = conn->gnc_device;
434 gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
435 kgn_fma_memblock_t *fma_blk = NULL;
436 kgn_mbox_info_t *mbox = NULL;
440 /* if we failed to setup mbox and now destroying conn */
441 if (smsg_attr->msg_buffer == NULL) {
445 id = conn->gnc_mbox_id;
447 spin_lock(&dev->gnd_fmablk_lock);
448 /* make sure our conn points at a valid fma_blk
449 * We use this instead of a mem block search out of smsg_attr
450 * because we could have freed a block for fma_blk #1 but the fma_blk
451 * is still in the list for a purgatory hold. This would induce a false
452 * match if that same block gets reallocated to fma_blk #2 */
453 list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
454 if (fma_blk == conn->gnc_fma_blk) {
459 LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
460 "anywhere in the world\n", conn, conn->gnc_fma_blk);
462 LASSERTF(id < fma_blk->gnm_num_mboxs,
463 "bad id %d max %d\n",
464 id, fma_blk->gnm_num_mboxs);
466 /* < 0 - was held, now free it
467 * == 0 - just free it
468 * > 0 - hold it for now */
469 if (purgatory_hold == 0) {
470 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
471 "hndl "LPX64"."LPX64"\n",
472 conn, smsg_attr, fma_blk, id,
473 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
474 fma_blk->gnm_avail_mboxs++;
476 } else if (purgatory_hold > 0) {
477 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
478 "hndl "LPX64"."LPX64"\n",
479 conn, smsg_attr, fma_blk, id,
480 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
482 fma_blk->gnm_held_mboxs++;
/* remember the longest conn timeout seen in this block; it seeds the
 * hold_timeout used when the block is later unmapped */
483 fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
486 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
487 "hndl "LPX64"."LPX64"\n",
488 conn, smsg_attr, fma_blk, id,
489 fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
/* purgatory_hold < 0: a previously held mailbox becomes available */
491 fma_blk->gnm_held_mboxs--;
492 fma_blk->gnm_avail_mboxs++;
495 if (purgatory_hold <= 0) {
496 /* if kgni is retransmitting, freeing the smsg block before the EP
497 * is destroyed gets messy. Bug 768295. */
498 LASSERTF(conn->gnc_ephandle == NULL,
499 "can't release mbox before EP is nuked. conn 0x%p\n", conn);
501 mbox = &fma_blk->gnm_mbox_info[id];
502 mbox->mbx_release_from_purgatory = jiffies;
504 /* clear conn gnc_fmablk if it is gone - this allows us to
505 * not worry about state so much in kgnilnd_destroy_conn
506 * and makes the guaranteed cleanup of the resources easier */
507 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
508 "conn %p bit %d already cleared in fma_blk %p\n",
510 conn->gnc_fma_blk = NULL;
/* fault injection: pretend an unmapped block is still mapped */
513 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
514 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
515 "as mapped\n", fma_blk);
516 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
519 /* we don't release or unmap PHYS blocks as part of the normal cycle --
520 * those are controlled manually from startup/shutdown */
521 if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
522 /* we can unmap once all are unused (held or avail)
523 * but check hold_timeout to make sure we are not trying to double
524 * unmap this buffer. If there was no hold_timeout set due to
525 * held_mboxs, we'll free the mbox here shortly and won't have to
526 * worry about catching a double free for a 'clean' fma_blk */
527 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
528 (!fma_blk->gnm_hold_timeout)) {
529 kgnilnd_unmap_fmablk(dev, fma_blk);
532 /* But we can only free once they are all avail */
533 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
534 fma_blk->gnm_held_mboxs == 0) {
535 /* all mailboxes are released, free fma_blk */
536 kgnilnd_free_fmablk_locked(dev, fma_blk);
540 spin_unlock(&dev->gnd_fmablk_lock);
/* Count the total number of mailboxes in PHYS FMA blocks on @device,
 * scanning the block list under gnd_fmablk_lock.
 * NOTE(review): the declaration/init of the accumulator 'i' and the return
 * are elided in this view. */
544 kgnilnd_count_phys_mbox(kgn_device_t *device)
547 kgn_fma_memblock_t *fma_blk;
549 spin_lock(&device->gnd_fmablk_lock);
551 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
552 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
553 i += fma_blk->gnm_num_mboxs;
555 spin_unlock(&device->gnd_fmablk_lock);
/* Startup preallocation: keep allocating PHYS FMA blocks until the device
 * has at least kgn_nphys_mbox physical mailboxes, stopping early (with an
 * error report) if an allocation fails.
 * NOTE(review): the rc declaration and loop/return tail are elided here. */
561 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
565 while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
567 rc = kgnilnd_alloc_fmablk(device, 1);
569 CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
570 kgnilnd_count_phys_mbox(device), rc);
/* (Re-)register every PHYS FMA block with GNI - used around stack reset,
 * where PHYS blocks are kept but deregistered.  Serialized by the fmablk
 * semaphore and the list walked under gnd_fmablk_lock.
 * NOTE(review): the rc declaration, an early break on map failure, and the
 * return are elided in this view. */
578 kgnilnd_map_phys_fmablk(kgn_device_t *device)
582 kgn_fma_memblock_t *fma_blk;
584 /* use sem to gate access to single thread, just in case */
585 down(&device->gnd_fmablk_sem);
587 spin_lock(&device->gnd_fmablk_lock);
589 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
590 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
591 rc = kgnilnd_map_fmablk(device, fma_blk);
595 spin_unlock(&device->gnd_fmablk_lock);
597 up(&device->gnd_fmablk_sem);
/* Deregister every PHYS FMA block from GNI (counterpart of
 * kgnilnd_map_phys_fmablk) - used at shutdown / stack reset.  Serialized by
 * the fmablk semaphore; the list is walked under gnd_fmablk_lock. */
603 kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
606 kgn_fma_memblock_t *fma_blk;
608 /* use sem to gate access to single thread, just in case */
609 down(&device->gnd_fmablk_sem);
611 spin_lock(&device->gnd_fmablk_lock);
613 list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
614 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
615 kgnilnd_unmap_fmablk(device, fma_blk);
617 spin_unlock(&device->gnd_fmablk_lock);
619 up(&device->gnd_fmablk_sem);
/* Free every PHYS FMA block at shutdown.  Uses the _safe list iterator
 * because kgnilnd_free_fmablk_locked deletes the entry from the list.
 * Serialized by the fmablk semaphore; list walked under gnd_fmablk_lock. */
623 kgnilnd_free_phys_fmablk(kgn_device_t *device)
626 kgn_fma_memblock_t *fma_blk, *fma_blkN;
628 /* use sem to gate access to single thread, just in case */
629 down(&device->gnd_fmablk_sem);
631 spin_lock(&device->gnd_fmablk_lock);
633 list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
634 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
635 kgnilnd_free_fmablk_locked(device, fma_blk);
637 spin_unlock(&device->gnd_fmablk_lock);
639 up(&device->gnd_fmablk_sem);
642 /* kgnilnd dgram nid->struct management */
/* Hash a NID to its bucket in the device's datagram hash table.  Uses the
 * low 32 bits of the NID modulo the peer hash size tunable.
 * NOTE(review): the body braces are elided in this view. */
644 static inline struct list_head *
645 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
647 unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
649 RETURN(&dev->gnd_dgrams[hash]);
653 /* needs dev->gnd_dgram_lock held */
/* Look up an outstanding (still-POSTED) datagram to @dst_nid in its hash
 * bucket.  Datagrams past POSTED are skipped since they are already in
 * cancel/completion handling.
 * NOTE(review): the dgram declaration, the skip 'continue', and the return
 * statements are elided in this view. */
655 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
657 struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
660 list_for_each_entry(dgram, dgram_list, gndg_list) {
662 /* if state > POSTED, we are already handling cancel/completion */
663 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
664 dgram->gndg_state > GNILND_DGRAM_POSTED)
667 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
668 dgram, libcfs_nid2str(dst_nid));
/* Find any posted datagram for @dst_nid and cancel it under the dgram lock.
 * The RETURN expression yields truth of (dgram == NULL), i.e. nonzero when
 * nothing was found to cancel.
 * NOTE(review): the dgram declaration and the NULL check around the cancel
 * call are elided in this view. */
675 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
679 spin_lock(&dev->gnd_dgram_lock);
680 dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
683 kgnilnd_cancel_dgram_locked(dgram);
685 spin_unlock(&dev->gnd_dgram_lock);
687 RETURN(!!(dgram == NULL));
/* Build the wire-format connection request @connreq for @conn of the given
 * @type.  Several CFS_FAIL_CHECK hooks deliberately corrupt individual
 * fields (version, type, stamps, timeout) for protocol-error testing.  For
 * a REQ, this also allocates the SMSG mailbox and copies its attributes
 * into the payload.
 * NOTE(review): the err declaration, the fault-injection assignments under
 * the first two CFS_FAIL_CHECKs, else branches, and the error return after
 * a failed mailbox setup are elided in this view. */
691 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
692 lnet_nid_t srcnid, lnet_nid_t dstnid,
693 kgn_connreq_type_t type)
697 /* ensure we haven't violated max datagram size */
698 CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
700 /* no need to zero out, we do that when allocating dgram */
701 connreq->gncr_magic = GNILND_MSG_MAGIC;
703 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
705 } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
709 connreq->gncr_srcnid = srcnid;
710 connreq->gncr_dstnid = dstnid;
/* each CONNREQ_PROTO failure hook below poisons one field to exercise the
 * peer's validation path */
712 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
713 connreq->gncr_version = 99;
715 connreq->gncr_version = GNILND_CONNREQ_VERSION;
717 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
718 connreq->gncr_type = 99;
720 connreq->gncr_type = type;
722 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
723 connreq->gncr_peerstamp = 0;
725 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
727 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
728 connreq->gncr_connstamp = 0;
730 connreq->gncr_connstamp = conn->gnc_my_connstamp;
732 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
733 connreq->gncr_timeout = 0;
735 connreq->gncr_timeout = conn->gnc_timeout;
738 /* the rest pack the data into the payload in other places */
739 if (type == GNILND_CONNREQ_REQ) {
740 kgn_gniparams_t *req_params = &connreq->gncr_gnparams;
741 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
742 req_params->gnpr_cqid = conn->gnc_cqid;
744 /* allocate mailbox for this connection */
745 err = kgnilnd_setup_mbox(conn);
747 CERROR("Failed to setup FMA mailbox (%d)\n", err);
749 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
752 /* XXX Nic: TBD - checksum computation */
/* Validate and byte-swap an incoming connection request held in
 * dgram->gndg_conn_in.  Handles both byte orders via the magic value,
 * repairs srcnid on active (non-wildcard) datagrams so a proper peer lookup
 * is possible even for corrupt packets, validates dstnid against our nets,
 * checks version/type/timestamps/timeout, and for a REQ wires up connection
 * parameters via kgnilnd_set_conn_params.
 * NOTE(review): declarations (swab, rc, net), most error returns, and
 * closing braces are elided in this view. */
758 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
760 kgn_connreq_t *connreq = &dgram->gndg_conn_in;
764 /* the following fields must be handled in a backwards compatible
765 * manner to ensure we can always send and interpret NAKs */
767 if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
768 connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
769 /* Unexpected magic! */
770 CERROR("Unexpected magic %08x\n",
771 connreq->gncr_magic);
/* peer has opposite endianness: swap the fixed header fields */
775 swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
777 __swab32s(&connreq->gncr_magic);
778 __swab32s(&connreq->gncr_cksum);
779 __swab16s(&connreq->gncr_type);
780 __swab16s(&connreq->gncr_version);
781 __swab32s(&connreq->gncr_timeout);
782 __swab64s(&connreq->gncr_srcnid);
783 __swab64s(&connreq->gncr_dstnid);
784 __swab64s(&connreq->gncr_peerstamp);
785 __swab64s(&connreq->gncr_connstamp);
788 /* Do NOT return anything but -EBADF before we munge
789 * connreq->gncr_srcnid - we need that to send the nak */
791 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
792 lnet_nid_t incoming = connreq->gncr_srcnid;
794 /* even if the incoming packet is hosed, we know who we sent
795 * the original and can set the srcnid so that we can properly
796 * look up our peer to close the loop on this connreq. We still use
797 * -EBADF to prevent a NAK - just in case there are issues with
798 * the payload coming from a random spot, etc. */
799 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
801 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
802 LNET_NIDADDR(incoming)) {
803 /* we got a datagram match for the wrong nid... */
804 CERROR("matched datagram 0x%p with srcnid %s "
805 "(%x), expecting %s (%x)\n",
807 libcfs_nid2str(incoming),
808 LNET_NIDADDR(incoming),
809 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
810 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
814 /* if we have a wildcard datagram it should match an
815 * incoming "active" datagram that should have a fully formed
816 * srcnid and dstnid. If we couldn't unpack it, we drop as
817 * corrupted packet, otherwise we'll just verify that the dstnid
818 * matches the NID for the NET that the dgram was posted */
820 /* make sure their wildcard didn't match ours, that is impossible */
821 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
822 "dgram 0x%p from %s, connreq 0x%p; "
823 "wildcard matched wildcard \n", dgram,
824 libcfs_nid2str(connreq->gncr_srcnid), connreq);
826 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
828 if (rc == -ESHUTDOWN) {
829 CERROR("Looking up network: device is in shutdown");
831 } else if (rc == -ENONET) {
832 CERROR("Connection data from %s: she sent "
833 "dst_nid %s, but net lookup failed on "
835 libcfs_nid2str(connreq->gncr_srcnid),
836 libcfs_nid2str(connreq->gncr_dstnid),
837 dgram, kgnilnd_dgram_type2str(dgram));
841 if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
842 CERROR("Bad connection data from %s: she sent "
843 "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
844 libcfs_nid2str(connreq->gncr_srcnid),
845 libcfs_nid2str(connreq->gncr_dstnid),
846 libcfs_nid2str(net->gnn_ni->ni_nid),
847 dgram, kgnilnd_dgram_type2str(dgram));
848 kgnilnd_net_decref(net);
852 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
853 kgnilnd_net_decref(net);
856 if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
857 CERROR("Unexpected version %d\n", connreq->gncr_version);
861 /* XXX Nic: TBD - checksum validation */
862 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
/* swap the type-specific payload; only REQ carries SMSG params, only NAK
 * carries an errno */
866 if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
867 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
869 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
870 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
871 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
872 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
873 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
874 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
875 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
876 __swab64s(&msg_addr);
877 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
878 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
879 } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
880 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
883 /* since we use a unique instance ID for each network, the driver
884 * will take care of dropping datagrams if we don't have that network.
887 /* few more idiot software or configuration checks */
889 switch (connreq->gncr_type) {
890 case GNILND_CONNREQ_REQ:
891 /* wire up EP and SMSG block - this will check the incoming data
892 * and barf a NAK back if need to */
893 rc = kgnilnd_set_conn_params(dgram);
897 case GNILND_CONNREQ_NAK:
898 case GNILND_CONNREQ_CLOSE:
901 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
905 if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
906 CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
907 connreq->gncr_peerstamp, connreq->gncr_connstamp);
911 if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
912 CERROR("Received timeout %d < MIN %d\n",
913 connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
/* Allocate and initialize a datagram descriptor of @type for @dev from the
 * dgram slab cache, returning it via @dgramp.  The descriptor is zeroed
 * (cache memory is not) and stamped with the dgram magic; the device's
 * outstanding-dgram count is bumped.
 * NOTE(review): the dgram declaration, the alloc flags argument, the NULL
 * check, and the returns are elided in this view. */
921 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
925 dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
930 /* cache alloc'd memory is not zeroed */
931 memset((void *)dgram, 0, sizeof(*dgram)) ;
933 INIT_LIST_HEAD(&dgram->gndg_list);
934 dgram->gndg_state = GNILND_DGRAM_USED;
935 dgram->gndg_type = type;
936 dgram->gndg_magic = GNILND_DGRAM_MAGIC;
938 atomic_inc(&dev->gnd_ndgrams);
940 CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
941 sizeof(*dgram), dgram);
947 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
948 * returns < 0 on dgram to be cleaned up
949 * > 0 on dgram that isn't done yet
950 * == 0 on dgram that is ok and needs connreq processing */
/* Classify a datagram by the GNI post_state returned from the test call
 * and translate it into the rc convention documented above.  TERMINATED
 * and TIMEOUT fall through to cleanup; CANCELED dgrams drop the device's
 * canceled-dgram count here.
 * NOTE(review): the rc declaration, the rc assignments in several cases,
 * break statements, and the returns are elided in this view. */
952 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
956 switch (post_state) {
957 case GNI_POST_COMPLETED:
958 /* normal state for dgrams that need actual processing */
959 /* GOTO to avoid processing dgram as canceled/done */
960 GOTO(process_out, rc);
962 case GNI_POST_PENDING:
963 /* we should only see this if we are testing a WC dgram after a
964 * cancel - it means that it needs a full cycle of waiting
965 * for kgni_sm_task to finish moving it to TERMINATED */
966 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
967 (dgram->gndg_state == GNILND_DGRAM_CANCELED),
968 "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
969 dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
970 dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
972 /* positive RC as this dgram isn't done yet */
975 /* GOTO as this isn't done yet */
976 GOTO(process_out, rc);
979 case GNI_POST_TERMINATED:
980 /* we've called cancel and it is done or remote guy called cancel and
981 * we've received it on a WC dgram */
983 /* we are seeing weird terminations on non WC dgrams when we have not
986 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
987 dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
988 "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
989 dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
990 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
993 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
994 dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");
999 case GNI_POST_TIMEOUT:
1000 /* we could have a timeout on a wildcard dgram too - if
1001 * we got the incoming request but the remote node beefed
1002 * before kgni could send the match data back. We'll just error
1003 * on the active case and bail out gracefully */
1004 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1005 CNETERR("hardware timeout for connect to "
1006 "%s after %lu seconds. Is node dead?\n",
1007 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1008 cfs_duration_sec(jiffies - dgram->gndg_post_time));
1015 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1019 /* now finish cleaning up a dgram that is canceled/terminated and needs to
1022 /* If this was actively canceled, drop the count now that we are processing */
1023 if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1024 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1025 /* caller responsible for gndg_list removal */
1033 /* needs dev->gnd_dgram_lock held */
/* Cancel a POSTED datagram.  Marks it CANCELED, issues the GNI cancel by
 * id, and for wildcard (WC_REQ) datagrams immediately probes with
 * test_by_id: NO_MATCH means the cancel completed at once, SUCCESS means
 * the post_state must be processed (possibly still pending).  Non-WC
 * datagrams stay on the nid list until kgni reports their termination via
 * probe.
 * NOTE(review): declarations (grc, rc), early return on non-POSTED state,
 * the skip of cancel_by_id for established conns, and several
 * returns/braces are elided in this view. */
1035 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1039 if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1043 LASSERTF(dgram->gndg_conn != NULL,
1044 "dgram 0x%p with NULL conn\n", dgram);
1046 /* C.E - WC dgrams could be canceled immediately but
1047 * if there was some match pending, we need to call
1048 * test_by_id to clear it out. If that test returns
1049 * POST_PENDING, it is half done and needs to go along
1050 * with the rest of dgrams and go through a kgni_sm_task cycle
1051 * and deliver a GNI_POST_TERMINATED event before they
1052 * are actually canceled */
1054 dgram->gndg_state = GNILND_DGRAM_CANCELED;
1056 if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1057 /* we don't need to cancel_by_id if the datagram was good */
1061 /* let folks know there are outstanding cancels */
1062 atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1063 /* leave on nid list until cancel is done for debugging fun */
1064 grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1066 /* if we don't get success here, we have hosed up the dgram tracking
1067 * code and need to bail out */
1068 LASSERTF(grc == GNI_RC_SUCCESS,
1069 "postdata_cancel returned %d for conn 0x%p to %s\n",
1070 grc, dgram->gndg_conn,
1071 dgram->gndg_conn->gnc_peer ?
1072 libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1076 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1077 dgram, dgram->gndg_conn,
1078 dgram->gndg_conn->gnc_ephandle);
1080 if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1081 gni_post_state_t post_state;
1083 __u32 remote_addr = 0, remote_id = 0;
1085 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1086 (__u64)dgram, &post_state,
1087 &remote_addr, &remote_id);
1089 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1090 "bad grc %d from test_by_id on dgram 0x%p\n",
1093 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1094 * through full cycle, we get SUCCESS and need to parse post_state */
1096 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1097 "remote_addr %u remote_id %u\n", grc, dgram,
1098 kgnilnd_dgram_type2str(dgram),
1099 post_state, remote_addr, remote_id);
1101 if (grc == GNI_RC_NO_MATCH) {
1102 /* she's gone, reduce count and move along */
1103 dgram->gndg_state = GNILND_DGRAM_DONE;
1104 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1108 rc = kgnilnd_process_dgram(dgram, post_state);
1111 /* if for some weird reason we get a valid dgram back, just mark as done
1112 * so we can drop it and move along.
1113 * C.E - if it was completed, we'll just release the conn/mbox
1114 * back into the pool and it'll get reused. That said, we should only
1115 * be canceling a WC dgram on stack rest or shutdown, so that is moot */
1116 dgram->gndg_state = GNILND_DGRAM_DONE;
1117 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1119 /* caller context responsible for calling kgnilnd_release_dgram() */
1121 /* still pending, let it simmer until golden brown and delicious */
1125 /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1126 * for kgni to return their ID to us via probe - that is when we'll complete their
1127 * cancel processing */
/* Drop the dgram's reference on its conn (if any) and clear the pointer so a
 * later release/free cannot double-decref. Safe to call on a dgram whose conn
 * was never attached. */
1131 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1133 /* release the dgram ref on conn */
1134 if (dgram->gndg_conn) {
1135 kgnilnd_conn_decref(dgram->gndg_conn);
1136 dgram->gndg_conn = NULL;
/* Return a finished dgram to the slab cache. Only legal once the dgram has
 * reached USED or DONE (asserted); poisons the magic first so stale pointers
 * into freed memory are detectable, and drops the device's dgram count. */
1141 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1143 LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1144 dgram->gndg_state == GNILND_DGRAM_DONE,
1145 "dgram 0x%p with bad state %s\n",
1146 dgram, kgnilnd_dgram_state2str(dgram));
1148 /* bit of poisoning to help detect bad driver data */
1149 dgram->gndg_magic = 0x6f5a6b5f;
1150 atomic_dec(&dev->gnd_ndgrams);
1152 cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1153 CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
1154 sizeof(*dgram), dgram);
/* Allocate and post one datagram to dstnid (or a wildcard when dstnid ==
 * LNET_NID_ANY). Builds a conn + connreq, binds the EP for targeted posts,
 * posts via kgnilnd_ep_postdata_w_id, then links the dgram onto the per-nid
 * list under gnd_dgram_lock and marks it POSTED. On any failure the dgram is
 * cleaned up and freed before return. Targeted (non-WC) double-posts to the
 * same nid are asserted against; multiple WC posts are allowed.
 * NOTE(review): partial extract — error-label and return lines not all visible. */
1158 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1162 kgn_dgram_t *dgram = NULL;
1163 kgn_dgram_t *tmpdgram;
1164 kgn_dgram_type_t dgtype;
/* map connreq type -> dgram type; REQ to LNET_NID_ANY becomes a wildcard */
1170 case GNILND_CONNREQ_REQ:
1171 if (dstnid == LNET_NID_ANY)
1172 dgtype = GNILND_DGRAM_WC_REQ;
1174 dgtype = GNILND_DGRAM_REQ;
1176 case GNILND_CONNREQ_NAK:
1177 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1178 dgtype = GNILND_DGRAM_NAK;
1181 CERROR("unknown connreq type %d\n", type);
1185 rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1188 GOTO(post_failed, rc);
1191 rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1193 GOTO(post_failed, rc);
1196 if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1197 /* clear buffer for sanity on reuse of wildcard */
1198 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1201 if (dstnid == LNET_NID_ANY) {
1202 /* set here to reset any dgram re-use */
1203 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
/* targeted post: resolve the nid to a NIC address and bind the endpoint */
1207 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1210 GOTO(post_failed, rc);
1213 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1215 /* don't need to serialize, there are no CQs for the dgram
1216 * EP on the kgn_net_t */
1217 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1219 if (grc != GNI_RC_SUCCESS) {
1221 GOTO(post_failed, rc);
1226 /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1227 * net of the destination node.
1230 if (dstnid == LNET_NID_ANY) {
1231 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1233 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1236 rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1237 srcnid, dstnid, type);
1239 GOTO(post_failed, rc);
/* NAKs carry the originating errno so the peer can report why we refused */
1242 if (type == GNILND_CONNREQ_NAK)
1243 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1245 dgram->gndg_post_time = jiffies;
1247 /* XXX Nic: here is where we'd add in logical network multiplexing */
1249 CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1250 dgram, kgnilnd_dgram_type2str(dgram),
1251 libcfs_nid2str(srcnid),
1252 libcfs_nid2str(dstnid), dev->gnd_id);
1254 /* this allocates memory, can't hold locks across */
1255 grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1256 &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1257 &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1260 if (grc != GNI_RC_SUCCESS) {
1261 CNETERR("dropping failed dgram post id 0x%p type %s"
1262 " reqtype %s to %s: rc %d\n",
1263 dgram, kgnilnd_dgram_type2str(dgram),
1264 kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1265 libcfs_nid2str(dstnid), grc);
1266 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1267 GOTO(post_failed, rc);
1270 /* we don't need to add earlier - if someone does del_peer during post,
1271 * that peer will get marked as unlinked and the callers wil take care of it.
1272 * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1273 * the completed dgram later when we cant find a peer to stuff it into */
1275 spin_lock(&dev->gnd_dgram_lock);
1277 /* make sure we are not double posting targeted dgrams
1278 * - we can multiple post WC dgrams to help with processing speed */
1279 if (dstnid != LNET_NID_ANY) {
1280 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1282 LASSERTF(tmpdgram == NULL,
1283 "dgram 0x%p->%s already posted\n",
1284 dgram, libcfs_nid2str(dstnid));
1287 /* unmunge dstnid to help processing code cope... */
1288 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1289 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1292 list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1293 dgram->gndg_state = GNILND_DGRAM_POSTED;
1294 spin_unlock(&dev->gnd_dgram_lock);
/* post_failed: unwind a partially-built dgram on any error above */
1297 if (rc < 0 && dgram != NULL) {
1298 kgnilnd_cleanup_dgram(dgram);
1299 kgnilnd_free_dgram(dev, dgram);
/* Release a dgram back to the system: cancel it if still posted, drop its
 * conn ref, and free it unless it is mid-cancel (CANCELED dgrams must wait
 * for the kgni terminate event). If it was a wildcard and the stack is not
 * shutting down or in reset, immediately repost a replacement wildcard so
 * inbound connects keep a landing slot. */
1306 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1308 spin_lock(&dev->gnd_dgram_lock);
1309 kgnilnd_cancel_dgram_locked(dgram);
1310 spin_unlock(&dev->gnd_dgram_lock);
1312 kgnilnd_cleanup_dgram(dgram);
1314 /* if the dgram is 'canceled' it needs to be wait until the event
1315 * comes up from kgni that tells us it is safe to release */
1316 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1317 dgram->gndg_state = GNILND_DGRAM_DONE;
1319 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1321 /* if it is a wildcard and we are in an appropriate state, repost
1324 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1325 (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1328 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1330 "error %d: dev %d could not repost wildcard datagram id 0x%p\n",
1331 rerc, dev->gnd_id, dgram);
1334 /* always free the old dgram */
1335 kgnilnd_free_dgram(dev, dgram);
/* Poll kgni for one completed datagram. Under gnd_dgram_lock, probe for a
 * ready id, validate the dgram (magic / state / list membership), pull it
 * off the nid list and move POSTED -> PROCESSING so nobody can cancel it
 * after we own it. Then (unlocked) fetch the post state via test_by_id and
 * hand it to kgnilnd_process_dgram. Returns 0 when nothing was ready;
 * presumably >0 when a dgram was handled and <0 on error (partial view) —
 * *dgramp gives the caller the dgram to release on the error path. */
1341 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1343 kgn_dgram_t *dgram = NULL;
1344 gni_post_state_t post_state;
1348 __u32 remote_addr = 0, remote_id = 0;
1351 /* Probe with the lock held. That way if we get a dgram we dont have it canceled
1352 * between finding the ready dgram and grabbing the lock to remove it from the
1353 * list. Otherwise we could be left in an inconsistent state. We own the dgram
1354 * once its off the list so we don't need to worry about others changing it at
1356 spin_lock(&dev->gnd_dgram_lock);
1357 grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1358 if (grc != GNI_RC_SUCCESS) {
1359 spin_unlock(&dev->gnd_dgram_lock);
1360 /* return 0 to indicate nothing happened */
1364 CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
/* the id we posted with is the dgram pointer itself (see postdata_w_id) */
1367 dgram = (kgn_dgram_t *)readyid;
1369 LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1370 "dgram 0x%p from id "LPX64" with bad magic %x\n",
1371 dgram, readyid, dgram->gndg_magic);
1373 LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1374 dgram->gndg_state == GNILND_DGRAM_CANCELED,
1375 "dgram 0x%p with bad state %s\n",
1376 dgram, kgnilnd_dgram_state2str(dgram));
1378 LASSERTF(!list_empty(&dgram->gndg_list),
1379 "dgram 0x%p with bad list state %s\n",
1380 dgram, kgnilnd_dgram_state2str(dgram));
1382 /* now we know that the datagram structure is ok, so pull off list */
1383 list_del_init(&dgram->gndg_list);
1385 /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
1386 * change the state from POSTED to PROCESSING to ensure that
1387 * nobody cancels it after we've pulled it from the wire */
1388 if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1389 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1392 spin_unlock(&dev->gnd_dgram_lock);
1394 /* we now "own" this datagram */
1396 LASSERTF(dgram->gndg_conn != NULL,
1397 "dgram 0x%p with NULL conn\n", dgram);
1399 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1400 (__u64)dgram, &post_state,
1401 &remote_addr, &remote_id);
1403 LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1404 " id "LPU64" was ready\n", readyid);
1406 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1407 "remote_addr %u remote_id %u\n", grc, dgram,
1408 kgnilnd_dgram_type2str(dgram),
1409 post_state, remote_addr, remote_id);
1411 if (unlikely(grc != GNI_RC_SUCCESS)) {
1412 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1413 dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1416 GOTO(probe_for_out, rc);
1419 rc = kgnilnd_process_dgram(dgram, post_state);
1421 /* we should never get probe finding a dgram for us and then it
1422 * being a WC dgram that is still in the middle of processing */
1423 LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1424 rc, dgram, post_state);
1427 /* dgram is good enough for the data to be used */
1428 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1429 /* fake rc to mark that we've done something */
1432 /* bring out your dead! */
1433 dgram->gndg_state = GNILND_DGRAM_DONE;
/* probe_for_out: error path hands the dgram back for immediate release */
1441 kgnilnd_release_dgram(dev, dgram);
/* Post the configured number (*kgn_nwildcard) of wildcard datagrams so the
 * device can accept inbound connection requests. Returns -ENOENT when the
 * tunable is zero (loop never runs); otherwise the rc of the posts. */
1446 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1448 /* if kgn_wildcard is zero, return error */
1449 int rc = -ENOENT, i;
1452 for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1453 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1455 CERROR("error %d: could not post wildcard datagram # %d\n",
/* Cancel every outstanding targeted dgram belonging to this net. Walks the
 * whole per-device dgram hash under gnd_dgram_lock, skipping wildcards and
 * dgrams destined for other nets. Only valid during net shutdown or stack
 * reset (asserted). NOTE(review): the zombies list is initialized but its
 * draining is not visible in this extract. */
1467 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1469 kgn_dgram_t *dg, *dgN;
1470 struct list_head zombies;
1474 /* we want to cancel any outstanding dgrams - we don't want to rely
1475 * on del_peer_or_conn catching all of them. This helps protect us in cases
1476 * where we don't quite keep the peer->dgram mapping in sync due to some
1477 * race conditions */
1479 LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1480 "called with LND invalid state: net shutdown %d "
1481 "in reset %d\n", net->gnn_shutdown,
1482 kgnilnd_data.kgn_in_reset);
1484 INIT_LIST_HEAD(&zombies);
1486 spin_lock(&net->gnn_dev->gnd_dgram_lock);
1488 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1489 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1491 /* skip nids not on our net or are wildcards */
1494 if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1495 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1498 kgnilnd_cancel_dgram_locked(dg);
1502 spin_unlock(&net->gnn_dev->gnd_dgram_lock);
/* Cancel all outstanding wildcard dgrams on the device. Repeatedly looks up
 * the LNET_NID_ANY entry under gnd_dgram_lock, cancels it, and collects any
 * that completed (DONE) onto a local zombies list, which is then released
 * outside the lock. Only valid during WC shutdown or stack reset (asserted). */
1508 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1510 kgn_dgram_t *dg, *dgN;
1511 struct list_head zombies;
1514 /* Time to kill the outstanding WC's
1515 * WC's exist on net 0 only but match on any net...
1518 LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1519 "called with LND invalid state: WC shutdown %d "
1520 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1521 kgnilnd_data.kgn_in_reset);
1523 INIT_LIST_HEAD(&zombies);
1524 spin_lock(&dev->gnd_dgram_lock);
1527 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1529 LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1530 "dgram 0x%p->%s with bad type %d (%s)\n",
1531 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1532 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1534 kgnilnd_cancel_dgram_locked(dg);
1536 /* WC could be DONE already, check and if so add to list to be released */
1537 if (dg->gndg_state == GNILND_DGRAM_DONE) {
1538 list_del_init(&dg->gndg_list);
1539 list_add_tail(&dg->gndg_list, &zombies);
1542 } while (dg != NULL);
1544 spin_unlock(&dev->gnd_dgram_lock);
/* release completed WCs with the lock dropped - release may repost/free */
1546 list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1547 list_del_init(&dg->gndg_list);
1548 kgnilnd_release_dgram(dev, dg);
/* Block until every canceled dgram on the device has been terminated by
 * kgni and released. Loops while gnd_canceled_dgrams is non-zero, waiting
 * (about a second per iteration) via postdata_probe_wait_by_id and probing/
 * releasing whatever completes. Called only from shutdown / base shutdown /
 * stack reset; intentionally races with the dgram mover thread (see comment). */
1555 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1563 /* use do while to get at least one check run to allow
1564 * regression test for 762072 to hit bug if there */
1566 /* This function races with the dgram mover during shutdown so it is possible for
1567 * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1568 * dgram mover thread instead of inside of this function.
1571 /* This should only be called from within shutdown, baseshutdown, or stack reset.
1572 * there are no assertions here to verify since base_shutdown has nothing in it we can check
1573 * the net is gone by then.
/* (i & -i) == i is true only for powers of two - throttles the WARNING spam */
1578 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1579 "Waiting for %d canceled datagrams to clear on device %d\n",
1580 atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1582 /* check once a second */
1583 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1586 if (grc != GNI_RC_SUCCESS)
1589 CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
1590 readyid, dev->gnd_id, dev);
1592 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1594 /* if we got a valid dgram or one that is now done, clean up */
1595 kgnilnd_release_dgram(dev, dgram);
1597 } while (atomic_read(&dev->gnd_canceled_dgrams));
/* Start an active connect to a peer: move it CONNECT -> POSTING under the
 * peer_conn lock, post a targeted REQ dgram, then re-check for races (peer
 * unlinked or marked NEEDS_DEATH while we were posting) before settling into
 * POSTED. Several CFS_FAIL hooks provide race windows for testing. Bails out
 * early if the peer went inactive or stopped connecting before we posted. */
1601 kgnilnd_start_connect(kgn_peer_t *peer)
1604 /* sync point for kgnilnd_del_peer_locked - do an early check to
1605 * catch the most common hits where del_peer is done by the
1606 * time we get here */
1607 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1608 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1611 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1612 if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1613 /* raced with peer getting unlinked */
1614 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1618 peer->gnp_connecting = GNILND_PEER_POSTING;
1619 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1621 set_mb(peer->gnp_last_dgram_time, jiffies);
1622 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1623 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1626 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1627 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1628 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1630 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1631 peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1634 set_mb(peer->gnp_last_dgram_errno, rc);
1638 /* while we're posting someone could have decided this peer/dgram needed to
1639 * die a quick death, so we check for state change and process accordingly */
1641 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1642 if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1643 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1644 peer->gnp_connecting = GNILND_PEER_KILL;
1646 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1647 /* positive RC to avoid dgram cleanup - we'll have to
1648 * wait for the kgni GNI_POST_TERMINATED event to
1649 * finish cleaning up */
1651 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1654 peer->gnp_connecting = GNILND_PEER_POSTED;
1655 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1656 /* reaper thread will take care of any timeouts */
1657 CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1658 libcfs_nid2str(peer->gnp_nid), rc);
/* failure path: log and fall through to caller cleanup */
1663 CDEBUG(D_NET, "connect to %s failed: rc %d \n",
1664 libcfs_nid2str(peer->gnp_nid), rc);
/* Complete a matched connection request: find-or-create the peer for the
 * remote nid, reject duplicates, close stale conns, then wire the dgram's
 * conn into the peer and cqid tables, mark it ESTABLISHED, flush the peer's
 * queued TXs onto it, notify LNet, and kick the scheduler. Runs in the dgram
 * mover context (no kgn_net_rw_sem held); all table surgery happens under
 * kgn_peer_conn_lock. */
1670 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1672 kgn_conn_t *conn = dgram->gndg_conn;
1673 lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid;
1674 kgn_peer_t *new_peer, *peer = NULL;
1677 kgn_mbox_info_t *mbox;
1681 /* try to find a peer that matches the nid we got in the connreq
1682 * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1683 * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1685 /* assume this is a new peer - it makes locking cleaner when it isn't */
1686 /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1688 rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
1690 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1694 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1696 /* this transfers ref from create_peer to the kgn_peer table */
1697 kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1699 /* if we found an existing peer, is it really ready for a new conn ? */
1700 if (peer != new_peer) {
1701 /* if this was an active connect attempt but we can't find a peer waiting for it
1702 * we will dump in the trash */
1704 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1705 CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1706 libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1707 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1712 /* check to see if we can catch a connecting peer before it is
1713 * removed from the connd_peers list - if not, we need to
1714 * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1715 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1716 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1717 if (!list_empty(&peer->gnp_connd_list)) {
1718 list_del_init(&peer->gnp_connd_list);
1719 /* drop connd ref */
1720 kgnilnd_peer_decref(peer);
1722 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1723 /* clear rc to make sure we don't have fake error */
1727 /* no matter what, we are no longer waiting to connect this peer now */
1728 peer->gnp_connecting = GNILND_PEER_IDLE;
1730 /* Refuse to duplicate an existing connection (both sides might try to
1731 * connect at once). NB we return success! We _are_ connected so we
1732 * _don't_ have any blocked txs to complete with failure. */
1733 rc = kgnilnd_conn_isdup_locked(peer, conn);
1735 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1736 CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1737 libcfs_nid2str(her_nid), rc);
1743 nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1745 /* either way with peer (new or existing), we are ok with ref counts here as the
1746 * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1747 * ref for the peer table. */
1749 /* at this point, the connection request is a winner */
1751 /* mark 'DONE' to avoid cancel being called from release */
1752 dgram->gndg_state = GNILND_DGRAM_DONE;
1754 /* initialise timestamps before reaper looks at them */
1755 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1757 /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1758 * immediatly send a NOOP in the reaper thread during the call to
1759 * kgnilnd_check_conn_timeouts_locked
1761 conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1762 conn->gnc_state = GNILND_CONN_ESTABLISHED;
1764 /* refs are not transferred from dgram to tables, so increment to
1766 kgnilnd_conn_addref(conn);
1767 kgnilnd_peer_addref(peer);
1768 conn->gnc_peer = peer;
1769 list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1771 kgnilnd_conn_addref(conn); /* +1 ref for conn table */
1772 list_add_tail(&conn->gnc_hashlist,
1773 kgnilnd_cqid2connlist(conn->gnc_cqid));
1774 kgnilnd_data.kgn_conn_version++;
1776 /* Dont send NOOP if fail_loc is set
1778 if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1779 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1781 CNETERR("can't get TX to initiate NOOP to %s\n",
1782 libcfs_nid2str(peer->gnp_nid));
1784 kgnilnd_queue_tx(conn, tx);
1788 /* Schedule all packets blocking for a connection */
1789 list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1790 /* lock held here is the peer_conn lock */
1791 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1792 kgnilnd_queue_tx(conn, tx);
1795 /* If this is an active connection lets mark its timestamp on the MBoX */
1796 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1797 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1798 /* conn->gnc_last_rx is jiffies it better exist as it was just set */
1799 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1802 /* Bug 765042: wake up scheduler for a race with finish_connect and
1803 * complete_conn_closed with a conn in purgatory
1804 * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1805 * we just check for set and then clear */
1806 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1808 /* get scheduler thread moving again */
1809 kgnilnd_schedule_device(conn->gnc_device);
1812 CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1813 conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1815 /* make sure we reset peer reconnect interval now that we have a good conn */
1816 kgnilnd_peer_alive(peer);
1817 peer->gnp_reconnect_interval = 0;
1819 /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
1820 * on the atomic forever
1822 if (peer->gnp_pending_unlink) {
1823 peer->gnp_pending_unlink = 0;
1824 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1825 CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
1828 /* add ref to make it hang around until after we drop the lock */
1829 kgnilnd_conn_addref(conn);
1831 /* Once the peer_conn lock is dropped, the conn could actually move into
1832 * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1833 * lock until we are really done */
1834 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1836 /* Notify LNET that we now have a working connection to this peer.
1837 * This is a Cray extension to the "standard" LND behavior. */
1838 lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
1839 1, cfs_time_current());
1841 /* schedule the conn to pick up any SMSG sent by peer before we could
1842 * process this dgram */
1843 kgnilnd_schedule_conn(conn);
1845 /* drop our 'hold' ref */
1846 kgnilnd_conn_decref(conn);
/* Send a NAK datagram carrying 'error' to dst_nid so the peer learns why its
 * connreq was refused. dst_nid must be a real nid (asserted - NAKs cannot be
 * wildcarded); delegates the actual post to kgnilnd_post_dgram. */
1853 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1858 LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1860 CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1862 rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1865 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
/* Handle an inbound NAK: look up the peer by the NAK's src nid and either
 * (a) peer idle - close any conns matching the NAK's peerstamp/connstamp via
 * a stack conn stub, or (b) peer mid-connect - cancel its pending dgram (or
 * drop the NAK if the peer is already re-queued on the connd list, since that
 * means a newer connect attempt superseded the one that was NAKed). */
1871 kgnilnd_process_nak(kgn_dgram_t *dgram)
1873 kgn_connreq_t *connreq = &dgram->gndg_conn_in;
1874 lnet_nid_t src_nid = connreq->gncr_srcnid;
1875 int errno = connreq->gncr_nakdata.gnnd_errno;
1879 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1881 peer = kgnilnd_find_peer_locked(src_nid);
1883 /* we likely dropped him from bad data when we processed
1884 * the original REQ */
1885 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1889 /* need to check peerstamp/connstamp against the ones we find
1890 * to make sure we don't close new (and good?) conns that we
1891 * formed after this connreq failed */
1892 if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1895 if (list_empty(&peer->gnp_conns)) {
1896 /* assume already procced datagram and it barfed up
1897 * on this side too */
1898 CDEBUG(D_NET, "dropping NAK from %s; "
1899 "peer %s is already not connected\n",
1900 libcfs_nid2str(connreq->gncr_srcnid),
1901 libcfs_nid2str(connreq->gncr_dstnid));
1902 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1906 /* stub up a connection with the connreq XXX_stamps to allow
1907 * use to use close_stale_conns_locked */
1908 conn.gnc_peerstamp = connreq->gncr_peerstamp;
1909 conn.gnc_my_connstamp = connreq->gncr_connstamp;
1910 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
1911 conn.gnc_device = peer->gnp_net->gnn_dev;
1913 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
1915 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1916 "closed %d connections\n",
1917 libcfs_nid2str(connreq->gncr_srcnid),
1918 libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
/* peer not IDLE: still (re)connecting - decide against the connd list */
1921 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
1923 if (list_empty(&peer->gnp_connd_list)) {
1924 /* if peer isn't on waiting list, try to find one to nuke */
1925 rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
1929 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1930 "canceled pending connect request\n",
1931 libcfs_nid2str(connreq->gncr_srcnid),
1932 libcfs_nid2str(connreq->gncr_dstnid), errno);
1935 /* if we can't find a waiting dgram, we just drop the nak - the conn
1936 * connect must have failed (didn't find conn above and clear connecting
1937 * -- so nothing to do besides drop */
1939 /* peer is on list, meaning it is a new connect attempt from the one
1940 * we started that generated the NAK - so just drop NAK */
1942 /* use negative to prevent error message */
1945 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
1948 /* success! we found a peer and at least marked pending_nak */
1949 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
/* Dispatch a received connreq by type: REQ -> kgnilnd_finish_connect (wire up
 * peer/conn, flush queued TXs), NAK -> kgnilnd_process_nak. Unpacks and
 * validates the connreq first; *needs_nak tells the caller whether a failure
 * should be answered with a NAK (only when we have a usable srcnid). */
1955 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
1959 rc = kgnilnd_unpack_connreq(dgram);
1962 /* only NAK if we have good srcnid to use */
1968 switch (dgram->gndg_conn_in.gncr_type) {
1969 case GNILND_CONNREQ_REQ:
1970 /* wire up peer & conn, send queued TX */
1971 rc = kgnilnd_finish_connect(dgram);
1973 /* don't nak when the nid is hosed */
1979 case GNILND_CONNREQ_NAK:
1980 rc = kgnilnd_process_nak(dgram);
1981 /* return early to prevent reconnect bump */
1984 CERROR("unexpected connreq type %s (%d) from %s\n",
1985 kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
1986 dgram->gndg_conn_in.gncr_type,
1987 libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
/* Top-level dgram mover step: probe for one completed datagram, process it
 * (connreq / NAK), release it, and on error inform the affected peer - pull
 * it off the connd list, reset gnp_connecting, record the errno, bump its
 * reconnect backoff, and notify LNet. Sends a NAK afterwards when processing
 * decided one is owed (needs_nak). The target nids are stashed before the
 * release since the dgram is gone by then. */
1998 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2002 lnet_nid_t nak_dstnid = LNET_NID_ANY;
2003 lnet_nid_t orig_dstnid;
2004 kgn_dgram_t *dgram = NULL;
2008 if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2011 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2016 } else if (rc < 0) {
2017 GOTO(inform_peer, rc);
2019 /* rc > 1 means it did something, reset for this func */
2023 switch (dgram->gndg_type) {
2024 case GNILND_DGRAM_WC_REQ:
2025 case GNILND_DGRAM_REQ:
2026 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2028 case GNILND_DGRAM_NAK:
2029 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2030 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2033 CERROR("unknown datagram type %s (%d)\n",
2034 kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2038 /* stash data to use after releasing current datagram */
2039 /* don't stash net - we are operating on a net already,
2040 * so the lock on rw_net_lock is sufficient */
2042 nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
/* inform_peer: dgram must still be valid here (asserted) */
2045 LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2047 orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2049 kgnilnd_release_dgram(dev, dgram);
2051 CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2052 libcfs_nid2str(orig_dstnid), rc);
2054 /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2055 * in kgnilnd_finish_connect - if errors are from before we get to there,
2056 * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2057 if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2058 /* if we have a negative rc, we want to find a peer to inform about
2059 * the bad connection attempt. Sorry buddy, better luck next time! */
2061 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2062 peer = kgnilnd_find_peer_locked(orig_dstnid);
2065 /* add ref to make sure he stays around past the possible unlink
2066 * so we can tell LNet about him */
2067 kgnilnd_peer_addref(peer);
2069 /* if he still cares about the outstanding connect */
2070 if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2071 /* check if he is on the connd list and remove.. */
2072 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2073 if (!list_empty(&peer->gnp_connd_list)) {
2074 list_del_init(&peer->gnp_connd_list);
2075 /* drop connd ref */
2076 kgnilnd_peer_decref(peer);
2078 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2080 /* clear gnp_connecting so we don't have a non-connecting peer
2081 * on gnd_connd_list */
2082 peer->gnp_connecting = GNILND_PEER_IDLE;
2084 set_mb(peer->gnp_last_dgram_errno, rc);
2086 kgnilnd_peer_increase_reconnect_locked(peer);
2089 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2091 /* now that we are outside the lock, tell Mommy */
2093 kgnilnd_peer_notify(peer, rc);
2094 kgnilnd_peer_decref(peer);
/* NAK answered outside all locks - it posts a new dgram */
2099 kgnilnd_send_nak(dev, nak_dstnid, rc);
/* Reaper pass over the device's dgram hash: cancel any POSTED, non-wildcard
 * datagram older than *kgn_timeout seconds. Skips everything while the HW is
 * quiescing/shutting down. Runs entirely under gnd_dgram_lock.
 * NOTE(review): loop bound is hash_size - 1, unlike the other walkers which
 * use the full hash_size - looks like the last bucket is skipped; confirm. */
2106 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2108 kgn_dgram_t *dgram, *tmp;
2111 spin_lock(&dev->gnd_dgram_lock);
2113 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2114 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2115 unsigned long now = jiffies;
2116 unsigned long timeout;
2118 /* don't timeout stuff if the network is mucked or shutting down */
2119 if (kgnilnd_check_hw_quiesce()) {
2123 if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2124 (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2127 CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2128 "state %s conn 0x%p to %s age %lus\n",
2129 dgram, kgnilnd_dgram_type2str(dgram),
2130 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2131 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2132 cfs_duration_sec(now - dgram->gndg_post_time));
2134 timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2136 if (time_before(now, (dgram->gndg_post_time + timeout)))
2139 CNETERR("%s datagram to %s timed out @ %lus dgram "
2140 "0x%p state %s conn 0x%p\n",
2141 kgnilnd_dgram_type2str(dgram),
2142 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2143 cfs_duration_sec(now - dgram->gndg_post_time),
2144 dgram, kgnilnd_dgram_state2str(dgram),
2147 kgnilnd_cancel_dgram_locked(dgram);
2150 spin_unlock(&dev->gnd_dgram_lock);
2154 /* use a thread for the possibly long-blocking wait_by_id to prevent
2155 * stalling the global workqueues */
/* Dedicated kernel thread per device: blocks (~1s at a time) in
 * postdata_probe_wait_by_id and wakes the dgram mover when a datagram is
 * ready, then sleeps on gnd_dgping_waitq until the mover pings back. Honors
 * the quiesce trigger and exits when kgn_shutdown is set. */
2157 kgnilnd_dgram_waitq(void *arg)
2159 kgn_device_t *dev = (kgn_device_t *) arg;
2163 DEFINE_WAIT(mover_done);
2165 snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2166 cfs_daemonize(name);
2167 cfs_block_allsigs();
2169 /* all gnilnd threads need to run fairly urgently */
2170 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2172 /* we dont shut down until the device shuts down ... */
2173 while (!kgnilnd_data.kgn_shutdown) {
2174 /* to quiesce or to not quiesce, that is the question */
2175 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2176 KGNILND_SPIN_QUIESCE;
2179 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2181 /* check once a second */
2182 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2185 if (grc == GNI_RC_SUCCESS) {
2186 CDEBUG(D_INFO, "waking up dgram mover thread\n");
2187 kgnilnd_schedule_dgram(dev);
2189 /* wait for dgram thread to ping us before spinning again */
2190 prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2191 TASK_INTERRUPTIBLE);
2193 /* don't sleep if we need to quiesce */
2194 if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2197 finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2201 kgnilnd_thread_fini();
/*
 * kgnilnd_start_outbound_dgrams - drain dev->gnd_connd_peers and post an
 * active connect (kgnilnd_start_connect) for each queued peer.
 *
 * Holds gnd_connd_lock while manipulating the list, dropping it around the
 * actual connect attempt. Error handling:
 *  - rc >= 0: success or 'just drop peer'; release the connd list ref.
 *  - rc == -ENOMEM: out of wildcard datagrams; requeue the peer (if still
 *    GNILND_PEER_POSTING) and break out — the global dgram pool may free up
 *    later.
 *  - other rc: log, reset gnp_connecting so the reaper retries (or marks the
 *    peer for kill if it needs death), and drop the ref.
 *
 * Returns did_something (via RETURN) so the caller can decide whether to
 * sleep. NOTE(review): the line that increments did_something is not visible
 * in this extract (gap around 2234-2235) — confirm against the full file.
 */
2206 kgnilnd_start_outbound_dgrams(kgn_device_t *dev)
2208 int did_something = 0, rc;
2209 kgn_peer_t *peer = NULL;
2211 spin_lock(&dev->gnd_connd_lock);
2213 /* Active connect - we added this in kgnilnd_launch_tx */
2214 while (!list_empty(&dev->gnd_connd_peers)) {
2215 peer = list_first_entry(&dev->gnd_connd_peers,
2216 kgn_peer_t, gnp_connd_list);
2218 /* ref for connd removed in if/else below */
2219 list_del_init(&peer->gnp_connd_list);
2221 /* gnp_connecting and membership on gnd_connd_peers should be
2222 * done coherently to avoid double adding, etc */
2223 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2224 * to get the peer to gnp_connecting in the first place. We just need to
2225 * rely on gnd_connd_lock to serialize someone pulling him from the list
2226 * BEFORE clearing gnp_connecting */
2227 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2228 peer, libcfs_nid2str(peer->gnp_nid));
2230 spin_unlock(&dev->gnd_connd_lock);
2232 CDEBUG(D_NET, "processing connect to %s\n",
2233 libcfs_nid2str(peer->gnp_nid));
2236 rc = kgnilnd_start_connect(peer);
2238 if (likely(rc >= 0)) {
2239 /* 0 on success, positive on 'just drop peer' errors */
2240 kgnilnd_peer_decref(peer);
2241 } else if (rc == -ENOMEM) {
2242 /* if we are out of wildcards, add back to
2243 * connd_list - then break out and we'll try later
2244 * if other errors, we'll bail & cancel pending tx */
2245 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2246 if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2247 peer->gnp_connecting = GNILND_PEER_CONNECT;
2248 spin_lock(&dev->gnd_connd_lock);
2249 list_add_tail(&peer->gnp_connd_list,
2250 &dev->gnd_connd_peers);
2252 /* connecting changed while we were posting */
2254 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2255 " state 0x%p->%s, connecting %d\n",
2256 peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2257 peer->gnp_connecting = GNILND_PEER_KILL;
2258 spin_lock(&dev->gnd_connd_lock);
2259 /* remove the peer ref from the connd list */
2260 kgnilnd_peer_decref(peer);
2261 /* let the system handle itself */
2263 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2264 /* the datagrams are a global pool,
2265 * so break out of trying and hope some free
2270 /* something bad happened, you lose */
2271 CNETERR("could not start connecting to %s "
2272 "rc %d: Will retry until TX timeout\n",
2273 libcfs_nid2str(peer->gnp_nid), rc);
2274 /* It didn't post so just set connecting back to zero now.
2275 * The reaper will reattempt the connection if it needs to.
2276 * If the peer needs death set it so the reaper will cleanup.
2278 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2279 if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2280 peer->gnp_connecting = GNILND_PEER_IDLE;
2281 kgnilnd_peer_increase_reconnect_locked(peer);
2283 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2284 " state 0x%p->%s, connecting %d\n",
2285 peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2286 peer->gnp_connecting = GNILND_PEER_KILL;
2288 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2290 /* hold onto ref until we are really done - if it was
2291 * unlinked this could result in a destroy */
2292 kgnilnd_peer_decref(peer);
2294 spin_lock(&dev->gnd_connd_lock);
2297 spin_unlock(&dev->gnd_connd_lock);
2298 RETURN(did_something);
/*
 * kgnilnd_dgram_poke_with_stick - timer callback that wakes the dgram mover
 * for the device identified by 'arg' (a device index into kgn_devices).
 * Armed by kgnilnd_dgram_mover via setup_timer()/mod_timer() so the mover's
 * sleep is bounded by its computed timeout.
 *
 * NOTE(review): the 'dev_id' declaration (presumably derived from 'arg') is
 * not visible in this extract — confirm against the full file.
 */
2302 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2305 kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id];
2307 wake_up(&dev->gnd_dgram_waitq);
2310 /* use single thread for dgrams - should be sufficient for performance */
/*
 * kgnilnd_dgram_mover - main per-device datagram worker thread.
 *
 * Each loop iteration (until kgnilnd_data.kgn_shutdown):
 *  1. honor quiesce (KGNILND_SPIN_QUIESCE) and fault-injection races;
 *  2. process newly completed dgrams under kgn_net_rw_sem
 *     (kgnilnd_probe_and_process_dgram);
 *  3. post new outbound dgrams (kgnilnd_start_outbound_dgrams);
 *  4. periodically reap timed-out dgrams (kgnilnd_reaper_dgram_check),
 *     rechecking every kgn_new_min_timeout/4 seconds;
 *  5. if nothing happened, sleep on gnd_dgram_waitq with a one-shot timer
 *     (kgnilnd_dgram_poke_with_stick) as a bounded wakeup, after a last
 *     xchg() check of gnd_dgram_ready.
 *
 * NOTE(review): 'timeout' is declared unsigned long (2318) but assigned a
 * possibly negative (long)(next_purge_check - jiffies) at 2372 and compared
 * 'timeout <= 0' at 2377 — for an unsigned value that test only matches
 * exactly 0, and a negative delta wraps to a huge value. Looks like it was
 * meant to be signed long; confirm against the full file.
 *
 * NOTE(review): some declaration/brace lines (e.g. 'name', the DEFINE_WAIT
 * for 'wait', schedule_timeout call) are not visible in this extract.
 */
2312 kgnilnd_dgram_mover(void *arg)
2314 kgn_device_t *dev = (kgn_device_t *)arg;
2316 int rc, did_something;
2317 unsigned long next_purge_check = jiffies - 1;
2318 unsigned long timeout;
2319 struct timer_list timer;
2322 snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2323 cfs_daemonize(name);
2324 cfs_block_allsigs();
2325 /* all gnilnd threads need to run fairly urgently */
2326 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2328 /* we are ok not locking for these variables as the dgram waitq threads
2329 * will block both due to tying up net (kgn_shutdown) and the completion
2330 * event for the dgram_waitq (kgn_quiesce_trigger) */
2332 while (!kgnilnd_data.kgn_shutdown) {
2333 /* Safe: kgn_shutdown only set when quiescent */
2335 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2336 * so we can force a dirty WC dgram for Bug 762072 - put right before
2337 * quiesce check so that it'll go right into that and not do any
2339 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2341 /* to quiesce or to not quiesce, that is the question */
2342 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2343 KGNILND_SPIN_QUIESCE;
2347 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2349 /* process any newly completed dgrams */
2350 down_read(&kgnilnd_data.kgn_net_rw_sem);
2352 rc = kgnilnd_probe_and_process_dgram(dev);
2354 did_something += rc;
2357 up_read(&kgnilnd_data.kgn_net_rw_sem);
2359 /* start new outbound dgrams */
2360 did_something += kgnilnd_start_outbound_dgrams(dev);
2362 /* find dead dgrams */
2363 if (time_after_eq(jiffies, next_purge_check)) {
2364 /* these don't need to be checked that often */
2365 kgnilnd_reaper_dgram_check(dev);
2367 next_purge_check = (long) jiffies +
2368 cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2371 /* careful with the jiffy wrap... */
2372 timeout = (long)(next_purge_check - jiffies);
2374 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2375 did_something, timeout, next_purge_check, jiffies);
2377 if (did_something || timeout <= 0) {
2382 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2384 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2385 mod_timer(&timer, (long) jiffies + timeout);
2387 /* last second chance for others to poke us */
2388 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2390 /* check flag variables before committing */
2391 if (!did_something &&
2392 !kgnilnd_data.kgn_shutdown &&
2393 !kgnilnd_data.kgn_quiesce_trigger) {
2394 CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2395 timeout, cfs_duration_sec(timeout));
2396 wake_up_all(&dev->gnd_dgping_waitq);
2398 CDEBUG(D_INFO, "awake after schedule\n");
2401 del_singleshot_timer_sync(&timer);
2402 finish_wait(&dev->gnd_dgram_waitq, &wait);
2405 kgnilnd_thread_fini();