lnet/klnds/gnilnd/gnilnd_conn.c
/*
 * Copyright (C) 2012 Cray, Inc.
 *
 * Copyright (c) 2014, Intel Corporation.
 *
 *   Author: Nic Henke <nic@cray.com>
 *   Author: James Shimek <jshimek@cray.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "gnilnd.h"
#include <linux/swap.h>

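/* initialize the SMSG attributes shared by every mailbox: the credit
 * count from the kgn_mbox_credits tunable, the fixed GNILND_MAX_MSG_SIZE,
 * and the auto-retransmit mailbox type */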
void
kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
{
        smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
        smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
        smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
}

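/* register an FMA block with the NIC. Registration failures are retried
 * by callers; once they have persisted past the reg_fail_timeout tunable
 * we LBUG. PHYS blocks are registered physically contiguous, and only
 * virtual blocks count against gnd_nbytes_map */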
int
kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
{
        gni_return_t            rrc;
        __u32                   flags = GNI_MEM_READWRITE;
        static unsigned long    reg_to;
        int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;

        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                flags |= GNI_MEM_PHYS_CONT;
        }

        fma_blk->gnm_hold_timeout = 0;

        /* make sure we are mapping a clean block */
        LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL,
                 "fma_blk %px dirty\n", fma_blk);

        rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
                                   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
                                   flags, &fma_blk->gnm_hndl);
        if (rrc != GNI_RC_SUCCESS) {
                if (rfto != GNILND_REGFAILTO_DISABLE) {
                        if (reg_to == 0) {
                                reg_to = jiffies + cfs_time_seconds(rfto);
                        } else if (time_after(jiffies, reg_to)) {
                                CERROR("FATAL: fmablk registration has failed for %ld seconds.\n",
                                       cfs_duration_sec(jiffies - reg_to) + rfto);
                                LBUG();
                        }
                }

                CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
                        fma_blk, fma_blk->gnm_mbox_size, flags);
                RETURN(-ENOMEM);
        }

        reg_to = 0;

        /* PHYS_CONT memory isn't really mapped, at least not in GART -
         * but all mappings chew up an MDD
         */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
        }

        atomic_inc(&device->gnd_n_mdd);
        /* nfmablk is live (mapped) blocks */
        atomic_inc(&device->gnd_nfmablk);

        RETURN(0);
}

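/* allocate and map a new block of SMSG mailboxes - physically contiguous
 * (use_phys) for startup preallocation, vmalloc'd for runtime growth.
 * Gated by gnd_fmablk_mutex; returns 0 without allocating if
 * gnd_fmablk_vers changed while we waited for the mutex, since a mailbox
 * may have been freed up behind us */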
int
kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
{
        int                     rc = 0;
        int                     num_mbox;
        kgn_fma_memblock_t     *fma_blk;
        gni_smsg_attr_t         smsg_attr;
        unsigned long           fmablk_vers;

#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
        /* We allocate large blocks of memory here, potentially leading
         * to memory exhaustion during massive reconnects after a network
         * outage. Limit the number of fma blocks in use by always keeping
         * a percentage of pages free, initially set to 25% of total memory. */
        if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
                LCONSOLE_INFO("Exceeding free page limit of %ld. "
                              "Free pages available %ld\n",
                              kgnilnd_data.free_pages_limit,
                              nr_free_pages());
                return -ENOMEM;
        }
#endif
        /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
         * to this allocation code. Everyone will sample the version
         * before and after getting the mutex. If it has changed,
         * we'll bail out to check the lists again - this indicates that
         * some sort of change was made to the lists and it is possible
         * that there is a mailbox for us to find now. This should prevent
         * a ton of spinning in the case where there are lots of threads
         * that need a yet-to-be-allocated mailbox for a connection. */

        fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
        mutex_lock(&device->gnd_fmablk_mutex);

        if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
                /* version changed while we were waiting for the mutex,
                 * we'll recheck the lists assuming something nice happened */
                mutex_unlock(&device->gnd_fmablk_mutex);
                return 0;
        }

        LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
        if (fma_blk == NULL) {
                CNETERR("could not allocate fma block descriptor\n");
                rc = -ENOMEM;
                GOTO(out, rc);
        }

        INIT_LIST_HEAD(&fma_blk->gnm_bufflist);

        kgnilnd_setup_smsg_attr(&smsg_attr);

        gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);

        LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);

        /* gni_smsg_buff_size_needed calculates the base mailbox size and since
         * we want to hold kgn_peer_credits worth of messages in both directions,
         * we add PAYLOAD to grow the mailbox size
         */

        fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;

        /* we'll only use physical during preallocate at startup -- this keeps it nice and
         * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
         * as reallocating them is tough if there is memory fragmentation */

        if (use_phys) {
                fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
                if (fma_blk->gnm_block == NULL) {
                        CNETERR("could not allocate physical SMSG mailbox memory\n");
                        rc = -ENOMEM;
                        GOTO(free_desc, rc);
                }
                fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
                num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;

                LASSERTF(num_mbox >= 1,
                         "num_mbox %d blk_size %u mbox_size %d\n",
                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);

                fma_blk->gnm_state = GNILND_FMABLK_PHYS;

        } else {
                num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
                fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;

                LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
                         "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
                         *kgnilnd_tunables.kgn_mbox_per_block);

                fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
                if (fma_blk->gnm_block == NULL) {
                        CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n",
                                fma_blk->gnm_blk_size);
                        rc = -ENOMEM;
                        GOTO(free_desc, rc);
                }

                fma_blk->gnm_state = GNILND_FMABLK_VIRT;
        }

        /* allocate just enough space for the bits to track the mailboxes */
        CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
        if (fma_blk->gnm_bit_array == NULL) {
                CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
                        sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
                rc = -ENOMEM;
                GOTO(free_blk, rc);
        }
        bitmap_zero(fma_blk->gnm_bit_array, num_mbox);

        /* now that num_mbox is set based on allocation type, set up the
         * debug info */
        CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
        if (fma_blk->gnm_mbox_info == NULL) {
                CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
                        sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
                rc = -ENOMEM;
                GOTO(free_bit, rc);
        }

        rc = kgnilnd_map_fmablk(device, fma_blk);
        if (rc) {
                GOTO(free_info, rc);
        }

        fma_blk->gnm_next_avail_mbox = 0;
        fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;

        CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
                "mbox_size %d MDD %#llx.%#llx\n",
                fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
                fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
                fma_blk->gnm_hndl.qword2);

        /* the lock is protecting the data structures, not the mutex */

        spin_lock(&device->gnd_fmablk_lock);
        list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);

        /* toggle under the lock so once they change the list is also
         * ready for others to traverse */
        atomic_inc(&device->gnd_fmablk_vers);

        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);

        return 0;

free_info:
        CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
free_bit:
        CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
free_blk:
        if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
                kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
        } else {
                kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
        }
free_desc:
        LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
        mutex_unlock(&device->gnd_fmablk_mutex);
        return rc;
}

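/* deregister an FMA block from the NIC. If any mailboxes are still held
 * in purgatory - or we are in a stack reset - the MDD is released with a
 * hold timeout so it can't be handed out again too soon */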
void
kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
        gni_return_t            rrc;

        /* if some mailboxes are held, set hold_timeout from the conn timeouts
         * used in this block - except during shutdown, when we just nuke and
         * pave. During a stack reset, we need to deregister with a hold
         * timeout set so we don't use the same mdd after reset is complete */
        if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
            kgnilnd_data.kgn_in_reset) {
                fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
        }

        /* we are changing the state of a block, tickle version to tell
         * proc code list is stale now */
        atomic_inc(&dev->gnd_fmablk_vers);

        rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);

        CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
               "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
                "hold_timeout %d\n",
               fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
               fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
               fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
               fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);

        LASSERTF(rrc == GNI_RC_SUCCESS,
                "tried to double unmap or something bad, fma_blk %px (rrc %d)\n",
                fma_blk, rrc);

        if (fma_blk->gnm_hold_timeout &&
            !(kgnilnd_data.kgn_in_reset &&
              fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
                atomic_inc(&dev->gnd_n_mdd_held);
        } else {
                atomic_dec(&dev->gnd_n_mdd);
        }

        /* PHYS blocks don't get mapped */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
                fma_blk->gnm_state = GNILND_FMABLK_IDLE;
        } else if (kgnilnd_data.kgn_in_reset) {
                /* in stack reset, clear MDD handle for PHYS blocks, as we'll
                 * re-use the fma_blk after reset so we don't have to drop/allocate
                 * all of those physical blocks */
                fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
        }

        /* Decrement here as this is the # of mapped blocks */
        atomic_dec(&dev->gnd_nfmablk);
}


/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
void
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
        LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
                 "fma_blk %px@%d free in bad state (%d): blk total %d avail %d held %d\n",
                 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
                 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);

        atomic_inc(&dev->gnd_fmablk_vers);

        if (fma_blk->gnm_hold_timeout) {
                CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
                        "mbox_size %d\n",
                        fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
                        fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);

                /* We leave MDD dangling over stack reset */
                if (!kgnilnd_data.kgn_in_reset) {
                        kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
                }
                /* ignoring the return code - if kgni/ghal can't find it
                 * it must be released already */
                atomic_dec(&dev->gnd_n_mdd_held);
                atomic_dec(&dev->gnd_n_mdd);
        }

        /* we can't free the gnm_block until all the conns have released their
         * purgatory holds. While we have purgatory holds, we might check the conn
         * RX mailbox during the CLOSING process. It is possible that kgni might
         * try to look into the RX side for credits when sending the CLOSE msg too */
        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                LIBCFS_MEM_MSG(fma_blk->gnm_block, fma_blk->gnm_mbox_size, "free");
                kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
        } else {
                kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
        }
        fma_blk->gnm_state = GNILND_FMABLK_FREED;

        list_del(&fma_blk->gnm_bufflist);

        CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
        CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
                           BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
        LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}

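/* scan the device's FMA blocks for a free mailbox and wire it into
 * conn->gnpr_smsg_attr, zeroing the mailbox memory before use. If
 * nothing is available, smsg_attr->msg_buffer is left as the caller set
 * it (NULL) so the miss can be detected */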
void
kgnilnd_find_free_mbox(kgn_conn_t *conn)
{
        kgn_device_t            *dev = conn->gnc_device;
        gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
        kgn_fma_memblock_t      *fma_blk;
        kgn_mbox_info_t         *mbox = NULL;
        int                     id;

        spin_lock(&dev->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
                            gnm_bufflist) {
                if (fma_blk->gnm_avail_mboxs <= 0 ||
                    fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
                        continue;
                }
                /* look in bitarray for available mailbox */
                do {
                        id = find_next_zero_bit(
                                fma_blk->gnm_bit_array,
                                fma_blk->gnm_num_mboxs,
                                fma_blk->gnm_next_avail_mbox);
                        if (id == fma_blk->gnm_num_mboxs &&
                            fma_blk->gnm_next_avail_mbox != 0) {
                                /* wrap around */
                                fma_blk->gnm_next_avail_mbox = 0;
                        } else {
                                break;
                        }
                } while (1);

                LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
                         id, fma_blk->gnm_num_mboxs);
                set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
                conn->gnc_mbox_id = id;

                fma_blk->gnm_next_avail_mbox =
                        (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
                fma_blk->gnm_avail_mboxs--;
                conn->gnc_fma_blk = fma_blk;

                kgnilnd_setup_smsg_attr(smsg_attr);

                smsg_attr->msg_buffer = fma_blk->gnm_block;
                smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
                smsg_attr->mem_hndl = fma_blk->gnm_hndl;
                smsg_attr->buff_size = fma_blk->gnm_mbox_size;

                /* We'll set the hndl to zero for PHYS blocks unmapped during stack
                 * reset and re-use the same fma_blk after stack reset. This ensures we've
                 * properly mapped it before we use it */
                LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL,
                        "unmapped fma_blk %px, state %d\n",
                         fma_blk, fma_blk->gnm_state);

                CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
                        "allocating SMSG mbox %d buf %p "
                        "offset %u hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        smsg_attr->msg_buffer, smsg_attr->mbox_offset,
                        fma_blk->gnm_hndl.qword1,
                        fma_blk->gnm_hndl.qword2);

                mbox = &fma_blk->gnm_mbox_info[id];
                mbox->mbx_create_conn_memset = jiffies;
                mbox->mbx_nallocs++;
                mbox->mbx_nallocs_total++;

                /* zero mbox to remove any old data from our last use.
                 * this better be safe, if not our purgatory timers
                 * are too short or a peer really is misbehaving */
                memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
                       0, smsg_attr->buff_size);
                break;
        }

        spin_unlock(&dev->gnd_fmablk_lock);
}

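/* grab a mailbox for this connection, allocating fresh (virtual) FMA
 * blocks until we find one or an allocation fails */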
int
kgnilnd_setup_mbox(kgn_conn_t *conn)
{
        gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
        int                      err = 0;

        smsg_attr->msg_buffer = NULL;
        /* Look for available mbox */
        do {
                kgnilnd_find_free_mbox(conn);

                /* nothing in the existing buffers, make a new one */
                if (smsg_attr->msg_buffer == NULL) {
                        /* for runtime allocations, we only want vmalloc */
                        err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
                        if (err) {
                                break;
                        }
                }
        } while (smsg_attr->msg_buffer == NULL);

        if (err)
                CNETERR("couldn't allocate SMSG mbox for conn %p, error: %d\n",
                        conn, err);
        return err;
}

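/* hand a connection's mailbox back to its FMA block: free it
 * (purgatory_hold == 0), hold it in purgatory (> 0), or release a prior
 * hold (< 0). Virtual blocks are unmapped and freed once every mailbox
 * in them has been returned */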
void
kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
{
        kgn_device_t           *dev = conn->gnc_device;
        gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
        kgn_fma_memblock_t     *fma_blk = NULL;
        kgn_mbox_info_t        *mbox = NULL;
        int                     found = 0;
        int                     id;

        /* if we failed to setup mbox and now destroying conn */
        if (smsg_attr->msg_buffer == NULL) {
                return;
        }

        id = conn->gnc_mbox_id;

        spin_lock(&dev->gnd_fmablk_lock);
        /* make sure our conn points at a valid fma_blk
         * We use this instead of a mem block search out of smsg_attr
         * because we could have freed a block for fma_blk #1 but the fma_blk
         * is still in the list for a purgatory hold. This would induce a false
         * match if that same block gets reallocated to fma_blk #2 */
        list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk == conn->gnc_fma_blk) {
                        found = 1;
                        break;
                }
        }
        LASSERTF(found,
                "unable to find conn 0x%p with gnc_fma_blk %px anywhere in the world\n",
                 conn, conn->gnc_fma_blk);

        LASSERTF(id < fma_blk->gnm_num_mboxs,
                "bad id %d max %d\n",
                id, fma_blk->gnm_num_mboxs);

        /* < 0 - was held, now free it
         * == 0 - just free it
         * > 0 - hold it for now */
        if (purgatory_hold == 0) {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
                        "hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
                fma_blk->gnm_avail_mboxs++;

        } else if (purgatory_hold > 0) {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
                        "hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

                fma_blk->gnm_held_mboxs++;
                fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
                                                 conn->gnc_timeout);
        } else {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
                        "hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

                fma_blk->gnm_held_mboxs--;
                fma_blk->gnm_avail_mboxs++;
        }

        if (purgatory_hold <= 0) {
                /* if kgni is retransmitting, freeing the smsg block before the EP
                 * is destroyed gets messy. Bug 768295. */
                LASSERTF(conn->gnc_ephandle == NULL,
                         "can't release mbox before EP is nuked. conn 0x%p\n", conn);

                mbox = &fma_blk->gnm_mbox_info[id];
                mbox->mbx_release_from_purgatory = jiffies;

                /* clear conn gnc_fmablk if it is gone - this allows us to
                 * not worry about state so much in kgnilnd_destroy_conn
                 * and makes the guaranteed cleanup of the resources easier */
                LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
                        "conn %px bit %d already cleared in fma_blk %px\n",
                         conn, id, fma_blk);
                conn->gnc_fma_blk = NULL;
                mbox->mbx_nallocs--;
        }

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
                CERROR("LBUGs in your future: forcibly marking fma_blk %p "
                       "as mapped\n", fma_blk);
                fma_blk->gnm_state = GNILND_FMABLK_VIRT;
        }

        /* we don't release or unmap PHYS blocks as part of the normal cycle --
         * those are controlled manually from startup/shutdown */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                /* we can unmap once all are unused (held or avail)
                 * but check hold_timeout to make sure we are not trying to double
                 * unmap this buffer. If there was no hold_timeout set due to
                 * held_mboxs, we'll free the mbox here shortly and won't have to
                 * worry about catching a double free for a 'clean' fma_blk */
                if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
                    (!fma_blk->gnm_hold_timeout)) {
                        kgnilnd_unmap_fmablk(dev, fma_blk);
                }

                /* But we can only free once they are all avail */
                if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
                    fma_blk->gnm_held_mboxs == 0) {
                        /* all mailboxes are released, free fma_blk */
                        kgnilnd_free_fmablk_locked(dev, fma_blk);
                }
        }

        spin_unlock(&dev->gnd_fmablk_lock);
}

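/* count the mailboxes sitting in physically contiguous FMA blocks */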
int
kgnilnd_count_phys_mbox(kgn_device_t *device)
{
        int                     i = 0;
        kgn_fma_memblock_t     *fma_blk;

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
                        i += fma_blk->gnm_num_mboxs;
        }
        spin_unlock(&device->gnd_fmablk_lock);

        RETURN(i);
}

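/* preallocate physically contiguous FMA blocks at startup until we have
 * at least kgn_nphys_mbox mailboxes on hand */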
int
kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
{
        int     rc;

        while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
                rc = kgnilnd_alloc_fmablk(device, 1);
                if (rc) {
                        CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
                                kgnilnd_count_phys_mbox(device), rc);
                        RETURN(rc);
                }
        }
        RETURN(0);
}

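/* (re)map all physically contiguous FMA blocks, e.g. after a stack reset
 * has cleared their MDD handles */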
int
kgnilnd_map_phys_fmablk(kgn_device_t *device)
{
        int                     rc = 0;
        kgn_fma_memblock_t     *fma_blk;

        /* use mutex to gate access to single thread, just in case */
        mutex_lock(&device->gnd_fmablk_mutex);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                        rc = kgnilnd_map_fmablk(device, fma_blk);
                        if (rc)
                                break;
                }
        }
        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);

        RETURN(rc);
}

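/* unmap every FMA block on the device */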
void
kgnilnd_unmap_fma_blocks(kgn_device_t *device)
{
        kgn_fma_memblock_t      *fma_blk;

        /* use mutex to gate access to single thread, just in case */
        mutex_lock(&device->gnd_fmablk_mutex);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                kgnilnd_unmap_fmablk(device, fma_blk);
        }
        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);
}

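/* free all physically contiguous FMA blocks; called at shutdown, since
 * PHYS blocks live for the life of the device */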
void
kgnilnd_free_phys_fmablk(kgn_device_t *device)
{
        kgn_fma_memblock_t      *fma_blk, *fma_blkN;

        /* use mutex to gate access to single thread, just in case */
        mutex_lock(&device->gnd_fmablk_mutex);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
                        kgnilnd_free_fmablk_locked(device, fma_blk);
        }
        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);
}

/* kgnilnd dgram nid->struct management */

static inline struct list_head *
kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
{
        unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;

        RETURN(&dev->gnd_dgrams[hash]);
}


/* needs dev->gnd_dgram_lock held */
kgn_dgram_t *
kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
{
        struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
        kgn_dgram_t      *dgram;

        list_for_each_entry(dgram, dgram_list, gndg_list) {

                /* if state > POSTED, we are already handling cancel/completion */
                if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
                     dgram->gndg_state > GNILND_DGRAM_POSTED)
                        continue;

                CDEBUG(D_NET, "got dgram [%p] -> %s\n",
                       dgram, libcfs_nid2str(dst_nid));
                return dgram;
        }
        return NULL;
}

int
kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
{
        kgn_dgram_t     *dgram;

        spin_lock(&dev->gnd_dgram_lock);
        dgram = kgnilnd_find_dgram_locked(dev, dst_nid);

        if (dgram) {
                kgnilnd_cancel_dgram_locked(dgram);
        }
        spin_unlock(&dev->gnd_dgram_lock);

        RETURN(!!(dgram == NULL));
}

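/* fill in an outgoing connection request: magic, NIDs, stamps and
 * timeout, plus the SMSG mailbox parameters for a REQ. The
 * CFS_FAIL_CHECK branches are fault-injection hooks that deliberately
 * mangle fields to exercise the peer's error handling in testing */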
int
kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
                     lnet_nid_t srcnid, lnet_nid_t dstnid,
                     kgn_connreq_type_t type)
{
        int err = 0;

        /* ensure we haven't violated max datagram size */
        BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);

        /* no need to zero out, we do that when allocating dgram */
        connreq->gncr_magic     = GNILND_MSG_MAGIC;

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
                srcnid = 0xABADBABE;
        } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
                dstnid = 0xDEFEC8ED;
        }

        connreq->gncr_srcnid    = srcnid;
        connreq->gncr_dstnid    = dstnid;

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_version = 99;
        } else {
                connreq->gncr_version   = GNILND_CONNREQ_VERSION;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_type = 99;
        } else {
                connreq->gncr_type      = type;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_peerstamp = 0;
        } else {
                connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_connstamp = 0;
        } else {
                connreq->gncr_connstamp = conn->gnc_my_connstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_timeout = 0;
        } else {
                connreq->gncr_timeout   = conn->gnc_timeout;
        }

        /* the rest pack the data into the payload in other places */
        if (type == GNILND_CONNREQ_REQ) {
                kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
                req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
                req_params->gnpr_cqid = conn->gnc_cqid;

                /* allocate mailbox for this connection */
                err = kgnilnd_setup_mbox(conn);
                if (err != 0) {
                        CERROR("Failed to setup FMA mailbox (%d)\n", err);
                }
                req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
        }

        /* XXX Nic: TBD - checksum computation */

        return err;
}

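/* validate, and byte-swap if needed, an incoming connection request.
 * Returns -EBADF for packets too mangled to safely NAK, other negative
 * errnos for protocol or NID mismatches, and 0 when the connreq is
 * ready for further processing */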
int
kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
{
        kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
        int                      swab, rc = 0;
        kgn_net_t               *net;

        /* the following fields must be handled in a backwards compatible
         * manner to ensure we can always send and interpret NAKs */

        if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
            connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
                /* Unexpected magic! */
                CERROR("Unexpected magic %08x\n",
                       connreq->gncr_magic);
                return -EBADF;
        }

        swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
        if (swab) {
                __swab32s(&connreq->gncr_magic);
                __swab32s(&connreq->gncr_cksum);
                __swab16s(&connreq->gncr_type);
                __swab16s(&connreq->gncr_version);
                __swab32s(&connreq->gncr_timeout);
                __swab64s(&connreq->gncr_srcnid);
                __swab64s(&connreq->gncr_dstnid);
                __swab64s(&connreq->gncr_peerstamp);
                __swab64s(&connreq->gncr_connstamp);
        }

        /* Do NOT return anything but -EBADF before we munge
         * connreq->gncr_srcnid - we need that to send the nak */

        if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                lnet_nid_t      incoming = connreq->gncr_srcnid;

                /* even if the incoming packet is hosed, we know who we sent
                 * the original and can set the srcnid so that we can properly
                 * look up our peer to close the loop on this connreq. We still use
                 * -EBADF to prevent a NAK - just in case there are issues with
                 * the payload coming from a random spot, etc. */
                connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;

                if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
                                LNET_NIDADDR(incoming)) {
                        /* we got a datagram match for the wrong nid... */
                        CERROR("matched datagram 0x%p with srcnid %s "
                                "(%x), expecting %s (%x)\n",
                                dgram,
                                libcfs_nid2str(incoming),
                                LNET_NIDADDR(incoming),
                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                                LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
                        return -EBADF;
                }
        } else {
                /* if we have a wildcard datagram it should match an
                 * incoming "active" datagram that should have a fully formed
                 * srcnid and dstnid. If we couldn't unpack it, we drop it as a
                 * corrupted packet, otherwise we'll just verify that the dstnid
                 * matches the NID for the NET that the dgram was posted on */

                /* make sure their wildcard didn't match ours, that would be
                 * impossible */
                LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
                         "dgram 0x%p from %s, connreq 0x%p; "
                         "wildcard matched wildcard\n", dgram,
                         libcfs_nid2str(connreq->gncr_srcnid), connreq);

                rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);

                if (rc == -ESHUTDOWN) {
                        CERROR("Looking up network: device is in shutdown\n");
                        return rc;
                } else if (rc == -ENONET) {
                        CERROR("Connection data from %s: she sent "
                        "dst_nid %s, but net lookup failed on "
                        "dgram 0x%p@%s\n",
                        libcfs_nid2str(connreq->gncr_srcnid),
                        libcfs_nid2str(connreq->gncr_dstnid),
                        dgram, kgnilnd_dgram_type2str(dgram));
                        return rc;
                }

                if (lnet_nid_to_nid4(&net->gnn_ni->ni_nid) !=
                    connreq->gncr_dstnid) {
                        CERROR("Bad connection data from %s: she sent "
                               "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
                               libcfs_nid2str(connreq->gncr_srcnid),
                               libcfs_nid2str(connreq->gncr_dstnid),
                               libcfs_nidstr(&net->gnn_ni->ni_nid),
                               dgram, kgnilnd_dgram_type2str(dgram));
                        kgnilnd_net_decref(net);
                        return -EBADSLT;
                }

                /* kgnilnd_find_net takes a ref on the net it finds; decref
                 * it once it is no longer needed */
                kgnilnd_net_decref(net);
        }

        if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
                CERROR("Unexpected version %d\n", connreq->gncr_version);
                return -EPROTO;
        }

        /* XXX Nic: TBD - checksum validation */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
                return -EBADF;
        }

        if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
                __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;

                __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
                __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
                __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
                __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
                __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
                __swab64s(&msg_addr);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
        } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
                __swab32s(&connreq->gncr_nakdata.gnnd_errno);
        }

        /* since we use a unique instance ID for each network, the driver
         * will take care of dropping datagrams if we don't have that network.
         */

        /* a few more idiot software or configuration checks */

        switch (connreq->gncr_type) {
        case GNILND_CONNREQ_REQ:
                /* wire up EP and SMSG block - this will check the incoming data
                 * and barf a NAK back if needed */
                rc = kgnilnd_set_conn_params(dgram);
                if (rc)
                        return rc;
                break;
        case GNILND_CONNREQ_NAK:
        case GNILND_CONNREQ_CLOSE:
                break;
        default:
                CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
                return -EPROTO;
        }

        if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
                CERROR("Received bad timestamps peer %llu conn %llu\n",
                       connreq->gncr_peerstamp, connreq->gncr_connstamp);
                return -EPROTO;
        }

        if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
                CERROR("Received timeout %d < MIN %d\n",
                       connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
                return -EPROTO;
        }

        return 0;
}

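/* allocate and initialize a dgram tracking structure from the dgram
 * slab cache */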
int
kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
{
        kgn_dgram_t         *dgram;

        dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
        if (dgram == NULL)
                return -ENOMEM;

        INIT_LIST_HEAD(&dgram->gndg_list);
        dgram->gndg_state = GNILND_DGRAM_USED;
        dgram->gndg_type = type;
        dgram->gndg_magic = GNILND_DGRAM_MAGIC;

        atomic_inc(&dev->gnd_ndgrams);

        CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
                " %d\n",
                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
                atomic_read(&dev->gnd_ndgrams));

        *dgramp = dgram;
        return 0;
}

/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
 * returns < 0 on dgram to be cleaned up
 * > 0 on dgram that isn't done yet
 * == 0 on dgram that is ok and needs connreq processing */
int
kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
{
        int rc = 0;

        switch (post_state) {
        case GNI_POST_COMPLETED:
                /* normal state for dgrams that need actual processing */
                /* GOTO to avoid processing dgram as canceled/done */
                GOTO(process_out, rc);

        case GNI_POST_PENDING:
                /* we should only see this if we are testing a WC dgram after a
                 * cancel - it means that it needs a full cycle of waiting
                 * for kgni_sm_task to finish moving it to TERMINATED */
                LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
                          (dgram->gndg_state == GNILND_DGRAM_CANCELED),
                         "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
                         dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
                         dgram->gndg_state, kgnilnd_dgram_state2str(dgram));

                /* positive RC as this dgram isn't done yet */
                rc = EINPROGRESS;

                /* GOTO as this isn't done yet */
                GOTO(process_out, rc);
                break;

        case GNI_POST_TERMINATED:
                /* we've called cancel and it is done, or the remote guy called
                 * cancel and we've received it on a WC dgram */
#if 0
                /* we are seeing weird terminations on non WC dgrams when we have not
                 * canceled them */

                LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
                         dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
                        "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
                        dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
                        libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
#endif

                CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
                       dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");

                rc = -ECANCELED;
                break;

        case GNI_POST_TIMEOUT:
                /* we could have a timeout on a wildcard dgram too - if
                 * we got the incoming request but the remote node died
                 * before kgni could send the match data back. We'll just error
                 * on the active case and bail out gracefully */
                if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                        CNETERR("hardware timeout for connect to "
                               "%s after %lu seconds. Is node dead?\n",
                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                               cfs_duration_sec(jiffies - dgram->gndg_post_time));
                }

                rc = -ETIMEDOUT;
                break;

        default:
                CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
                LBUG();
        }

        /* now finish cleaning up a dgram that is canceled/terminated and needs to
         * go away */

        /* If this was actively canceled, drop the count now that we are processing */
        if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
                atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                /* caller responsible for gndg_list removal */
        }

process_out:

        RETURN(rc);
}

/* needs dev->gnd_dgram_lock held */
void
kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
{
        gni_return_t            grc;

        if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
                return;
        }

        LASSERTF(dgram->gndg_conn != NULL,
                 "dgram 0x%p with NULL conn\n", dgram);

        /* C.E - WC dgrams could be canceled immediately but
         * if there was some match pending, we need to call
         * test_by_id to clear it out. If that test returns
         * POST_PENDING, it is half done and needs to go along
         * with the rest of dgrams and go through a kgni_sm_task cycle
         * and deliver a GNI_POST_TERMINATED event before they
         * are actually canceled */

        dgram->gndg_state = GNILND_DGRAM_CANCELED;

        if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
                /* we don't need to cancel_by_id if the datagram was good */
                return;
        }

        /* let folks know there are outstanding cancels */
        atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
        /* leave on nid list until cancel is done for debugging fun */
        grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);

        /* if we don't get success here, we have hosed up the dgram tracking
         * code and need to bail out */
        LASSERTF(grc == GNI_RC_SUCCESS,
                 "postdata_cancel returned %d for conn 0x%p to %s\n",
                 grc, dgram->gndg_conn,
                 dgram->gndg_conn->gnc_peer ?
                  libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
                  : "<?>");

        CDEBUG(D_NETTRACE,
                "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
                dgram, dgram->gndg_conn,
                dgram->gndg_conn->gnc_ephandle);

        if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
                gni_post_state_t         post_state;
                int                      rc = 0;
                __u32                    remote_addr = 0, remote_id = 0;

                grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
                                                     (__u64)dgram, &post_state,
                                                     &remote_addr, &remote_id);

                LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
                         "bad grc %d from test_by_id on dgram 0x%p\n",
                        grc, dgram);

                /* if the WC was canceled immediately, we get NO_MATCH; if it needs
                 * to go through the full cycle, we get SUCCESS and need to parse
                 * post_state */

                CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
                        "remote_addr %u remote_id %u\n", grc, dgram,
                        kgnilnd_dgram_type2str(dgram),
                        post_state, remote_addr, remote_id);

                if (grc == GNI_RC_NO_MATCH) {
                        /* she's gone, reduce count and move along */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                        RETURN_EXIT;
                }

                rc = kgnilnd_process_dgram(dgram, post_state);

                if (rc <= 0) {
                        /* if for some weird reason we get a valid dgram back, just mark
                         * it as done so we can drop it and move along.
                         * C.E - if it was completed, we'll just release the conn/mbox
                         * back into the pool and it'll get reused. That said, we should only
                         * be canceling a WC dgram on stack reset or shutdown, so that is moot */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);

                        /* caller context responsible for calling kgnilnd_release_dgram() */
                } else {
                        /* still pending, let it simmer until golden brown and delicious */
                }
        }

        /* for non WC dgrams, they are still on the nid list but marked canceled waiting
         * for kgni to return their ID to us via probe - that is when we'll complete their
         * cancel processing */
}

void
kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
{
        /* release the dgram ref on conn */
        if (dgram->gndg_conn) {
                kgnilnd_conn_decref(dgram->gndg_conn);
                dgram->gndg_conn = NULL;
        }
}

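/* return a USED or DONE dgram to the slab cache, poisoning the magic
 * first to help catch stale references */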
void
kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
{
        LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
                 dgram->gndg_state == GNILND_DGRAM_DONE,
                 "dgram 0x%p with bad state %s\n",
                 dgram, kgnilnd_dgram_state2str(dgram));

        /* bit of poisoning to help detect bad driver data */
        dgram->gndg_magic = 0x6f5a6b5f;
        atomic_dec(&dev->gnd_ndgrams);

        /* log before freeing so we don't read the dgram after it has been
         * returned to the slab cache */
        CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
               " ndgrams %d\n",
               sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
               atomic_read(&dev->gnd_ndgrams));
        kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
}

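/* build and post a datagram: targeted at dstnid, or a wildcard listener
 * for LNET_NID_ANY. Allocates the dgram and its conn, binds the EP for
 * targeted posts, packs the connreq and hands it to kgni; on success the
 * dgram lands in the per-NID hash in state POSTED */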
1201 int
1202 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1203                    int data_rc)
1204 {
1205         int              rc = 0;
1206         kgn_dgram_t     *dgram = NULL;
1207         kgn_dgram_t     *tmpdgram;
1208         kgn_dgram_type_t dgtype;
1209         gni_return_t     grc;
1210         __u64            srcnid;
1211         ENTRY;
1212
1213         switch (type) {
1214         case GNILND_CONNREQ_REQ:
1215                 if (dstnid == LNET_NID_ANY)
1216                         dgtype = GNILND_DGRAM_WC_REQ;
1217                 else
1218                         dgtype = GNILND_DGRAM_REQ;
1219                 break;
1220         case GNILND_CONNREQ_NAK:
1221                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1222                 dgtype = GNILND_DGRAM_NAK;
1223                 break;
1224         default:
1225                 CERROR("unknown connreq type %d\n", type);
1226                 LBUG();
1227         }
1228
1229         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1230         if (rc < 0) {
1231                 rc = -ENOMEM;
1232                 GOTO(post_failed, rc);
1233         }
1234
1235         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1236         if (rc) {
1237                 GOTO(post_failed, rc);
1238         }
1239
1240         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1241                 /* clear buffer for sanity on reuse of wildcard */
1242                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1243         }
1244
1245         if (dstnid == LNET_NID_ANY) {
1246                 /* set here to reset any dgram re-use */
1247                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1248         } else {
1249                 __u32            host_id;
1250
1251                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1252                 if (rc <= 0) {
1253                         rc = -ESRCH;
1254                         GOTO(post_failed, rc);
1255                 }
1256
1257                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1258
1259                 /* don't need to serialize, there are no CQs for the dgram
1260                  * EP on the kgn_net_t */
1261                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1262
1263                 if (grc != GNI_RC_SUCCESS) {
1264                         rc = -ECONNABORTED;
1265                         GOTO(post_failed, rc);
1266                 }
1267
1268         }
1269
1270         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1271          * net of the destination node.
1272          */
1273
1274         if (dstnid == LNET_NID_ANY) {
1275                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1276         } else {
1277                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1278         }
1279
1280         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1281                                   srcnid, dstnid, type);
1282         if (rc) {
1283                 GOTO(post_failed, rc);
1284         }
1285
1286         if (type == GNILND_CONNREQ_NAK)
1287                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1288
1289         dgram->gndg_post_time = jiffies;
1290
1291         /* XXX Nic: here is where we'd add in logical network multiplexing */
1292
1293         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1294                dgram, kgnilnd_dgram_type2str(dgram),
1295                libcfs_nid2str(srcnid),
1296                libcfs_nid2str(dstnid), dev->gnd_id);
1297
1298         /* this allocates memory, can't hold locks across */
1299         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1300                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1301                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1302                                    (__u64)dgram);
1303
1304         if (grc != GNI_RC_SUCCESS) {
1305                 CNETERR("dropping failed dgram post id 0x%p type %s"
1306                         " reqtype %s to %s: rc %d\n",
1307                         dgram, kgnilnd_dgram_type2str(dgram),
1308                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1309                         libcfs_nid2str(dstnid), grc);
1310                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1311                 GOTO(post_failed, rc);
1312         }
1313
1314         /* we don't need to add earlier - if someone does del_peer during post,
1315          * that peer will get marked as unlinked and the callers will take care of it.
1316          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1317          * the completed dgram later when we can't find a peer to stuff it into */
1318
1319         spin_lock(&dev->gnd_dgram_lock);
1320
1321         /* make sure we are not double posting targeted dgrams
1322          * - we can post multiple WC dgrams to help with processing speed */
1323         if (dstnid != LNET_NID_ANY) {
1324                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1325
1326                 LASSERTF(tmpdgram == NULL,
1327                         "dgram 0x%p->%s already posted\n",
1328                          dgram, libcfs_nid2str(dstnid));
1329         }
1330
1331         /* unmunge dstnid to help processing code cope... */
1332         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1333                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1334         }
1335
1336         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1337         dgram->gndg_state = GNILND_DGRAM_POSTED;
1338         spin_unlock(&dev->gnd_dgram_lock);
1339
1340 post_failed:
1341         if (rc < 0 && dgram != NULL) {
1342                 kgnilnd_cleanup_dgram(dgram);
1343                 kgnilnd_free_dgram(dev, dgram);
1344         }
1345
1346         RETURN(rc);
1347 }
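
/* A minimal userspace sketch of the single-exit cleanup idiom
 * kgnilnd_post_dgram() uses above: every failure jumps to one label, and the
 * label frees the dgram only if rc went negative after it was allocated.
 * This is illustrative rather than driver code; all sk_* names are
 * hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct sk_dgram { char buf[128]; };	/* stand-in for kgn_dgram_t */

static int sk_post_dgram(struct sk_dgram **out)
{
	int rc = 0;
	struct sk_dgram *dg = NULL;

	dg = malloc(sizeof(*dg));
	if (dg == NULL) {
		rc = -1;		/* -ENOMEM in the original */
		goto post_failed;
	}

	/* ... bind/pack/post steps would go here, each one doing
	 *     if (failed) { rc = -Exxx; goto post_failed; } ... */

	*out = dg;			/* success: ownership moves to caller */

post_failed:
	if (rc < 0 && dg != NULL)	/* same guard as the original */
		free(dg);
	return rc;
}

int main(void)
{
	struct sk_dgram *dg = NULL;
	int rc = sk_post_dgram(&dg);

	printf("rc %d\n", rc);
	if (rc == 0)
		free(dg);		/* pretend the completion released it */
	return 0;
}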
1348
1349 /* The shutdown flag is set from the shutdown and stack reset threads. */
1350 void
1351 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1352 {
1353         /* The conns of canceled active dgrams need to be put in purgatory so
1354          * we don't reuse the mailbox */
1355         if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1356                 kgn_peer_t *peer;
1357                 kgn_conn_t *conn = dgram->gndg_conn;
1358                 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1359
1360                 dgram->gndg_state = GNILND_DGRAM_DONE;
1361
1362                 /* During shutdown we've already removed the peer so we don't
1363                  * need to add a peer. During stack reset we don't care about
1364                  * MDDs since they are all released. */
1365                 if (!shutdown) {
1366                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1367                         peer = kgnilnd_find_peer_locked(nid);
1368
1369                         if (peer != NULL) {
1370                                 CDEBUG(D_NET, "adding peer's conn with nid %s "
1371                                         "to purgatory\n", libcfs_nid2str(nid));
1372                                 kgnilnd_conn_addref(conn);
1373                                 conn->gnc_peer = peer;
1374                                 kgnilnd_peer_addref(peer);
1375                                 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1376                                 conn->gnc_state = GNILND_CONN_CLOSED;
1377                                 list_add_tail(&conn->gnc_list,
1378                                               &peer->gnp_conns);
1379                                 kgnilnd_add_purgatory_locked(conn,
1380                                                              conn->gnc_peer);
1381                                 kgnilnd_schedule_conn(conn);
1382                         }
1383                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1384                 }
1385         }
1386
1387         spin_lock(&dev->gnd_dgram_lock);
1388         kgnilnd_cancel_dgram_locked(dgram);
1389         spin_unlock(&dev->gnd_dgram_lock);
1390
1391         kgnilnd_cleanup_dgram(dgram);
1392
1393         /* if the dgram is 'canceled' it needs to wait until the event
1394          * comes up from kgni telling us it is safe to release */
1395         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1396                 dgram->gndg_state = GNILND_DGRAM_DONE;
1397
1398                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1399
1400                 /* if it is a wildcard and we are in an appropriate state, repost
1401                  * the wildcard */
1402
1403                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1404                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1405                         int     rerc;
1406
1407                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1408                         if (rerc != 0) {
1409                                 /* We failed to repost the WC dgram for some reason;
1410                                  * mark it so the repost system attempts to repost it */
1411                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1412                         }
1413                 }
1414
1415                 /* always free the old dgram */
1416                 kgnilnd_free_dgram(dev, dgram);
1417         }
1418 }
1419
1420
1421 int
1422 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1423 {
1424         kgn_dgram_t             *dgram = NULL;
1425         gni_post_state_t         post_state;
1426         gni_return_t             grc;
1427         int                      rc = 0;
1428         __u64                    readyid;
1429         __u32                    remote_addr = 0, remote_id = 0;
1430         ENTRY;
1431
1432         /* Probe with the lock held. That way if we get a dgram we don't have it canceled
1433          * between finding the ready dgram and grabbing the lock to remove it from the
1434          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1435          * once it's off the list so we don't need to worry about others changing it at
1436          * that point. */
1437         spin_lock(&dev->gnd_dgram_lock);
1438         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1439         if (grc != GNI_RC_SUCCESS) {
1440                 spin_unlock(&dev->gnd_dgram_lock);
1441                 /* return 0 to indicate nothing happened */
1442                 RETURN(0);
1443         }
1444
1445         CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
1446                 readyid, dev);
1447
1448         dgram = (kgn_dgram_t *)readyid;
1449
1450         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1451                  "dgram 0x%p from id %#llx with bad magic %x\n",
1452                  dgram, readyid, dgram->gndg_magic);
1453
1454         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1455                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1456                  "dgram 0x%p with bad state %s\n",
1457                  dgram, kgnilnd_dgram_state2str(dgram));
1458
1459         LASSERTF(!list_empty(&dgram->gndg_list),
1460                  "dgram 0x%p with bad list state %s type %s\n",
1461                  dgram, kgnilnd_dgram_state2str(dgram),
1462                  kgnilnd_dgram_type2str(dgram));
1463
1464         /* now we know that the datagram structure is ok, so pull off list */
1465         list_del_init(&dgram->gndg_list);
1466
1467         /* while we have the gnd_dgram_lock and BEFORE we call test_by_id
1468          * change the state from POSTED to PROCESSING to ensure that
1469          * nobody cancels it after we've pulled it from the wire */
1470         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1471                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1472         }
1473
1474         LASSERTF(dgram->gndg_conn != NULL,
1475                 "dgram 0x%p with NULL conn\n", dgram);
1476
1477         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1478                                              (__u64)dgram, &post_state,
1479                                              &remote_addr, &remote_id);
1480
1481         /* we now "own" this datagram */
1482         spin_unlock(&dev->gnd_dgram_lock);
1483
1484         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1485                  " id %llu was ready\n", readyid);
1486
1487         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1488                 "remote_addr %u remote_id %u\n", grc, dgram,
1489                 kgnilnd_dgram_type2str(dgram),
1490                 post_state, remote_addr, remote_id);
1491
1492         if (unlikely(grc != GNI_RC_SUCCESS)) {
1493                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1494                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1495                         grc);
1496                 rc = -EINVAL;
1497                 GOTO(probe_for_out, rc);
1498         }
1499
1500         rc = kgnilnd_process_dgram(dgram, post_state);
1501
1502         /* we should never have probe find a dgram for us and then have it
1503          * be a WC dgram that is still in the middle of processing */
1504         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1505                  rc, dgram, post_state);
1506
1507         if (rc == 0) {
1508                 /* dgram is good enough for the data to be used */
1509                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1510                 /* fake rc to mark that we've done something */
1511                 rc = 1;
1512         } else {
1513                 /* let kgnilnd_release_dgram take care of canceled dgrams */
1514                 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1515                         dgram->gndg_state = GNILND_DGRAM_DONE;
1516                 }
1517         }
1518
1519         *dgramp = dgram;
1520         RETURN(rc);
1521
1522 probe_for_out:
1523
1524         kgnilnd_release_dgram(dev, dgram, 0);
1525         RETURN(rc);
1526 }
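
/* kgnilnd_probe_for_dgram() depends on the post id round-tripping through
 * kgni unchanged: the id given to postdata_w_id is the dgram pointer itself,
 * and the magic check guards against trusting a mangled id.  A standalone
 * userspace sketch of that pointer-as-token scheme (sk_* names are
 * hypothetical):
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SK_DGRAM_MAGIC	0x6d646772U

struct sk_dgram {
	unsigned int	magic;		/* checked before any other field */
	int		payload;
};

int main(void)
{
	struct sk_dgram *dg = malloc(sizeof(*dg));
	uint64_t	 id;

	assert(dg != NULL);
	dg->magic = SK_DGRAM_MAGIC;
	dg->payload = 42;

	/* post: hand out an opaque 64-bit id */
	id = (uint64_t)(uintptr_t)dg;

	/* completion: recover the pointer, then validate the magic
	 * exactly as the LASSERTF above does */
	struct sk_dgram *back = (struct sk_dgram *)(uintptr_t)id;

	assert(back->magic == SK_DGRAM_MAGIC);
	printf("payload %d\n", back->payload);
	free(back);
	return 0;
}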
1527
1528 int
1529 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1530 {
1531         /* if kgn_nwildcard is zero, return error */
1532         int     rc = -ENOENT, i;
1533         ENTRY;
1534
1535         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1536                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1537                 if (rc < 0) {
1538                         CERROR("error %d: could not post wildcard datagram # %d\n",
1539                                 rc, i);
1540                         rc = -EINVAL;
1541                         GOTO(failed, rc);
1542                 }
1543         }
1544
1545 failed:
1546         RETURN(rc);
1547 }
1548
1549 int
1550 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1551 {
1552         kgn_dgram_t *dg, *dgN;
1553         LIST_HEAD(zombies);
1554         int i;
1555         ENTRY;
1556
1557         /* we want to cancel any outstanding dgrams - we don't want to rely
1558          * on del_peer_or_conn catching all of them. This helps protect us in cases
1559          * where we don't quite keep the peer->dgram mapping in sync due to some
1560          * race conditions */
1561
1562         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1563                  "called with LND invalid state: net shutdown %d "
1564                  "in reset %d\n", net->gnn_shutdown,
1565                  kgnilnd_data.kgn_in_reset);
1566
1567         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1568
1569         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1570                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1571
1572                         /* skip nids that are not on our net, and wildcards */
1573
1574
1575                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1576                             net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1577                                 continue;
1578
1579                         kgnilnd_cancel_dgram_locked(dg);
1580                 }
1581         }
1582
1583         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1584
1585         RETURN(0);
1586 }
1587
1588 int
1589 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1590 {
1591         kgn_dgram_t *dg, *dgN;
1592         LIST_HEAD(zombies);
1593         ENTRY;
1594
1595         /* Time to kill the outstanding WC's
1596          * WC's exist on net 0 only but match on any net...
1597          */
1598
1599         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1600                 "called with LND invalid state: WC shutdown %d "
1601                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1602                 kgnilnd_data.kgn_in_reset);
1603
1604         spin_lock(&dev->gnd_dgram_lock);
1605
1606         do {
1607                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1608                 if (dg != NULL) {
1609                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1610                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1611                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1612                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1613
1614                         kgnilnd_cancel_dgram_locked(dg);
1615
1616                         /* WC could be DONE already, check and if so add to list to be released */
1617                         if (dg->gndg_state == GNILND_DGRAM_DONE)
1618                                 list_move_tail(&dg->gndg_list, &zombies);
1619                 }
1620         } while (dg != NULL);
1621
1622         spin_unlock(&dev->gnd_dgram_lock);
1623
1624         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1625                 list_del_init(&dg->gndg_list);
1626                 kgnilnd_release_dgram(dev, dg, 1);
1627         }
1628         RETURN(0);
1629
1630 }
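
/* The function above shows the two-phase teardown used throughout this file:
 * cancel and collect victims onto a private "zombies" list while holding
 * gnd_dgram_lock, then do the slow release work after dropping it.  A small
 * pthread model of that shape (hypothetical names, plain singly-linked list):
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct sk_node { struct sk_node *next; int id; };

static pthread_mutex_t	sk_lock = PTHREAD_MUTEX_INITIALIZER;
static struct sk_node  *sk_live;	/* only touched under sk_lock */

int main(void)
{
	struct sk_node *zombies = NULL, *n;
	int i;

	for (i = 0; i < 3; i++) {	/* build a fake live list */
		n = malloc(sizeof(*n));
		n->id = i;
		n->next = sk_live;
		sk_live = n;
	}

	pthread_mutex_lock(&sk_lock);	/* phase 1: detach under the lock */
	zombies = sk_live;
	sk_live = NULL;
	pthread_mutex_unlock(&sk_lock);

	while ((n = zombies) != NULL) {	/* phase 2: release outside it */
		zombies = n->next;
		printf("releasing %d\n", n->id);
		free(n);
	}
	return 0;
}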
1631
1632 int
1633 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1634 {
1635         kgn_dgram_t *dg, *dgN;
1636         int i;
1637         ENTRY;
1638
1639         /* Cancel any outstanding non-wildcard datagrams regardless
1640          * of which net they are on, as we are in base shutdown and
1641          * don't care about connecting anymore.
1642          */
1643 
1644         LASSERTF(kgnilnd_data.kgn_wc_kill == 1, "We didn't get called from base shutdown\n");
1645
1646         spin_lock(&dev->gnd_dgram_lock);
1647
1648         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
1649                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1650                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1651                                 kgnilnd_cancel_dgram_locked(dg);
1652                 }
1653         }
1654
1655         spin_unlock(&dev->gnd_dgram_lock);
1656
1657         RETURN(0);
1658 }
1659
1660
1661 void
1662 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1663 {
1664         int             i = 4;
1665         int             rc;
1666         gni_return_t    grc;
1667         __u64           readyid;
1668         kgn_dgram_t    *dgram;
1669
1670         /* use do/while so at least one check runs, to allow the
1671          * regression test for bug 762072 to hit the bug if it is there */
1672
1673         /* This function races with the dgram mover during shutdown, so it is possible for
1674          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1675          * dgram mover thread instead of inside this function.
1676          */
1677 
1678         /* This should only be called from within shutdown, base shutdown, or stack reset.
1679          * There are no assertions here to verify that, since base shutdown leaves nothing
1680          * we can check; the net is gone by then.
1681          */
1682
1683         do {
1684                 i++;
1685                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1686                         "Waiting for %d canceled datagrams to clear on device %d\n",
1687                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1688
1689                 /* wait up to 250ms for a completion */
1690                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1691                                                         250, &readyid);
1692
1693                 if (grc != GNI_RC_SUCCESS)
1694                         continue;
1695
1696                 CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
1697                         readyid, dev->gnd_id, dev);
1698
1699                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1700                 if (rc != 0) {
1701                         /* if we got a valid dgram or one that is now done, clean up */
1702                         kgnilnd_release_dgram(dev, dgram, 1);
1703                 }
1704         } while (atomic_read(&dev->gnd_canceled_dgrams));
1705 }
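
/* The "(i & (-i)) == i" test in the loop above is the classic power-of-two
 * check: i & -i isolates the lowest set bit, so the expression is true only
 * when exactly one bit is set.  Since i starts at 4 and is incremented before
 * the check, the D_WARNING print fires on passes 8, 16, 32, ... while every
 * pass still logs at D_NET.  Standalone demonstration:
 */
#include <stdio.h>

int main(void)
{
	int i;

	for (i = 5; i <= 64; i++)
		if ((i & (-i)) == i)	/* power of two? */
			printf("would warn on pass %d\n", i);
	return 0;	/* prints 8, 16, 32, 64 */
}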
1706
1707 int
1708 kgnilnd_start_connect(kgn_peer_t *peer)
1709 {
1710         int              rc = 0;
1711         /* sync point for kgnilnd_del_peer_locked - do an early check to
1712          * catch the most common hits where del_peer is done by the
1713          * time we get here */
1714         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1715                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1716         }
1717
1718         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1719         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1720                 /* raced with peer getting unlinked */
1721                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1722                 rc = ESTALE;
1723                 GOTO(out, rc);
1724         }
1725         peer->gnp_connecting = GNILND_PEER_POSTING;
1726         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1727
1728         set_mb(peer->gnp_last_dgram_time, jiffies);
1729         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1730                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1731         }
1732
1733         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1734                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1735                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1736         } else {
1737                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1738                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1739         }
1740         if (rc < 0) {
1741                 set_mb(peer->gnp_last_dgram_errno, rc);
1742                 GOTO(failed, rc);
1743         }
1744
1745         /* while we're posting someone could have decided this peer/dgram needed to
1746          * die a quick death, so we check for state change and process accordingly */
1747
1748         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1749         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1750                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1751                         peer->gnp_connecting = GNILND_PEER_KILL;
1752                 }
1753                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1754                 /* positive RC to avoid dgram cleanup - we'll have to
1755                  * wait for the kgni GNI_POST_TERMINATED event to
1756                  * finish cleaning up */
1757                 rc = ESTALE;
1758                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1759                 GOTO(out, rc);
1760         }
1761         peer->gnp_connecting = GNILND_PEER_POSTED;
1762         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1763         /* reaper thread will take care of any timeouts */
1764         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1765                libcfs_nid2str(peer->gnp_nid), rc);
1766
1767         RETURN(rc);
1768
1769 failed:
1770         CDEBUG(D_NET, "connect to %s failed: rc %d\n",
1771                libcfs_nid2str(peer->gnp_nid), rc);
1772 out:
1773         RETURN(rc);
1774 }
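
/* kgnilnd_start_connect() is one leg of the gnp_connecting state machine:
 * proceed only from CONNECT, sit in POSTING while the locks are dropped for
 * the post, then either promote to POSTED or honor a concurrent request for
 * death.  A compressed, single-threaded sketch of those transitions (the
 * sk_* enum values are hypothetical stand-ins for GNILND_PEER_*):
 */
#include <stdio.h>

enum sk_connecting { SK_IDLE, SK_CONNECT, SK_POSTING, SK_POSTED,
		     SK_NEEDS_DEATH, SK_KILL };

static int sk_start_connect(enum sk_connecting *st)
{
	if (*st != SK_CONNECT)
		return 1;		/* ESTALE-style positive rc: just drop */
	*st = SK_POSTING;

	/* ... dgram posted here with all locks dropped; another thread
	 * may set SK_NEEDS_DEATH in the meantime ... */

	if (*st == SK_NEEDS_DEATH) {
		*st = SK_KILL;		/* let the reaper clean up */
		return 1;
	}
	*st = SK_POSTED;
	return 0;
}

int main(void)
{
	enum sk_connecting st = SK_CONNECT;

	printf("rc %d state %d\n", sk_start_connect(&st), st);
	return 0;
}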
1775
1776 int
1777 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1778 {
1779         kgn_conn_t        *conn = dgram->gndg_conn;
1780         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1781         struct lnet_nid    peer_nid;
1782         kgn_peer_t        *new_peer, *peer = NULL;
1783         kgn_tx_t          *tx;
1784         kgn_tx_t          *txn;
1785         kgn_mbox_info_t   *mbox;
1786         int                rc;
1787         int                nstale;
1788
1789         /* try to find a peer that matches the nid we got in the connreq
1790          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1791          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1792
1793         /* assume this is a new peer - it makes locking cleaner when it isn't */
1794         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1795
1796         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
1797         if (rc != 0) {
1798                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1799                 return rc;
1800         }
1801
1802         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1803
1804         /* this transfers ref from create_peer to the kgn_peer table */
1805         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1806
1807         /* if we found an existing peer, is it really ready for a new conn ? */
1808         if (peer != new_peer) {
1809                 /* if this was an active connect attempt but we can't find a peer
1810                  * waiting for it, we will dump it in the trash */
1811
1812                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1813                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1814                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1815                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1816                         rc = ECANCELED;
1817                         GOTO(out, rc);
1818                 }
1819
1820                 /* check to see if we can catch a connecting peer before it is
1821                  * removed from the connd_peers list - if not, we need to
1822                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1823                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1824                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1825                         if (!list_empty(&peer->gnp_connd_list)) {
1826                                 list_del_init(&peer->gnp_connd_list);
1827                                 /* drop connd ref */
1828                                 kgnilnd_peer_decref(peer);
1829                         }
1830                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1831                         /* clear rc to make sure we don't have fake error */
1832                         rc = 0;
1833                 }
1834
1835                 /* no matter what, we are no longer waiting to connect this peer now */
1836                 peer->gnp_connecting = GNILND_PEER_IDLE;
1837
1838                 /* Refuse to duplicate an existing connection (both sides might try to
1839                  * connect at once).  NB we return success!  We _are_ connected so we
1840                  * _don't_ have any blocked txs to complete with failure. */
1841                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1842                 if (rc != 0) {
1843                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1844                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1845                               libcfs_nid2str(her_nid), rc);
1846                         rc = EALREADY;
1847                         GOTO(out, rc);
1848                 }
1849         }
1850
1851         if (peer->gnp_state == GNILND_PEER_DOWN) {
1852                 CNETERR("Received connection request from down nid %s\n",
1853                         libcfs_nid2str(her_nid));
1854         }
1855
1856         peer->gnp_state = GNILND_PEER_UP;
1857         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1858
1859         /* either way with peer (new or existing), we are ok with ref counts here as
1860          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as
1861          * the ref for the peer table. */
1862
1863         /* at this point, the connection request is a winner */
1864
1865         /* mark 'DONE' to avoid cancel being called from release */
1866         dgram->gndg_state = GNILND_DGRAM_DONE;
1867
1868         /* initialise timestamps before reaper looks at them */
1869         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1870
1871         /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP
1872          * fails, the reaper thread will immediately send a NOOP during the call to
1873          * kgnilnd_check_conn_timeouts_locked
1874          */
1875         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1876         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1877
1878         /* save the dgram type used to establish this connection */
1879         conn->gnc_dgram_type = dgram->gndg_type;
1880
1881         /* refs are not transferred from dgram to tables, so increment to
1882          * take ownership */
1883         kgnilnd_conn_addref(conn);
1884         kgnilnd_peer_addref(peer);
1885         conn->gnc_peer = peer;
1886         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1887
1888         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1889         list_add_tail(&conn->gnc_hashlist,
1890                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1891         kgnilnd_data.kgn_conn_version++;
1892
1893         /* Don't send a NOOP if fail_loc is set
1894          */
1895         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1896                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP,
1897                                         lnet_nid_to_nid4(&peer->gnp_net->gnn_ni->ni_nid));
1898                 if (tx == NULL) {
1899                         CNETERR("can't get TX to initiate NOOP to %s\n",
1900                                 libcfs_nid2str(peer->gnp_nid));
1901                 } else {
1902                         kgnilnd_queue_tx(conn, tx);
1903                 }
1904         }
1905
1906         /* Schedule all packets blocking for a connection */
1907         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1908                 /* lock held here is the peer_conn lock */
1909                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1910                 kgnilnd_queue_tx(conn, tx);
1911         }
1912
1913         /* If this is an active connection, let's mark its timestamp on the MBoX */
1914         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1915                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1916                 /* conn->gnc_last_rx is jiffies; it had better exist as it was just set */
1917                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1918         }
1919
1920         /* Bug 765042: wake up the scheduler for a race between finish_connect and
1921          * complete_conn_closed with a conn in purgatory.
1922          * Since we can't use CFS_RACE due to mutex holds in kgnilnd_process_conns,
1923          * we just check for the flag being set and then clear it */
1924         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1925                 cfs_fail_loc = 0x0;
1926                 /* get scheduler thread moving again */
1927                 kgnilnd_schedule_device(conn->gnc_device);
1928         }
1929
1930         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1931                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1932
1933         /* make sure we reset peer reconnect interval now that we have a good conn */
1934         kgnilnd_peer_alive(peer);
1935         peer->gnp_reconnect_interval = 0;
1936
1937         /* clear the unlink attribute; if we don't clear it, kgnilnd_del_conn_or_peer
1938          * will wait on the atomic forever
1939          */
1940         if (peer->gnp_pending_unlink) {
1941                 peer->gnp_pending_unlink = 0;
1942                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1943                 CDEBUG(D_NET, "Clearing peer unlink %p\n", peer);
1944         }
1945
1946         /* add ref to make it hang around until after we drop the lock */
1947         kgnilnd_conn_addref(conn);
1948
1949         /* Once the peer_conn lock is dropped, the conn could actually move into
1950          * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1951          * lock until we are really done */
1952         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1953
1954         /* Notify LNET that we now have a working connection to this peer.
1955          * This is a Cray extension to the "standard" LND behavior.
1956          */
1957         lnet_nid4_to_nid(peer->gnp_nid, &peer_nid);
1958         lnet_notify(peer->gnp_net->gnn_ni, &peer_nid, true, true,
1959                     ktime_get_seconds());
1960
1961         /* drop our 'hold' ref */
1962         kgnilnd_conn_decref(conn);
1963
1964 out:
1965         RETURN(rc);
1966 }
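
/* The last_tx backdating in kgnilnd_finish_connect() deserves a second look:
 * stamping it 2x the keepalive interval in the past makes the very first
 * reaper pass see the keepalive as overdue, so a NOOP goes out immediately
 * and exercises the new mailbox.  A tiny model, using plain numbers as
 * stand-ins for jiffies:
 */
#include <stdio.h>

int main(void)
{
	unsigned long now = 100000;		/* stand-in for jiffies */
	unsigned long keepalive = 250;		/* ticks between NOOPs */
	unsigned long last_tx = now - keepalive * 2;	/* backdated */

	if (now - last_tx >= keepalive)
		printf("keepalive overdue: send NOOP now\n");
	return 0;
}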
1967
1968 void
1969 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1970 {
1971         int              rc = 0;
1972         ENTRY;
1973
1974         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1975
1976         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1977
1978         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1979
1980         if (rc < 0) {
1981                 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
1982         }
1983         EXIT;
1984 }
1985
1986 int
1987 kgnilnd_process_nak(kgn_dgram_t *dgram)
1988 {
1989         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1990         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1991         int                errno = connreq->gncr_nakdata.gnnd_errno;
1992         kgn_peer_t        *peer;
1993         int                rc = 0;
1994
1995         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1996
1997         peer = kgnilnd_find_peer_locked(src_nid);
1998         if (peer == NULL) {
1999                 /* we likely dropped him from bad data when we processed
2000                  * the original REQ */
2001                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2002                 return -EBADSLT;
2003         }
2004
2005         /* need to check peerstamp/connstamp against the ones we find
2006          * to make sure we don't close new (and good?) conns that we
2007          * formed after this connreq failed */
2008         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
2009                 kgn_conn_t        conn;
2010
2011                 if (list_empty(&peer->gnp_conns)) {
2012                         /* assume we already processed the datagram and it barfed up
2013                          * on this side too */
2014                         CDEBUG(D_NET, "dropping NAK from %s; "
2015                                "peer %s is already not connected\n",
2016                                 libcfs_nid2str(connreq->gncr_srcnid),
2017                                 libcfs_nid2str(connreq->gncr_dstnid));
2018                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2019                         return 0;
2020                 }
2021
2022                 /* stub up a connection with the connreq XXX_stamps to allow
2023                  * us to use close_stale_conns_locked */
2024                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
2025                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
2026                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
2027                 conn.gnc_device = peer->gnp_net->gnn_dev;
2028
2029                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
2030
2031                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2032                         "closed %d connections\n",
2033                         libcfs_nid2str(connreq->gncr_srcnid),
2034                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
2035         } else {
2036                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2037
2038                 if (list_empty(&peer->gnp_connd_list)) {
2039                         /* if peer isn't on waiting list, try to find one to nuke */
2040                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2041                                                            peer->gnp_nid);
2042
2043                         if (rc) {
2044                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2045                                         "canceled pending connect request\n",
2046                                         libcfs_nid2str(connreq->gncr_srcnid),
2047                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
2048                         }
2049
2050                         /* if we can't find a waiting dgram, we just drop the NAK - the
2051                          * connect must have failed (we didn't find a conn above and
2052                          * connecting is clear) -- so nothing to do besides drop */
2053                 } else {
2054                         /* peer is on list, meaning it is a new connect attempt from the one
2055                          * we started that generated the NAK - so just drop NAK */
2056
2057                         /* use negative to prevent error message */
2058                         rc = -EAGAIN;
2059                 }
2060                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2061         }
2062
2063         /* success! we found a peer and at least marked pending_nak */
2064         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2065
2066         return rc;
2067 }
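
/* The stack kgn_conn_t above is a "key" object: only the stamp fields that
 * close_stale_conns_locked() compares are filled in, which lets the NAK path
 * reuse the normal staleness logic without building a real conn.  A sketch of
 * the same trick with hypothetical fields:
 */
#include <stdio.h>
#include <string.h>

struct sk_conn { unsigned long peerstamp, connstamp; };

static int sk_is_stale(const struct sk_conn *have, const struct sk_conn *key)
{
	return have->peerstamp < key->peerstamp ||
	       have->connstamp < key->connstamp;
}

int main(void)
{
	struct sk_conn live = { .peerstamp = 10, .connstamp = 7 };
	struct sk_conn stub;		/* stack stub, as in process_nak */

	memset(&stub, 0, sizeof(stub));
	stub.peerstamp = 10;		/* stamps copied from the connreq */
	stub.connstamp = 9;

	printf("stale: %d\n", sk_is_stale(&live, &stub));
	return 0;
}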
2068
2069 int
2070 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2071 {
2072         int                      rc;
2073
2074         rc = kgnilnd_unpack_connreq(dgram);
2075         if (rc < 0) {
2076                 if (rc != -EBADF) {
2077                         /* only NAK if we have good srcnid to use */
2078                         *needs_nak = 1;
2079                 }
2080                 goto connreq_out;
2081         }
2082
2083         switch (dgram->gndg_conn_in.gncr_type) {
2084         case GNILND_CONNREQ_REQ:
2085                 /* wire up peer & conn, send queued TX */
2086                 rc = kgnilnd_finish_connect(dgram);
2087
2088                 /* don't nak when the nid is hosed */
2089                 if (rc < 0) {
2090                         *needs_nak = 1;
2091                 }
2092
2093                 break;
2094         case GNILND_CONNREQ_NAK:
2095                 rc = kgnilnd_process_nak(dgram);
2096                 /* return early to prevent reconnect bump */
2097                 return rc;
2098         default:
2099                 CERROR("unexpected connreq type %s (%d) from %s\n",
2100                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2101                         dgram->gndg_conn_in.gncr_type,
2102                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2103                 rc = -EINVAL;
2104                 *needs_nak = 1;
2105                 break;
2106         }
2107
2108 connreq_out:
2109         RETURN(rc);
2110 }
2111
2112 int
2113 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2114 {
2115         int                      rc;
2116         int                      needs_nak = 0;
2117         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2118         lnet_nid_t               orig_dstnid;
2119         kgn_dgram_t             *dgram = NULL;
2120         kgn_peer_t              *peer;
2121         ENTRY;
2122
2123         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2124                 rc = 0;
2125         } else {
2126                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2127         }
2128
2129         if (rc == 0) {
2130                 RETURN(0);
2131         } else if (rc < 0) {
2132                 GOTO(inform_peer, rc);
2133         } else {
2134                 /* rc > 0 means it did something; reset for this func */
2135                 rc = 0;
2136         }
2137
2138         switch (dgram->gndg_type) {
2139         case GNILND_DGRAM_WC_REQ:
2140         case GNILND_DGRAM_REQ:
2141                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2142                 break;
2143         case GNILND_DGRAM_NAK:
2144                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2145                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2146                 break;
2147         default:
2148                 CERROR("unknown datagram type %s (%d)\n",
2149                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2150                 break;
2151         }
2152
2153         /* stash data to use after releasing current datagram */
2154         /* don't stash net - we are operating on a net already,
2155          * so the lock on rw_net_lock is sufficient */
2156
2157         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2158
2159 inform_peer:
2160         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2161
2162         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2163
2164         kgnilnd_release_dgram(dev, dgram, 0);
2165
2166         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2167                libcfs_nid2str(orig_dstnid), rc);
2168
2169         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2170          * in kgnilnd_finish_connect - if errors are from before we get to there,
2171          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2172         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2173                 /* if we have a negative rc, we want to find a peer to inform about
2174                  * the bad connection attempt. Sorry buddy, better luck next time! */
2175
2176                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2177                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2178
2179                 if (peer != NULL) {
2180                         /* add ref to make sure he stays around past the possible unlink
2181                          * so we can tell LNet about him */
2182                         kgnilnd_peer_addref(peer);
2183
2184                         /* if he still cares about the outstanding connect */
2185                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2186                                 /* check if he is on the connd list and remove.. */
2187                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2188                                 if (!list_empty(&peer->gnp_connd_list)) {
2189                                         list_del_init(&peer->gnp_connd_list);
2190                                         /* drop connd ref */
2191                                         kgnilnd_peer_decref(peer);
2192                                 }
2193                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2194
2195                                 /* clear gnp_connecting so we don't have a non-connecting peer
2196                                  * on gnd_connd_list */
2197                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2198
2199                                 set_mb(peer->gnp_last_dgram_errno, rc);
2200
2201                                 kgnilnd_peer_increase_reconnect_locked(peer);
2202                         }
2203                 }
2204                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2205
2206                 /* now that we are outside the lock, tell Mommy */
2207                 if (peer != NULL) {
2208                         kgnilnd_peer_notify(peer, rc, 0);
2209                         kgnilnd_peer_decref(peer);
2210                 }
2211         }
2212
2213         if (needs_nak) {
2214                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2215         }
2216
2217         RETURN(1);
2218 }
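
/* Note the ordering above: orig_dstnid is stashed before the dgram is
 * released, because kgnilnd_release_dgram() may free it and any later field
 * access would be a use-after-free.  The pattern in miniature (hypothetical
 * names):
 */
#include <stdio.h>
#include <stdlib.h>

struct sk_dgram { long dstnid; };

int main(void)
{
	struct sk_dgram *dg = malloc(sizeof(*dg));
	long orig_dstnid;

	if (dg == NULL)
		return 1;
	dg->dstnid = 12345;

	orig_dstnid = dg->dstnid;	/* copy out what we still need */
	free(dg);			/* release; dg is dead now */

	printf("informing peer %ld\n", orig_dstnid);
	return 0;
}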
2219
2220 void
2221 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2222 {
2223         kgn_dgram_t    *dgram, *tmp;
2224         int             i;
2225
2226         spin_lock(&dev->gnd_dgram_lock);
2227
2228         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2229                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2230                         unsigned long            now = jiffies;
2231                         unsigned long            timeout;
2232
2233                         /* don't timeout stuff if the network is mucked or shutting down */
2234                         if (kgnilnd_check_hw_quiesce()) {
2235                                 break;
2236                         }
2237
2238                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2239                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2240                                 continue;
2241                         }
2242                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2243                                 "state %s conn 0x%p to %s age %lus\n",
2244                                 dgram, kgnilnd_dgram_type2str(dgram),
2245                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2246                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2247                                 cfs_duration_sec(now - dgram->gndg_post_time));
2248
2249                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2250
2251                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2252                                 continue;
2253
2254                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2255                                 "0x%p state %s conn 0x%p\n",
2256                                 kgnilnd_dgram_type2str(dgram),
2257                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2258                                 cfs_duration_sec(now - dgram->gndg_post_time),
2259                                 dgram, kgnilnd_dgram_state2str(dgram),
2260                                 dgram->gndg_conn);
2261
2262                         kgnilnd_cancel_dgram_locked(dgram);
2263                 }
2264         }
2265         spin_unlock(&dev->gnd_dgram_lock);
2266 }
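
/* time_before()/time_after() above stay correct across a jiffies rollover
 * because they compare via a signed difference of unsigned counters instead
 * of a plain "<".  A userspace model of the trick:
 */
#include <stdio.h>

static int sk_time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;	/* same shape as the kernel macro */
}

int main(void)
{
	unsigned long now = (unsigned long)-10;	/* 10 ticks before rollover */
	unsigned long deadline = now + 100;	/* wraps to a small value */

	/* the naive compare says the deadline already passed; the
	 * wrap-safe compare correctly says it has not */
	printf("naive:     %d\n", now < deadline);
	printf("wrap-safe: %d\n", sk_time_before(now, deadline));
	return 0;
}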
2267
2268
2269 /* use a thread for the possibly long-blocking wait_by_id to prevent
2270  * stalling the global workqueues */
2271 int
2272 kgnilnd_dgram_waitq(void *arg)
2273 {
2274         kgn_device_t     *dev = (kgn_device_t *) arg;
2275         char              name[16];
2276         gni_return_t      grc;
2277         __u64             readyid;
2278         DEFINE_WAIT(mover_done);
2279
2280         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2281
2282         /* all gnilnd threads need to run fairly urgently */
2283         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2284
2285         /* we don't shut down until the device shuts down ... */
2286         while (!kgnilnd_data.kgn_shutdown) {
2287                 /* to quiesce or to not quiesce, that is the question */
2288                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2289                         KGNILND_SPIN_QUIESCE;
2290                 }
2291
2292                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2293
2294                 /* check once a second */
2295                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2296                                                        1000, &readyid);
2297
2298                 if (grc == GNI_RC_SUCCESS) {
2299                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2300                         kgnilnd_schedule_dgram(dev);
2301
2302                         /* wait for dgram thread to ping us before spinning again */
2303                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2304                                         TASK_INTERRUPTIBLE);
2305
2306                         /* don't sleep if we need to quiesce */
2307                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2308                                 schedule();
2309                         }
2310                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2311                 }
2312         }
2313
2314         kgnilnd_thread_fini();
2315         return 0;
2316 }
2317
2318 int
2319 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2320 {
2321         int                      did_something = 0, rc;
2322         kgn_peer_t              *peer = NULL;
2323
2324         spin_lock(&dev->gnd_connd_lock);
2325
2326         /* Active connect - we added this in kgnilnd_launch_tx */
2327         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2328                 peer = list_first_entry(&dev->gnd_connd_peers,
2329                                         kgn_peer_t, gnp_connd_list);
2330
2331                 /* ref for connd removed in if/else below */
2332                 list_del_init(&peer->gnp_connd_list);
2333
2334                 /* gnp_connecting and membership on gnd_connd_peers should be
2335                  * done coherently to avoid double adding, etc */
2336                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2337                  * to get the peer to gnp_connecting in the first place. We just need to
2338                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2339                  * BEFORE clearing gnp_connecting */
2340                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2341                          peer, libcfs_nid2str(peer->gnp_nid));
2342
2343                 spin_unlock(&dev->gnd_connd_lock);
2344
2345                 CDEBUG(D_NET, "processing connect to %s\n",
2346                        libcfs_nid2str(peer->gnp_nid));
2347
2348                 did_something += 1;
2349                 rc = kgnilnd_start_connect(peer);
2350
2351                 if (likely(rc >= 0)) {
2352                         /* 0 on success, positive on 'just drop peer' errors */
2353                         kgnilnd_peer_decref(peer);
2354                 } else if (rc == -ENOMEM) {
2355                         /* if we are out of wildcards, add back to
2356                          * connd_list - then break out and we'll try later
2357                          * if other errors, we'll bail & cancel pending tx */
2358                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2359                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2360                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2361                                 spin_lock(&dev->gnd_connd_lock);
2362                                 list_add_tail(&peer->gnp_connd_list,
2363                                               &dev->gnd_connd_peers);
2364                         } else {
2365                                 /* connecting changed while we were posting */
2366
2367                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2368                                         " state 0x%p->%s, connecting %d\n",
2369                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2370                                 peer->gnp_connecting = GNILND_PEER_KILL;
2371                                 spin_lock(&dev->gnd_connd_lock);
2372                                 /* remove the peer ref from the connd list */
2373                                 kgnilnd_peer_decref(peer);
2374                                 /* let the system handle itself */
2375                         }
2376                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2377                         /* the datagrams are a global pool,
2378                          * so break out of trying and hope some free
2379                          * up soon */
2380                         did_something -= 1;
2381                         break;
2382                 } else {
2383                         /* something bad happened, you lose */
2384                         CNETERR("could not start connecting to %s "
2385                                 "rc %d: Will retry until TX timeout\n",
2386                                libcfs_nid2str(peer->gnp_nid), rc);
2387                         /* It didn't post, so just set connecting back to zero now.
2388                          * The reaper will reattempt the connection if it needs to.
2389                          * If the peer needs death, set it so the reaper will clean up.
2390                          */
2391                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2392                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2393                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2394                                 kgnilnd_peer_increase_reconnect_locked(peer);
2395                         } else {
2396                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2397                                         " state 0x%p->%s, connecting %d\n",
2398                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2399                                 peer->gnp_connecting = GNILND_PEER_KILL;
2400                         }
2401                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2402
2403                         /* hold onto ref until we are really done - if it was
2404                          * unlinked this could result in a destroy */
2405                         kgnilnd_peer_decref(peer);
2406                 }
2407                 spin_lock(&dev->gnd_connd_lock);
2408         }
2409
2410         spin_unlock(&dev->gnd_connd_lock);
2411         RETURN(did_something);
2412 }
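
/* The connd loop above has a deliberate shape: take gnd_connd_lock, detach
 * exactly one peer, drop the lock for the slow connect work, then re-take it
 * before looking at the list again.  A pthread sketch of that drain pattern
 * (hypothetical types):
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct sk_peer { struct sk_peer *next; int nid; };

static pthread_mutex_t	sk_connd_lock = PTHREAD_MUTEX_INITIALIZER;
static struct sk_peer  *sk_connd_peers;

int main(void)
{
	struct sk_peer *p;
	int i;

	for (i = 0; i < 3; i++) {
		p = malloc(sizeof(*p));
		p->nid = i;
		p->next = sk_connd_peers;
		sk_connd_peers = p;
	}

	pthread_mutex_lock(&sk_connd_lock);
	while ((p = sk_connd_peers) != NULL) {
		sk_connd_peers = p->next;
		pthread_mutex_unlock(&sk_connd_lock);

		printf("connecting to peer %d\n", p->nid);	/* unlocked */
		free(p);

		pthread_mutex_lock(&sk_connd_lock);
	}
	pthread_mutex_unlock(&sk_connd_lock);
	return 0;
}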
2413
2414 int
2415 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2416 {
2417         int did_something = 0, to_repost, i;
2418         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2419         ENTRY;
2420
2421         for (i = 0; i < to_repost; ++i) {
2422                 int     rerc;
2423                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2424                 if (rerc == 0) {
2425                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2426                         did_something += 1;
2427                 } else {
2428                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2429                                 rerc, dev->gnd_id);
2430                         break;
2431                 }
2432         }
2433
2434         RETURN(did_something);
2435 }
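
/* gnd_nwcdgrams works as a debt counter: failed WC reposts increment it
 * elsewhere, and the function above snapshots it once and works that backlog
 * off, so a counter racing upward cannot trap the thread in the loop.  A C11
 * stdatomic model of the same bounded drain (sk_* names are hypothetical):
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int sk_nwcdgrams = 3;	/* reposts currently owed */

static int sk_post_wc(int i)
{
	return (i == 2) ? -1 : 0;	/* pretend the third post fails */
}

int main(void)
{
	int to_repost = atomic_load(&sk_nwcdgrams);	/* snapshot first */
	int did = 0, i;

	for (i = 0; i < to_repost; i++) {
		if (sk_post_wc(i) != 0)
			break;		/* leave the remaining debt for later */
		atomic_fetch_sub(&sk_nwcdgrams, 1);
		did++;
	}
	printf("reposted %d, still owed %d\n", did, atomic_load(&sk_nwcdgrams));
	return 0;
}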
2436
2437 struct kgnilnd_dgram_timer {
2438         struct timer_list timer;
2439         kgn_device_t *dev;
2440 };
2441
2442 static void
2443 kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
2444 {
2445         struct kgnilnd_dgram_timer *t = cfs_from_timer(t, arg, timer);
2446
2447         wake_up(&t->dev->gnd_dgram_waitq);
2448 }
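
/* cfs_from_timer() above is the container_of() pattern: the timer core hands
 * the callback only a pointer to the embedded timer, and the callback
 * recovers the enclosing struct (and so the device) by subtracting the
 * member's offset.  A portable userspace equivalent:
 */
#include <stddef.h>
#include <stdio.h>

#define sk_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct sk_timer { int pending; };

struct sk_dgram_timer {
	struct sk_timer	timer;	/* embedded, like timer_list */
	int		dev_id;	/* context the callback needs */
};

static void sk_poke(struct sk_timer *t)
{
	struct sk_dgram_timer *dt =
		sk_container_of(t, struct sk_dgram_timer, timer);

	printf("waking dgram waitq on dev %d\n", dt->dev_id);
}

int main(void)
{
	struct sk_dgram_timer dt = { .timer = { 0 }, .dev_id = 1 };

	sk_poke(&dt.timer);	/* callback only sees the member */
	return 0;
}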
2449
2450 /* use single thread for dgrams - should be sufficient for performance */
2451 int
2452 kgnilnd_dgram_mover(void *arg)
2453 {
2454         kgn_device_t            *dev = (kgn_device_t *)arg;
2455         char                     name[16];
2456         int                      rc, did_something;
2457         unsigned long            next_purge_check = jiffies - 1;
2458         unsigned long            timeout;
2459         struct kgnilnd_dgram_timer timer;
2460         unsigned long deadline = 0;
2461         DEFINE_WAIT(wait);
2462
2463         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2464
2465         /* all gnilnd threads need to run fairly urgently */
2466         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2467
2468         /* we are ok not locking for these variables as the dgram waitq threads
2469          * will block both due to tying up net (kgn_shutdown) and the completion
2470          * event for the dgram_waitq (kgn_quiesce_trigger) */
2471         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2472         while (!kgnilnd_data.kgn_shutdown) {
2473                 /* Safe: kgn_shutdown only set when quiescent */
2474
2475                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2476                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2477                  * quiesce check so that it'll go right into that and not do any
2478                  * dgram mucking */
2479                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2480
2481                 /* to quiesce or to not quiesce, that is the question */
2482                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2483                         KGNILND_SPIN_QUIESCE;
2484                 }
2485                 did_something = 0;
2486
2487                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2488
2489                 /* process any newly completed dgrams */
2490                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2491
2492                 rc = kgnilnd_probe_and_process_dgram(dev);
2493                 if (rc > 0) {
2494                         did_something += rc;
2495                 }
2496
2497                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2498
2499                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2500                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2501                 /* start new outbound dgrams */
2502                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2503
2504                 /* find dead dgrams */
2505                 if (time_after_eq(jiffies, next_purge_check)) {
2506                         /* these don't need to be checked that often */
2507                         kgnilnd_reaper_dgram_check(dev);
2508
2509                         next_purge_check = (long) jiffies +
2510                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2511                 }
2512
2513                 did_something += kgnilnd_repost_wc_dgrams(dev);
2514
2515                 /* careful with the jiffy wrap... */
2516                 timeout = (long)(next_purge_check - jiffies);
2517
2518                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2519                        did_something, timeout, next_purge_check, jiffies);
2520
2521                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2522                         did_something = 0;
2523                         continue;
2524                 }
2525
2526                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2527
2528                 cfs_timer_setup(&timer.timer,
2529                                 kgnilnd_dgram_poke_with_stick,
2530                                 dev, 0);
2531                 timer.dev = dev;
2532                 mod_timer(&timer.timer, (long) jiffies + timeout);
2533
2534                 /* last second chance for others to poke us */
2535                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2536
2537                 /* check flag variables before committing even if we
2538                  * did something; if we are after the deadline call
2539                  * schedule */
2540                 if ((!did_something || time_after(jiffies, deadline)) &&
2541                     !kgnilnd_data.kgn_shutdown &&
2542                     !kgnilnd_data.kgn_quiesce_trigger) {
2543                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2544                                timeout, cfs_duration_sec(timeout));
2545                         wake_up(&dev->gnd_dgping_waitq);
2546                         schedule();
2547                         CDEBUG(D_INFO, "awake after schedule\n");
2548                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2549                 }
2550
2551                 timer_delete_sync(&timer.timer);
2552                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2553         }
2554
2555         kgnilnd_thread_fini();
2556         return 0;
2557 }