Whamcloud - gitweb
LU-8368 gnilnd: Use kgnilnd_vzalloc() to avoid stalls
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_conn.c
1 /*
2  * Copyright (C) 2012 Cray, Inc.
3  *
4  * Copyright (c) 2014, Intel Corporation.
5  *
6  *   Author: Nic Henke <nic@cray.com>
7  *   Author: James Shimek <jshimek@cray.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  */
25
26 #include "gnilnd.h"
27
28 void
29 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
30 {
31         smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
32         smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
33         smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
34 }
35
/* Register @fma_blk's memory with the NIC on @device so its SMSG
 * mailboxes can be used for FMA traffic.
 *
 * PHYS blocks are registered with GNI_MEM_PHYS_CONT; only non-PHYS
 * (virtual) blocks are counted in gnd_nbytes_map, since PHYS_CONT
 * memory isn't really mapped in GART, though every registration still
 * consumes an MDD.
 *
 * Returns 0 on success, -ENOMEM if registration fails.  If
 * registrations keep failing for longer than the reg_fail_timeout
 * tunable allows, the node LBUGs rather than limp along.
 *
 * NOTE(review): reg_to is function-static shared state; presumably all
 * callers are serialized (e.g. via gnd_fmablk_mutex) -- confirm. */
int
kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
{
	gni_return_t            rrc;
	__u32                   flags = GNI_MEM_READWRITE;
	/* jiffies deadline after which repeated registration failure is fatal */
	static unsigned long    reg_to;
	int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;

	/* physical blocks are contiguous - tell the NIC so */
	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
		flags |= GNI_MEM_PHYS_CONT;
	}

	fma_blk->gnm_hold_timeout = 0;

	/* make sure we are mapping a clean block */
	LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);

	rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
				   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
				   flags, &fma_blk->gnm_hndl);
	if (rrc != GNI_RC_SUCCESS) {
		if (rfto != GNILND_REGFAILTO_DISABLE) {
			/* first failure starts the clock; once we've been
			 * failing for rfto seconds, give up hard */
			if (reg_to == 0) {
				reg_to = jiffies + cfs_time_seconds(rfto);
			} else if (time_after(jiffies, reg_to)) {
				CERROR("FATAL:fmablk registration has failed "
				       "for %ld seconds.\n",
				       cfs_duration_sec(jiffies - reg_to) +
						rfto);
				LBUG();
			}
		}

		CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
			fma_blk, fma_blk->gnm_mbox_size, flags);
		RETURN(-ENOMEM);
	}

	/* success resets the registration-failure clock */
	reg_to = 0;

	/* PHYS_CONT memory isn't really mapped, at least not in GART -
	 *  but all mappings chew up a MDD
	 */
	if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
		atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
	}

	atomic_inc(&device->gnd_n_mdd);
	/* nfmablk is live (mapped) blocks */
	atomic_inc(&device->gnd_nfmablk);

	RETURN(0);
}
89
/* Allocate, register and publish a new block of SMSG mailboxes on @device.
 *
 * @use_phys selects the backing store: physical (kmem_cache, registered
 * PHYS_CONT; used only for preallocation at startup and kept until
 * shutdown) vs virtual (kgnilnd_vzalloc; used for all runtime growth).
 *
 * Allocation is serialized by gnd_fmablk_mutex.  gnd_fmablk_vers is
 * sampled before and after taking the mutex; if it changed, another
 * thread altered the block lists, so we return 0 without allocating and
 * let the caller re-scan for an existing free mailbox.
 *
 * Returns 0 on success (including the "version changed, recheck" case)
 * or -ENOMEM on allocation/registration failure; errors unwind via the
 * goto-cleanup labels at the bottom. */
int
kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
{
	int                     rc = 0;
	int                     num_mbox;
	kgn_fma_memblock_t     *fma_blk;
	gni_smsg_attr_t         smsg_attr;
	unsigned long           fmablk_vers;

#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
	/* We allocate large blocks of memory here potentially leading
	 * to memory exhaustion during massive reconnects during a network
	 * outage. Limit the amount of fma blocks to use by always keeping
	 * a percent of pages free initially set to 25% of total memory. */
	if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
		LCONSOLE_INFO("Exceeding free page limit of %ld. "
			      "Free pages available %ld\n",
			      kgnilnd_data.free_pages_limit,
			      global_page_state(NR_FREE_PAGES));
		return -ENOMEM;
	}
#endif
	/* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
	 * to this allocation code. Everyone will sample the version
	 * before and after getting the mutex. If it has changed,
	 * we'll bail out to check the lists again - this indicates that
	 * some sort of change was made to the lists and it is possible
	 * that there is a mailbox for us to find now. This should prevent
	 * a ton of spinning in the case where there are lots of threads
	 * that need a yet-to-be-allocated mailbox for a connection. */

	fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
	mutex_lock(&device->gnd_fmablk_mutex);

	if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
		/* version changed while we were waiting for semaphore,
		 * we'll recheck the lists assuming something nice happened */
		mutex_unlock(&device->gnd_fmablk_mutex);
		return 0;
	}

	LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
	if (fma_blk == NULL) {
		CNETERR("could not allocate fma block descriptor\n");
		rc = -ENOMEM;
		GOTO(out, rc);
	}

	INIT_LIST_HEAD(&fma_blk->gnm_bufflist);

	kgnilnd_setup_smsg_attr(&smsg_attr);

	/* ask kgni how big one mailbox must be for these attributes */
	gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);

	LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);

	/* gni_smsg_buff_size_needed calculates the base mailbox size and since
	 * we want to hold kgn_peer_credits worth of messages in both directions,
	 * we add PAYLOAD to grow the mailbox size
	 */

	fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;

	/* we'll only use physical during preallocate at startup -- this keeps it nice and
	 * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
	 * as reallocating them is tough if there is memory fragmentation */

	if (use_phys) {
		fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
		if (fma_blk->gnm_block == NULL) {
			CNETERR("could not allocate physical SMSG mailbox memory\n");
			rc = -ENOMEM;
			GOTO(free_desc, rc);
		}
		/* cache objects are KMALLOC_MAX_SIZE; fit as many
		 * mailboxes as the block holds */
		fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
		num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;

		LASSERTF(num_mbox >= 1,
			 "num_mbox %d blk_size %u mbox_size %d\n",
			  num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);

		fma_blk->gnm_state = GNILND_FMABLK_PHYS;

	} else {
		/* virtual blocks are sized from the mbox_per_block tunable */
		num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
		fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;

		LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
			 "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
			 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
			 *kgnilnd_tunables.kgn_mbox_per_block);

		fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
		if (fma_blk->gnm_block == NULL) {
			CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
			rc = -ENOMEM;
			GOTO(free_desc, rc);
		}

		fma_blk->gnm_state = GNILND_FMABLK_VIRT;
	}

	/* allocate just enough space for the bits to track the mailboxes */
	LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
	if (fma_blk->gnm_bit_array == NULL) {
		CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
		       sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
		rc = -ENOMEM;
		GOTO(free_blk, rc);
	}
	bitmap_zero(fma_blk->gnm_bit_array, num_mbox);

	/* now that the num_mbox is set based on allocation type, get debug info setup */
	LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
	if (fma_blk->gnm_mbox_info == NULL) {
		CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
		       sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
		rc = -ENOMEM;
		GOTO(free_bit, rc);
	}

	/* register the block with the NIC before publishing it */
	rc = kgnilnd_map_fmablk(device, fma_blk);
	if (rc) {
		GOTO(free_info, rc);
	}

	fma_blk->gnm_next_avail_mbox = 0;
	fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;

	CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
		"mbox_size %d MDD %#llx.%#llx\n",
		fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
		fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
		fma_blk->gnm_hndl.qword2);

	/* lock Is protecting data structures, not semaphore */

	spin_lock(&device->gnd_fmablk_lock);
	list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);

	/* toggle under the lock so once they change the list is also
	 * ready for others to traverse */
	atomic_inc(&device->gnd_fmablk_vers);

	spin_unlock(&device->gnd_fmablk_lock);

	mutex_unlock(&device->gnd_fmablk_mutex);

	return 0;

	/* error unwind: free in reverse order of allocation */
free_info:
	LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
free_bit:
	LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
free_blk:
	/* gnm_state records which allocator backed gnm_block */
	if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
		kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
	} else {
		kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
	}
free_desc:
	LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
	mutex_unlock(&device->gnd_fmablk_mutex);
	return rc;
}
256
/* Deregister @fma_blk's memory from the NIC on @dev.
 *
 * If mailboxes are still held (purgatory) outside shutdown, or we are
 * in a stack reset, the MDD is deregistered with a hold timeout so it
 * cannot be reused while a peer may still touch the old mapping.
 * Updates the gnd_n_mdd/gnd_n_mdd_held/gnd_nfmablk accounting and
 * moves virtual blocks to IDLE; PHYS blocks keep their state (and,
 * during reset, get their handle cleared so they can be re-mapped). */
void
kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
	gni_return_t            rrc;

	/* if some held, set hold_timeout from conn timeouts used in this block
	 * but not during shutdown, then just nuke and pave
	 * During a stack reset, we need to deregister with a hold timeout
	 * set so we don't use the same mdd after reset is complete */
	if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
	    kgnilnd_data.kgn_in_reset) {
		fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
	}

	/* we are changing the state of a block, tickle version to tell
	 * proc code list is stale now */
	atomic_inc(&dev->gnd_fmablk_vers);

	rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);

	CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
	       "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
		"hold_timeout %d\n",
	       fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
	       fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
	       fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
	       fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);

	LASSERTF(rrc == GNI_RC_SUCCESS,
		"tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
		fma_blk, rrc);

	/* a deregister with hold_timeout keeps the MDD alive - count it as
	 * held instead of dropping the MDD count (except for PHYS blocks
	 * during reset, which keep their MDD for re-use) */
	if (fma_blk->gnm_hold_timeout &&
	    !(kgnilnd_data.kgn_in_reset &&
	      fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
		atomic_inc(&dev->gnd_n_mdd_held);
	} else {
		atomic_dec(&dev->gnd_n_mdd);
	}

	/* PHYS blocks don't get mapped */
	if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
		atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
		fma_blk->gnm_state = GNILND_FMABLK_IDLE;
	} else if (kgnilnd_data.kgn_in_reset) {
		/* in stack reset, clear MDD handle for PHYS blocks, as we'll
		 * re-use the fma_blk after reset so we don't have to drop/allocate
		 * all of those physical blocks */
		fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
	}

	/* Decrement here as this is the # of mapped blocks */
	atomic_dec(&dev->gnd_nfmablk);
}
311
312
/* Free @fma_blk and its tracking arrays and drop it from gnd_fma_buffs.
 * Needs lock on gnd_fmablk_lock to cover gnd_fma_buffs.
 *
 * All mailboxes must be available again (asserted on entry).  If the
 * block still carries a hold_timeout, its MDD is released here first -
 * except over a stack reset, where the MDD is deliberately left
 * dangling. */
void
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
	LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
		 "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
		 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
		fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);

	/* tell proc readers the block list changed */
	atomic_inc(&dev->gnd_fmablk_vers);

	if (fma_blk->gnm_hold_timeout) {
		CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
			"mbox_size %d\n",
			fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
			fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);

		/* We leave MDD dangling over stack reset */
		if (!kgnilnd_data.kgn_in_reset) {
			kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
		}
		/* ignoring the return code - if kgni/ghal can't find it
		 * it must be released already */
		atomic_dec(&dev->gnd_n_mdd_held);
		atomic_dec(&dev->gnd_n_mdd);
	}

	/* we cant' free the gnm_block until all the conns have released their
	 * purgatory holds. While we have purgatory holds, we might check the conn
	 * RX mailbox during the CLOSING process. It is possible that kgni might
	 * try to look into the RX side for credits when sending the CLOSE msg too */
	CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
		fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);

	/* free with the allocator that matches gnm_state */
	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
		kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
	} else {
		kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
	}
	fma_blk->gnm_state = GNILND_FMABLK_FREED;

	list_del(&fma_blk->gnm_bufflist);

	LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
	LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
	LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}
360
/* Scan the device's fma block list for a free SMSG mailbox and, if one
 * is found, claim it for @conn: set its bit in the block's bitmap, fill
 * conn->gnpr_smsg_attr, record debug info and zero the mailbox memory.
 *
 * On failure (no mapped block with a free mailbox), conn's
 * smsg_attr.msg_buffer is left untouched; the caller pre-sets it to
 * NULL and uses that to detect "nothing found". */
void
kgnilnd_find_free_mbox(kgn_conn_t *conn)
{
	kgn_device_t            *dev = conn->gnc_device;
	gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
	kgn_fma_memblock_t      *fma_blk;
	kgn_mbox_info_t         *mbox = NULL;
	int                     id;

	spin_lock(&dev->gnd_fmablk_lock);

	list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
			    gnm_bufflist) {
		/* skip blocks with nothing available or not currently mapped */
		if (fma_blk->gnm_avail_mboxs <= 0 ||
		    fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
			continue;
		}
		/* look in bitarray for available mailbox */
		do {
			id = find_next_zero_bit(
				fma_blk->gnm_bit_array,
				fma_blk->gnm_num_mboxs,
				fma_blk->gnm_next_avail_mbox);
			if (id == fma_blk->gnm_num_mboxs &&
			    fma_blk->gnm_next_avail_mbox != 0) {
				/* wrap around */
				fma_blk->gnm_next_avail_mbox = 0;
			} else {
				break;
			}
		} while (1);

		/* avail_mboxs > 0 guarantees the scan found a zero bit */
		LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
			 id, fma_blk->gnm_num_mboxs);
		set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
		conn->gnc_mbox_id = id;

		fma_blk->gnm_next_avail_mbox =
			(id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
		fma_blk->gnm_avail_mboxs--;
		conn->gnc_fma_blk = fma_blk;

		kgnilnd_setup_smsg_attr(smsg_attr);

		/* point the conn's SMSG attributes at its slot in the block */
		smsg_attr->msg_buffer = fma_blk->gnm_block;
		smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
		smsg_attr->mem_hndl = fma_blk->gnm_hndl;
		smsg_attr->buff_size = fma_blk->gnm_mbox_size;

		/* We'll set the hndl to zero for PHYS blocks unmapped during stack
		 * reset and re-use the same fma_blk after stack reset. This ensures we've
		 * properly mapped it before we use it */
		LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
			 fma_blk, fma_blk->gnm_state);

		CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
			"allocating SMSG mbox %d buf %p "
			"offset %u hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			smsg_attr->msg_buffer, smsg_attr->mbox_offset,
			fma_blk->gnm_hndl.qword1,
			fma_blk->gnm_hndl.qword2);

		mbox = &fma_blk->gnm_mbox_info[id];
		mbox->mbx_create_conn_memset = jiffies;
		mbox->mbx_nallocs++;
		mbox->mbx_nallocs_total++;

		/* zero mbox to remove any old data from our last use.
		 * this better be safe, if not our purgatory timers
		 * are too short or a peer really is misbehaving */
		memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
		       0, smsg_attr->buff_size);
		break;
	}

	spin_unlock(&dev->gnd_fmablk_lock);
}
439
440 int
441 kgnilnd_setup_mbox(kgn_conn_t *conn)
442 {
443         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
444         int                      err = 0;
445
446         smsg_attr->msg_buffer = NULL;
447         /* Look for available mbox */
448         do {
449                 kgnilnd_find_free_mbox(conn);
450
451                 /* nothing in the existing buffers, make a new one */
452                 if (smsg_attr->msg_buffer == NULL) {
453                         /* for runtime allocations, we only want vmalloc */
454                         err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
455                         if (err) {
456                                 break;
457                         }
458                 }
459         } while (smsg_attr->msg_buffer == NULL);
460
461         if (err)
462                 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
463                         conn, err);
464         return err;
465 }
466
/* Release @conn's SMSG mailbox back to its fma block.
 *
 * @purgatory_hold: > 0  - hold the mailbox (conn entering purgatory)
 *                  == 0 - free it outright
 *                  < 0  - was held, release the hold and free it
 *
 * For non-PHYS blocks this may also unmap and then free the whole
 * block once every mailbox in it is available again; PHYS blocks are
 * torn down only from startup/shutdown paths.  No-op if the conn never
 * got a mailbox. */
void
kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
{
	kgn_device_t           *dev = conn->gnc_device;
	gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
	kgn_fma_memblock_t     *fma_blk = NULL;
	kgn_mbox_info_t        *mbox = NULL;
	int                     found = 0;
	int                     id;

	/* if we failed to setup mbox and now destroying conn */
	if (smsg_attr->msg_buffer == NULL) {
		return;
	}

	id = conn->gnc_mbox_id;

	spin_lock(&dev->gnd_fmablk_lock);
	/* make sure our conn points at a valid fma_blk
	 * We use this instead of a mem block search out of smsg_attr
	 * because we could have freed a block for fma_blk #1 but the fma_blk
	 * is still in the list for a purgatory hold. This would induce a false
	 * match if that same block gets reallocated to fma_blk #2 */
	list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
		if (fma_blk == conn->gnc_fma_blk) {
			found = 1;
			break;
		}
	}
	LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
		 "anywhere in the world\n", conn, conn->gnc_fma_blk);

	LASSERTF(id < fma_blk->gnm_num_mboxs,
		"bad id %d max %d\n",
		id, fma_blk->gnm_num_mboxs);

	/* < 0 - was held, now free it
	 * == 0 - just free it
	 * > 0 - hold it for now */
	if (purgatory_hold == 0) {
		CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
			"hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
		fma_blk->gnm_avail_mboxs++;

	} else if (purgatory_hold > 0) {
		CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
			"hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

		/* track the longest conn timeout in the block so unmap can
		 * pick a safe MDD hold time */
		fma_blk->gnm_held_mboxs++;
		fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
						conn->gnc_timeout);
	} else {
		CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
			"hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

		fma_blk->gnm_held_mboxs--;
		fma_blk->gnm_avail_mboxs++;
	}

	if (purgatory_hold <= 0) {
		/* if kgni is retransmitting, freeing the smsg block before the EP
		 * is destroyed gets messy. Bug 768295. */
		LASSERTF(conn->gnc_ephandle == NULL,
			 "can't release mbox before EP is nuked. conn 0x%p\n", conn);

		mbox = &fma_blk->gnm_mbox_info[id];
		mbox->mbx_release_from_purgatory = jiffies;

		/* clear conn gnc_fmablk if it is gone - this allows us to
		 * not worry about state so much in kgnilnd_destroy_conn
		 * and makes the guaranteed cleanup of the resources easier */
		LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
			"conn %p bit %d already cleared in fma_blk %p\n",
			 conn, id, fma_blk);
		conn->gnc_fma_blk = NULL;
		mbox->mbx_nallocs--;
	}

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
		CERROR("LBUGs in your future: forcibly marking fma_blk %p "
		       "as mapped\n", fma_blk);
		fma_blk->gnm_state = GNILND_FMABLK_VIRT;
	}

	/* we don't release or unmap PHYS blocks as part of the normal cycle --
	 * those are controlled manually from startup/shutdown */
	if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
		/* we can unmap once all are unused (held or avail)
		 * but check hold_timeout to make sure we are not trying to double
		 * unmap this buffer. If there was no hold_timeout set due to
		 * held_mboxs, we'll free the mobx here shortly and won't have to
		 * worry about catching a double free for a 'clean' fma_blk */
		if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
		    (!fma_blk->gnm_hold_timeout)) {
			kgnilnd_unmap_fmablk(dev, fma_blk);
		}

		/* But we can only free once they are all avail */
		if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
		    fma_blk->gnm_held_mboxs == 0) {
			/* all mailboxes are released, free fma_blk */
			kgnilnd_free_fmablk_locked(dev, fma_blk);
		}
	}

	spin_unlock(&dev->gnd_fmablk_lock);
}
580
581 int
582 kgnilnd_count_phys_mbox(kgn_device_t *device)
583 {
584         int                     i = 0;
585         kgn_fma_memblock_t     *fma_blk;
586
587         spin_lock(&device->gnd_fmablk_lock);
588
589         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
590                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
591                         i += fma_blk->gnm_num_mboxs;
592         }
593         spin_unlock(&device->gnd_fmablk_lock);
594
595         RETURN(i);
596 }
597
598 int
599 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
600 {
601         int     rc;
602
603         while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
604
605                 rc = kgnilnd_alloc_fmablk(device, 1);
606                 if (rc) {
607                         CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
608                                 kgnilnd_count_phys_mbox(device), rc);
609                         RETURN(rc);
610                 }
611         }
612         RETURN(0);
613 }
614
615 int
616 kgnilnd_map_phys_fmablk(kgn_device_t *device)
617 {
618
619         int                     rc = 0;
620         kgn_fma_memblock_t     *fma_blk;
621
622         /* use mutex to gate access to single thread, just in case */
623         mutex_lock(&device->gnd_fmablk_mutex);
624
625         spin_lock(&device->gnd_fmablk_lock);
626
627         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
628                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
629                         rc = kgnilnd_map_fmablk(device, fma_blk);
630                         if (rc)
631                                 break;
632                 }
633         }
634         spin_unlock(&device->gnd_fmablk_lock);
635
636         mutex_unlock(&device->gnd_fmablk_mutex);
637
638         RETURN(rc);
639 }
640
641 void
642 kgnilnd_unmap_fma_blocks(kgn_device_t *device)
643 {
644
645         kgn_fma_memblock_t      *fma_blk;
646
647         /* use mutex to gate access to single thread, just in case */
648         mutex_lock(&device->gnd_fmablk_mutex);
649
650         spin_lock(&device->gnd_fmablk_lock);
651
652         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
653                 kgnilnd_unmap_fmablk(device, fma_blk);
654         }
655         spin_unlock(&device->gnd_fmablk_lock);
656
657         mutex_unlock(&device->gnd_fmablk_mutex);
658 }
659
660 void
661 kgnilnd_free_phys_fmablk(kgn_device_t *device)
662 {
663
664         kgn_fma_memblock_t      *fma_blk, *fma_blkN;
665
666         /* use mutex to gate access to single thread, just in case */
667         mutex_lock(&device->gnd_fmablk_mutex);
668
669         spin_lock(&device->gnd_fmablk_lock);
670
671         list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
672                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
673                         kgnilnd_free_fmablk_locked(device, fma_blk);
674         }
675         spin_unlock(&device->gnd_fmablk_lock);
676
677         mutex_unlock(&device->gnd_fmablk_mutex);
678 }
679
680 /* kgnilnd dgram nid->struct managment */
681
682 static inline struct list_head *
683 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
684 {
685         unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
686
687         RETURN(&dev->gnd_dgrams[hash]);
688 }
689
690
691 /* needs dev->gnd_dgram_lock held */
692 kgn_dgram_t *
693 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
694 {
695         struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
696         kgn_dgram_t      *dgram;
697
698         list_for_each_entry(dgram, dgram_list, gndg_list) {
699
700                 /* if state > POSTED, we are already handling cancel/completion */
701                 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
702                      dgram->gndg_state > GNILND_DGRAM_POSTED)
703                         continue;
704
705                 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
706                        dgram, libcfs_nid2str(dst_nid));
707                 return dgram;
708         }
709         return NULL;
710 }
711
712 int
713 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
714 {
715         kgn_dgram_t     *dgram;
716
717         spin_lock(&dev->gnd_dgram_lock);
718         dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
719
720         if (dgram) {
721                 kgnilnd_cancel_dgram_locked(dgram);
722         }
723         spin_unlock(&dev->gnd_dgram_lock);
724
725         RETURN(!!(dgram == NULL));
726 }
727
/* Fill in the wire-format connreq for an outgoing datagram.
 *
 * \param connreq  buffer to fill (zeroed at dgram allocation time)
 * \param conn     conn supplying connstamp/timeout and, for REQ, the
 *                 device/cqid and SMSG mailbox parameters
 * \param srcnid   our NID to advertise
 * \param dstnid   peer NID (LNET_NID_ANY for wildcards)
 * \param type     GNILND_CONNREQ_{REQ,NAK,CLOSE}
 *
 * Returns 0 or a negative errno from mailbox setup (REQ only).
 */
int
kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
                     lnet_nid_t srcnid, lnet_nid_t dstnid,
                     kgn_connreq_type_t type)
{
        int err = 0;

        /* ensure we haven't violated max datagram size */
        CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);

        /* no need to zero out, we do that when allocating dgram */
        connreq->gncr_magic     = GNILND_MSG_MAGIC;

        /* fault injection: corrupt src or dst NID so the peer's
         * validation in kgnilnd_unpack_connreq() gets exercised */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
                srcnid = 0xABADBABE;
        } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
                dstnid = 0xDEFEC8ED;
        }

        connreq->gncr_srcnid    = srcnid;
        connreq->gncr_dstnid    = dstnid;

        /* each field below does its own CFS_FAIL_CHECK on purpose - with
         * fail-once style injection only one of these calls fires, so a
         * single field gets corrupted per injected failure; do NOT merge
         * these into one check */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_version = 99;
        } else {
                connreq->gncr_version   = GNILND_CONNREQ_VERSION;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_type = 99;
        } else {
                connreq->gncr_type      = type;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_peerstamp = 0;
        } else {
                connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_connstamp = 0;
        } else {
                connreq->gncr_connstamp = conn->gnc_my_connstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_timeout = 0;
        } else {
                connreq->gncr_timeout   = conn->gnc_timeout;
        }

        /* the rest pack the data into the payload in other places */
        if (type == GNILND_CONNREQ_REQ) {
                kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
                req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
                req_params->gnpr_cqid = conn->gnc_cqid;

                /* allocate mailbox for this connection */
                err = kgnilnd_setup_mbox(conn);
                if (err != 0) {
                        CERROR("Failed to setup FMA mailbox (%d)\n", err);
                }
                /* copied even on mbox failure; err is returned below so the
                 * caller won't post a dgram carrying the stale attrs */
                req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
        }

        /* XXX Nic: TBD - checksum computation */

        return err;
}
794
795 int
796 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
797 {
798         kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
799         int                      swab, rc = 0;
800         kgn_net_t               *net;
801
802         /* the following fields must be handled in a backwards compatible
803          * manner to ensure we can always send and interpret NAKs */
804
805         if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
806             connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
807                 /* Unexpected magic! */
808                 CERROR("Unexpected magic %08x\n",
809                        connreq->gncr_magic);
810                 return -EBADF;
811         }
812
813         swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
814         if (swab) {
815                 __swab32s(&connreq->gncr_magic);
816                 __swab32s(&connreq->gncr_cksum);
817                 __swab16s(&connreq->gncr_type);
818                 __swab16s(&connreq->gncr_version);
819                 __swab32s(&connreq->gncr_timeout);
820                 __swab64s(&connreq->gncr_srcnid);
821                 __swab64s(&connreq->gncr_dstnid);
822                 __swab64s(&connreq->gncr_peerstamp);
823                 __swab64s(&connreq->gncr_connstamp);
824         }
825
826         /* Do NOT return anything but -EBADF before we munge
827          * connreq->gncr_srcnid - we need that to send the nak */
828
829         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
830                 lnet_nid_t      incoming = connreq->gncr_srcnid;
831
832                 /* even if the incoming packet is hosed, we know who we sent
833                  * the original and can set the srcnid so that we can properly
834                  * look up our peer to close the loop on this connreq. We still use
835                  * -EBADF to prevent a NAK - just in case there are issues with
836                  * the payload coming from a random spot, etc. */
837                 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
838
839                 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
840                                 LNET_NIDADDR(incoming)) {
841                         /* we got a datagram match for the wrong nid... */
842                         CERROR("matched datagram 0x%p with srcnid %s "
843                                 "(%x), expecting %s (%x)\n",
844                                 dgram,
845                                 libcfs_nid2str(incoming),
846                                 LNET_NIDADDR(incoming),
847                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
848                                 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
849                         return -EBADF;
850                 }
851         } else {
852                 /* if we have a wildcard datagram it should match an
853                  * incoming "active" datagram that should have a fully formed
854                  * srcnid and dstnid. If we couldn't unpack it, we drop as
855                  * corrupted packet, otherwise we'll just verify that the dstnid
856                  * matches the NID for the NET that the dgram was posted */
857
858                 /* make sure their wildcard didn't match ours, that is unpossible */
859                 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
860                          "dgram 0x%p from %s, connreq 0x%p; "
861                          "wildcard matched wildcard \n", dgram,
862                          libcfs_nid2str(connreq->gncr_srcnid), connreq);
863
864                 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
865
866                 if (rc == -ESHUTDOWN) {
867                         CERROR("Looking up network: device is in shutdown");
868                         return rc;
869                 } else if (rc == -ENONET) {
870                         CERROR("Connection data from %s: she sent "
871                         "dst_nid %s, but net lookup failed on "
872                         "dgram 0x%p@%s\n",
873                         libcfs_nid2str(connreq->gncr_srcnid),
874                         libcfs_nid2str(connreq->gncr_dstnid),
875                         dgram, kgnilnd_dgram_type2str(dgram));
876                         return rc;
877                 }
878
879                 if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
880                         CERROR("Bad connection data from %s: she sent "
881                                "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
882                                libcfs_nid2str(connreq->gncr_srcnid),
883                                libcfs_nid2str(connreq->gncr_dstnid),
884                                libcfs_nid2str(net->gnn_ni->ni_nid),
885                                dgram, kgnilnd_dgram_type2str(dgram));
886                         kgnilnd_net_decref(net);
887                         return -EBADSLT;
888                 }
889
890                 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
891                 kgnilnd_net_decref(net);
892         }
893
894         if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
895                 CERROR("Unexpected version %d\n", connreq->gncr_version);
896                 return -EPROTO;
897         }
898
899         /* XXX Nic: TBD - checksum validation */
900         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
901                 return -EBADF;
902         }
903
904         if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
905                 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
906
907                 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
908                 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
909                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
910                 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
911                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
912                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
913                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
914                 __swab64s(&msg_addr);
915                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
916                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
917         } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
918                 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
919         }
920
921         /* since we use a unique instance ID for each network, the driver
922          * will take care of dropping datagrams if we don't have that network.
923          */
924
925         /* few more idiot software or configuration checks */
926
927         switch (connreq->gncr_type) {
928         case GNILND_CONNREQ_REQ:
929                 /* wire up EP and SMSG block - this will check the incoming data
930                  * and barf a NAK back if need to */
931                 rc = kgnilnd_set_conn_params(dgram);
932                 if (rc)
933                         return rc;
934                 break;
935         case GNILND_CONNREQ_NAK:
936         case GNILND_CONNREQ_CLOSE:
937                 break;
938         default:
939                 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
940                 return -EPROTO;
941         }
942
943         if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
944                 CERROR("Recived bad timestamps peer %llu conn %llu\n",
945                 connreq->gncr_peerstamp, connreq->gncr_connstamp);
946                 return -EPROTO;
947         }
948
949         if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
950                 CERROR("Received timeout %d < MIN %d\n",
951                        connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
952                 return -EPROTO;
953         }
954
955         return 0;
956 }
957
958 int
959 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
960 {
961         kgn_dgram_t         *dgram;
962
963         dgram = kmem_cache_alloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
964         if (dgram == NULL)
965                 return -ENOMEM;
966
967         /* cache alloc'd memory is not zeroed */
968         memset((void *)dgram, 0, sizeof(*dgram)) ;
969
970         INIT_LIST_HEAD(&dgram->gndg_list);
971         dgram->gndg_state = GNILND_DGRAM_USED;
972         dgram->gndg_type = type;
973         dgram->gndg_magic = GNILND_DGRAM_MAGIC;
974
975         atomic_inc(&dev->gnd_ndgrams);
976
977         CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
978                 " %d\n",
979                 sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
980                 atomic_read(&dev->gnd_ndgrams));
981
982         *dgramp = dgram;
983         return 0;
984 }
985
/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
 * returns < 0 on dgram to be cleaned up
 * > 0 on dgram that isn't done yet
 * == 0 on dgram that is ok and needs connreq processing */
int
kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
{
        int rc = 0;

        switch (post_state) {
        case GNI_POST_COMPLETED:
                /* normal state for dgrams that need actual processing */
                /* GOTO to avoid processing dgram as canceled/done */
                GOTO(process_out, rc);

        case GNI_POST_PENDING:
                /* we should only see this if we are testing a WC dgram after a
                 * cancel - it means that it needs a full cycle of waiting
                 * for kgni_sm_task to finish moving it to TERMINATED */
                LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
                          (dgram->gndg_state == GNILND_DGRAM_CANCELED),
                         "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
                         dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
                         dgram->gndg_state, kgnilnd_dgram_state2str(dgram));

                /* positive RC as this dgram isn't done yet */
                rc = EINPROGRESS;

                /* GOTO as this isn't done yet - skip the canceled/terminated
                 * cleanup below just like the COMPLETED case */
                GOTO(process_out, rc);
                break;

        case GNI_POST_TERMINATED:
                /* we've called cancel and it is done or remote guy called cancel and
                 * we've receved it on a WC dgram */
#if 0
                /* we are seeing weird terminations on non WC dgrams when we have not
                 * canceled them */

                LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
                         dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
                        "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
                        dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
                        libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
#endif

                CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
                       dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");

                /* falls through to the cleanup tail below */
                rc =  -ECANCELED;
                break;

        case GNI_POST_TIMEOUT:
                /* we could have a timeout on a wildcard dgram too - if
                 * we got the incoming request but the remote node beefed
                 * before kgni could send the match data back. We'll just error
                 * on the active case and bail out gracefully */
                if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                        CNETERR("hardware timeout for connect to "
                               "%s after %lu seconds. Is node dead?\n",
                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                               cfs_duration_sec(jiffies - dgram->gndg_post_time));
                }

                rc = -ETIMEDOUT;
                break;

        default:
                /* any other post_state means our dgram bookkeeping is hosed */
                CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
                LBUG();
        }

        /* now finish cleaning up a dgram that is canceled/terminated and needs to
         * go away */

        /* If this was actively canceled, drop the count now that we are processing */
        if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
                atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                /* caller responsible for gndg_list removal */
        }

process_out:

        RETURN(rc);
}
1071
/* needs dev->gnd_dgram_lock held
 *
 * Cancel a POSTED dgram. Moves it to CANCELED and issues
 * postdata_cancel_by_id; for wildcard (WC_REQ) dgrams it additionally
 * runs test_by_id so an immediately-canceled WC can be marked DONE here.
 * Non-WC dgrams stay on the nid list until kgni delivers the
 * GNI_POST_TERMINATED event via probe. */
void
kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
{
        gni_return_t            grc;

        /* only POSTED dgrams have an outstanding post to cancel */
        if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
                return;
        }

        LASSERTF(dgram->gndg_conn != NULL,
                 "dgram 0x%p with NULL conn\n", dgram);

        /* C.E - WC dgrams could be canceled immediately but
         * if there was some match pending, we need to call
         * test_by_id to clear it out. If that test returns
         * POST_PENDING, it is half done and needs to go along
         * with the rest of dgrams and go through a kgni_sm_task cycle
         * and deliver a GNI_POST_TERMINATED event before they
         * are actually canceled */

        dgram->gndg_state = GNILND_DGRAM_CANCELED;

        if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
                /* we don't need to cancel_by_id if the datagram was good */
                return;
        }

        /* let folks know there are outstanding cancels */
        atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
        /* leave on nid list until cancel is done for debugging fun */
        grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);

        /* if we don't get success here, we have hosed up the dgram tracking
         * code and need to bail out */
        LASSERTF(grc == GNI_RC_SUCCESS,
                 "postdata_cancel returned %d for conn 0x%p to %s\n",
                 grc, dgram->gndg_conn,
                 dgram->gndg_conn->gnc_peer ?
                  libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
                  : "<?>");

        CDEBUG(D_NETTRACE,
                "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
                dgram, dgram->gndg_conn,
                dgram->gndg_conn->gnc_ephandle);

        if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
                gni_post_state_t         post_state;
                int                      rc = 0;
                __u32                    remote_addr = 0, remote_id = 0;

                /* probe the canceled WC post to see whether it died
                 * immediately or must ride out a kgni_sm_task cycle */
                grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
                                                     (__u64)dgram, &post_state,
                                                     &remote_addr, &remote_id);

                LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
                         "bad grc %d from test_by_id on dgram 0x%p\n",
                        grc, dgram);

                /* if WC was canceled immediately, we get NO_MATCH, if needs to go
                 * through full cycle, we get SUCCESS and need to parse post_state */

                CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
                        "remote_addr %u remote_id %u\n", grc, dgram,
                        kgnilnd_dgram_type2str(dgram),
                        post_state, remote_addr, remote_id);

                if (grc == GNI_RC_NO_MATCH) {
                        /* she's gone, reduce count and move along */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                        RETURN_EXIT;
                }

                rc = kgnilnd_process_dgram(dgram, post_state);

                if (rc <= 0) {
                        /* if for some weird reason we get a valid dgram back, just mark as done
                         * so we can drop it and move along.
                         * C.E - if it was completed, we'll just release the conn/mbox
                         * back into the pool and it'll get reused. That said, we should only
                         * be canceling a WC dgram on stack rest or shutdown, so that is moot */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);

                        /* caller context responsible for calling kgnilnd_release_dgram() */
                } else {
                        /* still pending, let it simmer until golden brown and delicious */
                }
        }

        /* for non WC dgrams, they are still on the nid list but marked canceled waiting
         * for kgni to return their ID to us via probe - that is when we'll complete their
         * cancel processing */
}
1168
1169 void
1170 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1171 {
1172         /* release the dgram ref on conn */
1173         if (dgram->gndg_conn) {
1174                 kgnilnd_conn_decref(dgram->gndg_conn);
1175                 dgram->gndg_conn = NULL;
1176         }
1177 }
1178
1179 void
1180 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1181 {
1182         LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1183                  dgram->gndg_state == GNILND_DGRAM_DONE,
1184                  "dgram 0x%p with bad state %s\n",
1185                  dgram, kgnilnd_dgram_state2str(dgram));
1186
1187         /* bit of poisoning to help detect bad driver data */
1188         dgram->gndg_magic = 0x6f5a6b5f;
1189         atomic_dec(&dev->gnd_ndgrams);
1190
1191         kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1192         CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
1193                " ndgrams %d\n",
1194                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
1195                atomic_read(&dev->gnd_ndgrams));
1196 }
1197
/* Allocate, pack and post a datagram to dstnid.
 *
 * \param dev      device to post on
 * \param dstnid   target NID, or LNET_NID_ANY for a wildcard listener
 * \param type     GNILND_CONNREQ_REQ or GNILND_CONNREQ_NAK
 * \param data_rc  errno carried in the payload for NAK dgrams
 *
 * On success the dgram is on the nid hash list in POSTED state; on any
 * failure the dgram (if allocated) is cleaned up and freed here.
 * Returns 0 or a negative errno. */
int
kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
                   int data_rc)
{
        int              rc = 0;
        kgn_dgram_t     *dgram = NULL;
        kgn_dgram_t     *tmpdgram;
        kgn_dgram_type_t dgtype;
        gni_return_t     grc;
        __u64            srcnid;
        ENTRY;

        /* map connreq type + target to the dgram bookkeeping type */
        switch (type) {
        case GNILND_CONNREQ_REQ:
                if (dstnid == LNET_NID_ANY)
                        dgtype = GNILND_DGRAM_WC_REQ;
                else
                        dgtype = GNILND_DGRAM_REQ;
                break;
        case GNILND_CONNREQ_NAK:
                LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
                dgtype = GNILND_DGRAM_NAK;
                break;
        default:
                CERROR("unknown connreq type %d\n", type);
                LBUG();
        }

        rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
        if (rc < 0) {
                rc = -ENOMEM;
                GOTO(post_failed, rc);
        }

        /* every dgram carries a conn that owns the EP and (for REQ) mbox */
        rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
        if (rc) {
                GOTO(post_failed, rc);
        }

        if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
                /* clear buffer for sanity on reuse of wildcard */
                memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
        }

        if (dstnid == LNET_NID_ANY) {
                /* set here to reset any dgram re-use */
                dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
        } else {
                __u32            host_id;

                /* resolve the peer NID to a NIC address for ep_bind */
                rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
                if (rc <= 0) {
                        rc = -ESRCH;
                        GOTO(post_failed, rc);
                }

                dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;

                /* don't need to serialize, there are no CQs for the dgram
                 * EP on the kgn_net_t */
                grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);

                if (grc != GNI_RC_SUCCESS) {
                        rc = -ECONNABORTED;
                        GOTO(post_failed, rc);
                }

        }

        /* If we are posting wildcards post using a net of 0, otherwise we'll use the
         * net of the destination node.
         */

        if (dstnid == LNET_NID_ANY) {
                srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
        } else {
                srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
        }

        rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
                                  srcnid, dstnid, type);
        if (rc) {
                GOTO(post_failed, rc);
        }

        if (type == GNILND_CONNREQ_NAK)
                dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;

        /* record post time for the GNI_POST_TIMEOUT diagnostic */
        dgram->gndg_post_time = jiffies;

        /* XXX Nic: here is where we'd add in logical network multiplexing */

        CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
               dgram, kgnilnd_dgram_type2str(dgram),
               libcfs_nid2str(srcnid),
               libcfs_nid2str(dstnid), dev->gnd_id);

        /* this allocates memory, can't hold locks across */
        grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
                                   &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
                                   &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
                                   (__u64)dgram);

        if (grc != GNI_RC_SUCCESS) {
                CNETERR("dropping failed dgram post id 0x%p type %s"
                        " reqtype %s to %s: rc %d\n",
                        dgram, kgnilnd_dgram_type2str(dgram),
                        kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
                        libcfs_nid2str(dstnid), grc);
                rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
                GOTO(post_failed, rc);
        }

        /* we don't need to add earlier - if someone does del_peer during post,
         * that peer will get marked as unlinked and the callers wil take care of it.
         * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
         * the completed dgram later when we cant find a peer to stuff it into */

        spin_lock(&dev->gnd_dgram_lock);

        /* make sure we are not double posting targeted dgrams
         * - we can multiple post WC dgrams to help with processing speed */
        if (dstnid != LNET_NID_ANY) {
                tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);

                LASSERTF(tmpdgram == NULL,
                        "dgram 0x%p->%s already posted\n",
                         dgram, libcfs_nid2str(dstnid));
        }

        /* unmunge dstnid to help processing code cope... */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
                dgram->gndg_conn_out.gncr_dstnid = dstnid;
        }

        list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
        dgram->gndg_state = GNILND_DGRAM_POSTED;
        spin_unlock(&dev->gnd_dgram_lock);

post_failed:
        /* single cleanup path - on any error, release the conn ref and
         * free the dgram we allocated above */
        if (rc < 0 && dgram != NULL) {
                kgnilnd_cleanup_dgram(dgram);
                kgnilnd_free_dgram(dev, dgram);
        }

        RETURN(rc);
}
1345
/* The shutdown flag is set from the shutdown and stack reset threads.
 *
 * Release a dgram back to the system: park the conn of a canceled active
 * dgram in purgatory (so its mailbox isn't reused), cancel any remaining
 * post, drop the conn ref, and either free the dgram or - for a canceled
 * one - leave it for the probe path to finish once kgni reports the
 * cancel complete. Also reposts wildcard listeners when appropriate. */
void
kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
{
        /* The conns of canceled active dgrams need to be put in purgatory so
         * we don't reuse the mailbox */
        if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
                kgn_peer_t *peer;
                kgn_conn_t *conn = dgram->gndg_conn;
                lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;

                dgram->gndg_state = GNILND_DGRAM_DONE;

                /* During shutdown we've already removed the peer so we don't
                 * need to add a peer. During stack reset we don't care about
                 * MDDs since they are all released. */
                if (!shutdown) {
                        write_lock(&kgnilnd_data.kgn_peer_conn_lock);
                        peer = kgnilnd_find_peer_locked(nid);

                        if (peer != NULL) {
                                CDEBUG(D_NET, "adding peer's conn with nid %s "
                                        "to purgatory\n", libcfs_nid2str(nid));
                                /* conn ref for the gnp_conns list entry */
                                kgnilnd_conn_addref(conn);
                                conn->gnc_peer = peer;
                                kgnilnd_peer_addref(peer);
                                kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
                                conn->gnc_state = GNILND_CONN_CLOSED;
                                list_add_tail(&conn->gnc_list,
                                              &peer->gnp_conns);
                                kgnilnd_add_purgatory_locked(conn,
                                                             conn->gnc_peer);
                                kgnilnd_schedule_conn(conn);
                        }
                        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                }
        }

        spin_lock(&dev->gnd_dgram_lock);
        kgnilnd_cancel_dgram_locked(dgram);
        spin_unlock(&dev->gnd_dgram_lock);

        /* drops the dgram's ref on its conn */
        kgnilnd_cleanup_dgram(dgram);

        /* if the dgram is 'canceled' it needs to wait until the event
         * comes up from kgni that tells us it is safe to release */
        if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
                dgram->gndg_state = GNILND_DGRAM_DONE;

                LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);

                /* if it is a wildcard and we are in an appropriate state, repost
                 * the wildcard */

                if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
                    (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
                        int     rerc;

                        rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
                        if (rerc != 0) {
                                /* We failed to repost the WC dgram for some reason
                                 * mark it so the repost system attempts to repost */
                                kgnilnd_admin_addref(dev->gnd_nwcdgrams);
                        }
                }

                /* always free the old dgram */
                kgnilnd_free_dgram(dev, dgram);
        }
}
1416
1417
/* Probe \a dev for a completed datagram; if one is ready, pull it off
 * the wire and off the device dgram list, transitioning it to
 * PROCESSING so nobody can cancel it underneath us.
 *
 * Returns 1 when a datagram was dequeued and its data is usable
 * (ownership passes to the caller via \a dgramp), 0 when nothing was
 * ready, or a negative errno when the completed datagram was bad - on
 * that path the dgram is released here and *dgramp is NOT written, so
 * callers must not blindly release on error (NOTE(review): verify each
 * call site handles this).
 */
int
kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
{
        kgn_dgram_t             *dgram = NULL;
        gni_post_state_t         post_state;
        gni_return_t             grc;
        int                      rc = 0;
        __u64                    readyid;
        __u32                    remote_addr = 0, remote_id = 0;
        ENTRY;

        /* Probe with the lock held. That way if we get a dgram we dont have it canceled
         * between finding the ready dgram and grabbing the lock to remove it from the
         * list. Otherwise we could be left in an inconsistent state. We own the dgram
         * once its off the list so we don't need to worry about others changing it at
         * that point. */
        spin_lock(&dev->gnd_dgram_lock);
        grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
        if (grc != GNI_RC_SUCCESS) {
                spin_unlock(&dev->gnd_dgram_lock);
                /* return 0 to indicate nothing happened */
                RETURN(0);
        }

        CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
                readyid, dev);

        /* the post id we used is the dgram's own address, so recover it */
        dgram = (kgn_dgram_t *)readyid;

        LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
                 "dgram 0x%p from id %#llx with bad magic %x\n",
                 dgram, readyid, dgram->gndg_magic);

        LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
                 dgram->gndg_state == GNILND_DGRAM_CANCELED,
                 "dgram 0x%p with bad state %s\n",
                 dgram, kgnilnd_dgram_state2str(dgram));

        LASSERTF(!list_empty(&dgram->gndg_list),
                 "dgram 0x%p with bad list state %s type %s\n",
                 dgram, kgnilnd_dgram_state2str(dgram),
                 kgnilnd_dgram_type2str(dgram));

        /* now we know that the datagram structure is ok, so pull off list */
        list_del_init(&dgram->gndg_list);

        /* while we have the gnd_dgram_lock and BEFORE we call test_by_id
         * change the state from POSTED to PROCESSING to ensure that
         * nobody cancels it after we've pulled it from the wire */
        if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
                dgram->gndg_state = GNILND_DGRAM_PROCESSING;
        }

        LASSERTF(dgram->gndg_conn != NULL,
                "dgram 0x%p with NULL conn\n", dgram);

        grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
                                             (__u64)dgram, &post_state,
                                             &remote_addr, &remote_id);

        /* we now "own" this datagram */
        spin_unlock(&dev->gnd_dgram_lock);

        LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
                 " id %llu was ready\n", readyid);

        CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
                "remote_addr %u remote_id %u\n", grc, dgram,
                kgnilnd_dgram_type2str(dgram),
                post_state, remote_addr, remote_id);

        if (unlikely(grc != GNI_RC_SUCCESS)) {
                CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
                        dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                        grc);
                rc = -EINVAL;
                GOTO(probe_for_out, rc);
        }

        rc = kgnilnd_process_dgram(dgram, post_state);

        /* we should never get probe finding a dgram for us and then it
         * being a WC dgram that is still in the middle of processing */
        LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
                 rc, dgram, post_state);

        if (rc == 0) {
                /* dgram is good enough for the data to be used */
                dgram->gndg_state = GNILND_DGRAM_PROCESSING;
                /* fake rc to mark that we've done something */
                rc = 1;
        } else {
                /* let kgnilnd_release_dgram take care of canceled dgrams */
                if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                }
        }

        *dgramp = dgram;
        RETURN(rc);

probe_for_out:

        kgnilnd_release_dgram(dev, dgram, 0);
        RETURN(rc);
}
1524
1525 int
1526 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1527 {
1528         /* if kgn_wildcard is zero, return error */
1529         int     rc = -ENOENT, i;
1530         ENTRY;
1531
1532         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1533                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1534                 if (rc < 0) {
1535                         CERROR("error %d: could not post wildcard datagram # %d\n",
1536                                 rc, i);
1537                         rc = -EINVAL;
1538                         GOTO(failed, rc);
1539                 }
1540         }
1541
1542 failed:
1543         RETURN(rc);
1544 }
1545
1546 int
1547 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1548 {
1549         kgn_dgram_t            *dg, *dgN;
1550         struct list_head        zombies;
1551         int                     i;
1552         ENTRY;
1553
1554         /* we want to cancel any outstanding dgrams - we don't want to rely
1555          * on del_peer_or_conn catching all of them. This helps protect us in cases
1556          * where we don't quite keep the peer->dgram mapping in sync due to some
1557          * race conditions */
1558
1559         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1560                  "called with LND invalid state: net shutdown %d "
1561                  "in reset %d\n", net->gnn_shutdown,
1562                  kgnilnd_data.kgn_in_reset);
1563
1564         INIT_LIST_HEAD(&zombies);
1565
1566         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1567
1568         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1569                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1570
1571                         /* skip nids not on our net or are wildcards */
1572
1573
1574                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1575                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1576                                 continue;
1577
1578                         kgnilnd_cancel_dgram_locked(dg);
1579                 }
1580         }
1581
1582         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1583
1584         RETURN(0);
1585 }
1586
1587 int
1588 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1589 {
1590         kgn_dgram_t *dg, *dgN;
1591         struct list_head zombies;
1592         ENTRY;
1593
1594         /* Time to kill the outstanding WC's
1595          * WC's exist on net 0 only but match on any net...
1596          */
1597
1598         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1599                 "called with LND invalid state: WC shutdown %d "
1600                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1601                 kgnilnd_data.kgn_in_reset);
1602
1603         INIT_LIST_HEAD(&zombies);
1604         spin_lock(&dev->gnd_dgram_lock);
1605
1606         do {
1607                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1608                 if (dg != NULL) {
1609                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1610                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1611                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1612                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1613
1614                         kgnilnd_cancel_dgram_locked(dg);
1615
1616                         /* WC could be DONE already, check and if so add to list to be released */
1617                         if (dg->gndg_state == GNILND_DGRAM_DONE) {
1618                                 list_del_init(&dg->gndg_list);
1619                                 list_add_tail(&dg->gndg_list, &zombies);
1620                         }
1621                 }
1622         } while (dg != NULL);
1623
1624         spin_unlock(&dev->gnd_dgram_lock);
1625
1626         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1627                 list_del_init(&dg->gndg_list);
1628                 kgnilnd_release_dgram(dev, dg, 1);
1629         }
1630         RETURN(0);
1631
1632 }
1633
1634 int
1635 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1636 {
1637         kgn_dgram_t *dg, *dgN;
1638         int i;
1639         ENTRY;
1640
1641         /* Cancel any outstanding non wildcard datagrams regardless
1642          * of which net they are on as we are in base shutdown and
1643          * dont care about connecting anymore.
1644          */
1645
1646         LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");
1647
1648         spin_lock(&dev->gnd_dgram_lock);
1649
1650         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size -1); i++) {
1651                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1652                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1653                                 kgnilnd_cancel_dgram_locked(dg);
1654                 }
1655         }
1656
1657         spin_unlock(&dev->gnd_dgram_lock);
1658
1659         RETURN(0);
1660 }
1661
1662
1663 void
1664 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1665 {
1666         int             i = 4;
1667         int             rc;
1668         gni_return_t    grc;
1669         __u64           readyid;
1670         kgn_dgram_t    *dgram;
1671
1672         /* use do while to get at least one check run to allow
1673          * regression test for 762072 to hit bug if there */
1674
1675         /* This function races with the dgram mover during shutdown so it is possible for
1676          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1677          * dgram mover thread instead of inside of this function.
1678          */
1679
1680         /* This should only be called from within shutdown, baseshutdown, or stack reset.
1681          * there are no assertions here to verify since base_shutdown has nothing in it we can check
1682          * the net is gone by then.
1683          */
1684
1685         do {
1686                 i++;
1687                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1688                         "Waiting for %d canceled datagrams to clear on device %d\n",
1689                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1690
1691                 /* check once a second */
1692                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1693                        250, &readyid);
1694
1695                 if (grc != GNI_RC_SUCCESS)
1696                         continue;
1697
1698                 CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
1699                         readyid, dev->gnd_id, dev);
1700
1701                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1702                 if (rc != 0) {
1703                         /* if we got a valid dgram or one that is now done, clean up */
1704                         kgnilnd_release_dgram(dev, dgram, 1);
1705                 }
1706         } while (atomic_read(&dev->gnd_canceled_dgrams));
1707 }
1708
/* Drive an active connect to \a peer: transition CONNECT -> POSTING ->
 * POSTED while posting a GNILND_CONNREQ_REQ datagram, rechecking at
 * each step whether the peer was unlinked or marked for death under us.
 *
 * Returns 0 on success, positive ESTALE when we lost a race (positive
 * so the caller skips dgram cleanup - the kgni GNI_POST_TERMINATED
 * event will finish it), or a negative errno if the post failed.
 */
int
kgnilnd_start_connect(kgn_peer_t *peer)
{
        int              rc = 0;
        /* sync point for kgnilnd_del_peer_locked - do an early check to
         * catch the most common hits where del_peer is done by the
         * time we get here */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
                while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
        }

        write_lock(&kgnilnd_data.kgn_peer_conn_lock);
        if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
                /* raced with peer getting unlinked */
                write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                rc = ESTALE;
                GOTO(out, rc);
        }
        peer->gnp_connecting = GNILND_PEER_POSTING;
        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

        /* timestamp the attempt before posting so the reaper sees it */
        set_mb(peer->gnp_last_dgram_time, jiffies);
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
                while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
        }

        /* fail_loc hook lets tests simulate a datagram post failure */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
                while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
                rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
        } else {
                rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
                                        peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
        }
        if (rc < 0) {
                set_mb(peer->gnp_last_dgram_errno, rc);
                GOTO(failed, rc);
        }

        /* while we're posting someone could have decided this peer/dgram needed to
         * die a quick death, so we check for state change and process accordingly */

        write_lock(&kgnilnd_data.kgn_peer_conn_lock);
        if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
                if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
                        peer->gnp_connecting = GNILND_PEER_KILL;
                }
                write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                /* positive RC to avoid dgram cleanup - we'll have to
                 * wait for the kgni GNI_POST_TERMINATED event to
                 * finish cleaning up */
                rc = ESTALE;
                kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
                GOTO(out, rc);
        }
        peer->gnp_connecting = GNILND_PEER_POSTED;
        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
        /* reaper thread will take care of any timeouts */
        CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
               libcfs_nid2str(peer->gnp_nid), rc);

        RETURN(rc);

failed:
        CDEBUG(D_NET, "connect to %s failed: rc %d \n",
               libcfs_nid2str(peer->gnp_nid), rc);
out:
        RETURN(rc);
}
1777
/* Complete the connection handshake carried by \a dgram: find or create
 * the peer for the remote nid, install the new conn in the peer's conn
 * list and the cqid hash, and flush any TXs queued waiting for a
 * connection.
 *
 * Called with no locks held (dgram mover level); takes
 * kgn_peer_conn_lock internally and holds it across the table updates.
 *
 * Returns 0 on success, positive ECANCELED/EALREADY for benign drops
 * (stale active connect, duplicate conn - no NAK wanted), or a negative
 * errno on real failure (peer allocation).
 */
int
kgnilnd_finish_connect(kgn_dgram_t *dgram)
{
        kgn_conn_t        *conn = dgram->gndg_conn;
        lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
        kgn_peer_t        *new_peer, *peer = NULL;
        kgn_tx_t          *tx;
        kgn_tx_t          *txn;
        kgn_mbox_info_t   *mbox;
        int                rc;
        int                nstale;

        /* try to find a peer that matches the nid we got in the connreq
         * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
         * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */

        /* assume this is a new peer  - it makes locking cleaner when it isn't */
        /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */

        rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_RCA_NODE_UP);
        if (rc != 0) {
                CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
                return rc;
        }

        write_lock(&kgnilnd_data.kgn_peer_conn_lock);

        /* this transfers ref from create_peer to the kgn_peer table */
        kgnilnd_add_peer_locked(her_nid, new_peer, &peer);

        /* if we found an existing peer, is it really ready for a new conn ? */
        if (peer != new_peer) {
                /* if this was an active connect attempt but we can't find a peer waiting for it
                 * we will dump in the trash */

                if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                        CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
                               libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
                        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                        /* positive rc: benign drop, caller must not NAK */
                        rc = ECANCELED;
                        GOTO(out, rc);
                }

                /* check to see if we can catch a connecting peer before it is
                 * removed from the connd_peers list - if not, we need to
                 * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
                if (peer->gnp_connecting != GNILND_PEER_IDLE) {
                        spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
                        if (!list_empty(&peer->gnp_connd_list)) {
                                list_del_init(&peer->gnp_connd_list);
                                /* drop connd ref */
                                kgnilnd_peer_decref(peer);
                        }
                        spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
                        /* clear rc to make sure we don't have fake error */
                        rc = 0;
                }

                /* no matter what, we are no longer waiting to connect this peer now */
                peer->gnp_connecting = GNILND_PEER_IDLE;

                /* Refuse to duplicate an existing connection (both sides might try to
                 * connect at once).  NB we return success!  We _are_ connected so we
                 * _don't_ have any blocked txs to complete with failure. */
                rc = kgnilnd_conn_isdup_locked(peer, conn);
                if (rc != 0) {
                        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                        CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
                              libcfs_nid2str(her_nid), rc);
                        rc = EALREADY;
                        GOTO(out, rc);
                }
        }

        /* a connreq from a node proves it is reachable again */
        if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
                CNETERR("Received connection request from down nid %s\n",
                        libcfs_nid2str(her_nid));
                peer->gnp_down = GNILND_RCA_NODE_UP;
        }

        /* NOTE(review): nstale is assigned but never read afterwards */
        nstale = kgnilnd_close_stale_conns_locked(peer, conn);

        /* either way with peer (new or existing), we are ok with ref counts here as the
         * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
         * ref for the peer table. */

        /* at this point, the connection request is a winner */

        /* mark 'DONE' to avoid cancel being called from release */
        dgram->gndg_state = GNILND_DGRAM_DONE;

        /* initialise timestamps before reaper looks at them */
        conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;

        /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
         * immediatly send a NOOP in the reaper thread during the call to
         * kgnilnd_check_conn_timeouts_locked
         */
        conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
        conn->gnc_state = GNILND_CONN_ESTABLISHED;

        /* save the dgram type used to establish this connection */
        conn->gnc_dgram_type = dgram->gndg_type;

        /* refs are not transferred from dgram to tables, so increment to
         * take ownership */
        kgnilnd_conn_addref(conn);
        kgnilnd_peer_addref(peer);
        conn->gnc_peer = peer;
        list_add_tail(&conn->gnc_list, &peer->gnp_conns);

        kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
        list_add_tail(&conn->gnc_hashlist,
                      kgnilnd_cqid2connlist(conn->gnc_cqid));
        kgnilnd_data.kgn_conn_version++;

        /* Dont send NOOP if fail_loc is set
         */
        if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
                tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
                if (tx == NULL) {
                        CNETERR("can't get TX to initiate NOOP to %s\n",
                                libcfs_nid2str(peer->gnp_nid));
                } else {
                        kgnilnd_queue_tx(conn, tx);
                }
        }

        /* Schedule all packets blocking for a connection */
        list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
                /* lock held here is the peer_conn lock */
                kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
                kgnilnd_queue_tx(conn, tx);
        }

        /* If this is an active connection lets mark its timestamp on the MBoX */
        if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
                /* conn->gnc_last_rx is jiffies it better exist as it was just set */
                mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
        }

        /* Bug 765042: wake up scheduler for a race with finish_connect and
         * complete_conn_closed with a conn in purgatory
         * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
         * we just check for set and then clear */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
                cfs_fail_loc = 0x0;
                /* get scheduler thread moving again */
                kgnilnd_schedule_device(conn->gnc_device);
        }

        CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
               conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);

        /* make sure we reset peer reconnect interval now that we have a good conn */
        kgnilnd_peer_alive(peer);
        peer->gnp_reconnect_interval = 0;

        /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
         * on the atomic forever
         */
        if (peer->gnp_pending_unlink) {
                peer->gnp_pending_unlink = 0;
                kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
                CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
        }

        /* add ref to make it hang around until after we drop the lock */
        kgnilnd_conn_addref(conn);

        /* Once the peer_conn lock is dropped, the conn could actually move into
         * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
         * lock until we are really done */
        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

        /* Notify LNET that we now have a working connection to this peer.
         * This is a Cray extension to the "standard" LND behavior. */
        lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
                     1, cfs_time_current());

        /* drop our 'hold' ref */
        kgnilnd_conn_decref(conn);

out:
        RETURN(rc);
}
1965
1966 void
1967 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1968 {
1969         int              rc = 0;
1970         ENTRY;
1971
1972         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1973
1974         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1975
1976         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1977
1978         if (rc < 0) {
1979                 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
1980         }
1981         EXIT;
1982 }
1983
/* Handle an incoming NAK datagram: the remote refused our connreq with
 * gnnd_errno.  Locate the peer and either close stale conns matching
 * the NAK's stamps (when we are no longer connecting) or cancel our
 * outstanding connect dgram.
 *
 * Returns >= 0 when handled (count of closed conns, or result of the
 * dgram cancel), -EBADSLT when the peer is unknown, or -EAGAIN
 * (negative to suppress error logging) when the NAK is for an attempt
 * we have already superseded with a new connect.
 */
int
kgnilnd_process_nak(kgn_dgram_t *dgram)
{
        kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
        lnet_nid_t         src_nid = connreq->gncr_srcnid;
        /* NOTE(review): local named 'errno' shadows the conventional C
         * identifier; harmless in kernel code but confusing to readers */
        int                errno = connreq->gncr_nakdata.gnnd_errno;
        kgn_peer_t        *peer;
        int                rc = 0;

        write_lock(&kgnilnd_data.kgn_peer_conn_lock);

        peer = kgnilnd_find_peer_locked(src_nid);
        if (peer == NULL) {
                /* we likely dropped him from bad data when we processed
                 * the original REQ */
                write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                return -EBADSLT;
        }

        /* need to check peerstamp/connstamp against the ones we find
         * to make sure we don't close new (and good?) conns that we
         * formed after this connreq failed */
        if (peer->gnp_connecting == GNILND_PEER_IDLE) {
                kgn_conn_t        conn;

                if (list_empty(&peer->gnp_conns)) {
                        /* assume already procced datagram and it barfed up
                         * on this side too */
                        CDEBUG(D_NET, "dropping NAK from %s; "
                               "peer %s is already not connected\n",
                                libcfs_nid2str(connreq->gncr_srcnid),
                                libcfs_nid2str(connreq->gncr_dstnid));
                        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                        return 0;
                }

                /* stub up a connection with the connreq XXX_stamps to allow
                 * use to use close_stale_conns_locked */
                conn.gnc_peerstamp = connreq->gncr_peerstamp;
                conn.gnc_my_connstamp = connreq->gncr_connstamp;
                conn.gnc_peer_connstamp = connreq->gncr_connstamp;
                conn.gnc_device = peer->gnp_net->gnn_dev;

                rc = kgnilnd_close_stale_conns_locked(peer, &conn);

                LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
                        "closed %d connections\n",
                        libcfs_nid2str(connreq->gncr_srcnid),
                        libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
        } else {
                spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);

                if (list_empty(&peer->gnp_connd_list)) {
                        /* if peer isn't on waiting list, try to find one to nuke */
                        rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
                                                           peer->gnp_nid);

                        if (rc) {
                                LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
                                        "canceled pending connect request\n",
                                        libcfs_nid2str(connreq->gncr_srcnid),
                                        libcfs_nid2str(connreq->gncr_dstnid), errno);
                        }

                        /* if we can't find a waiting dgram, we just drop the nak - the conn
                         * connect must have failed (didn't find conn above and clear connecting
                         * -- so nothing to do besides drop */
                } else {
                        /* peer is on list, meaning it is a new connect attempt from the one
                         * we started that generated the NAK - so just drop NAK */

                        /* use negative to prevent error message */
                        rc = -EAGAIN;
                }
                spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
        }

        /* success! we found a peer and at least marked pending_nak */
        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

        return rc;
}
2066
2067 int
2068 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2069 {
2070         int                      rc;
2071
2072         rc = kgnilnd_unpack_connreq(dgram);
2073         if (rc < 0) {
2074                 if (rc != -EBADF) {
2075                         /* only NAK if we have good srcnid to use */
2076                         *needs_nak = 1;
2077                 }
2078                 goto connreq_out;
2079         }
2080
2081         switch (dgram->gndg_conn_in.gncr_type) {
2082         case GNILND_CONNREQ_REQ:
2083                 /* wire up peer & conn, send queued TX */
2084                 rc = kgnilnd_finish_connect(dgram);
2085
2086                 /* don't nak when the nid is hosed */
2087                 if ((rc < 0)) {
2088                         *needs_nak = 1;
2089                 }
2090
2091                 break;
2092         case GNILND_CONNREQ_NAK:
2093                 rc = kgnilnd_process_nak(dgram);
2094                 /* return early to prevent reconnect bump */
2095                 return rc;
2096         default:
2097                 CERROR("unexpected connreq type %s (%d) from %s\n",
2098                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2099                         dgram->gndg_conn_in.gncr_type,
2100                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2101                 rc = -EINVAL;
2102                 *needs_nak = 1;
2103                 break;
2104         }
2105
2106 connreq_out:
2107         RETURN(rc);
2108 }
2109
2110 int
2111 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2112 {
2113         int                      rc;
2114         int                      needs_nak = 0;
2115         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2116         lnet_nid_t               orig_dstnid;
2117         kgn_dgram_t             *dgram = NULL;
2118         kgn_peer_t              *peer;
2119         ENTRY;
2120
2121         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2122                 rc = 0;
2123         } else {
2124                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2125         }
2126
2127         if (rc == 0) {
2128                 RETURN(0);
2129         } else if (rc < 0) {
2130                 GOTO(inform_peer, rc);
2131         } else {
2132                 /* rc > 1 means it did something, reset for this func  */
2133                 rc = 0;
2134         }
2135
2136         switch (dgram->gndg_type) {
2137         case GNILND_DGRAM_WC_REQ:
2138         case GNILND_DGRAM_REQ:
2139                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2140                 break;
2141         case GNILND_DGRAM_NAK:
2142                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2143                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2144                 break;
2145         default:
2146                 CERROR("unknown datagram type %s (%d)\n",
2147                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2148                 break;
2149         }
2150
2151         /* stash data to use after releasing current datagram */
2152         /* don't stash net - we are operating on a net already,
2153          * so the lock on rw_net_lock is sufficient */
2154
2155         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2156
2157 inform_peer:
2158         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2159
2160         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2161
2162         kgnilnd_release_dgram(dev, dgram, 0);
2163
2164         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2165                libcfs_nid2str(orig_dstnid), rc);
2166
2167         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2168          * in kgnilnd_finish_connect - if errors are from before we get to there,
2169          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2170         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2171                 /* if we have a negative rc, we want to find a peer to inform about
2172                  * the bad connection attempt. Sorry buddy, better luck next time! */
2173
2174                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2175                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2176
2177                 if (peer != NULL) {
2178                         /* add ref to make sure he stays around past the possible unlink
2179                          * so we can tell LNet about him */
2180                         kgnilnd_peer_addref(peer);
2181
2182                         /* if he still cares about the outstanding connect */
2183                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2184                                 /* check if he is on the connd list and remove.. */
2185                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2186                                 if (!list_empty(&peer->gnp_connd_list)) {
2187                                         list_del_init(&peer->gnp_connd_list);
2188                                         /* drop connd ref */
2189                                         kgnilnd_peer_decref(peer);
2190                                 }
2191                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2192
2193                                 /* clear gnp_connecting so we don't have a non-connecting peer
2194                                  * on gnd_connd_list */
2195                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2196
2197                                 set_mb(peer->gnp_last_dgram_errno, rc);
2198
2199                                 kgnilnd_peer_increase_reconnect_locked(peer);
2200                         }
2201                 }
2202                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2203
2204                 /* now that we are outside the lock, tell Mommy */
2205                 if (peer != NULL) {
2206                         kgnilnd_peer_notify(peer, rc, 0);
2207                         kgnilnd_peer_decref(peer);
2208                 }
2209         }
2210
2211         if (needs_nak) {
2212                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2213         }
2214
2215         RETURN(1);
2216 }
2217
/* Walk the device's datagram hash and cancel any posted (non-wildcard)
 * datagram that has been outstanding longer than the configured timeout.
 * Runs entirely under gnd_dgram_lock; called periodically from the dgram
 * mover thread. */
void
kgnilnd_reaper_dgram_check(kgn_device_t *dev)
{
        kgn_dgram_t    *dgram, *tmp;
        int             i;

        spin_lock(&dev->gnd_dgram_lock);

        /* NOTE(review): the upper bound is hash_size - 1, which skips the
         * last bucket unless the dgram hash function also reduces modulo
         * (size - 1) -- confirm against the nid-to-bucket hash helper */
        for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
                list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
                        unsigned long            now = jiffies;
                        unsigned long            timeout;

                        /* don't timeout stuff if the network is mucked or shutting down */
                        if (kgnilnd_check_hw_quiesce()) {
                                /* NOTE(review): this only exits the current
                                 * bucket; the outer loop proceeds to the next
                                 * bucket and re-checks quiesce there */
                                break;
                        }

                        /* only posted, non-wildcard datagrams can age out */
                        if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
                            (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
                                continue;
                        }
                        CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
                                "state %s conn 0x%p to %s age %lus\n",
                                dgram, kgnilnd_dgram_type2str(dgram),
                                kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                                cfs_duration_sec(now - dgram->gndg_post_time));

                        timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);

                        /* still within its grace period - leave it alone */
                        if (time_before(now, (dgram->gndg_post_time + timeout)))
                                continue;

                        CNETERR("%s datagram to %s timed out @ %lus dgram "
                                "0x%p state %s conn 0x%p\n",
                                kgnilnd_dgram_type2str(dgram),
                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                                cfs_duration_sec(now - dgram->gndg_post_time),
                                dgram, kgnilnd_dgram_state2str(dgram),
                                dgram->gndg_conn);

                        kgnilnd_cancel_dgram_locked(dgram);
                }
        }
        spin_unlock(&dev->gnd_dgram_lock);
}
2265
2266
/* use a thread for the possibly long-blocking wait_by_id to prevent
 * stalling the global workqueues */
/* Per-device kernel thread: blocks in the GNI postdata probe waiting for a
 * datagram completion, then wakes the dgram mover and sleeps on
 * gnd_dgping_waitq until the mover pings back.  Exits on kgn_shutdown.
 * @arg is the kgn_device_t for this thread; always returns 0. */
int
kgnilnd_dgram_waitq(void *arg)
{
        kgn_device_t     *dev = (kgn_device_t *) arg;
        char              name[16];
        gni_return_t      grc;
        __u64             readyid;
        DEFINE_WAIT(mover_done);

        /* NOTE(review): name is built but not visibly consumed in this
         * function -- presumably intended for thread naming; verify */
        snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
        cfs_block_allsigs();

        /* all gnilnd threads need to run fairly urgently */
        set_user_nice(current, *kgnilnd_tunables.kgn_nice);

        /* we dont shut down until the device shuts down ... */
        while (!kgnilnd_data.kgn_shutdown) {
                /* to quiesce or to not quiesce, that is the question */
                if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
                        KGNILND_SPIN_QUIESCE;
                }

                /* fault-injection hook: spin here while the pause is set */
                while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}

                /* check once a second */
                grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
                                                       1000, &readyid);

                if (grc == GNI_RC_SUCCESS) {
                        CDEBUG(D_INFO, "waking up dgram mover thread\n");
                        kgnilnd_schedule_dgram(dev);

                        /* wait for dgram thread to ping us before spinning again */
                        prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
                                        TASK_INTERRUPTIBLE);

                        /* don't sleep if we need to quiesce */
                        if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
                                schedule();
                        }
                        finish_wait(&dev->gnd_dgping_waitq, &mover_done);
                }
        }

        kgnilnd_thread_fini();
        return 0;
}
2316
/* Drain dev->gnd_connd_peers, attempting an active connect for each queued
 * peer until the list is empty or @deadline (jiffies) passes.  Returns the
 * number of connects effectively started (net of the -ENOMEM backout).
 * NOTE(review): uses RETURN() without a matching ENTRY -- confirm the
 * libcfs debug-nesting macros tolerate this pairing. */
int
kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
{
        int                      did_something = 0, rc;
        kgn_peer_t              *peer = NULL;

        spin_lock(&dev->gnd_connd_lock);

        /* Active connect - we added this in kgnilnd_launch_tx */
        while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
                peer = list_first_entry(&dev->gnd_connd_peers,
                                        kgn_peer_t, gnp_connd_list);

                /* ref for connd removed in if/else below */
               list_del_init(&peer->gnp_connd_list);

                /* gnp_connecting and membership on gnd_connd_peers should be
                 * done coherently to avoid double adding, etc */
                /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
                 * to get the peer to gnp_connecting in the first place. We just need to
                 * rely on gnd_connd_lock to serialize someone pulling him from the list
                 * BEFORE clearing gnp_connecting */
                LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
                         peer, libcfs_nid2str(peer->gnp_nid));

                /* drop the list lock while posting - start_connect can block */
                spin_unlock(&dev->gnd_connd_lock);

                CDEBUG(D_NET, "processing connect to %s\n",
                       libcfs_nid2str(peer->gnp_nid));

                did_something += 1;
                rc = kgnilnd_start_connect(peer);

                if (likely(rc >= 0)) {
                        /* 0 on success, positive on 'just drop peer' errors */
                        kgnilnd_peer_decref(peer);
                } else if (rc == -ENOMEM) {
                        /* if we are out of wildcards, add back to
                         * connd_list - then break out and we'll try later
                         * if other errors, we'll bail & cancel pending tx */
                        write_lock(&kgnilnd_data.kgn_peer_conn_lock);
                        if (peer->gnp_connecting == GNILND_PEER_POSTING) {
                                peer->gnp_connecting = GNILND_PEER_CONNECT;
                                spin_lock(&dev->gnd_connd_lock);
                                list_add_tail(&peer->gnp_connd_list,
                                              &dev->gnd_connd_peers);
                        } else {
                                /* connecting changed while we were posting */

                                LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
                                        " state 0x%p->%s, connecting %d\n",
                                        peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
                                peer->gnp_connecting = GNILND_PEER_KILL;
                                spin_lock(&dev->gnd_connd_lock);
                                /* remove the peer ref from the connd list */
                                kgnilnd_peer_decref(peer);
                                /* let the system handle itself */
                        }
                        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                        /* the datagrams are a global pool,
                         * so break out of trying and hope some free
                         * up soon */
                        did_something -= 1;
                        break;
                } else {
                        /* something bad happened, you lose */
                        CNETERR("could not start connecting to %s "
                                "rc %d: Will retry until TX timeout\n",
                               libcfs_nid2str(peer->gnp_nid), rc);
                        /* It didn't post so just set connecting back to zero now.
                         * The reaper will reattempt the connection if it needs too.
                         * If the peer needs death set it so the reaper will cleanup.
                         */
                        write_lock(&kgnilnd_data.kgn_peer_conn_lock);
                        if (peer->gnp_connecting == GNILND_PEER_POSTING) {
                                peer->gnp_connecting = GNILND_PEER_IDLE;
                                kgnilnd_peer_increase_reconnect_locked(peer);
                        } else {
                                LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
                                        " state 0x%p->%s, connecting %d\n",
                                        peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
                                peer->gnp_connecting = GNILND_PEER_KILL;
                        }
                        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

                        /* hold onto ref until we are really done - if it was
                         * unlinked this could result in a destroy */
                        kgnilnd_peer_decref(peer);
                }
                /* retake the list lock for the next iteration / final unlock */
                spin_lock(&dev->gnd_connd_lock);
        }

        spin_unlock(&dev->gnd_connd_lock);
        RETURN(did_something);
}
2412
2413 int
2414 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2415 {
2416         int did_something = 0, to_repost, i;
2417         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2418         ENTRY;
2419
2420         for (i = 0; i < to_repost; ++i) {
2421                 int     rerc;
2422                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2423                 if (rerc == 0) {
2424                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2425                         did_something += 1;
2426                 } else {
2427                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2428                                 rerc, dev->gnd_id);
2429                         break;
2430                 }
2431         }
2432
2433         RETURN(did_something);
2434 }
2435
2436 static void
2437 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2438 {
2439         int             dev_id = arg;
2440         kgn_device_t    *dev = &kgnilnd_data.kgn_devices[dev_id];
2441
2442         wake_up(&dev->gnd_dgram_waitq);
2443 }
2444
2445 /* use single thread for dgrams - should be sufficient for performance */
2446 int
2447 kgnilnd_dgram_mover(void *arg)
2448 {
2449         kgn_device_t            *dev = (kgn_device_t *)arg;
2450         char                     name[16];
2451         int                      rc, did_something;
2452         unsigned long            next_purge_check = jiffies - 1;
2453         unsigned long            timeout;
2454         struct timer_list        timer;
2455         unsigned long            deadline = 0;
2456         DEFINE_WAIT(wait);
2457
2458         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2459         cfs_block_allsigs();
2460         /* all gnilnd threads need to run fairly urgently */
2461         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2462
2463         /* we are ok not locking for these variables as the dgram waitq threads
2464          * will block both due to tying up net (kgn_shutdown) and the completion
2465          * event for the dgram_waitq (kgn_quiesce_trigger) */
2466         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2467         while (!kgnilnd_data.kgn_shutdown) {
2468                 /* Safe: kgn_shutdown only set when quiescent */
2469
2470                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2471                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2472                  * quiesce check so that it'll go right into that and not do any
2473                  * dgram mucking */
2474                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2475
2476                 /* to quiesce or to not quiesce, that is the question */
2477                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2478                         KGNILND_SPIN_QUIESCE;
2479                 }
2480                 did_something = 0;
2481
2482                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2483
2484                 /* process any newly completed dgrams */
2485                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2486
2487                 rc = kgnilnd_probe_and_process_dgram(dev);
2488                 if (rc > 0) {
2489                         did_something += rc;
2490                 }
2491
2492                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2493
2494                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2495                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2496                 /* start new outbound dgrams */
2497                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2498
2499                 /* find dead dgrams */
2500                 if (time_after_eq(jiffies, next_purge_check)) {
2501                         /* these don't need to be checked that often */
2502                         kgnilnd_reaper_dgram_check(dev);
2503
2504                         next_purge_check = (long) jiffies +
2505                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2506                 }
2507
2508                 did_something += kgnilnd_repost_wc_dgrams(dev);
2509
2510                 /* careful with the jiffy wrap... */
2511                 timeout = (long)(next_purge_check - jiffies);
2512
2513                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2514                        did_something, timeout, next_purge_check, jiffies);
2515
2516                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2517                         did_something = 0;
2518                         continue;
2519                 }
2520
2521                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2522
2523                 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2524                 mod_timer(&timer, (long) jiffies + timeout);
2525
2526                 /* last second chance for others to poke us */
2527                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2528
2529                 /* check flag variables before committing even if we
2530                  * did something; if we are after the deadline call
2531                  * schedule */
2532                 if ((!did_something || time_after(jiffies, deadline)) &&
2533                     !kgnilnd_data.kgn_shutdown &&
2534                     !kgnilnd_data.kgn_quiesce_trigger) {
2535                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2536                                timeout, cfs_duration_sec(timeout));
2537                         wake_up_all(&dev->gnd_dgping_waitq);
2538                         schedule();
2539                         CDEBUG(D_INFO, "awake after schedule\n");
2540                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2541                 }
2542
2543                 del_singleshot_timer_sync(&timer);
2544                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2545         }
2546
2547         kgnilnd_thread_fini();
2548         return 0;
2549 }