Whamcloud - gitweb
LU-3008 lnet: Update support for Cray's interconnects
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_conn.c
1 /*
2  * Copyright (C) 2012 Cray, Inc.
3  *
4  *   Author: Nic Henke <nic@cray.com>
5  *   Author: James Shimek <jshimek@cray.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "gnilnd.h"
25
26 void
27 kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
28 {
29         smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
30         smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
31         smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
32 }
33
34 int
35 kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
36 {
37         gni_return_t            rrc;
38         __u32                   flags = GNI_MEM_READWRITE;
39
40         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
41                 flags |= GNI_MEM_PHYS_CONT;
42         }
43
44         /* make sure we are mapping a clean block */
45         LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
46
47         rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
48                                    fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
49                                    flags, &fma_blk->gnm_hndl);
50         if (rrc != GNI_RC_SUCCESS) {
51                 /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
52                  * -- like when under MDD or GART pressure on big systems
53                  */
54                 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
55                         fma_blk, fma_blk->gnm_mbox_size, flags);
56                 RETURN(-ENOMEM);
57         }
58
59         /* PHYS_CONT memory isn't really mapped, at least not in GART -
60          *  but all mappings chew up a MDD
61          */
62         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
63                 atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
64         }
65
66         atomic_inc(&device->gnd_n_mdd);
67         /* nfmablk is live (mapped) blocks */
68         atomic_inc(&device->gnd_nfmablk);
69
70         RETURN(0);
71 }
72
73 int
74 kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
75 {
76         int                     rc = 0;
77         int                     num_mbox;
78         kgn_fma_memblock_t     *fma_blk;
79         gni_smsg_attr_t         smsg_attr;
80         unsigned long           fmablk_vers;
81
82         /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
83          * to this allocation code. Everyone will sample the version
84          * before and after getting the semaphore. If it has changed,
85          * we'll bail out to check the lists again - this indicates that
86          * some sort of change was made to the lists and it is possible
87          * that there is a mailbox for us to find now. This should prevent
88          * a ton of spinning in the case where there are lots of threads
89          * that need a yet-to-be-allocated mailbox for a connection. */
90
91         fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
92         down(&device->gnd_fmablk_sem);
93
94         if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
95                 /* version changed while we were waiting for semaphore,
96                  * we'll recheck the lists assuming something nice happened */
97                 up(&device->gnd_fmablk_sem);
98                 return 0;
99         }
100
101         LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
102         if (fma_blk == NULL) {
103                 CNETERR("could not allocate fma block descriptor\n");
104                 rc = -ENOMEM;
105                 GOTO(out, rc);
106         }
107
108         INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
109
110         kgnilnd_setup_smsg_attr(&smsg_attr);
111
112         gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
113
114         LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
115
116         /* gni_smsg_buff_size_needed calculates the base mailbox size and since
117          * we want to hold kgn_peer_credits worth of messages in both directions,
118          * we add PAYLOAD to grow the mailbox size
119          */
120
121         fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
122
123         /* we'll only use physical during preallocate at startup -- this keeps it nice and
124          * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
125          * as reallocating them is tough if there is memory fragmentation */
126
127         if (use_phys) {
128                 fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
129                 if (fma_blk->gnm_block == NULL) {
130                         CNETERR("could not allocate physical SMSG mailbox memory\n");
131                         rc = -ENOMEM;
132                         GOTO(free_desc, rc);
133                 }
134                 fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
135                 num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
136
137                 LASSERTF(num_mbox >= 1,
138                          "num_mbox %d blk_size %u mbox_size %d\n",
139                           num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
140
141                 fma_blk->gnm_state = GNILND_FMABLK_PHYS;
142
143         } else {
144                 num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
145                 fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
146
147                 LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
148                          "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
149                          num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
150                          *kgnilnd_tunables.kgn_mbox_per_block);
151
152                 LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
153                 if (fma_blk->gnm_block == NULL) {
154                         CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
155                         rc = -ENOMEM;
156                         GOTO(free_desc, rc);
157                 }
158
159                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
160         }
161
162         /* allocate just enough space for the bits to track the mailboxes */
163         LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
164         if (fma_blk->gnm_bit_array == NULL) {
165                 CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
166                        sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
167                 rc = -ENOMEM;
168                 GOTO(free_blk, rc);
169         }
170         bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
171
172         /* now that the num_mbox is set based on allocation type, get debug info setup */
173         LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
174         if (fma_blk->gnm_mbox_info == NULL) {
175                 CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
176                        sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
177                 rc = -ENOMEM;
178                 GOTO(free_bit, rc);
179         }
180
181         rc = kgnilnd_map_fmablk(device, fma_blk);
182         if (rc) {
183                 GOTO(free_info, rc);
184         }
185
186         fma_blk->gnm_next_avail_mbox = 0;
187         fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
188
189         CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
190                 "mbox_size %d MDD "LPX64"."LPX64"\n",
191                 fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
192                 fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
193                 fma_blk->gnm_hndl.qword2);
194
195         /* lock Is protecting data structures, not semaphore */
196
197         spin_lock(&device->gnd_fmablk_lock);
198         list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
199
200         /* toggle under the lock so once they change the list is also
201          * ready for others to traverse */
202         atomic_inc(&device->gnd_fmablk_vers);
203
204         spin_unlock(&device->gnd_fmablk_lock);
205
206         up(&device->gnd_fmablk_sem);
207
208         return 0;
209
210 free_info:
211         LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
212 free_bit:
213         LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
214 free_blk:
215         if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
216                 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
217         } else {
218                 cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
219         }
220 free_desc:
221         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
222 out:
223         up(&device->gnd_fmablk_sem);
224         return rc;
225 }
226
227 void
228 kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
229 {
230         gni_return_t            rrc;
231
232         /* if some held, set hold_timeout from conn timeouts used in this block
233          * but not during shutdown, then just nuke and pave */
234         if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
235                 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
236         }
237
238         /* we are changing the state of a block, tickle version to tell
239          * proc code list is stale now */
240         atomic_inc(&dev->gnd_fmablk_vers);
241
242         rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
243
244         CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
245                "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
246                 "hold_timeout %d\n",
247                fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
248                fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
249                fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
250                fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
251
252         LASSERTF(rrc == GNI_RC_SUCCESS,
253                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
254                 fma_blk, rrc);
255
256         if (fma_blk->gnm_hold_timeout) {
257                 atomic_inc(&dev->gnd_n_mdd_held);
258         } else {
259                 atomic_dec(&dev->gnd_n_mdd);
260         }
261
262         /* PHYS blocks don't get mapped */
263         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
264                 atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
265                 fma_blk->gnm_state = GNILND_FMABLK_IDLE;
266         } else if (kgnilnd_data.kgn_in_reset) {
267                 /* in stack reset, clear MDD handle for PHYS blocks, as we'll
268                  * re-use the fma_blk after reset so we don't have to drop/allocate
269                  * all of those physical blocks */
270                 fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
271         }
272
273         /* Decrement here as this is the # of mapped blocks */
274         atomic_dec(&dev->gnd_nfmablk);
275 }
276
277
278 /* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
279 void
280 kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
281 {
282         LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
283                  "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
284                  fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
285                 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
286
287         atomic_inc(&dev->gnd_fmablk_vers);
288
289         if (fma_blk->gnm_hold_timeout) {
290                 CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
291                         "mbox_size %d\n",
292                         fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
293                         fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
294
295                 /* We leave MDD dangling over stack reset */
296                 if (!kgnilnd_data.kgn_in_reset) {
297                         kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
298                 }
299                 /* ignoring the return code - if kgni/ghal can't find it
300                  * it must be released already */
301                 atomic_dec(&dev->gnd_n_mdd_held);
302                 atomic_dec(&dev->gnd_n_mdd);
303         }
304
305         /* we cant' free the gnm_block until all the conns have released their
306          * purgatory holds. While we have purgatory holds, we might check the conn
307          * RX mailbox during the CLOSING process. It is possible that kgni might
308          * try to look into the RX side for credits when sending the CLOSE msg too */
309         CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
310                 fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
311
312         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
313                 cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
314         } else {
315                 LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
316         }
317         fma_blk->gnm_state = GNILND_FMABLK_FREED;
318
319         list_del(&fma_blk->gnm_bufflist);
320
321         LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
322         LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
323         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
324 }
325
326 void
327 kgnilnd_find_free_mbox(kgn_conn_t *conn)
328 {
329         kgn_device_t            *dev = conn->gnc_device;
330         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
331         kgn_fma_memblock_t      *fma_blk;
332         kgn_mbox_info_t         *mbox = NULL;
333         int                     id;
334
335         spin_lock(&dev->gnd_fmablk_lock);
336
337         list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
338                             gnm_bufflist) {
339                 if (fma_blk->gnm_avail_mboxs <= 0 ||
340                     fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
341                         continue;
342                 }
343                 /* look in bitarray for available mailbox */
344                 do {
345                         id = find_next_zero_bit(
346                                 fma_blk->gnm_bit_array,
347                                 fma_blk->gnm_num_mboxs,
348                                 fma_blk->gnm_next_avail_mbox);
349                       if (id == fma_blk->gnm_num_mboxs &&
350                           fma_blk->gnm_next_avail_mbox != 0) {
351                                 /* wrap around */
352                                 fma_blk->gnm_next_avail_mbox = 0;
353                         } else {
354                                 break;
355                         }
356                 } while (1);
357
358                 LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
359                          id, fma_blk->gnm_num_mboxs);
360                 set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
361                 conn->gnc_mbox_id = id;
362
363                 fma_blk->gnm_next_avail_mbox =
364                         (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
365                 fma_blk->gnm_avail_mboxs--;
366                 conn->gnc_fma_blk = fma_blk;
367
368                 kgnilnd_setup_smsg_attr(smsg_attr);
369
370                 smsg_attr->msg_buffer = fma_blk->gnm_block;
371                 smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
372                 smsg_attr->mem_hndl = fma_blk->gnm_hndl;
373                 smsg_attr->buff_size = fma_blk->gnm_mbox_size;
374
375                 /* We'll set the hndl to zero for PHYS blocks unmapped during stack
376                  * reset and re-use the same fma_blk after stack reset. This ensures we've
377                  * properly mapped it before we use it */
378                 LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
379                          fma_blk, fma_blk->gnm_state);
380
381                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
382                         "allocating SMSG mbox %d buf %p "
383                         "offset %u hndl "LPX64"."LPX64"\n",
384                         conn, smsg_attr, fma_blk, id,
385                         smsg_attr->msg_buffer, smsg_attr->mbox_offset,
386                         fma_blk->gnm_hndl.qword1,
387                         fma_blk->gnm_hndl.qword2);
388
389                 mbox = &fma_blk->gnm_mbox_info[id];
390                 mbox->mbx_create_conn_memset = jiffies;
391                 mbox->mbx_nallocs++;
392                 mbox->mbx_nallocs_total++;
393
394                 /* zero mbox to remove any old data from our last use.
395                  * this better be safe, if not our purgatory timers
396                  * are too short or a peer really is misbehaving */
397                 memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
398                        0, smsg_attr->buff_size);
399                 break;
400         }
401
402         spin_unlock(&dev->gnd_fmablk_lock);
403 }
404
405 int
406 kgnilnd_setup_mbox(kgn_conn_t *conn)
407 {
408         gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
409         int                      err = 0;
410
411         smsg_attr->msg_buffer = NULL;
412         /* Look for available mbox */
413         do {
414                 kgnilnd_find_free_mbox(conn);
415
416                 /* nothing in the existing buffers, make a new one */
417                 if (smsg_attr->msg_buffer == NULL) {
418                         /* for runtime allocations, we only want vmalloc */
419                         err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
420                         if (err) {
421                                 break;
422                         }
423                 }
424         } while (smsg_attr->msg_buffer == NULL);
425
426         if (err)
427                 CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
428                         conn, err);
429         return err;
430 }
431
432 void
433 kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
434 {
435         kgn_device_t           *dev = conn->gnc_device;
436         gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
437         kgn_fma_memblock_t     *fma_blk = NULL;
438         kgn_mbox_info_t        *mbox = NULL;
439         int                     found = 0;
440         int                     id;
441
442         /* if we failed to setup mbox and now destroying conn */
443         if (smsg_attr->msg_buffer == NULL) {
444                 return;
445         }
446
447         id = conn->gnc_mbox_id;
448
449         spin_lock(&dev->gnd_fmablk_lock);
450         /* make sure our conn points at a valid fma_blk
451          * We use this instead of a mem block search out of smsg_attr
452          * because we could have freed a block for fma_blk #1 but the fma_blk
453          * is still in the list for a purgatory hold. This would induce a false
454          * match if that same block gets reallocated to fma_blk #2 */
455         list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
456                 if (fma_blk == conn->gnc_fma_blk) {
457                         found = 1;
458                         break;
459                 }
460         }
461         LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
462                  "anywhere in the world\n", conn, conn->gnc_fma_blk);
463
464         LASSERTF(id < fma_blk->gnm_num_mboxs,
465                 "bad id %d max %d\n",
466                 id, fma_blk->gnm_num_mboxs);
467
468         /* < 0 - was held, now free it
469          * == 0 - just free it
470          * > 0 - hold it for now */
471         if (purgatory_hold == 0) {
472                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
473                         "hndl "LPX64"."LPX64"\n",
474                         conn, smsg_attr, fma_blk, id,
475                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
476                 fma_blk->gnm_avail_mboxs++;
477
478         } else if (purgatory_hold > 0) {
479                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
480                         "hndl "LPX64"."LPX64"\n",
481                         conn, smsg_attr, fma_blk, id,
482                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
483
484                 fma_blk->gnm_held_mboxs++;
485                 fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
486                                                 conn->gnc_timeout);
487         } else {
488                 CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
489                         "hndl "LPX64"."LPX64"\n",
490                         conn, smsg_attr, fma_blk, id,
491                         fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
492
493                 fma_blk->gnm_held_mboxs--;
494                 fma_blk->gnm_avail_mboxs++;
495         }
496
497         if (purgatory_hold <= 0) {
498                 /* if kgni is retransmitting, freeing the smsg block before the EP
499                  * is destroyed gets messy. Bug 768295. */
500                 LASSERTF(conn->gnc_ephandle == NULL,
501                          "can't release mbox before EP is nuked. conn 0x%p\n", conn);
502
503                 mbox = &fma_blk->gnm_mbox_info[id];
504                 mbox->mbx_release_from_purgatory = jiffies;
505
506                 /* clear conn gnc_fmablk if it is gone - this allows us to
507                  * not worry about state so much in kgnilnd_destroy_conn
508                  * and makes the guaranteed cleanup of the resources easier */
509                 LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
510                         "conn %p bit %d already cleared in fma_blk %p\n",
511                          conn, id, fma_blk);
512                 conn->gnc_fma_blk = NULL;
513                 mbox->mbx_nallocs--;
514         }
515
516         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
517                 CERROR("LBUGs in your future: forcibly marking fma_blk %p "
518                        "as mapped\n", fma_blk);
519                 fma_blk->gnm_state = GNILND_FMABLK_VIRT;
520         }
521
522         /* we don't release or unmap PHYS blocks as part of the normal cycle --
523          * those are controlled manually from startup/shutdown */
524         if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
525                 /* we can unmap once all are unused (held or avail)
526                  * but check hold_timeout to make sure we are not trying to double
527                  * unmap this buffer. If there was no hold_timeout set due to
528                  * held_mboxs, we'll free the mobx here shortly and won't have to
529                  * worry about catching a double free for a 'clean' fma_blk */
530                 if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
531                     (!fma_blk->gnm_hold_timeout)) {
532                         kgnilnd_unmap_fmablk(dev, fma_blk);
533                 }
534
535                 /* But we can only free once they are all avail */
536                 if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
537                     fma_blk->gnm_held_mboxs == 0) {
538                         /* all mailboxes are released, free fma_blk */
539                         kgnilnd_free_fmablk_locked(dev, fma_blk);
540                 }
541         }
542
543         spin_unlock(&dev->gnd_fmablk_lock);
544 }
545
546 int
547 kgnilnd_count_phys_mbox(kgn_device_t *device)
548 {
549         int                     i = 0;
550         kgn_fma_memblock_t     *fma_blk;
551
552         spin_lock(&device->gnd_fmablk_lock);
553
554         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
555                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
556                         i += fma_blk->gnm_num_mboxs;
557         }
558         spin_unlock(&device->gnd_fmablk_lock);
559
560         RETURN(i);
561 }
562
563 int
564 kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
565 {
566         int     rc;
567
568         while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
569
570                 rc = kgnilnd_alloc_fmablk(device, 1);
571                 if (rc) {
572                         CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
573                                 kgnilnd_count_phys_mbox(device), rc);
574                         RETURN(rc);
575                 }
576         }
577         RETURN(0);
578 }
579
580 int
581 kgnilnd_map_phys_fmablk(kgn_device_t *device)
582 {
583
584         int                     rc = 0;
585         kgn_fma_memblock_t     *fma_blk;
586
587         /* use sem to gate access to single thread, just in case */
588         down(&device->gnd_fmablk_sem);
589
590         spin_lock(&device->gnd_fmablk_lock);
591
592         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
593                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
594                         rc = kgnilnd_map_fmablk(device, fma_blk);
595                         if (rc)
596                                 break;
597         }
598         spin_unlock(&device->gnd_fmablk_lock);
599
600         up(&device->gnd_fmablk_sem);
601
602         RETURN(rc);
603 }
604
605 void
606 kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
607 {
608
609         kgn_fma_memblock_t      *fma_blk;
610
611         /* use sem to gate access to single thread, just in case */
612         down(&device->gnd_fmablk_sem);
613
614         spin_lock(&device->gnd_fmablk_lock);
615
616         list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
617                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
618                         kgnilnd_unmap_fmablk(device, fma_blk);
619         }
620         spin_unlock(&device->gnd_fmablk_lock);
621
622         up(&device->gnd_fmablk_sem);
623 }
624
625 void
626 kgnilnd_free_phys_fmablk(kgn_device_t *device)
627 {
628
629         kgn_fma_memblock_t      *fma_blk, *fma_blkN;
630
631         /* use sem to gate access to single thread, just in case */
632         down(&device->gnd_fmablk_sem);
633
634         spin_lock(&device->gnd_fmablk_lock);
635
636         list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
637                 if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
638                         kgnilnd_free_fmablk_locked(device, fma_blk);
639         }
640         spin_unlock(&device->gnd_fmablk_lock);
641
642         up(&device->gnd_fmablk_sem);
643 }
644
645 /* kgnilnd dgram nid->struct management */
646
647 static inline struct list_head *
648 kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
649 {
650         unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
651
652         RETURN(&dev->gnd_dgrams[hash]);
653 }
654
655
656 /* needs dev->gnd_dgram_lock held */
657 kgn_dgram_t *
658 kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
659 {
660         struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
661         kgn_dgram_t      *dgram;
662
663         list_for_each_entry(dgram, dgram_list, gndg_list) {
664
665                 /* if state > POSTED, we are already handling cancel/completion */
666                 if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
667                      dgram->gndg_state > GNILND_DGRAM_POSTED)
668                         continue;
669
670                 CDEBUG(D_NET, "got dgram [%p] -> %s\n",
671                        dgram, libcfs_nid2str(dst_nid));
672                 return dgram;
673         }
674         return NULL;
675 }
676
677 int
678 kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
679 {
680         kgn_dgram_t     *dgram;
681
682         spin_lock(&dev->gnd_dgram_lock);
683         dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
684
685         if (dgram) {
686                 kgnilnd_cancel_dgram_locked(dgram);
687         }
688         spin_unlock(&dev->gnd_dgram_lock);
689
690         RETURN(!!(dgram == NULL));
691 }
692
693 int
694 kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
695                      lnet_nid_t srcnid, lnet_nid_t dstnid,
696                      kgn_connreq_type_t type)
697 {
698         int err = 0;
699
700         /* ensure we haven't violated max datagram size */
701         CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
702
703         /* no need to zero out, we do that when allocating dgram */
704         connreq->gncr_magic     = GNILND_MSG_MAGIC;
705
706         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
707                 srcnid = 0xABADBABE;
708         } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
709                 dstnid = 0xDEFEC8ED;
710         }
711
712         connreq->gncr_srcnid    = srcnid;
713         connreq->gncr_dstnid    = dstnid;
714
715         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
716                 connreq->gncr_version = 99;
717         } else {
718                 connreq->gncr_version   = GNILND_CONNREQ_VERSION;
719         }
720         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
721                 connreq->gncr_type = 99;
722         } else {
723                 connreq->gncr_type      = type;
724         }
725         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
726                 connreq->gncr_peerstamp = 0;
727         } else {
728                 connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
729         }
730         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
731                 connreq->gncr_connstamp = 0;
732         } else {
733                 connreq->gncr_connstamp = conn->gnc_my_connstamp;
734         }
735         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
736                 connreq->gncr_timeout = 0;
737         } else {
738                 connreq->gncr_timeout   = conn->gnc_timeout;
739         }
740
741         /* the rest pack the data into the payload in other places */
742         if (type == GNILND_CONNREQ_REQ) {
743                 kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
744                 req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
745                 req_params->gnpr_cqid = conn->gnc_cqid;
746
747                 /* allocate mailbox for this connection */
748                 err = kgnilnd_setup_mbox(conn);
749                 if (err != 0) {
750                         CERROR("Failed to setup FMA mailbox (%d)\n", err);
751                 }
752                 req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
753         }
754
755         /* XXX Nic: TBD - checksum computation */
756
757         return err;
758 }
759
760 int
761 kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
762 {
763         kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
764         int                      swab, rc = 0;
765         kgn_net_t               *net;
766
767         /* the following fields must be handled in a backwards compatible
768          * manner to ensure we can always send and interpret NAKs */
769
770         if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
771             connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
772                 /* Unexpected magic! */
773                 CERROR("Unexpected magic %08x\n",
774                        connreq->gncr_magic);
775                 return -EBADF;
776         }
777
778         swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
779         if (swab) {
780                 __swab32s(&connreq->gncr_magic);
781                 __swab32s(&connreq->gncr_cksum);
782                 __swab16s(&connreq->gncr_type);
783                 __swab16s(&connreq->gncr_version);
784                 __swab32s(&connreq->gncr_timeout);
785                 __swab64s(&connreq->gncr_srcnid);
786                 __swab64s(&connreq->gncr_dstnid);
787                 __swab64s(&connreq->gncr_peerstamp);
788                 __swab64s(&connreq->gncr_connstamp);
789         }
790
791         /* Do NOT return anything but -EBADF before we munge
792          * connreq->gncr_srcnid - we need that to send the nak */
793
794         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
795                 lnet_nid_t      incoming = connreq->gncr_srcnid;
796
797                 /* even if the incoming packet is hosed, we know who we sent
798                  * the original and can set the srcnid so that we can properly
799                  * look up our peer to close the loop on this connreq. We still use
800                  * -EBADF to prevent a NAK - just in case there are issues with
801                  * the payload coming from a random spot, etc. */
802                 connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
803
804                 if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
805                                 LNET_NIDADDR(incoming)) {
806                         /* we got a datagram match for the wrong nid... */
807                         CERROR("matched datagram 0x%p with srcnid %s "
808                                 "(%x), expecting %s (%x)\n",
809                                 dgram,
810                                 libcfs_nid2str(incoming),
811                                 LNET_NIDADDR(incoming),
812                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
813                                 LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
814                         return -EBADF;
815                 }
816         } else {
817                 /* if we have a wildcard datagram it should match an
818                  * incoming "active" datagram that should have a fully formed
819                  * srcnid and dstnid. If we couldn't unpack it, we drop as
820                  * corrupted packet, otherwise we'll just verify that the dstnid
821                  * matches the NID for the NET that the dgram was posted */
822
823                 /* make sure their wildcard didn't match ours, that is unpossible */
824                 LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
825                          "dgram 0x%p from %s, connreq 0x%p; "
826                          "wildcard matched wildcard \n", dgram,
827                          libcfs_nid2str(connreq->gncr_srcnid), connreq);
828
829                 rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
830
831                 if (rc == -ESHUTDOWN) {
832                         CERROR("Looking up network: device is in shutdown");
833                         return rc;
834                 } else if (rc == -ENONET) {
835                         CERROR("Connection data from %s: she sent "
836                         "dst_nid %s, but net lookup failed on "
837                         "dgram 0x%p@%s\n",
838                         libcfs_nid2str(connreq->gncr_srcnid),
839                         libcfs_nid2str(connreq->gncr_dstnid),
840                         dgram, kgnilnd_dgram_type2str(dgram));
841                         return rc;
842                 }
843
844                 if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
845                         CERROR("Bad connection data from %s: she sent "
846                                "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
847                                libcfs_nid2str(connreq->gncr_srcnid),
848                                libcfs_nid2str(connreq->gncr_dstnid),
849                                libcfs_nid2str(net->gnn_ni->ni_nid),
850                                dgram, kgnilnd_dgram_type2str(dgram));
851                         kgnilnd_net_decref(net);
852                         return -EBADSLT;
853                 }
854
855                 /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
856                 kgnilnd_net_decref(net);
857         }
858
859         if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
860                 CERROR("Unexpected version %d\n", connreq->gncr_version);
861                 return -EPROTO;
862         }
863
864         /* XXX Nic: TBD - checksum validation */
865         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
866                 return -EBADF;
867         }
868
869         if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
870                 __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
871
872                 __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
873                 __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
874                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
875                 __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
876                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
877                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
878                 __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
879                 __swab64s(&msg_addr);
880                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
881                 __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
882         } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
883                 __swab32s(&connreq->gncr_nakdata.gnnd_errno);
884         }
885
886         /* since we use a unique instance ID for each network, the driver
887          * will take care of dropping datagrams if we don't have that network.
888          */
889
890         /* few more idiot software or configuration checks */
891
892         switch (connreq->gncr_type) {
893         case GNILND_CONNREQ_REQ:
894                 /* wire up EP and SMSG block - this will check the incoming data
895                  * and barf a NAK back if need to */
896                 rc = kgnilnd_set_conn_params(dgram);
897                 if (rc)
898                         return rc;
899                 break;
900         case GNILND_CONNREQ_NAK:
901         case GNILND_CONNREQ_CLOSE:
902                 break;
903         default:
904                 CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
905                 return -EPROTO;
906         }
907
908         if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
909                 CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
910                 connreq->gncr_peerstamp, connreq->gncr_connstamp);
911                 return -EPROTO;
912         }
913
914         if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
915                 CERROR("Received timeout %d < MIN %d\n",
916                        connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
917                 return -EPROTO;
918         }
919
920         return 0;
921 }
922
923 int
924 kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
925 {
926         kgn_dgram_t         *dgram;
927
928         dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
929                                         CFS_ALLOC_ATOMIC);
930         if (dgram == NULL)
931                 return -ENOMEM;
932
933         /* cache alloc'd memory is not zeroed */
934         memset((void *)dgram, 0, sizeof(*dgram)) ;
935
936         INIT_LIST_HEAD(&dgram->gndg_list);
937         dgram->gndg_state = GNILND_DGRAM_USED;
938         dgram->gndg_type = type;
939         dgram->gndg_magic = GNILND_DGRAM_MAGIC;
940
941         atomic_inc(&dev->gnd_ndgrams);
942
943         CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
944                sizeof(*dgram), dgram);
945
946         *dgramp = dgram;
947         return 0;
948 }
949
950 /* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
951  * returns < 0 on dgram to be cleaned up
952  * > 0 on dgram that isn't done yet
953  * == 0 on dgram that is ok and needs connreq processing */
954 int
955 kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
956 {
957         int rc = 0;
958
959         switch (post_state) {
960         case GNI_POST_COMPLETED:
961                 /* normal state for dgrams that need actual processing */
962                 /* GOTO to avoid processing dgram as canceled/done */
963                 GOTO(process_out, rc);
964
965         case GNI_POST_PENDING:
966                 /* we should only see this if we are testing a WC dgram after a
967                  * cancel - it means that it needs a full cycle of waiting
968                  * for kgni_sm_task to finish moving it to TERMINATED */
969                 LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
970                           (dgram->gndg_state == GNILND_DGRAM_CANCELED),
971                          "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
972                          dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
973                          dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
974
975                 /* positive RC as this dgram isn't done yet */
976                 rc = EINPROGRESS;
977
978                 /* GOTO as this isn't done yet */
979                 GOTO(process_out, rc);
980                 break;
981
982         case GNI_POST_TERMINATED:
983                 /* we've called cancel and it is done or remote guy called cancel and
984                  * we've receved it on a WC dgram */
985 #if 0
986                 /* we are seeing weird terminations on non WC dgrams when we have not
987                  * canceled them */
988
989                 LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
990                          dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
991                         "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
992                         dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
993                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
994 #endif
995
996                 CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
997                        dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");
998
999                 rc =  -ECANCELED;
1000                 break;
1001
1002         case GNI_POST_TIMEOUT:
1003                 /* we could have a timeout on a wildcard dgram too - if
1004                  * we got the incoming request but the remote node beefed
1005                  * before kgni could send the match data back. We'll just error
1006                  * on the active case and bail out gracefully */
1007                 if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1008                         CNETERR("hardware timeout for connect to "
1009                                "%s after %lu seconds. Is node dead?\n",
1010                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1011                                cfs_duration_sec(jiffies - dgram->gndg_post_time));
1012                 }
1013
1014                 rc = -ETIMEDOUT;
1015                 break;
1016
1017         default:
1018                 CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
1019                 LBUG();
1020         }
1021
1022         /* now finish cleaning up a dgram that is canceled/terminated and needs to
1023          * go away */
1024
1025         /* If this was actively canceled, drop the count now that we are processing */
1026         if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
1027                 atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1028                 /* caller responsible for gndg_list removal */
1029         }
1030
1031 process_out:
1032
1033         RETURN(rc);
1034 }
1035
1036 /* needs dev->gnd_dgram_lock held */
1037 void
1038 kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
1039 {
1040         gni_return_t            grc;
1041
1042         if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
1043                 return;
1044         }
1045
1046         LASSERTF(dgram->gndg_conn != NULL,
1047                  "dgram 0x%p with NULL conn\n", dgram);
1048
1049         /* C.E - WC dgrams could be canceled immediately but
1050          * if there was some match pending, we need to call
1051          * test_by_id to clear it out. If that test returns
1052          * POST_PENDING, it is half done and needs to go along
1053          * with the rest of dgrams and go through a kgni_sm_task cycle
1054          * and deliver a GNI_POST_TERMINATED event before they
1055          * are actually canceled */
1056
1057         dgram->gndg_state = GNILND_DGRAM_CANCELED;
1058
1059         if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
1060                 /* we don't need to cancel_by_id if the datagram was good */
1061                 return;
1062         }
1063
1064         /* let folks know there are outstanding cancels */
1065         atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1066         /* leave on nid list until cancel is done for debugging fun */
1067         grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
1068
1069         /* if we don't get success here, we have hosed up the dgram tracking
1070          * code and need to bail out */
1071         LASSERTF(grc == GNI_RC_SUCCESS,
1072                  "postdata_cancel returned %d for conn 0x%p to %s\n",
1073                  grc, dgram->gndg_conn,
1074                  dgram->gndg_conn->gnc_peer ?
1075                   libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
1076                   : "<?>");
1077
1078         CDEBUG(D_NETTRACE,
1079                 "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
1080                 dgram, dgram->gndg_conn,
1081                 dgram->gndg_conn->gnc_ephandle);
1082
1083         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1084                 gni_post_state_t         post_state;
1085                 int                      rc = 0;
1086                 __u32                    remote_addr = 0, remote_id = 0;
1087
1088                 grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1089                                                      (__u64)dgram, &post_state,
1090                                                      &remote_addr, &remote_id);
1091
1092                 LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
1093                          "bad grc %d from test_by_id on dgram 0x%p\n",
1094                         grc, dgram);
1095
1096                 /* if WC was canceled immediately, we get NO_MATCH, if needs to go
1097                  * through full cycle, we get SUCCESS and need to parse post_state */
1098
1099                 CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1100                         "remote_addr %u remote_id %u\n", grc, dgram,
1101                         kgnilnd_dgram_type2str(dgram),
1102                         post_state, remote_addr, remote_id);
1103
1104                 if (grc == GNI_RC_NO_MATCH) {
1105                         /* she's gone, reduce count and move along */
1106                         dgram->gndg_state = GNILND_DGRAM_DONE;
1107                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1108                         RETURN_EXIT;
1109                 }
1110
1111                 rc = kgnilnd_process_dgram(dgram, post_state);
1112
1113                 if (rc <= 0) {
1114                         /* if for some weird reason we get a valid dgram back, just mark as done
1115                          * so we can drop it and move along.
1116                          * C.E - if it was completed, we'll just release the conn/mbox
1117                          * back into the pool and it'll get reused. That said, we should only
1118                          * be canceling a WC dgram on stack rest or shutdown, so that is moot */
1119                         dgram->gndg_state = GNILND_DGRAM_DONE;
1120                         atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
1121
1122                         /* caller context responsible for calling kgnilnd_release_dgram() */
1123                 } else {
1124                         /* still pending, let it simmer until golden brown and delicious */
1125                 }
1126         }
1127
1128         /* for non WC dgrams, they are still on the nid list but marked canceled waiting
1129          * for kgni to return their ID to us via probe - that is when we'll complete their
1130          * cancel processing */
1131 }
1132
1133 void
1134 kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
1135 {
1136         /* release the dgram ref on conn */
1137         if (dgram->gndg_conn) {
1138                 kgnilnd_conn_decref(dgram->gndg_conn);
1139                 dgram->gndg_conn = NULL;
1140         }
1141 }
1142
1143 void
1144 kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1145 {
1146         LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
1147                  dgram->gndg_state == GNILND_DGRAM_DONE,
1148                  "dgram 0x%p with bad state %s\n",
1149                  dgram, kgnilnd_dgram_state2str(dgram));
1150
1151         /* bit of poisoning to help detect bad driver data */
1152         dgram->gndg_magic = 0x6f5a6b5f;
1153         atomic_dec(&dev->gnd_ndgrams);
1154
1155         cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
1156         CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
1157                sizeof(*dgram), dgram);
1158 }
1159
1160 int
1161 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1162                    int data_rc)
1163 {
1164         int              rc = 0;
1165         kgn_dgram_t     *dgram = NULL;
1166         kgn_dgram_t     *tmpdgram;
1167         kgn_dgram_type_t dgtype;
1168         gni_return_t     grc;
1169         __u64            srcnid;
1170         ENTRY;
1171
1172         switch (type) {
1173         case GNILND_CONNREQ_REQ:
1174                 if (dstnid == LNET_NID_ANY)
1175                         dgtype = GNILND_DGRAM_WC_REQ;
1176                 else
1177                         dgtype = GNILND_DGRAM_REQ;
1178                 break;
1179         case GNILND_CONNREQ_NAK:
1180                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1181                 dgtype = GNILND_DGRAM_NAK;
1182                 break;
1183         default:
1184                 CERROR("unknown connreq type %d\n", type);
1185                 LBUG();
1186         }
1187
1188         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1189         if (rc < 0) {
1190                 rc = -ENOMEM;
1191                 GOTO(post_failed, rc);
1192         }
1193
1194         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1195         if (rc) {
1196                 GOTO(post_failed, rc);
1197         }
1198
1199         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1200                 /* clear buffer for sanity on reuse of wildcard */
1201                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1202         }
1203
1204         if (dstnid == LNET_NID_ANY) {
1205                 /* set here to reset any dgram re-use */
1206                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1207         } else {
1208                 __u32            host_id;
1209
1210                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1211                 if (rc <= 0) {
1212                         rc = -ESRCH;
1213                         GOTO(post_failed, rc);
1214                 }
1215
1216                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1217
1218                 /* don't need to serialize, there are no CQs for the dgram
1219                  * EP on the kgn_net_t */
1220                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1221
1222                 if (grc != GNI_RC_SUCCESS) {
1223                         rc = -ECONNABORTED;
1224                         GOTO(post_failed, rc);
1225                 }
1226
1227         }
1228
1229         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1230          * net of the destination node.
1231          */
1232
1233         if (dstnid == LNET_NID_ANY) {
1234                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1235         } else {
1236                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1237         }
1238
1239         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1240                                   srcnid, dstnid, type);
1241         if (rc) {
1242                 GOTO(post_failed, rc);
1243         }
1244
1245         if (type == GNILND_CONNREQ_NAK)
1246                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1247
1248         dgram->gndg_post_time = jiffies;
1249
1250         /* XXX Nic: here is where we'd add in logical network multiplexing */
1251
1252         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1253                dgram, kgnilnd_dgram_type2str(dgram),
1254                libcfs_nid2str(srcnid),
1255                libcfs_nid2str(dstnid), dev->gnd_id);
1256
1257         /* this allocates memory, can't hold locks across */
1258         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1259                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1260                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1261                                    (__u64)dgram);
1262
1263         if (grc != GNI_RC_SUCCESS) {
1264                 CNETERR("dropping failed dgram post id 0x%p type %s"
1265                         " reqtype %s to %s: rc %d\n",
1266                         dgram, kgnilnd_dgram_type2str(dgram),
1267                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1268                         libcfs_nid2str(dstnid), grc);
1269                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1270                 GOTO(post_failed, rc);
1271         }
1272
1273         /* we don't need to add earlier - if someone does del_peer during post,
1274          * that peer will get marked as unlinked and the callers wil take care of it.
1275          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1276          * the completed dgram later when we cant find a peer to stuff it into */
1277
1278         spin_lock(&dev->gnd_dgram_lock);
1279
1280         /* make sure we are not double posting targeted dgrams
1281          * - we can multiple post WC dgrams to help with processing speed */
1282         if (dstnid != LNET_NID_ANY) {
1283                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1284
1285                 LASSERTF(tmpdgram == NULL,
1286                         "dgram 0x%p->%s already posted\n",
1287                          dgram, libcfs_nid2str(dstnid));
1288         }
1289
1290         /* unmunge dstnid to help processing code cope... */
1291         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1292                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1293         }
1294
1295         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1296         dgram->gndg_state = GNILND_DGRAM_POSTED;
1297         spin_unlock(&dev->gnd_dgram_lock);
1298
1299 post_failed:
1300         if (rc < 0 && dgram != NULL) {
1301                 kgnilnd_cleanup_dgram(dgram);
1302                 kgnilnd_free_dgram(dev, dgram);
1303         }
1304
1305         RETURN(rc);
1306 }
1307
1308 void
1309 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
1310 {
1311         spin_lock(&dev->gnd_dgram_lock);
1312         kgnilnd_cancel_dgram_locked(dgram);
1313         spin_unlock(&dev->gnd_dgram_lock);
1314
1315         kgnilnd_cleanup_dgram(dgram);
1316
1317         /* if the dgram is 'canceled' it needs to be wait until the event
1318          * comes up from kgni that tells us it is safe to release */
1319         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1320                 dgram->gndg_state = GNILND_DGRAM_DONE;
1321
1322                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1323
1324                 /* if it is a wildcard and we are in an appropriate state, repost
1325                  * the wildcard */
1326
1327                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1328                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1329                         int     rerc;
1330
1331                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1332                         if (rerc != 0) {
1333                                 /* We failed to repost the WC dgram for some reason
1334                                  * mark it so the repost system attempts to repost */
1335                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1336                         }
1337                 }
1338
1339                 /* always free the old dgram */
1340                 kgnilnd_free_dgram(dev, dgram);
1341         }
1342 }
1343
1344
1345 int
1346 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1347 {
1348         kgn_dgram_t             *dgram = NULL;
1349         gni_post_state_t         post_state;
1350         gni_return_t             grc;
1351         int                      rc = 0;
1352         __u64                    readyid;
1353         __u32                    remote_addr = 0, remote_id = 0;
1354         ENTRY;
1355
1356         /* Probe with the lock held. That way if we get a dgram we dont have it canceled
1357          * between finding the ready dgram and grabbing the lock to remove it from the
1358          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1359          * once its off the list so we don't need to worry about others changing it at
1360          * that point. */
1361         spin_lock(&dev->gnd_dgram_lock);
1362         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1363         if (grc != GNI_RC_SUCCESS) {
1364                 spin_unlock(&dev->gnd_dgram_lock);
1365                 /* return 0 to indicate nothing happened */
1366                 RETURN(0);
1367         }
1368
1369         CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
1370                 readyid, dev);
1371
1372         dgram = (kgn_dgram_t *)readyid;
1373
1374         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1375                  "dgram 0x%p from id "LPX64" with bad magic %x\n",
1376                  dgram, readyid, dgram->gndg_magic);
1377
1378         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1379                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1380                  "dgram 0x%p with bad state %s\n",
1381                  dgram, kgnilnd_dgram_state2str(dgram));
1382
1383         LASSERTF(!list_empty(&dgram->gndg_list),
1384                  "dgram 0x%p with bad list state %s\n",
1385                  dgram, kgnilnd_dgram_state2str(dgram));
1386
1387         /* now we know that the datagram structure is ok, so pull off list */
1388         list_del_init(&dgram->gndg_list);
1389
1390         /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
1391          * change the state from POSTED to PROCESSING to ensure that
1392          * nobody cancels it after we've pulled it from the wire */
1393         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1394                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1395         }
1396
1397         spin_unlock(&dev->gnd_dgram_lock);
1398
1399         /* we now "own" this datagram */
1400
1401         LASSERTF(dgram->gndg_conn != NULL,
1402                 "dgram 0x%p with NULL conn\n", dgram);
1403
1404         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1405                                              (__u64)dgram, &post_state,
1406                                              &remote_addr, &remote_id);
1407
1408         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1409                  " id "LPU64" was ready\n", readyid);
1410
1411         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1412                 "remote_addr %u remote_id %u\n", grc, dgram,
1413                 kgnilnd_dgram_type2str(dgram),
1414                 post_state, remote_addr, remote_id);
1415
1416         if (unlikely(grc != GNI_RC_SUCCESS)) {
1417                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1418                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1419                         grc);
1420                 rc = -EINVAL;
1421                 GOTO(probe_for_out, rc);
1422         }
1423
1424         rc = kgnilnd_process_dgram(dgram, post_state);
1425
1426         /* we should never get probe finding a dgram for us and then it
1427          * being a WC dgram that is still in the middle of processing */
1428         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1429                  rc, dgram, post_state);
1430
1431         if (rc == 0) {
1432                 /* dgram is good enough for the data to be used */
1433                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1434                 /* fake rc to mark that we've done something */
1435                 rc = 1;
1436         } else {
1437                 /* bring out your dead! */
1438                 dgram->gndg_state = GNILND_DGRAM_DONE;
1439         }
1440
1441         *dgramp = dgram;
1442         RETURN(rc);
1443
1444 probe_for_out:
1445
1446         kgnilnd_release_dgram(dev, dgram);
1447         RETURN(rc);
1448 }
1449
1450 int
1451 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1452 {
1453         /* if kgn_wildcard is zero, return error */
1454         int     rc = -ENOENT, i;
1455         ENTRY;
1456
1457         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1458                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1459                 if (rc < 0) {
1460                         CERROR("error %d: could not post wildcard datagram # %d\n",
1461                                 rc, i);
1462                         rc = -EINVAL;
1463                         GOTO(failed, rc);
1464                 }
1465         }
1466
1467 failed:
1468         RETURN(rc);
1469 }
1470
1471 int
1472 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1473 {
1474         kgn_dgram_t            *dg, *dgN;
1475         struct list_head        zombies;
1476         int                     i;
1477         ENTRY;
1478
1479         /* we want to cancel any outstanding dgrams - we don't want to rely
1480          * on del_peer_or_conn catching all of them. This helps protect us in cases
1481          * where we don't quite keep the peer->dgram mapping in sync due to some
1482          * race conditions */
1483
1484         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1485                  "called with LND invalid state: net shutdown %d "
1486                  "in reset %d\n", net->gnn_shutdown,
1487                  kgnilnd_data.kgn_in_reset);
1488
1489         INIT_LIST_HEAD(&zombies);
1490
1491         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1492
1493         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1494                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1495
1496                         /* skip nids not on our net or are wildcards */
1497
1498
1499                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1500                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1501                                 continue;
1502
1503                         kgnilnd_cancel_dgram_locked(dg);
1504                 }
1505         }
1506
1507         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1508
1509         RETURN(0);
1510 }
1511
1512 int
1513 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1514 {
1515         kgn_dgram_t *dg, *dgN;
1516         struct list_head zombies;
1517         ENTRY;
1518
1519         /* Time to kill the outstanding WC's
1520          * WC's exist on net 0 only but match on any net...
1521          */
1522
1523         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1524                 "called with LND invalid state: WC shutdown %d "
1525                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1526                 kgnilnd_data.kgn_in_reset);
1527
1528         INIT_LIST_HEAD(&zombies);
1529         spin_lock(&dev->gnd_dgram_lock);
1530
1531         do {
1532                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1533                 if (dg != NULL) {
1534                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1535                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1536                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1537                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1538
1539                         kgnilnd_cancel_dgram_locked(dg);
1540
1541                         /* WC could be DONE already, check and if so add to list to be released */
1542                         if (dg->gndg_state == GNILND_DGRAM_DONE) {
1543                                 list_del_init(&dg->gndg_list);
1544                                 list_add_tail(&dg->gndg_list, &zombies);
1545                         }
1546                 }
1547         } while (dg != NULL);
1548
1549         spin_unlock(&dev->gnd_dgram_lock);
1550
1551         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1552                 list_del_init(&dg->gndg_list);
1553                 kgnilnd_release_dgram(dev, dg);
1554         }
1555         RETURN(0);
1556
1557 }
1558
1559 void
1560 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1561 {
1562         int             i = 4;
1563         int             rc;
1564         gni_return_t    grc;
1565         __u64           readyid;
1566         kgn_dgram_t    *dgram;
1567
1568         /* use do while to get at least one check run to allow
1569          * regression test for 762072 to hit bug if there */
1570
1571         /* This function races with the dgram mover during shutdown so it is possible for
1572          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1573          * dgram mover thread instead of inside of this function.
1574          */
1575
1576         /* This should only be called from within shutdown, baseshutdown, or stack reset.
1577          * there are no assertions here to verify since base_shutdown has nothing in it we can check
1578          * the net is gone by then.
1579          */
1580
1581         do {
1582                 i++;
1583                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1584                         "Waiting for %d canceled datagrams to clear on device %d\n",
1585                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1586
1587                 /* check once a second */
1588                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1589                        250, &readyid);
1590
1591                 if (grc != GNI_RC_SUCCESS)
1592                         continue;
1593
1594                 CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
1595                         readyid, dev->gnd_id, dev);
1596
1597                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1598                 if (rc != 0) {
1599                         /* if we got a valid dgram or one that is now done, clean up */
1600                         kgnilnd_release_dgram(dev, dgram);
1601                 }
1602         } while (atomic_read(&dev->gnd_canceled_dgrams));
1603 }
1604
1605 int
1606 kgnilnd_start_connect(kgn_peer_t *peer)
1607 {
1608         int              rc = 0;
1609         /* sync point for kgnilnd_del_peer_locked - do an early check to
1610          * catch the most common hits where del_peer is done by the
1611          * time we get here */
1612         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1613                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1614         }
1615
1616         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1617         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1618                 /* raced with peer getting unlinked */
1619                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1620                 rc = ESTALE;
1621                 GOTO(out, rc);
1622         }
1623         peer->gnp_connecting = GNILND_PEER_POSTING;
1624         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1625
1626         set_mb(peer->gnp_last_dgram_time, jiffies);
1627         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1628                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1629         }
1630
1631         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1632                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1633                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1634         } else {
1635                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1636                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1637         }
1638         if (rc < 0) {
1639                 set_mb(peer->gnp_last_dgram_errno, rc);
1640                 GOTO(failed, rc);
1641         }
1642
1643         /* while we're posting someone could have decided this peer/dgram needed to
1644          * die a quick death, so we check for state change and process accordingly */
1645
1646         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1647         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1648                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1649                         peer->gnp_connecting = GNILND_PEER_KILL;
1650                 }
1651                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1652                 /* positive RC to avoid dgram cleanup - we'll have to
1653                  * wait for the kgni GNI_POST_TERMINATED event to
1654                  * finish cleaning up */
1655                 rc = ESTALE;
1656                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1657                 GOTO(out, rc);
1658         }
1659         peer->gnp_connecting = GNILND_PEER_POSTED;
1660         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1661         /* reaper thread will take care of any timeouts */
1662         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1663                libcfs_nid2str(peer->gnp_nid), rc);
1664
1665         RETURN(rc);
1666
1667 failed:
1668         CDEBUG(D_NET, "connect to %s failed: rc %d \n",
1669                libcfs_nid2str(peer->gnp_nid), rc);
1670 out:
1671         RETURN(rc);
1672 }
1673
1674 int
1675 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1676 {
1677         kgn_conn_t        *conn = dgram->gndg_conn;
1678         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1679         kgn_peer_t        *new_peer, *peer = NULL;
1680         kgn_tx_t          *tx;
1681         kgn_tx_t          *txn;
1682         kgn_mbox_info_t   *mbox;
1683         int                rc;
1684         int                nstale;
1685
1686         /* try to find a peer that matches the nid we got in the connreq
1687          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1688          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1689
1690         /* assume this is a new peer  - it makes locking cleaner when it isn't */
1691         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1692
1693         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
1694         if (rc != 0) {
1695                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1696                 return rc;
1697         }
1698
1699         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1700
1701         /* this transfers ref from create_peer to the kgn_peer table */
1702         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1703
1704         /* if we found an existing peer, is it really ready for a new conn ? */
1705         if (peer != new_peer) {
1706                 /* if this was an active connect attempt but we can't find a peer waiting for it
1707                  * we will dump in the trash */
1708
1709                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1710                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1711                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1712                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1713                         rc = ECANCELED;
1714                         GOTO(out, rc);
1715                 }
1716
1717                 /* check to see if we can catch a connecting peer before it is
1718                  * removed from the connd_peers list - if not, we need to
1719                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1720                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1721                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1722                         if (!list_empty(&peer->gnp_connd_list)) {
1723                                 list_del_init(&peer->gnp_connd_list);
1724                                 /* drop connd ref */
1725                                 kgnilnd_peer_decref(peer);
1726                         }
1727                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1728                         /* clear rc to make sure we don't have fake error */
1729                         rc = 0;
1730                 }
1731
1732                 /* no matter what, we are no longer waiting to connect this peer now */
1733                 peer->gnp_connecting = GNILND_PEER_IDLE;
1734
1735                 /* Refuse to duplicate an existing connection (both sides might try to
1736                  * connect at once).  NB we return success!  We _are_ connected so we
1737                  * _don't_ have any blocked txs to complete with failure. */
1738                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1739                 if (rc != 0) {
1740                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1741                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1742                               libcfs_nid2str(her_nid), rc);
1743                         rc = EALREADY;
1744                         GOTO(out, rc);
1745                 }
1746         }
1747
1748         if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
1749                 CNETERR("Received connection request from %s that RCA thinks is"
1750                         " down.\n", libcfs_nid2str(her_nid));
1751                 peer->gnp_down = GNILND_RCA_NODE_UP;
1752         }
1753
1754         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1755
1756         /* either way with peer (new or existing), we are ok with ref counts here as the
1757          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1758          * ref for the peer table. */
1759
1760         /* at this point, the connection request is a winner */
1761
1762         /* mark 'DONE' to avoid cancel being called from release */
1763         dgram->gndg_state = GNILND_DGRAM_DONE;
1764
1765         /* initialise timestamps before reaper looks at them */
1766         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1767
1768         /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
1769          * immediatly send a NOOP in the reaper thread during the call to
1770          * kgnilnd_check_conn_timeouts_locked
1771          */
1772         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1773         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1774
1775         /* save the dgram type used to establish this connection */
1776         conn->gnc_dgram_type = dgram->gndg_type;
1777
1778         /* refs are not transferred from dgram to tables, so increment to
1779          * take ownership */
1780         kgnilnd_conn_addref(conn);
1781         kgnilnd_peer_addref(peer);
1782         conn->gnc_peer = peer;
1783         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1784
1785         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1786         list_add_tail(&conn->gnc_hashlist,
1787                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1788         kgnilnd_data.kgn_conn_version++;
1789
1790         /* Dont send NOOP if fail_loc is set
1791          */
1792         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1793                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1794                 if (tx == NULL) {
1795                         CNETERR("can't get TX to initiate NOOP to %s\n",
1796                                 libcfs_nid2str(peer->gnp_nid));
1797                 } else {
1798                         kgnilnd_queue_tx(conn, tx);
1799                 }
1800         }
1801
1802         /* Schedule all packets blocking for a connection */
1803         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1804                 /* lock held here is the peer_conn lock */
1805                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1806                 kgnilnd_queue_tx(conn, tx);
1807         }
1808
1809         /* If this is an active connection lets mark its timestamp on the MBoX */
1810         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1811                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1812                 /* conn->gnc_last_rx is jiffies it better exist as it was just set */
1813                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1814         }
1815
1816         /* Bug 765042: wake up scheduler for a race with finish_connect and
1817          * complete_conn_closed with a conn in purgatory
1818          * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1819          * we just check for set and then clear */
1820         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1821                 cfs_fail_loc = 0x0;
1822                 /* get scheduler thread moving again */
1823                 kgnilnd_schedule_device(conn->gnc_device);
1824         }
1825
1826         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1827                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1828
1829         /* make sure we reset peer reconnect interval now that we have a good conn */
1830         kgnilnd_peer_alive(peer);
1831         peer->gnp_reconnect_interval = 0;
1832
1833         /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait
1834          * on the atomic forever
1835          */
1836         if (peer->gnp_pending_unlink) {
1837                 peer->gnp_pending_unlink = 0;
1838                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1839                 CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
1840         }
1841
1842         /* add ref to make it hang around until after we drop the lock */
1843         kgnilnd_conn_addref(conn);
1844
1845         /* Once the peer_conn lock is dropped, the conn could actually move into
1846          * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
1847          * lock until we are really done */
1848         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1849
1850         /* Notify LNET that we now have a working connection to this peer.
1851          * This is a Cray extension to the "standard" LND behavior. */
1852         lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
1853                      1, cfs_time_current());
1854
1855         /* drop our 'hold' ref */
1856         kgnilnd_conn_decref(conn);
1857
1858 out:
1859         RETURN(rc);
1860 }
1861
1862 void
1863 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1864 {
1865         int              rc = 0;
1866         ENTRY;
1867
1868         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1869
1870         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1871
1872         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1873
1874         if (rc < 0) {
1875                 CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
1876         }
1877         EXIT;
1878 }
1879
1880 int
1881 kgnilnd_process_nak(kgn_dgram_t *dgram)
1882 {
1883         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1884         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1885         int                errno = connreq->gncr_nakdata.gnnd_errno;
1886         kgn_peer_t        *peer;
1887         int                rc = 0;
1888
1889         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1890
1891         peer = kgnilnd_find_peer_locked(src_nid);
1892         if (peer == NULL) {
1893                 /* we likely dropped him from bad data when we processed
1894                  * the original REQ */
1895                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1896                 return -EBADSLT;
1897         }
1898
1899         /* need to check peerstamp/connstamp against the ones we find
1900          * to make sure we don't close new (and good?) conns that we
1901          * formed after this connreq failed */
1902         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
1903                 kgn_conn_t        conn;
1904
1905                 if (list_empty(&peer->gnp_conns)) {
1906                         /* assume already procced datagram and it barfed up
1907                          * on this side too */
1908                         CDEBUG(D_NET, "dropping NAK from %s; "
1909                                "peer %s is already not connected\n",
1910                                 libcfs_nid2str(connreq->gncr_srcnid),
1911                                 libcfs_nid2str(connreq->gncr_dstnid));
1912                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1913                         return 0;
1914                 }
1915
1916                 /* stub up a connection with the connreq XXX_stamps to allow
1917                  * use to use close_stale_conns_locked */
1918                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
1919                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
1920                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
1921                 conn.gnc_device = peer->gnp_net->gnn_dev;
1922
1923                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
1924
1925                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1926                         "closed %d connections\n",
1927                         libcfs_nid2str(connreq->gncr_srcnid),
1928                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
1929         } else {
1930                 rc = 0;
1931                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
1932
1933                 if (list_empty(&peer->gnp_connd_list)) {
1934                         /* if peer isn't on waiting list, try to find one to nuke */
1935                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
1936                                                            peer->gnp_nid);
1937
1938                         if (rc) {
1939                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
1940                                         "canceled pending connect request\n",
1941                                         libcfs_nid2str(connreq->gncr_srcnid),
1942                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
1943                         }
1944
1945                         /* if we can't find a waiting dgram, we just drop the nak - the conn
1946                          * connect must have failed (didn't find conn above and clear connecting
1947                          * -- so nothing to do besides drop */
1948                 } else {
1949                         /* peer is on list, meaning it is a new connect attempt from the one
1950                          * we started that generated the NAK - so just drop NAK */
1951
1952                         /* use negative to prevent error message */
1953                         rc = -EAGAIN;
1954                 }
1955                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
1956         }
1957
1958         /* success! we found a peer and at least marked pending_nak */
1959         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1960
1961         return 0;
1962 }
1963
1964 int
1965 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
1966 {
1967         int                      rc;
1968
1969         rc = kgnilnd_unpack_connreq(dgram);
1970         if (rc < 0) {
1971                 if (rc != -EBADF) {
1972                         /* only NAK if we have good srcnid to use */
1973                         *needs_nak = 1;
1974                 }
1975                 goto connreq_out;
1976         }
1977
1978         switch (dgram->gndg_conn_in.gncr_type) {
1979         case GNILND_CONNREQ_REQ:
1980                 /* wire up peer & conn, send queued TX */
1981                 rc = kgnilnd_finish_connect(dgram);
1982
1983                 /* don't nak when the nid is hosed */
1984                 if ((rc < 0)) {
1985                         *needs_nak = 1;
1986                 }
1987
1988                 break;
1989         case GNILND_CONNREQ_NAK:
1990                 rc = kgnilnd_process_nak(dgram);
1991                 /* return early to prevent reconnect bump */
1992                 return rc;
1993         default:
1994                 CERROR("unexpected connreq type %s (%d) from %s\n",
1995                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
1996                         dgram->gndg_conn_in.gncr_type,
1997                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
1998                 rc = -EINVAL;
1999                 *needs_nak = 1;
2000                 break;
2001         }
2002
2003 connreq_out:
2004         RETURN(rc);
2005 }
2006
2007 int
2008 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2009 {
2010         int                      rc;
2011         int                      needs_nak = 0;
2012         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2013         lnet_nid_t               orig_dstnid;
2014         kgn_dgram_t             *dgram = NULL;
2015         kgn_peer_t              *peer;
2016         ENTRY;
2017
2018         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2019                 rc = 0;
2020         } else {
2021                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2022         }
2023
2024         if (rc == 0) {
2025                 RETURN(0);
2026         } else if (rc < 0) {
2027                 GOTO(inform_peer, rc);
2028         } else {
2029                 /* rc > 1 means it did something, reset for this func  */
2030                 rc = 0;
2031         }
2032
2033         switch (dgram->gndg_type) {
2034         case GNILND_DGRAM_WC_REQ:
2035         case GNILND_DGRAM_REQ:
2036                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2037                 break;
2038         case GNILND_DGRAM_NAK:
2039                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2040                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2041                 break;
2042         default:
2043                 CERROR("unknown datagram type %s (%d)\n",
2044                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2045                 break;
2046         }
2047
2048         /* stash data to use after releasing current datagram */
2049         /* don't stash net - we are operating on a net already,
2050          * so the lock on rw_net_lock is sufficient */
2051
2052         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2053
2054 inform_peer:
2055         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2056
2057         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2058
2059         kgnilnd_release_dgram(dev, dgram);
2060
2061         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2062                libcfs_nid2str(orig_dstnid), rc);
2063
2064         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2065          * in kgnilnd_finish_connect - if errors are from before we get to there,
2066          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2067         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2068                 /* if we have a negative rc, we want to find a peer to inform about
2069                  * the bad connection attempt. Sorry buddy, better luck next time! */
2070
2071                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2072                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2073
2074                 if (peer != NULL) {
2075                         /* add ref to make sure he stays around past the possible unlink
2076                          * so we can tell LNet about him */
2077                         kgnilnd_peer_addref(peer);
2078
2079                         /* if he still cares about the outstanding connect */
2080                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2081                                 /* check if he is on the connd list and remove.. */
2082                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2083                                 if (!list_empty(&peer->gnp_connd_list)) {
2084                                         list_del_init(&peer->gnp_connd_list);
2085                                         /* drop connd ref */
2086                                         kgnilnd_peer_decref(peer);
2087                                 }
2088                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2089
2090                                 /* clear gnp_connecting so we don't have a non-connecting peer
2091                                  * on gnd_connd_list */
2092                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2093
2094                                 set_mb(peer->gnp_last_dgram_errno, rc);
2095
2096                                 kgnilnd_peer_increase_reconnect_locked(peer);
2097                         }
2098                 }
2099                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2100
2101                 /* now that we are outside the lock, tell Mommy */
2102                 if (peer != NULL) {
2103                         kgnilnd_peer_notify(peer, rc);
2104                         kgnilnd_peer_decref(peer);
2105                 }
2106         }
2107
2108         if (needs_nak) {
2109                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2110         }
2111
2112         RETURN(1);
2113 }
2114
2115 void
2116 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2117 {
2118         kgn_dgram_t    *dgram, *tmp;
2119         int             i;
2120
2121         spin_lock(&dev->gnd_dgram_lock);
2122
2123         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2124                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2125                         unsigned long            now = jiffies;
2126                         unsigned long            timeout;
2127
2128                         /* don't timeout stuff if the network is mucked or shutting down */
2129                         if (kgnilnd_check_hw_quiesce()) {
2130                                 break;
2131                         }
2132
2133                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2134                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2135                                 continue;
2136                         }
2137                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2138                                 "state %s conn 0x%p to %s age %lus\n",
2139                                 dgram, kgnilnd_dgram_type2str(dgram),
2140                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2141                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2142                                 cfs_duration_sec(now - dgram->gndg_post_time));
2143
2144                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2145
2146                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2147                                 continue;
2148
2149                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2150                                 "0x%p state %s conn 0x%p\n",
2151                                 kgnilnd_dgram_type2str(dgram),
2152                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2153                                 cfs_duration_sec(now - dgram->gndg_post_time),
2154                                 dgram, kgnilnd_dgram_state2str(dgram),
2155                                 dgram->gndg_conn);
2156
2157                         kgnilnd_cancel_dgram_locked(dgram);
2158                 }
2159         }
2160         spin_unlock(&dev->gnd_dgram_lock);
2161 }
2162
2163
2164 /* use a thread for the possibly long-blocking wait_by_id to prevent
2165  * stalling the global workqueues */
2166 int
2167 kgnilnd_dgram_waitq(void *arg)
2168 {
2169         kgn_device_t     *dev = (kgn_device_t *) arg;
2170         char              name[16];
2171         gni_return_t      grc;
2172         __u64             readyid;
2173         DEFINE_WAIT(mover_done);
2174
2175         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2176         cfs_daemonize(name);
2177         cfs_block_allsigs();
2178
2179         /* all gnilnd threads need to run fairly urgently */
2180         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2181
2182         /* we dont shut down until the device shuts down ... */
2183         while (!kgnilnd_data.kgn_shutdown) {
2184                 /* to quiesce or to not quiesce, that is the question */
2185                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2186                         KGNILND_SPIN_QUIESCE;
2187                 }
2188
2189                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2190
2191                 /* check once a second */
2192                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2193                                                        1000, &readyid);
2194
2195                 if (grc == GNI_RC_SUCCESS) {
2196                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2197                         kgnilnd_schedule_dgram(dev);
2198
2199                         /* wait for dgram thread to ping us before spinning again */
2200                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2201                                         TASK_INTERRUPTIBLE);
2202
2203                         /* don't sleep if we need to quiesce */
2204                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2205                                 schedule();
2206                         }
2207                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2208                 }
2209         }
2210
2211         kgnilnd_thread_fini();
2212         return 0;
2213 }
2214
2215 int
2216 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2217 {
2218         int                      did_something = 0, rc;
2219         kgn_peer_t              *peer = NULL;
2220
2221         spin_lock(&dev->gnd_connd_lock);
2222
2223         /* Active connect - we added this in kgnilnd_launch_tx */
2224         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2225                 peer = list_first_entry(&dev->gnd_connd_peers,
2226                                         kgn_peer_t, gnp_connd_list);
2227
2228                 /* ref for connd removed in if/else below */
2229                list_del_init(&peer->gnp_connd_list);
2230
2231                 /* gnp_connecting and membership on gnd_connd_peers should be
2232                  * done coherently to avoid double adding, etc */
2233                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2234                  * to get the peer to gnp_connecting in the first place. We just need to
2235                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2236                  * BEFORE clearing gnp_connecting */
2237                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2238                          peer, libcfs_nid2str(peer->gnp_nid));
2239
2240                 spin_unlock(&dev->gnd_connd_lock);
2241
2242                 CDEBUG(D_NET, "processing connect to %s\n",
2243                        libcfs_nid2str(peer->gnp_nid));
2244
2245                 did_something += 1;
2246                 rc = kgnilnd_start_connect(peer);
2247
2248                 if (likely(rc >= 0)) {
2249                         /* 0 on success, positive on 'just drop peer' errors */
2250                         kgnilnd_peer_decref(peer);
2251                 } else if (rc == -ENOMEM) {
2252                         /* if we are out of wildcards, add back to
2253                          * connd_list - then break out and we'll try later
2254                          * if other errors, we'll bail & cancel pending tx */
2255                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2256                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2257                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2258                                 spin_lock(&dev->gnd_connd_lock);
2259                                 list_add_tail(&peer->gnp_connd_list,
2260                                               &dev->gnd_connd_peers);
2261                         } else {
2262                                 /* connecting changed while we were posting */
2263
2264                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2265                                         " state 0x%p->%s, connecting %d\n",
2266                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2267                                 peer->gnp_connecting = GNILND_PEER_KILL;
2268                                 spin_lock(&dev->gnd_connd_lock);
2269                                 /* remove the peer ref frrom the cond list */
2270                                 kgnilnd_peer_decref(peer);
2271                                 /* let the system handle itself */
2272                         }
2273                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2274                         /* the datagrams are a global pool,
2275                          * so break out of trying and hope some free
2276                          * up soon */
2277                         did_something -= 1;
2278                         break;
2279                 } else {
2280                         /* something bad happened, you lose */
2281                         CNETERR("could not start connecting to %s "
2282                                 "rc %d: Will retry until TX timeout\n",
2283                                libcfs_nid2str(peer->gnp_nid), rc);
2284                         /* It didnt post so just set connecting back to zero now.
2285                          * The reaper will reattempt the connection if it needs too.
2286                          * If the peer needs death set it so the reaper will cleanup.
2287                          */
2288                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2289                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2290                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2291                                 kgnilnd_peer_increase_reconnect_locked(peer);
2292                         } else {
2293                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2294                                         " state 0x%p->%s, connecting %d\n",
2295                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2296                                 peer->gnp_connecting = GNILND_PEER_KILL;
2297                         }
2298                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2299
2300                         /* hold onto ref until we are really done - if it was
2301                          * unlinked this could result in a destroy */
2302                         kgnilnd_peer_decref(peer);
2303                 }
2304                 spin_lock(&dev->gnd_connd_lock);
2305         }
2306
2307         spin_unlock(&dev->gnd_connd_lock);
2308         RETURN(did_something);
2309 }
2310
2311 int
2312 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2313 {
2314         int did_something = 0, to_repost, i;
2315         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2316         ENTRY;
2317
2318         for (i = 0; i < to_repost; ++i) {
2319                 int     rerc;
2320                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2321                 if (rerc == 0) {
2322                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2323                         did_something += 1;
2324                 } else {
2325                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2326                                 rerc, dev->gnd_id);
2327                         break;
2328                 }
2329         }
2330
2331         RETURN(did_something);
2332 }
2333
2334 static void
2335 kgnilnd_dgram_poke_with_stick(unsigned long arg)
2336 {
2337         int             dev_id = arg;
2338         kgn_device_t    *dev = &kgnilnd_data.kgn_devices[dev_id];
2339
2340         wake_up(&dev->gnd_dgram_waitq);
2341 }
2342
2343 /* use single thread for dgrams - should be sufficient for performance */
2344 int
2345 kgnilnd_dgram_mover(void *arg)
2346 {
2347         kgn_device_t            *dev = (kgn_device_t *)arg;
2348         char                     name[16];
2349         int                      rc, did_something;
2350         unsigned long            next_purge_check = jiffies - 1;
2351         unsigned long            timeout;
2352         struct timer_list        timer;
2353         unsigned long            deadline = 0;
2354         DEFINE_WAIT(wait);
2355
2356         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2357         cfs_daemonize(name);
2358         cfs_block_allsigs();
2359         /* all gnilnd threads need to run fairly urgently */
2360         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2361
2362         /* we are ok not locking for these variables as the dgram waitq threads
2363          * will block both due to tying up net (kgn_shutdown) and the completion
2364          * event for the dgram_waitq (kgn_quiesce_trigger) */
2365         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2366         while (!kgnilnd_data.kgn_shutdown) {
2367                 /* Safe: kgn_shutdown only set when quiescent */
2368
2369                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2370                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2371                  * quiesce check so that it'll go right into that and not do any
2372                  * dgram mucking */
2373                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2374
2375                 /* to quiesce or to not quiesce, that is the question */
2376                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2377                         KGNILND_SPIN_QUIESCE;
2378                 }
2379                 did_something = 0;
2380
2381                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2382
2383                 /* process any newly completed dgrams */
2384                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2385
2386                 rc = kgnilnd_probe_and_process_dgram(dev);
2387                 if (rc > 0) {
2388                         did_something += rc;
2389                 }
2390
2391                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2392
2393                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2394                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2395                 /* start new outbound dgrams */
2396                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2397
2398                 /* find dead dgrams */
2399                 if (time_after_eq(jiffies, next_purge_check)) {
2400                         /* these don't need to be checked that often */
2401                         kgnilnd_reaper_dgram_check(dev);
2402
2403                         next_purge_check = (long) jiffies +
2404                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2405                 }
2406
2407                 did_something += kgnilnd_repost_wc_dgrams(dev);
2408
2409                 /* careful with the jiffy wrap... */
2410                 timeout = (long)(next_purge_check - jiffies);
2411
2412                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2413                        did_something, timeout, next_purge_check, jiffies);
2414
2415                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2416                         did_something = 0;
2417                         continue;
2418                 }
2419
2420                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2421
2422                 setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
2423                 mod_timer(&timer, (long) jiffies + timeout);
2424
2425                 /* last second chance for others to poke us */
2426                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2427
2428                 /* check flag variables before comittingi even if we did something;
2429                  * if we are after the deadline call schedule */
2430                 if ((!did_something || time_after(jiffies, deadline)) &&
2431                     !kgnilnd_data.kgn_shutdown &&
2432                     !kgnilnd_data.kgn_quiesce_trigger) {
2433                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2434                                timeout, cfs_duration_sec(timeout));
2435                         wake_up_all(&dev->gnd_dgping_waitq);
2436                         schedule();
2437                         CDEBUG(D_INFO, "awake after schedule\n");
2438                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2439                 }
2440
2441                 del_singleshot_timer_sync(&timer);
2442                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2443         }
2444
2445         kgnilnd_thread_fini();
2446         return 0;
2447 }
2448