LU-14080 gnilnd: updates for SUSE 15 SP2
fs/lustre-release.git: lnet/klnds/gnilnd/gnilnd_conn.c
/*
 * Copyright (C) 2012 Cray, Inc.
 *
 * Copyright (c) 2014, Intel Corporation.
 *
 *   Author: Nic Henke <nic@cray.com>
 *   Author: James Shimek <jshimek@cray.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "gnilnd.h"
#include <linux/swap.h>

void
kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
{
        smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
        smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
        smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
}

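/* Register an FMA block's memory with the NIC so it can back SMSG
 * mailboxes. Registration failures are tolerated for up to
 * reg_fail_timeout seconds (tracked via the static reg_to stamp),
 * after which we LBUG - presumably because the node cannot make
 * progress without mailbox memory. */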
int
kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
{
        gni_return_t            rrc;
        __u32                   flags = GNI_MEM_READWRITE;
        static unsigned long    reg_to;
        int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;

        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                flags |= GNI_MEM_PHYS_CONT;
        }

        fma_blk->gnm_hold_timeout = 0;

        /* make sure we are mapping a clean block */
        LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);

        rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
                                   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
                                   flags, &fma_blk->gnm_hndl);
        if (rrc != GNI_RC_SUCCESS) {
                if (rfto != GNILND_REGFAILTO_DISABLE) {
                        if (reg_to == 0) {
                                reg_to = jiffies + cfs_time_seconds(rfto);
                        } else if (time_after(jiffies, reg_to)) {
                                CERROR("FATAL: fmablk registration has failed "
                                       "for %ld seconds.\n",
                                       cfs_duration_sec(jiffies - reg_to) +
                                                rfto);
                                LBUG();
                        }
                }

                CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
                        fma_blk, fma_blk->gnm_mbox_size, flags);
                RETURN(-ENOMEM);
        }

        reg_to = 0;

        /* PHYS_CONT memory isn't really mapped, at least not in GART -
         *  but all mappings chew up a MDD
         */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
        }

        atomic_inc(&device->gnd_n_mdd);
        /* nfmablk is live (mapped) blocks */
        atomic_inc(&device->gnd_nfmablk);

        RETURN(0);
}

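/* Allocate a new block of SMSG mailboxes. use_phys selects physically
 * contiguous memory from the mbox slab cache (startup preallocation
 * only); otherwise the block is vmalloc'd at runtime. Allocation is
 * single-threaded through gnd_fmablk_mutex, with gnd_fmablk_vers
 * sampled around the lock so waiters can detect that the block lists
 * changed while they slept and re-search them instead of allocating
 * yet another block. */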
int
kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
{
        int                     rc = 0;
        int                     num_mbox;
        kgn_fma_memblock_t     *fma_blk;
        gni_smsg_attr_t         smsg_attr;
        unsigned long           fmablk_vers;

#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
        /* We allocate large blocks of memory here, potentially leading
         * to memory exhaustion during massive reconnects during a network
         * outage. Limit the number of fma blocks in use by always keeping
         * a percentage of pages free, initially set to 25% of total memory. */
        if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
                LCONSOLE_INFO("Exceeding free page limit of %ld. "
                              "Free pages available %ld\n",
                              kgnilnd_data.free_pages_limit,
                              nr_free_pages());
                return -ENOMEM;
        }
#endif
        /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
         * to this allocation code. Everyone will sample the version
         * before and after getting the mutex. If it has changed,
         * we'll bail out to check the lists again - this indicates that
         * some sort of change was made to the lists and it is possible
         * that there is a mailbox for us to find now. This should prevent
         * a ton of spinning in the case where there are lots of threads
         * that need a yet-to-be-allocated mailbox for a connection. */

        fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
        mutex_lock(&device->gnd_fmablk_mutex);

        if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
                /* version changed while we were waiting for the mutex,
                 * we'll recheck the lists assuming something nice happened */
                mutex_unlock(&device->gnd_fmablk_mutex);
                return 0;
        }

        LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
        if (fma_blk == NULL) {
                CNETERR("could not allocate fma block descriptor\n");
                rc = -ENOMEM;
                GOTO(out, rc);
        }

        INIT_LIST_HEAD(&fma_blk->gnm_bufflist);

        kgnilnd_setup_smsg_attr(&smsg_attr);

        gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);

        LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);

        /* gni_smsg_buff_size_needed calculates the base mailbox size and since
         * we want to hold kgn_peer_credits worth of messages in both directions,
         * we add PAYLOAD to grow the mailbox size
         */

        fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;

        /* we'll only use physical during preallocate at startup -- this keeps it nice and
         * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
         * as reallocating them is tough if there is memory fragmentation */

        if (use_phys) {
                fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
                if (fma_blk->gnm_block == NULL) {
                        CNETERR("could not allocate physical SMSG mailbox memory\n");
                        rc = -ENOMEM;
                        GOTO(free_desc, rc);
                }
                fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
                num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;

                LASSERTF(num_mbox >= 1,
                         "num_mbox %d blk_size %u mbox_size %d\n",
                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);

                fma_blk->gnm_state = GNILND_FMABLK_PHYS;

        } else {
                num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
                fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;

                LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
                         "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
                         *kgnilnd_tunables.kgn_mbox_per_block);

                fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
                if (fma_blk->gnm_block == NULL) {
                        CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
                        rc = -ENOMEM;
                        GOTO(free_desc, rc);
                }

                fma_blk->gnm_state = GNILND_FMABLK_VIRT;
        }

        /* allocate just enough space for the bits to track the mailboxes */
        CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
        if (fma_blk->gnm_bit_array == NULL) {
                CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
                       sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
                rc = -ENOMEM;
                GOTO(free_blk, rc);
        }
        bitmap_zero(fma_blk->gnm_bit_array, num_mbox);

        /* now that the num_mbox is set based on allocation type, get debug
         * info setup */
        CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
        if (fma_blk->gnm_mbox_info == NULL) {
                CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
                       sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
                rc = -ENOMEM;
                GOTO(free_bit, rc);
        }

        rc = kgnilnd_map_fmablk(device, fma_blk);
        if (rc) {
                GOTO(free_info, rc);
        }

        fma_blk->gnm_next_avail_mbox = 0;
        fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;

        CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
                "mbox_size %d MDD %#llx.%#llx\n",
                fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
                fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
                fma_blk->gnm_hndl.qword2);

        /* this lock protects the data structures; the mutex above gates allocation */

        spin_lock(&device->gnd_fmablk_lock);
        list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);

        /* toggle under the lock so once they change the list is also
         * ready for others to traverse */
        atomic_inc(&device->gnd_fmablk_vers);

        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);

        return 0;

free_info:
        CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
free_bit:
        CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
free_blk:
        if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
                kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
        } else {
                kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
        }
free_desc:
        LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
        mutex_unlock(&device->gnd_fmablk_mutex);
        return rc;
}

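/* Deregister an FMA block. If any mailboxes are still held in
 * purgatory (or we are in a stack reset), the MDD is released with a
 * hold timeout so the same descriptor isn't handed back while a peer
 * could still be writing into the old mailboxes. */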
void
kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
        gni_return_t            rrc;

        /* if some held, set hold_timeout from conn timeouts used in this block
         * but not during shutdown, then just nuke and pave
         * During a stack reset, we need to deregister with a hold timeout
         * set so we don't use the same mdd after reset is complete */
        if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
            kgnilnd_data.kgn_in_reset) {
                fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
        }

        /* we are changing the state of a block, tickle version to tell
         * proc code list is stale now */
        atomic_inc(&dev->gnd_fmablk_vers);

        rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);

        CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
               "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
               "hold_timeout %d\n",
               fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
               fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
               fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
               fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);

        LASSERTF(rrc == GNI_RC_SUCCESS,
                "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
                fma_blk, rrc);

        if (fma_blk->gnm_hold_timeout &&
            !(kgnilnd_data.kgn_in_reset &&
              fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
                atomic_inc(&dev->gnd_n_mdd_held);
        } else {
                atomic_dec(&dev->gnd_n_mdd);
        }

        /* PHYS blocks don't get mapped */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
                fma_blk->gnm_state = GNILND_FMABLK_IDLE;
        } else if (kgnilnd_data.kgn_in_reset) {
                /* in stack reset, clear MDD handle for PHYS blocks, as we'll
                 * re-use the fma_blk after reset so we don't have to drop/allocate
                 * all of those physical blocks */
                fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
        }

        /* Decrement here as this is the # of mapped blocks */
        atomic_dec(&dev->gnd_nfmablk);
}


/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
void
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
        LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
                 "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
                 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
                 fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);

        atomic_inc(&dev->gnd_fmablk_vers);

        if (fma_blk->gnm_hold_timeout) {
                CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
                        "mbox_size %d\n",
                        fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
                        fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);

                /* We leave MDD dangling over stack reset */
                if (!kgnilnd_data.kgn_in_reset) {
                        kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
                }
                /* ignoring the return code - if kgni/ghal can't find it
                 * it must be released already */
                atomic_dec(&dev->gnd_n_mdd_held);
                atomic_dec(&dev->gnd_n_mdd);
        }

        /* we can't free the gnm_block until all the conns have released their
         * purgatory holds. While we have purgatory holds, we might check the conn
         * RX mailbox during the CLOSING process. It is possible that kgni might
         * try to look into the RX side for credits when sending the CLOSE msg too */
        CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
                fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);

        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
        } else {
                kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
        }
        fma_blk->gnm_state = GNILND_FMABLK_FREED;

        list_del(&fma_blk->gnm_bufflist);

        CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
        CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
                           BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
        LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}

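/* Scan the device's FMA blocks for a free mailbox and wire it into the
 * conn's SMSG attributes. Each block tracks free slots in a bitmap;
 * the search is a circular find_next_zero_bit starting at
 * gnm_next_avail_mbox so allocations rotate through the block instead
 * of always reusing the lowest slot. */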
void
kgnilnd_find_free_mbox(kgn_conn_t *conn)
{
        kgn_device_t            *dev = conn->gnc_device;
        gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
        kgn_fma_memblock_t      *fma_blk;
        kgn_mbox_info_t         *mbox = NULL;
        int                     id;

        spin_lock(&dev->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
                            gnm_bufflist) {
                if (fma_blk->gnm_avail_mboxs <= 0 ||
                    fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
                        continue;
                }
                /* look in bitarray for available mailbox */
                do {
                        id = find_next_zero_bit(
                                fma_blk->gnm_bit_array,
                                fma_blk->gnm_num_mboxs,
                                fma_blk->gnm_next_avail_mbox);
                        if (id == fma_blk->gnm_num_mboxs &&
                            fma_blk->gnm_next_avail_mbox != 0) {
                                /* wrap around */
                                fma_blk->gnm_next_avail_mbox = 0;
                        } else {
                                break;
                        }
                } while (1);

                LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
                         id, fma_blk->gnm_num_mboxs);
                set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
                conn->gnc_mbox_id = id;

                fma_blk->gnm_next_avail_mbox =
                        (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
                fma_blk->gnm_avail_mboxs--;
                conn->gnc_fma_blk = fma_blk;

                kgnilnd_setup_smsg_attr(smsg_attr);

                smsg_attr->msg_buffer = fma_blk->gnm_block;
                smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
                smsg_attr->mem_hndl = fma_blk->gnm_hndl;
                smsg_attr->buff_size = fma_blk->gnm_mbox_size;

                /* We'll set the hndl to zero for PHYS blocks unmapped during stack
                 * reset and re-use the same fma_blk after stack reset. This ensures we've
                 * properly mapped it before we use it */
                LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
                         fma_blk, fma_blk->gnm_state);

                CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
                        "allocating SMSG mbox %d buf %p "
                        "offset %u hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        smsg_attr->msg_buffer, smsg_attr->mbox_offset,
                        fma_blk->gnm_hndl.qword1,
                        fma_blk->gnm_hndl.qword2);

                mbox = &fma_blk->gnm_mbox_info[id];
                mbox->mbx_create_conn_memset = jiffies;
                mbox->mbx_nallocs++;
                mbox->mbx_nallocs_total++;

                /* zero mbox to remove any old data from our last use.
                 * this better be safe, if not our purgatory timers
                 * are too short or a peer really is misbehaving */
                memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
                       0, smsg_attr->buff_size);
                break;
        }

        spin_unlock(&dev->gnd_fmablk_lock);
}

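/* Attach an SMSG mailbox to a new conn: loop between searching the
 * existing blocks and allocating a fresh (vmalloc'd) block until a
 * mailbox turns up or allocation fails. */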
int
kgnilnd_setup_mbox(kgn_conn_t *conn)
{
        gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
        int                      err = 0;

        smsg_attr->msg_buffer = NULL;
        /* Look for available mbox */
        do {
                kgnilnd_find_free_mbox(conn);

                /* nothing in the existing buffers, make a new one */
                if (smsg_attr->msg_buffer == NULL) {
                        /* for runtime allocations, we only want vmalloc */
                        err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
                        if (err) {
                                break;
                        }
                }
        } while (smsg_attr->msg_buffer == NULL);

        if (err)
                CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
                        conn, err);
        return err;
}

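/* Return a conn's mailbox to its FMA block. purgatory_hold > 0 parks
 * the mailbox (a peer may still write into it until the hold expires),
 * == 0 frees it outright, and < 0 releases a previous hold. Once every
 * mailbox in a non-PHYS block is avail or held the block is unmapped,
 * and once all are avail it is freed. */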
void
kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
{
        kgn_device_t           *dev = conn->gnc_device;
        gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
        kgn_fma_memblock_t     *fma_blk = NULL;
        kgn_mbox_info_t        *mbox = NULL;
        int                     found = 0;
        int                     id;

        /* if we failed to setup mbox and now destroying conn */
        if (smsg_attr->msg_buffer == NULL) {
                return;
        }

        id = conn->gnc_mbox_id;

        spin_lock(&dev->gnd_fmablk_lock);
        /* make sure our conn points at a valid fma_blk
         * We use this instead of a mem block search out of smsg_attr
         * because we could have freed a block for fma_blk #1 but the fma_blk
         * is still in the list for a purgatory hold. This would induce a false
         * match if that same block gets reallocated to fma_blk #2 */
        list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk == conn->gnc_fma_blk) {
                        found = 1;
                        break;
                }
        }
        LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
                 "anywhere in the world\n", conn, conn->gnc_fma_blk);

        LASSERTF(id < fma_blk->gnm_num_mboxs,
                "bad id %d max %d\n",
                id, fma_blk->gnm_num_mboxs);

        /* < 0 - was held, now free it
         * == 0 - just free it
         * > 0 - hold it for now */
        if (purgatory_hold == 0) {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
                        "hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
                fma_blk->gnm_avail_mboxs++;

        } else if (purgatory_hold > 0) {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
                        "hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

                fma_blk->gnm_held_mboxs++;
                fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
                                                 conn->gnc_timeout);
        } else {
                CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
                        "hndl %#llx.%#llx\n",
                        conn, smsg_attr, fma_blk, id,
                        fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

                fma_blk->gnm_held_mboxs--;
                fma_blk->gnm_avail_mboxs++;
        }

        if (purgatory_hold <= 0) {
                /* if kgni is retransmitting, freeing the smsg block before the EP
                 * is destroyed gets messy. Bug 768295. */
                LASSERTF(conn->gnc_ephandle == NULL,
                         "can't release mbox before EP is nuked. conn 0x%p\n", conn);

                mbox = &fma_blk->gnm_mbox_info[id];
                mbox->mbx_release_from_purgatory = jiffies;

                /* clear conn gnc_fmablk if it is gone - this allows us to
                 * not worry about state so much in kgnilnd_destroy_conn
                 * and makes the guaranteed cleanup of the resources easier */
                LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
                        "conn %p bit %d already cleared in fma_blk %p\n",
                         conn, id, fma_blk);
                conn->gnc_fma_blk = NULL;
                mbox->mbx_nallocs--;
        }

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
                CERROR("LBUGs in your future: forcibly marking fma_blk %p "
                       "as mapped\n", fma_blk);
                fma_blk->gnm_state = GNILND_FMABLK_VIRT;
        }

        /* we don't release or unmap PHYS blocks as part of the normal cycle --
         * those are controlled manually from startup/shutdown */
        if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
                /* we can unmap once all are unused (held or avail)
                 * but check hold_timeout to make sure we are not trying to double
                 * unmap this buffer. If there was no hold_timeout set due to
                 * held_mboxs, we'll free the mbox here shortly and won't have to
                 * worry about catching a double free for a 'clean' fma_blk */
                if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
                    (!fma_blk->gnm_hold_timeout)) {
                        kgnilnd_unmap_fmablk(dev, fma_blk);
                }

                /* But we can only free once they are all avail */
                if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
                    fma_blk->gnm_held_mboxs == 0) {
                        /* all mailboxes are released, free fma_blk */
                        kgnilnd_free_fmablk_locked(dev, fma_blk);
                }
        }

        spin_unlock(&dev->gnd_fmablk_lock);
}

int
kgnilnd_count_phys_mbox(kgn_device_t *device)
{
        int                     i = 0;
        kgn_fma_memblock_t     *fma_blk;

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
                        i += fma_blk->gnm_num_mboxs;
        }
        spin_unlock(&device->gnd_fmablk_lock);

        RETURN(i);
}

int
kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
{
        int     rc;

        while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {

                rc = kgnilnd_alloc_fmablk(device, 1);
                if (rc) {
                        CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
                                kgnilnd_count_phys_mbox(device), rc);
                        RETURN(rc);
                }
        }
        RETURN(0);
}

int
kgnilnd_map_phys_fmablk(kgn_device_t *device)
{

        int                     rc = 0;
        kgn_fma_memblock_t     *fma_blk;

        /* use mutex to gate access to single thread, just in case */
        mutex_lock(&device->gnd_fmablk_mutex);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                        rc = kgnilnd_map_fmablk(device, fma_blk);
                        if (rc)
                                break;
                }
        }
        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);

        RETURN(rc);
}

void
kgnilnd_unmap_fma_blocks(kgn_device_t *device)
{

        kgn_fma_memblock_t      *fma_blk;

        /* use mutex to gate access to single thread, just in case */
        mutex_lock(&device->gnd_fmablk_mutex);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
                kgnilnd_unmap_fmablk(device, fma_blk);
        }
        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);
}

void
kgnilnd_free_phys_fmablk(kgn_device_t *device)
{

        kgn_fma_memblock_t      *fma_blk, *fma_blkN;

        /* use mutex to gate access to single thread, just in case */
        mutex_lock(&device->gnd_fmablk_mutex);

        spin_lock(&device->gnd_fmablk_lock);

        list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
                if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
                        kgnilnd_free_fmablk_locked(device, fma_blk);
        }
        spin_unlock(&device->gnd_fmablk_lock);

        mutex_unlock(&device->gnd_fmablk_mutex);
}

/* kgnilnd dgram nid->struct management */

static inline struct list_head *
kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
{
        unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;

        RETURN(&dev->gnd_dgrams[hash]);
}


/* needs dev->gnd_dgram_lock held */
kgn_dgram_t *
kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
{
        struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
        kgn_dgram_t      *dgram;

        list_for_each_entry(dgram, dgram_list, gndg_list) {

                /* if state > POSTED, we are already handling cancel/completion */
                if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
                     dgram->gndg_state > GNILND_DGRAM_POSTED)
                        continue;

                CDEBUG(D_NET, "got dgram [%p] -> %s\n",
                       dgram, libcfs_nid2str(dst_nid));
                return dgram;
        }
        return NULL;
}

int
kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
{
        kgn_dgram_t     *dgram;

        spin_lock(&dev->gnd_dgram_lock);
        dgram = kgnilnd_find_dgram_locked(dev, dst_nid);

        if (dgram) {
                kgnilnd_cancel_dgram_locked(dgram);
        }
        spin_unlock(&dev->gnd_dgram_lock);

        RETURN(!!(dgram == NULL));
}

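/* Fill in the on-wire connreq for an outgoing datagram. Most fields
 * come straight from the conn; the CFS_FAIL_CHECK branches are fault
 * injection hooks that deliberately corrupt the srcnid/dstnid or the
 * protocol fields so the peer's validation paths can be exercised. */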
int
kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
                     lnet_nid_t srcnid, lnet_nid_t dstnid,
                     kgn_connreq_type_t type)
{
        int err = 0;

        /* ensure we haven't violated max datagram size */
        BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);

        /* no need to zero out, we do that when allocating dgram */
        connreq->gncr_magic     = GNILND_MSG_MAGIC;

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
                srcnid = 0xABADBABE;
        } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
                dstnid = 0xDEFEC8ED;
        }

        connreq->gncr_srcnid    = srcnid;
        connreq->gncr_dstnid    = dstnid;

        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_version = 99;
        } else {
                connreq->gncr_version   = GNILND_CONNREQ_VERSION;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_type = 99;
        } else {
                connreq->gncr_type      = type;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_peerstamp = 0;
        } else {
                connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_connstamp = 0;
        } else {
                connreq->gncr_connstamp = conn->gnc_my_connstamp;
        }
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
                connreq->gncr_timeout = 0;
        } else {
                connreq->gncr_timeout   = conn->gnc_timeout;
        }

        /* the rest pack the data into the payload in other places */
        if (type == GNILND_CONNREQ_REQ) {
                kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
                req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
                req_params->gnpr_cqid = conn->gnc_cqid;

                /* allocate mailbox for this connection */
                err = kgnilnd_setup_mbox(conn);
                if (err != 0) {
                        CERROR("Failed to setup FMA mailbox (%d)\n", err);
                }
                req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
        }

        /* XXX Nic: TBD - checksum computation */

        return err;
}

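/* Validate and byte-swap an incoming connreq. The leading fields
 * (magic, type, version, nids, stamps) are handled in a backwards
 * compatible order so a NAK can always be generated: check magic,
 * swab the header if the peer has opposite endianness, repair srcnid
 * for targeted dgrams, then verify version and the type-specific
 * payload. */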
int
kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
{
        kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
        int                      swab, rc = 0;
        kgn_net_t               *net;

        /* the following fields must be handled in a backwards compatible
         * manner to ensure we can always send and interpret NAKs */

        if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
            connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
                /* Unexpected magic! */
                CERROR("Unexpected magic %08x\n",
                       connreq->gncr_magic);
                return -EBADF;
        }

        swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
        if (swab) {
                __swab32s(&connreq->gncr_magic);
                __swab32s(&connreq->gncr_cksum);
                __swab16s(&connreq->gncr_type);
                __swab16s(&connreq->gncr_version);
                __swab32s(&connreq->gncr_timeout);
                __swab64s(&connreq->gncr_srcnid);
                __swab64s(&connreq->gncr_dstnid);
                __swab64s(&connreq->gncr_peerstamp);
                __swab64s(&connreq->gncr_connstamp);
        }

        /* Do NOT return anything but -EBADF before we munge
         * connreq->gncr_srcnid - we need that to send the nak */

        if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                lnet_nid_t      incoming = connreq->gncr_srcnid;

                /* even if the incoming packet is hosed, we know who we sent
                 * the original and can set the srcnid so that we can properly
                 * look up our peer to close the loop on this connreq. We still use
                 * -EBADF to prevent a NAK - just in case there are issues with
                 * the payload coming from a random spot, etc. */
                connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;

                if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
                                LNET_NIDADDR(incoming)) {
                        /* we got a datagram match for the wrong nid... */
                        CERROR("matched datagram 0x%p with srcnid %s "
                                "(%x), expecting %s (%x)\n",
                                dgram,
                                libcfs_nid2str(incoming),
                                LNET_NIDADDR(incoming),
                                libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                                LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
                        return -EBADF;
                }
        } else {
                /* if we have a wildcard datagram it should match an
                 * incoming "active" datagram that should have a fully formed
                 * srcnid and dstnid. If we couldn't unpack it, we drop it as a
                 * corrupted packet, otherwise we'll just verify that the dstnid
                 * matches the NID for the NET that the dgram was posted on */

                /* make sure their wildcard didn't match ours, that is unpossible */
                LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
                         "dgram 0x%p from %s, connreq 0x%p; "
                         "wildcard matched wildcard\n", dgram,
                         libcfs_nid2str(connreq->gncr_srcnid), connreq);

                rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);

                if (rc == -ESHUTDOWN) {
                        CERROR("Looking up network: device is in shutdown\n");
                        return rc;
                } else if (rc == -ENONET) {
                        CERROR("Connection data from %s: she sent "
                               "dst_nid %s, but net lookup failed on "
                               "dgram 0x%p@%s\n",
                               libcfs_nid2str(connreq->gncr_srcnid),
                               libcfs_nid2str(connreq->gncr_dstnid),
                               dgram, kgnilnd_dgram_type2str(dgram));
                        return rc;
                }

                if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
                        CERROR("Bad connection data from %s: she sent "
                               "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
                               libcfs_nid2str(connreq->gncr_srcnid),
                               libcfs_nid2str(connreq->gncr_dstnid),
                               libcfs_nid2str(net->gnn_ni->ni_nid),
                               dgram, kgnilnd_dgram_type2str(dgram));
                        kgnilnd_net_decref(net);
                        return -EBADSLT;
                }

                /* kgnilnd_find_net takes a ref on the net it finds; you need to decref it when not needed. */
                kgnilnd_net_decref(net);
        }

        if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
                CERROR("Unexpected version %d\n", connreq->gncr_version);
                return -EPROTO;
        }

        /* XXX Nic: TBD - checksum validation */
        if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
                return -EBADF;
        }

        if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
                __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;

                __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
                __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
                __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
                __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
                __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
                __swab64s(&msg_addr);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
                __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
        } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
                __swab32s(&connreq->gncr_nakdata.gnnd_errno);
        }

        /* since we use a unique instance ID for each network, the driver
         * will take care of dropping datagrams if we don't have that network.
         */

        /* few more idiot software or configuration checks */

        switch (connreq->gncr_type) {
        case GNILND_CONNREQ_REQ:
                /* wire up EP and SMSG block - this will check the incoming data
                 * and barf a NAK back if need to */
                rc = kgnilnd_set_conn_params(dgram);
                if (rc)
                        return rc;
                break;
        case GNILND_CONNREQ_NAK:
        case GNILND_CONNREQ_CLOSE:
                break;
        default:
                CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
                return -EPROTO;
        }

        if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
                CERROR("Received bad timestamps peer %llu conn %llu\n",
                       connreq->gncr_peerstamp, connreq->gncr_connstamp);
                return -EPROTO;
        }

        if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
                CERROR("Received timeout %d < MIN %d\n",
                       connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
                return -EPROTO;
        }

        return 0;
}

int
kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
{
        kgn_dgram_t         *dgram;

        dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
        if (dgram == NULL)
                return -ENOMEM;

        INIT_LIST_HEAD(&dgram->gndg_list);
        dgram->gndg_state = GNILND_DGRAM_USED;
        dgram->gndg_type = type;
        dgram->gndg_magic = GNILND_DGRAM_MAGIC;

        atomic_inc(&dev->gnd_ndgrams);

        CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
                " %d\n",
                sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
                atomic_read(&dev->gnd_ndgrams));

        *dgramp = dgram;
        return 0;
}

/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
 * returns < 0 on dgram to be cleaned up
 * > 0 on dgram that isn't done yet
 * == 0 on dgram that is ok and needs connreq processing */
int
kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
{
        int rc = 0;

        switch (post_state) {
        case GNI_POST_COMPLETED:
                /* normal state for dgrams that need actual processing */
                /* GOTO to avoid processing dgram as canceled/done */
                GOTO(process_out, rc);

        case GNI_POST_PENDING:
                /* we should only see this if we are testing a WC dgram after a
                 * cancel - it means that it needs a full cycle of waiting
                 * for kgni_sm_task to finish moving it to TERMINATED */
                LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
                          (dgram->gndg_state == GNILND_DGRAM_CANCELED),
                         "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
                         dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
                         dgram->gndg_state, kgnilnd_dgram_state2str(dgram));

                /* positive RC as this dgram isn't done yet */
                rc = EINPROGRESS;

                /* GOTO as this isn't done yet */
                GOTO(process_out, rc);
                break;

        case GNI_POST_TERMINATED:
                /* we've called cancel and it is done or remote guy called cancel and
                 * we've received it on a WC dgram */
#if 0
                /* we are seeing weird terminations on non WC dgrams when we have not
                 * canceled them */

                LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
                         dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
                        "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
                        dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
                        libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
#endif

                CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
                       dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");

                rc = -ECANCELED;
                break;

        case GNI_POST_TIMEOUT:
                /* we could have a timeout on a wildcard dgram too - if
                 * we got the incoming request but the remote node beefed
                 * before kgni could send the match data back. We'll just error
                 * on the active case and bail out gracefully */
                if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
                        CNETERR("hardware timeout for connect to "
                               "%s after %lu seconds. Is node dead?\n",
                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
                               cfs_duration_sec(jiffies - dgram->gndg_post_time));
                }

                rc = -ETIMEDOUT;
                break;

        default:
                CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
                LBUG();
        }

        /* now finish cleaning up a dgram that is canceled/terminated and needs to
         * go away */

        /* If this was actively canceled, drop the count now that we are processing */
        if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
                atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                /* caller responsible for gndg_list removal */
        }

process_out:

        RETURN(rc);
}

/* needs dev->gnd_dgram_lock held */
void
kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
{
        gni_return_t            grc;

        if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
                return;
        }

        LASSERTF(dgram->gndg_conn != NULL,
                 "dgram 0x%p with NULL conn\n", dgram);

        /* C.E - WC dgrams could be canceled immediately but
         * if there was some match pending, we need to call
         * test_by_id to clear it out. If that test returns
         * POST_PENDING, it is half done and needs to go along
         * with the rest of dgrams and go through a kgni_sm_task cycle
         * and deliver a GNI_POST_TERMINATED event before they
         * are actually canceled */

        dgram->gndg_state = GNILND_DGRAM_CANCELED;

        if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
                /* we don't need to cancel_by_id if the datagram was good */
                return;
        }

        /* let folks know there are outstanding cancels */
        atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
        /* leave on nid list until cancel is done for debugging fun */
        grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);

        /* if we don't get success here, we have hosed up the dgram tracking
         * code and need to bail out */
        LASSERTF(grc == GNI_RC_SUCCESS,
                 "postdata_cancel returned %d for conn 0x%p to %s\n",
                 grc, dgram->gndg_conn,
                 dgram->gndg_conn->gnc_peer ?
                  libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
                  : "<?>");

        CDEBUG(D_NETTRACE,
                "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
                dgram, dgram->gndg_conn,
                dgram->gndg_conn->gnc_ephandle);

        if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
                gni_post_state_t         post_state;
                int                      rc = 0;
                __u32                    remote_addr = 0, remote_id = 0;

                grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
                                                     (__u64)dgram, &post_state,
                                                     &remote_addr, &remote_id);

                LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
                         "bad grc %d from test_by_id on dgram 0x%p\n",
                         grc, dgram);

                /* if WC was canceled immediately, we get NO_MATCH, if needs to go
                 * through full cycle, we get SUCCESS and need to parse post_state */

                CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
                        "remote_addr %u remote_id %u\n", grc, dgram,
                        kgnilnd_dgram_type2str(dgram),
                        post_state, remote_addr, remote_id);

                if (grc == GNI_RC_NO_MATCH) {
                        /* she's gone, reduce count and move along */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
                        RETURN_EXIT;
                }

                rc = kgnilnd_process_dgram(dgram, post_state);

                if (rc <= 0) {
                        /* if for some weird reason we get a valid dgram back, just mark as done
                         * so we can drop it and move along.
                         * C.E - if it was completed, we'll just release the conn/mbox
                         * back into the pool and it'll get reused. That said, we should only
                         * be canceling a WC dgram on stack reset or shutdown, so that is moot */
                        dgram->gndg_state = GNILND_DGRAM_DONE;
                        atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);

                        /* caller context responsible for calling kgnilnd_release_dgram() */
                } else {
                        /* still pending, let it simmer until golden brown and delicious */
                }
        }

        /* for non WC dgrams, they are still on the nid list but marked canceled waiting
         * for kgni to return their ID to us via probe - that is when we'll complete their
         * cancel processing */
}

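/* A dgram's conn ref is dropped in cleanup; free then poisons the
 * magic and returns the descriptor to the slab cache so stale
 * references are easier to spot in crash dumps. */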
void
kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
{
        /* release the dgram ref on conn */
        if (dgram->gndg_conn) {
                kgnilnd_conn_decref(dgram->gndg_conn);
                dgram->gndg_conn = NULL;
        }
}

void
kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
{
        LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
                 dgram->gndg_state == GNILND_DGRAM_DONE,
                 "dgram 0x%p with bad state %s\n",
                 dgram, kgnilnd_dgram_state2str(dgram));

        /* bit of poisoning to help detect bad driver data */
        dgram->gndg_magic = 0x6f5a6b5f;
        atomic_dec(&dev->gnd_ndgrams);

        /* log before freeing so we don't touch the dgram after it goes
         * back to the slab */
        CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
               " ndgrams %d\n",
               sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
               atomic_read(&dev->gnd_ndgrams));
        kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
}

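/* Post a connreq datagram to dstnid, or a wildcard (WC) datagram when
 * dstnid is LNET_NID_ANY. The flow: allocate the dgram and its conn,
 * bind the EP to the target NIC address for targeted posts, pack the
 * connreq, hand it to kgni via postdata_w_id, and only then hash the
 * dgram onto the per-nid list under gnd_dgram_lock. */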
1199 int
1200 kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
1201                    int data_rc)
1202 {
1203         int              rc = 0;
1204         kgn_dgram_t     *dgram = NULL;
1205         kgn_dgram_t     *tmpdgram;
1206         kgn_dgram_type_t dgtype;
1207         gni_return_t     grc;
1208         __u64            srcnid;
1209         ENTRY;
1210
1211         switch (type) {
1212         case GNILND_CONNREQ_REQ:
1213                 if (dstnid == LNET_NID_ANY)
1214                         dgtype = GNILND_DGRAM_WC_REQ;
1215                 else
1216                         dgtype = GNILND_DGRAM_REQ;
1217                 break;
1218         case GNILND_CONNREQ_NAK:
1219                 LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
1220                 dgtype = GNILND_DGRAM_NAK;
1221                 break;
1222         default:
1223                 CERROR("unknown connreq type %d\n", type);
1224                 LBUG();
1225         }
1226
1227         rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
1228         if (rc < 0) {
1229                 rc = -ENOMEM;
1230                 GOTO(post_failed, rc);
1231         }
1232
1233         rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
1234         if (rc) {
1235                 GOTO(post_failed, rc);
1236         }
1237
1238         if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
1239                 /* clear buffer for sanity on reuse of wildcard */
1240                 memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
1241         }
1242
1243         if (dstnid == LNET_NID_ANY) {
1244                 /* set here to reset any dgram re-use */
1245                 dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
1246         } else {
1247                 __u32            host_id;
1248
1249                 rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
1250                 if (rc <= 0) {
1251                         rc = -ESRCH;
1252                         GOTO(post_failed, rc);
1253                 }
1254
1255                 dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
1256
1257                 /* don't need to serialize, there are no CQs for the dgram
1258                  * EP on the kgn_net_t */
1259                 grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
1260
1261                 if (grc != GNI_RC_SUCCESS) {
1262                         rc = -ECONNABORTED;
1263                         GOTO(post_failed, rc);
1264                 }
1265
1266         }
1267
1268         /* If we are posting wildcards post using a net of 0, otherwise we'll use the
1269          * net of the destination node.
1270          */
1271
1272         if (dstnid == LNET_NID_ANY) {
1273                 srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
1274         } else {
1275                 srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
1276         }
1277
1278         rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
1279                                   srcnid, dstnid, type);
1280         if (rc) {
1281                 GOTO(post_failed, rc);
1282         }
1283
1284         if (type == GNILND_CONNREQ_NAK)
1285                 dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
1286
1287         dgram->gndg_post_time = jiffies;
1288
1289         /* XXX Nic: here is where we'd add in logical network multiplexing */
1290
1291         CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
1292                dgram, kgnilnd_dgram_type2str(dgram),
1293                libcfs_nid2str(srcnid),
1294                libcfs_nid2str(dstnid), dev->gnd_id);
1295
1296         /* this allocates memory, can't hold locks across */
1297         grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
1298                                    &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
1299                                    &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
1300                                    (__u64)dgram);
1301
1302         if (grc != GNI_RC_SUCCESS) {
1303                 CNETERR("dropping failed dgram post id 0x%p type %s"
1304                         " reqtype %s to %s: rc %d\n",
1305                         dgram, kgnilnd_dgram_type2str(dgram),
1306                         kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
1307                         libcfs_nid2str(dstnid), grc);
1308                 rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
1309                 GOTO(post_failed, rc);
1310         }
1311
1312         /* we don't need to add earlier - if someone does del_peer during post,
1313          * that peer will get marked as unlinked and the callers will take care of it.
1314          * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
1315          * the completed dgram later when we can't find a peer to stuff it into */
1316
1317         spin_lock(&dev->gnd_dgram_lock);
1318
1319         /* make sure we are not double posting targeted dgrams
1320          * - we can post multiple WC dgrams to help with processing speed */
1321         if (dstnid != LNET_NID_ANY) {
1322                 tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
1323
1324                 LASSERTF(tmpdgram == NULL,
1325                         "dgram 0x%p->%s already posted\n",
1326                          dgram, libcfs_nid2str(dstnid));
1327         }
1328
1329         /* unmunge dstnid to help processing code cope... */
1330         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
1331                 dgram->gndg_conn_out.gncr_dstnid = dstnid;
1332         }
1333
1334         list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
1335         dgram->gndg_state = GNILND_DGRAM_POSTED;
1336         spin_unlock(&dev->gnd_dgram_lock);
1337
1338 post_failed:
1339         if (rc < 0 && dgram != NULL) {
1340                 kgnilnd_cleanup_dgram(dgram);
1341                 kgnilnd_free_dgram(dev, dgram);
1342         }
1343
1344         RETURN(rc);
1345 }
1346
1347 /* The shutdown flag is set from the shutdown and stack reset threads. */
1348 void
1349 kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
1350 {
1351         /* The conns of canceled active dgrams need to be put in purgatory so
1352          * we don't reuse the mailbox */
1353         if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
1354                 kgn_peer_t *peer;
1355                 kgn_conn_t *conn = dgram->gndg_conn;
1356                 lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
1357
1358                 dgram->gndg_state = GNILND_DGRAM_DONE;
1359
1360                 /* During shutdown we've already removed the peer so we don't
1361                  * need to add a peer. During stack reset we don't care about
1362                  * MDDs since they are all released. */
1363                 if (!shutdown) {
1364                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1365                         peer = kgnilnd_find_peer_locked(nid);
1366
1367                         if (peer != NULL) {
1368                                 CDEBUG(D_NET, "adding peer's conn with nid %s "
1369                                         "to purgatory\n", libcfs_nid2str(nid));
1370                                 kgnilnd_conn_addref(conn);
1371                                 conn->gnc_peer = peer;
1372                                 kgnilnd_peer_addref(peer);
1373                                 kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
1374                                 conn->gnc_state = GNILND_CONN_CLOSED;
1375                                 list_add_tail(&conn->gnc_list,
1376                                               &peer->gnp_conns);
1377                                 kgnilnd_add_purgatory_locked(conn,
1378                                                              conn->gnc_peer);
1379                                 kgnilnd_schedule_conn(conn);
1380                         }
1381                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1382                 }
1383         }
1384
1385         spin_lock(&dev->gnd_dgram_lock);
1386         kgnilnd_cancel_dgram_locked(dgram);
1387         spin_unlock(&dev->gnd_dgram_lock);
1388
1389         kgnilnd_cleanup_dgram(dgram);
1390
1391         /* if the dgram is 'canceled' it needs to wait until the event
1392          * comes up from kgni telling us it is safe to release */
1393         if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1394                 dgram->gndg_state = GNILND_DGRAM_DONE;
1395
1396                 LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
1397
1398                 /* if it is a wildcard and we are in an appropriate state, repost
1399                  * the wildcard */
1400
1401                 if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
1402                     (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
1403                         int     rerc;
1404
1405                         rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1406                         if (rerc != 0) {
1407                                 /* We failed to repost the WC dgram for some reason;
1408                                  * mark it so the repost system attempts to repost */
1409                                 kgnilnd_admin_addref(dev->gnd_nwcdgrams);
1410                         }
1411                 }
1412
1413                 /* always free the old dgram */
1414                 kgnilnd_free_dgram(dev, dgram);
1415         }
1416 }
1417
1418
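     /* Probe kgni for a single completed datagram on this device.  Returns 0
      * if nothing was ready, 1 if a datagram was pulled off the wire and
      * *dgramp was set, or a negative errno if we could not retrieve the data
      * for a ready datagram (in which case it is released internally).
      *
      * A minimal caller sketch (simplified from kgnilnd_probe_and_process_dgram
      * below):
      *
      *      rc = kgnilnd_probe_for_dgram(dev, &dgram);
      *      if (rc > 0)
      *              rc = kgnilnd_process_connreq(dgram, &needs_nak);
      */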
1419 int
1420 kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
1421 {
1422         kgn_dgram_t             *dgram = NULL;
1423         gni_post_state_t         post_state;
1424         gni_return_t             grc;
1425         int                      rc = 0;
1426         __u64                    readyid;
1427         __u32                    remote_addr = 0, remote_id = 0;
1428         ENTRY;
1429
1430         /* Probe with the lock held. That way if we get a dgram we don't have it canceled
1431          * between finding the ready dgram and grabbing the lock to remove it from the
1432          * list. Otherwise we could be left in an inconsistent state. We own the dgram
1433          * once it's off the list so we don't need to worry about others changing it at
1434          * that point. */
1435         spin_lock(&dev->gnd_dgram_lock);
1436         grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
1437         if (grc != GNI_RC_SUCCESS) {
1438                 spin_unlock(&dev->gnd_dgram_lock);
1439                 /* return 0 to indicate nothing happened */
1440                 RETURN(0);
1441         }
1442
1443         CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
1444                 readyid, dev);
1445
1446         dgram = (kgn_dgram_t *)readyid;
1447
1448         LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
1449                  "dgram 0x%p from id %#llx with bad magic %x\n",
1450                  dgram, readyid, dgram->gndg_magic);
1451
1452         LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
1453                  dgram->gndg_state == GNILND_DGRAM_CANCELED,
1454                  "dgram 0x%p with bad state %s\n",
1455                  dgram, kgnilnd_dgram_state2str(dgram));
1456
1457         LASSERTF(!list_empty(&dgram->gndg_list),
1458                  "dgram 0x%p with bad list state %s type %s\n",
1459                  dgram, kgnilnd_dgram_state2str(dgram),
1460                  kgnilnd_dgram_type2str(dgram));
1461
1462         /* now we know that the datagram structure is ok, so pull off list */
1463         list_del_init(&dgram->gndg_list);
1464
1465         /* while we have the gnd_dgram_lock and BEFORE we call test_by_id
1466          * change the state from POSTED to PROCESSING to ensure that
1467          * nobody cancels it after we've pulled it from the wire */
1468         if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
1469                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1470         }
1471
1472         LASSERTF(dgram->gndg_conn != NULL,
1473                 "dgram 0x%p with NULL conn\n", dgram);
1474
1475         grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
1476                                              (__u64)dgram, &post_state,
1477                                              &remote_addr, &remote_id);
1478
1479         /* we now "own" this datagram */
1480         spin_unlock(&dev->gnd_dgram_lock);
1481
1482         LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
1483                  " id %llu was ready\n", readyid);
1484
1485         CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
1486                 "remote_addr %u remote_id %u\n", grc, dgram,
1487                 kgnilnd_dgram_type2str(dgram),
1488                 post_state, remote_addr, remote_id);
1489
1490         if (unlikely(grc != GNI_RC_SUCCESS)) {
1491                 CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
1492                         dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
1493                         grc);
1494                 rc = -EINVAL;
1495                 GOTO(probe_for_out, rc);
1496         }
1497
1498         rc = kgnilnd_process_dgram(dgram, post_state);
1499
1500         /* we should never have probe find a dgram for us and then have it
1501          * be a WC dgram that is still in the middle of processing */
1502         LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
1503                  rc, dgram, post_state);
1504
1505         if (rc == 0) {
1506                 /* dgram is good enough for the data to be used */
1507                 dgram->gndg_state = GNILND_DGRAM_PROCESSING;
1508                 /* fake rc to mark that we've done something */
1509                 rc = 1;
1510         } else {
1511                 /* let kgnilnd_release_dgram take care of canceled dgrams */
1512                 if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
1513                         dgram->gndg_state = GNILND_DGRAM_DONE;
1514                 }
1515         }
1516
1517         *dgramp = dgram;
1518         RETURN(rc);
1519
1520 probe_for_out:
1521
1522         kgnilnd_release_dgram(dev, dgram, 0);
1523         RETURN(rc);
1524 }
1525
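     /* Post the initial set of wildcard (listening) datagrams - kgn_nwildcard
      * of them.  Each wildcard matches a connection request from any NID. */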
1526 int
1527 kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
1528 {
1529         /* if kgn_nwildcard is zero, return error */
1530         int     rc = -ENOENT, i;
1531         ENTRY;
1532
1533         for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
1534                 rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
1535                 if (rc < 0) {
1536                         CERROR("error %d: could not post wildcard datagram # %d\n",
1537                                 rc, i);
1538                         rc = -EINVAL;
1539                         GOTO(failed, rc);
1540                 }
1541         }
1542
1543 failed:
1544         RETURN(rc);
1545 }
1546
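     /* Cancel all outstanding targeted datagrams destined for NIDs on this
      * net; wildcards are skipped here.  Only valid during net shutdown or
      * stack reset, as the LASSERTF below enforces. */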
1547 int
1548 kgnilnd_cancel_net_dgrams(kgn_net_t *net)
1549 {
1550         kgn_dgram_t *dg, *dgN;
1551         LIST_HEAD(zombies);
1552         int i;
1553         ENTRY;
1554
1555         /* we want to cancel any outstanding dgrams - we don't want to rely
1556          * on del_peer_or_conn catching all of them. This helps protect us in cases
1557          * where we don't quite keep the peer->dgram mapping in sync due to some
1558          * race conditions */
1559
1560         LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
1561                  "called with LND invalid state: net shutdown %d "
1562                  "in reset %d\n", net->gnn_shutdown,
1563                  kgnilnd_data.kgn_in_reset);
1564
1565         spin_lock(&net->gnn_dev->gnd_dgram_lock);
1566
1567         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
1568                 list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
1569
1570                         /* skip dgrams whose nid is not on our net, and skip wildcards */
1573                         if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
1574                                 net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
1575                                 continue;
1576
1577                         kgnilnd_cancel_dgram_locked(dg);
1578                 }
1579         }
1580
1581         spin_unlock(&net->gnn_dev->gnd_dgram_lock);
1582
1583         RETURN(0);
1584 }
1585
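     /* Cancel every outstanding wildcard datagram on the device.  Any WC that
      * is already DONE is moved to a local zombies list and released once the
      * dgram lock is dropped. */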
1586 int
1587 kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
1588 {
1589         kgn_dgram_t *dg, *dgN;
1590         LIST_HEAD(zombies);
1591         ENTRY;
1592
1593         /* Time to kill the outstanding WCs.
1594          * WCs exist on net 0 only but match on any net.
1595          */
1596
1597         LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
1598                 "called with LND invalid state: WC shutdown %d "
1599                 "in reset %d\n", kgnilnd_data.kgn_wc_kill,
1600                 kgnilnd_data.kgn_in_reset);
1601
1602         spin_lock(&dev->gnd_dgram_lock);
1603
1604         do {
1605                 dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
1606                 if (dg != NULL) {
1607                         LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
1608                                  "dgram 0x%p->%s with bad type %d (%s)\n",
1609                                 dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
1610                                 dg->gndg_type, kgnilnd_dgram_type2str(dg));
1611
1612                         kgnilnd_cancel_dgram_locked(dg);
1613
1614                         /* WC could be DONE already, check and if so add to list to be released */
1615                         if (dg->gndg_state == GNILND_DGRAM_DONE)
1616                                 list_move_tail(&dg->gndg_list, &zombies);
1617                 }
1618         } while (dg != NULL);
1619
1620         spin_unlock(&dev->gnd_dgram_lock);
1621
1622         list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
1623                 list_del_init(&dg->gndg_list);
1624                 kgnilnd_release_dgram(dev, dg, 1);
1625         }
1626         RETURN(0);
1627 }
1629
1630 int
1631 kgnilnd_cancel_dgrams(kgn_device_t *dev)
1632 {
1633         kgn_dgram_t *dg, *dgN;
1634         int i;
1635         ENTRY;
1636
1637         /* Cancel any outstanding non-wildcard datagrams regardless
1638          * of which net they are on, as we are in base shutdown and
1639          * don't care about connecting anymore.
1640          */
1641
1642         LASSERTF(kgnilnd_data.kgn_wc_kill == 1, "We didn't get called from base shutdown\n");
1643
1644         spin_lock(&dev->gnd_dgram_lock);
1645
1646         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
1647                 list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
1648                         if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
1649                                 kgnilnd_cancel_dgram_locked(dg);
1650                 }
1651         }
1652
1653         spin_unlock(&dev->gnd_dgram_lock);
1654
1655         RETURN(0);
1656 }
1657
1658
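     /* Spin until every canceled datagram on the device has completed,
      * probing kgni with a short timeout and releasing each datagram that
      * comes back. */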
1659 void
1660 kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
1661 {
1662         int             i = 4;
1663         int             rc;
1664         gni_return_t    grc;
1665         __u64           readyid;
1666         kgn_dgram_t    *dgram;
1667
1668         /* use do/while so at least one check runs, allowing the regression
1669          * test for bug 762072 to hit the bug if it is there */
1670
1671         /* This function races with the dgram mover during shutdown so it is possible for
1672          * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
1673          * dgram mover thread instead of inside of this function.
1674          */
1675
1676         /* This should only be called from within shutdown, base shutdown, or stack reset.
1677          * There are no assertions here to verify that, since base shutdown leaves nothing
1678          * we can check - the net is gone by then.
1679          */
1680
1681         do {
1682                 i++;
1683                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1684                         "Waiting for %d canceled datagrams to clear on device %d\n",
1685                         atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
1686
1687                 /* check four times a second */
1688                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
1689                        250, &readyid);
1690
1691                 if (grc != GNI_RC_SUCCESS)
1692                         continue;
1693
1694                 CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
1695                         readyid, dev->gnd_id, dev);
1696
1697                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
1698                 if (rc != 0) {
1699                         /* if we got a valid dgram or one that is now done, clean up */
1700                         kgnilnd_release_dgram(dev, dgram, 1);
1701                 }
1702         } while (atomic_read(&dev->gnd_canceled_dgrams));
1703 }
1704
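     /* Start an active connect to a peer pulled off gnd_connd_peers: post a
      * targeted connection request datagram to its NID.  Returns 0 when the
      * dgram was posted, positive (ESTALE) when the peer went away or was
      * marked for death while we worked, or a negative errno if the post
      * itself failed. */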
1705 int
1706 kgnilnd_start_connect(kgn_peer_t *peer)
1707 {
1708         int              rc = 0;
1709         /* sync point for kgnilnd_del_peer_locked - do an early check to
1710          * catch the most common hits where del_peer is done by the
1711          * time we get here */
1712         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
1713                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
1714         }
1715
1716         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1717         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
1718                 /* raced with peer getting unlinked */
1719                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1720                 rc = ESTALE;
1721                 GOTO(out, rc);
1722         }
1723         peer->gnp_connecting = GNILND_PEER_POSTING;
1724         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1725
1726         set_mb(peer->gnp_last_dgram_time, jiffies);
1727         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
1728                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
1729         }
1730
1731         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
1732                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
1733                 rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
1734         } else {
1735                 rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
1736                                         peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
1737         }
1738         if (rc < 0) {
1739                 set_mb(peer->gnp_last_dgram_errno, rc);
1740                 GOTO(failed, rc);
1741         }
1742
1743         /* while we're posting someone could have decided this peer/dgram needed to
1744          * die a quick death, so we check for state change and process accordingly */
1745
1746         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1747         if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1748                 if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
1749                         peer->gnp_connecting = GNILND_PEER_KILL;
1750                 }
1751                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1752                 /* positive RC to avoid dgram cleanup - we'll have to
1753                  * wait for the kgni GNI_POST_TERMINATED event to
1754                  * finish cleaning up */
1755                 rc = ESTALE;
1756                 kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
1757                 GOTO(out, rc);
1758         }
1759         peer->gnp_connecting = GNILND_PEER_POSTED;
1760         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1761         /* reaper thread will take care of any timeouts */
1762         CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
1763                libcfs_nid2str(peer->gnp_nid), rc);
1764
1765         RETURN(rc);
1766
1767 failed:
1768         CDEBUG(D_NET, "connect to %s failed: rc %d\n",
1769                libcfs_nid2str(peer->gnp_nid), rc);
1770 out:
1771         RETURN(rc);
1772 }
1773
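     /* Complete the connection handshake for a matched datagram: find or
      * create the peer, refuse duplicates, close stale conns, wire the new
      * conn into the peer and CQ-id tables, queue a NOOP plus any TX blocked
      * on the connection, and notify LNet that the peer is alive. */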
1774 int
1775 kgnilnd_finish_connect(kgn_dgram_t *dgram)
1776 {
1777         kgn_conn_t        *conn = dgram->gndg_conn;
1778         lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
1779         kgn_peer_t        *new_peer, *peer = NULL;
1780         kgn_tx_t          *tx;
1781         kgn_tx_t          *txn;
1782         kgn_mbox_info_t   *mbox;
1783         int                rc;
1784         int                nstale;
1785
1786         /* try to find a peer that matches the nid we got in the connreq
1787          * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
1788          * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
1789
1790         /* assume this is a new peer  - it makes locking cleaner when it isn't */
1791         /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
1792
1793         rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
1794         if (rc != 0) {
1795                 CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
1796                 return rc;
1797         }
1798
1799         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1800
1801         /* this transfers ref from create_peer to the kgn_peer table */
1802         kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
1803
1804         /* if we found an existing peer, is it really ready for a new conn ? */
1805         if (peer != new_peer) {
1806                 /* if this was an active connect attempt but we can't find a peer waiting
1807                  * for it, we will dump it in the trash */
1808
1809                 if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1810                         CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
1811                                libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
1812                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1813                         rc = ECANCELED;
1814                         GOTO(out, rc);
1815                 }
1816
1817                 /* check to see if we can catch a connecting peer before it is
1818                  * removed from the connd_peers list - if not, we need to
1819                  * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
1820                 if (peer->gnp_connecting != GNILND_PEER_IDLE) {
1821                         spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1822                         if (!list_empty(&peer->gnp_connd_list)) {
1823                                 list_del_init(&peer->gnp_connd_list);
1824                                 /* drop connd ref */
1825                                 kgnilnd_peer_decref(peer);
1826                         }
1827                         spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
1828                         /* clear rc to make sure we don't have fake error */
1829                         rc = 0;
1830                 }
1831
1832                 /* no matter what, we are no longer waiting to connect this peer now */
1833                 peer->gnp_connecting = GNILND_PEER_IDLE;
1834
1835                 /* Refuse to duplicate an existing connection (both sides might try to
1836                  * connect at once).  NB we return success!  We _are_ connected so we
1837                  * _don't_ have any blocked txs to complete with failure. */
1838                 rc = kgnilnd_conn_isdup_locked(peer, conn);
1839                 if (rc != 0) {
1840                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1841                         CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
1842                               libcfs_nid2str(her_nid), rc);
1843                         rc = EALREADY;
1844                         GOTO(out, rc);
1845                 }
1846         }
1847
1848         if (peer->gnp_state == GNILND_PEER_DOWN) {
1849                 CNETERR("Received connection request from down nid %s\n",
1850                         libcfs_nid2str(her_nid));
1851         }
1852
1853         peer->gnp_state = GNILND_PEER_UP;
1854         nstale = kgnilnd_close_stale_conns_locked(peer, conn);
1855
1856         /* either way with peer (new or existing), we are ok with ref counts here as the
1857          * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
1858          * ref for the peer table. */
1859
1860         /* at this point, the connection request is a winner */
1861
1862         /* mark 'DONE' to avoid cancel being called from release */
1863         dgram->gndg_state = GNILND_DGRAM_DONE;
1864
1865         /* initialise timestamps before reaper looks at them */
1866         conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
1867
1868         /* last_tx is initialized to jiffies - (keepalive*2) so that the reaper
1869          * thread will immediately send a NOOP during its call to
1870          * kgnilnd_check_conn_timeouts_locked
1871          */
1872         conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
1873         conn->gnc_state = GNILND_CONN_ESTABLISHED;
1874
1875         /* save the dgram type used to establish this connection */
1876         conn->gnc_dgram_type = dgram->gndg_type;
1877
1878         /* refs are not transferred from dgram to tables, so increment to
1879          * take ownership */
1880         kgnilnd_conn_addref(conn);
1881         kgnilnd_peer_addref(peer);
1882         conn->gnc_peer = peer;
1883         list_add_tail(&conn->gnc_list, &peer->gnp_conns);
1884
1885         kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
1886         list_add_tail(&conn->gnc_hashlist,
1887                       kgnilnd_cqid2connlist(conn->gnc_cqid));
1888         kgnilnd_data.kgn_conn_version++;
1889
1890         /* Don't send NOOP if fail_loc is set */
1892         if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
1893                 tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
1894                 if (tx == NULL) {
1895                         CNETERR("can't get TX to initiate NOOP to %s\n",
1896                                 libcfs_nid2str(peer->gnp_nid));
1897                 } else {
1898                         kgnilnd_queue_tx(conn, tx);
1899                 }
1900         }
1901
1902         /* Schedule all packets blocking for a connection */
1903         list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
1904                 /* lock held here is the peer_conn lock */
1905                 kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
1906                 kgnilnd_queue_tx(conn, tx);
1907         }
1908
1909         /* If this is an active connection lets mark its timestamp on the MBoX */
1910         if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
1911                 mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
1912                 /* conn->gnc_last_rx was just set to jiffies, so it is valid here */
1913                 mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
1914         }
1915
1916         /* Bug 765042: wake up scheduler for a race with finish_connect and
1917          * complete_conn_closed with a conn in purgatory
1918          * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
1919          * we just check for set and then clear */
1920         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
1921                 cfs_fail_loc = 0x0;
1922                 /* get scheduler thread moving again */
1923                 kgnilnd_schedule_device(conn->gnc_device);
1924         }
1925
1926         CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
1927                conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
1928
1929         /* make sure we reset peer reconnect interval now that we have a good conn */
1930         kgnilnd_peer_alive(peer);
1931         peer->gnp_reconnect_interval = 0;
1932
1933         /* clear the unlink attribute; if we don't clear it, kgnilnd_del_conn_or_peer
1934          * will wait on the atomic forever
1935          */
1936         if (peer->gnp_pending_unlink) {
1937                 peer->gnp_pending_unlink = 0;
1938                 kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
1939                 CDEBUG(D_NET, "Clearing peer unlink %p\n", peer);
1940         }
1941
1942         /* add ref to make it hang around until after we drop the lock */
1943         kgnilnd_conn_addref(conn);
1944
1945         /* Once the peer_conn lock is dropped, the conn could actually move into
1946          * CLOSING->CLOSED->DONE in the scheduler thread, so hold our conn
1947          * ref until we are really done */
1948         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1949
1950         /* Notify LNET that we now have a working connection to this peer.
1951          * This is a Cray extension to the "standard" LND behavior.
1952          */
1953         lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid, true, true,
1954                     ktime_get_seconds());
1955
1956         /* drop our 'hold' ref */
1957         kgnilnd_conn_decref(conn);
1958
1959 out:
1960         RETURN(rc);
1961 }
1962
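     /* Post a NAK datagram carrying 'error' back to dst_nid.  A failed post
      * is only logged; there is no retry. */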
1963 void
1964 kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
1965 {
1966         int              rc = 0;
1967         ENTRY;
1968
1969         LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
1970
1971         CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
1972
1973         rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
1974
1975         if (rc < 0) {
1976                 CDEBUG(D_NET, "NAK to %s failed: rc %d\n", libcfs_nid2str(dst_nid), rc);
1977         }
1978         EXIT;
1979 }
1980
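     /* Handle a completed NAK datagram from src_nid.  If we are not actively
      * connecting, close any conns matching the NAK's stamps; otherwise
      * cancel the pending connect datagram it refers to, or drop the NAK if
      * none can be found. */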
1981 int
1982 kgnilnd_process_nak(kgn_dgram_t *dgram)
1983 {
1984         kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
1985         lnet_nid_t         src_nid = connreq->gncr_srcnid;
1986         int                errno = connreq->gncr_nakdata.gnnd_errno;
1987         kgn_peer_t        *peer;
1988         int                rc = 0;
1989
1990         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
1991
1992         peer = kgnilnd_find_peer_locked(src_nid);
1993         if (peer == NULL) {
1994                 /* we likely dropped him from bad data when we processed
1995                  * the original REQ */
1996                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
1997                 return -EBADSLT;
1998         }
1999
2000         /* need to check peerstamp/connstamp against the ones we find
2001          * to make sure we don't close new (and good?) conns that we
2002          * formed after this connreq failed */
2003         if (peer->gnp_connecting == GNILND_PEER_IDLE) {
2004                 kgn_conn_t        conn;
2005
2006                 if (list_empty(&peer->gnp_conns)) {
2007                         /* assume we already processed the datagram and it barfed up
2008                          * on this side too */
2009                         CDEBUG(D_NET, "dropping NAK from %s; "
2010                                "peer %s is already not connected\n",
2011                                 libcfs_nid2str(connreq->gncr_srcnid),
2012                                 libcfs_nid2str(connreq->gncr_dstnid));
2013                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2014                         return 0;
2015                 }
2016
2017                 /* stub up a connection with the connreq XXX_stamps to allow
2018                  * us to use close_stale_conns_locked */
2019                 conn.gnc_peerstamp = connreq->gncr_peerstamp;
2020                 conn.gnc_my_connstamp = connreq->gncr_connstamp;
2021                 conn.gnc_peer_connstamp = connreq->gncr_connstamp;
2022                 conn.gnc_device = peer->gnp_net->gnn_dev;
2023
2024                 rc = kgnilnd_close_stale_conns_locked(peer, &conn);
2025
2026                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2027                         "closed %d connections\n",
2028                         libcfs_nid2str(connreq->gncr_srcnid),
2029                         libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
2030         } else {
2031                 spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2032
2033                 if (list_empty(&peer->gnp_connd_list)) {
2034                         /* if peer isn't on waiting list, try to find one to nuke */
2035                         rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
2036                                                            peer->gnp_nid);
2037
2038                         if (rc) {
2039                                 LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
2040                                         "canceled pending connect request\n",
2041                                         libcfs_nid2str(connreq->gncr_srcnid),
2042                                         libcfs_nid2str(connreq->gncr_dstnid), errno);
2043                         }
2044
2045                         /* if we can't find a waiting dgram, we just drop the NAK - the
2046                          * connect must have failed (we didn't find a conn above and
2047                          * connecting was cleared), so there is nothing to do besides drop it */
2048                 } else {
2049                         /* peer is on list, meaning it is a new connect attempt from the one
2050                          * we started that generated the NAK - so just drop NAK */
2051
2052                         /* use negative to prevent error message */
2053                         rc = -EAGAIN;
2054                 }
2055                 spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
2056         }
2057
2058         /* success! we found a peer and at least marked pending_nak */
2059         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2060
2061         return rc;
2062 }
2063
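     /* Unpack a completed incoming datagram and dispatch on its connreq
      * type.  *needs_nak is set when a failure should be answered with a NAK
      * (only when we have a good srcnid to aim it at). */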
2064 int
2065 kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
2066 {
2067         int                      rc;
2068
2069         rc = kgnilnd_unpack_connreq(dgram);
2070         if (rc < 0) {
2071                 if (rc != -EBADF) {
2072                         /* only NAK if we have good srcnid to use */
2073                         *needs_nak = 1;
2074                 }
2075                 goto connreq_out;
2076         }
2077
2078         switch (dgram->gndg_conn_in.gncr_type) {
2079         case GNILND_CONNREQ_REQ:
2080                 /* wire up peer & conn, send queued TX */
2081                 rc = kgnilnd_finish_connect(dgram);
2082
2083                 /* don't nak when the nid is hosed */
2084                 if (rc < 0) {
2085                         *needs_nak = 1;
2086                 }
2087
2088                 break;
2089         case GNILND_CONNREQ_NAK:
2090                 rc = kgnilnd_process_nak(dgram);
2091                 /* return early to prevent reconnect bump */
2092                 return rc;
2093         default:
2094                 CERROR("unexpected connreq type %s (%d) from %s\n",
2095                         kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
2096                         dgram->gndg_conn_in.gncr_type,
2097                         libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
2098                 rc = -EINVAL;
2099                 *needs_nak = 1;
2100                 break;
2101         }
2102
2103 connreq_out:
2104         RETURN(rc);
2105 }
2106
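     /* One pass of datagram completion processing for the mover thread:
      * probe for a completed datagram, process it, tell any waiting peer
      * about a failed connect attempt, and send a NAK if one is owed.
      * Returns 0 when idle, 1 when it did something. */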
2107 int
2108 kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
2109 {
2110         int                      rc;
2111         int                      needs_nak = 0;
2112         lnet_nid_t               nak_dstnid = LNET_NID_ANY;
2113         lnet_nid_t               orig_dstnid;
2114         kgn_dgram_t             *dgram = NULL;
2115         kgn_peer_t              *peer;
2116         ENTRY;
2117
2118         if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
2119                 rc = 0;
2120         } else {
2121                 rc = kgnilnd_probe_for_dgram(dev, &dgram);
2122         }
2123
2124         if (rc == 0) {
2125                 RETURN(0);
2126         } else if (rc < 0) {
2127                 GOTO(inform_peer, rc);
2128         } else {
2129                 /* rc > 0 means it did something; reset for this func */
2130                 rc = 0;
2131         }
2132
2133         switch (dgram->gndg_type) {
2134         case GNILND_DGRAM_WC_REQ:
2135         case GNILND_DGRAM_REQ:
2136                 rc = kgnilnd_process_connreq(dgram, &needs_nak);
2137                 break;
2138         case GNILND_DGRAM_NAK:
2139                 CDEBUG(D_NETTRACE, "NAK to %s done\n",
2140                         libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
2141                 break;
2142         default:
2143                 CERROR("unknown datagram type %s (%d)\n",
2144                        kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
2145                 break;
2146         }
2147
2148         /* stash data to use after releasing current datagram */
2149         /* don't stash net - we are operating on a net already,
2150          * so holding kgn_net_rw_sem is sufficient */
2151
2152         nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
2153
2154 inform_peer:
2155         LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
2156
2157         orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
2158
2159         kgnilnd_release_dgram(dev, dgram, 0);
2160
2161         CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
2162                libcfs_nid2str(orig_dstnid), rc);
2163
2164         /* if this was a WC_REQ that matched an existing peer, it'll get marked done
2165          * in kgnilnd_finish_connect - if errors are from before we get to there,
2166          * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
2167         if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
2168                 /* if we have a negative rc, we want to find a peer to inform about
2169                  * the bad connection attempt. Sorry buddy, better luck next time! */
2170
2171                 write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2172                 peer = kgnilnd_find_peer_locked(orig_dstnid);
2173
2174                 if (peer != NULL) {
2175                         /* add ref to make sure he stays around past the possible unlink
2176                          * so we can tell LNet about him */
2177                         kgnilnd_peer_addref(peer);
2178
2179                         /* if he still cares about the outstanding connect */
2180                         if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
2181                                 /* check if he is on the connd list and remove.. */
2182                                 spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2183                                 if (!list_empty(&peer->gnp_connd_list)) {
2184                                         list_del_init(&peer->gnp_connd_list);
2185                                         /* drop connd ref */
2186                                         kgnilnd_peer_decref(peer);
2187                                 }
2188                                 spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
2189
2190                                 /* clear gnp_connecting so we don't have a non-connecting peer
2191                                  * on gnd_connd_list */
2192                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2193
2194                                 set_mb(peer->gnp_last_dgram_errno, rc);
2195
2196                                 kgnilnd_peer_increase_reconnect_locked(peer);
2197                         }
2198                 }
2199                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2200
2201                 /* now that we are outside the lock, tell Mommy */
2202                 if (peer != NULL) {
2203                         kgnilnd_peer_notify(peer, rc, 0);
2204                         kgnilnd_peer_decref(peer);
2205                 }
2206         }
2207
2208         if (needs_nak) {
2209                 kgnilnd_send_nak(dev, nak_dstnid, rc);
2210         }
2211
2212         RETURN(1);
2213 }
2214
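     /* Walk the posted-datagram hash and cancel targeted datagrams that have
      * been outstanding longer than kgn_timeout; wildcards never time out. */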
2215 void
2216 kgnilnd_reaper_dgram_check(kgn_device_t *dev)
2217 {
2218         kgn_dgram_t    *dgram, *tmp;
2219         int             i;
2220
2221         spin_lock(&dev->gnd_dgram_lock);
2222
2223         for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
2224                 list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
2225                         unsigned long            now = jiffies;
2226                         unsigned long            timeout;
2227
2228                         /* don't timeout stuff if the network is mucked or shutting down */
2229                         if (kgnilnd_check_hw_quiesce()) {
2230                                 break;
2231                         }
2232
2233                         if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
2234                             (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
2235                                 continue;
2236                         }
2237                         CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
2238                                 "state %s conn 0x%p to %s age %lus\n",
2239                                 dgram, kgnilnd_dgram_type2str(dgram),
2240                                 kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
2241                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2242                                 cfs_duration_sec(now - dgram->gndg_post_time));
2243
2244                         timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
2245
2246                         if (time_before(now, (dgram->gndg_post_time + timeout)))
2247                                 continue;
2248
2249                         CNETERR("%s datagram to %s timed out @ %lus dgram "
2250                                 "0x%p state %s conn 0x%p\n",
2251                                 kgnilnd_dgram_type2str(dgram),
2252                                 libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
2253                                 cfs_duration_sec(now - dgram->gndg_post_time),
2254                                 dgram, kgnilnd_dgram_state2str(dgram),
2255                                 dgram->gndg_conn);
2256
2257                         kgnilnd_cancel_dgram_locked(dgram);
2258                 }
2259         }
2260         spin_unlock(&dev->gnd_dgram_lock);
2261 }
2262
2263
2264 /* use a thread for the possibly long-blocking wait_by_id to prevent
2265  * stalling the global workqueues */
2266 int
2267 kgnilnd_dgram_waitq(void *arg)
2268 {
2269         kgn_device_t     *dev = (kgn_device_t *) arg;
2270         char              name[16];
2271         gni_return_t      grc;
2272         __u64             readyid;
2273         DEFINE_WAIT(mover_done);
2274
2275         snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
2276
2277         /* all gnilnd threads need to run fairly urgently */
2278         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2279
2280         /* we don't shut down until the device shuts down ... */
2281         while (!kgnilnd_data.kgn_shutdown) {
2282                 /* to quiesce or to not quiesce, that is the question */
2283                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2284                         KGNILND_SPIN_QUIESCE;
2285                 }
2286
2287                 while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
2288
2289                 /* check once a second */
2290                 grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
2291                                                        1000, &readyid);
2292
2293                 if (grc == GNI_RC_SUCCESS) {
2294                         CDEBUG(D_INFO, "waking up dgram mover thread\n");
2295                         kgnilnd_schedule_dgram(dev);
2296
2297                         /* wait for dgram thread to ping us before spinning again */
2298                         prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
2299                                         TASK_INTERRUPTIBLE);
2300
2301                         /* don't sleep if we need to quiesce */
2302                         if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
2303                                 schedule();
2304                         }
2305                         finish_wait(&dev->gnd_dgping_waitq, &mover_done);
2306                 }
2307         }
2308
2309         kgnilnd_thread_fini();
2310         return 0;
2311 }
2312
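     /* Drain gnd_connd_peers, starting an active connect for each queued
      * peer until the deadline passes.  -ENOMEM (out of dgram credits)
      * requeues the peer and stops early; other failures leave the peer for
      * the reaper to retry or clean up.  Returns the number of connects
      * attempted. */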
2313 int
2314 kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
2315 {
2316         int                      did_something = 0, rc;
2317         kgn_peer_t              *peer = NULL;
2318
2319         spin_lock(&dev->gnd_connd_lock);
2320
2321         /* Active connect - we added this in kgnilnd_launch_tx */
2322         while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
2323                 peer = list_first_entry(&dev->gnd_connd_peers,
2324                                         kgn_peer_t, gnp_connd_list);
2325
2326                 /* ref for connd removed in if/else below */
2327                 list_del_init(&peer->gnp_connd_list);
2328
2329                 /* gnp_connecting and membership on gnd_connd_peers should be
2330                  * done coherently to avoid double adding, etc */
2331                 /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
2332                  * to get the peer to gnp_connecting in the first place. We just need to
2333                  * rely on gnd_connd_lock to serialize someone pulling him from the list
2334                  * BEFORE clearing gnp_connecting */
2335                 LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
2336                          peer, libcfs_nid2str(peer->gnp_nid));
2337
2338                 spin_unlock(&dev->gnd_connd_lock);
2339
2340                 CDEBUG(D_NET, "processing connect to %s\n",
2341                        libcfs_nid2str(peer->gnp_nid));
2342
2343                 did_something += 1;
2344                 rc = kgnilnd_start_connect(peer);
2345
2346                 if (likely(rc >= 0)) {
2347                         /* 0 on success, positive on 'just drop peer' errors */
2348                         kgnilnd_peer_decref(peer);
2349                 } else if (rc == -ENOMEM) {
2350                         /* if we are out of wildcards, add back to
2351                          * connd_list - then break out and we'll try later;
2352                          * on other errors, we'll bail & cancel pending tx */
2353                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2354                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2355                                 peer->gnp_connecting = GNILND_PEER_CONNECT;
2356                                 spin_lock(&dev->gnd_connd_lock);
2357                                 list_add_tail(&peer->gnp_connd_list,
2358                                               &dev->gnd_connd_peers);
2359                         } else {
2360                                 /* connecting changed while we were posting */
2361
2362                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2363                                         " state 0x%p->%s, connecting %d\n",
2364                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2365                                 peer->gnp_connecting = GNILND_PEER_KILL;
2366                                 spin_lock(&dev->gnd_connd_lock);
2367                                 /* remove the peer ref from the connd list */
2368                                 kgnilnd_peer_decref(peer);
2369                                 /* let the system handle itself */
2370                         }
2371                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2372                         /* the datagrams are a global pool,
2373                          * so break out of trying and hope some free
2374                          * up soon */
2375                         did_something -= 1;
2376                         break;
2377                 } else {
2378                         /* something bad happened, you lose */
2379                         CNETERR("could not start connecting to %s "
2380                                 "rc %d: Will retry until TX timeout\n",
2381                                libcfs_nid2str(peer->gnp_nid), rc);
2382                         /* It didn't post, so just set connecting back to zero now.
2383                          * The reaper will reattempt the connection if it needs to.
2384                          * If the peer needs death, set it so the reaper will clean up.
2385                          */
2386                         write_lock(&kgnilnd_data.kgn_peer_conn_lock);
2387                         if (peer->gnp_connecting == GNILND_PEER_POSTING) {
2388                                 peer->gnp_connecting = GNILND_PEER_IDLE;
2389                                 kgnilnd_peer_increase_reconnect_locked(peer);
2390                         } else {
2391                                 LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
2392                                         " state 0x%p->%s, connecting %d\n",
2393                                         peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
2394                                 peer->gnp_connecting = GNILND_PEER_KILL;
2395                         }
2396                         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
2397
2398                         /* hold onto ref until we are really done - if it was
2399                          * unlinked this could result in a destroy */
2400                         kgnilnd_peer_decref(peer);
2401                 }
2402                 spin_lock(&dev->gnd_connd_lock);
2403         }
2404
2405         spin_unlock(&dev->gnd_connd_lock);
2406         RETURN(did_something);
2407 }
2408
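     /* Repost wildcard datagrams whose earlier repost failed - gnd_nwcdgrams
      * counts how many are owed.  Returns the number successfully reposted. */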
2409 int
2410 kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
2411 {
2412         int did_something = 0, to_repost, i;
2413         to_repost = atomic_read(&dev->gnd_nwcdgrams);
2414         ENTRY;
2415
2416         for (i = 0; i < to_repost; ++i) {
2417                 int     rerc;
2418                 rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
2419                 if (rerc == 0) {
2420                         kgnilnd_admin_decref(dev->gnd_nwcdgrams);
2421                         did_something += 1;
2422                 } else {
2423                         CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
2424                                 rerc, dev->gnd_id);
2425                         break;
2426                 }
2427         }
2428
2429         RETURN(did_something);
2430 }
2431
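     /* Wrap the timer so kgnilnd_dgram_poke_with_stick() can recover the
      * device from the timer_list via cfs_from_timer(). */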
2432 struct kgnilnd_dgram_timer {
2433         struct timer_list timer;
2434         kgn_device_t *dev;
2435 };
2436
2437 static void
2438 kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
2439 {
2440         struct kgnilnd_dgram_timer *t = cfs_from_timer(t, arg, timer);
2441
2442         wake_up(&t->dev->gnd_dgram_waitq);
2443 }
2444
2445 /* use single thread for dgrams - should be sufficient for performance */
2446 int
2447 kgnilnd_dgram_mover(void *arg)
2448 {
2449         kgn_device_t            *dev = (kgn_device_t *)arg;
2450         char                     name[16];
2451         int                      rc, did_something;
2452         unsigned long            next_purge_check = jiffies - 1;
2453         unsigned long            timeout;
2454         struct kgnilnd_dgram_timer timer;
2455         unsigned long deadline = 0;
2456         DEFINE_WAIT(wait);
2457
2458         snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
2459
2460         /* all gnilnd threads need to run fairly urgently */
2461         set_user_nice(current, *kgnilnd_tunables.kgn_nice);
2462
2463         /* we are OK not locking these variables, as the dgram waitq threads
2464          * will block both on tying up the net (kgn_shutdown) and on the
2465          * completion event for the dgram_waitq (kgn_quiesce_trigger) */
2466         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2467         while (!kgnilnd_data.kgn_shutdown) {
2468                 /* Safe: kgn_shutdown only set when quiescent */
2469
2470                 /* race with stack reset - we want to hold off seeing any new incoming dgrams
2471                  * so we can force a dirty WC dgram for Bug 762072 - put right before
2472                  * quiesce check so that it'll go right into that and not do any
2473                  * dgram mucking */
2474                 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
2475
2476                 /* to quiesce or to not quiesce, that is the question */
2477                 if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
2478                         KGNILND_SPIN_QUIESCE;
2479                 }
2480                 did_something = 0;
2481
2482                 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
2483
2484                 /* process any newly completed dgrams */
2485                 down_read(&kgnilnd_data.kgn_net_rw_sem);
2486
2487                 rc = kgnilnd_probe_and_process_dgram(dev);
2488                 if (rc > 0) {
2489                         did_something += rc;
2490                 }
2491
2492                 up_read(&kgnilnd_data.kgn_net_rw_sem);
2493
2494                 CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
2495                         (*kgnilnd_tunables.kgn_dgram_timeout + 1));
2496                 /* start new outbound dgrams */
2497                 did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
2498
2499                 /* find dead dgrams */
2500                 if (time_after_eq(jiffies, next_purge_check)) {
2501                         /* these don't need to be checked that often */
2502                         kgnilnd_reaper_dgram_check(dev);
2503
2504                         next_purge_check = (long) jiffies +
2505                                       cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
2506                 }
2507
2508                 did_something += kgnilnd_repost_wc_dgrams(dev);
2509
2510                 /* careful with the jiffy wrap... */
2511                 timeout = (long)(next_purge_check - jiffies);
2512
2513                 CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
2514                        did_something, timeout, next_purge_check, jiffies);
2515
2516                 if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
2517                         did_something = 0;
2518                         continue;
2519                 }
2520
2521                 prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
2522
2523                 cfs_timer_setup(&timer.timer,
2524                                 kgnilnd_dgram_poke_with_stick,
2525                                 dev, 0);
2526                 timer.dev = dev;
2527                 mod_timer(&timer.timer, (long) jiffies + timeout);
2528
2529                 /* last second chance for others to poke us */
2530                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
2531
2532                 /* check flag variables before committing even if we
2533                  * did something; if we are after the deadline call
2534                  * schedule */
2535                 if ((!did_something || time_after(jiffies, deadline)) &&
2536                     !kgnilnd_data.kgn_shutdown &&
2537                     !kgnilnd_data.kgn_quiesce_trigger) {
2538                         CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
2539                                timeout, cfs_duration_sec(timeout));
2540                         wake_up_all(&dev->gnd_dgping_waitq);
2541                         schedule();
2542                         CDEBUG(D_INFO, "awake after schedule\n");
2543                         deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
2544                 }
2545
2546                 del_singleshot_timer_sync(&timer.timer);
2547                 finish_wait(&dev->gnd_dgram_waitq, &wait);
2548         }
2549
2550         kgnilnd_thread_fini();
2551         return 0;
2552 }