lnet/klnds/openiblnd/openiblnd_cb.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "openibnal.h"
25
26 /*
27  *  LIB functions follow
28  *
29  */
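/* Defer a tx's cleanup to the scheduler thread; used when completion happens
 * in a context (e.g. interrupt) that can't deregister memory or flush FMRs. */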
30 void
31 kibnal_schedule_tx_done (kib_tx_t *tx)
32 {
33         unsigned long flags;
34
35         spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
36
37         list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
38         wake_up (&kibnal_data.kib_sched_waitq);
39
40         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
41 }
42
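/* Complete a tx: deregister any mapped memory (deferring to the scheduler
 * thread if called in interrupt context), finalise up to two attached lib
 * messages, drop the connection ref and return the descriptor to an idle
 * pool. */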
43 void
44 kibnal_tx_done (kib_tx_t *tx)
45 {
46         ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
47         unsigned long    flags;
48         int              i;
49         int              rc;
50
51         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
52         LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
53
54         switch (tx->tx_mapped) {
55         default:
56                 LBUG();
57
58         case KIB_TX_UNMAPPED:
59                 break;
60                 
61         case KIB_TX_MAPPED:
62                 if (in_interrupt()) {
63                         /* can't deregister memory in IRQ context... */
64                         kibnal_schedule_tx_done(tx);
65                         return;
66                 }
67                 rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
68                 LASSERT (rc == 0);
69                 tx->tx_mapped = KIB_TX_UNMAPPED;
70                 break;
71
72 #if IBNAL_FMR
73         case KIB_TX_MAPPED_FMR:
74                 if (in_interrupt() && tx->tx_status != 0) {
75                         /* can't flush FMRs in IRQ context... */
76                         kibnal_schedule_tx_done(tx);
77                         return;
78                 }              
79
80                 rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
81                 LASSERT (rc == 0);
82
83                 if (tx->tx_status != 0)
84                         ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
85                 tx->tx_mapped = KIB_TX_UNMAPPED;
86                 break;
87 #endif
88         }
89
90         for (i = 0; i < 2; i++) {
91                 /* tx may have up to 2 libmsgs to finalise */
92                 if (tx->tx_libmsg[i] == NULL)
93                         continue;
94
95                 lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
96                 tx->tx_libmsg[i] = NULL;
97         }
98         
99         if (tx->tx_conn != NULL) {
100                 kibnal_put_conn (tx->tx_conn);
101                 tx->tx_conn = NULL;
102         }
103
104         tx->tx_nsp = 0;
105         tx->tx_passive_rdma = 0;
106         tx->tx_status = 0;
107
108         spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
109
110         if (tx->tx_isnblk) {
111                 list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
112         } else {
113                 list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
114                 wake_up (&kibnal_data.kib_idle_tx_waitq);
115         }
116
117         spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
118 }
119
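/* Take a tx descriptor from the idle pool.  If 'may_block' the caller waits
 * for one to free up; otherwise it may dip into the reserved non-blocking
 * pool and can return NULL when that too is exhausted. */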
120 kib_tx_t *
121 kibnal_get_idle_tx (int may_block) 
122 {
123         unsigned long  flags;
124         kib_tx_t      *tx = NULL;
125         
126         for (;;) {
127                 spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
128
129                 /* "normal" descriptor is free */
130                 if (!list_empty (&kibnal_data.kib_idle_txs)) {
131                         tx = list_entry (kibnal_data.kib_idle_txs.next,
132                                          kib_tx_t, tx_list);
133                         break;
134                 }
135
136                 if (!may_block) {
137                         /* may dip into reserve pool */
138                         if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
139                                 CERROR ("reserved tx desc pool exhausted\n");
140                                 break;
141                         }
142
143                         tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
144                                          kib_tx_t, tx_list);
145                         break;
146                 }
147
148                 /* block for idle tx */
149                 spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
150
151                 wait_event (kibnal_data.kib_idle_tx_waitq,
152                             !list_empty (&kibnal_data.kib_idle_txs) ||
153                             kibnal_data.kib_shutdown);
154         }
155
156         if (tx != NULL) {
157                 list_del (&tx->tx_list);
158
159                 /* Allocate a new passive RDMA completion cookie.  It might
160                  * not be needed, but we've got a lock right now and we're
161                  * unlikely to wrap... */
162                 tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
163
164                 LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
165                 LASSERT (tx->tx_nsp == 0);
166                 LASSERT (tx->tx_sending == 0);
167                 LASSERT (tx->tx_status == 0);
168                 LASSERT (tx->tx_conn == NULL);
169                 LASSERT (!tx->tx_passive_rdma);
170                 LASSERT (!tx->tx_passive_rdma_wait);
171                 LASSERT (tx->tx_libmsg[0] == NULL);
172                 LASSERT (tx->tx_libmsg[1] == NULL);
173         }
174
175         spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
176         
177         return (tx);
178 }
179
180 int
181 kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
182 {
183         /* I would guess that if kibnal_get_peer (nid) == NULL,
184            and we're not routing, then 'nid' is very distant :) */
185         if ( nal->libnal_ni.ni_pid.nid == nid ) {
186                 *dist = 0;
187         } else {
188                 *dist = 1;
189         }
190
191         return 0;
192 }
193
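/* The peer has signalled (via a PUT/GET_DONE message) that the RDMA it was
 * driving has completed.  Find the tx waiting on this cookie, record the
 * status, and finalise it if no send callbacks are still outstanding. */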
194 void
195 kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
196 {
197         struct list_head *ttmp;
198         unsigned long     flags;
199         int               idle;
200
201         spin_lock_irqsave (&conn->ibc_lock, flags);
202
203         list_for_each (ttmp, &conn->ibc_active_txs) {
204                 kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
205
206                 LASSERT (tx->tx_passive_rdma ||
207                          !tx->tx_passive_rdma_wait);
208
209                 LASSERT (tx->tx_passive_rdma_wait ||
210                          tx->tx_sending != 0);
211
212                 if (!tx->tx_passive_rdma_wait ||
213                     tx->tx_passive_rdma_cookie != cookie)
214                         continue;
215
216                 CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
217
218                 tx->tx_status = status;
219                 tx->tx_passive_rdma_wait = 0;
220                 idle = (tx->tx_sending == 0);
221
222                 if (idle)
223                         list_del (&tx->tx_list);
224
225                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
226
227                 /* I could be racing with tx callbacks.  It's whoever
228                  * _makes_ tx idle that frees it */
229                 if (idle)
230                         kibnal_tx_done (tx);
231                 return;
232         }
233                 
234         spin_unlock_irqrestore (&conn->ibc_lock, flags);
235
236         CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
237                 cookie, conn->ibc_peer->ibp_nid);
238 }
239
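/* Re-post a receive buffer on its connection.  When 'do_credits' is set, the
 * freed buffer is counted as a flow-control credit to return to the peer,
 * which may in turn let queued sends proceed. */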
240 void
241 kibnal_post_rx (kib_rx_t *rx, int do_credits)
242 {
243         kib_conn_t   *conn = rx->rx_conn;
244         int           rc;
245         unsigned long flags;
246
247         rx->rx_gl = (struct ib_gather_scatter) {
248                 .address = rx->rx_vaddr,
249                 .length  = IBNAL_MSG_SIZE,
250                 .key     = conn->ibc_rx_pages->ibp_lkey,
251         };
252
253         rx->rx_sp = (struct ib_receive_param) {
254                 .work_request_id        = kibnal_ptr2wreqid(rx, 1),
255                 .scatter_list           = &rx->rx_gl,
256                 .num_scatter_entries    = 1,
257                 .device_specific        = NULL,
258                 .signaled               = 1,
259         };
260
261         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
262         LASSERT (!rx->rx_posted);
263         rx->rx_posted = 1;
264         mb();
265
266         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
267                 rc = -ECONNABORTED;
268         else
269                 rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
270
271         if (rc == 0) {
272                 if (do_credits) {
273                         spin_lock_irqsave(&conn->ibc_lock, flags);
274                         conn->ibc_outstanding_credits++;
275                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
276
277                         kibnal_check_sends(conn);
278                 }
279                 return;
280         }
281
282         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
283                 CERROR ("Error posting receive -> "LPX64": %d\n",
284                         conn->ibc_peer->ibp_nid, rc);
285                 kibnal_close_conn (rx->rx_conn, rc);
286         } else {
287                 CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
288                         conn->ibc_peer->ibp_nid, rc);
289         }
290
291         /* Drop rx's ref */
292         kibnal_put_conn (conn);
293 }
294
295 #if IBNAL_CKSUM
296 __u32 kibnal_cksum (void *ptr, int nob)
297 {
298         char  *c  = ptr;
299         __u32  sum = 0;
300
301         while (nob-- > 0)
302                 sum = ((sum << 1) | (sum >> 31)) + *c++;
303         
304         return (sum);
305 }
306 #endif
307
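/* Completion handler for receive work requests (may run in interrupt
 * context).  Validates and, if necessary, byte-flips the message header,
 * absorbs any credits the peer returned, deals with NOOP and DONE messages
 * here and queues everything else for kibnal_rx() in thread context. */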
308 void
309 kibnal_rx_callback (struct ib_cq_entry *e)
310 {
311         kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
312         kib_msg_t    *msg = rx->rx_msg;
313         kib_conn_t   *conn = rx->rx_conn;
314         int           nob = e->bytes_transferred;
315         const int     base_nob = offsetof(kib_msg_t, ibm_u);
316         int           credits;
317         int           flipped;
318         unsigned long flags;
319 #if IBNAL_CKSUM
320         __u32         msg_cksum;
321         __u32         computed_cksum;
322 #endif
323
324         CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
325         LASSERT (rx->rx_posted);
326         rx->rx_posted = 0;
327         mb();
328
329         /* receives complete with error in any case after we've started
330          * closing the QP */
331         if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
332                 goto failed;
333
334         /* We don't post receives until the conn is established */
335         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
336
337         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
338                 CERROR("Rx from "LPX64" failed: %d\n", 
339                        conn->ibc_peer->ibp_nid, e->status);
340                 goto failed;
341         }
342
343         if (nob < base_nob) {
344                 CERROR ("Short rx from "LPX64": %d\n",
345                         conn->ibc_peer->ibp_nid, nob);
346                 goto failed;
347         }
348
349         /* Receiver does any byte flipping if necessary... */
350
351         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
352                 flipped = 0;
353         } else {
354                 if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
355                         CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
356                                 msg->ibm_magic, conn->ibc_peer->ibp_nid);
357                         goto failed;
358                 }
359                 flipped = 1;
360                 __swab16s (&msg->ibm_version);
361                 LASSERT (sizeof(msg->ibm_type) == 1);
362                 LASSERT (sizeof(msg->ibm_credits) == 1);
363         }
364
365         if (msg->ibm_version != IBNAL_MSG_VERSION) {
366                 CERROR ("Incompatible msg version %d (%d expected)\n",
367                         msg->ibm_version, IBNAL_MSG_VERSION);
368                 goto failed;
369         }
370
371 #if IBNAL_CKSUM
372         if (nob != msg->ibm_nob) {
373                 CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
374                 goto failed;
375         }
376
377         msg_cksum = le32_to_cpu(msg->ibm_cksum);
378         msg->ibm_cksum = 0;
379         computed_cksum = kibnal_cksum (msg, nob);
380         
381         if (msg_cksum != computed_cksum) {
382                 CERROR ("Checksum failure %d (%d expected)\n",
383                         computed_cksum, msg_cksum);
384                 goto failed;
385         }
386         CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
387 #endif
388
389         /* Have I received credits that will let me send? */
390         credits = msg->ibm_credits;
391         if (credits != 0) {
392                 spin_lock_irqsave(&conn->ibc_lock, flags);
393                 conn->ibc_credits += credits;
394                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
395                 
396                 kibnal_check_sends(conn);
397         }
398
399         switch (msg->ibm_type) {
400         case IBNAL_MSG_NOOP:
401                 kibnal_post_rx (rx, 1);
402                 return;
403
404         case IBNAL_MSG_IMMEDIATE:
405                 if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
406                         CERROR ("Short IMMEDIATE from "LPX64": %d\n",
407                                 conn->ibc_peer->ibp_nid, nob);
408                         goto failed;
409                 }
410                 break;
411                 
412         case IBNAL_MSG_PUT_RDMA:
413         case IBNAL_MSG_GET_RDMA:
414                 if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
415                         CERROR ("Short RDMA msg from "LPX64": %d\n",
416                                 conn->ibc_peer->ibp_nid, nob);
417                         goto failed;
418                 }
419                 if (flipped) {
420                         __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
421                         __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
422                         __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
423                 }
424                 CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
425                        msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
426                        msg->ibm_u.rdma.ibrm_desc.rd_key,
427                        msg->ibm_u.rdma.ibrm_desc.rd_addr,
428                        msg->ibm_u.rdma.ibrm_desc.rd_nob);
429                 break;
430                 
431         case IBNAL_MSG_PUT_DONE:
432         case IBNAL_MSG_GET_DONE:
433                 if (nob < base_nob + sizeof (kib_completion_msg_t)) {
434                         CERROR ("Short COMPLETION msg from "LPX64": %d\n",
435                                 conn->ibc_peer->ibp_nid, nob);
436                         goto failed;
437                 }
438                 if (flipped)
439                         __swab32s(&msg->ibm_u.completion.ibcm_status);
440                 
441                 CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
442                        msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
443                        msg->ibm_u.completion.ibcm_status);
444
445                 kibnal_complete_passive_rdma (conn, 
446                                               msg->ibm_u.completion.ibcm_cookie,
447                                               msg->ibm_u.completion.ibcm_status);
448                 kibnal_post_rx (rx, 1);
449                 return;
450                         
451         default:
452                 CERROR ("Can't parse type from "LPX64": %d\n",
453                         conn->ibc_peer->ibp_nid, msg->ibm_type);
454                 goto failed;
455         }
456
457         /* schedule for kibnal_rx() in thread context */
458         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
459         
460         list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
461         wake_up (&kibnal_data.kib_sched_waitq);
462         
463         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
464         return;
465         
466  failed:
467         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
468         kibnal_close_conn(conn, -ECONNABORTED);
469
470         /* Don't re-post rx & drop its ref on conn */
471         kibnal_put_conn(conn);
472 }
473
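/* Thread-context half of receive processing: hand the portals header to
 * lib_parse() and, for a GET that nothing matched, send a failed completion
 * so the peer doesn't wait out the full timeout.  The buffer is re-posted
 * when done. */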
474 void
475 kibnal_rx (kib_rx_t *rx)
476 {
477         kib_msg_t   *msg = rx->rx_msg;
478
479         /* Clear flag so I can detect if I've sent an RDMA completion */
480         rx->rx_rdma = 0;
481
482         switch (msg->ibm_type) {
483         case IBNAL_MSG_GET_RDMA:
484                 lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
485                 /* If the incoming get was matched, I'll have initiated the
486                  * RDMA and the completion message... */
487                 if (rx->rx_rdma)
488                         break;
489
490                 /* Otherwise, I'll send a failed completion now to prevent
491                  * the peer's GET blocking for the full timeout. */
492                 CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
493                         rx->rx_conn->ibc_peer->ibp_nid);
494                 kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
495                                           rx, NULL, 0, NULL, NULL, 0, 0);
496                 break;
497                 
498         case IBNAL_MSG_PUT_RDMA:
499                 lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
500                 if (rx->rx_rdma)
501                         break;
502                 /* This is most unusual, since even if lib_parse() didn't
503                  * match anything, it should have asked us to read (and
504                  * discard) the payload.  The portals header must be
505                  * inconsistent with this message type, so it's the
506                  * sender's fault for sending garbage and she can time
507                  * herself out... */
508                 CERROR ("Uncompleted RDMA PUT from "LPX64"\n",
509                         rx->rx_conn->ibc_peer->ibp_nid);
510                 break;
511
512         case IBNAL_MSG_IMMEDIATE:
513                 lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
514                 LASSERT (!rx->rx_rdma);
515                 break;
516                 
517         default:
518                 LBUG();
519                 break;
520         }
521
522         kibnal_post_rx (rx, 1);
523 }
524
525 #if 0
526 int
527 kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
528 {
529         struct page *page;
530
531         if (vaddr >= VMALLOC_START &&
532             vaddr < VMALLOC_END)
533                 page = vmalloc_to_page ((void *)vaddr);
534 #if CONFIG_HIGHMEM
535         else if (vaddr >= PKMAP_BASE &&
536                  vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
537                 page = vmalloc_to_page ((void *)vaddr);
538         /* in 2.4 ^ just walks the page tables */
539 #endif
540         else
541                 page = virt_to_page (vaddr);
542
543         if (page == NULL ||
544             !VALID_PAGE (page))
545                 return (-EFAULT);
546
547         *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
548         return (0);
549 }
550 #endif
551
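/* Register a single contiguous iovec fragment with the HCA so it can be the
 * local target of an RDMA.  Payloads spanning multiple iovec fragments can't
 * be mapped this way. */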
552 int
553 kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
554                  int niov, struct iovec *iov, int offset, int nob)
555                  
556 {
557         void   *vaddr;
558         int     rc;
559
560         LASSERT (nob > 0);
561         LASSERT (niov > 0);
562         LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
563
564         while (offset >= iov->iov_len) {
565                 offset -= iov->iov_len;
566                 niov--;
567                 iov++;
568                 LASSERT (niov > 0);
569         }
570
571         if (nob > iov->iov_len - offset) {
572                 CERROR ("Can't map multiple vaddr fragments\n");
573                 return (-EMSGSIZE);
574         }
575
576         vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
577         tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
578
579         rc = ib_memory_register (kibnal_data.kib_pd,
580                                  vaddr, nob,
581                                  access,
582                                  &tx->tx_md.md_handle.mr,
583                                  &tx->tx_md.md_lkey,
584                                  &tx->tx_md.md_rkey);
585         
586         if (rc != 0) {
587                 CERROR ("Can't map vaddr: %d\n", rc);
588                 return (rc);
589         }
590
591         tx->tx_mapped = KIB_TX_MAPPED;
592         return (0);
593 }
594
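/* Map a kiov (array of page fragments) for RDMA.  The fragments must cover a
 * contiguous region (whole pages except possibly at the two ends); they are
 * turned into a physical buffer list and registered with the HCA, or bound
 * to an FMR when IBNAL_FMR is configured. */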
595 int
596 kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
597                   int nkiov, ptl_kiov_t *kiov,
598                   int offset, int nob)
599 {
600 #if IBNAL_FMR
601         __u64                      *phys;
602         const int                   mapped = KIB_TX_MAPPED_FMR;
603 #else
604         struct ib_physical_buffer  *phys;
605         const int                   mapped = KIB_TX_MAPPED;
606 #endif
607         int                         page_offset;
608         int                         nphys;
609         int                         resid;
610         int                         phys_size;
611         int                         rc;
612
613         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
614
615         LASSERT (nob > 0);
616         LASSERT (nkiov > 0);
617         LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
618
619         while (offset >= kiov->kiov_len) {
620                 offset -= kiov->kiov_len;
621                 nkiov--;
622                 kiov++;
623                 LASSERT (nkiov > 0);
624         }
625
626         phys_size = nkiov * sizeof (*phys);
627         PORTAL_ALLOC(phys, phys_size);
628         if (phys == NULL) {
629                 CERROR ("Can't allocate tmp phys\n");
630                 return (-ENOMEM);
631         }
632
633         page_offset = kiov->kiov_offset + offset;
634 #if IBNAL_FMR
635         phys[0] = kibnal_page2phys(kiov->kiov_page);
636 #else
637         phys[0].address = kibnal_page2phys(kiov->kiov_page);
638         phys[0].size = PAGE_SIZE;
639 #endif
640         nphys = 1;
641         resid = nob - (kiov->kiov_len - offset);
642
643         while (resid > 0) {
644                 kiov++;
645                 nkiov--;
646                 LASSERT (nkiov > 0);
647
648                 if (kiov->kiov_offset != 0 ||
649                     ((resid > PAGE_SIZE) && 
650                      kiov->kiov_len < PAGE_SIZE)) {
651                         int i;
652                         /* Can't have gaps */
653                         CERROR ("Can't make payload contiguous in I/O VM: "
654                                 "page %d, offset %d, len %d\n", nphys,
655                                 kiov->kiov_offset, kiov->kiov_len);
656
657                         for (i = -nphys; i < nkiov; i++) 
658                         {
659                                 CERROR("kiov[%d] %p +%d for %d\n",
660                                        i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
661                         }
662                         
663                         rc = -EINVAL;
664                         goto out;
665                 }
666
667                 if (nphys == PTL_MD_MAX_IOV) {
668                         CERROR ("payload too big (%d)\n", nphys);
669                         rc = -EMSGSIZE;
670                         goto out;
671                 }
672
673                 LASSERT (nphys * sizeof (*phys) < phys_size);
674 #if IBNAL_FMR
675                 phys[nphys] = kibnal_page2phys(kiov->kiov_page);
676 #else
677                 phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
678                 phys[nphys].size = PAGE_SIZE;
679 #endif
680                 nphys++;
681
682                 resid -= PAGE_SIZE;
683         }
684
685 #if 0
686         CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
687         for (rc = 0; rc < nphys; rc++)
688                 CWARN ("   [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
689 #endif
690         tx->tx_md.md_addr = IBNAL_RDMA_BASE;
691
692 #if IBNAL_FMR
693         rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
694                                        phys, nphys,
695                                        &tx->tx_md.md_addr,
696                                        page_offset,
697                                        &tx->tx_md.md_handle.fmr,
698                                        &tx->tx_md.md_lkey,
699                                        &tx->tx_md.md_rkey);
700 #else
701         rc = ib_memory_register_physical (kibnal_data.kib_pd,
702                                           phys, nphys,
703                                           &tx->tx_md.md_addr,
704                                           nob, page_offset,
705                                           access,
706                                           &tx->tx_md.md_handle.mr,
707                                           &tx->tx_md.md_lkey,
708                                           &tx->tx_md.md_rkey);
709 #endif
710         if (rc == 0) {
711                 CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
712                        nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
713                 tx->tx_mapped = mapped;
714         } else {
715                 CERROR ("Can't map phys: %d\n", rc);
716                 rc = -EFAULT;
717         }
718
719  out:
720         PORTAL_FREE(phys, phys_size);
721         return (rc);
722 }
723
724 kib_conn_t *
725 kibnal_find_conn_locked (kib_peer_t *peer)
726 {
727         struct list_head *tmp;
728
729         /* just return the first connection */
730         list_for_each (tmp, &peer->ibp_conns) {
731                 return (list_entry(tmp, kib_conn_t, ibc_list));
732         }
733
734         return (NULL);
735 }
736
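/* Push queued transmits out on 'conn' as far as flow control allows.  Each
 * outgoing message carries the receive credits owed to the peer
 * (ibc_outstanding_credits) and consumes one credit granted by the peer
 * (ibc_credits); the last credit is reserved for returning credits, and a
 * NOOP is generated when too many credits have accumulated with nothing
 * queued to carry them. */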
737 void
738 kibnal_check_sends (kib_conn_t *conn)
739 {
740         unsigned long   flags;
741         kib_tx_t       *tx;
742         int             rc;
743         int             i;
744         int             done;
745         int             nwork;
746
747         spin_lock_irqsave (&conn->ibc_lock, flags);
748
749         LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
750
751         if (list_empty(&conn->ibc_tx_queue) &&
752             conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
753                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
754                 
755                 tx = kibnal_get_idle_tx(0);     /* don't block */
756                 if (tx != NULL)
757                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
758
759                 spin_lock_irqsave(&conn->ibc_lock, flags);
760                 
761                 if (tx != NULL) {
762                         atomic_inc(&conn->ibc_refcount);
763                         kibnal_queue_tx_locked(tx, conn);
764                 }
765         }
766
767         while (!list_empty (&conn->ibc_tx_queue)) {
768                 tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
769
770                 /* We rely on this for QP sizing */
771                 LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
772
773                 LASSERT (conn->ibc_outstanding_credits >= 0);
774                 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
775                 LASSERT (conn->ibc_credits >= 0);
776                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
777
778                 /* Not on ibc_rdma_queue */
779                 LASSERT (!tx->tx_passive_rdma_wait);
780
781                 if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
782                         break;
783
784                 if (conn->ibc_credits == 0)     /* no credits */
785                         break;
786                 
787                 if (conn->ibc_credits == 1 &&   /* last credit reserved for */
788                     conn->ibc_outstanding_credits == 0) /* giving back credits */
789                         break;
790
791                 list_del (&tx->tx_list);
792
793                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
794                     (!list_empty(&conn->ibc_tx_queue) ||
795                      conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
796                         /* redundant NOOP */
797                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
798                         kibnal_tx_done(tx);
799                         spin_lock_irqsave(&conn->ibc_lock, flags);
800                         continue;
801                 }
802
803                 tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
804                 conn->ibc_outstanding_credits = 0;
805
806                 conn->ibc_nsends_posted++;
807                 conn->ibc_credits--;
808
809                 tx->tx_sending = tx->tx_nsp;
810                 tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
811                 list_add (&tx->tx_list, &conn->ibc_active_txs);
812 #if IBNAL_CKSUM
813                 tx->tx_msg->ibm_cksum = 0;
814                 tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
815                 CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
816 #endif
817                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
818
819                 /* NB the gap between removing tx from the queue and sending it
820                  * allows message re-ordering to occur */
821
822                 LASSERT (tx->tx_nsp > 0);
823
824                 rc = -ECONNABORTED;
825                 nwork = 0;
826                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
827                         tx->tx_status = 0;
828                         /* Driver only accepts 1 item at a time */
829                         for (i = 0; i < tx->tx_nsp; i++) {
830                                 rc = ib_send (conn->ibc_qp, &tx->tx_sp[i], 1);
831                                 if (rc != 0)
832                                         break;
833                                 nwork++;
834                         }
835                 }
836
837                 spin_lock_irqsave (&conn->ibc_lock, flags);
838                 if (rc != 0) {
839                         /* NB credits are transferred in the actual
840                          * message, which can only be the last work item */
841                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
842                         conn->ibc_credits++;
843                         conn->ibc_nsends_posted--;
844
845                         tx->tx_status = rc;
846                         tx->tx_passive_rdma_wait = 0;
847                         tx->tx_sending -= tx->tx_nsp - nwork;
848
849                         done = (tx->tx_sending == 0);
850                         if (done)
851                                 list_del (&tx->tx_list);
852                         
853                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
854                         
855                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
856                                 CERROR ("Error %d posting transmit to "LPX64"\n", 
857                                         rc, conn->ibc_peer->ibp_nid);
858                         else
859                                 CDEBUG (D_NET, "Error %d posting transmit to "
860                                         LPX64"\n", rc, conn->ibc_peer->ibp_nid);
861
862                         kibnal_close_conn (conn, rc);
863
864                         if (done)
865                                 kibnal_tx_done (tx);
866                         return;
867                 }
868                 
869         }
870
871         spin_unlock_irqrestore (&conn->ibc_lock, flags);
872 }
873
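/* Completion handler for send work requests.  The tx is finalised only by
 * whoever makes it idle: when its last send has completed and it isn't still
 * waiting on a passive RDMA.  An extra conn ref is held across the handler
 * so the connection can't vanish underneath it. */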
874 void
875 kibnal_tx_callback (struct ib_cq_entry *e)
876 {
877         kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
878         kib_conn_t   *conn;
879         unsigned long flags;
880         int           idle;
881
882         conn = tx->tx_conn;
883         LASSERT (conn != NULL);
884         LASSERT (tx->tx_sending != 0);
885
886         spin_lock_irqsave(&conn->ibc_lock, flags);
887
888         CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
889                tx->tx_nsp - tx->tx_sending, tx->tx_nsp,
890                e->status);
891
892         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
893          * gets to free it, which also drops its ref on 'conn'.  If it's
894          * not me, then I take an extra ref on conn so it can't disappear
895          * under me. */
896
897         tx->tx_sending--;
898         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
899                (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
900         if (idle)
901                 list_del(&tx->tx_list);
902
903         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
904                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
905                atomic_read (&conn->ibc_refcount));
906         atomic_inc (&conn->ibc_refcount);
907
908         if (tx->tx_sending == 0)
909                 conn->ibc_nsends_posted--;
910
911         if (e->status != IB_COMPLETION_STATUS_SUCCESS &&
912             tx->tx_status == 0)
913                 tx->tx_status = -ECONNABORTED;
914                 
915         spin_unlock_irqrestore(&conn->ibc_lock, flags);
916
917         if (idle)
918                 kibnal_tx_done (tx);
919
920         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
921                 CERROR ("Tx completion to "LPX64" failed: %d\n", 
922                         conn->ibc_peer->ibp_nid, e->status);
923                 kibnal_close_conn (conn, -ENETDOWN);
924         } else {
925                 /* can I shovel some more sends out the door? */
926                 kibnal_check_sends(conn);
927         }
928
929         kibnal_put_conn (conn);
930 }
931
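/* Single completion handler for the CQ: a flag encoded in the work request
 * id (set when a receive is posted) distinguishes receive completions from
 * send completions. */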
932 void
933 kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
934 {
935         if (kibnal_wreqid_is_rx(e->work_request_id))
936                 kibnal_rx_callback (e);
937         else
938                 kibnal_tx_callback (e);
939 }
940
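/* Append a SEND work item to 'tx' carrying an immediate message of the given
 * type.  PUT_DONE is fenced when it follows an RDMA work item so the
 * completion can't overtake the RDMA read. */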
941 void
942 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
943 {
944         struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
945         struct ib_send_param     *sp = &tx->tx_sp[tx->tx_nsp];
946         int                       fence;
947         int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
948
949         LASSERT (tx->tx_nsp >= 0 && 
950                  tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
951         LASSERT (nob <= IBNAL_MSG_SIZE);
952         
953         tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
954         tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
955         tx->tx_msg->ibm_type = type;
956 #if IBNAL_CKSUM
957         tx->tx_msg->ibm_nob = nob;
958 #endif
959         /* Fence the message if it's bundled with an RDMA read */
960         fence = (tx->tx_nsp > 0) &&
961                 (type == IBNAL_MSG_PUT_DONE);
962
963         *gl = (struct ib_gather_scatter) {
964                 .address = tx->tx_vaddr,
965                 .length  = nob,
966                 .key     = kibnal_data.kib_tx_pages->ibp_lkey,
967         };
968
969         /* NB If this is an RDMA read, the completion message must wait for
970          * the RDMA to complete.  Sends wait for previous RDMA writes
971          * anyway... */
972         *sp = (struct ib_send_param) {
973                 .work_request_id      = kibnal_ptr2wreqid(tx, 0),
974                 .op                   = IB_OP_SEND,
975                 .gather_list          = gl,
976                 .num_gather_entries   = 1,
977                 .device_specific      = NULL,
978                 .solicited_event      = 1,
979                 .signaled             = 1,
980                 .immediate_data_valid = 0,
981                 .fence                = fence,
982                 .inline_data          = 0,
983         };
984
985         tx->tx_nsp++;
986 }
987
988 void
989 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
990 {
991         unsigned long         flags;
992
993         spin_lock_irqsave(&conn->ibc_lock, flags);
994
995         kibnal_queue_tx_locked (tx, conn);
996         
997         spin_unlock_irqrestore(&conn->ibc_lock, flags);
998         
999         kibnal_check_sends(conn);
1000 }
1001
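/* Commit 'tx' to 'nid': queue it on an existing connection if there is one;
 * otherwise park it on the peer's queue and, if a connection attempt isn't
 * already in progress, hand the peer to the connection daemon.  Any failure
 * completes the tx with an error. */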
1002 void
1003 kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
1004 {
1005         unsigned long    flags;
1006         kib_peer_t      *peer;
1007         kib_conn_t      *conn;
1008         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
1009
1010         /* If I get here, I've committed to send, so I complete the tx with
1011          * failure on any problems */
1012         
1013         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
1014         LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
1015
1016         read_lock (g_lock);
1017         
1018         peer = kibnal_find_peer_locked (nid);
1019         if (peer == NULL) {
1020                 read_unlock (g_lock);
1021                 tx->tx_status = -EHOSTUNREACH;
1022                 kibnal_tx_done (tx);
1023                 return;
1024         }
1025
1026         conn = kibnal_find_conn_locked (peer);
1027         if (conn != NULL) {
1028                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1029                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1030                        atomic_read (&conn->ibc_refcount));
1031                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
1032                 read_unlock (g_lock);
1033                 
1034                 kibnal_queue_tx (tx, conn);
1035                 return;
1036         }
1037         
1038         /* Making one or more connections; I'll need a write lock... */
1039         read_unlock (g_lock);
1040         write_lock_irqsave (g_lock, flags);
1041
1042         peer = kibnal_find_peer_locked (nid);
1043         if (peer == NULL) {
1044                 write_unlock_irqrestore (g_lock, flags);
1045                 tx->tx_status = -EHOSTUNREACH;
1046                 kibnal_tx_done (tx);
1047                 return;
1048         }
1049
1050         conn = kibnal_find_conn_locked (peer);
1051         if (conn != NULL) {
1052                 /* Connection exists; queue message on it */
1053                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1054                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1055                        atomic_read (&conn->ibc_refcount));
1056                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
1057                 write_unlock_irqrestore (g_lock, flags);
1058                 
1059                 kibnal_queue_tx (tx, conn);
1060                 return;
1061         }
1062
1063         if (peer->ibp_connecting == 0) {
1064                 if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
1065                         write_unlock_irqrestore (g_lock, flags);
1066                         tx->tx_status = -EHOSTUNREACH;
1067                         kibnal_tx_done (tx);
1068                         return;
1069                 }
1070         
1071                 peer->ibp_connecting = 1;
1072                 atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
1073         
1074                 spin_lock (&kibnal_data.kib_connd_lock);
1075         
1076                 list_add_tail (&peer->ibp_connd_list,
1077                                &kibnal_data.kib_connd_peers);
1078                 wake_up (&kibnal_data.kib_connd_waitq);
1079         
1080                 spin_unlock (&kibnal_data.kib_connd_lock);
1081         }
1082         
1083         /* A connection is being established; queue the message... */
1084         list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1085
1086         write_unlock_irqrestore (g_lock, flags);
1087 }
1088
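/* Set up a PUT or GET in which the peer drives the data movement: map the
 * local buffer, advertise its RDMA address/key/size in the request message,
 * and let the matching DONE message complete the tx. */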
1089 ptl_err_t
1090 kibnal_start_passive_rdma (int type, ptl_nid_t nid,
1091                             lib_msg_t *libmsg, ptl_hdr_t *hdr)
1092 {
1093         int         nob = libmsg->md->length;
1094         kib_tx_t   *tx;
1095         kib_msg_t  *ibmsg;
1096         int         rc;
1097         int         access;
1098         
1099         LASSERT (type == IBNAL_MSG_PUT_RDMA || 
1100                  type == IBNAL_MSG_GET_RDMA);
1101         LASSERT (nob > 0);
1102         LASSERT (!in_interrupt());              /* Mapping could block */
1103
1104         if (type == IBNAL_MSG_PUT_RDMA) {
1105                 access = IB_ACCESS_REMOTE_READ;
1106         } else {
1107                 access = IB_ACCESS_REMOTE_WRITE |
1108                          IB_ACCESS_LOCAL_WRITE;
1109         }
1110
1111         tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
1112         LASSERT (tx != NULL);
1113
1114         if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
1115                 rc = kibnal_map_iov (tx, access,
1116                                      libmsg->md->md_niov,
1117                                      libmsg->md->md_iov.iov,
1118                                      0, nob);
1119         else
1120                 rc = kibnal_map_kiov (tx, access,
1121                                       libmsg->md->md_niov, 
1122                                       libmsg->md->md_iov.kiov,
1123                                       0, nob);
1124
1125         if (rc != 0) {
1126                 CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
1127                 goto failed;
1128         }
1129         
1130         if (type == IBNAL_MSG_GET_RDMA) {
1131                 /* reply gets finalized when tx completes */
1132                 tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
1133                                                         nid, libmsg);
1134                 if (tx->tx_libmsg[1] == NULL) {
1135                         CERROR ("Can't create reply for GET -> "LPX64"\n",
1136                                 nid);
1137                         rc = -ENOMEM;
1138                         goto failed;
1139                 }
1140         }
1141         
1142         tx->tx_passive_rdma = 1;
1143
1144         ibmsg = tx->tx_msg;
1145
1146         ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
1147         ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
1148         ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
1149         ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
1150         ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
1151
1152         kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
1153
1154         CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
1155                LPX64", nob %d\n",
1156                tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
1157                tx->tx_md.md_addr, nob);
1158         
1159         /* libmsg gets finalized when tx completes. */
1160         tx->tx_libmsg[0] = libmsg;
1161
1162         kibnal_launch_tx(tx, nid);
1163         return (PTL_OK);
1164
1165  failed:
1166         tx->tx_status = rc;
1167         kibnal_tx_done (tx);
1168         return (PTL_FAIL);
1169 }
1170
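/* Drive the data movement for a peer's RDMA request: RDMA-read the payload
 * for a PUT, RDMA-write it for a GET, and bundle the PUT/GET_DONE completion
 * on the same tx so it is sent after the transfer (or alone, if there is no
 * data or the mapping failed). */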
1171 void
1172 kibnal_start_active_rdma (int type, int status,
1173                            kib_rx_t *rx, lib_msg_t *libmsg, 
1174                            unsigned int niov,
1175                            struct iovec *iov, ptl_kiov_t *kiov,
1176                            int offset, int nob)
1177 {
1178         kib_msg_t    *rxmsg = rx->rx_msg;
1179         kib_msg_t    *txmsg;
1180         kib_tx_t     *tx;
1181         int           access;
1182         int           rdma_op;
1183         int           rc;
1184
1185         CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
1186                type, status, niov, offset, nob);
1187
1188         /* Called by scheduler */
1189         LASSERT (!in_interrupt ());
1190
1191         /* Either all pages or all vaddrs */
1192         LASSERT (!(kiov != NULL && iov != NULL));
1193
1194         /* No data if we're completing with failure */
1195         LASSERT (status == 0 || nob == 0);
1196
1197         LASSERT (type == IBNAL_MSG_GET_DONE ||
1198                  type == IBNAL_MSG_PUT_DONE);
1199
1200         /* Flag I'm completing the RDMA.  Even if I fail to send the
1201          * completion message, I will have tried my best so further
1202          * attempts shouldn't be tried. */
1203         LASSERT (!rx->rx_rdma);
1204         rx->rx_rdma = 1;
1205
1206         if (type == IBNAL_MSG_GET_DONE) {
1207                 access   = 0;
1208                 rdma_op  = IB_OP_RDMA_WRITE;
1209                 LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
1210         } else {
1211                 access   = IB_ACCESS_LOCAL_WRITE;
1212                 rdma_op  = IB_OP_RDMA_READ;
1213                 LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
1214         }
1215
1216         tx = kibnal_get_idle_tx (0);           /* Mustn't block */
1217         if (tx == NULL) {
1218                 CERROR ("tx descs exhausted on RDMA from "LPX64
1219                         " completing locally with failure\n",
1220                         rx->rx_conn->ibc_peer->ibp_nid);
1221                 lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
1222                 return;
1223         }
1224         LASSERT (tx->tx_nsp == 0);
1225                         
1226         if (nob != 0) {
1227                 /* We actually need to transfer some data (the transfer
1228                  * size could get truncated to zero when the incoming
1229                  * message is matched) */
1230
1231                 if (kiov != NULL)
1232                         rc = kibnal_map_kiov (tx, access,
1233                                               niov, kiov, offset, nob);
1234                 else
1235                         rc = kibnal_map_iov (tx, access,
1236                                              niov, iov, offset, nob);
1237                 
1238                 if (rc != 0) {
1239                         CERROR ("Can't map RDMA -> "LPX64": %d\n", 
1240                                 rx->rx_conn->ibc_peer->ibp_nid, rc);
1241                         /* We'll skip the RDMA and complete with failure. */
1242                         status = rc;
1243                         nob = 0;
1244                 } else {
1245                         tx->tx_gl[0] = (struct ib_gather_scatter) {
1246                                 .address = tx->tx_md.md_addr,
1247                                 .length  = nob,
1248                                 .key     = tx->tx_md.md_lkey,
1249                         };
1250                 
1251                         tx->tx_sp[0] = (struct ib_send_param) {
1252                                 .work_request_id      = kibnal_ptr2wreqid(tx, 0),
1253                                 .op                   = rdma_op,
1254                                 .gather_list          = &tx->tx_gl[0],
1255                                 .num_gather_entries   = 1,
1256                                 .remote_address       = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
1257                                 .rkey                 = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
1258                                 .device_specific      = NULL,
1259                                 .solicited_event      = 0,
1260                                 .signaled             = 1,
1261                                 .immediate_data_valid = 0,
1262                                 .fence                = 0,
1263                                 .inline_data          = 0,
1264                         };
1265
1266                         tx->tx_nsp = 1;
1267                 }
1268         }
1269
1270         txmsg = tx->tx_msg;
1271
1272         txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
1273         txmsg->ibm_u.completion.ibcm_status = status;
1274         
1275         kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
1276
1277         if (status == 0 && nob != 0) {
1278                 LASSERT (tx->tx_nsp > 1);
1279                 /* RDMA: libmsg gets finalized when the tx completes.  This
1280                  * is after the completion message has been sent, which in
1281                  * turn is after the RDMA has finished. */
1282                 tx->tx_libmsg[0] = libmsg;
1283         } else {
1284                 LASSERT (tx->tx_nsp == 1);
1285                 /* No RDMA: local completion happens now! */
1286                 CDEBUG(D_WARNING,"No data: immediate completion\n");
1287                 lib_finalize (&kibnal_lib, NULL, libmsg,
1288                               status == 0 ? PTL_OK : PTL_FAIL);
1289         }
1290
1291         /* +1 ref for this tx... */
1292         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1293                rx->rx_conn, rx->rx_conn->ibc_state, 
1294                rx->rx_conn->ibc_peer->ibp_nid,
1295                atomic_read (&rx->rx_conn->ibc_refcount));
1296         atomic_inc (&rx->rx_conn->ibc_refcount);
1297         /* ...and queue it up */
1298         kibnal_queue_tx(tx, rx->rx_conn);
1299 }
1300
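/* Common send path.  A REPLY to an RDMA GET becomes an active RDMA; PUTs and
 * GETs whose payload (or expected reply) won't fit in IBNAL_MSG_SIZE are set
 * up as passive RDMAs; everything else is copied into an IMMEDIATE message. */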
1301 ptl_err_t
1302 kibnal_sendmsg(lib_nal_t    *nal, 
1303                 void         *private,
1304                 lib_msg_t    *libmsg,
1305                 ptl_hdr_t    *hdr, 
1306                 int           type, 
1307                 ptl_nid_t     nid, 
1308                 ptl_pid_t     pid,
1309                 unsigned int  payload_niov, 
1310                 struct iovec *payload_iov, 
1311                 ptl_kiov_t   *payload_kiov,
1312                 int           payload_offset,
1313                 int           payload_nob)
1314 {
1315         kib_msg_t  *ibmsg;
1316         kib_tx_t   *tx;
1317         int         nob;
1318
1319         /* NB 'private' is different depending on what we're sending.... */
1320
1321         CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64" pid %d\n",
1322                payload_nob, payload_niov, nid, pid);
1323
1324         LASSERT (payload_nob == 0 || payload_niov > 0);
1325         LASSERT (payload_niov <= PTL_MD_MAX_IOV);
1326
1327         /* Thread context if we're sending payload */
1328         LASSERT (!in_interrupt() || payload_niov == 0);
1329         /* payload is either all vaddrs or all pages */
1330         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1331
1332         switch (type) {
1333         default:
1334                 LBUG();
1335                 return (PTL_FAIL);
1336                 
1337         case PTL_MSG_REPLY: {
1338                 /* reply's 'private' is the incoming receive */
1339                 kib_rx_t *rx = private;
1340
1341                 /* RDMA reply expected? */
1342                 if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
1343                         kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
1344                                                  rx, libmsg, payload_niov, 
1345                                                  payload_iov, payload_kiov,
1346                                                  payload_offset, payload_nob);
1347                         return (PTL_OK);
1348                 }
1349                 
1350                 /* Incoming message consistent with immediate reply? */
1351                 if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
1352                         CERROR ("REPLY to "LPX64" bad msg type %d!!!\n",
1353                                 nid, rx->rx_msg->ibm_type);
1354                         return (PTL_FAIL);
1355                 }
1356
1357                 /* Will it fit in a message? */
1358                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1359                 if (nob >= IBNAL_MSG_SIZE) {
1360                         CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
1361                                nid, payload_nob);
1362                         return (PTL_FAIL);
1363                 }
1364                 break;
1365         }
1366
1367         case PTL_MSG_GET:
1368                 /* might the REPLY message be big enough to need RDMA? */
1369                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
1370                 if (nob > IBNAL_MSG_SIZE)
1371                         return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
1372                                                           nid, libmsg, hdr));
1373                 break;
1374
1375         case PTL_MSG_ACK:
1376                 LASSERT (payload_nob == 0);
1377                 break;
1378
1379         case PTL_MSG_PUT:
1380                 /* Is the payload big enough to need RDMA? */
1381                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1382                 if (nob > IBNAL_MSG_SIZE)
1383                         return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
1384                                                           nid, libmsg, hdr));
1385                 
1386                 break;
1387         }
1388
1389         tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
1390                                   type == PTL_MSG_REPLY ||
1391                                   in_interrupt()));
1392         if (tx == NULL) {
1393                 CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
1394                         type, nid, in_interrupt() ? " (intr)" : "");
1395                 return (PTL_NO_SPACE);
1396         }
1397
1398         ibmsg = tx->tx_msg;
1399         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1400
1401         if (payload_nob > 0) {
1402                 if (payload_kiov != NULL)
1403                         lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1404                                           payload_niov, payload_kiov,
1405                                           payload_offset, payload_nob);
1406                 else
1407                         lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1408                                          payload_niov, payload_iov,
1409                                          payload_offset, payload_nob);
1410         }
1411
1412         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
1413                             offsetof(kib_immediate_msg_t, 
1414                                      ibim_payload[payload_nob]));
1415
1416         /* libmsg gets finalized when tx completes */
1417         tx->tx_libmsg[0] = libmsg;
1418
1419         kibnal_launch_tx(tx, nid);
1420         return (PTL_OK);
1421 }
1422
1423 ptl_err_t
1424 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1425                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1426                unsigned int payload_niov, struct iovec *payload_iov,
1427                size_t payload_offset, size_t payload_len)
1428 {
1429         return (kibnal_sendmsg(nal, private, cookie,
1430                                hdr, type, nid, pid,
1431                                payload_niov, payload_iov, NULL,
1432                                payload_offset, payload_len));
1433 }
1434
1435 ptl_err_t
1436 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
1437                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1438                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
1439                      size_t payload_offset, size_t payload_len)
1440 {
1441         return (kibnal_sendmsg(nal, private, cookie,
1442                                hdr, type, nid, pid,
1443                                payload_niov, NULL, payload_kiov,
1444                                payload_offset, payload_len));
1445 }
1446
1447 ptl_err_t
1448 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
1449                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
1450                  int offset, int mlen, int rlen)
1451 {
1452         kib_rx_t    *rx = private;
1453         kib_msg_t   *rxmsg = rx->rx_msg;
1454         int          msg_nob;
1455         
1456         LASSERT (mlen <= rlen);
1457         LASSERT (!in_interrupt ());
1458         /* Either all pages or all vaddrs */
1459         LASSERT (!(kiov != NULL && iov != NULL));
1460
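             /* Dispatch on the incoming message type: payload packed inline
              * (IMMEDIATE), a GET header whose reply goes out separately, or
              * a PUT whose payload is transferred by RDMA. */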
1461         switch (rxmsg->ibm_type) {
1462         default:
1463                 LBUG();
1464                 return (PTL_FAIL);
1465                 
1466         case IBNAL_MSG_IMMEDIATE:
1467                 msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1468                 if (msg_nob > IBNAL_MSG_SIZE) {
1469                         CERROR ("Immediate message from "LPX64" too big: %d\n",
1470                                 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
1471                         return (PTL_FAIL);
1472                 }
1473
1474                 if (kiov != NULL)
1475                         lib_copy_buf2kiov(niov, kiov, offset,
1476                                           rxmsg->ibm_u.immediate.ibim_payload,
1477                                           mlen);
1478                 else
1479                         lib_copy_buf2iov(niov, iov, offset,
1480                                          rxmsg->ibm_u.immediate.ibim_payload,
1481                                          mlen);
1482
1483                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1484                 return (PTL_OK);
1485
1486         case IBNAL_MSG_GET_RDMA:
1487                 /* We get called here just to discard any junk after the
1488                  * GET hdr. */
1489                 LASSERT (libmsg == NULL);
1490                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1491                 return (PTL_OK);
1492
1493         case IBNAL_MSG_PUT_RDMA:
1494                 kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
1495                                           rx, libmsg, 
1496                                           niov, iov, kiov, offset, mlen);
1497                 return (PTL_OK);
1498         }
1499 }
1500
1501 ptl_err_t
1502 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
1503               unsigned int niov, struct iovec *iov, 
1504               size_t offset, size_t mlen, size_t rlen)
1505 {
1506         return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
1507                                 offset, mlen, rlen));
1508 }
1509
1510 ptl_err_t
1511 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
1512                      unsigned int niov, ptl_kiov_t *kiov, 
1513                      size_t offset, size_t mlen, size_t rlen)
1514 {
1515         return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
1516                                 offset, mlen, rlen));
1517 }
1518
1519 int
1520 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1521 {
1522         long    pid = kernel_thread (fn, arg, 0);
1523
1524         if (pid < 0)
1525                 return ((int)pid);
1526
1527         atomic_inc (&kibnal_data.kib_nthreads);
1528         return (0);
1529 }
1530
1531 void
1532 kibnal_thread_fini (void)
1533 {
1534         atomic_dec (&kibnal_data.kib_nthreads);
1535 }
1536
1537 void
1538 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1539 {
1540         /* This just does the immediate housekeeping, and schedules the
1541          * connection for the connd to finish off.
1542          * Caller holds kib_global_lock exclusively in irq context */
1543         kib_peer_t   *peer = conn->ibc_peer;
1544
1545         CDEBUG (error == 0 ? D_NET : D_ERROR,
1546                 "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
1547         
1548         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
1549                  conn->ibc_state == IBNAL_CONN_CONNECTING);
1550
1551         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
1552                 /* kib_connd_conns takes ibc_list's ref */
1553                 list_del (&conn->ibc_list);
1554         } else {
1555                 /* new ref for kib_connd_conns */
1556                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1557                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1558                        atomic_read (&conn->ibc_refcount));
1559                 atomic_inc (&conn->ibc_refcount);
1560         }
1561         
1562         if (list_empty (&peer->ibp_conns) &&
1563             peer->ibp_persistence == 0) {
1564                 /* Non-persistent peer with no more conns... */
1565                 kibnal_unlink_peer_locked (peer);
1566         }
1567
1568         conn->ibc_state = IBNAL_CONN_DEATHROW;
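             /* DEATHROW: the connd issues the CM disconnect; the conn then
              * becomes a ZOMBIE and is freed once its last ref is dropped. */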
1569
1570         /* Schedule conn for closing/destruction */
1571         spin_lock (&kibnal_data.kib_connd_lock);
1572
1573         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1574         wake_up (&kibnal_data.kib_connd_waitq);
1575                 
1576         spin_unlock (&kibnal_data.kib_connd_lock);
1577 }
1578
1579 int
1580 kibnal_close_conn (kib_conn_t *conn, int why)
1581 {
1582         unsigned long     flags;
1583         int               count = 0;
1584
1585         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1586
1587         LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
1588         
1589         if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
1590                 count = 1;
1591                 kibnal_close_conn_locked (conn, why);
1592         }
1593         
1594         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1595         return (count);
1596 }
1597
1598 void
1599 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
1600 {
1601         LIST_HEAD        (zombies);
1602         kib_tx_t         *tx;
1603         unsigned long     flags;
1604
1605         LASSERT (rc != 0);
1606         LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
1607
1608         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1609
1610         LASSERT (peer->ibp_connecting != 0);
1611         peer->ibp_connecting--;
1612
1613         if (peer->ibp_connecting != 0) {
1614                 /* another connection attempt under way (loopback?)... */
1615                 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1616                 return;
1617         }
1618
1619         if (list_empty(&peer->ibp_conns)) {
1620                 /* Say when active connection can be re-attempted */
1621                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
1622                 /* Increase reconnection interval */
1623                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
1624                                                     IBNAL_MAX_RECONNECT_INTERVAL);
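                     /* i.e. exponential backoff: with a (purely illustrative)
                      * 1 second minimum, retries would be allowed after 1, 2,
                      * 4, 8... seconds, capped at IBNAL_MAX_RECONNECT_INTERVAL. */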
1625         
1626                 /* Take peer's blocked transmits; I'll complete
1627                  * them with error */
1628                 while (!list_empty (&peer->ibp_tx_queue)) {
1629                         tx = list_entry (peer->ibp_tx_queue.next,
1630                                          kib_tx_t, tx_list);
1631                         
1632                         list_del (&tx->tx_list);
1633                         list_add_tail (&tx->tx_list, &zombies);
1634                 }
1635                 
1636                 if (kibnal_peer_active(peer) &&
1637                     (peer->ibp_persistence == 0)) {
1638                         /* failed connection attempt on non-persistent peer */
1639                         kibnal_unlink_peer_locked (peer);
1640                 }
1641         } else {
1642                 /* Can't have blocked transmits if there are connections */
1643                 LASSERT (list_empty(&peer->ibp_tx_queue));
1644         }
1645         
1646         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1647
1648         if (!list_empty (&zombies))
1649                 CERROR ("Deleting messages for "LPX64": connection failed\n",
1650                         peer->ibp_nid);
1651
1652         while (!list_empty (&zombies)) {
1653                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1654
1655                 list_del (&tx->tx_list);
1656                 /* complete now */
1657                 tx->tx_status = -EHOSTUNREACH;
1658                 kibnal_tx_done (tx);
1659         }
1660 }
1661
1662 void
1663 kibnal_connreq_done (kib_conn_t *conn, int active, int status)
1664 {
1665         int               state = conn->ibc_state;
1666         kib_peer_t       *peer = conn->ibc_peer;
1667         kib_tx_t         *tx;
1668         unsigned long     flags;
1669         int               rc;
1670         int               i;
1671
1672         /* passive connection has no connreq & vice versa */
1673         LASSERT (!active == !(conn->ibc_connreq != NULL));
1674         if (active) {
1675                 PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
1676                 conn->ibc_connreq = NULL;
1677         }
1678
1679         if (state == IBNAL_CONN_CONNECTING) {
1680                 /* Install common (active/passive) callback for
1681                  * disconnect/idle notification if I got as far as getting
1682                  * a CM comm_id */
1683                 rc = tsIbCmCallbackModify(conn->ibc_comm_id, 
1684                                           kibnal_conn_callback, conn);
1685                 LASSERT (rc == 0);
1686         }
1687         
1688         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1689
1690         LASSERT (peer->ibp_connecting != 0);
1691         
1692         if (status == 0) {                         
1693                 /* connection established... */
1694                 LASSERT (state == IBNAL_CONN_CONNECTING);
1695                 conn->ibc_state = IBNAL_CONN_ESTABLISHED;
1696
1697                 if (!kibnal_peer_active(peer)) {
1698                         /* ...but peer deleted meantime */
1699                         status = -ECONNABORTED;
1700                 }
1701         } else {
1702                 LASSERT (state == IBNAL_CONN_INIT_QP ||
1703                          state == IBNAL_CONN_CONNECTING);
1704         }
1705
1706         if (status == 0) {
1707                 /* Everything worked! */
1708
1709                 peer->ibp_connecting--;
1710
1711                 /* +1 ref for ibc_list; caller(== CM)'s ref remains until
1712                  * the IB_CM_IDLE callback */
1713                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1714                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1715                        atomic_read (&conn->ibc_refcount));
1716                 atomic_inc (&conn->ibc_refcount);
1717                 list_add (&conn->ibc_list, &peer->ibp_conns);
1718                 
1719                 /* reset reconnect interval for next attempt */
1720                 peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
1721
1722                 /* post blocked sends to the new connection */
1723                 spin_lock (&conn->ibc_lock);
1724                 
1725                 while (!list_empty (&peer->ibp_tx_queue)) {
1726                         tx = list_entry (peer->ibp_tx_queue.next, 
1727                                          kib_tx_t, tx_list);
1728                         
1729                         list_del (&tx->tx_list);
1730
1731                         /* +1 ref for each tx */
1732                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1733                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1734                                atomic_read (&conn->ibc_refcount));
1735                         atomic_inc (&conn->ibc_refcount);
1736                         kibnal_queue_tx_locked (tx, conn);
1737                 }
1738                 
1739                 spin_unlock (&conn->ibc_lock);
1740
1741                 /* Nuke any dangling conns from a different peer instance... */
1742                 kibnal_close_stale_conns_locked (conn->ibc_peer,
1743                                                  conn->ibc_incarnation);
1744
1745                 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1746
1747                 /* queue up all the receives */
1748                 for (i = 0; i < IBNAL_RX_MSGS; i++) {
1749                         /* +1 ref for rx desc */
1750                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1751                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1752                                atomic_read (&conn->ibc_refcount));
1753                         atomic_inc (&conn->ibc_refcount);
1754
1755                         CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
1756                                i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
1757                                conn->ibc_rxs[i].rx_vaddr);
1758
1759                         kibnal_post_rx (&conn->ibc_rxs[i], 0);
1760                 }
1761
1762                 kibnal_check_sends (conn);
1763                 return;
1764         }
1765
1766         /* connection failed */
1767         if (state == IBNAL_CONN_CONNECTING) {
1768                 /* schedule for connd to close */
1769                 kibnal_close_conn_locked (conn, status);
1770         } else {
1771                 /* Don't have a CM comm_id; just wait for refs to drain */
1772                 conn->ibc_state = IBNAL_CONN_ZOMBIE;
1773         } 
1774
1775         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1776
1777         kibnal_peer_connect_failed (conn->ibc_peer, active, status);
1778
1779         if (state != IBNAL_CONN_CONNECTING) {
1780                 /* drop caller's ref if we're not waiting for the
1781                  * IB_CM_IDLE callback */
1782                 kibnal_put_conn (conn);
1783         }
1784 }
1785
1786 int
1787 kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
1788                 ptl_nid_t nid, __u64 incarnation, int queue_depth)
1789 {
1790         kib_conn_t    *conn = kibnal_create_conn();
1791         kib_peer_t    *peer;
1792         kib_peer_t    *peer2;
1793         unsigned long  flags;
1794
1795         if (conn == NULL)
1796                 return (-ENOMEM);
1797
1798         if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
1799                 CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
1800                        nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
                     /* don't leak the conn created above */
                     atomic_dec (&conn->ibc_refcount);
                     kibnal_destroy_conn(conn);
1801                 return (-EPROTO);
1802         }
1803         
1804         /* assume 'nid' is a new peer */
1805         peer = kibnal_create_peer (nid);
1806         if (peer == NULL) {
1807                 CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
1808                        conn, conn->ibc_state, nid, /* ibc_peer not set yet */
1809                        atomic_read (&conn->ibc_refcount));
1810                 atomic_dec (&conn->ibc_refcount);
1811                 kibnal_destroy_conn(conn);
1812                 return (-ENOMEM);
1813         }
1814         
1815         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1816
1817         peer2 = kibnal_find_peer_locked(nid);
1818         if (peer2 == NULL) {
1819                 /* peer table takes my ref on peer */
1820                 list_add_tail (&peer->ibp_list,
1821                                kibnal_nid2peerlist(nid));
1822         } else {
1823                 kibnal_put_peer (peer);
1824                 peer = peer2;
1825         }
1826
1827         /* +1 ref for conn */
1828         atomic_inc (&peer->ibp_refcount);
1829         peer->ibp_connecting++;
1830
1831         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1832
1833         conn->ibc_peer = peer;
1834         conn->ibc_state = IBNAL_CONN_CONNECTING;
1835         conn->ibc_comm_id = cid;
1836         conn->ibc_incarnation = incarnation;
1837         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
1838
1839         *connp = conn;
1840         return (0);
1841 }
1842
1843 tTS_IB_CM_CALLBACK_RETURN
1844 kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
1845                             tTS_IB_CM_COMM_ID cid,
1846                             void *param,
1847                             void *arg)
1848 {
1849         /* Shouldn't ever get a callback after TS_IB_CM_IDLE */
1850         CERROR ("Unexpected event %d: conn %p\n", event, arg);
1851         LBUG ();
1852         return TS_IB_CM_CALLBACK_PROCEED;
1853 }
1854
1855 tTS_IB_CM_CALLBACK_RETURN
1856 kibnal_conn_callback (tTS_IB_CM_EVENT event,
1857                        tTS_IB_CM_COMM_ID cid,
1858                        void *param,
1859                        void *arg)
1860 {
1861         kib_conn_t       *conn = arg;
1862         LIST_HEAD        (zombies); 
1863         struct list_head *tmp;
1864         struct list_head *nxt;
1865         kib_tx_t         *tx;
1866         unsigned long     flags;
1867         int               done;
1868         int               rc;
1869
1870         /* Established Connection Notifier */
1871
1872         switch (event) {
1873         default:
1874                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
1875                        conn, conn->ibc_peer->ibp_nid, event);
1876                 kibnal_close_conn (conn, -ECONNABORTED);
1877                 break;
1878                 
1879         case TS_IB_CM_DISCONNECTED:
1880                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
1881                        conn, conn->ibc_peer->ibp_nid);
1882                 kibnal_close_conn (conn, 0);
1883                 break;
1884
1885         case TS_IB_CM_IDLE:
1886                 CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
1887                        conn, conn->ibc_peer->ibp_nid);
1888                 kibnal_put_conn (conn);        /* Lose CM's ref */
1889
1890                 /* LASSERT (no further callbacks) */
1891                 rc = tsIbCmCallbackModify(cid, 
1892                                           kibnal_idle_conn_callback, conn);
1893                 LASSERT (rc == 0);
1894
1895                 /* NB we wait until the connection has closed before
1896                  * completing outstanding passive RDMAs so we can be sure
1897                  * the network can't touch the mapped memory any more. */
1898
1899                 spin_lock_irqsave (&conn->ibc_lock, flags);
1900
1901                 /* grab passive RDMAs not waiting for the tx callback */
1902                 list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
1903                         tx = list_entry (tmp, kib_tx_t, tx_list);
1904
1905                         LASSERT (tx->tx_passive_rdma ||
1906                                  !tx->tx_passive_rdma_wait);
1907
1908                         LASSERT (tx->tx_passive_rdma_wait ||
1909                                  tx->tx_sending != 0);
1910
1911                         /* still waiting for tx callback? */
1912                         if (!tx->tx_passive_rdma_wait)
1913                                 continue;
1914
1915                         tx->tx_status = -ECONNABORTED;
1916                         tx->tx_passive_rdma_wait = 0;
1917                         done = (tx->tx_sending == 0);
1918
1919                         if (!done)
1920                                 continue;
1921
1922                         list_del (&tx->tx_list);
1923                         list_add (&tx->tx_list, &zombies);
1924                 }
1925
1926                 /* grab all blocked transmits */
1927                 list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
1928                         tx = list_entry (tmp, kib_tx_t, tx_list);
1929                         
1930                         list_del (&tx->tx_list);
1931                         list_add (&tx->tx_list, &zombies);
1932                 }
1933                 
1934                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
1935
1936                 while (!list_empty(&zombies)) {
1937                         tx = list_entry (zombies.next, kib_tx_t, tx_list);
1938
1939                         list_del(&tx->tx_list);
1940                         kibnal_tx_done (tx);
1941                 }
1942                 break;
1943         }
1944
1945         return TS_IB_CM_CALLBACK_PROCEED;
1946 }
1947
1948 tTS_IB_CM_CALLBACK_RETURN
1949 kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
1950                                tTS_IB_CM_COMM_ID cid,
1951                                void *param,
1952                                void *arg)
1953 {
1954         kib_conn_t *conn = arg;
1955         int          rc;
1956         
1957         switch (event) {
1958         default:
1959                 if (conn == NULL) {
1960                         /* no connection yet */
1961                         CERROR ("Unexpected event: %d\n", event);
1962                         return TS_IB_CM_CALLBACK_ABORT;
1963                 }
1964                 
1965                 CERROR ("Unexpected event %p -> "LPX64": %d\n", 
1966                         conn, conn->ibc_peer->ibp_nid, event);
1967                 kibnal_connreq_done (conn, 0, -ECONNABORTED);
1968                 break;
1969                 
1970         case TS_IB_CM_REQ_RECEIVED: {
1971                 struct ib_cm_req_received_param *req = param;
1972                 kib_wire_connreq_t             *wcr = req->remote_private_data;
1973
1974                 LASSERT (conn == NULL);
1975
1976                 CDEBUG(D_NET, "REQ from "LPX64"\n", le64_to_cpu(wcr->wcr_nid));
1977
1978                 if (req->remote_private_data_len < sizeof (*wcr)) {
1979                         CERROR("Connect from remote LID %04x: too short %d\n",
1980                                req->dlid, req->remote_private_data_len);
1981                         return TS_IB_CM_CALLBACK_ABORT;
1982                 }
1983
1984                 if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
1985                         CERROR ("Can't accept LID %04x: bad magic %08x\n",
1986                                 req->dlid, le32_to_cpu(wcr->wcr_magic));
1987                         return TS_IB_CM_CALLBACK_ABORT;
1988                 }
1989                 
1990                 if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
1991                         CERROR ("Can't accept LID %04x: bad version %d\n",
1992                                 req->dlid, le16_to_cpu(wcr->wcr_version));
1993                         return TS_IB_CM_CALLBACK_ABORT;
1994                 }
1995                                 
1996                 rc = kibnal_accept(&conn,
1997                                    cid,
1998                                    le64_to_cpu(wcr->wcr_nid),
1999                                    le64_to_cpu(wcr->wcr_incarnation),
2000                                    le16_to_cpu(wcr->wcr_queue_depth));
2001                 if (rc != 0) {
2002                         CERROR ("Can't accept "LPX64": %d\n",
2003                                 le64_to_cpu(wcr->wcr_nid), rc);
2004                         return TS_IB_CM_CALLBACK_ABORT;
2005                 }
2006
2007                 /* update 'arg' for next callback */
2008                 rc = tsIbCmCallbackModify(cid, 
2009                                           kibnal_passive_conn_callback, conn);
2010                 LASSERT (rc == 0);
2011
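                     /* Accept: the reply carries our own wire connreq so the
                      * active side can run the same magic/version/queue depth
                      * checks (see kibnal_active_conn_callback). */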
2012                 req->accept_param.qp                     = conn->ibc_qp;
2013                 *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
2014                         = (kib_wire_connreq_t) {
2015                                 .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
2016                                 .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
2017                                 .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
2018                                 .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
2019                                 .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
2020                         };
2021                 req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
2022                 req->accept_param.responder_resources    = IBNAL_RESPONDER_RESOURCES;
2023                 req->accept_param.initiator_depth        = IBNAL_RESPONDER_RESOURCES;
2024                 req->accept_param.rnr_retry_count        = IBNAL_RNR_RETRY;
2025                 req->accept_param.flow_control           = IBNAL_FLOW_CONTROL;
2026
2027                 CDEBUG(D_NET, "Proceeding\n");
2028                 break;
2029         }
2030
2031         case TS_IB_CM_ESTABLISHED:
2032                 LASSERT (conn != NULL);
2033                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
2034                        conn, conn->ibc_peer->ibp_nid);
2035
2036                 kibnal_connreq_done (conn, 0, 0);
2037                 break;
2038         }
2039
2040         /* NB if the connreq is done, we switch to kibnal_conn_callback */
2041         return TS_IB_CM_CALLBACK_PROCEED;
2042 }
2043
2044 tTS_IB_CM_CALLBACK_RETURN
2045 kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
2046                               tTS_IB_CM_COMM_ID cid,
2047                               void *param,
2048                               void *arg)
2049 {
2050         kib_conn_t *conn = arg;
2051
2052         switch (event) {
2053         case TS_IB_CM_REP_RECEIVED: {
2054                 struct ib_cm_rep_received_param *rep = param;
2055                 kib_wire_connreq_t             *wcr = rep->remote_private_data;
2056
2057                 if (rep->remote_private_data_len < sizeof (*wcr)) {
2058                         CERROR ("Short reply from "LPX64": %d\n",
2059                                 conn->ibc_peer->ibp_nid,
2060                                 rep->remote_private_data_len);
2061                         kibnal_connreq_done (conn, 1, -EPROTO);
2062                         break;
2063                 }
2064
2065                 if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
2066                         CERROR ("Can't connect "LPX64": bad magic %08x\n",
2067                                 conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
2068                         kibnal_connreq_done (conn, 1, -EPROTO);
2069                         break;
2070                 }
2071                 
2072                 if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
2073                         CERROR ("Can't connect "LPX64": bad version %d\n",
2074                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
2075                         kibnal_connreq_done (conn, 1, -EPROTO);
2076                         break;
2077                 }
2078                                 
2079                 if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
2080                         CERROR ("Can't connect "LPX64": bad queue depth %d\n",
2081                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
2082                         kibnal_connreq_done (conn, 1, -EPROTO);
2083                         break;
2084                 }
2085                                 
2086                 if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
2087                         CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
2088                                 le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
2089                         kibnal_connreq_done (conn, 1, -EPROTO);
2090                         break;
2091                 }
2092
2093                 CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
2094                        conn, conn->ibc_peer->ibp_nid);
2095
2096                 conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
2097                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2098                 break;
2099         }
2100
2101         case TS_IB_CM_ESTABLISHED:
2102                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
2103                        conn, conn->ibc_peer->ibp_nid);
2104
2105                 kibnal_connreq_done (conn, 1, 0);
2106                 break;
2107
2108         case TS_IB_CM_IDLE:
2109                 CERROR("Connection %p -> "LPX64" IDLE\n",
2110                        conn, conn->ibc_peer->ibp_nid);
2111                 /* Back out state change: I'm disengaged from CM */
2112                 conn->ibc_state = IBNAL_CONN_INIT_QP;
2113                 
2114                 kibnal_connreq_done (conn, 1, -ECONNABORTED);
2115                 break;
2116
2117         default:
2118                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
2119                        conn, conn->ibc_peer->ibp_nid, event);
2120                 kibnal_connreq_done (conn, 1, -ECONNABORTED);
2121                 break;
2122         }
2123
2124         /* NB if the connreq is done, we switch to kibnal_conn_callback */
2125         return TS_IB_CM_CALLBACK_PROCEED;
2126 }
2127
2128 int
2129 kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
2130                           struct ib_path_record *resp, int remaining,
2131                           void *arg)
2132 {
2133         kib_conn_t *conn = arg;
2134         
2135         if (status != 0) {
2136                 CERROR ("status %d\n", status);
2137                 kibnal_connreq_done (conn, 1, status);
2138                 goto out;
2139         }
2140
2141         conn->ibc_connreq->cr_path = *resp;
2142
2143         conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
2144                 .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
2145                 .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
2146                 .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
2147                 .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
2148                 .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
2149         };
2150
2151         conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
2152                 .qp                   = conn->ibc_qp,
2153                 .req_private_data     = &conn->ibc_connreq->cr_wcr,
2154                 .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
2155                 .responder_resources  = IBNAL_RESPONDER_RESOURCES,
2156                 .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
2157                 .retry_count          = IBNAL_RETRY,
2158                 .rnr_retry_count      = IBNAL_RNR_RETRY,
2159                 .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
2160                 .max_cm_retries       = IBNAL_CM_RETRY,
2161                 .flow_control         = IBNAL_FLOW_CONTROL,
2162         };
2163
2164         /* XXX set timeout just like SDP!!!*/
2165         conn->ibc_connreq->cr_path.packet_life = 13;
2166         
2167         /* Flag I'm getting involved with the CM... */
2168         conn->ibc_state = IBNAL_CONN_CONNECTING;
2169
2170         CDEBUG(D_NET, "Connecting to service id "LPX64", on "LPX64"\n",
2171                conn->ibc_connreq->cr_service.service_id, 
2172                *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
2173
2174         /* kibnal_connect_callback gets my conn ref */
2175         status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, 
2176                                 &conn->ibc_connreq->cr_path, NULL,
2177                                 conn->ibc_connreq->cr_service.service_id, 0,
2178                                 kibnal_active_conn_callback, conn,
2179                                 &conn->ibc_comm_id);
2180         if (status != 0) {
2181                 CERROR ("Connect: %d\n", status);
2182                 /* Back out state change: I've not got a CM comm_id yet... */
2183                 conn->ibc_state = IBNAL_CONN_INIT_QP;
2184                 kibnal_connreq_done (conn, 1, status);
2185         }
2186         
2187  out:
2188         /* return non-zero to prevent further callbacks */
2189         return 1;
2190 }
2191
2192 void
2193 kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
2194                              struct ib_common_attrib_service *resp, void *arg)
2195 {
2196         kib_conn_t *conn = arg;
2197         
2198         if (status != 0) {
2199                 CERROR ("status %d\n", status);
2200                 kibnal_connreq_done (conn, 1, status);
2201                 return;
2202         }
2203
2204         CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
2205                status, resp->service_id, 
2206                *kibnal_service_nid_field(resp));
2207
2208         conn->ibc_connreq->cr_service = *resp;
2209
2210         status = ib_cached_gid_get(kibnal_data.kib_device,
2211                                    kibnal_data.kib_port, 0,
2212                                    conn->ibc_connreq->cr_gid);
2213         LASSERT (status == 0);
2214
2215         /* kibnal_pathreq_callback gets my conn ref */
2216         status = tsIbPathRecordRequest (kibnal_data.kib_device,
2217                                         kibnal_data.kib_port,
2218                                         conn->ibc_connreq->cr_gid,
2219                                         conn->ibc_connreq->cr_service.service_gid,
2220                                         conn->ibc_connreq->cr_service.service_pkey,
2221                                         0,
2222                                         kibnal_tunables.kib_io_timeout * HZ,
2223                                         0,
2224                                         kibnal_pathreq_callback, conn, 
2225                                         &conn->ibc_connreq->cr_tid);
2226
2227         if (status == 0)
2228                 return;
2229
2230         CERROR ("Path record request: %d\n", status);
2231         kibnal_connreq_done (conn, 1, status);
2232 }
2233
2234 void
2235 kibnal_connect_peer (kib_peer_t *peer)
2236 {
2237         kib_conn_t  *conn = kibnal_create_conn();
2238         int          rc;
2239
2240         LASSERT (peer->ibp_connecting != 0);
2241
2242         if (conn == NULL) {
2243                 CERROR ("Can't allocate conn\n");
2244                 kibnal_peer_connect_failed (peer, 1, -ENOMEM);
2245                 return;
2246         }
2247
2248         conn->ibc_peer = peer;
2249         atomic_inc (&peer->ibp_refcount);
2250
2251         PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
2252         if (conn->ibc_connreq == NULL) {
2253                 CERROR ("Can't allocate connreq\n");
2254                 kibnal_connreq_done (conn, 1, -ENOMEM);
2255                 return;
2256         }
2257
2258         memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
2259
2260         kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
2261
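             /* Active connect chains three async stages: ib_service_get()
              * below looks up the peer's service record; its callback then
              * requests a path record; kibnal_pathreq_callback finally issues
              * the CM connect, whose events arrive in
              * kibnal_active_conn_callback. */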
2262         /* kibnal_service_get_callback gets my conn ref */
2263         rc = ib_service_get (kibnal_data.kib_device, 
2264                              kibnal_data.kib_port,
2265                              &conn->ibc_connreq->cr_service,
2266                              KIBNAL_SERVICE_KEY_MASK,
2267                              kibnal_tunables.kib_io_timeout * HZ,
2268                              kibnal_service_get_callback, conn, 
2269                              &conn->ibc_connreq->cr_tid);
2270         
2271         if (rc == 0)
2272                 return;
2273
2274         CERROR ("ib_service_get: %d\n", rc);
2275         kibnal_connreq_done (conn, 1, rc);
2276 }
2277
2278 int
2279 kibnal_conn_timed_out (kib_conn_t *conn)
2280 {
2281         kib_tx_t          *tx;
2282         struct list_head  *ttmp;
2283         unsigned long      flags;
2284
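             /* A connection has timed out if any tx still queued for sending,
              * or any active tx (posted and/or awaiting passive RDMA
              * completion), has passed its deadline. */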
2285         spin_lock_irqsave (&conn->ibc_lock, flags);
2286
2287         list_for_each (ttmp, &conn->ibc_tx_queue) {
2288                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2289
2290                 LASSERT (!tx->tx_passive_rdma_wait);
2291                 LASSERT (tx->tx_sending == 0);
2292
2293                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2294                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
2295                         return 1;
2296                 }
2297         }
2298
2299         list_for_each (ttmp, &conn->ibc_active_txs) {
2300                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2301
2302                 LASSERT (tx->tx_passive_rdma ||
2303                          !tx->tx_passive_rdma_wait);
2304
2305                 LASSERT (tx->tx_passive_rdma_wait ||
2306                          tx->tx_sending != 0);
2307
2308                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2309                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
2310                         return 1;
2311                 }
2312         }
2313
2314         spin_unlock_irqrestore (&conn->ibc_lock, flags);
2315
2316         return 0;
2317 }
2318
2319 void
2320 kibnal_check_conns (int idx)
2321 {
2322         struct list_head  *peers = &kibnal_data.kib_peers[idx];
2323         struct list_head  *ptmp;
2324         kib_peer_t        *peer;
2325         kib_conn_t        *conn;
2326         struct list_head  *ctmp;
2327
2328  again:
2329         /* NB. We expect to have a look at all the peers and not find any
2330          * rdmas to time out, so we just use a shared lock while we
2331          * take a look... */
2332         read_lock (&kibnal_data.kib_global_lock);
2333
2334         list_for_each (ptmp, peers) {
2335                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2336
2337                 list_for_each (ctmp, &peer->ibp_conns) {
2338                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
2339
2340                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2341
2342
2343                         /* In case we have enough credits to return via a
2344                          * NOOP, but there were no non-blocking tx descs
2345                          * free to do it last time... */
2346                         kibnal_check_sends(conn);
2347
2348                         if (!kibnal_conn_timed_out(conn))
2349                                 continue;
2350                         
2351                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
2352                                conn, conn->ibc_state, peer->ibp_nid,
2353                                atomic_read (&conn->ibc_refcount));
2354
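                                     /* +1 ref keeps the conn alive while the
                                      * global lock is dropped to close it */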
2355                         atomic_inc (&conn->ibc_refcount);
2356                         read_unlock (&kibnal_data.kib_global_lock);
2357
2358                         CERROR("Timed out RDMA with "LPX64"\n",
2359                                peer->ibp_nid);
2360
2361                         kibnal_close_conn (conn, -ETIMEDOUT);
2362                         kibnal_put_conn (conn);
2363
2364                         /* start again now I've dropped the lock */
2365                         goto again;
2366                 }
2367         }
2368
2369         read_unlock (&kibnal_data.kib_global_lock);
2370 }
2371
2372 void
2373 kibnal_terminate_conn (kib_conn_t *conn)
2374 {
2375         int           rc;
2376
2377         CDEBUG(D_NET, "conn %p\n", conn);
2378         LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
2379         conn->ibc_state = IBNAL_CONN_ZOMBIE;
2380
2381         rc = ib_cm_disconnect (conn->ibc_comm_id);
2382         if (rc != 0)
2383                 CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
2384                         rc, conn, conn->ibc_peer->ibp_nid);
2385 }
2386
2387 int
2388 kibnal_connd (void *arg)
2389 {
2390         wait_queue_t       wait;
2391         unsigned long      flags;
2392         kib_conn_t        *conn;
2393         kib_peer_t        *peer;
2394         int                timeout;
2395         int                i;
2396         int                peer_index = 0;
2397         unsigned long      deadline = jiffies;
2398         
2399         kportal_daemonize ("kibnal_connd");
2400         kportal_blockallsigs ();
2401
2402         init_waitqueue_entry (&wait, current);
2403
2404         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2405
2406         for (;;) {
2407                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
2408                         conn = list_entry (kibnal_data.kib_connd_conns.next,
2409                                            kib_conn_t, ibc_list);
2410                         list_del (&conn->ibc_list);
2411                         
2412                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2413
2414                         switch (conn->ibc_state) {
2415                         case IBNAL_CONN_DEATHROW:
2416                                 LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
2417                                 /* Disconnect: conn becomes a zombie in the
2418                                  * callback and last ref reschedules it
2419                                  * here... */
2420                                 kibnal_terminate_conn(conn);
2421                                 kibnal_put_conn (conn);
2422                                 break;
2423                                 
2424                         case IBNAL_CONN_ZOMBIE:
2425                                 kibnal_destroy_conn (conn);
2426                                 break;
2427                                 
2428                         default:
2429                                 CERROR ("Bad conn %p state: %d\n",
2430                                         conn, conn->ibc_state);
2431                                 LBUG();
2432                         }
2433
2434                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2435                         continue;
2436                 }
2437
2438                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
2439                         peer = list_entry (kibnal_data.kib_connd_peers.next,
2440                                            kib_peer_t, ibp_connd_list);
2441                         
2442                         list_del_init (&peer->ibp_connd_list);
2443                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2444
2445                         kibnal_connect_peer (peer);
2446                         kibnal_put_peer (peer);
2447
2448                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2449                 }
2450
2451                 /* shut down and nobody left to reap... */
2452                 if (kibnal_data.kib_shutdown &&
2453                     atomic_read(&kibnal_data.kib_nconns) == 0)
2454                         break;
2455
2456                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2457
2458                 /* careful with the jiffy wrap... */
2459                 while ((timeout = (int)(deadline - jiffies)) <= 0) {
2460                         const int n = 4;
2461                         const int p = 1;
2462                         int       chunk = kibnal_data.kib_peer_hash_size;
2463                         
2464                         /* Time to check for RDMA timeouts on a few more
2465                          * peers: I do checks every 'p' seconds on a
2466                          * proportion of the peer table and I need to check
2467                          * every connection 'n' times within a timeout
2468                          * interval, to ensure I detect a timeout on any
2469                          * connection within (n+1)/n times the timeout
2470                          * interval. */
2471
2472                         if (kibnal_tunables.kib_io_timeout > n * p)
2473                                 chunk = (chunk * n * p) / 
2474                                         kibnal_tunables.kib_io_timeout;
2475                         if (chunk == 0)
2476                                 chunk = 1;
2477
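                                     /* e.g. with a (hypothetical) 64-bucket peer
                                      * hash and a 60s I/O timeout: chunk =
                                      * 64 * 4 * 1 / 60 = 4, so 4 buckets are
                                      * scanned each second and the whole table
                                      * roughly every 16 seconds. */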
2478                         for (i = 0; i < chunk; i++) {
2479                                 kibnal_check_conns (peer_index);
2480                                 peer_index = (peer_index + 1) % 
2481                                              kibnal_data.kib_peer_hash_size;
2482                         }
2483
2484                         deadline += p * HZ;
2485                 }
2486
2487                 kibnal_data.kib_connd_waketime = jiffies + timeout;
2488
2489                 set_current_state (TASK_INTERRUPTIBLE);
2490                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
2491
2492                 if (!kibnal_data.kib_shutdown &&
2493                     list_empty (&kibnal_data.kib_connd_conns) &&
2494                     list_empty (&kibnal_data.kib_connd_peers))
2495                         schedule_timeout (timeout);
2496
2497                 set_current_state (TASK_RUNNING);
2498                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
2499
2500                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2501         }
2502
2503         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2504
2505         kibnal_thread_fini ();
2506         return (0);
2507 }
2508
2509 int
2510 kibnal_scheduler(void *arg)
2511 {
2512         long            id = (long)arg;
2513         char            name[16];
2514         kib_rx_t       *rx;
2515         kib_tx_t       *tx;
2516         unsigned long   flags;
2517         int             rc;
2518         int             counter = 0;
2519         int             did_something;
2520
2521         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
2522         kportal_daemonize(name);
2523         kportal_blockallsigs();
2524
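             /* This thread drains work deferred from completion/interrupt
              * context: tx descriptors whose cleanup could not be done in an
              * IRQ, and received messages awaiting kibnal_rx(). */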
2525         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
2526
2527         for (;;) {
2528                 did_something = 0;
2529
2530                 while (!list_empty(&kibnal_data.kib_sched_txq)) {
2531                         tx = list_entry(kibnal_data.kib_sched_txq.next,
2532                                         kib_tx_t, tx_list);
2533                         list_del(&tx->tx_list);
2534                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
2535                                                flags);
2536                         kibnal_tx_done(tx);
2537
2538                         spin_lock_irqsave(&kibnal_data.kib_sched_lock,
2539                                           flags);
2540                 }
2541
2542                 if (!list_empty(&kibnal_data.kib_sched_rxq)) {
2543                         rx = list_entry(kibnal_data.kib_sched_rxq.next,
2544                                         kib_rx_t, rx_list);
2545                         list_del(&rx->rx_list);
2546                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
2547                                                flags);
2548
2549                         kibnal_rx(rx);
2550
2551                         did_something = 1;
2552                         spin_lock_irqsave(&kibnal_data.kib_sched_lock,
2553                                           flags);
2554                 }
2555
2556                 /* shut down and no receives to complete... */
2557                 if (kibnal_data.kib_shutdown &&
2558                     atomic_read(&kibnal_data.kib_nconns) == 0)
2559                         break;
2560
2561                 /* nothing to do or hogging CPU */
2562                 if (!did_something || counter++ == IBNAL_RESCHED) {
2563                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
2564                                                flags);
2565                         counter = 0;
2566
2567                         if (!did_something) {
2568                                 rc = wait_event_interruptible(
2569                                         kibnal_data.kib_sched_waitq,
2570                                         !list_empty(&kibnal_data.kib_sched_txq) || 
2571                                         !list_empty(&kibnal_data.kib_sched_rxq) || 
2572                                         (kibnal_data.kib_shutdown &&
2573                                          atomic_read (&kibnal_data.kib_nconns) == 0));
2574                         } else {
2575                                 our_cond_resched();
2576                         }
2577
2578                         spin_lock_irqsave(&kibnal_data.kib_sched_lock,
2579                                           flags);
2580                 }
2581         }
2582
2583         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
2584
2585         kibnal_thread_fini();
2586         return (0);
2587 }
2588
2589
2590 lib_nal_t kibnal_lib = {
2591         libnal_data:        &kibnal_data,      /* NAL private data */
2592         libnal_send:         kibnal_send,
2593         libnal_send_pages:   kibnal_send_pages,
2594         libnal_recv:         kibnal_recv,
2595         libnal_recv_pages:   kibnal_recv_pages,
2596         libnal_dist:         kibnal_dist
2597 };