/* lnet/klnds/viblnd/viblnd_cb.c */
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *   Author: Eric Barton <eric@bartonsoftware.com>
 *   Author: Frank Zago <fzago@systemfabricworks.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "vibnal.h"

void
kibnal_tx_done (kib_tx_t *tx)
{
        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
        int              i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */

#if IBNAL_USE_FMR
        if (tx->tx_md.md_fmrcount == 0 ||
            (ptlrc != PTL_OK && tx->tx_md.md_active)) {
                vv_return_t      vvrc;

                /* mapping must be active (it dropped fmrcount to 0) */
                LASSERT (tx->tx_md.md_active);

                vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
                                    1, &tx->tx_md.md_fmrhandle);
                LASSERT (vvrc == vv_return_ok);

                tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
        }
        tx->tx_md.md_active = 0;
#endif
        for (i = 0; i < 2; i++) {
                /* tx may have up to 2 libmsgs to finalise */
                if (tx->tx_libmsg[i] == NULL)
                        continue;

                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                tx->tx_libmsg[i] = NULL;
        }

        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);

        if (tx->tx_isnblk) {
                list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
        } else {
                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
                wake_up (&kibnal_data.kib_idle_tx_waitq);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);
}
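
/* Typical tx lifecycle (a summary of the code in this file, not new
 * behaviour): kibnal_get_idle_tx() -> kibnal_init_tx_msg() ->
 * kibnal_queue_tx() -> completion callbacks -> kibnal_tx_done().
 * kibnal_tx_done() is the single point where a descriptor is recycled:
 * it finalises up to two Portals messages, drops the conn ref the tx
 * held, and returns the tx either to the normal pool (kib_idle_txs,
 * waking any blocked waiter) or, if it was a reserve descriptor
 * (tx_isnblk), to the non-blocking reserve pool (kib_idle_nblk_txs). */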

kib_tx_t *
kibnal_get_idle_tx (int may_block)
{
        kib_tx_t      *tx = NULL;
        ENTRY;

        for (;;) {
                spin_lock(&kibnal_data.kib_tx_lock);

                /* "normal" descriptor is free */
                if (!list_empty (&kibnal_data.kib_idle_txs)) {
                        tx = list_entry (kibnal_data.kib_idle_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                if (!may_block) {
                        /* may dip into reserve pool */
                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                CERROR ("reserved tx desc pool exhausted\n");
                                break;
                        }

                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                /* block for idle tx */
                spin_unlock(&kibnal_data.kib_tx_lock);

                wait_event (kibnal_data.kib_idle_tx_waitq,
                            !list_empty (&kibnal_data.kib_idle_txs) ||
                            kibnal_data.kib_shutdown);
        }

        if (tx != NULL) {
                list_del (&tx->tx_list);

                /* Allocate a new completion cookie.  It might not be needed,
                 * but we've got a lock right now and we're unlikely to
                 * wrap... */
                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;

                LASSERT (tx->tx_nwrq == 0);
                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending == 0);
                LASSERT (!tx->tx_waiting);
                LASSERT (tx->tx_status == 0);
                LASSERT (tx->tx_conn == NULL);
                LASSERT (tx->tx_libmsg[0] == NULL);
                LASSERT (tx->tx_libmsg[1] == NULL);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);

        RETURN(tx);
}

int
kibnal_post_rx (kib_rx_t *rx, int credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        __u64         addr = (__u64)((unsigned long)((rx)->rx_msg));
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());

        rx->rx_gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(addr),
                .l_key     = rx->rx_lkey,
                .length    = IBNAL_MSG_SIZE,
        };

        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (!rx->rx_posted);

        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }

        rx->rx_posted = 1;

        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);
        spin_unlock(&conn->ibc_lock);

        if (vvrc == vv_return_ok) {
                if (credit) {
                        spin_lock(&conn->ibc_lock);
                        conn->ibc_outstanding_credits++;
                        spin_unlock(&conn->ibc_lock);

                        kibnal_check_sends(conn);
                }
                return 0;
        }

        CERROR ("post rx -> "LPX64" failed %d\n",
                conn->ibc_peer->ibp_nid, vvrc);
        rc = -EIO;
        kibnal_close_conn(rx->rx_conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}

int
kibnal_post_receives (kib_conn_t *conn)
{
        int    i;
        int    rc;

        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (conn->ibc_comms_error == 0);

        for (i = 0; i < IBNAL_RX_MSGS; i++) {
                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                 * fails (i.e. actual failure or we're disconnecting) */
                kibnal_conn_addref(conn);
                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
                if (rc != 0)
                        return rc;
        }

        return 0;
}
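
/* Note on refcounting (inferred from the code above): each posted rx
 * holds its own ref on the connection, so the conn can't be freed while
 * the HCA may still complete a receive into it.  kibnal_post_rx() drops
 * that ref on the paths where the buffer will never be re-posted (post
 * failure, or connection already past ESTABLISHED). */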

kib_tx_t *
kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
        struct list_head   *tmp;

        list_for_each(tmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);

                if (tx->tx_cookie != cookie)
                        continue;

                if (tx->tx_waiting &&
                    tx->tx_msg->ibm_type == txtype)
                        return tx;

                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
                      tx->tx_waiting ? "" : "NOT ",
                      tx->tx_msg->ibm_type, txtype);
        }
        return NULL;
}

void
kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
        kib_tx_t    *tx;
        int          idle;

        spin_lock(&conn->ibc_lock);

        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);

                CWARN("Unmatched completion type %x cookie "LPX64
                      " from "LPX64"\n",
                      txtype, cookie, conn->ibc_peer->ibp_nid);
                kibnal_close_conn (conn, -EPROTO);
                return;
        }

        if (tx->tx_status == 0) {               /* success so far */
                if (status < 0) {               /* failed? */
                        tx->tx_status = status;
                } else if (txtype == IBNAL_MSG_GET_REQ) {
                        /* XXX layering violation: set REPLY data length */
                        LASSERT (tx->tx_libmsg[1] != NULL);
                        LASSERT (tx->tx_libmsg[1]->ev.type ==
                                 PTL_EVENT_REPLY_END);

                        tx->tx_libmsg[1]->ev.mlength = status;
                }
        }

        tx->tx_waiting = 0;

        idle = !tx->tx_queued && (tx->tx_sending == 0);
        if (idle)
                list_del(&tx->tx_list);

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done(tx);
}
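
/* Note the status overload above (this is how the existing code uses
 * it, not an addition): for a completion matching a GET_REQ, a
 * non-negative 'status' is the number of bytes actually transferred,
 * which becomes the REPLY event's mlength; a negative 'status' is an
 * errno and fails the tx. */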

void
kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
{
        kib_tx_t    *tx = kibnal_get_idle_tx(0);

        if (tx == NULL) {
                CERROR("Can't get tx for completion %x for "LPX64"\n",
                       type, conn->ibc_peer->ibp_nid);
                return;
        }

        tx->tx_msg->ibm_u.completion.ibcm_status = status;
        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));

        kibnal_queue_tx(tx, conn);
}

void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);

        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from "LPX64"\n",
                       msg->ibm_type, conn->ibc_peer->ibp_nid);
                break;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                break;

        case IBNAL_MSG_PUT_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
                if (rx->rx_responded)
                        break;

                /* I wasn't asked to transfer any payload data.  This happens
                 * if the PUT didn't match, or got truncated. */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
                                       msg->ibm_u.putreq.ibprm_cookie);
                break;

        case IBNAL_MSG_PUT_NAK:
                CWARN ("PUT_NAK from "LPX64"\n", conn->ibc_peer->ibp_nid);
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_PUT_ACK:
                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
                               conn->ibc_peer->ibp_nid);
                        kibnal_close_conn(conn, -EPROTO);
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                      &msg->ibm_u.putack.ibpam_rd,
                                      msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc < 0)
                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
                               conn->ibc_peer->ibp_nid, rc);

                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc < 0)
                        tx->tx_status = rc;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;

        case IBNAL_MSG_PUT_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_GET_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
                if (rx->rx_responded)           /* I responded to the GET_REQ */
                        break;
                /* NB GET didn't match (I'd have responded even with no payload
                 * data) */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
                                       msg->ibm_u.get.ibgm_cookie);
                break;

        case IBNAL_MSG_GET_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        kibnal_post_rx(rx, 1);
}
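
/* Credit flow, as implemented above and in kibnal_check_sends(): every
 * message received carries ibm_credits, the number of rx buffers the
 * peer has re-posted since it last told us, and these are banked in
 * ibc_credits (the sends we're now allowed to post).  Re-posting our
 * own buffer with credit=1 at the end of kibnal_handle_rx() bumps
 * ibc_outstanding_credits, which piggy-backs back to the peer on the
 * next message we send. */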

void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_posted);
        rx->rx_posted = 0;

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from "LPX64" failed: %d\n",
                       conn->ibc_peer->ibp_nid, vvrc);
                goto failed;
        }

        rc = kibnal_unpack_msg(msg, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from "LPX64"\n",
                        rc, conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from "LPX64"\n",
                        conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from "LPX64
                        ": got "LPD64" but expected "LPD64"\n",
                        conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
                goto failed;
        }

        /* racing with connection establishment/teardown! */

        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                        flags);
        }
        kibnal_handle_rx(rx);
        return;

 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}

struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END) {
                page = vmalloc_to_page ((void *)vaddr);
                LASSERT (page != NULL);
                return page;
        }
#ifdef CONFIG_HIGHMEM
        if (vaddr >= PKMAP_BASE &&
            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
                /* No highmem expected here: highmem pages are only used
                 * for bulk (kiov) I/O */
                CERROR("Can't find page for highmem address %lx\n", vaddr);
                LBUG();
        }
#endif
        page = virt_to_page (vaddr);
        LASSERT (page != NULL);
        return page;
}
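
/* Address-class handling (a summary of the checks above): vmalloc
 * addresses are translated with vmalloc_to_page(); kmapped highmem
 * addresses are treated as a bug (LBUG()), since only kiov-based bulk
 * I/O uses highmem pages and it never comes through this path; anything
 * else is assumed to be in the kernel's direct mapping, where
 * virt_to_page() is valid. */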

#if !IBNAL_USE_FMR
int
kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                     unsigned long page_offset, unsigned long len)
{
        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
        vv_l_key_t       l_key;
        vv_r_key_t       r_key;
        __u64            addr;
        __u64            frag_addr;
        vv_mem_reg_h_t   mem_h;
        vv_return_t      vvrc;

        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
                CERROR ("Too many RDMA fragments\n");
                return -EMSGSIZE;
        }

        /* Try to create an address that adaptor-tavor will munge into a valid
         * network address, given how it maps all phys mem into 1 region */
        addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;

        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                    (void *)((unsigned long)addr),
                                    len, &mem_h, &l_key, &r_key);
        LASSERT (vvrc == vv_return_ok);

        if (active) {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = l_key;
                } else if (l_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = addr;
        } else {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = r_key;
                } else if (r_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }

                frag_addr = kibnal_addr2net(addr);
        }

        kibnal_rf_set(frag, frag_addr, len);

        CDEBUG(D_NET, "map frag [%d][%d %x %08x%08x] "LPX64"\n",
               rd->rd_nfrag, frag->rf_nob, rd->rd_key,
               frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);

        rd->rd_nfrag++;
        return 0;
}

int
kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
                    vv_access_con_bit_mask_t access,
                    int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           fragnob;
        int           rc;
        unsigned long vaddr;
        struct page  *page;
        int           page_offset;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (niov > 0);

                vaddr = ((unsigned long)iov->iov_base) + offset;
                page_offset = vaddr & (PAGE_SIZE - 1);
                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR ("Can't find page\n");
                        return -EFAULT;
                }

                fragnob = min((int)(iov->iov_len - offset), nob);
                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

                rc = kibnal_append_rdfrag(rd, active, page,
                                          page_offset, fragnob);
                if (rc != 0)
                        return rc;

                if (offset + fragnob < iov->iov_len) {
                        offset += fragnob;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
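
/* Worked example of the loop above (hypothetical numbers, assuming
 * PAGE_SIZE == 4096): mapping nob = 6000 bytes from a single iov whose
 * start falls 3000 bytes into a page yields three fragments, because
 * each fragment is clipped to both the iov and the page boundary:
 *   frag 0: 1096 bytes (to the end of the first page)
 *   frag 1: 4096 bytes (a whole page)
 *   frag 2:  808 bytes (the remainder)
 * 1096 + 4096 + 808 == 6000. */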

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            fragnob;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (nkiov > 0);
                fragnob = min((int)(kiov->kiov_len - offset), nob);

                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
                                          kiov->kiov_offset + offset,
                                          fragnob);
                if (rc != 0)
                        return rc;

                offset = 0;
                kiov++;
                nkiov--;
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
#else
int
kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
               int npages, unsigned long page_offset, int nob)
{
        vv_return_t   vvrc;
        vv_fmr_map_t  map_props;

        LASSERT ((rd != tx->tx_rd) == !active);
        LASSERT (!tx->tx_md.md_active);
        LASSERT (tx->tx_md.md_fmrcount > 0);
        LASSERT (page_offset < PAGE_SIZE);
        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
        LASSERT (npages <= PTL_MD_MAX_IOV);

        memset(&map_props, 0, sizeof(map_props));

        map_props.start          = (void *)page_offset;
        map_props.size           = nob;
        map_props.page_array_len = npages;
        map_props.page_array     = tx->tx_pages;

        vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
                          &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
                        map_props.start, nob, npages, vvrc);
                return -EFAULT;
        }

        tx->tx_md.md_addr = (unsigned long)map_props.start;
        tx->tx_md.md_active = 1;
        tx->tx_md.md_fmrcount--;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nob = nob;
        rd->rd_addr = tx->tx_md.md_addr;

        /* Compensate for adaptor-tavor's munging of gatherlist addresses */
        if (active)
                rd->rd_addr += PAGE_OFFSET;

        return 0;
}

int
kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                     vv_access_con_bit_mask_t access,
                     int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           resid;
        int           fragnob;
        struct page  *page;
        int           npages;
        unsigned long page_offset;
        unsigned long vaddr;

        LASSERT (nob > 0);
        LASSERT (niov > 0);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = ((unsigned long)iov->iov_base) + offset;

        page_offset = vaddr & (PAGE_SIZE - 1);
        resid = nob;
        npages = 0;

        do {
                LASSERT (npages < PTL_MD_MAX_IOV);

                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR("Can't find page for %lu\n", vaddr);
                        return -EFAULT;
                }

                tx->tx_pages[npages++] = kibnal_page2phys(page);

                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
                vaddr += fragnob;
                resid -= fragnob;

        } while (resid > 0);

        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            resid;
        int            npages;
        unsigned long  page_offset;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (nkiov <= PTL_MD_MAX_IOV);
        LASSERT (!tx->tx_md.md_active);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        page_offset = kiov->kiov_offset + offset;

        resid = offset + nob;
        npages = 0;

        do {
                LASSERT (npages < PTL_MD_MAX_IOV);
                LASSERT (nkiov > 0);

                if ((npages > 0 && kiov->kiov_offset != 0) ||
                    (resid > kiov->kiov_len &&
                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM: "
                                "page %d, offset %d, len %d\n",
                                npages, kiov->kiov_offset, kiov->kiov_len);

                        return -EINVAL;
                }

                tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
                resid -= kiov->kiov_len;
                kiov++;
                nkiov--;
        } while (resid > 0);

        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}
#endif

kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}

void
kibnal_check_sends (kib_conn_t *conn)
{
        kib_tx_t       *tx;
        vv_return_t     vvrc;
        int             rc;
        int             done;

        /* Don't send anything until after the connection is established */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                CDEBUG(D_NET, LPX64": too soon\n", conn->ibc_peer->ibp_nid);
                return;
        }

        spin_lock(&conn->ibc_lock);

        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

        if (list_empty(&conn->ibc_tx_queue) &&
            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                spin_unlock(&conn->ibc_lock);

                tx = kibnal_get_idle_tx(0);     /* don't block */
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock(&conn->ibc_lock);

                if (tx != NULL)
                        kibnal_queue_tx_locked(tx, conn);
        }

        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);
                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
                        CDEBUG(D_NET, LPX64": posted enough\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 0) {   /* no credits */
                        CDEBUG(D_NET, LPX64": no credits\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
                        CDEBUG(D_NET, LPX64": not using last credit\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                list_del (&tx->tx_list);
                tx->tx_queued = 0;

                /* NB don't drop ibc_lock before bumping tx_sending */

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                        /* redundant NOOP */
                        spin_unlock(&conn->ibc_lock);
                        kibnal_tx_done(tx);
                        spin_lock(&conn->ibc_lock);
                        CDEBUG(D_NET, LPX64": redundant noop\n",
                               conn->ibc_peer->ibp_nid);
                        continue;
                }

                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                conn->ibc_txseq);

                conn->ibc_txseq++;
                conn->ibc_outstanding_credits = 0;
                conn->ibc_nsends_posted++;
                conn->ibc_credits--;

                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
                 * and then re-queued here.  It's (just) possible that
                 * tx_sending is non-zero if we've not done the tx_complete()
                 * from the first send; hence the ++ rather than = below. */
                tx->tx_sending++;

                list_add (&tx->tx_list, &conn->ibc_active_txs);

                /* Keep holding ibc_lock while posting sends on this
                 * connection; vv_post_send() isn't re-entrant on the same
                 * QP!! */

                LASSERT (tx->tx_nwrq > 0);
#if 0
                if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
                        CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
                               tx->tx_wrq[0].scatgat_list->v_address,
                               tx->tx_wrq[0].scatgat_list->length,
                               tx->tx_wrq[0].scatgat_list->l_key,
                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
                else
                        CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n",
                               tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
                               tx->tx_wrq[0].scatgat_list->v_address,
                               tx->tx_wrq[0].scatgat_list->length,
                               tx->tx_wrq[0].scatgat_list->l_key);

                if (tx->tx_nwrq > 1) {
                        if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
                                CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
                                       tx->tx_wrq[1].scatgat_list->v_address,
                                       tx->tx_wrq[1].scatgat_list->length,
                                       tx->tx_wrq[1].scatgat_list->l_key,
                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
                        else
                                CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n",
                                       tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
                                       tx->tx_wrq[1].scatgat_list->v_address,
                                       tx->tx_wrq[1].scatgat_list->length,
                                       tx->tx_wrq[1].scatgat_list->l_key);
                }
#endif
                rc = -ECONNABORTED;
                vvrc = vv_return_ok;
                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        tx->tx_status = 0;
                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
                                                 conn->ibc_qp,
                                                 tx->tx_nwrq,
                                                 tx->tx_wrq,
                                                 vv_operation_type_send_rc);
                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                }

                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_waiting = 0;
                        tx->tx_sending--;

                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock(&conn->ibc_lock);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to "LPX64"\n",
                                        vvrc, conn->ibc_peer->ibp_nid);
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to "
                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock(&conn->ibc_lock);
}
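
/* Send gating (restating the three 'break' conditions above): a send is
 * held back when the QP already has IBNAL_MSG_QUEUE_SIZE sends posted,
 * when ibc_credits is exhausted, or when only one credit remains and we
 * have none to return; the last credit is reserved for a message that
 * gives credits back, otherwise both peers could each sit on their last
 * credit and deadlock.  The NOOP logic exists purely to return credits
 * when there is no real traffic to piggy-back them on. */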

void
kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
        kib_conn_t   *conn = tx->tx_conn;
        int           failed = (vvrc != vv_comp_status_success);
        int           idle;

        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
               tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);

        LASSERT (tx->tx_sending > 0);

        if (failed &&
            tx->tx_status == 0 &&
            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                CERROR("tx -> "LPX64" type %x cookie "LPX64
                       " sending %d waiting %d: failed %d\n",
                       conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type,
                       tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);

        spin_lock(&conn->ibc_lock);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'. */

        tx->tx_sending--;
        conn->ibc_nsends_posted--;

        if (failed) {
                tx->tx_waiting = 0;
                tx->tx_status = -EIO;
        }

        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               !tx->tx_waiting &&               /* Not waiting for peer */
               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
        if (idle)
                list_del(&tx->tx_list);

        kibnal_conn_addref(conn);               /* 1 ref for me.... */

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done (tx);

        if (failed)
                kibnal_close_conn (conn, -EIO);
        else
                kibnal_check_sends(conn);

        kibnal_conn_decref(conn);               /* ...until here */
}
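
/* A tx only becomes idle when all three conditions above hold; whichever
 * of kibnal_tx_complete() / kibnal_handle_completion() observes the last
 * condition clear is the one that calls kibnal_tx_done().  The explicit
 * conn addref/decref pair keeps the connection alive across
 * kibnal_tx_done(), which may drop the tx's own conn ref. */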

void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
        __u64         addr = (__u64)((unsigned long)((tx)->tx_msg));

        LASSERT (tx->tx_nwrq >= 0 &&
                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        kibnal_init_msg(tx->tx_msg, type, body_nob);

        *gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(addr),
                .l_key     = tx->tx_lkey,
                .length    = nob,
        };

        memset(wrq, 0, sizeof(*wrq));

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
        wrq->wr_type = vv_wr_send;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->completion_notification = 1;
        wrq->type.send.solicited_event = 1;
        wrq->type.send.immidiate_data_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;

        tx->tx_nwrq++;
}

int
kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
        kib_msg_t       *ibmsg = tx->tx_msg;
        kib_rdma_desc_t *srcrd = tx->tx_rd;
        vv_scatgat_t    *gl;
        vv_wr_t         *wrq;
        int              rc;

#if IBNAL_USE_FMR
        LASSERT (tx->tx_nwrq == 0);

        gl = &tx->tx_gl[0];
        gl->length    = nob;
        gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
        gl->l_key     = srcrd->rd_key;

        wrq = &tx->tx_wrq[0];

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
        wrq->completion_notification = 0;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->wr_type = vv_wr_rdma_write;
        wrq->type.send.solicited_event = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
        wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

        tx->tx_nwrq = 1;
        rc = nob;
#else
        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
        int              resid = nob;
        kib_rdma_frag_t *srcfrag;
        int              srcidx;
        kib_rdma_frag_t *dstfrag;
        int              dstidx;
        int              wrknob;

        /* Called by scheduler */
        LASSERT (!in_interrupt());

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        srcidx = dstidx = 0;
        srcfrag = &srcrd->rd_frags[0];
        dstfrag = &dstrd->rd_frags[0];
        rc = resid;

        while (resid > 0) {
                if (srcidx >= srcrd->rd_nfrag) {
                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
                        rc = -EPROTO;
                        break;
                }

                if (dstidx == dstrd->rd_nfrag) {
                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
                        rc = -EPROTO;
                        break;
                }

                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
                               srcidx, srcrd->rd_nfrag,
                               dstidx, dstrd->rd_nfrag);
                        rc = -EMSGSIZE;
                        break;
                }

                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);

                gl = &tx->tx_gl[tx->tx_nwrq];
                gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
                gl->length    = wrknob;
                gl->l_key     = srcrd->rd_key;

                wrq = &tx->tx_wrq[tx->tx_nwrq];

                wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
                wrq->completion_notification = 0;
                wrq->scatgat_list = gl;
                wrq->num_of_data_segments = 1;
                wrq->wr_type = vv_wr_rdma_write;
                wrq->type.send.solicited_event = 0;
                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

                resid -= wrknob;
                if (wrknob < srcfrag->rf_nob) {
                        kibnal_rf_set(srcfrag,
                                      kibnal_rf_addr(srcfrag) + wrknob,
                                      srcfrag->rf_nob - wrknob);
                } else {
                        srcfrag++;
                        srcidx++;
                }

                if (wrknob < dstfrag->rf_nob) {
                        kibnal_rf_set(dstfrag,
                                      kibnal_rf_addr(dstfrag) + wrknob,
                                      dstfrag->rf_nob - wrknob);
                } else {
                        dstfrag++;
                        dstidx++;
                }

                tx->tx_nwrq++;
        }

        if (rc < 0)                             /* no RDMA if completing with failure */
                tx->tx_nwrq = 0;
#endif

        ibmsg->ibm_u.completion.ibcm_status = rc;
        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        return rc;
}
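
/* Worked example of the frag walk above (hypothetical sizes): with src
 * frags of [4096, 4096] bytes and dst frags of [2048, 6144], copying
 * nob = 8192 emits work requests of 2048 (clipped by the dst frag),
 * 2048 (the remainder of src frag 0), and 4096 bytes; each step
 * advances whichever side was fully consumed and trims the other in
 * place with kibnal_rf_set(). */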

void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked (tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}

void
kibnal_schedule_peer_arp (kib_peer_t *peer)
{
        unsigned long flags;

        LASSERT (peer->ibp_connecting != 0);
        LASSERT (peer->ibp_arp_count > 0);

        kibnal_peer_addref(peer); /* extra ref for connd */

        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
        wake_up (&kibnal_data.kib_connd_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}

void
kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
        kib_peer_t      *peer;
        kib_conn_t      *conn;
        unsigned long    flags;
        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;

        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems */

        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */

        read_lock_irqsave(g_lock, flags);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                read_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                read_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...to here */
                return;
        }

        /* Making one or more connections; I'll need a write lock... */
        read_unlock(g_lock);
        write_lock(g_lock);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                write_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                /* Connection exists; queue message on it */
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                write_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...until here */
                return;
        }

        if (peer->ibp_connecting == 0) {
                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                        write_unlock_irqrestore(g_lock, flags);
                        tx->tx_status = -EHOSTUNREACH;
                        tx->tx_waiting = 0;
                        kibnal_tx_done (tx);
                        return;
                }

                peer->ibp_connecting = 1;
                peer->ibp_arp_count = 1 + IBNAL_ARP_RETRIES;
                kibnal_schedule_peer_arp(peer);
        }

        /* A connection is being established; queue the message... */
        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

        write_unlock_irqrestore(g_lock, flags);
}
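
/* Locking pattern in kibnal_launch_tx() (an observation on the code
 * above, not a change): the common case takes only the read lock; if no
 * connection exists, the code drops the read lock, takes the write
 * lock, and repeats both lookups, since another thread may have created
 * the peer's connection in the window between the two locks.  Interrupts
 * stay disabled across the read->write transition (plain
 * read_unlock/write_lock), and the 'flags' saved by read_lock_irqsave()
 * are restored by the final write_unlock_irqrestore(). */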

int
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
        /* I would guess that if kibnal_get_peer (nid) == NULL,
         * and we're not routing, then 'nid' is very distant :) */
        if (nal->libnal_ni.ni_pid.nid == nid) {
                *dist = 0;
        } else {
                *dist = 1;
        }

        return 0;
}

ptl_err_t
kibnal_sendmsg(lib_nal_t    *nal,
               void         *private,
               lib_msg_t    *libmsg,
               ptl_hdr_t    *hdr,
               int           type,
               ptl_nid_t     nid,
               ptl_pid_t     pid,
               unsigned int  payload_niov,
               struct iovec *payload_iov,
               ptl_kiov_t   *payload_kiov,
               int           payload_offset,
               int           payload_nob)
{
        kib_msg_t  *ibmsg;
        kib_tx_t   *tx;
        int         nob;
        int         rc;

        /* NB 'private' is different depending on what we're sending.... */

        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
               " pid %d\n", payload_nob, payload_niov, nid, pid);

        LASSERT (payload_nob == 0 || payload_niov > 0);
        LASSERT (payload_niov <= PTL_MD_MAX_IOV);

        /* Thread context */
        LASSERT (!in_interrupt());
        /* payload is either all vaddrs or all pages */
        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));

        switch (type) {
        default:
                LBUG();
                return (PTL_FAIL);

        case PTL_MSG_REPLY: {
                /* reply's 'private' is the incoming receive */
                kib_rx_t *rx = private;

                LASSERT(rx != NULL);

                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
                        /* RDMA not expected */
                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                        if (nob > IBNAL_MSG_SIZE) {
                                CERROR("REPLY for "LPX64" too big (RDMA not requested): "
                                       "%d (max for message is %d)\n",
                                       nid, payload_nob, IBNAL_MSG_SIZE);
                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
                                       nob, nid);
                                return PTL_FAIL;
                        }
                        break;
                }

                /* Incoming message consistent with RDMA? */
                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
                               nid, rx->rx_msg->ibm_type);
                        return PTL_FAIL;
                }

                /* NB rx_complete() will send GET_NAK when I return to it from
                 * here, unless I set rx_responded! */

                tx = kibnal_get_idle_tx(0);
                if (tx == NULL) {
                        CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
                        return PTL_FAIL;
                }

                if (payload_nob == 0)
                        rc = 0;
                else if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
                                                 payload_niov, payload_iov,
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
                        CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
                if (rc < 0) {
                        CERROR("Can't setup rdma for GET from "LPX64": %d\n",
                               nid, rc);
                } else if (rc == 0) {
                        /* No RDMA: local completion may happen now! */
                        lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
                } else {
                        /* RDMA: lib_finalize(libmsg) when it completes */
                        tx->tx_libmsg[0] = libmsg;
                }

                kibnal_queue_tx(tx, rx->rx_conn);
                rx->rx_responded = 1;
                return (rc >= 0) ? PTL_OK : PTL_FAIL;
        }

        case PTL_MSG_GET:
                /* will the REPLY message be small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;

                tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
                LASSERT (tx != NULL);

                ibmsg = tx->tx_msg;
                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;

                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                 vv_acc_r_mem_write,
                                                 libmsg->md->md_niov,
                                                 libmsg->md->md_iov.iov,
                                                 0, libmsg->md->length);
                else
                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                  vv_acc_r_mem_write,
                                                  libmsg->md->md_niov,
                                                  libmsg->md->md_iov.kiov,
1503                                                   0, libmsg->md->length);
1504                 if (rc != 0) {
1505                         CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
1506                         kibnal_tx_done(tx);
1507                         return PTL_FAIL;
1508                 }
1509
1510 #if IBNAL_USE_FMR
1511                 nob = sizeof(kib_get_msg_t);
1512 #else
1513                 {
1514                         int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1515                         
1516                         nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1517                 }
1518 #endif
1519                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
1520
1521                 tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
1522                 if (tx->tx_libmsg[1] == NULL) {
1523                         CERROR("Can't create reply for GET -> "LPX64"\n", nid);
1524                         kibnal_tx_done(tx);
1525                         return PTL_FAIL;
1526                 }
1527
1528                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
1529                 tx->tx_waiting = 1;             /* waiting for GET_DONE */
1530                 kibnal_launch_tx(tx, nid);
1531                 return PTL_OK;
1532
1533         case PTL_MSG_ACK:
1534                 LASSERT (payload_nob == 0);
1535                 break;
1536
1537         case PTL_MSG_PUT:
1538                 /* Is the payload small enough not to need RDMA? */
1539                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1540                 if (nob <= IBNAL_MSG_SIZE)
1541                         break;
1542
1543                 tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
1544                 LASSERT (tx != NULL);
1545
1546                 if (payload_kiov == NULL)
1547                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1548                                                  payload_niov, payload_iov,
1549                                                  payload_offset, payload_nob);
1550                 else
1551                         rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1552                                                   payload_niov, payload_kiov,
1553                                                   payload_offset, payload_nob);
1554                 if (rc != 0) {
1555                         CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
1556                         kibnal_tx_done(tx);
1557                         return PTL_FAIL;
1558                 }
1559
1560                 ibmsg = tx->tx_msg;
1561                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1562                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1563                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1564
1565                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1566                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1567                 kibnal_launch_tx(tx, nid);
1568                 return PTL_OK;
1569         }
1570
1571         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1572                  <= IBNAL_MSG_SIZE);
1573
1574         tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
1575                                   type == PTL_MSG_REPLY));
1576         if (tx == NULL) {
1577                 CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
1578                 return PTL_NO_SPACE;
1579         }
1580
1581         ibmsg = tx->tx_msg;
1582         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1583
1584         if (payload_nob > 0) {
1585                 if (payload_kiov != NULL)
1586                         lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1587                                           payload_niov, payload_kiov,
1588                                           payload_offset, payload_nob);
1589                 else
1590                         lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1591                                          payload_niov, payload_iov,
1592                                          payload_offset, payload_nob);
1593         }
1594
1595         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1596         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1597
1598         tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
1599         kibnal_launch_tx(tx, nid);
1600         return PTL_OK;
1601 }
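
/* Editor's sketch (not in the original source): the immediate-vs-RDMA
 * decision taken in several cases above reduces to the offsetof()
 * arithmetic below.  This hypothetical helper merely restates the test
 * used in the PTL_MSG_GET, PTL_MSG_PUT and PTL_MSG_REPLY cases. */
static inline int
kibnal_payload_fits_immediate (int payload_nob)
{
        /* header + inline payload must fit in one pre-posted buffer */
        return offsetof(kib_msg_t,
                        ibm_u.immediate.ibim_payload[payload_nob]) <=
               IBNAL_MSG_SIZE;
}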
1602
1603 ptl_err_t
1604 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1605                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1606                unsigned int payload_niov, struct iovec *payload_iov,
1607                size_t payload_offset, size_t payload_len)
1608 {
1609         CDEBUG(D_NET, "  pid = %d, nid="LPU64"\n",
1610                pid, nid);
1611         return (kibnal_sendmsg(nal, private, cookie,
1612                                hdr, type, nid, pid,
1613                                payload_niov, payload_iov, NULL,
1614                                payload_offset, payload_len));
1615 }
1616
1617 ptl_err_t
1618 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
1619                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1620                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
1621                      size_t payload_offset, size_t payload_len)
1622 {
1623         return (kibnal_sendmsg(nal, private, cookie,
1624                                hdr, type, nid, pid,
1625                                payload_niov, NULL, payload_kiov,
1626                                payload_offset, payload_len));
1627 }
1628
1629 ptl_err_t
1630 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
1631                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
1632                  size_t offset, int mlen, int rlen)
1633 {
1634         kib_rx_t    *rx = private;
1635         kib_msg_t   *rxmsg = rx->rx_msg;
1636         kib_conn_t  *conn = rx->rx_conn;
1637         kib_tx_t    *tx;
1638         kib_msg_t   *txmsg;
1639         int          nob;
1640         int          rc;
1641         
1642         LASSERT (mlen <= rlen);
1643         LASSERT (mlen >= 0);
1644         LASSERT (!in_interrupt());
1645         /* Either all pages or all vaddrs */
1646         LASSERT (!(kiov != NULL && iov != NULL));
1647
1648         switch (rxmsg->ibm_type) {
1649         default:
1650                 LBUG();
1651                 
1652         case IBNAL_MSG_IMMEDIATE:
1653                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1654                 if (nob > IBNAL_MSG_SIZE) {
1655                         CERROR ("Immediate message from "LPX64" too big: %d\n",
1656                                 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
1657                         return (PTL_FAIL);
1658                 }
1659
1660                 if (kiov != NULL)
1661                         lib_copy_buf2kiov(niov, kiov, offset,
1662                                           rxmsg->ibm_u.immediate.ibim_payload,
1663                                           mlen);
1664                 else
1665                         lib_copy_buf2iov(niov, iov, offset,
1666                                          rxmsg->ibm_u.immediate.ibim_payload,
1667                                          mlen);
1668
1669                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1670                 return (PTL_OK);
1671
1672         case IBNAL_MSG_PUT_REQ:
1673                 /* NB rx_complete() will send PUT_NAK when I return to it from
1674                  * here, unless I set rx_responded!  */
1675
1676                 if (mlen == 0) { /* No payload to RDMA */
1677                         lib_finalize(nal, NULL, libmsg, PTL_OK);
1678                         return PTL_OK;
1679                 }
1680
1681                 tx = kibnal_get_idle_tx(0);
1682                 if (tx == NULL) {
1683                         CERROR("Can't allocate tx for "LPX64"\n",
1684                                conn->ibc_peer->ibp_nid);
1685                         return PTL_FAIL;
1686                 }
1687
1688                 txmsg = tx->tx_msg;
1689                 if (kiov == NULL)
1690                         rc = kibnal_setup_rd_iov(tx, 
1691                                                  &txmsg->ibm_u.putack.ibpam_rd,
1692                                                  vv_acc_r_mem_write,
1693                                                  niov, iov, offset, mlen);
1694                 else
1695                         rc = kibnal_setup_rd_kiov(tx,
1696                                                   &txmsg->ibm_u.putack.ibpam_rd,
1697                                                   vv_acc_r_mem_write,
1698                                                   niov, kiov, offset, mlen);
1699                 if (rc != 0) {
1700                         CERROR("Can't setup PUT sink for "LPX64": %d\n",
1701                                conn->ibc_peer->ibp_nid, rc);
1702                         kibnal_tx_done(tx);
1703                         return PTL_FAIL;
1704                 }
1705
1706                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1707                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1708 #if IBNAL_USE_FMR
1709                 nob = sizeof(kib_putack_msg_t);
1710 #else
1711                 {
1712                         int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1713
1714                         nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1715                 }
1716 #endif
1717                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1718
1719                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1720                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1721                 kibnal_queue_tx(tx, conn);
1722
1723                 LASSERT (!rx->rx_responded);
1724                 rx->rx_responded = 1;
1725                 return PTL_OK;
1726
1727         case IBNAL_MSG_GET_REQ:
1728                 /* We get called here just to discard any junk after the
1729                  * GET hdr. */
1730                 LASSERT (libmsg == NULL);
1731                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1732                 return (PTL_OK);
1733         }
1734 }
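
/* Editor's note (summary, not original code): the exchanges handled by
 * kibnal_sendmsg() and kibnal_recvmsg() above follow this pattern
 * (direction is initiator -> target; names as used in this file):
 *
 *   immediate:  IMMEDIATE ->                (hdr + inline payload)
 *
 *   large PUT:  PUT_REQ ->                  (hdr + src cookie)
 *                        <- PUT_ACK         (sink rd + both cookies)
 *               RDMA write ->
 *               PUT_DONE ->
 *
 *   large GET:  GET_REQ ->                  (hdr + cookie + sink rd)
 *                        <- RDMA write      (REPLY payload)
 *                        <- GET_DONE
 */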
1735
1736 ptl_err_t
1737 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
1738               unsigned int niov, struct iovec *iov, 
1739               size_t offset, size_t mlen, size_t rlen)
1740 {
1741         return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
1742                                 offset, mlen, rlen));
1743 }
1744
1745 ptl_err_t
1746 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
1747                      unsigned int niov, ptl_kiov_t *kiov, 
1748                      size_t offset, size_t mlen, size_t rlen)
1749 {
1750         return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
1751                                 offset, mlen, rlen));
1752 }
1753
1754 int
1755 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1756 {
1757         long    pid = kernel_thread (fn, arg, 0);
1758
1759         if (pid < 0)
1760                 return ((int)pid);
1761
1762         atomic_inc (&kibnal_data.kib_nthreads);
1763         return (0);
1764 }
1765
1766 void
1767 kibnal_thread_fini (void)
1768 {
1769         atomic_dec (&kibnal_data.kib_nthreads);
1770 }
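
#if 0   /* Editor's usage sketch: never compiled.  Any thread launched
         * with kibnal_thread_start() must balance kib_nthreads by
         * calling kibnal_thread_fini() on exit; the function name and
         * daemonize label below are purely illustrative. */
static int
kibnal_example_thread (void *arg)
{
        kportal_daemonize ("kibnal_example");

        /* ... work until asked to shut down ... */

        kibnal_thread_fini ();
        return 0;
}
#endif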
1771
1772 void
1773 kibnal_schedule_conn (kib_conn_t *conn)
1774 {
1775         unsigned long flags;
1776
1777         kibnal_conn_addref(conn);               /* ++ref for connd */
1778         
1779         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1780
1781         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1782         wake_up (&kibnal_data.kib_connd_waitq);
1783                 
1784         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1785 }
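
/* Editor's note: the ref taken in kibnal_schedule_conn() above belongs
 * to the connd; whichever connd thread removes conn from
 * kib_connd_conns is expected to drop it with kibnal_conn_decref()
 * once the scheduled work is done. */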
1786
1787 void
1788 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1789 {
1790         /* This just does the immediate housekeeping.  'error' is zero for a
1791          * normal shutdown which can happen only after the connection has been
1792          * established.  If the connection is established, schedule the
1793          * connection to be finished off by the connd.  Otherwise the connd is
1794          * already dealing with it (either to set it up or tear it down).
1795          * Caller holds kib_global_lock exclusively in irq context */
1796         kib_peer_t       *peer = conn->ibc_peer;
1797         
1798         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1799
1800         if (error != 0 && conn->ibc_comms_error == 0)
1801                 conn->ibc_comms_error = error;
1802
1803         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1804                 return; /* already being handled  */
1805         
1806         /* NB Can't take ibc_lock here (could be in IRQ context), without
1807          * risking deadlock, so access to ibc_{tx_queue,active_txs} is racy */
1808
1809         if (error == 0 &&
1810             list_empty(&conn->ibc_tx_queue) &&
1811             list_empty(&conn->ibc_active_txs)) {
1812                 CDEBUG(D_NET, "closing conn to "LPX64
1813                        " rx# "LPD64" tx# "LPD64"\n", 
1814                        peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq);
1815         } else {
1816                 CERROR("Closing conn to "LPX64": error %d%s%s"
1817                        " rx# "LPD64" tx# "LPD64"\n",
1818                        peer->ibp_nid, error,
1819                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1820                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1821                        conn->ibc_txseq, conn->ibc_rxseq);
1822
1823 #if 0
1824                 /* can't skip down the queue without holding ibc_lock (see above) */
1825                 list_for_each(tmp, &conn->ibc_tx_queue) {
1826                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1827                         
1828                         CERROR("   queued tx type %x cookie "LPX64
1829                                " sending %d waiting %d ticks %ld/%d\n", 
1830                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1831                                tx->tx_sending, tx->tx_waiting,
1832                                (long)(tx->tx_deadline - jiffies), HZ);
1833                 }
1834
1835                 list_for_each(tmp, &conn->ibc_active_txs) {
1836                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1837                         
1838                         CERROR("   active tx type %x cookie "LPX64
1839                                " sending %d waiting %d ticks %ld/%d\n", 
1840                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1841                                tx->tx_sending, tx->tx_waiting,
1842                                (long)(tx->tx_deadline - jiffies), HZ);
1843                 }
1844 #endif
1845         }
1846
1847         list_del (&conn->ibc_list);
1848         
1849         if (list_empty (&peer->ibp_conns) &&    /* no more conns */
1850             peer->ibp_persistence == 0 &&       /* non-persistent peer */
1851             kibnal_peer_active(peer)) {         /* still in peer table */
1852                 kibnal_unlink_peer_locked (peer);
1853         }
1854
1855         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1856
1857         kibnal_schedule_conn(conn);
1858         kibnal_conn_decref(conn);               /* lose ibc_list's ref */
1859 }
1860
1861 void
1862 kibnal_close_conn (kib_conn_t *conn, int error)
1863 {
1864         unsigned long flags;
1865         
1866         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1867
1868         kibnal_close_conn_locked (conn, error);
1869         
1870         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1871 }
1872
1873 void
1874 kibnal_handle_early_rxs(kib_conn_t *conn)
1875 {
1876         unsigned long    flags;
1877         kib_rx_t        *rx;
1878
1879         LASSERT (!in_interrupt());
1880         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1881         
1882         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1883         while (!list_empty(&conn->ibc_early_rxs)) {
1884                 rx = list_entry(conn->ibc_early_rxs.next,
1885                                 kib_rx_t, rx_list);
1886                 list_del(&rx->rx_list);
1887                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1888                 
1889                 kibnal_handle_rx(rx);
1890                 
1891                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1892         }
1893         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1894 }
1895
1896 void
1897 kibnal_conn_disconnected(kib_conn_t *conn)
1898 {
1899         LIST_HEAD        (zombies); 
1900         struct list_head *tmp;
1901         struct list_head *nxt;
1902         kib_tx_t         *tx;
1903
1904         /* I'm the connd */
1905         LASSERT (!in_interrupt());
1906         LASSERT (current == kibnal_data.kib_connd);
1907         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
1908         
1909         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
1910
1911         /* move QP to error state to make posted work items complete */
1912         kibnal_set_qp_state(conn, vv_qp_state_error);
1913
1914         spin_lock(&conn->ibc_lock);
1915
1916         /* Complete all tx descs not waiting for sends to complete.
1917          * NB we should be safe from RDMA now that the QP has changed state */
1918
1919         list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
1920                 tx = list_entry (tmp, kib_tx_t, tx_list);
1921
1922                 LASSERT (tx->tx_queued);
1923
1924                 tx->tx_status = -ECONNABORTED;
1925                 tx->tx_queued = 0;
1926                 tx->tx_waiting = 0;
1927                 
1928                 if (tx->tx_sending != 0)
1929                         continue;
1930
1931                 list_del (&tx->tx_list);
1932                 list_add (&tx->tx_list, &zombies);
1933         }
1934
1935         list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
1936                 tx = list_entry (tmp, kib_tx_t, tx_list);
1937
1938                 LASSERT (!tx->tx_queued);
1939                 LASSERT (tx->tx_waiting ||
1940                          tx->tx_sending != 0);
1941
1942                 tx->tx_status = -ECONNABORTED;
1943                 tx->tx_waiting = 0;
1944                 
1945                 if (tx->tx_sending != 0)
1946                         continue;
1947
1948                 list_del (&tx->tx_list);
1949                 list_add (&tx->tx_list, &zombies);
1950         }
1951         
1952         spin_unlock(&conn->ibc_lock);
1953
1954         while (!list_empty(&zombies)) {
1955                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1956
1957                 list_del(&tx->tx_list);
1958                 kibnal_tx_done (tx);
1959         }
1960
1961         kibnal_handle_early_rxs(conn);
1962 }
1963
1964 void
1965 kibnal_peer_connect_failed (kib_peer_t *peer, int active)
1966 {
1967         LIST_HEAD         (zombies);    /* NB must be initialised: tested below */
1968         kib_tx_t         *tx;
1969         unsigned long     flags;
1970
1971         /* Only the connd creates conns => single threaded */
1972         LASSERT (!in_interrupt());
1973         LASSERT (current == kibnal_data.kib_connd);
1974         LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
1975
1976         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1977
1978         if (active) {
1979                 LASSERT (peer->ibp_connecting != 0);
1980                 peer->ibp_connecting--;
1981         } else {
1982                 LASSERT (!kibnal_peer_active(peer));
1983         }
1984         
1985         if (peer->ibp_connecting != 0) {
1986                 /* another connection attempt under way (loopback?)... */
1987                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1988                 return;
1989         }
1990
1991         if (list_empty(&peer->ibp_conns)) {
1992                 /* Say when active connection can be re-attempted */
1993                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
1994                 /* Increase reconnection interval */
1995                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
1996                                                     IBNAL_MAX_RECONNECT_INTERVAL);
1997         
1998                 /* Take peer's blocked transmits to complete with error */
1999                 list_add(&zombies, &peer->ibp_tx_queue);
2000                 list_del_init(&peer->ibp_tx_queue);
2001                 
2002                 if (kibnal_peer_active(peer) &&
2003                     (peer->ibp_persistence == 0)) {
2004                         /* failed connection attempt on non-persistent peer */
2005                         kibnal_unlink_peer_locked (peer);
2006                 }
2007         } else {
2008                 /* Can't have blocked transmits if there are connections */
2009                 LASSERT (list_empty(&peer->ibp_tx_queue));
2010         }
2011         
2012         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2013
2014         if (list_empty (&zombies)) 
2015                 return;
2016         
2017         CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
2018         do {
2019                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
2020
2021                 list_del (&tx->tx_list);
2022                 /* complete now */
2023                 tx->tx_status = -EHOSTUNREACH;
2024                 kibnal_tx_done (tx);
2025         } while (!list_empty (&zombies));
2026 }
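
#if 0   /* Editor's sketch (not original code): the doubling rule above
         * implements exponential backoff between connection attempts;
         * starting from IBNAL_MIN_RECONNECT_INTERVAL, successive
         * failures wait min, 2*min, 4*min, ... capped at
         * IBNAL_MAX_RECONNECT_INTERVAL. */
static unsigned long
kibnal_next_reconnect_interval (unsigned long interval)
{
        return MIN (interval * 2, IBNAL_MAX_RECONNECT_INTERVAL);
}
#endif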
2027
2028 void
2029 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2030 {
2031         static cm_reject_data_t   rej;
2032
2033         struct list_head   txs;
2034         kib_peer_t        *peer = conn->ibc_peer;
2035         kib_peer_t        *peer2;
2036         unsigned long      flags;
2037         kib_tx_t          *tx;
2038
2039         /* Only the connd creates conns => single threaded */
2040         LASSERT (!in_interrupt());
2041         LASSERT (current == kibnal_data.kib_connd);
2042         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2043
2044         if (active) {
2045                 LASSERT (peer->ibp_connecting > 0);
2046         } else {
2047                 LASSERT (!kibnal_peer_active(peer));
2048         }
2049         
2050         PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2051         conn->ibc_connvars = NULL;
2052
2053         if (status != 0) {
2054                 /* failed to establish connection */
2055                 switch (conn->ibc_state) {
2056                 default:
2057                         LBUG();
2058
2059                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2060                         /* got a connection reply but failed checks */
2061                         LASSERT (active);
2062                         memset(&rej, 0, sizeof(rej));
2063                         rej.reason = cm_rej_code_usr_rej;
2064                         cm_reject(conn->ibc_cep, &rej);
2065                         break;
2066
2067                 case IBNAL_CONN_ACTIVE_CONNECT:
2068                         LASSERT (active);
2069                         cm_cancel(conn->ibc_cep);
2070                         kibnal_pause(HZ/10);
2071                         /* cm_connect() failed immediately or
2072                          * callback returned failure */
2073                         break;
2074
2075                 case IBNAL_CONN_ACTIVE_ARP:
2076                         LASSERT (active);
2077                         /* ibat_get_ib_data() failed immediately 
2078                          * or callback returned failure */
2079                         break;
2080
2081                 case IBNAL_CONN_INIT:
2082                         break;
2083
2084                 case IBNAL_CONN_PASSIVE_WAIT:
2085                         LASSERT (!active);
2086                         /* cm_accept callback returned failure */
2087                         break;
2088                 }
2089
2090                 kibnal_peer_connect_failed(conn->ibc_peer, active);
2091                 kibnal_conn_disconnected(conn);
2092                 return;
2093         }
2094
2095         /* connection established */
2096         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2097
2098         if (active) {
2099                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2100         } else {
2101                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2102         }
2103         
2104         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2105
2106         if (!active) {
2107                 peer2 = kibnal_find_peer_locked(peer->ibp_nid);
2108                 if (peer2 != NULL) {
2109                         /* already in the peer table; swap */
2110                         conn->ibc_peer = peer2;
2111                         kibnal_peer_addref(peer2);
2112                         kibnal_peer_decref(peer);
2113                         peer = conn->ibc_peer;
2114                 } else {
2115                         /* add 'peer' to the peer table */
2116                         kibnal_peer_addref(peer);
2117                         list_add_tail(&peer->ibp_list,
2118                                       kibnal_nid2peerlist(peer->ibp_nid));
2119                 }
2120         }
2121         
2122         /* Add conn to peer's list and nuke any dangling conns from a different
2123          * peer instance... */
2124         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2125         list_add(&conn->ibc_list, &peer->ibp_conns);
2126         kibnal_close_stale_conns_locked (conn->ibc_peer,
2127                                          conn->ibc_incarnation);
2128
2129         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2130             conn->ibc_comms_error != 0 ||       /* comms error */
2131             conn->ibc_disconnect) {             /* need to disconnect */
2132                 
2133                 /* start to shut down connection */
2134                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2135
2136                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2137                 kibnal_peer_connect_failed(peer, active);
2138                 return;
2139         }
2140
2141         if (active)
2142                 peer->ibp_connecting--;
2143
2144         /* grab pending txs while I have the lock */
2145         list_add(&txs, &peer->ibp_tx_queue);
2146         list_del_init(&peer->ibp_tx_queue);
2147         
2148         /* reset reconnect interval for next attempt */
2149         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
2150         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2151
2152         /* Schedule blocked txs */
2153         spin_lock (&conn->ibc_lock);
2154         while (!list_empty (&txs)) {
2155                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2156                 list_del (&tx->tx_list);
2157
2158                 kibnal_queue_tx_locked (tx, conn);
2159         }
2160         spin_unlock (&conn->ibc_lock);
2161         kibnal_check_sends (conn);
2162
2163         /* schedule blocked rxs */
2164         kibnal_handle_early_rxs(conn);
2165 }
2166
2167 void
2168 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2169 {
2170         static cm_dreply_data_t drep;           /* just zeroed space */
2171         
2172         kib_conn_t             *conn = (kib_conn_t *)arg;
2173         unsigned long           flags;
2174         
2175         /* CAVEAT EMPTOR: tasklet context */
2176
2177         switch (cmdata->status) {
2178         default:
2179                 LBUG();
2180                 
2181         case cm_event_disconn_request:
2182                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2183                  * IBNAL_CONN_ESTABLISHED: I start it closing
2184                  * otherwise:              it's closing anyway */
2185                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2186                 cm_cancel(conn->ibc_cep);
2187
2188                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2189                 LASSERT (!conn->ibc_disconnect);
2190                 conn->ibc_disconnect = 1;
2191
2192                 switch (conn->ibc_state) {
2193                 default:
2194                         LBUG();
2195
2196                 case IBNAL_CONN_ACTIVE_RTU:
2197                         /* kibnal_connreq_done is getting there; it'll see
2198                          * ibc_disconnect set... */
2199                         break;
2200
2201                 case IBNAL_CONN_ESTABLISHED:
2202                         /* kibnal_connreq_done got there already; get
2203                          * disconnect going... */
2204                         kibnal_close_conn_locked(conn, 0);
2205                         break;
2206
2207                 case IBNAL_CONN_DISCONNECT1:
2208                         /* kibnal_terminate_conn is getting there; it'll see
2209                          * ibc_disconnect set... */
2210                         break;
2211
2212                 case IBNAL_CONN_DISCONNECT2:
2213                         /* kibnal_terminate_conn got there already; complete
2214                          * the disconnect. */
2215                         kibnal_schedule_conn(conn);
2216                         break;
2217                 }
2218                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2219                 break;
2220                 
2221         case cm_event_disconn_timeout:
2222         case cm_event_disconn_reply:
2223                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2224                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2225                 LASSERT (!conn->ibc_disconnect);
2226                 conn->ibc_disconnect = 1;
2227
2228                 /* kibnal_terminate_conn sent the disconnect request. */
2229                 kibnal_schedule_conn(conn);
2230
2231                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2232                 break;
2233                 
2234         case cm_event_connected:
2235         case cm_event_conn_timeout:
2236         case cm_event_conn_reject:
2237                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2238                 conn->ibc_connvars->cv_conndata = *cmdata;
2239
2240                 kibnal_schedule_conn(conn);
2241                 break;
2242         }
2243
2244         kibnal_conn_decref(conn); /* lose my ref */
2245 }
2246
2247 void
2248 kibnal_check_passive_wait(kib_conn_t *conn)
2249 {
2250         int     rc;
2251
2252         switch (conn->ibc_connvars->cv_conndata.status) {
2253         default:
2254                 LBUG();
2255                 
2256         case cm_event_connected:
2257                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2258                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2259                 if (rc != 0)
2260                         conn->ibc_comms_error = rc;
2261                 /* connection _has_ been established; it's just that we've had
2262                  * an error immediately... */
2263                 kibnal_connreq_done(conn, 0, 0);
2264                 break;
2265                 
2266         case cm_event_conn_timeout:
2267                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2268                 break;
2269                 
2270         case cm_event_conn_reject:
2271                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2272                 break;
2273         }
2274 }
2275
2276 void
2277 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2278 {
2279         static kib_msg_t        txmsg;
2280         static kib_msg_t        rxmsg;
2281         static cm_reply_data_t  reply;
2282         static cm_reject_data_t reject;
2283
2284         kib_conn_t         *conn = NULL;
2285         int                 rc = 0;
2286         int                 rxmsgnob;
2287         kib_connvars_t     *cv;
2288         kib_peer_t         *tmp_peer;
2289         cm_return_t         cmrc;
2290         vv_return_t         vvrc;
2291         
2292         /* I'm the connd executing in thread context
2293          * No concurrency problems with static data! */
2294         LASSERT (!in_interrupt());
2295         LASSERT (current == kibnal_data.kib_connd);
2296
2297         if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
2298                 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2299                        cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
2300                 goto reject;
2301         }
2302
2303         /* copy into rxmsg to avoid alignment issues */
2304         rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2305         memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2306
2307         rc = kibnal_unpack_msg(&rxmsg, rxmsgnob);
2308         if (rc != 0) {
2309                 CERROR("Can't parse connection request: %d\n", rc);
2310                 goto reject;
2311         }
2312
2313         if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2314                 CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
2315                        rxmsg.ibm_type, rxmsg.ibm_srcnid);
2316                 goto reject;
2317         }
2318
2319         if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
2320                 CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
2321                        rxmsg.ibm_srcnid, rxmsg.ibm_dstnid);
2322                 goto reject;
2323         }
2324
2325         if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2326                 CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
2327                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth, 
2328                        IBNAL_MSG_QUEUE_SIZE);
2329                 goto reject;
2330         }
2331
2332         if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2333                 CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
2334                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
2335                        IBNAL_MSG_SIZE);
2336                 goto reject;
2337         }
2338                 
2339         if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2340                 CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
2341                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags, 
2342                        IBNAL_MAX_RDMA_FRAGS);
2343                 goto reject;
2344         }
2345                 
2346         conn = kibnal_create_conn(cep);
2347         if (conn == NULL) {
2348                 CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid);
2349                 goto reject;
2350         }
2351         
2352         /* assume 'rxmsg.ibm_srcnid' is a new peer */
2353         tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
2354         if (tmp_peer == NULL) {
2355                 CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
2356                 kibnal_conn_decref(conn);
2357                 conn = NULL;
2358                 goto reject;
2359         }
2360
2361         conn->ibc_peer = tmp_peer;              /* conn takes over my ref */
2362         conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2363         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2364
2365         cv = conn->ibc_connvars;
2366
2367         cv->cv_txpsn          = cmreq->cep_data.start_psn;
2368         cv->cv_remote_qpn     = cmreq->cep_data.qpn;
2369         cv->cv_path           = cmreq->path_data.path;
2370         cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
2371         // XXX                  cmreq->cep_data.retry_cnt;
2372         cv->cv_port           = cmreq->cep_data.local_port_num;
2373
2374         vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2375                              &cv->cv_path.sgid, &cv->cv_sgid_index);
2376         LASSERT (vvrc == vv_return_ok);
2377         
2378         vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2379                                cv->cv_path.pkey, &cv->cv_pkey_index);
2380         LASSERT (vvrc == vv_return_ok);
2381
2382         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2383         if (rc != 0)
2384                 goto reject;
2385
2386         rc = kibnal_post_receives(conn);
2387         if (rc != 0) {
2388                 CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid);
2389                 goto reject;
2390         }
2391
2392         rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2393         if (rc != 0)
2394                 goto reject;
2395         
2396         memset(&reply, 0, sizeof(reply));
2397         reply.qpn                 = cv->cv_local_qpn;
2398         reply.qkey                = IBNAL_QKEY;
2399         reply.start_psn           = cv->cv_rxpsn;
2400         reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2401         reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
2402         reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
2403         reply.rnr_retry_count     = cv->cv_rnr_count;
2404         reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
2405         
2406         /* setup txmsg... */
2407         memset(&txmsg, 0, sizeof(txmsg));
2408         kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, 
2409                         sizeof(txmsg.ibm_u.connparams));
2410         LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2411         txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2412         txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2413         txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2414         kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2415
2416         /* ...and copy into reply to avoid alignment issues */
2417         memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
2418
2419         kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2420         
2421         cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2422                          kibnal_cm_callback, conn);
2423
2424         if (cmrc == cm_stat_success)
2425                 return;                         /* callback has got my ref on conn */
2426
2427         /* back out state change (no callback happening) */
2428         kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2429         rc = -EIO;
2430                 
2431  reject:
2432         CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid);
2433
2434         memset(&reject, 0, sizeof(reject));
2435         reject.reason = cm_rej_code_usr_rej;
2436         cm_reject(cep, &reject);
2437
2438         if (conn != NULL) {
2439                 LASSERT (rc != 0);
2440                 kibnal_connreq_done(conn, 0, rc);
2441         } else {
2442                 cm_destroy_cep(cep);
2443         }
2444 }
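
/* Editor's note: the memcpy()s above between wire messages and the CM's
 * priv_data areas exist because priv_data is a raw byte array with no
 * alignment guarantee for kib_msg_t; staging through the static
 * rxmsg/txmsg keeps all field accesses naturally aligned.  The statics
 * themselves are safe only because the single connd thread runs here. */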
2445
2446 void
2447 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2448 {
2449         cm_request_data_t  *cmreq = &data->data.request;
2450         kib_pcreq_t        *pcr;
2451         unsigned long       flags;
2452         
2453         LASSERT (arg == NULL);
2454
2455         if (data->status != cm_event_conn_request) {
2456                 CERROR("status %d is not cm_event_conn_request\n",
2457                        data->status);
2458                 return;
2459         }
2460
2461         PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2462         if (pcr == NULL) {
2463                 CERROR("Can't allocate passive connreq\n");
2464
2465                 cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */
2466                                  {.reason = cm_rej_code_no_res,}));
2467                 cm_destroy_cep(cep);
2468                 return;
2469         }
2470
2471         pcr->pcr_cep = cep;
2472         pcr->pcr_cmreq = *cmreq;
2473         
2474         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2475
2476         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2477         wake_up(&kibnal_data.kib_connd_waitq);
2478         
2479         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2480 }
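
/* Editor's note: kibnal_listen_callback() runs in the CM's callback
 * (tasklet) context, hence PORTAL_ALLOC_ATOMIC above and the handoff
 * of the request to the connd, which does the real work from thread
 * context in kibnal_recv_connreq(). */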
2481
2482
2483 void
2484 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, 
2485                                 void *arg)
2486 {
2487         /* CAVEAT EMPTOR: tasklet context */
2488         kib_conn_t       *conn = (kib_conn_t *)arg;
2489         kib_connvars_t   *cv = conn->ibc_connvars;
2490
2491         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2492         cv->cv_conndata = *cd;
2493
2494         kibnal_schedule_conn(conn);
2495         kibnal_conn_decref(conn);
2496 }
2497
2498 void
2499 kibnal_connect_conn (kib_conn_t *conn)
2500 {
2501         static cm_request_data_t  cmreq;
2502         static kib_msg_t          msg;
2503         
2504         kib_connvars_t           *cv = conn->ibc_connvars;
2505         kib_peer_t               *peer = conn->ibc_peer;
2506         cm_return_t               cmrc;
2507         
2508         /* Only called by connd => statics OK */
2509         LASSERT (!in_interrupt());
2510         LASSERT (current == kibnal_data.kib_connd);
2511         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2512
2513         memset(&cmreq, 0, sizeof(cmreq));
2514         
2515         cmreq.sid = IBNAL_SERVICE_NUMBER;
2516
2517         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2518         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2519         cmreq.cep_data.retry_cnt            = IBNAL_RETRY_CNT;
2520         cmreq.cep_data.rtr_retry_cnt        = IBNAL_RNR_CNT;
2521         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2522         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2523         // XXX ack_timeout?
2524         // offered_resp_res
2525         // offered_initiator_depth
2526
2527         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2528         cmreq.path_data.path        = cv->cv_path;
2529         
2530         /* setup msg... */
2531         memset(&msg, 0, sizeof(msg));
2532         kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2533         LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2534         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2535         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2536         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2537         kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0);
2538
2539         /* ...and copy into cmreq to avoid alignment issues */
2540         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2541         
2542         CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
2543
2544         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2545         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2546
2547         cmrc = cm_connect(conn->ibc_cep, &cmreq, 
2548                           kibnal_active_connect_callback, conn);
2549         if (cmrc == cm_stat_success) {
2550                 CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
2551                        peer->ibp_nid);
2552                 return;
2553         }
2554
2555         CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2556         kibnal_conn_decref(conn);       /* drop callback's ref */
2557         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2558 }
2559
2560 void
2561 kibnal_check_connreply (kib_conn_t *conn)
2562 {
2563         static cm_rtu_data_t  rtu;
2564         static kib_msg_t      msg;
2565
2566         kib_connvars_t   *cv = conn->ibc_connvars;
2567         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2568         kib_peer_t       *peer = conn->ibc_peer;
2569         int               msgnob;
2570         cm_return_t       cmrc;
2571         cm_cep_handle_t   cep;
2572         unsigned long     flags;
2573         int               rc;
2574
2575         /* Only called by connd => statics OK */
2576         LASSERT (!in_interrupt());
2577         LASSERT (current == kibnal_data.kib_connd);
2578         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2579
2580         if (cv->cv_conndata.status == cm_event_conn_reply) {
2581                 cv->cv_remote_qpn = reply->qpn;
2582                 cv->cv_txpsn      = reply->start_psn;
2583                 // XXX              reply->targ_ack_delay;
2584                 cv->cv_rnr_count  = reply->rnr_retry_count;
2585
2586                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2587
2588                 /* copy into msg to avoid alignment issues */
2589                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2590                 memcpy(&msg, &reply->priv_data, msgnob);
2591
2592                 rc = kibnal_unpack_msg(&msg, msgnob);
2593                 if (rc != 0) {
2594                         CERROR("Can't unpack reply from "LPX64": %d\n",
2595                                peer->ibp_nid, rc);
2596                         kibnal_connreq_done(conn, 1, rc);
2597                         return;
2598                 }
2599
2600                 if (msg.ibm_type != IBNAL_MSG_CONNACK) {
2601                         CERROR("Unexpected message type %d from "LPX64"\n",
2602                                msg.ibm_type, peer->ibp_nid);
2603                         kibnal_connreq_done(conn, 1, -EPROTO);
2604                         return;
2605                 }
2606
2607                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2608                         CERROR(LPX64" has incompatible queue depth %d (%d wanted)\n",
2609                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth,
2610                                IBNAL_MSG_QUEUE_SIZE);
2611                         kibnal_connreq_done(conn, 1, -EPROTO);
2612                         return;
2613                 }
2614                 
2615                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2616                         CERROR(LPX64" max message size %d too big (%d max)\n",
2617                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size, 
2618                                IBNAL_MSG_SIZE);
2619                         kibnal_connreq_done(conn, 1, -EPROTO);
2620                         return;
2621                 }
2622
2623                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2624                         CERROR(LPX64" max frags %d too big (%d max)\n",
2625                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags, 
2626                                IBNAL_MAX_RDMA_FRAGS);
2627                         kibnal_connreq_done(conn, 1, -EPROTO);
2628                         return;
2629                 }
2630                 
2631                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2632                 rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
2633                       msg.ibm_dststamp != kibnal_data.kib_incarnation) ?
2634                      -ESTALE : 0;
2635                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2636                 if (rc != 0) {
2637                         CERROR("Stale connection reply from "LPX64"\n",
2638                                peer->ibp_nid);
2639                         kibnal_connreq_done(conn, 1, rc);
2640                         return;
2641                 }
2642
2643                 conn->ibc_incarnation = msg.ibm_srcstamp;
2644                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2645                 
2646                 rc = kibnal_post_receives(conn);
2647                 if (rc != 0) {
2648                         CERROR("Can't post receives for "LPX64"\n",
2649                                peer->ibp_nid);
2650                         kibnal_connreq_done(conn, 1, rc);
2651                         return;
2652                 }
2653                 
2654                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2655                 if (rc != 0) {
2656                         kibnal_connreq_done(conn, 1, rc);
2657                         return;
2658                 }
2659                 
2660                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2661                 if (rc != 0) {
2662                         kibnal_connreq_done(conn, 1, rc);
2663                         return;
2664                 }
2665                 
2666                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2667                 kibnal_conn_addref(conn);       /* ++for CM callback */
2668                 
2669                 memset(&rtu, 0, sizeof(rtu));
2670                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2671                                  kibnal_cm_callback, conn);
2672                 if (cmrc == cm_stat_success) {
2673                         /* Now I'm racing with disconnect signalled by
2674                          * kibnal_cm_callback */
2675                         kibnal_connreq_done(conn, 1, 0);
2676                         return;
2677                 }
2678
2679                 CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2680                 /* Back out of RTU: no callback coming */
2681                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2682                 kibnal_conn_decref(conn);
2683                 kibnal_connreq_done(conn, 1, -EIO);
2684                 return;
2685         }
2686
2687         if (cv->cv_conndata.status == cm_event_conn_reject) {
2688
2689                 if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
2690                         CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
2691                                cv->cv_conndata.data.reject.reason);
2692                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2693                         return;
2694                 }
2695
2696                 CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);
2697
2698                 cep = cm_create_cep(cm_cep_transp_rc);
2699                 if (cep == NULL) {
2700                         CERROR("Can't create new CEP\n");
2701                         kibnal_connreq_done(conn, 1, -ENOMEM);
2702                         return;
2703                 }
2704
2705                 cmrc = cm_cancel(conn->ibc_cep);
2706                 LASSERT (cmrc == cm_stat_success);
2707                 cmrc = cm_destroy_cep(conn->ibc_cep);
2708                 LASSERT (cmrc == cm_stat_success);
2709
2710                 conn->ibc_cep = cep;
2711
2712                 /* retry connect */
2713                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2714                 kibnal_connect_conn(conn);
2715                 return;
2716         }
2717
2718         CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
2719                cv->cv_conndata.status);
2720         kibnal_connreq_done(conn, 1, -ECONNABORTED);
2721 }
2722
2723 void
2724 kibnal_arp_done (kib_conn_t *conn)
2725 {
2726         kib_peer_t           *peer = conn->ibc_peer;
2727         kib_connvars_t       *cv = conn->ibc_connvars;
2728         ibat_arp_data_t      *arp = &cv->cv_arp;
2729         ib_path_record_v2_t  *path = &cv->cv_path;
2730         vv_return_t           vvrc;
2731         int                   rc;
2732         unsigned long         flags;
2733
2734         LASSERT (!in_interrupt());
2735         LASSERT (current == kibnal_data.kib_connd);
2736         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2737         LASSERT (peer->ibp_arp_count > 0);
2738         
2739         if (cv->cv_arprc != ibat_stat_ok) {
2740                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2741                 peer->ibp_arp_count--;
2742                 if (peer->ibp_arp_count == 0) {
2743                         /* final ARP attempt failed */
2744                         write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2745                                                 flags);
2746                         CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", 
2747                                peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
2748                                cv->cv_arprc);
2749                 } else {
                        /* Retry ARP: bump ibp_connecting so that terminating
                         * this conn doesn't end the peer's connection attempt */
2752                         peer->ibp_connecting++;
2753                         write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2754                                                 flags);
2755                         CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d "
2756                               "(%d attempts left)\n", 
2757                               peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
2758                               cv->cv_arprc, peer->ibp_arp_count);
2759
2760                         kibnal_schedule_peer_arp(peer);
2761                 }
2762                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2763                 return;
2764         }
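        /* NB the retry case above bumped ibp_connecting *before*
         * kibnal_connreq_done() can drop this conn's attempt, so the peer
         * never looks idle in between and the rescheduled ARP keeps its
         * connection attempt alive. */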
2765
2766         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
2767                 CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
2768
2769                 *path = *arp->primary_path;
2770
2771                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
2772                                          &cv->cv_port);
2773                 LASSERT (vvrc == vv_return_ok);
2774
2775                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2776                                      &path->sgid, &cv->cv_sgid_index);
2777                 LASSERT (vvrc == vv_return_ok);
2778
2779                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2780                                        path->pkey, &cv->cv_pkey_index);
2781                 LASSERT (vvrc == vv_return_ok);
2782
2783                 path->mtu = IBNAL_IB_MTU;
2784
2785         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
2786                 CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
2787                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2788
2789                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
2790                 cv->cv_sgid_index = IBNAL_SGID_IDX;
2791                 cv->cv_port = arp->local_port_num;
2792
2793                 memset(path, 0, sizeof(*path));
2794
2795                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
2796                                          &path->sgid);
2797                 LASSERT (vvrc == vv_return_ok);
2798
2799                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
2800                                          &path->slid);
2801                 LASSERT (vvrc == vv_return_ok);
2802
2803                 path->dgid          = arp->gid;
2804                 path->sl            = IBNAL_SERVICE_LEVEL;
2805                 path->dlid          = arp->lid;
2806                 path->mtu           = IBNAL_IB_MTU;
2807                 path->rate          = IBNAL_STATIC_RATE;
2808                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
2809                 path->pkey          = IBNAL_PKEY;
2810                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
2811         } else {
2812                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n", 
2813                        peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2814                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2815                 return;
2816         }
2817
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                kibnal_connreq_done(conn, 1, rc);
                return;                 /* conn is being torn down */
        }
2822
2823         /* do the actual connection request */
2824         kibnal_connect_conn(conn);
2825 }
2826
2827 void
2828 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
2829 {
2830         /* CAVEAT EMPTOR: tasklet context */
        kib_conn_t      *conn = (kib_conn_t *)arg;
        kib_peer_t      *peer;

        LASSERT (conn != NULL);
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);

        peer = conn->ibc_peer;

        if (arprc != ibat_stat_ok)
                CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc);
        else
                CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u OK: LID %s PATH %s\n",
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip),
                       (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
                       (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
2845
2846         conn->ibc_connvars->cv_arprc = arprc;
2847         if (arprc == ibat_stat_ok)
2848                 conn->ibc_connvars->cv_arp = *arp_data;
2849         
2850         kibnal_schedule_conn(conn);
2851         kibnal_conn_decref(conn);
2852 }
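
/* NB kibnal_arp_callback() runs in tasklet context, so it can't sleep or
 * take the locks the connection setup path needs.  It just records the ARP
 * result in the connvars and hands the conn to the connd, which calls
 * kibnal_arp_done() in thread context (asserted there). */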
2853
2854 void
2855 kibnal_arp_peer (kib_peer_t *peer)
2856 {
2857         cm_cep_handle_t  cep;
2858         kib_conn_t      *conn;
2859         int              ibatrc;
2860
2861         /* Only the connd does this (i.e. single threaded) */
2862         LASSERT (current == kibnal_data.kib_connd);
2863         LASSERT (peer->ibp_connecting != 0);
2864         LASSERT (peer->ibp_arp_count > 0);
2865
2866         cep = cm_create_cep(cm_cep_transp_rc);
2867         if (cep == NULL) {
2868                 CERROR ("Can't create cep for conn->"LPX64"\n",
2869                         peer->ibp_nid);
2870                 kibnal_peer_connect_failed(peer, 1);
2871                 return;
2872         }
2873
2874         conn = kibnal_create_conn(cep);
2875         if (conn == NULL) {
2876                 CERROR ("Can't allocate conn->"LPX64"\n",
2877                         peer->ibp_nid);
2878                 cm_destroy_cep(cep);
2879                 kibnal_peer_connect_failed(peer, 1);
2880                 return;
2881         }
2882
2883         conn->ibc_peer = peer;
2884         kibnal_peer_addref(peer);
2885
2886         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2887
2888         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, 
2889                                   ibat_paths_primary,
2890                                   &conn->ibc_connvars->cv_arp, 
2891                                   kibnal_arp_callback, conn, 0);
2892         CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
2893         switch (ibatrc) {
2894         default:
2895                 LBUG();
2896                 
2897         case ibat_stat_pending:
2898                 /* NB callback has my ref on conn */
2899                 break;
2900                 
2901         case ibat_stat_ok:
2902         case ibat_stat_error:
2903         case ibat_stat_timeout:
2904         case ibat_stat_not_found:
2905                 /* Immediate return (ARP cache hit or failure) == no callback. 
2906                  * Do the next stage directly... */
2907                 conn->ibc_connvars->cv_arprc = ibatrc;
2908                 kibnal_arp_done(conn);
2909                 kibnal_conn_decref(conn);
2910                 break;
2911         }
2912 }
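
/* NB the conn above is created holding my ref.  If ibat_get_ib_data()
 * returns ibat_stat_pending that ref is donated to kibnal_arp_callback(),
 * which drops it after scheduling the conn on the connd; on any immediate
 * return there is no callback, so I run kibnal_arp_done() inline and drop
 * the ref myself. */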
2913
2914 int
2915 kibnal_conn_timed_out (kib_conn_t *conn)
2916 {
2917         kib_tx_t          *tx;
2918         struct list_head  *ttmp;
2919
2920         spin_lock(&conn->ibc_lock);
2921
2922         list_for_each (ttmp, &conn->ibc_tx_queue) {
2923                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2924
2925                 LASSERT (tx->tx_queued);
2926
2927                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2928                         spin_unlock(&conn->ibc_lock);
2929                         return 1;
2930                 }
2931         }
2932
2933         list_for_each (ttmp, &conn->ibc_active_txs) {
2934                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2935
2936                 LASSERT (!tx->tx_queued);
2937                 LASSERT (tx->tx_waiting ||
2938                          tx->tx_sending != 0);
2939
2940                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2941                         spin_unlock(&conn->ibc_lock);
2942                         return 1;
2943                 }
2944         }
2945
2946         spin_unlock(&conn->ibc_lock);
2947         return 0;
2948 }
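
/* NB this relies on each tx carrying a tx_deadline, presumably stamped
 * (jiffies + the io timeout) when the tx was first queued, so one sweep of
 * both lists catches tx's stuck in the send queue as well as active tx's
 * still awaiting completions or peer responses. */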
2949
2950 void
2951 kibnal_check_conns (int idx)
2952 {
2953         struct list_head  *peers = &kibnal_data.kib_peers[idx];
2954         struct list_head  *ptmp;
2955         kib_peer_t        *peer;
2956         kib_conn_t        *conn;
2957         struct list_head  *ctmp;
2958         unsigned long      flags;
2959
2960  again:
        /* NB. We expect to scan all the peers and find no RDMAs to time
         * out, so a shared (read) lock suffices while we look... */
2964         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2965
2966         list_for_each (ptmp, peers) {
2967                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2968
2969                 list_for_each (ctmp, &peer->ibp_conns) {
2970                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
2971
2972                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2973
2974                         /* In case we have enough credits to return via a
2975                          * NOOP, but there were no non-blocking tx descs
2976                          * free to do it last time... */
2977                         kibnal_check_sends(conn);
2978
2979                         if (!kibnal_conn_timed_out(conn))
2980                                 continue;
2981
2982                         /* Handle timeout by closing the whole connection.  We
2983                          * can only be sure RDMA activity has ceased once the
2984                          * QP has been modified. */
2985                         
2986                         kibnal_conn_addref(conn); /* 1 ref for me... */
2987
2988                         read_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2989                                                flags);
2990
2991                         CERROR("Timed out RDMA with "LPX64"\n",
2992                                peer->ibp_nid);
2993
2994                         kibnal_close_conn (conn, -ETIMEDOUT);
2995                         kibnal_conn_decref(conn); /* ...until here */
2996
2997                         /* start again now I've dropped the lock */
2998                         goto again;
2999                 }
3000         }
3001
3002         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3003 }
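
/* NB closing a timed-out conn requires dropping the global lock, which
 * invalidates the list iteration, hence the 'goto again' above; the conn
 * just closed is presumably unlinked from ibp_conns, so the rescan won't
 * find it a second time. */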
3004
3005 void
3006 kibnal_disconnect_conn (kib_conn_t *conn)
3007 {
        static cm_drequest_data_t dreq;         /* static just for the space; only connd uses it */
3009         
3010         cm_return_t    cmrc;
3011         unsigned long  flags;
3012
3013         LASSERT (!in_interrupt());
3014         LASSERT (current == kibnal_data.kib_connd);
3015         
3016         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3017
3018         if (conn->ibc_disconnect) {
3019                 /* Had the CM callback already */
3020                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3021                                         flags);
3022                 kibnal_conn_disconnected(conn);
3023                 return;
3024         }
3025                 
3026         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3027
3028         /* active disconnect */
3029         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3030         if (cmrc == cm_stat_success) {
3031                 /* waiting for CM */
3032                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3033                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3034                 return;
3035         }
3036
3037         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3038
3039         cm_cancel(conn->ibc_cep);
3040         kibnal_pause(HZ/10);
3041
3042         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
3043                 kibnal_conn_decref(conn);
3044         
3045         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3046         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3047
3048         kibnal_conn_disconnected(conn);
3049 }
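
/* NB disconnect is two-phase: in IBNAL_CONN_DISCONNECT1 I still owe the CM
 * a disconnect request; once cm_disconnect() succeeds the conn moves to
 * IBNAL_CONN_DISCONNECT2 and waits for kibnal_cm_callback() to signal
 * completion.  Either way kibnal_conn_disconnected() runs only when the CM
 * can no longer call back into this conn. */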
3050
3051 int
3052 kibnal_connd (void *arg)
3053 {
3054         wait_queue_t       wait;
3055         unsigned long      flags;
3056         kib_pcreq_t       *pcr;
3057         kib_conn_t        *conn;
3058         kib_peer_t        *peer;
3059         int                timeout;
3060         int                i;
3061         int                dropped_lock;
3062         int                peer_index = 0;
3063         unsigned long      deadline = jiffies;
3064         
3065         kportal_daemonize ("kibnal_connd");
3066         kportal_blockallsigs ();
3067
3068         init_waitqueue_entry (&wait, current);
3069         kibnal_data.kib_connd = current;
3070
3071         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3072
3073         while (!kibnal_data.kib_shutdown) {
3074
3075                 dropped_lock = 0;
3076
3077                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3078                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
3079                                            kib_conn_t, ibc_list);
3080                         list_del (&conn->ibc_list);
3081                         
3082                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3083                         dropped_lock = 1;
3084
3085                         kibnal_destroy_conn(conn);
3086
3087                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3088                 }
3089
3090                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3091                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3092                                          kib_pcreq_t, pcr_list);
3093                         list_del(&pcr->pcr_list);
3094                         
3095                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3096                         dropped_lock = 1;
3097
3098                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3099                         PORTAL_FREE(pcr, sizeof(*pcr));
3100
3101                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3102                 }
3103                         
3104                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3105                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3106                                            kib_peer_t, ibp_connd_list);
3107                         
3108                         list_del_init (&peer->ibp_connd_list);
3109                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3110                         dropped_lock = 1;
3111
3112                         kibnal_arp_peer (peer);
3113                         kibnal_peer_decref (peer);
3114
3115                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3116                 }
3117
3118                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3119                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3120                                            kib_conn_t, ibc_list);
3121                         list_del (&conn->ibc_list);
3122                         
3123                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3124                         dropped_lock = 1;
3125
3126                         switch (conn->ibc_state) {
3127                         default:
3128                                 LBUG();
3129                                 
3130                         case IBNAL_CONN_ACTIVE_ARP:
3131                                 kibnal_arp_done(conn);
3132                                 break;
3133
3134                         case IBNAL_CONN_ACTIVE_CONNECT:
3135                                 kibnal_check_connreply(conn);
3136                                 break;
3137
3138                         case IBNAL_CONN_PASSIVE_WAIT:
3139                                 kibnal_check_passive_wait(conn);
3140                                 break;
3141
3142                         case IBNAL_CONN_DISCONNECT1:
3143                         case IBNAL_CONN_DISCONNECT2:
3144                                 kibnal_disconnect_conn(conn);
3145                                 break;
3146                         }
3147                         kibnal_conn_decref(conn);
3148
3149                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3150                 }
3151
3152                 /* careful with the jiffy wrap... */
3153                 timeout = (int)(deadline - jiffies);
3154                 if (timeout <= 0) {
                        const int n = 4;        /* checks per timeout interval */
                        const int p = 1;        /* check period (seconds) */
3157                         int       chunk = kibnal_data.kib_peer_hash_size;
3158                         
3159                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3160                         dropped_lock = 1;
3161
3162                         /* Time to check for RDMA timeouts on a few more
3163                          * peers: I do checks every 'p' seconds on a
3164                          * proportion of the peer table and I need to check
3165                          * every connection 'n' times within a timeout
3166                          * interval, to ensure I detect a timeout on any
3167                          * connection within (n+1)/n times the timeout
3168                          * interval. */
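
                        /* e.g. (illustrative numbers) with kib_io_timeout
                         * = 60s, n = 4, p = 1 and a 101-bucket peer hash:
                         *
                         *   chunk = (101 * 4 * 1) / 60 = 6
                         *
                         * so ~6 buckets are scanned per 1-second tick and
                         * the whole table is covered in 17 ticks, i.e.
                         * roughly timeout/n, comfortably within one
                         * timeout interval. */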
3169
3170                         if (kibnal_tunables.kib_io_timeout > n * p)
3171                                 chunk = (chunk * n * p) / 
3172                                         kibnal_tunables.kib_io_timeout;
3173                         if (chunk == 0)
3174                                 chunk = 1;
3175
3176                         for (i = 0; i < chunk; i++) {
3177                                 kibnal_check_conns (peer_index);
3178                                 peer_index = (peer_index + 1) % 
3179                                              kibnal_data.kib_peer_hash_size;
3180                         }
3181
3182                         deadline += p * HZ;
3183                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3184                 }
3185
3186                 if (dropped_lock)
3187                         continue;
3188                 
                /* Nothing to do: sleep for 'timeout' jiffies or until woken */
3190                 set_current_state (TASK_INTERRUPTIBLE);
3191                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3192                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3193
3194                 schedule_timeout (timeout);
3195
3196                 set_current_state (TASK_RUNNING);
3197                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3198                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3199         }
3200
3201         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3202
3203         kibnal_thread_fini ();
3204         return (0);
3205 }
3206
3207 void 
3208 kibnal_async_callback(vv_event_record_t ev)
3209 {
        CERROR("HCA async event - type: %d, port: %d, data: "LPX64"\n",
               ev.event_type, ev.port_num, ev.type.data);
3212 }
3213
3214 void
3215 kibnal_cq_callback (unsigned long unused_context)
3216 {
3217         unsigned long    flags;
3218
        CDEBUG(D_NET, "CQ completion notification\n");
3220
3221         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3222         kibnal_data.kib_ready = 1;
3223         wake_up(&kibnal_data.kib_sched_waitq);
3224         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3225 }
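
/* NB completion notification is one-shot: the callback above fires once
 * per re-arm, so the schedulers must poll the CQ dry and only then call
 * vv_request_completion_notification() again.  kib_ready and
 * kib_checking_cq serialise this: one scheduler polls at a time, and a
 * notification arriving mid-poll just leaves kib_ready set so the CQ gets
 * re-checked. */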
3226
3227 int
3228 kibnal_scheduler(void *arg)
3229 {
3230         long            id = (long)arg;
3231         wait_queue_t    wait;
3232         char            name[16];
3233         vv_wc_t         wc;
3234         vv_return_t     vvrc;
3235         vv_return_t     vvrc2;
3236         unsigned long   flags;
3237         kib_rx_t       *rx;
3238         __u64           rxseq = 0;
3239         int             busy_loops = 0;
3240
3241         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3242         kportal_daemonize(name);
3243         kportal_blockallsigs();
3244
3245         init_waitqueue_entry(&wait, current);
3246
3247         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3248
3249         while (!kibnal_data.kib_shutdown) {
3250                 if (busy_loops++ >= IBNAL_RESCHED) {
3251                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3252                                                flags);
3253
3254                         our_cond_resched();
3255                         busy_loops = 0;
3256                         
3257                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3258                 }
3259
3260                 if (kibnal_data.kib_ready &&
3261                     !kibnal_data.kib_checking_cq) {
3262                         /* take ownership of completion polling */
3263                         kibnal_data.kib_checking_cq = 1;
3264                         /* Assume I'll exhaust the CQ */
3265                         kibnal_data.kib_ready = 0;
3266                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, 
3267                                                flags);
3268                         
3269                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca, 
3270                                                       kibnal_data.kib_cq, &wc);
3271                         if (vvrc == vv_return_err_cq_empty) {
3272                                 vvrc2 = vv_request_completion_notification(
3273                                         kibnal_data.kib_hca, 
3274                                         kibnal_data.kib_cq, 
3275                                         vv_next_solicit_unsolicit_event);
3276                                 LASSERT (vvrc2 == vv_return_ok);
3277                         }
3278
3279                         if (vvrc == vv_return_ok &&
3280                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3281                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3282
3283                                 /* Grab the RX sequence number NOW before
3284                                  * anyone else can get an RX completion */
3285                                 rxseq = rx->rx_conn->ibc_rxseq++;
3286                         }
3287
3288                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3289                         /* give up ownership of completion polling */
3290                         kibnal_data.kib_checking_cq = 0;
3291
3292                         if (vvrc == vv_return_err_cq_empty)
3293                                 continue;
3294
3295                         LASSERT (vvrc == vv_return_ok);
3296                         /* Assume there's more: get another scheduler to check
3297                          * while I handle this completion... */
3298
3299                         kibnal_data.kib_ready = 1;
3300                         wake_up(&kibnal_data.kib_sched_waitq);
3301
3302                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3303                                                flags);
3304
3305                         switch (kibnal_wreqid2type(wc.wr_id)) {
3306                         case IBNAL_WID_RX:
3307                                 kibnal_rx_complete(
3308                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3309                                         wc.completion_status,
3310                                         wc.num_bytes_transfered,
3311                                         rxseq);
3312                                 break;
3313
3314                         case IBNAL_WID_TX:
3315                                 kibnal_tx_complete(
3316                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3317                                         wc.completion_status);
3318                                 break;
3319
3320                         case IBNAL_WID_RDMA:
3321                                 /* We only get RDMA completion notification if
3322                                  * it fails.  So we just ignore them completely
3323                                  * because...
3324                                  *
3325                                  * 1) If an RDMA fails, all subsequent work
3326                                  * items, including the final SEND will fail
3327                                  * too, so I'm still guaranteed to notice that
3328                                  * this connection is hosed.
3329                                  *
3330                                  * 2) It's positively dangerous to look inside
3331                                  * the tx descriptor obtained from an RDMA work
3332                                  * item.  As soon as I drop the kib_sched_lock,
3333                                  * I give a scheduler on another CPU a chance
3334                                  * to get the final SEND completion, so the tx
3335                                  * descriptor can get freed as I inspect it. */
3336                                 CERROR ("RDMA failed: %d\n", 
3337                                         wc.completion_status);
3338                                 break;
3339
3340                         default:
3341                                 LBUG();
3342                         }
3343                         
3344                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3345                         continue;
3346                 }
3347
3348                 /* Nothing to do; sleep... */
3349
3350                 set_current_state(TASK_INTERRUPTIBLE);
3351                 add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3352                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3353                                        flags);
3354
3355                 schedule();
3356
3357                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3358                 set_current_state(TASK_RUNNING);
3359                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3360         }
3361
3362         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3363
3364         kibnal_thread_fini();
3365         return (0);
3366 }
3367
3369 lib_nal_t kibnal_lib = {
3370         .libnal_data = &kibnal_data,      /* NAL private data */
3371         .libnal_send = kibnal_send,
3372         .libnal_send_pages = kibnal_send_pages,
3373         .libnal_recv = kibnal_recv,
3374         .libnal_recv_pages = kibnal_recv_pages,
3375         .libnal_dist = kibnal_dist
3376 };