* fix for 5809: vibnal tx_sending race
[fs/lustre-release.git] lnet/klnds/viblnd/viblnd_cb.c
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *   Author: Eric Barton <eric@bartonsoftware.com>
 *   Author: Frank Zago <fzago@systemfabricworks.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "vibnal.h"

void
kibnal_tx_done (kib_tx_t *tx)
{
        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
        int              i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */

#if !IBNAL_WHOLE_MEM
        switch (tx->tx_mapped) {
        default:
                LBUG();

        case KIB_TX_UNMAPPED:
                break;

        case KIB_TX_MAPPED: {
                vv_return_t      vvrc;

                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
                                             tx->tx_md.md_handle);
                LASSERT (vvrc == vv_return_ok);
                tx->tx_mapped = KIB_TX_UNMAPPED;
                break;
        }
        }
#endif
        for (i = 0; i < 2; i++) {
                /* tx may have up to 2 libmsgs to finalise */
                if (tx->tx_libmsg[i] == NULL)
                        continue;

                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                tx->tx_libmsg[i] = NULL;
        }

        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);

        if (tx->tx_isnblk) {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
        } else {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
                wake_up (&kibnal_data.kib_idle_tx_waitq);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);
}
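
/* NB idle tx descriptors live on two pools: kib_idle_txs for normal
 * (blocking) allocation and kib_idle_nblk_txs as a reserve for callers
 * that may not block (e.g. completions sent from scheduler context).
 * kibnal_tx_done() above returns a tx to the pool it was drawn from,
 * and only wakes waiters for the blocking pool. */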

kib_tx_t *
kibnal_get_idle_tx (int may_block)
{
        kib_tx_t      *tx = NULL;
        ENTRY;

        for (;;) {
                spin_lock(&kibnal_data.kib_tx_lock);

                /* "normal" descriptor is free */
                if (!list_empty (&kibnal_data.kib_idle_txs)) {
                        tx = list_entry (kibnal_data.kib_idle_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                if (!may_block) {
                        /* may dip into reserve pool */
                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                CERROR ("reserved tx desc pool exhausted\n");
                                break;
                        }

                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                /* block for idle tx */
                spin_unlock(&kibnal_data.kib_tx_lock);

                wait_event (kibnal_data.kib_idle_tx_waitq,
                            !list_empty (&kibnal_data.kib_idle_txs) ||
                            kibnal_data.kib_shutdown);
        }

        if (tx != NULL) {
                list_del (&tx->tx_list);

                /* Allocate a new completion cookie.  It might not be needed,
                 * but we've got a lock right now and we're unlikely to
                 * wrap... */
                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
#if !IBNAL_WHOLE_MEM
                /* NB tx_mapped only exists (and is only ever set) in
                 * !IBNAL_WHOLE_MEM builds */
                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
#endif
                LASSERT (tx->tx_nwrq == 0);
                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending == 0);
                LASSERT (!tx->tx_waiting);
                LASSERT (tx->tx_status == 0);
                LASSERT (tx->tx_conn == NULL);
                LASSERT (tx->tx_libmsg[0] == NULL);
                LASSERT (tx->tx_libmsg[1] == NULL);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);

        RETURN(tx);
}
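
/* The tx_cookie stamped above is echoed back by the peer in completion
 * messages (PUT_NAK/PUT_ACK/PUT_DONE/GET_DONE) and used by
 * kibnal_find_waiting_tx_locked() to match the reply to this tx. */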

int
kibnal_post_rx (kib_rx_t *rx, int credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());

        rx->rx_gl = (vv_scatgat_t) {
                .v_address = (void *)((unsigned long)KIBNAL_RX_VADDR(rx)),
                .l_key     = KIBNAL_RX_LKEY(rx),
                .length    = IBNAL_MSG_SIZE,
        };

        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (!rx->rx_posted);

        CDEBUG(D_NET, "posting rx [%d %x %p]\n",
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               rx->rx_wrq.scatgat_list->v_address);

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }

        rx->rx_posted = 1;

        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);
        spin_unlock(&conn->ibc_lock);

        if (vvrc == vv_return_ok) {
                if (credit) {
                        spin_lock(&conn->ibc_lock);
                        conn->ibc_outstanding_credits++;
                        spin_unlock(&conn->ibc_lock);

                        kibnal_check_sends(conn);
                }
                return 0;
        }

        CERROR ("post rx -> "LPX64" failed %d\n",
                conn->ibc_peer->ibp_nid, vvrc);
        rc = -EIO;
        kibnal_close_conn(rx->rx_conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}
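
/* Posting a receive with 'credit' set returns a flow-control credit to
 * the peer: ibc_outstanding_credits is bumped here and piggybacked on
 * the next message sent (see kibnal_pack_msg() in kibnal_check_sends()),
 * telling the peer it may post another send to us. */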

int
kibnal_post_receives (kib_conn_t *conn)
{
        int    i;
        int    rc;

        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (conn->ibc_comms_error == 0);

        for (i = 0; i < IBNAL_RX_MSGS; i++) {
                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                 * fails (i.e. actual failure or we're disconnecting) */
                kibnal_conn_addref(conn);
                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
                if (rc != 0)
                        return rc;
        }

        return 0;
}

kib_tx_t *
kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
        struct list_head   *tmp;

        list_for_each(tmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);

                if (tx->tx_cookie != cookie)
                        continue;

                if (tx->tx_waiting &&
                    tx->tx_msg->ibm_type == txtype)
                        return tx;

                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
                      tx->tx_waiting ? "" : "NOT ",
                      tx->tx_msg->ibm_type, txtype);
        }
        return NULL;
}

void
kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
        kib_tx_t    *tx;
        int          idle;

        spin_lock(&conn->ibc_lock);

        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);

                CWARN("Unmatched completion type %x cookie "LPX64
                      " from "LPX64"\n",
                      txtype, cookie, conn->ibc_peer->ibp_nid);
                kibnal_close_conn (conn, -EPROTO);
                return;
        }

        if (tx->tx_status == 0) {               /* success so far */
                if (status < 0) {               /* failed? */
                        tx->tx_status = status;
                } else if (txtype == IBNAL_MSG_GET_REQ) {
                        /* XXX layering violation: set REPLY data length */
                        LASSERT (tx->tx_libmsg[1] != NULL);
                        LASSERT (tx->tx_libmsg[1]->ev.type ==
                                 PTL_EVENT_REPLY_END);

                        tx->tx_libmsg[1]->ev.mlength = status;
                }
        }

        tx->tx_waiting = 0;

        idle = !tx->tx_queued && (tx->tx_sending == 0);
        if (idle)
                list_del(&tx->tx_list);

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done(tx);
}
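
/* NB a non-negative 'status' in a GET_DONE doubles as the number of
 * bytes the peer actually moved (kibnal_init_rdma() returns nob on
 * success and stashes it in ibcm_status), which is why it can be stored
 * directly as the REPLY event's mlength above. */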

void
kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
{
        kib_tx_t    *tx = kibnal_get_idle_tx(0);

        if (tx == NULL) {
                CERROR("Can't get tx for completion %x for "LPX64"\n",
                       type, conn->ibc_peer->ibp_nid);
                return;
        }

        tx->tx_msg->ibm_u.completion.ibcm_status = status;
        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));

        kibnal_queue_tx(tx, conn);
}

void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);

        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from "LPX64"\n",
                       msg->ibm_type, conn->ibc_peer->ibp_nid);
                break;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                break;

        case IBNAL_MSG_PUT_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
                if (rx->rx_responded)
                        break;

                /* I wasn't asked to transfer any payload data.  This happens
                 * if the PUT didn't match, or got truncated. */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
                                       msg->ibm_u.putreq.ibprm_cookie);
                break;

        case IBNAL_MSG_PUT_NAK:
                CWARN ("PUT_NAK from "LPX64"\n", conn->ibc_peer->ibp_nid);
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_PUT_ACK:
                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
                               conn->ibc_peer->ibp_nid);
                        kibnal_close_conn(conn, -EPROTO);
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                      &msg->ibm_u.putack.ibpam_rd,
                                      msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc < 0)
                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
                               conn->ibc_peer->ibp_nid, rc);

                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc < 0)
                        tx->tx_status = rc;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;

        case IBNAL_MSG_PUT_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_GET_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
                if (rx->rx_responded)           /* I responded to the GET_REQ */
                        break;
                /* NB GET didn't match (I'd have responded even with no payload
                 * data) */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
                                       msg->ibm_u.get.ibgm_cookie);
                break;

        case IBNAL_MSG_GET_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        kibnal_post_rx(rx, 1);
}
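
/* Message flows handled above, for reference:
 *   PUT:  active sends PUT_REQ; passive replies PUT_ACK (carrying its
 *         sink descriptor) or PUT_NAK; active RDMA-writes the payload
 *         and sends PUT_DONE.
 *   GET:  active sends GET_REQ (carrying its sink descriptor); passive
 *         RDMA-writes the reply payload and sends GET_DONE (-ENODATA
 *         if the GET didn't match).
 * Every rx is re-posted with credit == 1, returning its credit. */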

void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_posted);
        rx->rx_posted = 0;

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from "LPX64" failed: %d\n",
                       conn->ibc_peer->ibp_nid, vvrc);
                goto failed;
        }

        rc = kibnal_unpack_msg(msg, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from "LPX64"\n",
                        rc, conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from "LPX64"\n",
                        conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from "LPX64
                        ": got "LPD64" but expected "LPD64"\n",
                        conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
                goto failed;
        }

        /* racing with connection establishment/teardown! */

        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                        flags);
        }
        kibnal_handle_rx(rx);
        return;

 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}

#if IBNAL_WHOLE_MEM
int
kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                     unsigned long page_offset, unsigned long len)
{
        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
        vv_l_key_t       l_key;
        vv_r_key_t       r_key;
        __u64            addr;
        __u64            frag_addr;
        void            *ptr;
        vv_mem_reg_h_t   mem_h;
        vv_return_t      vvrc;

        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
                CERROR ("Too many RDMA fragments\n");
                return -EMSGSIZE;
        }

#ifdef CONFIG_HIGHMEM
# error "This probably doesn't work because of over/underflow when casting between __u64 and void *..."
#endif
        /* Try to create an address that adapter-tavor will munge into a valid
         * network address, given how it maps all phys mem into 1 region */
        addr = page_to_phys(page) + page_offset + PAGE_OFFSET;

        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                    (void *)((unsigned long)addr),
                                    len, &mem_h, &l_key, &r_key);
        LASSERT (vvrc == vv_return_ok);

        if (active) {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = l_key;
                } else if (l_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = addr;
        } else {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = r_key;
                } else if (r_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                vv_va2advertise_addr(kibnal_data.kib_hca,
                                     (void *)((unsigned long)addr), &ptr);
                frag_addr = (unsigned long)ptr;
        }

        kibnal_rf_set(frag, frag_addr, len);

        CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
               rd->rd_nfrag, frag->rf_nob, rd->rd_key,
               frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);

        rd->rd_nfrag++;
        return 0;
}
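
/* NB the active side (the one issuing RDMA writes) describes fragments
 * with the local key and virtual address, while the passive side
 * advertises the remote key and the network-visible address obtained
 * from vv_va2advertise_addr(); either way all fragments of one
 * descriptor must share a single key. */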

struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END)
                page = vmalloc_to_page ((void *)vaddr);
#ifdef CONFIG_HIGHMEM
        else if (vaddr >= PKMAP_BASE &&
                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
                page = vmalloc_to_page ((void *)vaddr);
        /* in 2.4 ^ just walks the page tables */
#endif
        else
                page = virt_to_page (vaddr);

        return VALID_PAGE(page) ? page : NULL;
}

int
kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
                    vv_access_con_bit_mask_t access,
                    int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           fragnob;
        int           rc;
        unsigned long vaddr;
        struct page  *page;
        int           page_offset;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (niov > 0);

                vaddr = ((unsigned long)iov->iov_base) + offset;
                page_offset = vaddr & (PAGE_SIZE - 1);
                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR ("Can't find page\n");
                        return -EFAULT;
                }

                fragnob = min((int)(iov->iov_len - offset), nob);
                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

                rc = kibnal_append_rdfrag(rd, active, page,
                                          page_offset, fragnob);
                if (rc != 0)
                        return rc;

                if (offset + fragnob < iov->iov_len) {
                        offset += fragnob;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
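
/* Illustrative example of the walk above (assuming page-aligned
 * iov_base and 4K pages): with iov = [{6K}, {3K}], offset 5K and
 * nob 3K, the first pass maps the 1K tail of the first iov, then 2K
 * from the second; one rdfrag is emitted per page-contiguous piece. */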

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            fragnob;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (nkiov > 0);
                fragnob = min((int)(kiov->kiov_len - offset), nob);

                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
                                          kiov->kiov_offset + offset,
                                          fragnob);
                if (rc != 0)
                        return rc;

                offset = 0;
                kiov++;
                nkiov--;
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
#else
int
kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                     vv_access_con_bit_mask_t access,
                     int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int         active = ((access & vv_acc_r_mem_write) == 0);
        void       *vaddr;
        vv_return_t vvrc;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);

        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
                                      kibnal_data.kib_pd, access,
                                      &tx->tx_md.md_handle,
                                      &tx->tx_md.md_lkey,
                                      &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
                return -EFAULT;
        }

        tx->tx_mapped = KIB_TX_MAPPED;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

        return (0);
}

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        vv_return_t    vvrc;
        vv_phy_list_t  phys_pages;
        vv_phy_buf_t  *phys;
        int            page_offset;
        int            nphys;
        int            resid;
        int            phys_size;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        phys_size = nkiov * sizeof (*phys);
        PORTAL_ALLOC(phys, phys_size);
        if (phys == NULL) {
                CERROR ("Can't allocate tmp phys\n");
                return (-ENOMEM);
        }

        page_offset = kiov->kiov_offset + offset;

        phys[0].start = kibnal_page2phys(kiov->kiov_page);
        phys[0].size = PAGE_SIZE;

        nphys = 1;
        resid = nob - (kiov->kiov_len - offset);

        while (resid > 0) {
                kiov++;
                nkiov--;
                LASSERT (nkiov > 0);

                if (kiov->kiov_offset != 0 ||
                    ((resid > PAGE_SIZE) &&
                     kiov->kiov_len < PAGE_SIZE)) {
                        int i;
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM: "
                                "page %d, offset %d, len %d\n", nphys,
                                kiov->kiov_offset, kiov->kiov_len);

                        for (i = -nphys; i < nkiov; i++)
                                CERROR("kiov[%d] %p +%d for %d\n",
                                       i, kiov[i].kiov_page,
                                       kiov[i].kiov_offset,
                                       kiov[i].kiov_len);

                        rc = -EINVAL;
                        goto out;
                }

                LASSERT (nphys * sizeof (*phys) < phys_size);
                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
                phys[nphys].size = PAGE_SIZE;

                nphys++;
                resid -= PAGE_SIZE;
        }

#if 0
        {
                int i;
                CWARN ("nphys %d, nob %d, page_offset %d\n",
                       nphys, nob, page_offset);
                for (i = 0; i < nphys; i++)
                        CWARN ("   [%d] "LPX64"\n", i, phys[i].start);
        }
#endif

        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &phys_pages,
                                          IBNAL_RDMA_BASE,
                                          nphys,
                                          page_offset,
                                          kibnal_data.kib_pd,
                                          access,
                                          &tx->tx_md.md_handle,
                                          &tx->tx_md.md_addr,
                                          &tx->tx_md.md_lkey,
                                          &tx->tx_md.md_rkey);

        if (vvrc != vv_return_ok) {
                CERROR ("Can't map phys: %d\n", vvrc);
                rc = -EFAULT;
                goto out;
        }

        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
               "lkey %x, rkey %x, addr "LPX64"\n",
               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
               tx->tx_md.md_addr);

        tx->tx_mapped = KIB_TX_MAPPED;
        rc = 0;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

 out:
        PORTAL_FREE(phys, phys_size);
        return (rc);
}
#endif

kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}
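
/* A peer can transiently own more than one connection (e.g. while an
 * old one is tearing down); taking the first is presumed sufficient
 * since any live connection will do for new sends. */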

void
kibnal_check_sends (kib_conn_t *conn)
{
        kib_tx_t       *tx;
        vv_return_t     vvrc;
        int             rc;
        int             done;

        /* Don't send anything until after the connection is established */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                CDEBUG(D_NET, LPX64": too soon\n", conn->ibc_peer->ibp_nid);
                return;
        }

        spin_lock(&conn->ibc_lock);

        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

        if (list_empty(&conn->ibc_tx_queue) &&
            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                spin_unlock(&conn->ibc_lock);

                tx = kibnal_get_idle_tx(0);     /* don't block */
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock(&conn->ibc_lock);

                if (tx != NULL)
                        kibnal_queue_tx_locked(tx, conn);
        }

        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);
                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
                        CDEBUG(D_NET, LPX64": posted enough\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 0) {   /* no credits */
                        CDEBUG(D_NET, LPX64": no credits\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
                        CDEBUG(D_NET, LPX64": not using last credit\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                list_del (&tx->tx_list);
                tx->tx_queued = 0;

                /* NB don't drop ibc_lock before bumping tx_sending */

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                        /* redundant NOOP */
                        spin_unlock(&conn->ibc_lock);
                        kibnal_tx_done(tx);
                        spin_lock(&conn->ibc_lock);
                        CDEBUG(D_NET, LPX64": redundant noop\n",
                               conn->ibc_peer->ibp_nid);
                        continue;
                }

                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                conn->ibc_txseq);

                conn->ibc_txseq++;
                conn->ibc_outstanding_credits = 0;
                conn->ibc_nsends_posted++;
                conn->ibc_credits--;

                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
                 * and then re-queued here.  It's (just) possible that
                 * tx_sending is non-zero if we've not yet done the
                 * tx_complete() for the first send; hence the ++ rather
                 * than = below. */
                tx->tx_sending++;

                list_add (&tx->tx_list, &conn->ibc_active_txs);

                /* Keep holding ibc_lock while posting sends on this
                 * connection; vv_post_send() isn't re-entrant on the same
                 * QP!! */

                LASSERT (tx->tx_nwrq > 0);

                rc = -ECONNABORTED;
                vvrc = vv_return_ok;
                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        tx->tx_status = 0;
                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
                                                 conn->ibc_qp,
                                                 tx->tx_nwrq,
                                                 tx->tx_wrq,
                                                 vv_operation_type_send_rc);
                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                }

                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_waiting = 0;
                        tx->tx_sending--;

                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock(&conn->ibc_lock);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to "LPX64"\n",
                                        vvrc, conn->ibc_peer->ibp_nid);
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to "
                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock(&conn->ibc_lock);
}
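
/* NB tx_sending is incremented under ibc_lock *before* the work
 * requests are posted, and decremented only by kibnal_tx_complete()
 * (or by the failed-post path above); whichever thread sees the tx go
 * idle frees it.  This ordering is what the tx_sending race fix in
 * this commit (bug 5809) is about. */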

void
kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
        kib_conn_t   *conn = tx->tx_conn;
        int           failed = (vvrc != vv_comp_status_success);
        int           idle;

        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
               tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);

        LASSERT (tx->tx_sending > 0);

        if (failed &&
            tx->tx_status == 0 &&
            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                CERROR("tx -> "LPX64" type %x cookie "LPX64
                       " sending %d waiting %d: failed %d\n",
                       conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type,
                       tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);

        spin_lock(&conn->ibc_lock);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'. */

        tx->tx_sending--;
        conn->ibc_nsends_posted--;

        if (failed) {
                tx->tx_waiting = 0;
                tx->tx_status = -EIO;
        }

        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               !tx->tx_waiting &&               /* Not waiting for peer */
               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
        if (idle)
                list_del(&tx->tx_list);

        kibnal_conn_addref(conn);               /* 1 ref for me.... */

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done (tx);

        if (failed)
                kibnal_close_conn (conn, -EIO);
        else
                kibnal_check_sends(conn);

        kibnal_conn_decref(conn);               /* ...until here */
}

void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;

        LASSERT (tx->tx_nwrq >= 0 &&
                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        kibnal_init_msg(tx->tx_msg, type, body_nob);

        *gl = (vv_scatgat_t) {
                .v_address = (void *)((unsigned long)KIBNAL_TX_VADDR(tx)),
                .l_key     = KIBNAL_TX_LKEY(tx),
                .length    = nob,
        };

        memset(wrq, 0, sizeof(*wrq));

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
        wrq->wr_type = vv_wr_send;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->completion_notification = 1;
        wrq->type.send.solicited_event = 1;
        wrq->type.send.immidiate_data_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;

        tx->tx_nwrq++;
}
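
/* NB 'immidiate_data_indicator' and 'fance_indicator' are spelled as
 * they appear in the Voltaire vverbs API.  The message work request is
 * always the last entry of tx_wrq: it follows any RDMA-write work
 * requests set up by kibnal_init_rdma() below, which is why the
 * piggybacked credits ride on it. */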

int
kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
        int              resid = nob;
        kib_msg_t       *ibmsg = tx->tx_msg;
        kib_rdma_desc_t *srcrd = tx->tx_rd;
        kib_rdma_frag_t *srcfrag;
        int              srcidx;
        kib_rdma_frag_t *dstfrag;
        int              dstidx;
        vv_scatgat_t    *gl;
        vv_wr_t         *wrq;
        int              wrknob;
        int              rc;

        /* Called by scheduler */
        LASSERT (!in_interrupt());

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        srcidx = dstidx = 0;
        srcfrag = &srcrd->rd_frags[0];
        dstfrag = &dstrd->rd_frags[0];
        rc = resid;

        while (resid > 0) {
                if (srcidx >= srcrd->rd_nfrag) {
                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
                        rc = -EPROTO;
                        break;
                }

                if (dstidx == dstrd->rd_nfrag) {
                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
                        rc = -EPROTO;
                        break;
                }

                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
                               srcidx, srcrd->rd_nfrag,
                               dstidx, dstrd->rd_nfrag);
                        rc = -EMSGSIZE;
                        break;
                }

                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);

                gl = &tx->tx_gl[tx->tx_nwrq];
                gl->v_address = (void *)((unsigned long)kibnal_rf_addr(srcfrag));
                gl->length    = wrknob;
                gl->l_key     = srcrd->rd_key;

                wrq = &tx->tx_wrq[tx->tx_nwrq];

                wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
                wrq->completion_notification = 0;
                wrq->scatgat_list = gl;
                wrq->num_of_data_segments = 1;
                wrq->wr_type = vv_wr_rdma_write;
                wrq->type.send.solicited_event = 0;
                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

                resid -= wrknob;
                if (wrknob < srcfrag->rf_nob) {
                        /* advance by the bytes just consumed (wrknob),
                         * NOT by the bytes still remaining (resid) */
                        kibnal_rf_set(srcfrag,
                                      kibnal_rf_addr(srcfrag) + wrknob,
                                      srcfrag->rf_nob - wrknob);
                } else {
                        srcfrag++;
                        srcidx++;
                }

                if (wrknob < dstfrag->rf_nob) {
                        kibnal_rf_set(dstfrag,
                                      kibnal_rf_addr(dstfrag) + wrknob,
                                      dstfrag->rf_nob - wrknob);
                } else {
                        dstfrag++;
                        dstidx++;
                }

                tx->tx_nwrq++;
        }

        if (rc < 0)                             /* no RDMA if completing with failure */
                tx->tx_nwrq = 0;

        ibmsg->ibm_u.completion.ibcm_status = rc;
        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        return rc;
}
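
/* Worked example of the fragment walk above (illustrative): copying
 * nob = 12K from src frags [8K, 4K] to dst frags [4K, 8K] emits three
 * 4K RDMA writes; after each one, the partially-consumed fragment is
 * advanced by wrknob bytes so the src and dst cursors stay aligned.
 * Advancing by resid instead would mis-address every split fragment. */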

void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked (tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}

void
kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
        kib_peer_t      *peer;
        kib_conn_t      *conn;
        unsigned long    flags;
        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;

        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems */

        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */

        read_lock_irqsave(g_lock, flags);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                read_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                read_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...to here */
                return;
        }

        /* Making one or more connections; I'll need a write lock...
         * NB irqs stay disabled across the unlock/lock; 'flags' is
         * restored by the write_unlock_irqrestore() below. */
        read_unlock(g_lock);
        write_lock(g_lock);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                write_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                /* Connection exists; queue message on it */
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                write_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...until here */
                return;
        }

        if (peer->ibp_connecting == 0) {
                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                        write_unlock_irqrestore(g_lock, flags);
                        tx->tx_status = -EHOSTUNREACH;
                        tx->tx_waiting = 0;
                        kibnal_tx_done (tx);
                        return;
                }

                peer->ibp_connecting = 1;
                kibnal_peer_addref(peer); /* extra ref for connd */

                /* NB irqs are already disabled (g_lock is held irqsave),
                 * so don't clobber 'flags' with a second irqsave here */
                spin_lock(&kibnal_data.kib_connd_lock);

                list_add_tail (&peer->ibp_connd_list,
                               &kibnal_data.kib_connd_peers);
                wake_up (&kibnal_data.kib_connd_waitq);

                spin_unlock(&kibnal_data.kib_connd_lock);
        }

        /* A connection is being established; queue the message... */
        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

        write_unlock_irqrestore(g_lock, flags);
}
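
/* The peer/conn lookup is repeated after upgrading to the write lock
 * because the lock was dropped in between: another thread may have
 * created or torn down the connection meanwhile.  Messages queued on
 * ibp_tx_queue are handed to the new connection (or completed with
 * failure) once the connection attempt resolves. */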

int
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
        /* I would guess that if kibnal_get_peer (nid) == NULL,
           and we're not routing, then 'nid' is very distant :) */
        if ( nal->libnal_ni.ni_pid.nid == nid ) {
                *dist = 0;
        } else {
                *dist = 1;
        }

        return 0;
}
1322
1323 ptl_err_t
1324 kibnal_sendmsg(lib_nal_t    *nal, 
1325                void         *private,
1326                lib_msg_t    *libmsg,
1327                ptl_hdr_t    *hdr, 
1328                int           type, 
1329                ptl_nid_t     nid, 
1330                ptl_pid_t     pid,
1331                unsigned int  payload_niov, 
1332                struct iovec *payload_iov, 
1333                ptl_kiov_t   *payload_kiov,
1334                int           payload_offset,
1335                int           payload_nob)
1336 {
1337         kib_msg_t  *ibmsg;
1338         kib_tx_t   *tx;
1339         int         nob;
1340         int         rc;
1341         int         n;
1342
1343         /* NB 'private' is different depending on what we're sending.... */
1344
1345         CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
1346                " pid %d\n", payload_nob, payload_niov, nid , pid);
1347
1348         LASSERT (payload_nob == 0 || payload_niov > 0);
1349         LASSERT (payload_niov <= PTL_MD_MAX_IOV);
1350
1351         /* Thread context */
1352         LASSERT (!in_interrupt());
1353         /* payload is either all vaddrs or all pages */
1354         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1355
1356         switch (type) {
1357         default:
1358                 LBUG();
1359                 return (PTL_FAIL);
1360                 
1361         case PTL_MSG_REPLY: {
1362                 /* reply's 'private' is the incoming receive */
1363                 kib_rx_t *rx = private;
1364
1365                 LASSERT(rx != NULL);
1366
1367                 if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
1368                         /* RDMA not expected */
1369                         nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1370                         if (nob > IBNAL_MSG_SIZE) {
1371                                 CERROR("REPLY for "LPX64" too big (RDMA not requested):"
1372                                        "%d (max for message is %d)\n", 
1373                                        nid, payload_nob, IBNAL_MSG_SIZE);
1374                                 CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
1375                                        nob, nid);
1376                                 return PTL_FAIL;
1377                         }
1378                         break;
1379                 }
1380
1381                 /* Incoming message consistent with RDMA? */
1382                 if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
1383                         CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
1384                                nid, rx->rx_msg->ibm_type);
1385                         return PTL_FAIL;
1386                 }
1387
1388                 /* NB rx_complete() will send GET_NAK when I return to it from
1389                  * here, unless I set rx_responded! */
1390
1391                 tx = kibnal_get_idle_tx(0);
1392                 if (tx == NULL) {
1393                         CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
1394                         return PTL_FAIL;
1395                 }
1396
1397                 if (payload_nob == 0)
1398                         rc = 0;
1399                 else if (payload_kiov == NULL)
1400                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
1401                                                  payload_niov, payload_iov, 
1402                                                  payload_offset, payload_nob);
1403                 else
1404                         rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1405                                                   payload_niov, payload_kiov,
1406                                                   payload_offset, payload_nob);
1407                 if (rc != 0) {
1408                         CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
1409                         kibnal_tx_done(tx);
1410                         return PTL_FAIL;
1411                 }
1412                 
1413                 rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
1414                                       &rx->rx_msg->ibm_u.get.ibgm_rd,
1415                                       rx->rx_msg->ibm_u.get.ibgm_cookie);
1416                 if (rc < 0) {
1417                         CERROR("Can't setup rdma for GET from "LPX64": %d\n", 
1418                                nid, rc);
1419                 } else if (rc == 0) {
1420                         /* No RDMA: local completion may happen now! */
1421                         lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
1422                 } else {
1423                         /* RDMA: lib_finalize(libmsg) when it completes */
1424                         tx->tx_libmsg[0] = libmsg;
1425                 }
1426
1427                 kibnal_queue_tx(tx, rx->rx_conn);
1428                 rx->rx_responded = 1;
1429                 return (rc >= 0) ? PTL_OK : PTL_FAIL;
1430         }
1431
1432         case PTL_MSG_GET:
1433                 /* will the REPLY message be small enough not to need RDMA? */
1434                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
1435                 if (nob <= IBNAL_MSG_SIZE)
1436                         break;
1437
1438                 tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
1439                 LASSERT (tx != NULL);
1440
1441                 ibmsg = tx->tx_msg;
1442                 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1443                 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
1444
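                /* Register the reply buffer for remote write: the peer
                 * RDMA-writes the GET payload directly into it before
                 * sending GET_DONE */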
1445                 if ((libmsg->md->options & PTL_MD_KIOV) == 0)
1446                         rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1447                                                  vv_acc_r_mem_write,
1448                                                  libmsg->md->md_niov,
1449                                                  libmsg->md->md_iov.iov,
1450                                                  0, libmsg->md->length);
1451                 else
1452                         rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1453                                                   vv_acc_r_mem_write,
1454                                                   libmsg->md->md_niov,
1455                                                   libmsg->md->md_iov.kiov,
1456                                                   0, libmsg->md->length);
1457                 if (rc != 0) {
1458                         CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
1459                         kibnal_tx_done(tx);
1460                         return PTL_FAIL;
1461                 }
1462
1463                 n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1464                 nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1465                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
1466
1467                 tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
1468                 if (tx->tx_libmsg[1] == NULL) {
1469                         CERROR("Can't create reply for GET -> "LPX64"\n", nid);
1470                         kibnal_tx_done(tx);
1471                         return PTL_FAIL;
1472                 }
1473
1474                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
1475                 tx->tx_waiting = 1;             /* waiting for GET_DONE */
1476                 kibnal_launch_tx(tx, nid);
1477                 return PTL_OK;
1478
1479         case PTL_MSG_ACK:
1480                 LASSERT (payload_nob == 0);
1481                 break;
1482
1483         case PTL_MSG_PUT:
1484                 /* Is the payload small enough not to need RDMA? */
1485                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1486                 if (nob <= IBNAL_MSG_SIZE)
1487                         break;
1488
1489                 tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
1490                 LASSERT (tx != NULL);
1491
1492                 if (payload_kiov == NULL)
1493                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1494                                                  payload_niov, payload_iov,
1495                                                  payload_offset, payload_nob);
1496                 else
1497                         rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1498                                                   payload_niov, payload_kiov,
1499                                                   payload_offset, payload_nob);
1500                 if (rc != 0) {
1501                         CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
1502                         kibnal_tx_done(tx);
1503                         return PTL_FAIL;
1504                 }
1505
1506                 ibmsg = tx->tx_msg;
1507                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1508                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1509                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1510
1511                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1512                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1513                 kibnal_launch_tx(tx, nid);
1514                 return PTL_OK;
1515         }
1516
1517         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1518                  <= IBNAL_MSG_SIZE);
1519
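        /* ACKs and REPLYs are sent in response to an incoming message, so
         * the sender may not block for a tx descriptor; anything else comes
         * from an app thread that can wait for one */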
1520         tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
1521                                   type == PTL_MSG_REPLY));
1522         if (tx == NULL) {
1523                 CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
1524                 return PTL_NO_SPACE;
1525         }
1526
1527         ibmsg = tx->tx_msg;
1528         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1529
1530         if (payload_nob > 0) {
1531                 if (payload_kiov != NULL)
1532                         lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1533                                           payload_niov, payload_kiov,
1534                                           payload_offset, payload_nob);
1535                 else
1536                         lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1537                                          payload_niov, payload_iov,
1538                                          payload_offset, payload_nob);
1539         }
1540
1541         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1542         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1543
1544         tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
1545         kibnal_launch_tx(tx, nid);
1546         return PTL_OK;
1547 }
1548
1549 ptl_err_t
1550 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1551                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1552                unsigned int payload_niov, struct iovec *payload_iov,
1553                size_t payload_offset, size_t payload_len)
1554 {
1555         CDEBUG(D_NET, "  pid = %d, nid="LPU64"\n",
1556                pid, nid);
1557         return (kibnal_sendmsg(nal, private, cookie,
1558                                hdr, type, nid, pid,
1559                                payload_niov, payload_iov, NULL,
1560                                payload_offset, payload_len));
1561 }
1562
1563 ptl_err_t
1564 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
1565                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1566                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
1567                      size_t payload_offset, size_t payload_len)
1568 {
1569         return (kibnal_sendmsg(nal, private, cookie,
1570                                hdr, type, nid, pid,
1571                                payload_niov, NULL, payload_kiov,
1572                                payload_offset, payload_len));
1573 }
1574
1575 ptl_err_t
1576 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
1577                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
1578                  size_t offset, int mlen, int rlen)
1579 {
1580         kib_rx_t    *rx = private;
1581         kib_msg_t   *rxmsg = rx->rx_msg;
1582         kib_conn_t  *conn = rx->rx_conn;
1583         kib_tx_t    *tx;
1584         kib_msg_t   *txmsg;
1585         int          nob;
1586         int          rc;
1587         int          n;
1588         
1589         LASSERT (mlen <= rlen);
1590         LASSERT (mlen >= 0);
1591         LASSERT (!in_interrupt());
1592         /* Either all pages or all vaddrs */
1593         LASSERT (!(kiov != NULL && iov != NULL));
1594
1595         switch (rxmsg->ibm_type) {
1596         default:
1597                 LBUG();
1598                 
1599         case IBNAL_MSG_IMMEDIATE:
1600                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1601                 if (nob > IBNAL_MSG_SIZE) {
1602                         CERROR ("Immediate message from "LPX64" too big: %d\n",
1603                                 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
1604                         return (PTL_FAIL);
1605                 }
1606
1607                 if (kiov != NULL)
1608                         lib_copy_buf2kiov(niov, kiov, offset,
1609                                           rxmsg->ibm_u.immediate.ibim_payload,
1610                                           mlen);
1611                 else
1612                         lib_copy_buf2iov(niov, iov, offset,
1613                                          rxmsg->ibm_u.immediate.ibim_payload,
1614                                          mlen);
1615
1616                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1617                 return (PTL_OK);
1618
1619         case IBNAL_MSG_PUT_REQ:
1620                 /* NB rx_complete() will send PUT_NAK when I return to it from
1621                  * here, unless I set rx_responded!  */
1622
1623                 if (mlen == 0) { /* No payload to RDMA */
1624                         lib_finalize(nal, NULL, libmsg, PTL_OK);
1625                         return PTL_OK;
1626                 }
1627
1628                 tx = kibnal_get_idle_tx(0);
1629                 if (tx == NULL) {
1630                         CERROR("Can't allocate tx for "LPX64"\n",
1631                                conn->ibc_peer->ibp_nid);
1632                         return PTL_FAIL;
1633                 }
1634
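                /* Reply with a PUT_ACK describing where the peer should
                 * RDMA-write the payload, so the sink needs remote-write
                 * access */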
1635                 txmsg = tx->tx_msg;
1636                 if (kiov == NULL)
1637                         rc = kibnal_setup_rd_iov(tx, 
1638                                                  &txmsg->ibm_u.putack.ibpam_rd,
1639                                                  vv_acc_r_mem_write,
1640                                                  niov, iov, offset, mlen);
1641                 else
1642                         rc = kibnal_setup_rd_kiov(tx,
1643                                                   &txmsg->ibm_u.putack.ibpam_rd,
1644                                                   vv_acc_r_mem_write,
1645                                                   niov, kiov, offset, mlen);
1646                 if (rc != 0) {
1647                         CERROR("Can't setup PUT sink for "LPX64": %d\n",
1648                                conn->ibc_peer->ibp_nid, rc);
1649                         kibnal_tx_done(tx);
1650                         return PTL_FAIL;
1651                 }
1652
1653                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1654                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1655
1656                 n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1657                 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1658                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1659
1660                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1661                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1662                 kibnal_queue_tx(tx, conn);
1663
1664                 LASSERT (!rx->rx_responded);
1665                 rx->rx_responded = 1;
1666                 return PTL_OK;
1667
1668         case IBNAL_MSG_GET_REQ:
1669                 /* We get called here just to discard any junk after the
1670                  * GET hdr. */
1671                 LASSERT (libmsg == NULL);
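                /* NB libmsg is NULL here, so this lib_finalize() is a no-op:
                 * it returns immediately when passed a NULL msg */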
1672                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1673                 return (PTL_OK);
1674         }
1675 }
1676
1677 ptl_err_t
1678 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
1679               unsigned int niov, struct iovec *iov, 
1680               size_t offset, size_t mlen, size_t rlen)
1681 {
1682         return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
1683                                 offset, mlen, rlen));
1684 }
1685
1686 ptl_err_t
1687 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
1688                      unsigned int niov, ptl_kiov_t *kiov, 
1689                      size_t offset, size_t mlen, size_t rlen)
1690 {
1691         return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
1692                                 offset, mlen, rlen));
1693 }
1694
1695 int
1696 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1697 {
1698         long    pid = kernel_thread (fn, arg, 0);
1699
1700         if (pid < 0)
1701                 return ((int)pid);
1702
1703         atomic_inc (&kibnal_data.kib_nthreads);
1704         return (0);
1705 }
1706
1707 void
1708 kibnal_thread_fini (void)
1709 {
1710         atomic_dec (&kibnal_data.kib_nthreads);
1711 }
1712
1713 void
1714 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1715 {
1716         /* This just does the immediate housekeeping.  'error' is zero for a
1717          * normal shutdown which can happen only after the connection has been
1718          * established.  If the connection is established, schedule the
1719          * connection to be finished off by the connd.  Otherwise the connd is
1720          * already dealing with it (either to set it up or tear it down).
1721          * Caller holds kib_global_lock exclusively in irq context */
1722         kib_peer_t       *peer = conn->ibc_peer;
1723         struct list_head *tmp;
1724         
1725         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1726
1727         if (error != 0 && conn->ibc_comms_error == 0)
1728                 conn->ibc_comms_error = error;
1729
1730         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1731                 return; /* already being handled  */
1732
1733         spin_lock(&conn->ibc_lock);
1734         
1735         if (error == 0 &&
1736             list_empty(&conn->ibc_tx_queue) &&
1737             list_empty(&conn->ibc_active_txs)) {
1738                 CDEBUG(D_NET, "closing conn to "LPX64
1739                        " tx# "LPD64" rx# "LPD64"\n", 
1740                        peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq);
1741         } else {
1742                 CERROR("Closing conn to "LPX64": error %d%s%s"
1743                        " tx# "LPD64" rx# "LPD64"\n",
1744                        peer->ibp_nid, error,
1745                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1746                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1747                        conn->ibc_txseq, conn->ibc_rxseq);
1748
1749                 list_for_each(tmp, &conn->ibc_tx_queue) {
1750                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1751                         
1752                         CERROR("   queued tx type %x cookie "LPX64
1753                                " sending %d waiting %d ticks %ld/%d\n", 
1754                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1755                                tx->tx_sending, tx->tx_waiting,
1756                                (long)(tx->tx_deadline - jiffies), HZ);
1757                 }
1758
1759                 list_for_each(tmp, &conn->ibc_active_txs) {
1760                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1761                         
1762                         CERROR("   active tx type %x cookie "LPX64
1763                                " sending %d waiting %d ticks %ld/%d\n", 
1764                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1765                                tx->tx_sending, tx->tx_waiting,
1766                                (long)(tx->tx_deadline - jiffies), HZ);
1767                 }
1768         }
1769
1770         spin_unlock(&conn->ibc_lock);
1771
1772         /* connd takes ibc_list's ref */
1773         list_del (&conn->ibc_list);
1774         
1775         if (list_empty (&peer->ibp_conns) &&
1776             peer->ibp_persistence == 0) {
1777                 /* Non-persistent peer with no more conns... */
1778                 kibnal_unlink_peer_locked (peer);
1779         }
1780
1781         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1782
1783         spin_lock(&kibnal_data.kib_connd_lock);
1784
1785         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1786         wake_up (&kibnal_data.kib_connd_waitq);
1787                 
1788         spin_unlock(&kibnal_data.kib_connd_lock);
1789 }
1790
1791 void
1792 kibnal_close_conn (kib_conn_t *conn, int error)
1793 {
1794         unsigned long flags;
1795         
1796         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1797
1798         kibnal_close_conn_locked (conn, error);
1799         
1800         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1801 }
1802
1803 void
1804 kibnal_handle_early_rxs(kib_conn_t *conn)
1805 {
1806         unsigned long    flags;
1807         kib_rx_t        *rx;
1808
1809         LASSERT (!in_interrupt());
1810         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1811         
1812         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1813         while (!list_empty(&conn->ibc_early_rxs)) {
1814                 rx = list_entry(conn->ibc_early_rxs.next,
1815                                 kib_rx_t, rx_list);
1816                 list_del(&rx->rx_list);
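                /* kibnal_handle_rx() may post sends and take other locks, so
                 * it can't run under the irq-disabled global lock */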
1817                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1818                 
1819                 kibnal_handle_rx(rx);
1820                 
1821                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1822         }
1823         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1824 }
1825
1826 void
1827 kibnal_conn_disconnected(kib_conn_t *conn)
1828 {
1829         LIST_HEAD        (zombies); 
1830         struct list_head *tmp;
1831         struct list_head *nxt;
1832         kib_tx_t         *tx;
1833
1834         /* I'm the connd */
1835         LASSERT (!in_interrupt());
1836         LASSERT (current == kibnal_data.kib_connd);
1837         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
1838         
1839         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
1840
1841         /* move QP to error state to make posted work items complete */
1842         kibnal_set_qp_state(conn, vv_qp_state_error);
1843
1844         spin_lock(&conn->ibc_lock);
1845
1846         /* Complete all tx descs not waiting for sends to complete.
1847          * NB we should be safe from RDMA now that the QP has changed state */
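        /* Txs with sends still outstanding stay on their lists; the send
         * completion will see tx_sending drop to zero and finish them with
         * kibnal_tx_done() */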
1848
1849         list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
1850                 tx = list_entry (tmp, kib_tx_t, tx_list);
1851
1852                 LASSERT (tx->tx_queued);
1853
1854                 tx->tx_status = -ECONNABORTED;
1855                 tx->tx_queued = 0;
1856                 tx->tx_waiting = 0;
1857                 
1858                 if (tx->tx_sending != 0)
1859                         continue;
1860
1861                 list_del (&tx->tx_list);
1862                 list_add (&tx->tx_list, &zombies);
1863         }
1864
1865         list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
1866                 tx = list_entry (tmp, kib_tx_t, tx_list);
1867
1868                 LASSERT (!tx->tx_queued);
1869                 LASSERT (tx->tx_waiting ||
1870                          tx->tx_sending != 0);
1871
1872                 tx->tx_status = -ECONNABORTED;
1873                 tx->tx_waiting = 0;
1874                 
1875                 if (tx->tx_sending != 0)
1876                         continue;
1877
1878                 list_del (&tx->tx_list);
1879                 list_add (&tx->tx_list, &zombies);
1880         }
1881         
1882         spin_unlock(&conn->ibc_lock);
1883
1884         while (!list_empty(&zombies)) {
1885                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1886
1887                 list_del(&tx->tx_list);
1888                 kibnal_tx_done (tx);
1889         }
1890
1891         kibnal_handle_early_rxs(conn);
1892 }
1893
1894 void
1895 kibnal_peer_connect_failed (kib_peer_t *peer, int active)
1896 {
1897         LIST_HEAD        (zombies);     /* initialised: tested below even if nothing is spliced in */
1898         kib_tx_t         *tx;
1899         unsigned long     flags;
1900
1901         /* Only the connd creates conns => single threaded */
1902         LASSERT (!in_interrupt());
1903         LASSERT (current == kibnal_data.kib_connd);
1904         LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
1905
1906         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1907
1908         if (active) {
1909                 LASSERT (peer->ibp_connecting != 0);
1910                 peer->ibp_connecting--;
1911         } else {
1912                 LASSERT (!kibnal_peer_active(peer));
1913         }
1914         
1915         if (peer->ibp_connecting != 0) {
1916                 /* another connection attempt under way (loopback?)... */
1917                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1918                 return;
1919         }
1920
1921         if (list_empty(&peer->ibp_conns)) {
1922                 /* Say when active connection can be re-attempted */
1923                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
1924                 /* Increase reconnection interval */
1925                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
1926                                                     IBNAL_MAX_RECONNECT_INTERVAL);
1927         
1928                 /* Take peer's blocked transmits to complete with error */
1929                 list_add(&zombies, &peer->ibp_tx_queue);
1930                 list_del_init(&peer->ibp_tx_queue);
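                /* list_add() inserted 'zombies' right after ibp_tx_queue, so
                 * removing ibp_tx_queue leaves the blocked txs chained off
                 * 'zombies' */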
1931                 
1932                 if (kibnal_peer_active(peer) &&
1933                     (peer->ibp_persistence == 0)) {
1934                         /* failed connection attempt on non-persistent peer */
1935                         kibnal_unlink_peer_locked (peer);
1936                 }
1937         } else {
1938                 /* Can't have blocked transmits if there are connections */
1939                 LASSERT (list_empty(&peer->ibp_tx_queue));
1940         }
1941         
1942         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1943
1944         if (list_empty (&zombies)) 
1945                 return;
1946         
1947         CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
1948         do {
1949                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1950
1951                 list_del (&tx->tx_list);
1952                 /* complete now */
1953                 tx->tx_status = -EHOSTUNREACH;
1954                 kibnal_tx_done (tx);
1955         } while (!list_empty (&zombies));
1956 }
1957
1958 void
1959 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
1960 {
1961         static cm_reject_data_t   rej;
1962
1963         struct list_head   txs;
1964         kib_peer_t        *peer = conn->ibc_peer;
1965         kib_peer_t        *peer2;
1966         unsigned long      flags;
1967         kib_tx_t          *tx;
1968
1969         /* Only the connd creates conns => single threaded */
1970         LASSERT (!in_interrupt());
1971         LASSERT (current == kibnal_data.kib_connd);
1972         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
1973
1974         if (active) {
1975                 LASSERT (peer->ibp_connecting > 0);
1976         } else {
1977                 LASSERT (!kibnal_peer_active(peer));
1978         }
1979         
1980         PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
1981         conn->ibc_connvars = NULL;
1982
1983         if (status != 0) {
1984                 /* failed to establish connection */
1985                 switch (conn->ibc_state) {
1986                 default:
1987                         LBUG();
1988                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
1989                         /* got a connection reply but failed checks */
1990                         LASSERT (active);
1991                         memset(&rej, 0, sizeof(rej));
1992                         rej.reason = cm_rej_code_usr_rej;
1993                         cm_reject(conn->ibc_cep, &rej);
1994                         break;
1995
1996                 case IBNAL_CONN_ACTIVE_CONNECT:
1997                         LASSERT (active);
1998                         cm_cancel(conn->ibc_cep);
1999                         kibnal_pause(HZ/10);
2000                         /* cm_connect() failed immediately or
2001                          * callback returned failure */
2002                         break;
2003
2004                 case IBNAL_CONN_ACTIVE_ARP:
2005                         LASSERT (active);
2006                         /* ibat_get_ib_data() failed immediately 
2007                          * or callback returned failure */
2008                         break;
2009
2010                 case IBNAL_CONN_INIT:
2011                         break;
2012
2013                 case IBNAL_CONN_PASSIVE_WAIT:
2014                         LASSERT (!active);
2015                         /* cm_accept callback returned failure */
2016                         break;
2017                 }
2018
2019                 kibnal_peer_connect_failed(conn->ibc_peer, active);
2020                 kibnal_conn_disconnected(conn);
2021                 return;
2022         }
2023
2024         /* connection established */
2025         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2026
2027         if (active) {
2028                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2029         } else {
2030                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2031         }
2032         
2033         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2034
2035         if (!active) {
2036                 peer2 = kibnal_find_peer_locked(peer->ibp_nid);
2037                 if (peer2 != NULL) {
2038                         /* already in the peer table; swap */
2039                         conn->ibc_peer = peer2;
2040                         kibnal_peer_addref(peer2);
2041                         kibnal_peer_decref(peer);
2042                         peer = conn->ibc_peer;
2043                 } else {
2044                         /* add 'peer' to the peer table */
2045                         kibnal_peer_addref(peer);
2046                         list_add_tail(&peer->ibp_list,
2047                                       kibnal_nid2peerlist(peer->ibp_nid));
2048                 }
2049         }
2050         
2051         /* Add conn to peer's list and nuke any dangling conns from a different
2052          * peer instance... */
2053         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2054         list_add(&conn->ibc_list, &peer->ibp_conns);
2055         kibnal_close_stale_conns_locked (conn->ibc_peer,
2056                                          conn->ibc_incarnation);
2057
2058         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2059             conn->ibc_comms_error != 0 ||       /* comms error */
2060             conn->ibc_disconnect) {             /* need to disconnect */
2061                 
2062                 /* start to shut down connection */
2063                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2064
2065                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2066                 kibnal_peer_connect_failed(peer, active);
2067                 return;
2068         }
2069
2070         if (active)
2071                 peer->ibp_connecting--;
2072
2073         /* grab pending txs while I have the lock */
2074         list_add(&txs, &peer->ibp_tx_queue);
2075         list_del_init(&peer->ibp_tx_queue);
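        /* same splice as in kibnal_peer_connect_failed(): 'txs' becomes the
         * new head of the queued txs */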
2076         
2077         /* reset reconnect interval for next attempt */
2078         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
2079         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2080
2081         /* Schedule blocked txs */
2082         spin_lock (&conn->ibc_lock);
2083         while (!list_empty (&txs)) {
2084                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2085                 list_del (&tx->tx_list);
2086
2087                 kibnal_queue_tx_locked (tx, conn);
2088         }
2089         spin_unlock (&conn->ibc_lock);
2090         kibnal_check_sends (conn);
2091
2092         /* schedule blocked rxs */
2093         kibnal_handle_early_rxs(conn);
2094 }
2095
2096 void
2097 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2098 {
2099         static cm_dreply_data_t drep;           /* just zeroed space */
2100         
2101         kib_conn_t             *conn = (kib_conn_t *)arg;
2102         unsigned long           flags;
2103         
2104         /* CAVEAT EMPTOR: tasklet context */
2105
2106         switch (cmdata->status) {
2107         default:
2108                 LBUG();
2109                 
2110         case cm_event_disconn_request:
2111                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2112                  * IBNAL_CONN_ESTABLISHED: I start it closing
2113                  * otherwise:              it's closing anyway */
2114                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2115                 cm_cancel(conn->ibc_cep);
2116
2117                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2118                 LASSERT (!conn->ibc_disconnect);
2119                 conn->ibc_disconnect = 1;
2120
2121                 switch (conn->ibc_state) {
2122                 default:
2123                         LBUG();
2124
2125                 case IBNAL_CONN_ACTIVE_RTU:
2126                         /* kibnal_connreq_done is getting there; it'll see
2127                          * ibc_disconnect set... */
2128                         kibnal_conn_decref(conn); /* lose my ref */
2129                         break;
2130
2131                 case IBNAL_CONN_ESTABLISHED:
2132                         /* kibnal_connreq_done got there already; get
2133                          * disconnect going... */
2134                         kibnal_close_conn_locked(conn, 0);
2135                         kibnal_conn_decref(conn); /* lose my ref */
2136                         break;
2137
2138                 case IBNAL_CONN_DISCONNECT1:
2139                         /* kibnal_terminate_conn is getting there; it'll see
2140                          * ibc_disconnect set... */
2141                         kibnal_conn_decref(conn); /* lose my ref */
2142                         break;
2143
2144                 case IBNAL_CONN_DISCONNECT2:
2145                         /* kibnal_terminate_conn got there already; complete
2146                          * the disconnect.  NB kib_connd_conns takes my ref */
2147                         spin_lock(&kibnal_data.kib_connd_lock);
2148                         list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2149                         wake_up(&kibnal_data.kib_connd_waitq);
2150                         spin_unlock(&kibnal_data.kib_connd_lock);
2151                         break;
2152                 }
2153                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2154                 return;
2155                 
2156         case cm_event_disconn_timeout:
2157         case cm_event_disconn_reply:
2158                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2159                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2160                 LASSERT (!conn->ibc_disconnect);
2161                 conn->ibc_disconnect = 1;
2162
2163                 /* kibnal_terminate_conn sent the disconnect request.  
2164                  * NB kib_connd_conns takes my ref */
2165                 spin_lock(&kibnal_data.kib_connd_lock);
2166                 list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2167                 wake_up(&kibnal_data.kib_connd_waitq);
2168                 spin_unlock(&kibnal_data.kib_connd_lock);
2169
2170                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2171                 break;
2172                 
2173         case cm_event_connected:
2174         case cm_event_conn_timeout:
2175         case cm_event_conn_reject:
2176                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2177                 conn->ibc_connvars->cv_conndata = *cmdata;
2178                 
2179                 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2180                 list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2181                 wake_up(&kibnal_data.kib_connd_waitq);
2182                 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2183                 break;
2184         }
2185 }
2186
2187 void
2188 kibnal_check_passive_wait(kib_conn_t *conn)
2189 {
2190         int     rc;
2191
2192         switch (conn->ibc_connvars->cv_conndata.status) {
2193         default:
2194                 LBUG();
2195                 
2196         case cm_event_connected:
2197                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2198                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2199                 if (rc != 0)
2200                         conn->ibc_comms_error = rc;
2201                 /* connection _has_ been established; it's just that we've had
2202                  * an error immediately... */
2203                 kibnal_connreq_done(conn, 0, 0);
2204                 break;
2205                 
2206         case cm_event_conn_timeout:
2207                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2208                 break;
2209                 
2210         case cm_event_conn_reject:
2211                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2212                 break;
2213         }
2214 }
2215
2216 void
2217 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2218 {
2219         static cm_reply_data_t  reply;
2220         static cm_reject_data_t reject;
2221
2222         kib_msg_t          *rxmsg = (kib_msg_t *)cmreq->priv_data;
2223         kib_msg_t          *txmsg;
2224         kib_conn_t         *conn = NULL;
2225         int                 rc = 0;
2226         kib_connvars_t     *cv;
2227         kib_peer_t         *tmp_peer;
2228         cm_return_t         cmrc;
2229         vv_return_t         vvrc;
2230         
2231         /* I'm the connd executing in thread context
2232          * No concurrency problems with static data! */
2233         LASSERT (!in_interrupt());
2234         LASSERT (current == kibnal_data.kib_connd);
2235
2236         if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
2237                 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2238                        cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
2239                 goto reject;
2240         }
2241
2242         rc = kibnal_unpack_msg(rxmsg, cm_REQ_priv_data_len);
2243         if (rc != 0) {
2244                 CERROR("Can't parse connection request: %d\n", rc);
2245                 goto reject;
2246         }
2247
2248         if (rxmsg->ibm_type != IBNAL_MSG_CONNREQ) {
2249                 CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
2250                        rxmsg->ibm_type, rxmsg->ibm_srcnid);
2251                 goto reject;
2252         }
2253
2254         if (rxmsg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
2255                 CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
2256                        rxmsg->ibm_srcnid, rxmsg->ibm_dstnid);
2257                 goto reject;
2258         }
2259
2260         if (rxmsg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2261                 CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
2262                        rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_queue_depth, 
2263                        IBNAL_MSG_QUEUE_SIZE);
2264                 goto reject;
2265         }
2266
2267         if (rxmsg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2268                 CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
2269                        rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_max_msg_size, 
2270                        IBNAL_MSG_SIZE);
2271                 goto reject;
2272         }
2273                 
2274         if (rxmsg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2275                 CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
2276                        rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_max_frags, 
2277                        IBNAL_MAX_RDMA_FRAGS);
2278                 goto reject;
2279         }
2280                 
2281         conn = kibnal_create_conn(cep);
2282         if (conn == NULL) {
2283                 CERROR("Can't create conn for "LPX64"\n", rxmsg->ibm_srcnid);
2284                 goto reject;
2285         }
2286         
2287         /* assume 'rxmsg->ibm_srcnid' is a new peer */
2288         tmp_peer = kibnal_create_peer (rxmsg->ibm_srcnid);
2289         if (tmp_peer == NULL) {
2290                 CERROR("Can't create tmp peer for "LPX64"\n", rxmsg->ibm_srcnid);
2291                 kibnal_conn_decref(conn);
2292                 conn = NULL;
2293                 goto reject;
2294         }
2295
2296         conn->ibc_peer = tmp_peer;              /* conn takes over my ref */
2297         conn->ibc_incarnation = rxmsg->ibm_srcstamp;
2298         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2299
2300         cv = conn->ibc_connvars;
2301
2302         cv->cv_txpsn          = cmreq->cep_data.start_psn;
2303         cv->cv_remote_qpn     = cmreq->cep_data.qpn;
2304         cv->cv_path           = cmreq->path_data.path;
2305         cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
2306         // XXX                  cmreq->cep_data.retry_cnt;
2307         cv->cv_port           = cmreq->cep_data.local_port_num;
2308
2309         vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2310                              &cv->cv_path.sgid, &cv->cv_sgid_index);
2311         LASSERT (vvrc == vv_return_ok);
2312         
2313         vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2314                                cv->cv_path.pkey, &cv->cv_pkey_index);
2315         LASSERT (vvrc == vv_return_ok);
2316
2317         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2318         if (rc != 0)
2319                 goto reject;
2320
2321         rc = kibnal_post_receives(conn);
2322         if (rc != 0) {
2323                 CERROR("Can't post receives for "LPX64"\n", rxmsg->ibm_srcnid);
2324                 goto reject;
2325         }
2326
2327         rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2328         if (rc != 0)
2329                 goto reject;
2330         
2331         memset(&reply, 0, sizeof(reply));
2332         reply.qpn                 = cv->cv_local_qpn;
2333         reply.qkey                = IBNAL_QKEY;
2334         reply.start_psn           = cv->cv_rxpsn;
2335         reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2336         reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
2337         reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
2338         reply.rnr_retry_count     = cv->cv_rnr_count;
2339         reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
2340         
2341         txmsg = (kib_msg_t *)&reply.priv_data;
2342         kibnal_init_msg(txmsg, IBNAL_MSG_CONNACK, 
2343                         sizeof(txmsg->ibm_u.connparams));
2344         LASSERT (txmsg->ibm_nob <= cm_REP_priv_data_len);
2345         txmsg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2346         txmsg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2347         txmsg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2348         kibnal_pack_msg(txmsg, 0, rxmsg->ibm_srcnid, rxmsg->ibm_srcstamp, 0);
2349         
2350         kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2351         
2352         cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2353                          kibnal_cm_callback, conn);
2354
2355         if (cmrc == cm_stat_success)
2356                 return;                         /* callback has got my ref on conn */
2357
2358         /* back out state change (no callback happening) */
2359         kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2360         rc = -EIO;
2361                 
2362  reject:
2363         CERROR("Rejected connreq from "LPX64"\n", rxmsg->ibm_srcnid);
2364
2365         memset(&reject, 0, sizeof(reject));
2366         reject.reason = cm_rej_code_usr_rej;
2367         cm_reject(cep, &reject);
2368
2369         if (conn != NULL) {
2370                 LASSERT (rc != 0);
2371                 kibnal_connreq_done(conn, 0, rc);
2372         } else {
2373                 cm_destroy_cep(cep);
2374         }
2375 }
2376
2377 void
2378 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2379 {
2380         cm_request_data_t  *cmreq = &data->data.request;
2381         kib_pcreq_t        *pcr;
2382         unsigned long       flags;
2383         
2384         LASSERT (arg == NULL);
2385
2386         if (data->status != cm_event_conn_request) {
2387                 CERROR("status %d is not cm_event_conn_request\n",
2388                        data->status);
2389                 return;
2390         }
2391
2392         PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2393         if (pcr == NULL) {
2394                 CERROR("Can't allocate passive connreq\n");
2395
2396                 cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */
2397                                  {.reason = cm_rej_code_no_res,}));
2398                 cm_destroy_cep(cep);
2399                 return;
2400         }
2401
2402         pcr->pcr_cep = cep;
2403         pcr->pcr_cmreq = *cmreq;
2404         
2405         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2406
2407         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2408         wake_up(&kibnal_data.kib_connd_waitq);
2409         
2410         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2411 }
2412
2413
2414 void
2415 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, 
2416                                 void *arg)
2417 {
2418         /* CAVEAT EMPTOR: tasklet context */
2419         kib_conn_t       *conn = (kib_conn_t *)arg;
2420         kib_connvars_t   *cv = conn->ibc_connvars;
2421         unsigned long     flags;
2422
2423         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2424         cv->cv_conndata = *cd;
2425
2426         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2427         /* connd takes my ref */
2428         list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2429         wake_up(&kibnal_data.kib_connd_waitq);
2430         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2431 }
2432
2433 void
2434 kibnal_connect_conn (kib_conn_t *conn)
2435 {
2436         static cm_request_data_t  cmreq;
2437         kib_msg_t                *msg = (kib_msg_t *)&cmreq.priv_data;
2438         kib_connvars_t           *cv = conn->ibc_connvars;
2439         kib_peer_t               *peer = conn->ibc_peer;
2440         cm_return_t               cmrc;
2441         
2442         /* Only called by connd => statics OK */
2443         LASSERT (!in_interrupt());
2444         LASSERT (current == kibnal_data.kib_connd);
2445         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2446
2447         memset(&cmreq, 0, sizeof(cmreq));
2448         
2449         cmreq.sid = IBNAL_SERVICE_NUMBER;
2450
2451         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2452         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2453         cmreq.cep_data.retry_cnt            = IBNAL_RETRY_CNT;
2454         cmreq.cep_data.rtr_retry_cnt        = IBNAL_RNR_CNT;
2455         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2456         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2457         // XXX ack_timeout?
2458         // offered_resp_res
2459         // offered_initiator_depth
2460
2461         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2462         cmreq.path_data.path        = cv->cv_path;
2463         
2464         kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
2465         LASSERT(msg->ibm_nob <= cm_REQ_priv_data_len);
2466         msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2467         msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2468         msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2469         kibnal_pack_msg(msg, 0, peer->ibp_nid, 0, 0);
2470         
2471         CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
2472
2473         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2474         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2475
2476         cmrc = cm_connect(conn->ibc_cep, &cmreq, 
2477                           kibnal_active_connect_callback, conn);
2478         if (cmrc == cm_stat_success) {
2479                 CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
2480                        peer->ibp_nid);
2481                 return;
2482         }
2483
2484         CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2485         kibnal_conn_decref(conn);       /* drop callback's ref */
2486         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2487 }
2488
2489 void
2490 kibnal_check_connreply (kib_conn_t *conn)
2491 {
2492         static cm_rtu_data_t  rtu;
2493
2494         kib_connvars_t   *cv = conn->ibc_connvars;
2495         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2496         kib_msg_t        *msg = (kib_msg_t *)&reply->priv_data;
2497         kib_peer_t       *peer = conn->ibc_peer;
2498         cm_return_t       cmrc;
2499         cm_cep_handle_t   cep;
2500         unsigned long     flags;
2501         int               rc;
2502
2503         /* Only called by connd => statics OK */
2504         LASSERT (!in_interrupt());
2505         LASSERT (current == kibnal_data.kib_connd);
2506         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2507
2508         if (cv->cv_conndata.status == cm_event_conn_reply) {
2509                 cv->cv_remote_qpn = reply->qpn;
2510                 cv->cv_txpsn      = reply->start_psn;
2511                 // XXX              reply->targ_ack_delay;
2512                 cv->cv_rnr_count  = reply->rnr_retry_count;
2513
2514                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2515
2516                 rc = kibnal_unpack_msg(msg, cm_REP_priv_data_len);
2517                 if (rc != 0) {
2518                         CERROR("Can't unpack reply from "LPX64"\n",
2519                                peer->ibp_nid);
2520                         kibnal_connreq_done(conn, 1, rc);
2521                         return;
2522                 }
2523
2524                 if (msg->ibm_type != IBNAL_MSG_CONNACK) {
2525                         CERROR("Unexpected message type %d from "LPX64"\n",
2526                                msg->ibm_type, peer->ibp_nid);
2527                         kibnal_connreq_done(conn, 1, -EPROTO);
2528                         return;
2529                 }
2530
2531                 if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2532                         CERROR(LPX64" has incompatible queue depth %d(%d wanted)\n",
2533                                peer->ibp_nid, msg->ibm_u.connparams.ibcp_queue_depth,
2534                                IBNAL_MSG_QUEUE_SIZE);
2535                         kibnal_connreq_done(conn, 1, -EPROTO);
2536                         return;
2537                 }
2538                 
2539                 if (msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2540                         CERROR(LPX64" max message size %d too big (%d max)\n",
2541                                peer->ibp_nid, msg->ibm_u.connparams.ibcp_max_msg_size, 
2542                                IBNAL_MSG_SIZE);
2543                         kibnal_connreq_done(conn, 1, -EPROTO);
2544                         return;
2545                 }
2546
2547                 if (msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2548                         CERROR(LPX64" max frags %d too big (%d max)\n",
2549                                peer->ibp_nid, msg->ibm_u.connparams.ibcp_max_frags, 
2550                                IBNAL_MAX_RDMA_FRAGS);
2551                         kibnal_connreq_done(conn, 1, -EPROTO);
2552                         return;
2553                 }
2554                 
2555                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2556                 rc = (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
2557                       msg->ibm_dststamp != kibnal_data.kib_incarnation) ?
2558                      -ESTALE : 0;
2559                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2560                 if (rc != 0) {
2561                         CERROR("Stale connection reply from "LPX64"\n",
2562                                peer->ibp_nid);
2563                         kibnal_connreq_done(conn, 1, rc);
2564                         return;
2565                 }
2566
2567                 conn->ibc_incarnation = msg->ibm_srcstamp;
2568                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2569                 
2570                 rc = kibnal_post_receives(conn);
2571                 if (rc != 0) {
2572                         CERROR("Can't post receives for "LPX64"\n",
2573                                peer->ibp_nid);
2574                         kibnal_connreq_done(conn, 1, rc);
2575                         return;
2576                 }
2577                 
2578                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2579                 if (rc != 0) {
2580                         kibnal_connreq_done(conn, 1, rc);
2581                         return;
2582                 }
2583                 
2584                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2585                 if (rc != 0) {
2586                         kibnal_connreq_done(conn, 1, rc);
2587                         return;
2588                 }
2589                 
2590                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2591                 kibnal_conn_addref(conn);       /* ++for CM callback */
2592                 
2593                 memset(&rtu, 0, sizeof(rtu));
2594                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2595                                  kibnal_cm_callback, conn);
2596                 if (cmrc == cm_stat_success) {
2597                         /* Now I'm racing with disconnect signalled by
2598                          * kibnal_cm_callback */
2599                         kibnal_connreq_done(conn, 1, 0);
2600                         return;
2601                 }
2602
2603                 CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2604                 /* Back out of RTU: no callback coming */
2605                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2606                 kibnal_conn_decref(conn);
2607                 kibnal_connreq_done(conn, 1, -EIO);
2608                 return;
2609         }
2610
2611         if (cv->cv_conndata.status == cm_event_conn_reject) {
2612
2613                 if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
2614                         CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
2615                                cv->cv_conndata.data.reject.reason);
2616                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2617                         return;
2618                 }
2619
2620                 CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);
2621
2622                 cep = cm_create_cep(cm_cep_transp_rc);
2623                 if (cep == NULL) {
2624                         CERROR("Can't create new CEP\n");
2625                         kibnal_connreq_done(conn, 1, -ENOMEM);
2626                         return;
2627                 }
2628
2629                 cmrc = cm_cancel(conn->ibc_cep);
2630                 LASSERT (cmrc == cm_stat_success);
2631                 cmrc = cm_destroy_cep(conn->ibc_cep);
2632                 LASSERT (cmrc == cm_stat_success);
2633
2634                 conn->ibc_cep = cep;
2635
2636                 /* retry connect */
2637                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2638                 kibnal_connect_conn(conn);
2639                 return;
2640         }
2641
2642         CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
2643                cv->cv_conndata.status);
2644         kibnal_connreq_done(conn, 1, -ECONNABORTED);
2645 }
2646
2647 void
2648 kibnal_send_connreq (kib_conn_t *conn)
2649 {
2650         kib_peer_t           *peer = conn->ibc_peer;
2651         kib_connvars_t       *cv = conn->ibc_connvars;
2652         ibat_arp_data_t      *arp = &cv->cv_arp;
2653         ib_path_record_v2_t  *path = &cv->cv_path;
2654         vv_return_t           vvrc;
2655         int                   rc;
2656
2657         /* Only called by connd => statics OK */
2658         LASSERT (!in_interrupt());
2659         LASSERT (current == kibnal_data.kib_connd);
2660         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2661         
2662         if (cv->cv_arprc != ibat_stat_ok) {
2663                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: %d\n", peer->ibp_nid,
2664                        HIPQUAD(peer->ibp_ip), cv->cv_arprc);
2665                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2666                 return;
2667         }
2668
2669         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
2670                 CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
2671
2672                 *path = *arp->primary_path;
2673
2674                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
2675                                          &cv->cv_port);
2676                 LASSERT (vvrc == vv_return_ok);
2677
2678                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2679                                      &path->sgid, &cv->cv_sgid_index);
2680                 LASSERT (vvrc == vv_return_ok);
2681
2682                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2683                                        path->pkey, &cv->cv_pkey_index);
2684                 LASSERT (vvrc == vv_return_ok);
2685
2686                 path->mtu = IBNAL_IB_MTU;
2687
2688         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
2689                 CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
2690                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2691
2692                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
2693                 cv->cv_sgid_index = IBNAL_SGID_IDX;
2694                 cv->cv_port = arp->local_port_num;
2695
2696                 memset(path, 0, sizeof(*path));
2697
2698                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
2699                                          &path->sgid);
2700                 LASSERT (vvrc == vv_return_ok);
2701
2702                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
2703                                          &path->slid);
2704                 LASSERT (vvrc == vv_return_ok);
2705
2706                 path->dgid          = arp->gid;
2707                 path->sl            = IBNAL_SERVICE_LEVEL;
2708                 path->dlid          = arp->lid;
2709                 path->mtu           = IBNAL_IB_MTU;
2710                 path->rate          = IBNAL_STATIC_RATE;
2711                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
2712                 path->pkey          = IBNAL_PKEY;
2713                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
2714         } else {
2715                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n", 
2716                        peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2717                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2718                 return;
2719         }
2720
2721         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2722         if (rc != 0) {
2723                 kibnal_connreq_done(conn, 1, rc);
2724                 return;
2725         }
2726         /* do the actual connection request */
2727         kibnal_connect_conn(conn);
2728 }
2729
2730 void
2731 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
2732 {
2733         /* CAVEAT EMPTOR: tasklet context */
2734         kib_conn_t      *conn = (kib_conn_t *)arg;
2735         kib_peer_t      *peer = conn->ibc_peer;
2736         unsigned long    flags;
2737
2738         CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u rc %d LID %s PATH %s\n",
2739                peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc,
2740                (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
2741                (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
2742         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2743
2744         conn->ibc_connvars->cv_arprc = arprc;
2745         if (arprc == ibat_stat_ok)
2746                 conn->ibc_connvars->cv_arp = *arp_data;
2747         
2748         /* connd takes over my ref on conn */
2749         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2750         
2751         list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2752         wake_up(&kibnal_data.kib_connd_waitq);
2753         
2754         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2755 }
2756
2757 void
2758 kibnal_arp_peer (kib_peer_t *peer)
2759 {
2760         cm_cep_handle_t  cep;
2761         kib_conn_t      *conn;
2762         int              ibatrc;
2763
2764         /* Only the connd does this (i.e. single threaded) */
2765         LASSERT (current == kibnal_data.kib_connd);
2766         LASSERT (peer->ibp_connecting != 0);
2767
2768         cep = cm_create_cep(cm_cep_transp_rc);
2769         if (cep == NULL) {
                CERROR ("Can't create cep for connection to "LPX64"\n",
                        peer->ibp_nid);
2772                 kibnal_peer_connect_failed(peer, 1);
2773                 return;
2774         }
2775
2776         conn = kibnal_create_conn(cep);
2777         if (conn == NULL) {
                CERROR ("Can't allocate conn for connection to "LPX64"\n",
                        peer->ibp_nid);
2780                 cm_destroy_cep(cep);
2781                 kibnal_peer_connect_failed(peer, 1);
2782                 return;
2783         }
2784
2785         conn->ibc_peer = peer;
2786         kibnal_peer_addref(peer);
2787
2788         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2789
2790         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, 
2791                                   ibat_paths_primary,
2792                                   &conn->ibc_connvars->cv_arp, 
2793                                   kibnal_arp_callback, conn, 0);
2794         CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
2795         switch (ibatrc) {
2796         default:
2797                 LBUG();
2798                 
2799         case ibat_stat_pending:
2800                 /* NB callback has my ref on conn */
2801                 break;
2802                 
        case ibat_stat_ok:
                /* Immediate return (ARP cache hit) == no callback; record
                 * the result here since kibnal_arp_callback() won't run */
                conn->ibc_connvars->cv_arprc = ibat_stat_ok;
                kibnal_send_connreq(conn);
                kibnal_conn_decref(conn);
                break;
2808
2809         case ibat_stat_error:
2810         case ibat_stat_timeout:
2811         case ibat_stat_not_found:
2812                 CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", peer->ibp_nid,
2813                        HIPQUAD(peer->ibp_ip), ibatrc);
2814                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2815                 kibnal_conn_decref(conn);
2816                 break;
2817         }
2818 }
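
/* Reference counting for the ARP attempt above, for clarity:
 *
 *   ibat_stat_pending:  kibnal_arp_callback() inherits our ref on conn and
 *                       passes it to the connd via kib_connd_conns;
 *   ibat_stat_ok:       no callback will fire, so we run
 *                       kibnal_send_connreq() and drop our ref ourselves;
 *   error/timeout/
 *   not_found:          we fail the connreq and drop our ref. */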
2819
2820 int
2821 kibnal_conn_timed_out (kib_conn_t *conn)
2822 {
2823         kib_tx_t          *tx;
2824         struct list_head  *ttmp;
2825
2826         spin_lock(&conn->ibc_lock);
2827
2828         list_for_each (ttmp, &conn->ibc_tx_queue) {
2829                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2830
2831                 LASSERT (tx->tx_queued);
2832
2833                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2834                         spin_unlock(&conn->ibc_lock);
2835                         return 1;
2836                 }
2837         }
2838
2839         list_for_each (ttmp, &conn->ibc_active_txs) {
2840                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2841
2842                 LASSERT (!tx->tx_queued);
2843                 LASSERT (tx->tx_waiting ||
2844                          tx->tx_sending != 0);
2845
2846                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2847                         spin_unlock(&conn->ibc_lock);
2848                         return 1;
2849                 }
2850         }
2851
2852         spin_unlock(&conn->ibc_lock);
2853         return 0;
2854 }
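
/* For reference, a minimal sketch of the deadline convention the checks
 * above rely on: each tx is stamped when it is queued, so both the send
 * queue and the active list can be scanned with a simple time_after_eq()
 * test.  The exact stamping site (kibnal_queue_tx_locked() or similar) is
 * an assumption here, for illustration:
 *
 *         tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
 */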
2855
2856 void
2857 kibnal_check_conns (int idx)
2858 {
2859         struct list_head  *peers = &kibnal_data.kib_peers[idx];
2860         struct list_head  *ptmp;
2861         kib_peer_t        *peer;
2862         kib_conn_t        *conn;
2863         struct list_head  *ctmp;
2864         unsigned long      flags;
2865
2866  again:
2867         /* NB. We expect to have a look at all the peers and not find any
2868          * rdmas to time out, so we just use a shared lock while we
2869          * take a look... */
2870         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2871
2872         list_for_each (ptmp, peers) {
2873                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2874
2875                 list_for_each (ctmp, &peer->ibp_conns) {
2876                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
2877
2878                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2879
2880                         /* In case we have enough credits to return via a
2881                          * NOOP, but there were no non-blocking tx descs
2882                          * free to do it last time... */
2883                         kibnal_check_sends(conn);
2884
2885                         if (!kibnal_conn_timed_out(conn))
2886                                 continue;
2887
2888                         /* Handle timeout by closing the whole connection.  We
2889                          * can only be sure RDMA activity has ceased once the
2890                          * QP has been modified. */
2891                         
2892                         kibnal_conn_addref(conn); /* 1 ref for me... */
2893
2894                         read_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2895                                                flags);
2896
2897                         CERROR("Timed out RDMA with "LPX64"\n",
2898                                peer->ibp_nid);
2899
2900                         kibnal_close_conn (conn, -ETIMEDOUT);
2901                         kibnal_conn_decref(conn); /* ...until here */
2902
2903                         /* start again now I've dropped the lock */
2904                         goto again;
2905                 }
2906         }
2907
2908         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2909 }
2910
2911 void
2912 kibnal_disconnect_conn (kib_conn_t *conn)
2913 {
        static cm_drequest_data_t dreq;         /* static just for the space;
                                                 * only connd uses it */
2915         
2916         cm_return_t    cmrc;
2917         unsigned long  flags;
2918
2919         LASSERT (!in_interrupt());
2920         LASSERT (current == kibnal_data.kib_connd);
2921         
2922         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2923
2924         if (conn->ibc_disconnect) {
2925                 /* Had the CM callback already */
2926                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
2927                                         flags);
2928                 kibnal_conn_disconnected(conn);
2929                 return;
2930         }
2931                 
2932         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
2933
2934         /* active disconnect */
2935         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
2936         if (cmrc == cm_stat_success) {
2937                 /* waiting for CM */
2938                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
2939                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2940                 return;
2941         }
2942
2943         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2944
2945         cm_cancel(conn->ibc_cep);
2946         kibnal_pause(HZ/10);
2947
2948         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
2949                 kibnal_conn_decref(conn);
2950         
2951         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
2952         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
2953
2954         kibnal_conn_disconnected(conn);
2955 }
2956
2957 int
2958 kibnal_connd (void *arg)
2959 {
2960         wait_queue_t       wait;
2961         unsigned long      flags;
2962         kib_pcreq_t       *pcr;
2963         kib_conn_t        *conn;
2964         kib_peer_t        *peer;
2965         int                timeout;
2966         int                i;
2967         int                dropped_lock;
2968         int                peer_index = 0;
2969         unsigned long      deadline = jiffies;
2970         
2971         kportal_daemonize ("kibnal_connd");
2972         kportal_blockallsigs ();
2973
2974         init_waitqueue_entry (&wait, current);
2975         kibnal_data.kib_connd = current;
2976
2977         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2978
2979         while (!kibnal_data.kib_shutdown) {
2980
2981                 dropped_lock = 0;
2982
2983                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
2984                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
2985                                            kib_conn_t, ibc_list);
2986                         list_del (&conn->ibc_list);
2987                         
2988                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2989                         dropped_lock = 1;
2990
2991                         kibnal_destroy_conn(conn);
2992
2993                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2994                 }
2995
2996                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
2997                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
2998                                          kib_pcreq_t, pcr_list);
2999                         list_del(&pcr->pcr_list);
3000                         
3001                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3002                         dropped_lock = 1;
3003
3004                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3005                         PORTAL_FREE(pcr, sizeof(*pcr));
3006
3007                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3008                 }
3009                         
3010                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3011                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3012                                            kib_peer_t, ibp_connd_list);
3013                         
3014                         list_del_init (&peer->ibp_connd_list);
3015                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3016                         dropped_lock = 1;
3017
3018                         kibnal_arp_peer (peer);
3019                         kibnal_peer_decref (peer);
3020
3021                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3022                 }
3023
3024                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3025                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3026                                            kib_conn_t, ibc_list);
3027                         list_del (&conn->ibc_list);
3028                         
3029                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3030                         dropped_lock = 1;
3031
3032                         switch (conn->ibc_state) {
3033                         default:
3034                                 LBUG();
3035                                 
3036                         case IBNAL_CONN_ACTIVE_ARP:
3037                                 kibnal_send_connreq(conn);
3038                                 break;
3039
3040                         case IBNAL_CONN_ACTIVE_CONNECT:
3041                                 kibnal_check_connreply(conn);
3042                                 break;
3043
3044                         case IBNAL_CONN_PASSIVE_WAIT:
3045                                 kibnal_check_passive_wait(conn);
3046                                 break;
3047
3048                         case IBNAL_CONN_DISCONNECT1:
3049                         case IBNAL_CONN_DISCONNECT2:
3050                                 kibnal_disconnect_conn(conn);
3051                                 break;
3052                         }
3053                         kibnal_conn_decref(conn);
3054
3055                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3056                 }
3057
3058                 /* careful with the jiffy wrap... */
3059                 timeout = (int)(deadline - jiffies);
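                /* e.g. once jiffies passes deadline, the unsigned difference
                 * wraps to a huge value, but the cast to int turns it into a
                 * small negative number, so the test below still fires. */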
3060                 if (timeout <= 0) {
3061                         const int n = 4;
3062                         const int p = 1;
3063                         int       chunk = kibnal_data.kib_peer_hash_size;
3064                         
3065                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3066                         dropped_lock = 1;
3067
3068                         /* Time to check for RDMA timeouts on a few more
3069                          * peers: I do checks every 'p' seconds on a
3070                          * proportion of the peer table and I need to check
3071                          * every connection 'n' times within a timeout
3072                          * interval, to ensure I detect a timeout on any
3073                          * connection within (n+1)/n times the timeout
3074                          * interval. */
3075
3076                         if (kibnal_tunables.kib_io_timeout > n * p)
3077                                 chunk = (chunk * n * p) / 
3078                                         kibnal_tunables.kib_io_timeout;
3079                         if (chunk == 0)
3080                                 chunk = 1;
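
                        /* Worked example (illustrative numbers only): with
                         * n = 4, p = 1, a 101-bucket peer hash and a 60s
                         * timeout, chunk = 101 * 4 * 1 / 60 = 6, so a full
                         * sweep of the table takes ceil(101/6) = 17
                         * intervals of p seconds -- roughly timeout/n. */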
3081
3082                         for (i = 0; i < chunk; i++) {
3083                                 kibnal_check_conns (peer_index);
3084                                 peer_index = (peer_index + 1) % 
3085                                              kibnal_data.kib_peer_hash_size;
3086                         }
3087
3088                         deadline += p * HZ;
3089                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3090                 }
3091
3092                 if (dropped_lock)
3093                         continue;
3094                 
                /* Nothing to do: wait up to 'timeout' jiffies for more work */
3096                 set_current_state (TASK_INTERRUPTIBLE);
3097                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3098                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3099
3100                 schedule_timeout (timeout);
3101
3102                 set_current_state (TASK_RUNNING);
3103                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3104                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3105         }
3106
3107         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3108
3109         kibnal_thread_fini ();
3110         return (0);
3111 }
3112
3113 void 
3114 kibnal_async_callback(vv_event_record_t ev)
3115 {
3116         CERROR("type: %d, port: %d, data: "LPX64"\n", 
3117                ev.event_type, ev.port_num, ev.type.data);
3118 }
3119
3120 void
3121 kibnal_cq_callback (unsigned long unused_context)
3122 {
3123         unsigned long    flags;
3124
        CDEBUG(D_NET, "CQ completion notification\n");
3126
3127         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3128         kibnal_data.kib_ready = 1;
3129         wake_up(&kibnal_data.kib_sched_waitq);
3130         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3131 }
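
/* The callback above only records that the CQ may need polling and wakes a
 * scheduler: kib_ready flags "there may be completions" while
 * kib_checking_cq ensures exactly one scheduler polls the CQ at a time.
 * Re-arming the notification happens in kibnal_scheduler() below, once a
 * poll finds the CQ empty. */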
3132
3133 int
3134 kibnal_scheduler(void *arg)
3135 {
3136         long            id = (long)arg;
3137         wait_queue_t    wait;
3138         char            name[16];
3139         vv_wc_t         wc;
3140         vv_return_t     vvrc;
3141         vv_return_t     vvrc2;
3142         unsigned long   flags;
3143         kib_rx_t       *rx;
3144         __u64           rxseq = 0;
3145         int             busy_loops = 0;
3146
3147         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3148         kportal_daemonize(name);
3149         kportal_blockallsigs();
3150
3151         init_waitqueue_entry(&wait, current);
3152
3153         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3154
3155         while (!kibnal_data.kib_shutdown) {
3156                 if (busy_loops++ >= IBNAL_RESCHED) {
3157                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3158                                                flags);
3159
3160                         our_cond_resched();
3161                         busy_loops = 0;
3162                         
3163                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3164                 }
3165
3166                 if (kibnal_data.kib_ready &&
3167                     !kibnal_data.kib_checking_cq) {
3168                         /* take ownership of completion polling */
3169                         kibnal_data.kib_checking_cq = 1;
3170                         /* Assume I'll exhaust the CQ */
3171                         kibnal_data.kib_ready = 0;
3172                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, 
3173                                                flags);
3174                         
3175                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca, 
3176                                                       kibnal_data.kib_cq, &wc);
3177                         if (vvrc == vv_return_err_cq_empty) {
3178                                 vvrc2 = vv_request_completion_notification(
3179                                         kibnal_data.kib_hca, 
3180                                         kibnal_data.kib_cq, 
3181                                         vv_next_solicit_unsolicit_event);
3182                                 LASSERT (vvrc2 == vv_return_ok);
3183                         }
3184
3185                         if (vvrc == vv_return_ok &&
3186                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3187                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3188
3189                                 /* Grab the RX sequence number NOW before
3190                                  * anyone else can get an RX completion */
3191                                 rxseq = rx->rx_conn->ibc_rxseq++;
3192                         }
3193
3194                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3195                         /* give up ownership of completion polling */
3196                         kibnal_data.kib_checking_cq = 0;
3197
3198                         if (vvrc == vv_return_err_cq_empty)
3199                                 continue;
3200
3201                         LASSERT (vvrc == vv_return_ok);
3202                         /* Assume there's more: get another scheduler to check
3203                          * while I handle this completion... */
3204
3205                         kibnal_data.kib_ready = 1;
3206                         wake_up(&kibnal_data.kib_sched_waitq);
3207
3208                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3209                                                flags);
3210
3211                         switch (kibnal_wreqid2type(wc.wr_id)) {
3212                         case IBNAL_WID_RX:
3213                                 kibnal_rx_complete(
3214                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3215                                         wc.completion_status,
3216                                         wc.num_bytes_transfered,
3217                                         rxseq);
3218                                 break;
3219
3220                         case IBNAL_WID_TX:
3221                                 kibnal_tx_complete(
3222                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3223                                         wc.completion_status);
3224                                 break;
3225
3226                         case IBNAL_WID_RDMA:
3227                                 /* We only get RDMA completion notification if
3228                                  * it fails.  So we just ignore them completely
3229                                  * because...
3230                                  *
3231                                  * 1) If an RDMA fails, all subsequent work
3232                                  * items, including the final SEND will fail
3233                                  * too, so I'm still guaranteed to notice that
3234                                  * this connection is hosed.
3235                                  *
3236                                  * 2) It's positively dangerous to look inside
3237                                  * the tx descriptor obtained from an RDMA work
3238                                  * item.  As soon as I drop the kib_sched_lock,
3239                                  * I give a scheduler on another CPU a chance
3240                                  * to get the final SEND completion, so the tx
3241                                  * descriptor can get freed as I inspect it. */
3242                                 CERROR ("RDMA failed: %d\n", 
3243                                         wc.completion_status);
3244                                 break;
3245
3246                         default:
3247                                 LBUG();
3248                         }
3249                         
3250                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3251                         continue;
3252                 }
3253
3254                 /* Nothing to do; sleep... */
3255
3256                 set_current_state(TASK_INTERRUPTIBLE);
3257                 add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3258                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3259                                        flags);
3260
3261                 schedule();
3262
3263                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3264                 set_current_state(TASK_RUNNING);
3265                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3266         }
3267
3268         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3269
3270         kibnal_thread_fini();
3271         return (0);
3272 }
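
/* For reference: kibnal_wreqid2type()/kibnal_wreqid2ptr() (from vibnal.h)
 * recover the descriptor and its kind from the work request id the HCA
 * hands back in wc.wr_id.  Below is a minimal sketch of the packing this
 * implies; the 2-bit low tag and the helper name are assumptions for
 * illustration only, relying on tx/rx descriptors being at least 4-byte
 * aligned. */
#if 0
static inline __u64
example_ptr2wreqid (void *ptr, int type)
{
        unsigned long lptr = (unsigned long)ptr;

        LASSERT ((lptr & 3UL) == 0);            /* alignment frees the low bits */
        LASSERT ((type & ~3) == 0);             /* IBNAL_WID_* must fit in 2 bits */
        return (__u64)(lptr | type);            /* pointer | type tag */
}
#endif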
3273
3274
3275 lib_nal_t kibnal_lib = {
3276         .libnal_data = &kibnal_data,      /* NAL private data */
3277         .libnal_send = kibnal_send,
3278         .libnal_send_pages = kibnal_send_pages,
3279         .libnal_recv = kibnal_recv,
3280         .libnal_recv_pages = kibnal_recv_pages,
3281         .libnal_dist = kibnal_dist
3282 };