/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *   Author: Eric Barton <eric@bartonsoftware.com>
 *   Author: Frank Zago <fzago@systemfabricworks.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "vibnal.h"

void
kibnal_tx_done (kib_tx_t *tx)
{
        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
        int              i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */

#if !IBNAL_WHOLE_MEM
        switch (tx->tx_mapped) {
        default:
                LBUG();

        case KIB_TX_UNMAPPED:
                break;

        case KIB_TX_MAPPED: {
                vv_return_t      vvrc;

                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
                                             tx->tx_md.md_handle);
                LASSERT (vvrc == vv_return_ok);
                tx->tx_mapped = KIB_TX_UNMAPPED;
                break;
        }
        }
#endif
        for (i = 0; i < 2; i++) {
                /* tx may have up to 2 libmsgs to finalise */
                if (tx->tx_libmsg[i] == NULL)
                        continue;

                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                tx->tx_libmsg[i] = NULL;
        }

        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);

        if (tx->tx_isnblk) {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
        } else {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
                wake_up (&kibnal_data.kib_idle_tx_waitq);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);
}

kib_tx_t *
kibnal_get_idle_tx (int may_block)
{
        kib_tx_t      *tx = NULL;
        ENTRY;

        for (;;) {
                spin_lock(&kibnal_data.kib_tx_lock);

                /* "normal" descriptor is free */
                if (!list_empty (&kibnal_data.kib_idle_txs)) {
                        tx = list_entry (kibnal_data.kib_idle_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                if (!may_block) {
                        /* may dip into reserve pool */
                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                CERROR ("reserved tx desc pool exhausted\n");
                                break;
                        }

                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                /* block for idle tx */
                spin_unlock(&kibnal_data.kib_tx_lock);

                wait_event (kibnal_data.kib_idle_tx_waitq,
                            !list_empty (&kibnal_data.kib_idle_txs) ||
                            kibnal_data.kib_shutdown);
        }

        if (tx != NULL) {
                list_del (&tx->tx_list);

                /* Allocate a new completion cookie.  It might not be needed,
                 * but we've got a lock right now and we're unlikely to
                 * wrap... */
                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
#if !IBNAL_WHOLE_MEM
                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
#endif
                LASSERT (tx->tx_nwrq == 0);
                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending == 0);
                LASSERT (!tx->tx_waiting);
                LASSERT (tx->tx_status == 0);
                LASSERT (tx->tx_conn == NULL);
                LASSERT (tx->tx_libmsg[0] == NULL);
                LASSERT (tx->tx_libmsg[1] == NULL);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);

        RETURN(tx);
}

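/*
 * Editor's note: a minimal userspace sketch (not part of this file) of the
 * two-pool tx allocator above: callers that may block wait for the "normal"
 * pool, while non-blocking callers may dip into a finite reserve.  All names
 * here (demo_*) are hypothetical.
 */
#if 0
#include <stddef.h>

struct demo_tx { struct demo_tx *next; };

static struct demo_tx *demo_idle;       /* "normal" pool */
static struct demo_tx *demo_idle_nblk;  /* reserve for non-blocking callers */

static struct demo_tx *
demo_get_idle_tx(int may_block)
{
        struct demo_tx *tx;

        /* In the driver this runs under kib_tx_lock and sleeps on
         * kib_idle_tx_waitq; the sketch just shows the selection policy. */
        if (demo_idle != NULL) {
                tx = demo_idle;
                demo_idle = tx->next;
                return tx;                      /* normal descriptor free */
        }
        if (!may_block) {
                if (demo_idle_nblk == NULL)
                        return NULL;            /* reserve exhausted */
                tx = demo_idle_nblk;
                demo_idle_nblk = tx->next;
                return tx;                      /* dip into the reserve */
        }
        /* may_block: the real code sleeps here until a tx is freed */
        return NULL;
}
#endif
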
int
kibnal_post_rx (kib_rx_t *rx, int credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());

        rx->rx_gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(KIBNAL_RX_VADDR(rx)),
                .l_key     = KIBNAL_RX_LKEY(rx),
                .length    = IBNAL_MSG_SIZE,
        };

        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (!rx->rx_posted);

        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }

        rx->rx_posted = 1;

        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);
        spin_unlock(&conn->ibc_lock);

        if (vvrc == vv_return_ok) {
                if (credit) {
                        spin_lock(&conn->ibc_lock);
                        conn->ibc_outstanding_credits++;
                        spin_unlock(&conn->ibc_lock);

                        kibnal_check_sends(conn);
                }
                return 0;
        }

        CERROR ("post rx -> "LPX64" failed %d\n",
                conn->ibc_peer->ibp_nid, vvrc);
        rc = -EIO;
        kibnal_close_conn(rx->rx_conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}

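/*
 * Editor's note: work requests are identified on the completion queue by
 * kibnal_ptr2wreqid(ptr, type), which packs a small type tag (IBNAL_WID_RX,
 * IBNAL_WID_TX, IBNAL_WID_RDMA) into the low, alignment-guaranteed bits of
 * the descriptor pointer.  A standalone sketch of that encoding, with
 * hypothetical names:
 */
#if 0
#include <assert.h>
#include <stdint.h>

enum { DEMO_WID_RX = 0, DEMO_WID_TX = 1, DEMO_WID_RDMA = 2, DEMO_WID_MASK = 3 };

static uint64_t
demo_ptr2wreqid(void *ptr, int type)
{
        uintptr_t lptr = (uintptr_t)ptr;

        assert((lptr & DEMO_WID_MASK) == 0);    /* descriptor is aligned */
        assert((type & ~DEMO_WID_MASK) == 0);   /* tag fits in the low bits */
        return (uint64_t)(lptr | (uintptr_t)type);
}

static void *
demo_wreqid2ptr(uint64_t wreqid)
{
        return (void *)(uintptr_t)(wreqid & ~(uint64_t)DEMO_WID_MASK);
}

static int
demo_wreqid2type(uint64_t wreqid)
{
        return (int)(wreqid & DEMO_WID_MASK);
}
#endif
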
int
kibnal_post_receives (kib_conn_t *conn)
{
        int    i;
        int    rc;

        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (conn->ibc_comms_error == 0);

        for (i = 0; i < IBNAL_RX_MSGS; i++) {
                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                 * fails (i.e. actual failure or we're disconnecting) */
                kibnal_conn_addref(conn);
                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
                if (rc != 0)
                        return rc;
        }

        return 0;
}

kib_tx_t *
kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
        struct list_head   *tmp;

        list_for_each(tmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);

                if (tx->tx_cookie != cookie)
                        continue;

                if (tx->tx_waiting &&
                    tx->tx_msg->ibm_type == txtype)
                        return tx;

                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
                      tx->tx_waiting ? "" : "NOT ",
                      tx->tx_msg->ibm_type, txtype);
        }
        return NULL;
}

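/*
 * Editor's note: each tx carries a unique 64-bit cookie (allocated in
 * kibnal_get_idle_tx), and the peer echoes it back in completion messages so
 * the completion can be matched to the waiting tx.  A standalone sketch of
 * the matching rule, with hypothetical names:
 */
#if 0
#include <stddef.h>
#include <stdint.h>

struct demo_tx {
        struct demo_tx *next;
        uint64_t        cookie;
        int             waiting;        /* still expecting a peer response? */
        int             msg_type;       /* type of the message we sent */
};

static struct demo_tx *
demo_find_waiting_tx(struct demo_tx *active, int txtype, uint64_t cookie)
{
        for (; active != NULL; active = active->next) {
                if (active->cookie != cookie)
                        continue;
                /* only a match if we're still waiting and the completion is
                 * consistent with the type of request we sent */
                if (active->waiting && active->msg_type == txtype)
                        return active;
                return NULL;            /* bad completion */
        }
        return NULL;                    /* unmatched cookie */
}
#endif
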
void
kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
        kib_tx_t    *tx;
        int          idle;

        spin_lock(&conn->ibc_lock);

        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);

                CWARN("Unmatched completion type %x cookie "LPX64
                      " from "LPX64"\n",
                      txtype, cookie, conn->ibc_peer->ibp_nid);
                kibnal_close_conn (conn, -EPROTO);
                return;
        }

        if (tx->tx_status == 0) {               /* success so far */
                if (status < 0) {               /* failed? */
                        tx->tx_status = status;
                } else if (txtype == IBNAL_MSG_GET_REQ) {
                        /* XXX layering violation: set REPLY data length */
                        LASSERT (tx->tx_libmsg[1] != NULL);
                        LASSERT (tx->tx_libmsg[1]->ev.type ==
                                 PTL_EVENT_REPLY_END);

                        tx->tx_libmsg[1]->ev.mlength = status;
                }
        }

        tx->tx_waiting = 0;

        idle = !tx->tx_queued && (tx->tx_sending == 0);
        if (idle)
                list_del(&tx->tx_list);

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done(tx);
}

void
kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
{
        kib_tx_t    *tx = kibnal_get_idle_tx(0);

        if (tx == NULL) {
                CERROR("Can't get tx for completion %x for "LPX64"\n",
                       type, conn->ibc_peer->ibp_nid);
                return;
        }

        tx->tx_msg->ibm_u.completion.ibcm_status = status;
        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));

        kibnal_queue_tx(tx, conn);
}

void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);

        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from "LPX64"\n",
                       msg->ibm_type, conn->ibc_peer->ibp_nid);
                break;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                break;

        case IBNAL_MSG_PUT_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
                if (rx->rx_responded)
                        break;

                /* I wasn't asked to transfer any payload data.  This happens
                 * if the PUT didn't match, or got truncated. */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
                                       msg->ibm_u.putreq.ibprm_cookie);
                break;

        case IBNAL_MSG_PUT_NAK:
                CWARN ("PUT_NAK from "LPX64"\n", conn->ibc_peer->ibp_nid);
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_PUT_ACK:
                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
                               conn->ibc_peer->ibp_nid);
                        kibnal_close_conn(conn, -EPROTO);
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                      &msg->ibm_u.putack.ibpam_rd,
                                      msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc < 0)
                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
                               conn->ibc_peer->ibp_nid, rc);

                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc < 0)
                        tx->tx_status = rc;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;

        case IBNAL_MSG_PUT_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_GET_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
                if (rx->rx_responded)           /* I responded to the GET_REQ */
                        break;
                /* NB GET didn't match (I'd have responded even with no payload
                 * data) */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
                                       msg->ibm_u.get.ibgm_cookie);
                break;

        case IBNAL_MSG_GET_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        kibnal_post_rx(rx, 1);
}

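/*
 * Editor's note: the dispatch above implements two RDMA handshakes.  For a
 * large PUT the initiator sends PUT_REQ; the target replies with PUT_ACK
 * (carrying its RDMA descriptor and cookie) or PUT_NAK; on PUT_ACK the
 * initiator RDMA-writes the payload and finishes with PUT_DONE.  For a large
 * GET the initiator's GET_REQ carries the sink descriptor, and the target
 * RDMA-writes the reply data followed by GET_DONE.  A standalone sketch of
 * the initiator-side PUT transitions, with hypothetical names:
 */
#if 0
/* mirrors IBNAL_MSG_PUT_ACK / IBNAL_MSG_PUT_NAK */
enum demo_msg       { DEMO_PUT_ACK, DEMO_PUT_NAK };
enum demo_put_state { DEMO_IDLE, DEMO_REQ_SENT, DEMO_DATA_SENT };

static enum demo_put_state
demo_put_advance(enum demo_put_state s, enum demo_msg m)
{
        if (s == DEMO_REQ_SENT && m == DEMO_PUT_ACK)
                return DEMO_DATA_SENT;          /* RDMA-write, then PUT_DONE */
        if (s == DEMO_REQ_SENT && m == DEMO_PUT_NAK)
                return DEMO_IDLE;               /* no match; tx completes */
        return s;                               /* anything else: no change */
}
#endif
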
void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_posted);
        rx->rx_posted = 0;

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from "LPX64" failed: %d\n",
                       conn->ibc_peer->ibp_nid, vvrc);
                goto failed;
        }

        rc = kibnal_unpack_msg(msg, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from "LPX64"\n",
                        rc, conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from "LPX64"\n",
                        conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from "LPX64
                        ": got "LPD64" but expected "LPD64"\n",
                        conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
                goto failed;
        }

        /* racing with connection establishment/teardown! */

        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                        flags);
        }
        kibnal_handle_rx(rx);
        return;

 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}

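/*
 * Editor's note: the "early rx" path above is a double-check pattern: the
 * connection state is tested optimistically without the global lock, then
 * re-tested after taking it, so an rx that races with connection
 * establishment is either stashed on ibc_early_rxs or handled, never lost.
 * A minimal pthreads analogue (hypothetical names):
 */
#if 0
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static int demo_established;            /* written under demo_lock */

static void demo_stash_early(void *rx) { (void)rx; /* queue for replay */ }
static void demo_handle(void *rx)      { (void)rx; /* normal processing */ }

static void
demo_rx(void *rx)
{
        if (!demo_established) {        /* optimistic check: may race */
                pthread_mutex_lock(&demo_lock);
                if (!demo_established) {        /* authoritative re-check */
                        demo_stash_early(rx);
                        pthread_mutex_unlock(&demo_lock);
                        return;
                }
                pthread_mutex_unlock(&demo_lock);
        }
        demo_handle(rx);
}
#endif
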
#if IBNAL_WHOLE_MEM
int
kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                     unsigned long page_offset, unsigned long len)
{
        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
        vv_l_key_t       l_key;
        vv_r_key_t       r_key;
        __u64            addr;
        __u64            frag_addr;
        vv_mem_reg_h_t   mem_h;
        vv_return_t      vvrc;

        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
                CERROR ("Too many RDMA fragments\n");
                return -EMSGSIZE;
        }

        /* Try to create an address that adapter-tavor will munge into a valid
         * network address, given how it maps all phys mem into 1 region */
        addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;

        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                    (void *)((unsigned long)addr),
                                    len, &mem_h, &l_key, &r_key);
        LASSERT (vvrc == vv_return_ok);

        if (active) {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = l_key;
                } else if (l_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = addr;
        } else {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = r_key;
                } else if (r_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }

                frag_addr = kibnal_addr2net(addr);
        }

        kibnal_rf_set(frag, frag_addr, len);

        CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
               rd->rd_nfrag, frag->rf_nob, rd->rd_key,
               frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);

        rd->rd_nfrag++;
        return 0;
}

struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END) {
                page = vmalloc_to_page ((void *)vaddr);
                LASSERT (page != NULL);
                return page;
        }
#ifdef CONFIG_HIGHMEM
        if (vaddr >= PKMAP_BASE &&
            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
                /* No highmem kernel addresses expected here: highmem pages
                 * are only used for bulk (kiov) I/O */
                CERROR("Can't find page for highmem address %lx\n", vaddr);
                LBUG();
        }
#endif
        page = virt_to_page (vaddr);
        LASSERT (page != NULL);
        return page;
}

int
kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
                    vv_access_con_bit_mask_t access,
                    int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           fragnob;
        int           rc;
        unsigned long vaddr;
        struct page  *page;
        int           page_offset;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (niov > 0);

                vaddr = ((unsigned long)iov->iov_base) + offset;
                page_offset = vaddr & (PAGE_SIZE - 1);
                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR ("Can't find page\n");
                        return -EFAULT;
                }

                fragnob = min((int)(iov->iov_len - offset), nob);
                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

                rc = kibnal_append_rdfrag(rd, active, page,
                                          page_offset, fragnob);
                if (rc != 0)
                        return rc;

                if (offset + fragnob < iov->iov_len) {
                        offset += fragnob;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}

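/*
 * Editor's note: the loop above is a generic "iov cursor": first skip whole
 * iovecs covered by 'offset', then emit fragments clipped both to the
 * current iovec and to the containing page.  A standalone sketch of the same
 * walk that just counts fragments (hypothetical names; 4K pages assumed):
 */
#if 0
#include <stddef.h>
#include <sys/uio.h>

#define DEMO_PAGE_SIZE 4096

static int
demo_count_frags(int niov, struct iovec *iov, size_t offset, size_t nob)
{
        int nfrag = 0;

        while (offset >= iov->iov_len) {        /* skip to starting iovec */
                offset -= iov->iov_len;
                niov--;
                iov++;
        }

        while (nob > 0) {
                size_t vaddr   = (size_t)iov->iov_base + offset;
                size_t in_page = DEMO_PAGE_SIZE - (vaddr & (DEMO_PAGE_SIZE - 1));
                size_t frag    = iov->iov_len - offset;

                if (frag > nob)
                        frag = nob;
                if (frag > in_page)
                        frag = in_page;         /* clip to page boundary */
                nfrag++;

                if (offset + frag < iov->iov_len) {
                        offset += frag;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= frag;
        }
        return nfrag;
}
#endif
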
int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            fragnob;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (nkiov > 0);
                fragnob = min((int)(kiov->kiov_len - offset), nob);

                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
                                          kiov->kiov_offset + offset,
                                          fragnob);
                if (rc != 0)
                        return rc;

                offset = 0;
                kiov++;
                nkiov--;
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
#else
int
kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                     vv_access_con_bit_mask_t access,
                     int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int         active = ((access & vv_acc_r_mem_write) == 0);
        void       *vaddr;
        vv_return_t vvrc;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);

        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
                                      kibnal_data.kib_pd, access,
                                      &tx->tx_md.md_handle,
                                      &tx->tx_md.md_lkey,
                                      &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
                return -EFAULT;
        }

        tx->tx_mapped = KIB_TX_MAPPED;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

        return (0);
}

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        vv_return_t    vvrc;
        vv_phy_list_t  phys_pages;
        vv_phy_buf_t  *phys;
        int            page_offset;
        int            nphys;
        int            resid;
        int            phys_size;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        phys_size = nkiov * sizeof (*phys);
        PORTAL_ALLOC(phys, phys_size);
        if (phys == NULL) {
                CERROR ("Can't allocate tmp phys\n");
                return (-ENOMEM);
        }

        page_offset = kiov->kiov_offset + offset;

        phys[0].start = kibnal_page2phys(kiov->kiov_page);
        phys[0].size = PAGE_SIZE;

        nphys = 1;
        resid = nob - (kiov->kiov_len - offset);

        while (resid > 0) {
                kiov++;
                nkiov--;
                LASSERT (nkiov > 0);

                if (kiov->kiov_offset != 0 ||
                    ((resid > PAGE_SIZE) &&
                     kiov->kiov_len < PAGE_SIZE)) {
                        int i;
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM: "
                                "page %d, offset %d, len %d\n", nphys,
                                kiov->kiov_offset, kiov->kiov_len);

                        for (i = -nphys; i < nkiov; i++)
                                CERROR("kiov[%d] %p +%d for %d\n",
                                       i, kiov[i].kiov_page,
                                       kiov[i].kiov_offset,
                                       kiov[i].kiov_len);

                        rc = -EINVAL;
                        goto out;
                }

                LASSERT (nphys * sizeof (*phys) < phys_size);
                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
                phys[nphys].size = PAGE_SIZE;

                nphys++;
                resid -= PAGE_SIZE;
        }

#if 0
        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
        for (i = 0; i < nphys; i++)
                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
#endif

        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &phys_pages,
                                          IBNAL_RDMA_BASE,
                                          nphys,
                                          page_offset,
                                          kibnal_data.kib_pd,
                                          access,
                                          &tx->tx_md.md_handle,
                                          &tx->tx_md.md_addr,
                                          &tx->tx_md.md_lkey,
                                          &tx->tx_md.md_rkey);

        if (vvrc != vv_return_ok) {
                CERROR ("Can't map phys: %d\n", vvrc);
                rc = -EFAULT;
                goto out;
        }

        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
               "lkey %x, rkey %x, addr "LPX64"\n",
               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
               tx->tx_md.md_addr);

        tx->tx_mapped = KIB_TX_MAPPED;
        rc = 0;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

 out:
        PORTAL_FREE(phys, phys_size);
        return (rc);
}
#endif

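/*
 * Editor's note: the "can't have gaps" rule above requires that a multi-page
 * kiov maps to one contiguous region: every interior page must start at
 * offset 0 and be full-length, so only the first page may carry a leading
 * offset and only the last may be short.  A standalone validator for that
 * rule (hypothetical names):
 */
#if 0
#include <stddef.h>

struct demo_kiov { size_t offset, len; };       /* page-sized chunks */

static int
demo_kiov_is_contiguous(const struct demo_kiov *k, int n, size_t page_size)
{
        int i;

        for (i = 1; i < n; i++) {
                if (k[i].offset != 0)
                        return 0;               /* gap before fragment i */
                if (i < n - 1 && k[i].len != page_size)
                        return 0;               /* hole after fragment i */
        }
        return 1;
}
#endif
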
kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}

void
kibnal_check_sends (kib_conn_t *conn)
{
        kib_tx_t       *tx;
        vv_return_t     vvrc;
        int             rc;
        int             done;

        /* Don't send anything until after the connection is established */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                CDEBUG(D_NET, LPX64": too soon\n", conn->ibc_peer->ibp_nid);
                return;
        }

        spin_lock(&conn->ibc_lock);

        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

        if (list_empty(&conn->ibc_tx_queue) &&
            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                spin_unlock(&conn->ibc_lock);

                tx = kibnal_get_idle_tx(0);     /* don't block */
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock(&conn->ibc_lock);

                if (tx != NULL)
                        kibnal_queue_tx_locked(tx, conn);
        }

        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);
                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
                        CDEBUG(D_NET, LPX64": posted enough\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 0) {   /* no credits */
                        CDEBUG(D_NET, LPX64": no credits\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
                        CDEBUG(D_NET, LPX64": not using last credit\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                list_del (&tx->tx_list);
                tx->tx_queued = 0;

                /* NB don't drop ibc_lock before bumping tx_sending */

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                        /* redundant NOOP */
                        spin_unlock(&conn->ibc_lock);
                        kibnal_tx_done(tx);
                        spin_lock(&conn->ibc_lock);
                        CDEBUG(D_NET, LPX64": redundant noop\n",
                               conn->ibc_peer->ibp_nid);
                        continue;
                }

                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                conn->ibc_txseq);

                conn->ibc_txseq++;
                conn->ibc_outstanding_credits = 0;
                conn->ibc_nsends_posted++;
                conn->ibc_credits--;

                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
                 * and then re-queued here.  It's (just) possible that
                 * tx_sending is non-zero if we've not done the tx_complete()
                 * from the first send; hence the ++ rather than = below. */
                tx->tx_sending++;

                list_add (&tx->tx_list, &conn->ibc_active_txs);

                /* Keep holding ibc_lock while posting sends on this
                 * connection; vv_post_send() isn't re-entrant on the same
                 * QP!! */

                LASSERT (tx->tx_nwrq > 0);

                rc = -ECONNABORTED;
                vvrc = vv_return_ok;
                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        tx->tx_status = 0;
                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
                                                 conn->ibc_qp,
                                                 tx->tx_nwrq,
                                                 tx->tx_wrq,
                                                 vv_operation_type_send_rc);
                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                }

                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_waiting = 0;
                        tx->tx_sending--;

                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock(&conn->ibc_lock);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to "LPX64"\n",
                                        vvrc, conn->ibc_peer->ibp_nid);
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to "
                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock(&conn->ibc_lock);
}

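/*
 * Editor's note: sends are throttled by a simple credit scheme.  Each
 * re-posted receive buffer earns the peer a credit (ibc_outstanding_credits),
 * credits are piggy-backed on outgoing messages, and the last credit is never
 * spent unless we are also returning credits, so the peer can always send one
 * message that gives our credits back (deadlock avoidance).  When credits
 * pile up with nothing queued, an explicit NOOP returns them.  A standalone
 * sketch of the gating tests (hypothetical names):
 */
#if 0
struct demo_conn {
        int credits;            /* sends the peer will currently accept */
        int outstanding;        /* credits we owe the peer */
        int queue_empty;
};

static int
demo_may_send(const struct demo_conn *c)
{
        if (c->credits == 0)
                return 0;                       /* no credits at all */
        if (c->credits == 1 && c->outstanding == 0)
                return 0;                       /* reserve the last credit */
        return 1;
}

static int
demo_should_send_noop(const struct demo_conn *c, int highwater)
{
        /* return credits explicitly once many have built up idle */
        return c->queue_empty && c->outstanding >= highwater;
}
#endif
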
void
kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
        kib_conn_t   *conn = tx->tx_conn;
        int           failed = (vvrc != vv_comp_status_success);
        int           idle;

        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
               tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);

        LASSERT (tx->tx_sending > 0);

        if (failed &&
            tx->tx_status == 0 &&
            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                CERROR("tx -> "LPX64" type %x cookie "LPX64
                       " sending %d waiting %d: failed %d\n",
                       conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type,
                       tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);

        spin_lock(&conn->ibc_lock);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'. */

        tx->tx_sending--;
        conn->ibc_nsends_posted--;

        if (failed) {
                tx->tx_waiting = 0;
                tx->tx_status = -EIO;
        }

        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               !tx->tx_waiting &&               /* Not waiting for peer */
               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
        if (idle)
                list_del(&tx->tx_list);

        kibnal_conn_addref(conn);               /* 1 ref for me.... */

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done (tx);

        if (failed)
                kibnal_close_conn (conn, -EIO);
        else
                kibnal_check_sends(conn);

        kibnal_conn_decref(conn);               /* ...until here */
}

void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;

        LASSERT (tx->tx_nwrq >= 0 &&
                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        kibnal_init_msg(tx->tx_msg, type, body_nob);

        *gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(KIBNAL_TX_VADDR(tx)),
                .l_key     = KIBNAL_TX_LKEY(tx),
                .length    = nob,
        };

        memset(wrq, 0, sizeof(*wrq));

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
        wrq->wr_type = vv_wr_send;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->completion_notification = 1;
        wrq->type.send.solicited_event = 1;
        wrq->type.send.immidiate_data_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;

        tx->tx_nwrq++;
}

int
kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
        int              resid = nob;
        kib_msg_t       *ibmsg = tx->tx_msg;
        kib_rdma_desc_t *srcrd = tx->tx_rd;
        kib_rdma_frag_t *srcfrag;
        int              srcidx;
        kib_rdma_frag_t *dstfrag;
        int              dstidx;
        vv_scatgat_t    *gl;
        vv_wr_t         *wrq;
        int              wrknob;
        int              rc;

        /* Called by scheduler */
        LASSERT (!in_interrupt());

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        srcidx = dstidx = 0;
        srcfrag = &srcrd->rd_frags[0];
        dstfrag = &dstrd->rd_frags[0];
        rc = resid;

        while (resid > 0) {
                if (srcidx >= srcrd->rd_nfrag) {
                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
                        rc = -EPROTO;
                        break;
                }

                if (dstidx == dstrd->rd_nfrag) {
                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
                        rc = -EPROTO;
                        break;
                }

                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
                               srcidx, srcrd->rd_nfrag,
                               dstidx, dstrd->rd_nfrag);
                        rc = -EMSGSIZE;
                        break;
                }

                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);

                gl = &tx->tx_gl[tx->tx_nwrq];
                gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
                gl->length    = wrknob;
                gl->l_key     = srcrd->rd_key;

                wrq = &tx->tx_wrq[tx->tx_nwrq];

                wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
                wrq->completion_notification = 0;
                wrq->scatgat_list = gl;
                wrq->num_of_data_segments = 1;
                wrq->wr_type = vv_wr_rdma_write;
                wrq->type.send.solicited_event = 0;
                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

                resid -= wrknob;
                if (wrknob < srcfrag->rf_nob) {
                        kibnal_rf_set(srcfrag,
                                      kibnal_rf_addr(srcfrag) + wrknob,
                                      srcfrag->rf_nob - wrknob);
                } else {
                        srcfrag++;
                        srcidx++;
                }

                if (wrknob < dstfrag->rf_nob) {
                        kibnal_rf_set(dstfrag,
                                      kibnal_rf_addr(dstfrag) + wrknob,
                                      dstfrag->rf_nob - wrknob);
                } else {
                        dstfrag++;
                        dstidx++;
                }

                tx->tx_nwrq++;
        }

        if (rc < 0)                             /* no RDMA if completing with failure */
                tx->tx_nwrq = 0;

        ibmsg->ibm_u.completion.ibcm_status = rc;
        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        return rc;
}

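/*
 * Editor's note: the loop above "zips" two fragment lists together: each
 * work request covers min(src frag, dst frag, resid) bytes, and whichever
 * side was only partially consumed is advanced in place.  A standalone
 * sketch of the same two-cursor walk that just prints the chunking
 * (hypothetical names):
 */
#if 0
#include <stdio.h>

struct demo_frag { unsigned long addr; int nob; };

static void
demo_zip(struct demo_frag *src, int nsrc,
         struct demo_frag *dst, int ndst, int resid)
{
        int s = 0, d = 0;

        while (resid > 0 && s < nsrc && d < ndst) {
                int nob = src[s].nob < dst[d].nob ? src[s].nob : dst[d].nob;

                if (nob > resid)
                        nob = resid;
                printf("write %d bytes %#lx -> %#lx\n",
                       nob, src[s].addr, dst[d].addr);

                resid -= nob;
                /* advance whichever fragment(s) this chunk finished */
                if (nob < src[s].nob) { src[s].addr += nob; src[s].nob -= nob; }
                else s++;
                if (nob < dst[d].nob) { dst[d].addr += nob; dst[d].nob -= nob; }
                else d++;
        }
}
#endif
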
void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked (tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}

void
kibnal_schedule_peer_arp (kib_peer_t *peer)
{
        unsigned long flags;

        LASSERT (peer->ibp_connecting != 0);
        LASSERT (peer->ibp_arp_count > 0);

        kibnal_peer_addref(peer); /* extra ref for connd */

        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
        wake_up (&kibnal_data.kib_connd_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}

void
kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
        kib_peer_t      *peer;
        kib_conn_t      *conn;
        unsigned long    flags;
        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;

        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems */

        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */

        read_lock_irqsave(g_lock, flags);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                read_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                read_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...to here */
                return;
        }

        /* Making one or more connections; I'll need a write lock...
         * (NB interrupts stay disabled from the irqsave above until the
         * final irqrestore) */
        read_unlock(g_lock);
        write_lock(g_lock);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                write_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                /* Connection exists; queue message on it */
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                write_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...until here */
                return;
        }

        if (peer->ibp_connecting == 0) {
                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                        write_unlock_irqrestore(g_lock, flags);
                        tx->tx_status = -EHOSTUNREACH;
                        tx->tx_waiting = 0;
                        kibnal_tx_done (tx);
                        return;
                }

                peer->ibp_connecting = 1;
                peer->ibp_arp_count = 1 + IBNAL_ARP_RETRIES;
                kibnal_schedule_peer_arp(peer);
        }

        /* A connection is being established; queue the message... */
        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

        write_unlock_irqrestore(g_lock, flags);
}

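/*
 * Editor's note: kibnal_launch_tx uses a drop-and-retake "upgrade" from the
 * read lock to the write lock (rwlocks can't be upgraded atomically), so
 * every fact learned under the read lock is re-validated under the write
 * lock before acting on it.  A standalone pthreads analogue (hypothetical
 * names):
 */
#if 0
#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t demo_glock = PTHREAD_RWLOCK_INITIALIZER;
static void *demo_conn;                 /* guarded by demo_glock */
static void *demo_create_conn(void) { return (void *)1; }

static void *
demo_get_conn(void)
{
        void *conn;

        pthread_rwlock_rdlock(&demo_glock);
        conn = demo_conn;
        pthread_rwlock_unlock(&demo_glock);
        if (conn != NULL)
                return conn;            /* fast path: read lock only */

        pthread_rwlock_wrlock(&demo_glock);
        if (demo_conn == NULL)          /* re-check: state may have changed */
                demo_conn = demo_create_conn();
        conn = demo_conn;
        pthread_rwlock_unlock(&demo_glock);
        return conn;
}
#endif
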
int
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
        /* I would guess that if kibnal_get_peer (nid) == NULL,
           and we're not routing, then 'nid' is very distant :) */
        if ( nal->libnal_ni.ni_pid.nid == nid ) {
                *dist = 0;
        } else {
                *dist = 1;
        }

        return 0;
}

ptl_err_t
kibnal_sendmsg(lib_nal_t    *nal,
               void         *private,
               lib_msg_t    *libmsg,
               ptl_hdr_t    *hdr,
               int           type,
               ptl_nid_t     nid,
               ptl_pid_t     pid,
               unsigned int  payload_niov,
               struct iovec *payload_iov,
               ptl_kiov_t   *payload_kiov,
               int           payload_offset,
               int           payload_nob)
{
        kib_msg_t  *ibmsg;
        kib_tx_t   *tx;
        int         nob;
        int         rc;
        int         n;

        /* NB 'private' is different depending on what we're sending.... */

        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
               " pid %d\n", payload_nob, payload_niov, nid, pid);

        LASSERT (payload_nob == 0 || payload_niov > 0);
        LASSERT (payload_niov <= PTL_MD_MAX_IOV);

        /* Thread context */
        LASSERT (!in_interrupt());
        /* payload is either all vaddrs or all pages */
        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));

        switch (type) {
        default:
                LBUG();
                return (PTL_FAIL);

        case PTL_MSG_REPLY: {
                /* reply's 'private' is the incoming receive */
                kib_rx_t *rx = private;

                LASSERT(rx != NULL);

                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
                        /* RDMA not expected */
                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                        if (nob > IBNAL_MSG_SIZE) {
                                CERROR("REPLY for "LPX64" too big (RDMA not requested): "
                                       "%d (max for message is %d)\n",
                                       nid, payload_nob, IBNAL_MSG_SIZE);
                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
                                       nob, nid);
                                return PTL_FAIL;
                        }
                        break;
                }

                /* Incoming message consistent with RDMA? */
                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
                               nid, rx->rx_msg->ibm_type);
                        return PTL_FAIL;
                }

                /* NB when I return to rx_complete() from here, it will send a
                 * GET_DONE completion carrying -ENODATA (the GET NAK), unless
                 * I set rx_responded! */

                tx = kibnal_get_idle_tx(0);
                if (tx == NULL) {
                        CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
                        return PTL_FAIL;
                }

                if (payload_nob == 0)
                        rc = 0;
                else if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
                                                 payload_niov, payload_iov,
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
                        CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
                if (rc < 0) {
                        CERROR("Can't setup rdma for GET from "LPX64": %d\n",
                               nid, rc);
                } else if (rc == 0) {
                        /* No RDMA: local completion may happen now! */
                        lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
                } else {
                        /* RDMA: lib_finalize(libmsg) when it completes */
                        tx->tx_libmsg[0] = libmsg;
                }

                kibnal_queue_tx(tx, rx->rx_conn);
                rx->rx_responded = 1;
                return (rc >= 0) ? PTL_OK : PTL_FAIL;
        }

        case PTL_MSG_GET:
                /* will the REPLY message be small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;

                tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
                LASSERT (tx != NULL);

                ibmsg = tx->tx_msg;
                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;

                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                 vv_acc_r_mem_write,
                                                 libmsg->md->md_niov,
                                                 libmsg->md->md_iov.iov,
                                                 0, libmsg->md->length);
                else
                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                  vv_acc_r_mem_write,
                                                  libmsg->md->md_niov,
                                                  libmsg->md->md_iov.kiov,
                                                  0, libmsg->md->length);
                if (rc != 0) {
                        CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
                kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);

                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
                if (tx->tx_libmsg[1] == NULL) {
                        CERROR("Can't create reply for GET -> "LPX64"\n", nid);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
                tx->tx_waiting = 1;             /* waiting for GET_DONE */
                kibnal_launch_tx(tx, nid);
                return PTL_OK;

        case PTL_MSG_ACK:
                LASSERT (payload_nob == 0);
                break;

        case PTL_MSG_PUT:
                /* Is the payload small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;

                tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
                LASSERT (tx != NULL);

                if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
                                                 payload_niov, payload_iov,
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
1510                         CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
1511                         kibnal_tx_done(tx);
1512                         return PTL_FAIL;
1513                 }
1514
1515                 ibmsg = tx->tx_msg;
1516                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1517                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1518                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1519
1520                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1521                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1522                 kibnal_launch_tx(tx, nid);
1523                 return PTL_OK;
1524         }
1525
1526         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1527                  <= IBNAL_MSG_SIZE);
1528
1529         tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
1530                                   type == PTL_MSG_REPLY));
1531         if (tx == NULL) {
1532                 CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
1533                 return PTL_NO_SPACE;
1534         }
1535
1536         ibmsg = tx->tx_msg;
1537         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1538
1539         if (payload_nob > 0) {
1540                 if (payload_kiov != NULL)
1541                         lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1542                                           payload_niov, payload_kiov,
1543                                           payload_offset, payload_nob);
1544                 else
1545                         lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1546                                          payload_niov, payload_iov,
1547                                          payload_offset, payload_nob);
1548         }
1549
1550         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1551         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1552
1553         tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
1554         kibnal_launch_tx(tx, nid);
1555         return PTL_OK;
1556 }
1557
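/* NB (annotation): kibnal_sendmsg() above implements the send-side protocol
 * dispatch.  REPLY and PUT payloads that fit in IBNAL_MSG_SIZE fall through
 * to the IMMEDIATE path at the bottom, which copies the payload inline;
 * bigger PUTs send PUT_REQ and wait for PUT_{ACK,NAK}; bigger GET replies
 * are RDMAed to the sink described in the peer's GET_REQ.  kibnal_send()
 * and kibnal_send_pages() below are thin iovec/kiov wrappers around it. */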
1558 ptl_err_t
1559 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1560                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1561                unsigned int payload_niov, struct iovec *payload_iov,
1562                size_t payload_offset, size_t payload_len)
1563 {
1564         CDEBUG(D_NET, "  pid = %d, nid="LPU64"\n",
1565                pid, nid);
1566         return (kibnal_sendmsg(nal, private, cookie,
1567                                hdr, type, nid, pid,
1568                                payload_niov, payload_iov, NULL,
1569                                payload_offset, payload_len));
1570 }
1571
1572 ptl_err_t
1573 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
1574                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1575                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
1576                      size_t payload_offset, size_t payload_len)
1577 {
1578         return (kibnal_sendmsg(nal, private, cookie,
1579                                hdr, type, nid, pid,
1580                                payload_niov, NULL, payload_kiov,
1581                                payload_offset, payload_len));
1582 }
1583
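/* NB (annotation): kibnal_recvmsg() below is the receive-side counterpart
 * of kibnal_sendmsg().  IMMEDIATE payloads are copied straight out of the
 * rx buffer; a PUT_REQ makes us post a PUT_ACK describing our sink buffer
 * so the peer can RDMA into it; a GET_REQ needs no data movement here
 * because the GET payload travels back in the peer's GET_DONE. */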
1584 ptl_err_t
1585 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
1586                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
1587                  size_t offset, int mlen, int rlen)
1588 {
1589         kib_rx_t    *rx = private;
1590         kib_msg_t   *rxmsg = rx->rx_msg;
1591         kib_conn_t  *conn = rx->rx_conn;
1592         kib_tx_t    *tx;
1593         kib_msg_t   *txmsg;
1594         int          nob;
1595         int          rc;
1596         int          n;
1597         
1598         LASSERT (mlen <= rlen);
1599         LASSERT (mlen >= 0);
1600         LASSERT (!in_interrupt());
1601         /* Either all pages or all vaddrs */
1602         LASSERT (!(kiov != NULL && iov != NULL));
1603
1604         switch (rxmsg->ibm_type) {
1605         default:
1606                 LBUG();
1607                 
1608         case IBNAL_MSG_IMMEDIATE:
1609                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1610                 if (nob > IBNAL_MSG_SIZE) {
1611                         CERROR ("Immediate message from "LPX64" too big: %d\n",
1612                                 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
1613                         return (PTL_FAIL);
1614                 }
1615
1616                 if (kiov != NULL)
1617                         lib_copy_buf2kiov(niov, kiov, offset,
1618                                           rxmsg->ibm_u.immediate.ibim_payload,
1619                                           mlen);
1620                 else
1621                         lib_copy_buf2iov(niov, iov, offset,
1622                                          rxmsg->ibm_u.immediate.ibim_payload,
1623                                          mlen);
1624
1625                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1626                 return (PTL_OK);
1627
1628         case IBNAL_MSG_PUT_REQ:
1629                 /* NB rx_complete() will send PUT_NAK when I return to it from
1630                  * here, unless I set rx_responded!  */
1631
1632                 if (mlen == 0) { /* No payload to RDMA */
1633                         lib_finalize(nal, NULL, libmsg, PTL_OK);
1634                         return PTL_OK;
1635                 }
1636
1637                 tx = kibnal_get_idle_tx(0);
1638                 if (tx == NULL) {
1639                         CERROR("Can't allocate tx for "LPX64"\n",
1640                                conn->ibc_peer->ibp_nid);
1641                         return PTL_FAIL;
1642                 }
1643
1644                 txmsg = tx->tx_msg;
1645                 if (kiov == NULL)
1646                         rc = kibnal_setup_rd_iov(tx, 
1647                                                  &txmsg->ibm_u.putack.ibpam_rd,
1648                                                  vv_acc_r_mem_write,
1649                                                  niov, iov, offset, mlen);
1650                 else
1651                         rc = kibnal_setup_rd_kiov(tx,
1652                                                   &txmsg->ibm_u.putack.ibpam_rd,
1653                                                   vv_acc_r_mem_write,
1654                                                   niov, kiov, offset, mlen);
1655                 if (rc != 0) {
1656                         CERROR("Can't setup PUT sink for "LPX64": %d\n",
1657                                conn->ibc_peer->ibp_nid, rc);
1658                         kibnal_tx_done(tx);
1659                         return PTL_FAIL;
1660                 }
1661
1662                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1663                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1664
1665                 n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1666                 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1667                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1668
1669                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1670                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1671                 kibnal_queue_tx(tx, conn);
1672
1673                 LASSERT (!rx->rx_responded);
1674                 rx->rx_responded = 1;
1675                 return PTL_OK;
1676
1677         case IBNAL_MSG_GET_REQ:
1678                 /* We get called here just to discard any junk after the
1679                  * GET hdr. */
1680                 LASSERT (libmsg == NULL);
1681                 lib_finalize (nal, NULL, libmsg, PTL_OK); /* no-op: libmsg is NULL */
1682                 return (PTL_OK);
1683         }
1684 }
1685
1686 ptl_err_t
1687 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
1688               unsigned int niov, struct iovec *iov, 
1689               size_t offset, size_t mlen, size_t rlen)
1690 {
1691         return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
1692                                 offset, mlen, rlen));
1693 }
1694
1695 ptl_err_t
1696 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
1697                      unsigned int niov, ptl_kiov_t *kiov, 
1698                      size_t offset, size_t mlen, size_t rlen)
1699 {
1700         return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
1701                                 offset, mlen, rlen));
1702 }
1703
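/* NB (annotation): a minimal usage sketch for the two thread helpers below.
 * kibnal_thread_start() wraps kernel_thread() and keeps kib_nthreads up to
 * date so that shutdown code can wait for the count to drain; every thread
 * must call kibnal_thread_fini() on exit to balance it.  'example_thread'
 * and 'example_spawn_and_wait' are illustrative only, not part of the
 * driver. */
#if 0
static int
example_thread (void *arg)
{
        /* ... do work until asked to shut down ... */

        kibnal_thread_fini();           /* balance kibnal_thread_start() */
        return (0);
}

static void
example_spawn_and_wait (void)
{
        int     rc = kibnal_thread_start(example_thread, NULL);

        if (rc != 0)
                CERROR("Can't spawn example thread: %d\n", rc);

        /* poll until all threads have called kibnal_thread_fini() */
        while (atomic_read(&kibnal_data.kib_nthreads) != 0)
                kibnal_pause(HZ);
}
#endif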
1704 int
1705 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1706 {
1707         long    pid = kernel_thread (fn, arg, 0);
1708
1709         if (pid < 0)
1710                 return ((int)pid);
1711
1712         atomic_inc (&kibnal_data.kib_nthreads);
1713         return (0);
1714 }
1715
1716 void
1717 kibnal_thread_fini (void)
1718 {
1719         atomic_dec (&kibnal_data.kib_nthreads);
1720 }
1721
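/* NB (annotation): kibnal_schedule_conn() hands a connection to the connd.
 * The reference taken here belongs to the connd, which must drop it once it
 * has processed the scheduled work; callers may therefore schedule from
 * contexts (e.g. tasklets) that can't block. */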
1722 void
1723 kibnal_schedule_conn (kib_conn_t *conn)
1724 {
1725         unsigned long flags;
1726
1727         kibnal_conn_addref(conn);               /* ++ref for connd */
1728         
1729         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1730
1731         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1732         wake_up (&kibnal_data.kib_connd_waitq);
1733                 
1734         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1735 }
1736
1737 void
1738 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1739 {
1740         /* This just does the immediate housekeeping.  'error' is zero for a
1741          * normal shutdown which can happen only after the connection has been
1742          * established.  If the connection is established, schedule the
1743          * connection to be finished off by the connd.  Otherwise the connd is
1744          * already dealing with it (either to set it up or tear it down).
1745          * Caller holds kib_global_lock exclusively in irq context */
1746         kib_peer_t       *peer = conn->ibc_peer;
1747         struct list_head *tmp;          /* only used by the "#if 0"ed diagnostics below */
1748         
1749         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1750
1751         if (error != 0 && conn->ibc_comms_error == 0)
1752                 conn->ibc_comms_error = error;
1753
1754         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1755                 return; /* already being handled  */
1756         
1757         /* NB Can't take ibc_lock here (could be in IRQ context), without
1758          * risking deadlock, so access to ibc_{tx_queue,active_txs} is racy */
1759
1760         if (error == 0 &&
1761             list_empty(&conn->ibc_tx_queue) &&
1762             list_empty(&conn->ibc_active_txs)) {
1763                 CDEBUG(D_NET, "closing conn to "LPX64
1764                        " rx# "LPD64" tx# "LPD64"\n", 
1765                        peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq);
1766         } else {
1767                 CERROR("Closing conn to "LPX64": error %d%s%s"
1768                        " rx# "LPD64" tx# "LPD64"\n",
1769                        peer->ibp_nid, error,
1770                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1771                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1772                        conn->ibc_txseq, conn->ibc_rxseq);
1773
1774 #if 0
1775                 /* can't skip down the queue without holding ibc_lock (see above) */
1776                 list_for_each(tmp, &conn->ibc_tx_queue) {
1777                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1778                         
1779                         CERROR("   queued tx type %x cookie "LPX64
1780                                " sending %d waiting %d ticks %ld/%d\n", 
1781                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1782                                tx->tx_sending, tx->tx_waiting,
1783                                (long)(tx->tx_deadline - jiffies), HZ);
1784                 }
1785
1786                 list_for_each(tmp, &conn->ibc_active_txs) {
1787                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1788                         
1789                         CERROR("   active tx type %x cookie "LPX64
1790                                " sending %d waiting %d ticks %ld/%d\n", 
1791                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1792                                tx->tx_sending, tx->tx_waiting,
1793                                (long)(tx->tx_deadline - jiffies), HZ);
1794                 }
1795 #endif
1796         }
1797
1798         list_del (&conn->ibc_list);
1799         
1800         if (list_empty (&peer->ibp_conns) &&    /* no more conns */
1801             peer->ibp_persistence == 0 &&       /* non-persistent peer */
1802             kibnal_peer_active(peer)) {         /* still in peer table */
1803                 kibnal_unlink_peer_locked (peer);
1804         }
1805
1806         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1807
1808         kibnal_schedule_conn(conn);
1809         kibnal_conn_decref(conn);               /* lose ibc_list's ref */
1810 }
1811
1812 void
1813 kibnal_close_conn (kib_conn_t *conn, int error)
1814 {
1815         unsigned long flags;
1816         
1817         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1818
1819         kibnal_close_conn_locked (conn, error);
1820         
1821         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1822 }
1823
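/* NB (annotation): "early" rxs are messages that arrived before the
 * connection had reached the ESTABLISHED state.  The global lock is dropped
 * around each kibnal_handle_rx() call and re-taken to pick up the next rx,
 * since handling an rx may block. */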
1824 void
1825 kibnal_handle_early_rxs(kib_conn_t *conn)
1826 {
1827         unsigned long    flags;
1828         kib_rx_t        *rx;
1829
1830         LASSERT (!in_interrupt());
1831         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1832         
1833         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1834         while (!list_empty(&conn->ibc_early_rxs)) {
1835                 rx = list_entry(conn->ibc_early_rxs.next,
1836                                 kib_rx_t, rx_list);
1837                 list_del(&rx->rx_list);
1838                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1839                 
1840                 kibnal_handle_rx(rx);
1841                 
1842                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1843         }
1844         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1845 }
1846
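/* NB (annotation): with the QP in the error state, all posted work requests
 * complete (flushed with error), so txs that still have sends outstanding
 * (tx_sending != 0) are left on their lists below; the send-completion path
 * finishes them off when the flush completes.  Only txs with no completion
 * pending can be zombified and completed here and now. */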
1847 void
1848 kibnal_conn_disconnected(kib_conn_t *conn)
1849 {
1850         LIST_HEAD        (zombies); 
1851         struct list_head *tmp;
1852         struct list_head *nxt;
1853         kib_tx_t         *tx;
1854
1855         /* I'm the connd */
1856         LASSERT (!in_interrupt());
1857         LASSERT (current == kibnal_data.kib_connd);
1858         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
1859         
1860         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
1861
1862         /* move QP to error state to make posted work items complete */
1863         kibnal_set_qp_state(conn, vv_qp_state_error);
1864
1865         spin_lock(&conn->ibc_lock);
1866
1867         /* Complete all tx descs not waiting for sends to complete.
1868          * NB we should be safe from RDMA now that the QP has changed state */
1869
1870         list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
1871                 tx = list_entry (tmp, kib_tx_t, tx_list);
1872
1873                 LASSERT (tx->tx_queued);
1874
1875                 tx->tx_status = -ECONNABORTED;
1876                 tx->tx_queued = 0;
1877                 tx->tx_waiting = 0;
1878                 
1879                 if (tx->tx_sending != 0)
1880                         continue;
1881
1882                 list_del (&tx->tx_list);
1883                 list_add (&tx->tx_list, &zombies);
1884         }
1885
1886         list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
1887                 tx = list_entry (tmp, kib_tx_t, tx_list);
1888
1889                 LASSERT (!tx->tx_queued);
1890                 LASSERT (tx->tx_waiting ||
1891                          tx->tx_sending != 0);
1892
1893                 tx->tx_status = -ECONNABORTED;
1894                 tx->tx_waiting = 0;
1895                 
1896                 if (tx->tx_sending != 0)
1897                         continue;
1898
1899                 list_del (&tx->tx_list);
1900                 list_add (&tx->tx_list, &zombies);
1901         }
1902         
1903         spin_unlock(&conn->ibc_lock);
1904
1905         while (!list_empty(&zombies)) {
1906                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1907
1908                 list_del(&tx->tx_list);
1909                 kibnal_tx_done (tx);
1910         }
1911
1912         kibnal_handle_early_rxs(conn);
1913 }
1914
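/* NB (annotation): on a failed connection attempt the reconnect interval
 * doubles, capped at IBNAL_MAX_RECONNECT_INTERVAL (simple exponential
 * backoff), and any transmits that queued on the peer while the attempt was
 * in flight are completed with -EHOSTUNREACH. */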
1915 void
1916 kibnal_peer_connect_failed (kib_peer_t *peer, int active)
1917 {
1918         LIST_HEAD        (zombies);     /* initialised: tested below even when no txs are stolen */
1919         kib_tx_t         *tx;
1920         unsigned long     flags;
1921
1922         /* Only the connd creates conns => single threaded */
1923         LASSERT (!in_interrupt());
1924         LASSERT (current == kibnal_data.kib_connd);
1925         LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
1926
1927         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1928
1929         if (active) {
1930                 LASSERT (peer->ibp_connecting != 0);
1931                 peer->ibp_connecting--;
1932         } else {
1933                 LASSERT (!kibnal_peer_active(peer));
1934         }
1935         
1936         if (peer->ibp_connecting != 0) {
1937                 /* another connection attempt under way (loopback?)... */
1938                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1939                 return;
1940         }
1941
1942         if (list_empty(&peer->ibp_conns)) {
1943                 /* Say when active connection can be re-attempted */
1944                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
1945                 /* Increase reconnection interval */
1946                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
1947                                                     IBNAL_MAX_RECONNECT_INTERVAL);
1948         
1949                 /* Take peer's blocked transmits to complete with error
1949                  * (list-steal idiom; see the note after this function) */
1950                 list_add(&zombies, &peer->ibp_tx_queue);
1951                 list_del_init(&peer->ibp_tx_queue);
1952                 
1953                 if (kibnal_peer_active(peer) &&
1954                     (peer->ibp_persistence == 0)) {
1955                         /* failed connection attempt on non-persistent peer */
1956                         kibnal_unlink_peer_locked (peer);
1957                 }
1958         } else {
1959                 /* Can't have blocked transmits if there are connections */
1960                 LASSERT (list_empty(&peer->ibp_tx_queue));
1961         }
1962         
1963         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1964
1965         if (list_empty (&zombies)) 
1966                 return;
1967         
1968         CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
1969         do {
1970                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1971
1972                 list_del (&tx->tx_list);
1973                 /* complete now */
1974                 tx->tx_status = -EHOSTUNREACH;
1975                 kibnal_tx_done (tx);
1976         } while (!list_empty (&zombies));
1977 }
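/* NB (annotation): the list_add()/list_del_init() pair above is the
 * standard "steal the whole list" idiom, used again in kibnal_connreq_done()
 * below: it splices everything on ibp_tx_queue onto the local list head and
 * leaves ibp_tx_queue empty, all under the lock.  An illustrative sketch
 * (not part of the driver):
 *
 *      LIST_HEAD(zombies);
 *      list_add(&zombies, &peer->ibp_tx_queue);  // insert local head
 *      list_del_init(&peer->ibp_tx_queue);       // unlink the old head
 *      // 'zombies' now heads the stolen txs; ibp_tx_queue is empty
 */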
1978
1979 void
1980 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
1981 {
1982         static cm_reject_data_t   rej;
1983
1984         struct list_head   txs;
1985         kib_peer_t        *peer = conn->ibc_peer;
1986         kib_peer_t        *peer2;
1987         unsigned long      flags;
1988         kib_tx_t          *tx;
1989
1990         /* Only the connd creates conns => single threaded */
1991         LASSERT (!in_interrupt());
1992         LASSERT (current == kibnal_data.kib_connd);
1993         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
1994
1995         if (active) {
1996                 LASSERT (peer->ibp_connecting > 0);
1997         } else {
1998                 LASSERT (!kibnal_peer_active(peer));
1999         }
2000         
2001         PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2002         conn->ibc_connvars = NULL;
2003
2004         if (status != 0) {
2005                 /* failed to establish connection */
2006                 switch (conn->ibc_state) {
2007                 default:
2008                         LBUG();
2009
2010                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2011                         /* got a connection reply but failed checks */
2012                         LASSERT (active);
2013                         memset(&rej, 0, sizeof(rej));
2014                         rej.reason = cm_rej_code_usr_rej;
2015                         cm_reject(conn->ibc_cep, &rej);
2016                         break;
2017
2018                 case IBNAL_CONN_ACTIVE_CONNECT:
2019                         LASSERT (active);
2020                         cm_cancel(conn->ibc_cep);
2021                         kibnal_pause(HZ/10);
2022                         /* cm_connect() failed immediately or
2023                          * callback returned failure */
2024                         break;
2025
2026                 case IBNAL_CONN_ACTIVE_ARP:
2027                         LASSERT (active);
2028                         /* ibat_get_ib_data() failed immediately 
2029                          * or callback returned failure */
2030                         break;
2031
2032                 case IBNAL_CONN_INIT:
2033                         break;
2034
2035                 case IBNAL_CONN_PASSIVE_WAIT:
2036                         LASSERT (!active);
2037                         /* cm_accept callback returned failure */
2038                         break;
2039                 }
2040
2041                 kibnal_peer_connect_failed(conn->ibc_peer, active);
2042                 kibnal_conn_disconnected(conn);
2043                 return;
2044         }
2045
2046         /* connection established */
2047         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2048
2049         if (active) {
2050                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2051         } else {
2052                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2053         }
2054         
2055         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2056
2057         if (!active) {
2058                 peer2 = kibnal_find_peer_locked(peer->ibp_nid);
2059                 if (peer2 != NULL) {
2060                         /* already in the peer table; swap */
2061                         conn->ibc_peer = peer2;
2062                         kibnal_peer_addref(peer2);
2063                         kibnal_peer_decref(peer);
2064                         peer = conn->ibc_peer;
2065                 } else {
2066                         /* add 'peer' to the peer table */
2067                         kibnal_peer_addref(peer);
2068                         list_add_tail(&peer->ibp_list,
2069                                       kibnal_nid2peerlist(peer->ibp_nid));
2070                 }
2071         }
2072         
2073         /* Add conn to peer's list and nuke any dangling conns from a different
2074          * peer instance... */
2075         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2076         list_add(&conn->ibc_list, &peer->ibp_conns);
2077         kibnal_close_stale_conns_locked (conn->ibc_peer,
2078                                          conn->ibc_incarnation);
2079
2080         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2081             conn->ibc_comms_error != 0 ||       /* comms error */
2082             conn->ibc_disconnect) {             /* need to disconnect */
2083                 
2084                 /* start to shut down connection */
2085                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2086
2087                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2088                 kibnal_peer_connect_failed(peer, active);
2089                 return;
2090         }
2091
2092         if (active)
2093                 peer->ibp_connecting--;
2094
2095         /* grab pending txs while I have the lock */
2096         list_add(&txs, &peer->ibp_tx_queue);
2097         list_del_init(&peer->ibp_tx_queue);
2098         
2099         /* reset reconnect interval for next attempt */
2100         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
2101         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2102
2103         /* Schedule blocked txs */
2104         spin_lock (&conn->ibc_lock);
2105         while (!list_empty (&txs)) {
2106                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2107                 list_del (&tx->tx_list);
2108
2109                 kibnal_queue_tx_locked (tx, conn);
2110         }
2111         spin_unlock (&conn->ibc_lock);
2112         kibnal_check_sends (conn);
2113
2114         /* schedule blocked rxs */
2115         kibnal_handle_early_rxs(conn);
2116 }
2117
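/* NB (annotation): CM callback dispatch.  What a disconnect request means
 * depends on where the connection is in its life cycle; in every case the
 * real work is deferred to the connd via kibnal_schedule_conn(), since this
 * runs in tasklet context and may not block. */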
2118 void
2119 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2120 {
2121         static cm_dreply_data_t drep;           /* just zeroed space */
2122         
2123         kib_conn_t             *conn = (kib_conn_t *)arg;
2124         unsigned long           flags;
2125         
2126         /* CAVEAT EMPTOR: tasklet context */
2127
2128         switch (cmdata->status) {
2129         default:
2130                 LBUG();
2131                 
2132         case cm_event_disconn_request:
2133                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2134                  * IBNAL_CONN_ESTABLISHED: I start it closing
2135                  * otherwise:              it's closing anyway */
2136                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2137                 cm_cancel(conn->ibc_cep);
2138
2139                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2140                 LASSERT (!conn->ibc_disconnect);
2141                 conn->ibc_disconnect = 1;
2142
2143                 switch (conn->ibc_state) {
2144                 default:
2145                         LBUG();
2146
2147                 case IBNAL_CONN_ACTIVE_RTU:
2148                         /* kibnal_connreq_done is getting there; it'll see
2149                          * ibc_disconnect set... */
2150                         break;
2151
2152                 case IBNAL_CONN_ESTABLISHED:
2153                         /* kibnal_connreq_done got there already; get
2154                          * disconnect going... */
2155                         kibnal_close_conn_locked(conn, 0);
2156                         break;
2157
2158                 case IBNAL_CONN_DISCONNECT1:
2159                         /* kibnal_terminate_conn is getting there; it'll see
2160                          * ibc_disconnect set... */
2161                         break;
2162
2163                 case IBNAL_CONN_DISCONNECT2:
2164                         /* kibnal_terminate_conn got there already; complete
2165                          * the disconnect. */
2166                         kibnal_schedule_conn(conn);
2167                         break;
2168                 }
2169                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2170                 break;
2171                 
2172         case cm_event_disconn_timeout:
2173         case cm_event_disconn_reply:
2174                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2175                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2176                 LASSERT (!conn->ibc_disconnect);
2177                 conn->ibc_disconnect = 1;
2178
2179                 /* kibnal_terminate_conn sent the disconnect request. */
2180                 kibnal_schedule_conn(conn);
2181
2182                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2183                 break;
2184                 
2185         case cm_event_connected:
2186         case cm_event_conn_timeout:
2187         case cm_event_conn_reject:
2188                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2189                 conn->ibc_connvars->cv_conndata = *cmdata;
2190
2191                 kibnal_schedule_conn(conn);
2192                 break;
2193         }
2194
2195         kibnal_conn_decref(conn); /* lose my ref */
2196 }
2197
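/* NB (annotation): connd-side completion of a passive accept.  On
 * cm_event_connected the QP is moved to RTS and the connection is declared
 * established; an RTS failure is recorded as a comms error so the freshly
 * established connection is torn down again immediately. */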
2198 void
2199 kibnal_check_passive_wait(kib_conn_t *conn)
2200 {
2201         int     rc;
2202
2203         switch (conn->ibc_connvars->cv_conndata.status) {
2204         default:
2205                 LBUG();
2206                 
2207         case cm_event_connected:
2208                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2209                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2210                 if (rc != 0)
2211                         conn->ibc_comms_error = rc;
2212                 /* connection _has_ been established; it's just that we've had
2213                  * an error immediately... */
2214                 kibnal_connreq_done(conn, 0, 0);
2215                 break;
2216                 
2217         case cm_event_conn_timeout:
2218                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2219                 break;
2220                 
2221         case cm_event_conn_reject:
2222                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2223                 break;
2224         }
2225 }
2226
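/* NB (annotation): passive connection establishment.  The sequence is:
 * validate the CONNREQ (service id, NID, queue depth, message size, frag
 * count), create a conn and a placeholder peer, take the QP through INIT
 * and RTR, post receives, then cm_accept() with a CONNACK carried in the
 * reply's private data.  Every failure funnels through 'reject' below. */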
2227 void
2228 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2229 {
2230         static kib_msg_t        txmsg;
2231         static kib_msg_t        rxmsg;
2232         static cm_reply_data_t  reply;
2233         static cm_reject_data_t reject;
2234
2235         kib_conn_t         *conn = NULL;
2236         int                 rc = 0;
2237         int                 rxmsgnob;
2238         kib_connvars_t     *cv;
2239         kib_peer_t         *tmp_peer;
2240         cm_return_t         cmrc;
2241         vv_return_t         vvrc;
2242         
2243         /* I'm the connd executing in thread context
2244          * No concurrency problems with static data! */
2245         LASSERT (!in_interrupt());
2246         LASSERT (current == kibnal_data.kib_connd);
2247
2248         if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
2249                 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2250                        cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
2251                 goto reject;
2252         }
2253
2254         /* copy into rxmsg to avoid alignment issues */
2255         rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2256         memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2257
2258         rc = kibnal_unpack_msg(&rxmsg, rxmsgnob);
2259         if (rc != 0) {
2260                 CERROR("Can't parse connection request: %d\n", rc);
2261                 goto reject;
2262         }
2263
2264         if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2265                 CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
2266                        rxmsg.ibm_type, rxmsg.ibm_srcnid);
2267                 goto reject;
2268         }
2269
2270         if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
2271                 CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
2272                        rxmsg.ibm_srcnid, rxmsg.ibm_dstnid);
2273                 goto reject;
2274         }
2275
2276         if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2277                 CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
2278                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth, 
2279                        IBNAL_MSG_QUEUE_SIZE);
2280                 goto reject;
2281         }
2282
2283         if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2284                 CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
2285                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
2286                        IBNAL_MSG_SIZE);
2287                 goto reject;
2288         }
2289                 
2290         if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2291                 CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
2292                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags, 
2293                        IBNAL_MAX_RDMA_FRAGS);
2294                 goto reject;
2295         }
2296                 
2297         conn = kibnal_create_conn(cep);
2298         if (conn == NULL) {
2299                 CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid);
2300                 goto reject;
2301         }
2302         
2303         /* assume 'rxmsg.ibm_srcnid' is a new peer */
2304         tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
2305         if (tmp_peer == NULL) {
2306                 CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
2307                 kibnal_conn_decref(conn);
2308                 conn = NULL;
2309                 goto reject;
2310         }
2311
2312         conn->ibc_peer = tmp_peer;              /* conn takes over my ref */
2313         conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2314         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2315
2316         cv = conn->ibc_connvars;
2317
2318         cv->cv_txpsn          = cmreq->cep_data.start_psn;
2319         cv->cv_remote_qpn     = cmreq->cep_data.qpn;
2320         cv->cv_path           = cmreq->path_data.path;
2321         cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
2322         // XXX                  cmreq->cep_data.retry_cnt;
2323         cv->cv_port           = cmreq->cep_data.local_port_num;
2324
2325         vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2326                              &cv->cv_path.sgid, &cv->cv_sgid_index);
2327         LASSERT (vvrc == vv_return_ok);
2328         
2329         vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2330                                cv->cv_path.pkey, &cv->cv_pkey_index);
2331         LASSERT (vvrc == vv_return_ok);
2332
2333         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2334         if (rc != 0)
2335                 goto reject;
2336
2337         rc = kibnal_post_receives(conn);
2338         if (rc != 0) {
2339                 CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid);
2340                 goto reject;
2341         }
2342
2343         rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2344         if (rc != 0)
2345                 goto reject;
2346         
2347         memset(&reply, 0, sizeof(reply));
2348         reply.qpn                 = cv->cv_local_qpn;
2349         reply.qkey                = IBNAL_QKEY;
2350         reply.start_psn           = cv->cv_rxpsn;
2351         reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2352         reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
2353         reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
2354         reply.rnr_retry_count     = cv->cv_rnr_count;
2355         reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
2356         
2357         /* setup txmsg... */
2358         memset(&txmsg, 0, sizeof(txmsg));
2359         kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, 
2360                         sizeof(txmsg.ibm_u.connparams));
2361         LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2362         txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2363         txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2364         txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2365         kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2366
2367         /* ...and copy into reply to avoid alignment issues */
2368         memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
2369
2370         kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2371         
2372         cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2373                          kibnal_cm_callback, conn);
2374
2375         if (cmrc == cm_stat_success)
2376                 return;                         /* callback has got my ref on conn */
2377
2378         /* back out state change (no callback happening) */
2379         kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2380         rc = -EIO;
2381                 
2382  reject:
2383         CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid);
2384
2385         memset(&reject, 0, sizeof(reject));
2386         reject.reason = cm_rej_code_usr_rej;
2387         cm_reject(cep, &reject);
2388
2389         if (conn != NULL) {
2390                 LASSERT (rc != 0);
2391                 kibnal_connreq_done(conn, 0, rc);
2392         } else {
2393                 cm_destroy_cep(cep);
2394         }
2395 }
2396
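/* NB (annotation): the listener callback runs in a context that can't
 * process a connection request itself, so it copies the request into a
 * kib_pcreq_t and queues that for the connd, which presumably feeds it to
 * kibnal_recv_connreq() above. */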
2397 void
2398 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2399 {
2400         cm_request_data_t  *cmreq = &data->data.request;
2401         kib_pcreq_t        *pcr;
2402         unsigned long       flags;
2403         
2404         LASSERT (arg == NULL);
2405
2406         if (data->status != cm_event_conn_request) {
2407                 CERROR("status %d is not cm_event_conn_request\n",
2408                        data->status);
2409                 return;
2410         }
2411
2412         PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2413         if (pcr == NULL) {
2414                 CERROR("Can't allocate passive connreq\n");
2415
2416                 cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */
2417                                  {.reason = cm_rej_code_no_res,}));
2418                 cm_destroy_cep(cep);
2419                 return;
2420         }
2421
2422         pcr->pcr_cep = cep;
2423         pcr->pcr_cmreq = *cmreq;
2424         
2425         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2426
2427         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2428         wake_up(&kibnal_data.kib_connd_waitq);
2429         
2430         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2431 }
2432
2433
2434 void
2435 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, 
2436                                 void *arg)
2437 {
2438         /* CAVEAT EMPTOR: tasklet context */
2439         kib_conn_t       *conn = (kib_conn_t *)arg;
2440         kib_connvars_t   *cv = conn->ibc_connvars;
2441
2442
2443         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2444         cv->cv_conndata = *cd;
2445
2446         kibnal_schedule_conn(conn);
2447         kibnal_conn_decref(conn);
2448 }
2449
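/* NB (annotation): active connect.  The CONNREQ message rides in the CM
 * request's private data; compare kibnal_recv_connreq() above, which
 * unpacks it on the passive side.  The conn reference taken for the CM
 * callback is dropped again if cm_connect() fails synchronously. */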
2450 void
2451 kibnal_connect_conn (kib_conn_t *conn)
2452 {
2453         static cm_request_data_t  cmreq;
2454         static kib_msg_t          msg;
2455         
2456         kib_connvars_t           *cv = conn->ibc_connvars;
2457         kib_peer_t               *peer = conn->ibc_peer;
2458         cm_return_t               cmrc;
2459         
2460         /* Only called by connd => statics OK */
2461         LASSERT (!in_interrupt());
2462         LASSERT (current == kibnal_data.kib_connd);
2463         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2464
2465         memset(&cmreq, 0, sizeof(cmreq));
2466         
2467         cmreq.sid = IBNAL_SERVICE_NUMBER;
2468
2469         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2470         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2471         cmreq.cep_data.retry_cnt            = IBNAL_RETRY_CNT;
2472         cmreq.cep_data.rtr_retry_cnt        = IBNAL_RNR_CNT;
2473         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2474         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2475         // XXX ack_timeout?
2476         // offered_resp_res
2477         // offered_initiator_depth
2478
2479         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2480         cmreq.path_data.path        = cv->cv_path;
2481         
2482         /* setup msg... */
2483         memset(&msg, 0, sizeof(msg));
2484         kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2485         LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2486         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2487         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2488         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2489         kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0);
2490
2491         /* ...and copy into cmreq to avoid alignment issues */
2492         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2493         
2494         CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
2495
2496         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2497         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2498
2499         cmrc = cm_connect(conn->ibc_cep, &cmreq, 
2500                           kibnal_active_connect_callback, conn);
2501         if (cmrc == cm_stat_success) {
2502                 CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
2503                        peer->ibp_nid);
2504                 return;
2505         }
2506
2507         CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2508         kibnal_conn_decref(conn);       /* drop callback's ref */
2509         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2510 }
2511
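/* NB (annotation): handles the outcome of cm_connect().  A connection
 * reply is validated much like an incoming CONNREQ (message type, queue
 * depth, message size, frags, incarnation stamps); a stale-connection
 * rejection triggers a retry on a fresh CEP, since the peer may hold state
 * from an earlier incarnation of this connection. */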
2512 void
2513 kibnal_check_connreply (kib_conn_t *conn)
2514 {
2515         static cm_rtu_data_t  rtu;
2516         static kib_msg_t      msg;
2517
2518         kib_connvars_t   *cv = conn->ibc_connvars;
2519         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2520         kib_peer_t       *peer = conn->ibc_peer;
2521         int               msgnob;
2522         cm_return_t       cmrc;
2523         cm_cep_handle_t   cep;
2524         unsigned long     flags;
2525         int               rc;
2526
2527         /* Only called by connd => statics OK */
2528         LASSERT (!in_interrupt());
2529         LASSERT (current == kibnal_data.kib_connd);
2530         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2531
2532         if (cv->cv_conndata.status == cm_event_conn_reply) {
2533                 cv->cv_remote_qpn = reply->qpn;
2534                 cv->cv_txpsn      = reply->start_psn;
2535                 // XXX              reply->targ_ack_delay;
2536                 cv->cv_rnr_count  = reply->rnr_retry_count;
2537
2538                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2539
2540                 /* copy into msg to avoid alignment issues */
2541                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2542                 memcpy(&msg, &reply->priv_data, msgnob);
2543
2544                 rc = kibnal_unpack_msg(&msg, msgnob);
2545                 if (rc != 0) {
2546                         CERROR("Can't unpack reply from "LPX64"\n",
2547                                peer->ibp_nid);
2548                         kibnal_connreq_done(conn, 1, rc);
2549                         return;
2550                 }
2551
2552                 if (msg.ibm_type != IBNAL_MSG_CONNACK) {
2553                         CERROR("Unexpected message type %d from "LPX64"\n",
2554                                msg.ibm_type, peer->ibp_nid);
2555                         kibnal_connreq_done(conn, 1, -EPROTO);
2556                         return;
2557                 }
2558
2559                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2560                         CERROR(LPX64" has incompatible queue depth %d (%d wanted)\n",
2561                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth,
2562                                IBNAL_MSG_QUEUE_SIZE);
2563                         kibnal_connreq_done(conn, 1, -EPROTO);
2564                         return;
2565                 }
2566                 
2567                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2568                         CERROR(LPX64" max message size %d too big (%d max)\n",
2569                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size, 
2570                                IBNAL_MSG_SIZE);
2571                         kibnal_connreq_done(conn, 1, -EPROTO);
2572                         return;
2573                 }
2574
2575                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2576                         CERROR(LPX64" max frags %d too big (%d max)\n",
2577                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags, 
2578                                IBNAL_MAX_RDMA_FRAGS);
2579                         kibnal_connreq_done(conn, 1, -EPROTO);
2580                         return;
2581                 }
2582                 
2583                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2584                 rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
2585                       msg.ibm_dststamp != kibnal_data.kib_incarnation) ?
2586                      -ESTALE : 0;
2587                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2588                 if (rc != 0) {
2589                         CERROR("Stale connection reply from "LPX64"\n",
2590                                peer->ibp_nid);
2591                         kibnal_connreq_done(conn, 1, rc);
2592                         return;
2593                 }
2594
2595                 conn->ibc_incarnation = msg.ibm_srcstamp;
2596                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2597                 
2598                 rc = kibnal_post_receives(conn);
2599                 if (rc != 0) {
2600                         CERROR("Can't post receives for "LPX64"\n",
2601                                peer->ibp_nid);
2602                         kibnal_connreq_done(conn, 1, rc);
2603                         return;
2604                 }
2605                 
2606                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2607                 if (rc != 0) {
2608                         kibnal_connreq_done(conn, 1, rc);
2609                         return;
2610                 }
2611                 
2612                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2613                 if (rc != 0) {
2614                         kibnal_connreq_done(conn, 1, rc);
2615                         return;
2616                 }
2617                 
2618                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2619                 kibnal_conn_addref(conn);       /* ++for CM callback */
2620                 
2621                 memset(&rtu, 0, sizeof(rtu));
2622                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2623                                  kibnal_cm_callback, conn);
2624                 if (cmrc == cm_stat_success) {
2625                         /* Now I'm racing with disconnect signalled by
2626                          * kibnal_cm_callback */
2627                         kibnal_connreq_done(conn, 1, 0);
2628                         return;
2629                 }
2630
2631                 CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2632                 /* Back out of RTU: no callback coming */
2633                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2634                 kibnal_conn_decref(conn);
2635                 kibnal_connreq_done(conn, 1, -EIO);
2636                 return;
2637         }
2638
2639         if (cv->cv_conndata.status == cm_event_conn_reject) {
2640
2641                 if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
2642                         CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
2643                                cv->cv_conndata.data.reject.reason);
2644                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2645                         return;
2646                 }
2647
2648                 CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);
2649
2650                 cep = cm_create_cep(cm_cep_transp_rc);
2651                 if (cep == NULL) {
2652                         CERROR("Can't create new CEP\n");
2653                         kibnal_connreq_done(conn, 1, -ENOMEM);
2654                         return;
2655                 }
2656
2657                 cmrc = cm_cancel(conn->ibc_cep);
2658                 LASSERT (cmrc == cm_stat_success);
2659                 cmrc = cm_destroy_cep(conn->ibc_cep);
2660                 LASSERT (cmrc == cm_stat_success);
2661
2662                 conn->ibc_cep = cep;
2663
2664                 /* retry connect */
2665                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2666                 kibnal_connect_conn(conn);
2667                 return;
2668         }
2669
2670         CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
2671                cv->cv_conndata.status);
2672         kibnal_connreq_done(conn, 1, -ECONNABORTED);
2673 }
2674
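/* NB (annotation): completion of address resolution.  ARP can yield either
 * a complete path record (IBAT_PRI_PATH_VALID) or just a LID, in which case
 * a path record is synthesised from compile-time defaults (pkey, SL, rate,
 * etc.).  Failed attempts are retried until ibp_arp_count is exhausted. */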
2675 void
2676 kibnal_arp_done (kib_conn_t *conn)
2677 {
2678         kib_peer_t           *peer = conn->ibc_peer;
2679         kib_connvars_t       *cv = conn->ibc_connvars;
2680         ibat_arp_data_t      *arp = &cv->cv_arp;
2681         ib_path_record_v2_t  *path = &cv->cv_path;
2682         vv_return_t           vvrc;
2683         int                   rc;
2684         unsigned long         flags;
2685
2686         LASSERT (!in_interrupt());
2687         LASSERT (current == kibnal_data.kib_connd);
2688         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2689         LASSERT (peer->ibp_arp_count > 0);
2690         
2691         if (cv->cv_arprc != ibat_stat_ok) {
2692                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2693                 peer->ibp_arp_count--;
2694                 if (peer->ibp_arp_count == 0) {
2695                         /* final ARP attempt failed */
2696                         write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2697                                                 flags);
2698                         CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", 
2699                                peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
2700                                cv->cv_arprc);
2701                 } else {
2702                         /* Retry ARP: ibp_connecting++ so terminating conn
2703                          * doesn't end peer's connection attempt */
2704                         peer->ibp_connecting++;
2705                         write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2706                                                 flags);
2707                         CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d "
2708                               "(%d attempts left)\n", 
2709                               peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
2710                               cv->cv_arprc, peer->ibp_arp_count);
2711
2712                         kibnal_schedule_peer_arp(peer);
2713                 }
2714                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2715                 return;
2716         }
2717
2718         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
2719                 CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
2720
2721                 *path = *arp->primary_path;
2722
2723                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
2724                                          &cv->cv_port);
2725                 LASSERT (vvrc == vv_return_ok);
2726
2727                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2728                                      &path->sgid, &cv->cv_sgid_index);
2729                 LASSERT (vvrc == vv_return_ok);
2730
2731                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2732                                        path->pkey, &cv->cv_pkey_index);
2733                 LASSERT (vvrc == vv_return_ok);
2734
2735                 path->mtu = IBNAL_IB_MTU;
2736
2737         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
2738                 CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
2739                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2740
2741                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
2742                 cv->cv_sgid_index = IBNAL_SGID_IDX;
2743                 cv->cv_port = arp->local_port_num;
2744
2745                 memset(path, 0, sizeof(*path));
2746
2747                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
2748                                          &path->sgid);
2749                 LASSERT (vvrc == vv_return_ok);
2750
2751                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
2752                                          &path->slid);
2753                 LASSERT (vvrc == vv_return_ok);
2754
2755                 path->dgid          = arp->gid;
2756                 path->sl            = IBNAL_SERVICE_LEVEL;
2757                 path->dlid          = arp->lid;
2758                 path->mtu           = IBNAL_IB_MTU;
2759                 path->rate          = IBNAL_STATIC_RATE;
2760                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
2761                 path->pkey          = IBNAL_PKEY;
2762                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
2763         } else {
2764                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n", 
2765                        peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2766                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2767                 return;
2768         }
2769
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                /* QP can't reach INIT: fail the connection attempt; don't
                 * fall through and issue the connect request anyway */
                kibnal_connreq_done(conn, 1, rc);
                return;
        }
2774
2775         /* do the actual connection request */
2776         kibnal_connect_conn(conn);
2777 }
2778
2779 void
2780 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
2781 {
2782         /* CAVEAT EMPTOR: tasklet context */
        kib_conn_t      *conn = (kib_conn_t *)arg;
        kib_peer_t      *peer;

        LASSERT (conn != NULL);                 /* check before dereferencing */
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);

        peer = conn->ibc_peer;

        if (arprc != ibat_stat_ok)
                CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc);
        else
                CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u OK: LID %s PATH %s\n",
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip),
                       (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
                       (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
2798
2799         conn->ibc_connvars->cv_arprc = arprc;
2800         if (arprc == ibat_stat_ok)
2801                 conn->ibc_connvars->cv_arp = *arp_data;
2802         
2803         kibnal_schedule_conn(conn);
2804         kibnal_conn_decref(conn);
2805 }
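
/* A minimal sketch (not part of the original file) of the deferral pattern
 * the tasklet-context caveat above forces: the callback may not block, so it
 * only records the ARP result and hands the conn to the (process context)
 * connd.  kibnal_schedule_conn() is defined elsewhere; this is an assumed
 * outline of that pattern, using only structures visible in this file. */
#if 0
static void
kibnal_schedule_conn_sketch (kib_conn_t *conn)
{
        unsigned long flags;

        kibnal_conn_addref(conn);               /* extra ref for the connd */

        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
        wake_up(&kibnal_data.kib_connd_waitq);
        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}
#endif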
2806
2807 void
2808 kibnal_arp_peer (kib_peer_t *peer)
2809 {
2810         cm_cep_handle_t  cep;
2811         kib_conn_t      *conn;
2812         int              ibatrc;
2813
2814         /* Only the connd does this (i.e. single threaded) */
2815         LASSERT (current == kibnal_data.kib_connd);
2816         LASSERT (peer->ibp_connecting != 0);
2817         LASSERT (peer->ibp_arp_count > 0);
2818
2819         cep = cm_create_cep(cm_cep_transp_rc);
2820         if (cep == NULL) {
2821                 CERROR ("Can't create cep for conn->"LPX64"\n",
2822                         peer->ibp_nid);
2823                 kibnal_peer_connect_failed(peer, 1);
2824                 return;
2825         }
2826
2827         conn = kibnal_create_conn(cep);
2828         if (conn == NULL) {
2829                 CERROR ("Can't allocate conn->"LPX64"\n",
2830                         peer->ibp_nid);
2831                 cm_destroy_cep(cep);
2832                 kibnal_peer_connect_failed(peer, 1);
2833                 return;
2834         }
2835
2836         conn->ibc_peer = peer;
2837         kibnal_peer_addref(peer);
2838
2839         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2840
2841         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, 
2842                                   ibat_paths_primary,
2843                                   &conn->ibc_connvars->cv_arp, 
2844                                   kibnal_arp_callback, conn, 0);
2845         CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
2846         switch (ibatrc) {
2847         default:
2848                 LBUG();
2849                 
2850         case ibat_stat_pending:
2851                 /* NB callback has my ref on conn */
2852                 break;
2853                 
2854         case ibat_stat_ok:
2855         case ibat_stat_error:
2856         case ibat_stat_timeout:
2857         case ibat_stat_not_found:
2858                 /* Immediate return (ARP cache hit or failure) == no callback. 
2859                  * Do the next stage directly... */
2860                 conn->ibc_connvars->cv_arprc = ibatrc;
2861                 kibnal_arp_done(conn);
2862                 kibnal_conn_decref(conn);
2863                 break;
2864         }
2865 }
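
/* NB ref accounting in kibnal_arp_peer() (a summary inferred from the code
 * above, not original text): the conn is created holding one ref.  If
 * ibat_get_ib_data() returns ibat_stat_pending that ref is donated to
 * kibnal_arp_callback(), which drops it after scheduling the conn; on any
 * immediate return the next stage runs inline and the ref is dropped here. */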
2866
2867 int
2868 kibnal_conn_timed_out (kib_conn_t *conn)
2869 {
2870         kib_tx_t          *tx;
2871         struct list_head  *ttmp;
2872
2873         spin_lock(&conn->ibc_lock);
2874
2875         list_for_each (ttmp, &conn->ibc_tx_queue) {
2876                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2877
2878                 LASSERT (tx->tx_queued);
2879
2880                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2881                         spin_unlock(&conn->ibc_lock);
2882                         return 1;
2883                 }
2884         }
2885
2886         list_for_each (ttmp, &conn->ibc_active_txs) {
2887                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2888
2889                 LASSERT (!tx->tx_queued);
2890                 LASSERT (tx->tx_waiting ||
2891                          tx->tx_sending != 0);
2892
2893                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2894                         spin_unlock(&conn->ibc_lock);
2895                         return 1;
2896                 }
2897         }
2898
2899         spin_unlock(&conn->ibc_lock);
2900         return 0;
2901 }
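
/* Illustrative sketch (an assumption, not original code): tx_deadline,
 * compared against jiffies above, is an absolute expiry, presumably stamped
 * when the tx is queued or posted, along these lines: */
#if 0
        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
#endif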
2902
2903 void
2904 kibnal_check_conns (int idx)
2905 {
2906         struct list_head  *peers = &kibnal_data.kib_peers[idx];
2907         struct list_head  *ptmp;
2908         kib_peer_t        *peer;
2909         kib_conn_t        *conn;
2910         struct list_head  *ctmp;
2911         unsigned long      flags;
2912
2913  again:
        /* NB. We expect to scan all the peers without finding any RDMAs to
         * time out, so a shared (read) lock suffices while we
         * take a look... */
2917         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2918
2919         list_for_each (ptmp, peers) {
2920                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2921
2922                 list_for_each (ctmp, &peer->ibp_conns) {
2923                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
2924
2925                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2926
2927                         /* In case we have enough credits to return via a
2928                          * NOOP, but there were no non-blocking tx descs
2929                          * free to do it last time... */
2930                         kibnal_check_sends(conn);
2931
2932                         if (!kibnal_conn_timed_out(conn))
2933                                 continue;
2934
2935                         /* Handle timeout by closing the whole connection.  We
2936                          * can only be sure RDMA activity has ceased once the
2937                          * QP has been modified. */
2938                         
2939                         kibnal_conn_addref(conn); /* 1 ref for me... */
2940
2941                         read_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2942                                                flags);
2943
2944                         CERROR("Timed out RDMA with "LPX64"\n",
2945                                peer->ibp_nid);
2946
2947                         kibnal_close_conn (conn, -ETIMEDOUT);
2948                         kibnal_conn_decref(conn); /* ...until here */
2949
                        /* start again now I've dropped the lock: the peer
                         * and conn lists may have changed meanwhile */
2951                         goto again;
2952                 }
2953         }
2954
2955         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2956 }
2957
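/* Teardown summary (inferred from the code below and the connd state
 * switch, not original text): a closing conn arrives here in
 * IBNAL_CONN_DISCONNECT1.  If the CM callback has already fired
 * (ibc_disconnect set) the conn is retired at once via
 * kibnal_conn_disconnected(); otherwise an active cm_disconnect() is
 * issued and the conn waits in IBNAL_CONN_DISCONNECT2 for the CM. */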
2958 void
2959 kibnal_disconnect_conn (kib_conn_t *conn)
2960 {
2961         static cm_drequest_data_t dreq;         /* just for the space */
2962         
2963         cm_return_t    cmrc;
2964         unsigned long  flags;
2965
2966         LASSERT (!in_interrupt());
2967         LASSERT (current == kibnal_data.kib_connd);
2968         
2969         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2970
2971         if (conn->ibc_disconnect) {
2972                 /* Had the CM callback already */
2973                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
2974                                         flags);
2975                 kibnal_conn_disconnected(conn);
2976                 return;
2977         }
2978                 
2979         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
2980
2981         /* active disconnect */
2982         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
2983         if (cmrc == cm_stat_success) {
2984                 /* waiting for CM */
2985                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
2986                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2987                 return;
2988         }
2989
2990         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2991
2992         cm_cancel(conn->ibc_cep);
2993         kibnal_pause(HZ/10);
2994
2995         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
2996                 kibnal_conn_decref(conn);
2997         
2998         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
2999         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3000
3001         kibnal_conn_disconnected(conn);
3002 }
3003
3004 int
3005 kibnal_connd (void *arg)
3006 {
3007         wait_queue_t       wait;
3008         unsigned long      flags;
3009         kib_pcreq_t       *pcr;
3010         kib_conn_t        *conn;
3011         kib_peer_t        *peer;
3012         int                timeout;
3013         int                i;
3014         int                dropped_lock;
3015         int                peer_index = 0;
3016         unsigned long      deadline = jiffies;
3017         
3018         kportal_daemonize ("kibnal_connd");
3019         kportal_blockallsigs ();
3020
3021         init_waitqueue_entry (&wait, current);
3022         kibnal_data.kib_connd = current;
3023
3024         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3025
3026         while (!kibnal_data.kib_shutdown) {
3027
3028                 dropped_lock = 0;
3029
3030                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3031                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
3032                                            kib_conn_t, ibc_list);
3033                         list_del (&conn->ibc_list);
3034                         
3035                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3036                         dropped_lock = 1;
3037
3038                         kibnal_destroy_conn(conn);
3039
3040                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3041                 }
3042
3043                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3044                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3045                                          kib_pcreq_t, pcr_list);
3046                         list_del(&pcr->pcr_list);
3047                         
3048                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3049                         dropped_lock = 1;
3050
3051                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3052                         PORTAL_FREE(pcr, sizeof(*pcr));
3053
3054                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3055                 }
3056                         
3057                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3058                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3059                                            kib_peer_t, ibp_connd_list);
3060                         
3061                         list_del_init (&peer->ibp_connd_list);
3062                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3063                         dropped_lock = 1;
3064
3065                         kibnal_arp_peer (peer);
3066                         kibnal_peer_decref (peer);
3067
3068                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3069                 }
3070
3071                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3072                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3073                                            kib_conn_t, ibc_list);
3074                         list_del (&conn->ibc_list);
3075                         
3076                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3077                         dropped_lock = 1;
3078
3079                         switch (conn->ibc_state) {
3080                         default:
3081                                 LBUG();
3082                                 
3083                         case IBNAL_CONN_ACTIVE_ARP:
3084                                 kibnal_arp_done(conn);
3085                                 break;
3086
3087                         case IBNAL_CONN_ACTIVE_CONNECT:
3088                                 kibnal_check_connreply(conn);
3089                                 break;
3090
3091                         case IBNAL_CONN_PASSIVE_WAIT:
3092                                 kibnal_check_passive_wait(conn);
3093                                 break;
3094
3095                         case IBNAL_CONN_DISCONNECT1:
3096                         case IBNAL_CONN_DISCONNECT2:
3097                                 kibnal_disconnect_conn(conn);
3098                                 break;
3099                         }
3100                         kibnal_conn_decref(conn);
3101
3102                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3103                 }
3104
                /* careful with the jiffy wrap: the unsigned difference is
                 * taken modulo 2^32, so the signed cast gives the correct
                 * (possibly negative) delta even across a wrap */
                timeout = (int)(deadline - jiffies);
3107                 if (timeout <= 0) {
3108                         const int n = 4;
3109                         const int p = 1;
3110                         int       chunk = kibnal_data.kib_peer_hash_size;
3111                         
3112                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3113                         dropped_lock = 1;
3114
3115                         /* Time to check for RDMA timeouts on a few more
3116                          * peers: I do checks every 'p' seconds on a
3117                          * proportion of the peer table and I need to check
3118                          * every connection 'n' times within a timeout
3119                          * interval, to ensure I detect a timeout on any
3120                          * connection within (n+1)/n times the timeout
3121                          * interval. */
3122
3123                         if (kibnal_tunables.kib_io_timeout > n * p)
3124                                 chunk = (chunk * n * p) / 
3125                                         kibnal_tunables.kib_io_timeout;
3126                         if (chunk == 0)
3127                                 chunk = 1;
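                        /* Worked example (illustrative numbers, not from
                         * the source): with kib_peer_hash_size == 101,
                         * kib_io_timeout == 60, n == 4, p == 1:
                         * chunk = (101 * 4 * 1) / 60 == 6 by integer
                         * division, so 6 buckets are swept each second and
                         * the whole table about every 17s, close to the
                         * 60/4 == 15s per-connection check target. */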
3128
3129                         for (i = 0; i < chunk; i++) {
3130                                 kibnal_check_conns (peer_index);
3131                                 peer_index = (peer_index + 1) % 
3132                                              kibnal_data.kib_peer_hash_size;
3133                         }
3134
3135                         deadline += p * HZ;
3136                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3137                 }
3138
3139                 if (dropped_lock)
3140                         continue;
3141                 
                /* Nothing to do: wait up to 'timeout' jiffies for new work */
3143                 set_current_state (TASK_INTERRUPTIBLE);
3144                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3145                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3146
3147                 schedule_timeout (timeout);
3148
3149                 set_current_state (TASK_RUNNING);
3150                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3151                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3152         }
3153
3154         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3155
3156         kibnal_thread_fini ();
3157         return (0);
3158 }
3159
3160 void 
3161 kibnal_async_callback(vv_event_record_t ev)
3162 {
3163         CERROR("type: %d, port: %d, data: "LPX64"\n", 
3164                ev.event_type, ev.port_num, ev.type.data);
3165 }
3166
3167 void
3168 kibnal_cq_callback (unsigned long unused_context)
3169 {
3170         unsigned long    flags;
3171
        CDEBUG(D_NET, "CQ completion notification\n");
3173
3174         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3175         kibnal_data.kib_ready = 1;
3176         wake_up(&kibnal_data.kib_sched_waitq);
3177         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3178 }
3179
3180 int
3181 kibnal_scheduler(void *arg)
3182 {
3183         long            id = (long)arg;
3184         wait_queue_t    wait;
3185         char            name[16];
3186         vv_wc_t         wc;
3187         vv_return_t     vvrc;
3188         vv_return_t     vvrc2;
3189         unsigned long   flags;
3190         kib_rx_t       *rx;
3191         __u64           rxseq = 0;
3192         int             busy_loops = 0;
3193
3194         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3195         kportal_daemonize(name);
3196         kportal_blockallsigs();
3197
3198         init_waitqueue_entry(&wait, current);
3199
3200         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3201
3202         while (!kibnal_data.kib_shutdown) {
3203                 if (busy_loops++ >= IBNAL_RESCHED) {
3204                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3205                                                flags);
3206
3207                         our_cond_resched();
3208                         busy_loops = 0;
3209                         
3210                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3211                 }
3212
3213                 if (kibnal_data.kib_ready &&
3214                     !kibnal_data.kib_checking_cq) {
3215                         /* take ownership of completion polling */
3216                         kibnal_data.kib_checking_cq = 1;
3217                         /* Assume I'll exhaust the CQ */
3218                         kibnal_data.kib_ready = 0;
3219                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, 
3220                                                flags);
3221                         
3222                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca, 
3223                                                       kibnal_data.kib_cq, &wc);
3224                         if (vvrc == vv_return_err_cq_empty) {
3225                                 vvrc2 = vv_request_completion_notification(
3226                                         kibnal_data.kib_hca, 
3227                                         kibnal_data.kib_cq, 
3228                                         vv_next_solicit_unsolicit_event);
3229                                 LASSERT (vvrc2 == vv_return_ok);
3230                         }
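
                        /* NB re-arm race: a completion arriving between the
                         * empty poll above and the re-arm must still raise
                         * an event.  This presumably relies on the verbs
                         * layer signalling for completions already queued
                         * when the CQ is armed; the usual defensive pattern
                         * otherwise is to poll once more after arming. */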
3231
3232                         if (vvrc == vv_return_ok &&
3233                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3234                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3235
3236                                 /* Grab the RX sequence number NOW before
3237                                  * anyone else can get an RX completion */
3238                                 rxseq = rx->rx_conn->ibc_rxseq++;
3239                         }
3240
3241                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3242                         /* give up ownership of completion polling */
3243                         kibnal_data.kib_checking_cq = 0;
3244
3245                         if (vvrc == vv_return_err_cq_empty)
3246                                 continue;
3247
3248                         LASSERT (vvrc == vv_return_ok);
3249                         /* Assume there's more: get another scheduler to check
3250                          * while I handle this completion... */
3251
3252                         kibnal_data.kib_ready = 1;
3253                         wake_up(&kibnal_data.kib_sched_waitq);
3254
3255                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3256                                                flags);
3257
3258                         switch (kibnal_wreqid2type(wc.wr_id)) {
3259                         case IBNAL_WID_RX:
3260                                 kibnal_rx_complete(
3261                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3262                                         wc.completion_status,
3263                                         wc.num_bytes_transfered,
3264                                         rxseq);
3265                                 break;
3266
3267                         case IBNAL_WID_TX:
3268                                 kibnal_tx_complete(
3269                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3270                                         wc.completion_status);
3271                                 break;
3272
3273                         case IBNAL_WID_RDMA:
3274                                 /* We only get RDMA completion notification if
3275                                  * it fails.  So we just ignore them completely
3276                                  * because...
3277                                  *
3278                                  * 1) If an RDMA fails, all subsequent work
3279                                  * items, including the final SEND will fail
3280                                  * too, so I'm still guaranteed to notice that
3281                                  * this connection is hosed.
3282                                  *
3283                                  * 2) It's positively dangerous to look inside
3284                                  * the tx descriptor obtained from an RDMA work
3285                                  * item.  As soon as I drop the kib_sched_lock,
3286                                  * I give a scheduler on another CPU a chance
3287                                  * to get the final SEND completion, so the tx
3288                                  * descriptor can get freed as I inspect it. */
3289                                 CERROR ("RDMA failed: %d\n", 
3290                                         wc.completion_status);
3291                                 break;
3292
3293                         default:
3294                                 LBUG();
3295                         }
3296                         
3297                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3298                         continue;
3299                 }
3300
3301                 /* Nothing to do; sleep... */
3302
3303                 set_current_state(TASK_INTERRUPTIBLE);
3304                 add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3305                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3306                                        flags);
3307
3308                 schedule();
3309
3310                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3311                 set_current_state(TASK_RUNNING);
3312                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3313         }
3314
3315         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3316
3317         kibnal_thread_fini();
3318         return (0);
3319 }
3320
3321
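/* NAL method table handed to the portals lib layer, which dispatches
 * sends/recvs through these handlers (lib_finalize() above is the callback
 * path in the other direction). */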
3322 lib_nal_t kibnal_lib = {
3323         .libnal_data = &kibnal_data,      /* NAL private data */
3324         .libnal_send = kibnal_send,
3325         .libnal_send_pages = kibnal_send_pages,
3326         .libnal_recv = kibnal_recv,
3327         .libnal_recv_pages = kibnal_recv_pages,
3328         .libnal_dist = kibnal_dist
3329 };