Whamcloud - gitweb
60572b8de1d2bfd17a65133f049303f50720950a
[fs/lustre-release.git] / lnet / klnds / viblnd / viblnd_cb.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *   Author: Frank Zago <fzago@systemfabricworks.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  *
23  */
24
25 #include "viblnd.h"
26
/*
 * Retire a tx descriptor and finalize the LNET messages attached to it.
 *
 * The tx must be completely idle: not queued, not being sent and not waiting
 * for a peer response (all asserted), and the caller must not be in
 * interrupt context.
 *
 * The descriptor's state is reset and it is put back on the idle list
 * before lnet_finalize() is called for each attached message, using the
 * completion status that was in tx->tx_status on entry.
 */
void
kibnal_tx_done (kib_tx_t *tx)
{
        lnet_msg_t *lntmsg[2];
        /* snapshot the status now: tx_status is cleared further down */
        int         rc = tx->tx_status;
        int         i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */

#if IBNAL_USE_FMR
        /* Unmap the FMR when its remap count has run down to 0, or when the
         * tx failed while a mapping was still active; then reload the remap
         * count from the tunable. */
        if (tx->tx_md.md_fmrcount == 0 ||
            (rc != 0 && tx->tx_md.md_active)) {
                vv_return_t      vvrc;

                /* mapping must be active (it dropped fmrcount to 0) */
                LASSERT (tx->tx_md.md_active); 

                vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
                                    1, &tx->tx_md.md_fmrhandle);
                LASSERT (vvrc == vv_return_ok);

                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
        }
        /* whether or not it was unmapped, the mapping is no longer in use */
        tx->tx_md.md_active = 0;
#endif

        /* tx may have up to 2 lnet msgs to finalise */
        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
        
        /* drop the tx's reference on its connection, if it has one */
        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        /* restore the invariants kibnal_get_idle_tx() asserts */
        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);

        list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);

        spin_unlock(&kibnal_data.kib_tx_lock);

        /* delay finalize until my descs have been freed */
        for (i = 0; i < 2; i++) {
                if (lntmsg[i] == NULL)
                        continue;

                lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
        }
}
82
83 void
84 kibnal_txlist_done (struct list_head *txlist, int status)
85 {
86         kib_tx_t *tx;
87
88         while (!list_empty (txlist)) {
89                 tx = list_entry (txlist->next, kib_tx_t, tx_list);
90
91                 list_del (&tx->tx_list);
92                 /* complete now */
93                 tx->tx_waiting = 0;
94                 tx->tx_status = status;
95                 kibnal_tx_done (tx);
96         }
97 }
98
99 kib_tx_t *
100 kibnal_get_idle_tx (void) 
101 {
102         kib_tx_t      *tx;
103         
104         spin_lock(&kibnal_data.kib_tx_lock);
105
106         if (list_empty (&kibnal_data.kib_idle_txs)) {
107                 spin_unlock(&kibnal_data.kib_tx_lock);
108                 return NULL;
109         }
110
111         tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
112         list_del (&tx->tx_list);
113
114         /* Allocate a new completion cookie.  It might not be needed,
115          * but we've got a lock right now and we're unlikely to
116          * wrap... */
117         tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
118
119         spin_unlock(&kibnal_data.kib_tx_lock);
120
121         LASSERT (tx->tx_nwrq == 0);
122         LASSERT (!tx->tx_queued);
123         LASSERT (tx->tx_sending == 0);
124         LASSERT (!tx->tx_waiting);
125         LASSERT (tx->tx_status == 0);
126         LASSERT (tx->tx_conn == NULL);
127         LASSERT (tx->tx_lntmsg[0] == NULL);
128         LASSERT (tx->tx_lntmsg[1] == NULL);
129         
130         return tx;
131 }
132
/*
 * Post receive descriptor 'rx' on its connection's QP.
 *
 * rx            descriptor to post; asserted not to be posted already
 *               (rx_nob >= 0)
 * credit        if set, conn->ibc_outstanding_credits is incremented when
 *               the post succeeds
 * rsrvd_credit  if set, conn->ibc_reserved_credits is incremented when the
 *               post succeeds; asserted never to be set on connections whose
 *               version is IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD
 *
 * Returns 0 when the rx was posted, and also when the connection state is
 * already beyond IBNAL_CONN_ESTABLISHED, in which case nothing is posted and
 * the rx's reference on the connection is dropped.  Returns -EIO when
 * vv_post_receive() fails; the connection is then closed and the rx's
 * reference on it is dropped.
 *
 * Must not be called from interrupt context.
 */
int
kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        __u64         addr = (__u64)((unsigned long)((rx)->rx_msg));
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());
        /* old peers don't reserve rxs for RDMA replies */
        LASSERT (!rsrvd_credit ||
                 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
        
        /* single gather entry covering the whole message buffer */
        rx->rx_gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(addr),
                .l_key     = rx->rx_lkey,
                .length    = IBNAL_MSG_SIZE,
        };

        /* receive work request; the wr_id encodes the rx pointer tagged
         * with IBNAL_WID_RX */
        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (rx->rx_nob >= 0);              /* not posted */

        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", 
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }
        
        rx->rx_nob = -1;                        /* flag posted */
        
        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);

        if (vvrc == vv_return_ok) {
                /* account for the newly posted buffer while still holding
                 * ibc_lock */
                if (credit)
                        conn->ibc_outstanding_credits++;
                if (rsrvd_credit)
                        conn->ibc_reserved_credits++;

                spin_unlock(&conn->ibc_lock);

                /* credit counts changed: give queued sends a chance to go
                 * (called only after ibc_lock has been released) */
                if (credit || rsrvd_credit)
                        kibnal_check_sends(conn);

                return 0;
        }
        
        spin_unlock(&conn->ibc_lock);

        CERROR ("post rx -> %s failed %d\n", 
                libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
        rc = -EIO;
        kibnal_close_conn(rx->rx_conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}
205
206 int
207 kibnal_post_receives (kib_conn_t *conn)
208 {
209         int    i;
210         int    rc;
211
212         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
213         LASSERT (conn->ibc_comms_error == 0);
214
215         for (i = 0; i < IBNAL_RX_MSGS; i++) {
216                 /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
217                  * fails (i.e. actual failure or we're disconnecting) */
218                 kibnal_conn_addref(conn);
219                 rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
220                 if (rc != 0)
221                         return rc;
222         }
223
224         return 0;
225 }
226
227 kib_tx_t *
228 kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
229 {
230         struct list_head   *tmp;
231         
232         list_for_each(tmp, &conn->ibc_active_txs) {
233                 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
234                 
235                 LASSERT (!tx->tx_queued);
236                 LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
237
238                 if (tx->tx_cookie != cookie)
239                         continue;
240
241                 if (tx->tx_waiting &&
242                     tx->tx_msg->ibm_type == txtype)
243                         return tx;
244
245                 CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
246                       tx->tx_waiting ? "" : "NOT ",
247                       tx->tx_msg->ibm_type, txtype);
248         }
249         return NULL;
250 }
251
252 void
253 kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
254 {
255         kib_tx_t    *tx;
256         int          idle;
257
258         spin_lock(&conn->ibc_lock);
259
260         tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
261         if (tx == NULL) {
262                 spin_unlock(&conn->ibc_lock);
263
264                 CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
265                       txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
266                 kibnal_close_conn (conn, -EPROTO);
267                 return;
268         }
269
270         if (tx->tx_status == 0) {               /* success so far */
271                 if (status < 0) {               /* failed? */
272                         tx->tx_status = status;
273                 } else if (txtype == IBNAL_MSG_GET_REQ) { 
274                         lnet_set_reply_msg_len(kibnal_data.kib_ni,
275                                                tx->tx_lntmsg[1], status);
276                 }
277         }
278         
279         tx->tx_waiting = 0;
280
281         idle = !tx->tx_queued && (tx->tx_sending == 0);
282         if (idle)
283                 list_del(&tx->tx_list);
284
285         spin_unlock(&conn->ibc_lock);
286         
287         if (idle)
288                 kibnal_tx_done(tx);
289 }
290
291 void
292 kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) 
293 {
294         kib_tx_t    *tx = kibnal_get_idle_tx();
295         
296         if (tx == NULL) {
297                 CERROR("Can't get tx for completion %x for %s\n",
298                        type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
299                 return;
300         }
301         
302         tx->tx_msg->ibm_u.completion.ibcm_status = status;
303         tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
304         kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
305         
306         kibnal_queue_tx(tx, conn);
307 }
308
/*
 * Process a received message on a connection that is at least ESTABLISHED.
 *
 * Any credits carried in the message are added to the connection first,
 * then the message is dispatched on its type.  A negative 'rc' left by any
 * case is treated as a protocol error and closes the connection.
 *
 * The rx buffer is reposted here unless the message was handed to
 * lnet_parse() and that call did not fail ('repost' is cleared in that
 * case).  When reposting, the buffer is accounted either as a normal credit
 * or, for RDMA replies from peers that pre-reserve buffers, as a reserved
 * credit.
 */
void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc = 0;
        int           repost = 1;
        int           rsrvd_credit = 0;
        int           rc2;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from %s\n",
                msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
        
        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from %s\n",
                       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                rc = -EPROTO;
                break;

        case IBNAL_MSG_NOOP:
                /* nothing to do beyond the credit handling above */
                break;

        case IBNAL_MSG_IMMEDIATE:
                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
                                msg->ibm_srcnid, rx, 0);
                repost = rc < 0;                /* repost on error */
                break;
                
        case IBNAL_MSG_PUT_REQ:
                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
                                msg->ibm_srcnid, rx, 1);
                repost = rc < 0;                /* repost on error */
                break;

        case IBNAL_MSG_PUT_NAK:
                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
                
                /* peer refused the PUT: complete the waiting PUT_REQ tx with
                 * the status carried in the message */
                CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, 
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_PUT_ACK:
                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */

                /* find the PUT_REQ tx this ack answers and take it off the
                 * active list so it can be re-queued below */
                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from %s\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                        rc = -EPROTO;
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                /* reuse the same tx for the RDMA followed by PUT_DONE, using
                 * the RDMA descriptor and cookie the peer sent in the ack */
                rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
                                       kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                       &msg->ibm_u.putack.ibpam_rd,
                                       msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc2 < 0)
                        CERROR("Can't setup rdma for PUT to %s: %d\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);

                /* the tx is queued even when the RDMA setup failed; the
                 * failure is recorded in tx_status unless an earlier error
                 * is already there */
                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc2 < 0)
                        tx->tx_status = rc2;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;
                
        case IBNAL_MSG_PUT_DONE:
                /* This buffer was pre-reserved by not returning the credit
                 * when the PUT_REQ's buffer was reposted, so I just return it
                 * now */
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_GET_REQ:
                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
                                msg->ibm_srcnid, rx, 1);
                repost = rc < 0;                /* repost on error */
                break;

        case IBNAL_MSG_GET_DONE:
                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
                
                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        if (rc < 0)                             /* protocol error */
                kibnal_close_conn(conn, rc);

        if (repost) {
                if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
                        rsrvd_credit = 0;       /* peer isn't pre-reserving */

                /* exactly one of the two credit flags is set */
                kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
        }
}
440
/*
 * Completion handler for a posted receive.
 *
 * rx     receive descriptor that completed; asserted to have been posted
 * vvrc   completion status reported for the receive
 * nob    byte count reported for the receive; only stored in rx->rx_nob
 *        once the message has unpacked successfully
 * rxseq  sequence number the message is expected to carry
 *
 * If the connection is already beyond ESTABLISHED the rx is dropped: it is
 * not reposted and its reference on the connection is released.  If the
 * completion failed, or the message fails unpacking, the NID/incarnation
 * checks or the sequence check, the connection is additionally closed with
 * -EIO.  A valid message that arrives before the connection is ESTABLISHED
 * is parked on conn->ibc_early_rxs; otherwise it goes to kibnal_handle_rx().
 */
void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_nob < 0);               /* was posted */
        rx->rx_nob = 0;                         /* isn't now */

        /* connection is on its way down: don't even look at the message */
        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from %s failed: %d\n", 
                       libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
                goto failed;
        }

        rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from %s\n",
                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                goto failed;
        }

        rx->rx_nob = nob;                       /* Can trust 'nob' now */

        /* the message must be from this conn's peer, addressed to this NI,
         * and stamped with both sides' current incarnations */
        if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
                                     msg->ibm_srcnid) ||
            !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, 
                                     msg->ibm_dstnid) ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from %s\n",
                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from %s"
                        ": got "LPD64" but expected "LPD64"\n",
                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
                        msg->ibm_seq, rxseq);
                goto failed;
        }

        /* set time last known alive */
        kibnal_peer_alive(conn->ibc_peer);

        /* racing with connection establishment/teardown! */

        /* unlocked check first; it is repeated under the global lock before
         * the rx is actually parked */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
                                        flags);
        }
        kibnal_handle_rx(rx);
        return;
        
 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
        /* falls through to drop the rx's conn ref */
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}
517
518 struct page *
519 kibnal_kvaddr_to_page (unsigned long vaddr)
520 {
521         struct page *page;
522
523         if (vaddr >= VMALLOC_START &&
524             vaddr < VMALLOC_END) {
525                 page = vmalloc_to_page ((void *)vaddr);
526                 LASSERT (page != NULL);
527                 return page;
528         }
529 #if CONFIG_HIGHMEM
530         if (vaddr >= PKMAP_BASE &&
531             vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
532                 /* No highmem pages only used for bulk (kiov) I/O */
533                 CERROR("find page for address in highmem\n");
534                 LBUG();
535         }
536 #endif
537         page = virt_to_page (vaddr);
538         LASSERT (page != NULL);
539         return page;
540 }
541
542 #if !IBNAL_USE_FMR
543 int
544 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
545                      unsigned long page_offset, unsigned long len)
546 {
547         kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
548         vv_l_key_t       l_key;
549         vv_r_key_t       r_key;
550         __u64            addr;
551         __u64            frag_addr;
552         vv_mem_reg_h_t   mem_h;
553         vv_return_t      vvrc;
554
555         if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
556                 CERROR ("Too many RDMA fragments\n");
557                 return -EMSGSIZE;
558         }
559
560         /* Try to create an address that adaptor-tavor will munge into a valid
561          * network address, given how it maps all phys mem into 1 region */
562         addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;
563
564         /* NB this relies entirely on there being a single region for the whole
565          * of memory, since "high" memory will wrap in the (void *) cast! */
566         vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, 
567                                     (void *)((unsigned long)addr),
568                                     len, &mem_h, &l_key, &r_key);
569         LASSERT (vvrc == vv_return_ok);
570
571         if (active) {
572                 if (rd->rd_nfrag == 0) {
573                         rd->rd_key = l_key;
574                 } else if (l_key != rd->rd_key) {
575                         CERROR ("> 1 key for single RDMA desc\n");
576                         return -EINVAL;
577                 }
578                 frag_addr = addr;
579         } else {
580                 if (rd->rd_nfrag == 0) {
581                         rd->rd_key = r_key;
582                 } else if (r_key != rd->rd_key) {
583                         CERROR ("> 1 key for single RDMA desc\n");
584                         return -EINVAL;
585                 }
586
587                 frag_addr = kibnal_addr2net(addr);
588         }
589
590         kibnal_rf_set(frag, frag_addr, len);
591
592         CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n", 
593                rd->rd_nfrag, frag->rf_nob, rd->rd_key, 
594                frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);
595
596         rd->rd_nfrag++;
597         return 0;
598 }
599
600 int
601 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, 
602                     vv_access_con_bit_mask_t access,
603                     unsigned int niov, struct iovec *iov, int offset, int nob)
604                  
605 {
606         /* active if I'm sending */
607         int           active = ((access & vv_acc_r_mem_write) == 0);
608         int           fragnob;
609         int           rc;
610         unsigned long vaddr;
611         struct page  *page;
612         int           page_offset;
613
614         LASSERT (nob > 0);
615         LASSERT (niov > 0);
616         LASSERT ((rd != tx->tx_rd) == !active);
617
618         while (offset >= iov->iov_len) {
619                 offset -= iov->iov_len;
620                 niov--;
621                 iov++;
622                 LASSERT (niov > 0);
623         }
624
625         rd->rd_nfrag = 0;
626         do {
627                 LASSERT (niov > 0);
628
629                 vaddr = ((unsigned long)iov->iov_base) + offset;
630                 page_offset = vaddr & (PAGE_SIZE - 1);
631                 page = kibnal_kvaddr_to_page(vaddr);
632                 if (page == NULL) {
633                         CERROR ("Can't find page\n");
634                         return -EFAULT;
635                 }
636
637                 fragnob = min((int)(iov->iov_len - offset), nob);
638                 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
639
640                 rc = kibnal_append_rdfrag(rd, active, page, 
641                                           page_offset, fragnob);
642                 if (rc != 0)
643                         return rc;
644
645                 if (offset + fragnob < iov->iov_len) {
646                         offset += fragnob;
647                 } else {
648                         offset = 0;
649                         iov++;
650                         niov--;
651                 }
652                 nob -= fragnob;
653         } while (nob > 0);
654         
655         return 0;
656 }
657
658 int
659 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, 
660                       vv_access_con_bit_mask_t access,
661                       int nkiov, lnet_kiov_t *kiov, int offset, int nob)
662 {
663         /* active if I'm sending */
664         int            active = ((access & vv_acc_r_mem_write) == 0);
665         int            fragnob;
666         int            rc;
667
668         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
669
670         LASSERT (nob > 0);
671         LASSERT (nkiov > 0);
672         LASSERT ((rd != tx->tx_rd) == !active);
673
674         while (offset >= kiov->kiov_len) {
675                 offset -= kiov->kiov_len;
676                 nkiov--;
677                 kiov++;
678                 LASSERT (nkiov > 0);
679         }
680
681         rd->rd_nfrag = 0;
682         do {
683                 LASSERT (nkiov > 0);
684                 fragnob = min((int)(kiov->kiov_len - offset), nob);
685                 
686                 rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
687                                           kiov->kiov_offset + offset,
688                                           fragnob);
689                 if (rc != 0)
690                         return rc;
691
692                 offset = 0;
693                 kiov++;
694                 nkiov--;
695                 nob -= fragnob;
696         } while (nob > 0);
697
698         return 0;
699 }
700 #else
701 int
702 kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
703                int npages, unsigned long page_offset, int nob)
704 {
705         vv_return_t   vvrc;
706         vv_fmr_map_t  map_props;
707
708         LASSERT ((rd != tx->tx_rd) == !active);
709         LASSERT (!tx->tx_md.md_active);
710         LASSERT (tx->tx_md.md_fmrcount > 0);
711         LASSERT (page_offset < PAGE_SIZE);
712         LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
713         LASSERT (npages <= LNET_MAX_IOV);
714
715         memset(&map_props, 0, sizeof(map_props));
716
717         map_props.start          = (void *)page_offset;
718         map_props.size           = nob;
719         map_props.page_array_len = npages;
720         map_props.page_array     = tx->tx_pages;
721
722         vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
723                           &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
724         if (vvrc != vv_return_ok) {
725                 CERROR ("Can't map vaddr %p for %d in %d pages: %d\n", 
726                         map_props.start, nob, npages, vvrc);
727                 return -EFAULT;
728         }
729
730         tx->tx_md.md_addr = (unsigned long)map_props.start;
731         tx->tx_md.md_active = 1;
732         tx->tx_md.md_fmrcount--;
733
734         rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
735         rd->rd_nob = nob;
736         rd->rd_addr = tx->tx_md.md_addr;
737
738         /* Compensate for adaptor-tavor's munging of gatherlist addresses */
739         if (active)
740                 rd->rd_addr += PAGE_OFFSET;
741
742         return 0;
743 }
744
745 int
746 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
747                      vv_access_con_bit_mask_t access,
748                      unsigned int niov, struct iovec *iov, int offset, int nob)
749                  
750 {
751         /* active if I'm sending */
752         int           active = ((access & vv_acc_r_mem_write) == 0);
753         int           resid;
754         int           fragnob;
755         struct page  *page;
756         int           npages;
757         unsigned long page_offset;
758         unsigned long vaddr;
759
760         LASSERT (nob > 0);
761         LASSERT (niov > 0);
762
763         while (offset >= iov->iov_len) {
764                 offset -= iov->iov_len;
765                 niov--;
766                 iov++;
767                 LASSERT (niov > 0);
768         }
769
770         if (nob > iov->iov_len - offset) {
771                 CERROR ("Can't map multiple vaddr fragments\n");
772                 return (-EMSGSIZE);
773         }
774
775         vaddr = ((unsigned long)iov->iov_base) + offset;
776         
777         page_offset = vaddr & (PAGE_SIZE - 1);
778         resid = nob;
779         npages = 0;
780
781         do {
782                 LASSERT (npages < LNET_MAX_IOV);
783
784                 page = kibnal_kvaddr_to_page(vaddr);
785                 if (page == NULL) {
786                         CERROR("Can't find page for %lu\n", vaddr);
787                         return -EFAULT;
788                 }
789
790                 tx->tx_pages[npages++] = lnet_page2phys(page);
791
792                 fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
793                 vaddr += fragnob;
794                 resid -= fragnob;
795
796         } while (resid > 0);
797
798         return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
799 }
800
801 int
802 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
803                       vv_access_con_bit_mask_t access,
804                       int nkiov, lnet_kiov_t *kiov, int offset, int nob)
805 {
806         /* active if I'm sending */
807         int            active = ((access & vv_acc_r_mem_write) == 0);
808         int            resid;
809         int            npages;
810         unsigned long  page_offset;
811         
812         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
813
814         LASSERT (nob > 0);
815         LASSERT (nkiov > 0);
816         LASSERT (nkiov <= LNET_MAX_IOV);
817         LASSERT (!tx->tx_md.md_active);
818         LASSERT ((rd != tx->tx_rd) == !active);
819
820         while (offset >= kiov->kiov_len) {
821                 offset -= kiov->kiov_len;
822                 nkiov--;
823                 kiov++;
824                 LASSERT (nkiov > 0);
825         }
826
827         page_offset = kiov->kiov_offset + offset;
828         
829         resid = offset + nob;
830         npages = 0;
831
832         do {
833                 LASSERT (npages < LNET_MAX_IOV);
834                 LASSERT (nkiov > 0);
835
836                 if ((npages > 0 && kiov->kiov_offset != 0) ||
837                     (resid > kiov->kiov_len && 
838                      (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
839                         /* Can't have gaps */
840                         CERROR ("Can't make payload contiguous in I/O VM:"
841                                 "page %d, offset %d, len %d \n",
842                                 npages, kiov->kiov_offset, kiov->kiov_len);
843                         
844                         return -EINVAL;
845                 }
846
847                 tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
848                 resid -= kiov->kiov_len;
849                 kiov++;
850                 nkiov--;
851         } while (resid > 0);
852
853         return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
854 }
855 #endif
856
857 kib_conn_t *
858 kibnal_find_conn_locked (kib_peer_t *peer)
859 {
860         struct list_head *tmp;
861
862         /* just return the first connection */
863         list_for_each (tmp, &peer->ibp_conns) {
864                 return (list_entry(tmp, kib_conn_t, ibc_list));
865         }
866
867         return (NULL);
868 }
869
/*
 * Post as many queued transmits on 'conn' as current limits allow.
 *
 * Does nothing until the connection has reached IBNAL_CONN_ESTABLISHED.
 * Otherwise, under conn->ibc_lock (taken and released in here):
 *  - moves txs from ibc_tx_queue_rsrvd to ibc_tx_queue, one per reserved
 *    credit;
 *  - if both send queues are empty and either ibc_outstanding_credits has
 *    reached IBNAL_CREDIT_HIGHWATER or kibnal_send_keepalive() is true,
 *    queues a NOOP message (ibc_lock is dropped while the tx is obtained);
 *  - posts txs, ibc_tx_queue_nocred first (no credit consumed), then
 *    ibc_tx_queue (one credit each), stopping when nothing is queued,
 *    ibc_nsends_posted has reached the kib_concurrent_sends tunable, or a
 *    credit-consuming tx has no usable credit.
 *
 * If a post fails, or the connection is no longer ESTABLISHED, the credit
 * and send accounting done for that tx is undone, the connection is closed
 * with the error, the tx is completed if no other send of it is still
 * outstanding, and the function returns.
 */
870 void
871 kibnal_check_sends (kib_conn_t *conn)
872 {
873         kib_tx_t       *tx;
874         vv_return_t     vvrc;
875         int             rc;
876         int             consume_cred;
877         int             done;
878
879         /* Don't send anything until after the connection is established */
880         if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
881                 CDEBUG(D_NET, "%s too soon\n",
882                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
883                 return;
884         }
885         
886         spin_lock(&conn->ibc_lock);
887
888         LASSERT (conn->ibc_nsends_posted <=
889                  *kibnal_tunables.kib_concurrent_sends);
890         LASSERT (conn->ibc_reserved_credits >= 0);
891         
            /* Each reserved credit releases one tx from the reserved queue
             * onto the tail of the normal send queue */
892         while (conn->ibc_reserved_credits > 0 &&
893                !list_empty(&conn->ibc_tx_queue_rsrvd)) {
894                 LASSERT (conn->ibc_version != 
895                          IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
896                 tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
897                                 kib_tx_t, tx_list);
898                 list_del(&tx->tx_list);
899                 list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
900                 conn->ibc_reserved_credits--;
901         }
902         
            /* Nothing queued: send a NOOP if enough credits have built up to
             * be worth returning, or kibnal_send_keepalive() asks for one.
             * ibc_lock is not held across kibnal_get_idle_tx(), and a NULL
             * tx just means no NOOP gets queued this time round. */
903         if (list_empty(&conn->ibc_tx_queue) &&
904             list_empty(&conn->ibc_tx_queue_nocred) &&
905             (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
906              kibnal_send_keepalive(conn))) {
907                 spin_unlock(&conn->ibc_lock);
908                 
909                 tx = kibnal_get_idle_tx();
910                 if (tx != NULL)
911                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
912
913                 spin_lock(&conn->ibc_lock);
914                 
915                 if (tx != NULL)
916                         kibnal_queue_tx_locked(tx, conn);
917         }
918
919         for (;;) {
                    /* The no-credit queue is served first; txs taken from it
                     * do not consume a send credit */
920                 if (!list_empty(&conn->ibc_tx_queue_nocred)) {
921                         LASSERT (conn->ibc_version != 
922                                  IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
923                         tx = list_entry (conn->ibc_tx_queue_nocred.next, 
924                                          kib_tx_t, tx_list);
925                         consume_cred = 0;
926                 } else if (!list_empty (&conn->ibc_tx_queue)) {
927                         tx = list_entry (conn->ibc_tx_queue.next, 
928                                          kib_tx_t, tx_list);
929                         consume_cred = 1;
930                 } else {
931                         /* nothing waiting */
932                         break;
933                 }
934                 
935                 LASSERT (tx->tx_queued);
936                 /* We rely on this for QP sizing */
937                 LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
938
939                 LASSERT (conn->ibc_outstanding_credits >= 0);
940                 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
941                 LASSERT (conn->ibc_credits >= 0);
942                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
943
944                 if (conn->ibc_nsends_posted ==
945                     *kibnal_tunables.kib_concurrent_sends) {
946                         /* We've got some tx completions outstanding... */
947                         CDEBUG(D_NET, "%s: posted enough\n",
948                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
949                         break;
950                 }
951                 
952                 if (consume_cred) {
953                         if (conn->ibc_credits == 0) {   /* no credits */
954                                 CDEBUG(D_NET, "%s: no credits\n",
955                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
956                                 break;
957                         }
958                         
959                         if (conn->ibc_credits == 1 &&   /* last credit reserved for */
960                             conn->ibc_outstanding_credits == 0) { /* giving back credits */
961                                 CDEBUG(D_NET, "%s: not using last credit\n",
962                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
963                                 break;
964                         }
965                 }
966                 
967                 list_del (&tx->tx_list);
968                 tx->tx_queued = 0;
969
970                 /* NB don't drop ibc_lock before bumping tx_sending */
971
                    /* A NOOP is dropped rather than sent once other txs are
                     * queued, or the condition that justified it no longer
                     * holds */
972                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
973                     (!list_empty(&conn->ibc_tx_queue) ||
974                      !list_empty(&conn->ibc_tx_queue_nocred) ||
975                      (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
976                       !kibnal_send_keepalive(conn)))) {
977                         /* redundant NOOP */
978                         spin_unlock(&conn->ibc_lock);
979                         kibnal_tx_done(tx);
980                         spin_lock(&conn->ibc_lock);
981                         CDEBUG(D_NET, "%s: redundant noop\n",
982                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
983                         continue;
984                 }
985
                    /* The message carries all credits currently owed to the
                     * peer; they are zeroed below once packed */
986                 kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
987                                 conn->ibc_outstanding_credits,
988                                 conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
989                                 conn->ibc_txseq);
990
991                 conn->ibc_txseq++;
992                 conn->ibc_outstanding_credits = 0;
993                 conn->ibc_nsends_posted++;
994                 if (consume_cred)
995                         conn->ibc_credits--;
996
997                 /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
998                  * PUT.  If so, it was first queued here as a PUT_REQ, sent and
999                  * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
1000                  * and then re-queued here.  It's (just) possible that
1001                  * tx_sending is non-zero if we've not done the tx_complete() from
1002                  * the first send; hence the ++ rather than = below. */
1003                 tx->tx_sending++;
1004
1005                 list_add (&tx->tx_list, &conn->ibc_active_txs);
1006
1007                 /* Keep holding ibc_lock while posting sends on this
1008                  * connection; vv_post_send() isn't re-entrant on the same
1009                  * QP!! */
1010
1011                 LASSERT (tx->tx_nwrq > 0);
     /* Disabled debug dump of the first two work requests */
1012 #if 0
1013                 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) 
1014                         CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1015                                tx->tx_wrq[0].scatgat_list->v_address,
1016                                tx->tx_wrq[0].scatgat_list->length,
1017                                tx->tx_wrq[0].scatgat_list->l_key,
1018                                tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
1019                                tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
1020                 else
1021                         CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
1022                                tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
1023                                tx->tx_wrq[0].scatgat_list->v_address,
1024                                tx->tx_wrq[0].scatgat_list->length,
1025                                tx->tx_wrq[0].scatgat_list->l_key);
1026
1027                 if (tx->tx_nwrq > 1) {
1028                         if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) 
1029                                 CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1030                                        tx->tx_wrq[1].scatgat_list->v_address,
1031                                        tx->tx_wrq[1].scatgat_list->length,
1032                                        tx->tx_wrq[1].scatgat_list->l_key,
1033                                        tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
1034                                        tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
1035                         else
1036                                 CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
1037                                        tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
1038                                        tx->tx_wrq[1].scatgat_list->v_address,
1039                                        tx->tx_wrq[1].scatgat_list->length,
1040                                        tx->tx_wrq[1].scatgat_list->l_key);
1041                 }
1042 #endif           
                     /* rc stays -ECONNABORTED unless the connection is still
                      * ESTABLISHED and the post is actually attempted */
1043                 rc = -ECONNABORTED;
1044                 vvrc = vv_return_ok;
1045                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
1046                         tx->tx_status = 0;
1047                         vvrc = vv_post_send_list(kibnal_data.kib_hca,
1048                                                  conn->ibc_qp,
1049                                                  tx->tx_nwrq,
1050                                                  tx->tx_wrq,
1051                                                  vv_operation_type_send_rc);
1052                         rc = (vvrc == vv_return_ok) ? 0 : -EIO;
1053                 }
1054
1055                 conn->ibc_last_send = jiffies;
1056
1057                 if (rc != 0) {
                             /* Undo the accounting done before the post */
1058                         /* NB credits are transferred in the actual
1059                          * message, which can only be the last work item */
1060                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
1061                         if (consume_cred)
1062                                 conn->ibc_credits++;
1063                         conn->ibc_nsends_posted--;
1064
1065                         tx->tx_status = rc;
1066                         tx->tx_waiting = 0;
1067                         tx->tx_sending--;
1068                         
                             /* Only the path that drops tx_sending to zero may
                              * unlink and complete the tx */
1069                         done = (tx->tx_sending == 0);
1070                         if (done)
1071                                 list_del (&tx->tx_list);
1072                         
1073                         spin_unlock(&conn->ibc_lock);
1074                         
1075                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1076                                 CERROR ("Error %d posting transmit to %s\n", 
1077                                         vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1078                         else
1079                                 CDEBUG (D_NET, "Error %d posting transmit to %s\n",
1080                                         rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1081
1082                         kibnal_close_conn (conn, rc);
1083
1084                         if (done)
1085                                 kibnal_tx_done (tx);
1086                         return;
1087                 }
1088         }
1089
1090         spin_unlock(&conn->ibc_lock);
1091 }
1092
1093 void
1094 kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
1095 {
1096         kib_conn_t   *conn = tx->tx_conn;
1097         int           failed = (vvrc != vv_comp_status_success);
1098         int           idle;
1099
1100         CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n", 
1101                tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);
1102
1103         LASSERT (tx->tx_sending > 0);
1104
1105         if (failed &&
1106             tx->tx_status == 0 &&
1107             conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1108                 CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
1109                        "sending %d waiting %d: failed %d\n", 
1110                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
1111                        tx->tx_msg->ibm_type, tx->tx_cookie,
1112                        tx->tx_sending, tx->tx_waiting, vvrc);
1113
1114         spin_lock(&conn->ibc_lock);
1115
1116         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
1117          * gets to free it, which also drops its ref on 'conn'. */
1118
1119         tx->tx_sending--;
1120         conn->ibc_nsends_posted--;
1121
1122         if (failed) {
1123                 tx->tx_waiting = 0;
1124                 tx->tx_status = -EIO;
1125         }
1126         
1127         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
1128                !tx->tx_waiting &&               /* Not waiting for peer */
1129                !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
1130         if (idle)
1131                 list_del(&tx->tx_list);
1132
1133         kibnal_conn_addref(conn);               /* 1 ref for me.... */
1134
1135         spin_unlock(&conn->ibc_lock);
1136
1137         if (idle)
1138                 kibnal_tx_done (tx);
1139
1140         if (failed) {
1141                 kibnal_close_conn (conn, -EIO);
1142         } else {
1143                 kibnal_peer_alive(conn->ibc_peer);
1144                 kibnal_check_sends(conn);
1145         }
1146
1147         kibnal_conn_decref(conn);               /* ...until here */
1148 }
1149
1150 void
1151 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
1152 {
1153         vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
1154         vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
1155         int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
1156         __u64         addr = (__u64)((unsigned long)((tx)->tx_msg));
1157
1158         LASSERT (tx->tx_nwrq >= 0 && 
1159                  tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
1160         LASSERT (nob <= IBNAL_MSG_SIZE);
1161
1162         kibnal_init_msg(tx->tx_msg, type, body_nob);
1163
1164         *gl = (vv_scatgat_t) {
1165                 .v_address = KIBNAL_ADDR2SG(addr),
1166                 .l_key     = tx->tx_lkey,
1167                 .length    = nob,
1168         };
1169
1170         memset(wrq, 0, sizeof(*wrq));
1171
1172         wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
1173         wrq->wr_type = vv_wr_send;
1174         wrq->scatgat_list = gl;
1175         wrq->num_of_data_segments = 1;
1176         wrq->completion_notification = 1;
1177         wrq->type.send.solicited_event = 1;
1178         wrq->type.send.immidiate_data_indicator = 0;
1179         wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1180         
1181         tx->tx_nwrq++;
1182 }
1183
1184 int
1185 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
1186                   kib_rdma_desc_t *dstrd, __u64 dstcookie)
1187 {
1188         kib_msg_t       *ibmsg = tx->tx_msg;
1189         kib_rdma_desc_t *srcrd = tx->tx_rd;
1190         vv_scatgat_t    *gl;
1191         vv_wr_t         *wrq;
1192         int              rc;
1193
1194 #if IBNAL_USE_FMR
1195         LASSERT (tx->tx_nwrq == 0);
1196
1197         gl = &tx->tx_gl[0];
1198         gl->length    = nob;
1199         gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
1200         gl->l_key     = srcrd->rd_key;
1201
1202         wrq = &tx->tx_wrq[0];
1203
1204         wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1205         wrq->completion_notification = 0;
1206         wrq->scatgat_list = gl;
1207         wrq->num_of_data_segments = 1;
1208         wrq->wr_type = vv_wr_rdma_write;
1209         wrq->type.send.solicited_event = 0;
1210         wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1211         wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
1212         wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1213
1214         tx->tx_nwrq = 1;
1215         rc = nob;
1216 #else
1217         /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
1218         int              resid = nob;
1219         kib_rdma_frag_t *srcfrag;
1220         int              srcidx;
1221         kib_rdma_frag_t *dstfrag;
1222         int              dstidx;
1223         int              wrknob;
1224
1225         /* Called by scheduler */
1226         LASSERT (!in_interrupt());
1227
1228         LASSERT (type == IBNAL_MSG_GET_DONE ||
1229                  type == IBNAL_MSG_PUT_DONE);
1230
1231         srcidx = dstidx = 0;
1232         srcfrag = &srcrd->rd_frags[0];
1233         dstfrag = &dstrd->rd_frags[0];
1234         rc = resid;
1235
1236         while (resid > 0) {
1237                 if (srcidx >= srcrd->rd_nfrag) {
1238                         CERROR("Src buffer exhausted: %d frags\n", srcidx);
1239                         rc = -EPROTO;
1240                         break;
1241                 }
1242                 
1243                 if (dstidx == dstrd->rd_nfrag) {
1244                         CERROR("Dst buffer exhausted: %d frags\n", dstidx);
1245                         rc = -EPROTO;
1246                         break;
1247                 }
1248
1249                 if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
1250                         CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
1251                                srcidx, srcrd->rd_nfrag,
1252                                dstidx, dstrd->rd_nfrag);
1253                         rc = -EMSGSIZE;
1254                         break;
1255                 }
1256
1257                 wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
1258
1259                 gl = &tx->tx_gl[tx->tx_nwrq];
1260                 gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
1261                 gl->length    = wrknob;
1262                 gl->l_key     = srcrd->rd_key;
1263
1264                 wrq = &tx->tx_wrq[tx->tx_nwrq];
1265
1266                 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1267                 wrq->completion_notification = 0;
1268                 wrq->scatgat_list = gl;
1269                 wrq->num_of_data_segments = 1;
1270                 wrq->wr_type = vv_wr_rdma_write;
1271                 wrq->type.send.solicited_event = 0;
1272                 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1273                 wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
1274                 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1275
1276                 resid -= wrknob;
1277                 if (wrknob < srcfrag->rf_nob) {
1278                         kibnal_rf_set(srcfrag, 
1279                                       kibnal_rf_addr(srcfrag) + wrknob, 
1280                                       srcfrag->rf_nob - wrknob);
1281                 } else {
1282                         srcfrag++;
1283                         srcidx++;
1284                 }
1285                 
1286                 if (wrknob < dstfrag->rf_nob) {
1287                         kibnal_rf_set(dstfrag,
1288                                       kibnal_rf_addr(dstfrag) + wrknob,
1289                                       dstfrag->rf_nob - wrknob);
1290                 } else {
1291                         dstfrag++;
1292                         dstidx++;
1293                 }
1294                 
1295                 tx->tx_nwrq++;
1296         }
1297
1298         if (rc < 0)                             /* no RDMA if completing with failure */
1299                 tx->tx_nwrq = 0;
1300 #endif
1301         
1302         ibmsg->ibm_u.completion.ibcm_status = rc;
1303         ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
1304         kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
1305
1306         return rc;
1307 }
1308
1309 void
1310 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
1311 {
1312         spin_lock(&conn->ibc_lock);
1313         kibnal_queue_tx_locked (tx, conn);
1314         spin_unlock(&conn->ibc_lock);
1315         
1316         kibnal_check_sends(conn);
1317 }
1318
1319 void
1320 kibnal_schedule_peer_arp (kib_peer_t *peer)
1321 {
1322         unsigned long flags;
1323
1324         LASSERT (peer->ibp_connecting != 0);
1325         LASSERT (peer->ibp_arp_count > 0);
1326
1327         kibnal_peer_addref(peer); /* extra ref for connd */
1328
1329         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1330
1331         list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
1332         wake_up (&kibnal_data.kib_connd_waitq);
1333
1334         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1335 }
1336
1337 void
1338 kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
1339 {
1340         kib_peer_t      *peer;
1341         kib_conn_t      *conn;
1342         unsigned long    flags;
1343         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
1344         int              retry;
1345         int              rc;
1346
1347         /* If I get here, I've committed to send, so I complete the tx with
1348          * failure on any problems */
1349         
1350         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
1351         LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
1352
1353         for (retry = 0; ; retry = 1) {
1354                 read_lock_irqsave(g_lock, flags);
1355         
1356                 peer = kibnal_find_peer_locked (nid);
1357                 if (peer != NULL) {
1358                         conn = kibnal_find_conn_locked (peer);
1359                         if (conn != NULL) {
1360                                 kibnal_conn_addref(conn); /* 1 ref for me... */
1361                                 read_unlock_irqrestore(g_lock, flags);
1362
1363                                 kibnal_queue_tx (tx, conn);
1364                                 kibnal_conn_decref(conn); /* ...to here */
1365                                 return;
1366                         }
1367                 }
1368                 
1369                 /* Making one or more connections; I'll need a write lock... */
1370                 read_unlock(g_lock);
1371                 write_lock(g_lock);
1372
1373                 peer = kibnal_find_peer_locked (nid);
1374                 if (peer != NULL)
1375                         break;
1376
1377                 write_unlock_irqrestore(g_lock, flags);
1378
1379                 if (retry) {
1380                         CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
1381
1382                         tx->tx_status = -EHOSTUNREACH;
1383                         tx->tx_waiting = 0;
1384                         kibnal_tx_done (tx);
1385                         return;
1386                 }
1387
1388                 rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
1389                 if (rc != 0) {
1390                         CERROR("Can't add peer %s: %d\n",
1391                                libcfs_nid2str(nid), rc);
1392                         
1393                         tx->tx_status = -EHOSTUNREACH;
1394                         tx->tx_waiting = 0;
1395                         kibnal_tx_done (tx);
1396                         return;
1397                 }
1398         }
1399
1400         conn = kibnal_find_conn_locked (peer);
1401         if (conn != NULL) {
1402                 /* Connection exists; queue message on it */
1403                 kibnal_conn_addref(conn);       /* 1 ref for me... */
1404                 write_unlock_irqrestore(g_lock, flags);
1405                 
1406                 kibnal_queue_tx (tx, conn);
1407                 kibnal_conn_decref(conn);       /* ...until here */
1408                 return;
1409         }
1410
1411         if (peer->ibp_connecting == 0 &&
1412             peer->ibp_accepting == 0) {
1413                 if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
1414                       time_after_eq(jiffies, peer->ibp_reconnect_time))) {
1415                         write_unlock_irqrestore(g_lock, flags);
1416                         tx->tx_status = -EHOSTUNREACH;
1417                         tx->tx_waiting = 0;
1418                         kibnal_tx_done (tx);
1419                         return;
1420                 }
1421
1422                 peer->ibp_connecting = 1;
1423                 peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
1424                 kibnal_schedule_peer_arp(peer);
1425         }
1426         
1427         /* A connection is being established; queue the message... */
1428         list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1429
1430         write_unlock_irqrestore(g_lock, flags);
1431 }
1432
1433 int
1434 kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
1435 {
1436         lnet_hdr_t       *hdr = &lntmsg->msg_hdr; 
1437         int               type = lntmsg->msg_type; 
1438         lnet_process_id_t target = lntmsg->msg_target;
1439         int               target_is_router = lntmsg->msg_target_is_router;
1440         int               routing = lntmsg->msg_routing;
1441         unsigned int      payload_niov = lntmsg->msg_niov; 
1442         struct iovec     *payload_iov = lntmsg->msg_iov; 
1443         lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
1444         unsigned int      payload_offset = lntmsg->msg_offset;
1445         unsigned int      payload_nob = lntmsg->msg_len;
1446         kib_msg_t        *ibmsg;
1447         kib_tx_t         *tx;
1448         int               nob;
1449         int               rc;
1450
1451         /* NB 'private' is different depending on what we're sending.... */
1452
1453         CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1454                payload_nob, payload_niov, libcfs_id2str(target));
1455
1456         LASSERT (payload_nob == 0 || payload_niov > 0);
1457         LASSERT (payload_niov <= LNET_MAX_IOV);
1458
1459         /* Thread context */
1460         LASSERT (!in_interrupt());
1461         /* payload is either all vaddrs or all pages */
1462         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1463
1464         switch (type) {
1465         default:
1466                 LBUG();
1467                 return (-EIO);
1468                 
1469         case LNET_MSG_ACK:
1470                 LASSERT (payload_nob == 0);
1471                 break;
1472
1473         case LNET_MSG_GET:
1474                 if (routing || target_is_router)
1475                         break;                  /* send IMMEDIATE */
1476                 
1477                 /* is the REPLY message too small for RDMA? */
1478                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1479                 if (nob <= IBNAL_MSG_SIZE)
1480                         break;                  /* send IMMEDIATE */
1481
1482                 tx = kibnal_get_idle_tx();
1483                 if (tx == NULL) {
1484                         CERROR("Can allocate txd for GET to %s: \n",
1485                                libcfs_nid2str(target.nid));
1486                         return -ENOMEM;
1487                 }
1488                 
1489                 ibmsg = tx->tx_msg;
1490                 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1491                 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
1492
1493                 if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
1494                         rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1495                                                  vv_acc_r_mem_write,
1496                                                  lntmsg->msg_md->md_niov,
1497                                                  lntmsg->msg_md->md_iov.iov,
1498                                                  0, lntmsg->msg_md->md_length);
1499                 else
1500                         rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1501                                                   vv_acc_r_mem_write,
1502                                                   lntmsg->msg_md->md_niov,
1503                                                   lntmsg->msg_md->md_iov.kiov,
1504                                                   0, lntmsg->msg_md->md_length);
1505                 if (rc != 0) {
1506                         CERROR("Can't setup GET sink for %s: %d\n",
1507                                libcfs_nid2str(target.nid), rc);
1508                         kibnal_tx_done(tx);
1509                         return -EIO;
1510                 }
1511
1512 #if IBNAL_USE_FMR
1513                 nob = sizeof(kib_get_msg_t);
1514 #else
1515                 {
1516                         int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1517                         
1518                         nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1519                 }
1520 #endif
1521                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
1522
1523                 tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
1524                                                          lntmsg);
1525                 if (tx->tx_lntmsg[1] == NULL) {
1526                         CERROR("Can't create reply for GET -> %s\n",
1527                                libcfs_nid2str(target.nid));
1528                         kibnal_tx_done(tx);
1529                         return -EIO;
1530                 }
1531
1532                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
1533                 tx->tx_waiting = 1;             /* waiting for GET_DONE */
1534                 kibnal_launch_tx(tx, target.nid);
1535                 return 0;
1536
1537         case LNET_MSG_REPLY:
1538         case LNET_MSG_PUT:
1539                 /* Is the payload small enough not to need RDMA? */
1540                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1541                 if (nob <= IBNAL_MSG_SIZE)
1542                         break;                  /* send IMMEDIATE */
1543
1544                 tx = kibnal_get_idle_tx();
1545                 if (tx == NULL) {
1546                         CERROR("Can't allocate %s txd for %s\n",
1547                                type == LNET_MSG_PUT ? "PUT" : "REPLY",
1548                                libcfs_nid2str(target.nid));
1549                         return -ENOMEM;
1550                 }
1551
1552                 if (payload_kiov == NULL)
1553                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1554                                                  payload_niov, payload_iov,
1555                                                  payload_offset, payload_nob);
1556                 else
1557                         rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1558                                                   payload_niov, payload_kiov,
1559                                                   payload_offset, payload_nob);
1560                 if (rc != 0) {
1561                         CERROR("Can't setup PUT src for %s: %d\n",
1562                                libcfs_nid2str(target.nid), rc);
1563                         kibnal_tx_done(tx);
1564                         return -EIO;
1565                 }
1566
1567                 ibmsg = tx->tx_msg;
1568                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1569                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1570                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1571
1572                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1573                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1574                 kibnal_launch_tx(tx, target.nid);
1575                 return 0;
1576         }
1577
1578         /* send IMMEDIATE */
1579
1580         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1581                  <= IBNAL_MSG_SIZE);
1582
1583         tx = kibnal_get_idle_tx();
1584         if (tx == NULL) {
1585                 CERROR ("Can't send %d to %s: tx descs exhausted\n",
1586                         type, libcfs_nid2str(target.nid));
1587                 return -ENOMEM;
1588         }
1589
1590         ibmsg = tx->tx_msg;
1591         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1592
1593         if (payload_kiov != NULL)
1594                 lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
1595                                     offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1596                                     payload_niov, payload_kiov,
1597                                     payload_offset, payload_nob);
1598         else
1599                 lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
1600                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1601                                    payload_niov, payload_iov,
1602                                    payload_offset, payload_nob);
1603
1604         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1605         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1606
1607         tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
1608         kibnal_launch_tx(tx, target.nid);
1609         return 0;
1610 }
1611
1612 void
1613 kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
1614 {
1615         lnet_process_id_t target = lntmsg->msg_target;
1616         unsigned int      niov = lntmsg->msg_niov; 
1617         struct iovec     *iov = lntmsg->msg_iov; 
1618         lnet_kiov_t      *kiov = lntmsg->msg_kiov;
1619         unsigned int      offset = lntmsg->msg_offset;
1620         unsigned int      nob = lntmsg->msg_len;
1621         kib_tx_t         *tx;
1622         int               rc;
1623         
1624         tx = kibnal_get_idle_tx();
1625         if (tx == NULL) {
1626                 CERROR("Can't get tx for REPLY to %s\n",
1627                        libcfs_nid2str(target.nid));
1628                 goto failed_0;
1629         }
1630
1631         if (nob == 0)
1632                 rc = 0;
1633         else if (kiov == NULL)
1634                 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
1635                                          niov, iov, offset, nob);
1636         else
1637                 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1638                                           niov, kiov, offset, nob);
1639
1640         if (rc != 0) {
1641                 CERROR("Can't setup GET src for %s: %d\n",
1642                        libcfs_nid2str(target.nid), rc);
1643                 goto failed_1;
1644         }
1645         
1646         rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
1647                               &rx->rx_msg->ibm_u.get.ibgm_rd,
1648                               rx->rx_msg->ibm_u.get.ibgm_cookie);
1649         if (rc < 0) {
1650                 CERROR("Can't setup rdma for GET from %s: %d\n", 
1651                        libcfs_nid2str(target.nid), rc);
1652                 goto failed_1;
1653         }
1654         
1655         if (rc == 0) {
1656                 /* No RDMA: local completion may happen now! */
1657                 lnet_finalize(ni, lntmsg, 0);
1658         } else {
1659                 /* RDMA: lnet_finalize(lntmsg) when it
1660                  * completes */
1661                 tx->tx_lntmsg[0] = lntmsg;
1662         }
1663         
1664         kibnal_queue_tx(tx, rx->rx_conn);
1665         return;
1666         
1667  failed_1:
1668         kibnal_tx_done(tx);
1669  failed_0:
1670         lnet_finalize(ni, lntmsg, -EIO);
1671 }
1672
1673 int
1674 kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1675                    void **new_private)
1676 {
1677         kib_rx_t    *rx = private;
1678         kib_conn_t  *conn = rx->rx_conn;
1679
1680         if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
1681                 /* Can't block if RDMA completions need normal credits */
1682                 LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers"
1683                                    " free. %s is running an old version of LNET "
1684                                    "that may deadlock if messages wait for"
1685                                    "buffers) \n", 
1686                                    libcfs_nid2str(conn->ibc_peer->ibp_nid),
1687                                    libcfs_nid2str(conn->ibc_peer->ibp_nid));
1688                 return -EDEADLK;
1689         }
1690         
1691         *new_private = private;
1692         return 0;
1693 }
1694
1695 int
1696 kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
1697              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
1698              unsigned int offset, unsigned int mlen, unsigned int rlen)
1699 {
1700         kib_rx_t    *rx = private;
1701         kib_msg_t   *rxmsg = rx->rx_msg;
1702         kib_conn_t  *conn = rx->rx_conn;
1703         kib_tx_t    *tx;
1704         kib_msg_t   *txmsg;
1705         int          nob;
1706         int          post_cred = 1;
1707         int          rc = 0;
1708         
1709         LASSERT (mlen <= rlen);
1710         LASSERT (!in_interrupt());
1711         /* Either all pages or all vaddrs */
1712         LASSERT (!(kiov != NULL && iov != NULL));
1713
1714         switch (rxmsg->ibm_type) {
1715         default:
1716                 LBUG();
1717                 
1718         case IBNAL_MSG_IMMEDIATE:
1719                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1720                 if (nob > rx->rx_nob) {
1721                         CERROR ("Immediate message from %s too big: %d(%d)\n",
1722                                 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
1723                                 nob, rx->rx_nob);
1724                         rc = -EPROTO;
1725                         break;
1726                 }
1727
1728                 if (kiov != NULL)
1729                         lnet_copy_flat2kiov(niov, kiov, offset,
1730                                             IBNAL_MSG_SIZE, rxmsg,
1731                                             offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1732                                             mlen);
1733                 else
1734                         lnet_copy_flat2iov(niov, iov, offset,
1735                                            IBNAL_MSG_SIZE, rxmsg,
1736                                            offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1737                                            mlen);
1738                 lnet_finalize (ni, lntmsg, 0);
1739                 break;
1740
1741         case IBNAL_MSG_PUT_REQ:
1742                 if (mlen == 0) {
1743                         lnet_finalize(ni, lntmsg, 0);
1744                         kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
1745                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1746                         break;
1747                 }
1748                 
1749                 tx = kibnal_get_idle_tx();
1750                 if (tx == NULL) {
1751                         CERROR("Can't allocate tx for %s\n",
1752                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
1753                         /* Not replying will break the connection */
1754                         rc = -ENOMEM;
1755                         break;
1756                 }
1757
1758                 txmsg = tx->tx_msg;
1759                 if (kiov == NULL)
1760                         rc = kibnal_setup_rd_iov(tx, 
1761                                                  &txmsg->ibm_u.putack.ibpam_rd,
1762                                                  vv_acc_r_mem_write,
1763                                                  niov, iov, offset, mlen);
1764                 else
1765                         rc = kibnal_setup_rd_kiov(tx,
1766                                                   &txmsg->ibm_u.putack.ibpam_rd,
1767                                                   vv_acc_r_mem_write,
1768                                                   niov, kiov, offset, mlen);
1769                 if (rc != 0) {
1770                         CERROR("Can't setup PUT sink for %s: %d\n",
1771                                libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1772                         kibnal_tx_done(tx);
1773                         /* tell peer it's over */
1774                         kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc,
1775                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1776                         break;
1777                 }
1778
1779                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1780                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1781 #if IBNAL_USE_FMR
1782                 nob = sizeof(kib_putack_msg_t);
1783 #else
1784                 {
1785                         int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1786
1787                         nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1788                 }
1789 #endif
1790                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1791
1792                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1793                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1794                 kibnal_queue_tx(tx, conn);
1795
1796                 if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
1797                         post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
1798                 break;
1799
1800         case IBNAL_MSG_GET_REQ:
1801                 if (lntmsg != NULL) {
1802                         /* Optimized GET; RDMA lntmsg's payload */
1803                         kibnal_reply(ni, rx, lntmsg);
1804                 } else {
1805                         /* GET didn't match anything */
1806                         kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, 
1807                                                -ENODATA,
1808                                                rxmsg->ibm_u.get.ibgm_cookie);
1809                 }
1810                 break;
1811         }
1812
1813         kibnal_post_rx(rx, post_cred, 0);
1814         return rc;
1815 }
1816
1817 int
1818 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1819 {
1820         long    pid = kernel_thread (fn, arg, 0);
1821
1822         if (pid < 0)
1823                 return ((int)pid);
1824
1825         atomic_inc (&kibnal_data.kib_nthreads);
1826         return (0);
1827 }
1828
1829 void
1830 kibnal_thread_fini (void)
1831 {
1832         atomic_dec (&kibnal_data.kib_nthreads);
1833 }
1834
1835 void
1836 kibnal_peer_alive (kib_peer_t *peer)
1837 {
1838         /* This is racy, but everyone's only writing cfs_time_current() */
1839         peer->ibp_last_alive = cfs_time_current();
1840         mb();
1841 }
1842
1843 void
1844 kibnal_peer_notify (kib_peer_t *peer)
1845 {
1846         time_t        last_alive = 0;
1847         int           error = 0;
1848         unsigned long flags;
1849         
1850         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1851
1852         if (list_empty(&peer->ibp_conns) &&
1853             peer->ibp_accepting == 0 &&
1854             peer->ibp_connecting == 0 &&
1855             peer->ibp_error != 0) {
1856                 error = peer->ibp_error;
1857                 peer->ibp_error = 0;
1858                 
1859                 last_alive = cfs_time_current_sec() -
1860                              cfs_duration_sec(cfs_time_current() -
1861                                               peer->ibp_last_alive);
1862         }
1863         
1864         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1865         
1866         if (error != 0)
1867                 lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
1868 }
1869
1870 void
1871 kibnal_schedule_conn (kib_conn_t *conn)
1872 {
1873         unsigned long flags;
1874
1875         kibnal_conn_addref(conn);               /* ++ref for connd */
1876         
1877         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1878
1879         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1880         wake_up (&kibnal_data.kib_connd_waitq);
1881                 
1882         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1883 }
1884
1885 void
1886 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1887 {
1888         /* This just does the immediate housekeeping.  'error' is zero for a
1889          * normal shutdown which can happen only after the connection has been
1890          * established.  If the connection is established, schedule the
1891          * connection to be finished off by the connd.  Otherwise the connd is
1892          * already dealing with it (either to set it up or tear it down).
1893          * Caller holds kib_global_lock exclusively in irq context */
1894         kib_peer_t       *peer = conn->ibc_peer;
1895         
1896         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1897
1898         if (error != 0 && conn->ibc_comms_error == 0)
1899                 conn->ibc_comms_error = error;
1900
1901         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1902                 return; /* already being handled  */
1903         
1904         /* NB Can't take ibc_lock here (could be in IRQ context), without
1905          * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
1906
1907         if (error == 0 &&
1908             list_empty(&conn->ibc_tx_queue) &&
1909             list_empty(&conn->ibc_tx_queue_rsrvd) &&
1910             list_empty(&conn->ibc_tx_queue_nocred) &&
1911             list_empty(&conn->ibc_active_txs)) {
1912                 CDEBUG(D_NET, "closing conn to %s"
1913                        " rx# "LPD64" tx# "LPD64"\n", 
1914                        libcfs_nid2str(peer->ibp_nid),
1915                        conn->ibc_txseq, conn->ibc_rxseq);
1916         } else {
1917                 CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
1918                        " rx# "LPD64" tx# "LPD64"\n",
1919                        libcfs_nid2str(peer->ibp_nid), error,
1920                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1921                        list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1922                        list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1923                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1924                        conn->ibc_txseq, conn->ibc_rxseq);
1925         }
1926
1927         list_del (&conn->ibc_list);
1928
1929         if (list_empty (&peer->ibp_conns)) {   /* no more conns */
1930                 if (peer->ibp_persistence == 0 && /* non-persistent peer */
1931                     kibnal_peer_active(peer))     /* still in peer table */
1932                         kibnal_unlink_peer_locked (peer);
1933
1934                 /* set/clear error on last conn */
1935                 peer->ibp_error = conn->ibc_comms_error;
1936         }
1937
1938         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1939
1940         kibnal_schedule_conn(conn);
1941         kibnal_conn_decref(conn);               /* lose ibc_list's ref */
1942 }
1943
1944 void
1945 kibnal_close_conn (kib_conn_t *conn, int error)
1946 {
1947         unsigned long flags;
1948         
1949         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1950
1951         kibnal_close_conn_locked (conn, error);
1952         
1953         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1954 }
1955
1956 void
1957 kibnal_handle_early_rxs(kib_conn_t *conn)
1958 {
1959         unsigned long    flags;
1960         kib_rx_t        *rx;
1961
1962         LASSERT (!in_interrupt());
1963         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1964         
1965         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1966         while (!list_empty(&conn->ibc_early_rxs)) {
1967                 rx = list_entry(conn->ibc_early_rxs.next,
1968                                 kib_rx_t, rx_list);
1969                 list_del(&rx->rx_list);
1970                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1971                 
1972                 kibnal_handle_rx(rx);
1973                 
1974                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1975         }
1976         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1977 }
1978
1979 void
1980 kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
1981 {
1982         LIST_HEAD           (zombies); 
1983         struct list_head    *tmp;
1984         struct list_head    *nxt;
1985         kib_tx_t            *tx;
1986
1987         spin_lock(&conn->ibc_lock);
1988
1989         list_for_each_safe (tmp, nxt, txs) {
1990                 tx = list_entry (tmp, kib_tx_t, tx_list);
1991
1992                 if (txs == &conn->ibc_active_txs) {
1993                         LASSERT (!tx->tx_queued);
1994                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
1995                 } else {
1996                         LASSERT (tx->tx_queued);
1997                 }
1998                 
1999                 tx->tx_status = -ECONNABORTED;
2000                 tx->tx_queued = 0;
2001                 tx->tx_waiting = 0;
2002                 
2003                 if (tx->tx_sending == 0) {
2004                         list_del (&tx->tx_list);
2005                         list_add (&tx->tx_list, &zombies);
2006                 }
2007         }
2008
2009         spin_unlock(&conn->ibc_lock);
2010
2011         kibnal_txlist_done(&zombies, -ECONNABORTED);
2012 }
2013
2014 void
2015 kibnal_conn_disconnected(kib_conn_t *conn)
2016 {
2017         /* I'm the connd */
2018         LASSERT (!in_interrupt());
2019         LASSERT (current == kibnal_data.kib_connd);
2020         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
2021         
2022         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
2023
2024         /* move QP to error state to make posted work items complete */
2025         kibnal_set_qp_state(conn, vv_qp_state_error);
2026
2027         /* Complete all tx descs not waiting for sends to complete.
2028          * NB we should be safe from RDMA now that the QP has changed state */
2029
2030         kibnal_abort_txs(conn, &conn->ibc_tx_queue);
2031         kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
2032         kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
2033         kibnal_abort_txs(conn, &conn->ibc_active_txs);
2034
2035         kibnal_handle_early_rxs(conn);
2036
2037         kibnal_peer_notify(conn->ibc_peer);
2038 }
2039
2040 void
2041 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
2042 {
2043         LIST_HEAD        (zombies);
2044         unsigned long     flags;
2045
2046         /* Only the connd creates conns => single threaded */
2047         LASSERT (error != 0);
2048         LASSERT (!in_interrupt());
2049         LASSERT (current == kibnal_data.kib_connd);
2050
2051         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2052
2053         if (active) {
2054                 LASSERT (peer->ibp_connecting != 0);
2055                 peer->ibp_connecting--;
2056         } else {
2057                 LASSERT (peer->ibp_accepting != 0);
2058                 peer->ibp_accepting--;
2059         }
2060         
2061         if (peer->ibp_connecting != 0 ||
2062             peer->ibp_accepting != 0) {
2063                 /* another connection attempt under way (loopback?)... */
2064                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2065                 return;
2066         }
2067
2068         if (list_empty(&peer->ibp_conns)) {
2069                 /* Say when active connection can be re-attempted */
2070                 peer->ibp_reconnect_interval *= 2;
2071                 peer->ibp_reconnect_interval =
2072                         MAX(peer->ibp_reconnect_interval,
2073                             *kibnal_tunables.kib_min_reconnect_interval);
2074                 peer->ibp_reconnect_interval =
2075                         MIN(peer->ibp_reconnect_interval,
2076                             *kibnal_tunables.kib_max_reconnect_interval);
2077                 
2078                 peer->ibp_reconnect_time = jiffies + 
2079                                            peer->ibp_reconnect_interval * HZ;
2080
2081                 /* Take peer's blocked transmits to complete with error */
2082                 list_add(&zombies, &peer->ibp_tx_queue);
2083                 list_del_init(&peer->ibp_tx_queue);
2084                 
2085                 if (kibnal_peer_active(peer) &&
2086                     (peer->ibp_persistence == 0)) {
2087                         /* failed connection attempt on non-persistent peer */
2088                         kibnal_unlink_peer_locked (peer);
2089                 }
2090
2091                 peer->ibp_error = error;
2092         } else {
2093                 /* Can't have blocked transmits if there are connections */
2094                 LASSERT (list_empty(&peer->ibp_tx_queue));
2095         }
2096         
2097         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2098
2099         kibnal_peer_notify(peer);
2100
2101         if (list_empty (&zombies)) 
2102                 return;
2103         
2104         CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
2105                 libcfs_nid2str(peer->ibp_nid));
2106
2107         kibnal_txlist_done(&zombies, -EHOSTUNREACH);
2108 }
2109
2110 void
2111 kibnal_reject(cm_cep_handle_t cep, int why)
2112 {
2113         static cm_reject_data_t   rejs[3];
2114         cm_reject_data_t         *rej = &rejs[why];
2115
2116         LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
2117
2118         /* If I wasn't so lazy, I'd initialise this only once; it's effective
2119          * read-only */
2120         rej->reason = cm_rej_code_usr_rej;
2121         rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
2122         rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
2123         rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
2124         rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
2125         rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
2126         rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
2127         rej->priv_data[6] = why;
2128
2129         cm_reject(cep, rej);
2130 }
2131
2132 void
2133 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2134 {
2135         struct list_head   txs;
2136         kib_peer_t        *peer = conn->ibc_peer;
2137         unsigned long      flags;
2138         kib_tx_t          *tx;
2139
2140         CDEBUG(D_NET,"%d\n", status);
2141
2142         /* Only the connd creates conns => single threaded */
2143         LASSERT (!in_interrupt());
2144         LASSERT (current == kibnal_data.kib_connd);
2145         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2146
2147         if (active) {
2148                 LASSERT (peer->ibp_connecting > 0);
2149         } else {
2150                 LASSERT (peer->ibp_accepting > 0);
2151         }
2152         
2153         LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2154         conn->ibc_connvars = NULL;
2155
2156         if (status != 0) {
2157                 /* failed to establish connection */
2158                 switch (conn->ibc_state) {
2159                 default:
2160                         LBUG();
2161
2162                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2163                         /* got a connection reply but failed checks */
2164                         LASSERT (active);
2165                         kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
2166                         break;
2167
2168                 case IBNAL_CONN_ACTIVE_CONNECT:
2169                         LASSERT (active);
2170                         cm_cancel(conn->ibc_cep);
2171                         cfs_pause(cfs_time_seconds(1)/10);
2172                         /* cm_connect() failed immediately or
2173                          * callback returned failure */
2174                         break;
2175
2176                 case IBNAL_CONN_ACTIVE_ARP:
2177                         LASSERT (active);
2178                         /* ibat_get_ib_data() failed immediately 
2179                          * or callback returned failure */
2180                         break;
2181
2182                 case IBNAL_CONN_INIT:
2183                         break;
2184
2185                 case IBNAL_CONN_PASSIVE_WAIT:
2186                         LASSERT (!active);
2187                         /* cm_accept callback returned failure */
2188                         break;
2189                 }
2190
2191                 kibnal_peer_connect_failed(peer, active, status);
2192                 kibnal_conn_disconnected(conn);
2193                 return;
2194         }
2195
2196         /* connection established */
2197         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2198
2199         if (active) {
2200                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2201         } else {
2202                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2203         }
2204         
2205         conn->ibc_last_send = jiffies;
2206         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2207         kibnal_peer_alive(peer);
2208
2209         /* Add conn to peer's list and nuke any dangling conns from a different
2210          * peer instance... */
2211         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2212         list_add(&conn->ibc_list, &peer->ibp_conns);
2213         kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
2214
2215         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2216             conn->ibc_comms_error != 0 ||       /* comms error */
2217             conn->ibc_disconnect) {             /* need to disconnect */
2218                 
2219                 /* start to shut down connection */
2220                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2221
2222                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2223                 kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
2224                 return;
2225         }
2226
2227         if (active)
2228                 peer->ibp_connecting--;
2229         else
2230                 peer->ibp_accepting--;
2231
2232         /* grab pending txs while I have the lock */
2233         list_add(&txs, &peer->ibp_tx_queue);
2234         list_del_init(&peer->ibp_tx_queue);
2235         
2236         peer->ibp_reconnect_interval = 0;       /* OK to reconnect at any time */
2237
2238         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2239
2240         /* Schedule blocked txs */
2241         spin_lock (&conn->ibc_lock);
2242         while (!list_empty (&txs)) {
2243                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2244                 list_del (&tx->tx_list);
2245
2246                 kibnal_queue_tx_locked (tx, conn);
2247         }
2248         spin_unlock (&conn->ibc_lock);
2249         kibnal_check_sends (conn);
2250
2251         /* schedule blocked rxs */
2252         kibnal_handle_early_rxs(conn);
2253 }
2254
2255 void
2256 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2257 {
2258         static cm_dreply_data_t drep;           /* just zeroed space */
2259         
2260         kib_conn_t             *conn = (kib_conn_t *)arg;
2261         unsigned long           flags;
2262         
2263         /* CAVEAT EMPTOR: tasklet context */
2264
2265         switch (cmdata->status) {
2266         default:
2267                 LBUG();
2268                 
2269         case cm_event_disconn_request:
2270                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2271                  * IBNAL_CONN_ESTABLISHED: I start it closing
2272                  * otherwise:              it's closing anyway */
2273                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2274                 cm_cancel(conn->ibc_cep);
2275
2276                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2277                 LASSERT (!conn->ibc_disconnect);
2278                 conn->ibc_disconnect = 1;
2279
2280                 switch (conn->ibc_state) {
2281                 default:
2282                         LBUG();
2283
2284                 case IBNAL_CONN_ACTIVE_RTU:
2285                         /* kibnal_connreq_done is getting there; It'll see
2286                          * ibc_disconnect set... */
2287                         break;
2288
2289                 case IBNAL_CONN_ESTABLISHED:
2290                         /* kibnal_connreq_done got there already; get
2291                          * disconnect going... */
2292                         kibnal_close_conn_locked(conn, 0);
2293                         break;
2294
2295                 case IBNAL_CONN_DISCONNECT1:
2296                         /* kibnal_disconnect_conn is getting there; It'll see
2297                          * ibc_disconnect set... */
2298                         break;
2299
2300                 case IBNAL_CONN_DISCONNECT2:
2301                         /* kibnal_disconnect_conn got there already; complete
2302                          * the disconnect. */
2303                         kibnal_schedule_conn(conn);
2304                         break;
2305                 }
2306                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2307                 break;
2308                 
2309         case cm_event_disconn_timeout:
2310         case cm_event_disconn_reply:
2311                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2312                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2313                 LASSERT (!conn->ibc_disconnect);
2314                 conn->ibc_disconnect = 1;
2315
2316                 /* kibnal_disconnect_conn sent the disconnect request. */
2317                 kibnal_schedule_conn(conn);
2318
2319                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2320                 break;
2321                 
2322         case cm_event_connected:
2323         case cm_event_conn_timeout:
2324         case cm_event_conn_reject:
2325                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2326                 conn->ibc_connvars->cv_conndata = *cmdata;
2327
2328                 kibnal_schedule_conn(conn);
2329                 break;
2330         }
2331
2332         kibnal_conn_decref(conn); /* lose my ref */
2333 }
2334
2335 void
2336 kibnal_check_passive_wait(kib_conn_t *conn)
2337 {
2338         int     rc;
2339
2340         switch (conn->ibc_connvars->cv_conndata.status) {
2341         default:
2342                 LBUG();
2343                 
2344         case cm_event_connected:
2345                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2346                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2347                 if (rc != 0)
2348                         conn->ibc_comms_error = rc;
2349                 /* connection _has_ been established; it's just that we've had
2350                  * an error immediately... */
2351                 kibnal_connreq_done(conn, 0, 0);
2352                 break;
2353                 
2354         case cm_event_conn_timeout:
2355                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2356                 break;
2357                 
2358         case cm_event_conn_reject:
2359                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2360                 break;
2361         }
2362 }
2363
/*
 * Handle one incoming (passive) connection request on the connd thread.
 *
 * The request is checked in stages: the CM service ID, then the message
 * carried in the CM private data (it must unpack, be a CONNREQ addressed to
 * this NI, and advertise a compatible queue depth, message size and
 * fragment count).  A peer for the sender's NID is then found or created,
 * a conn is created on 'cep', receives are posted, the QP is taken through
 * INIT and RTR, and the request is answered with cm_accept() carrying a
 * CONNACK in the reply's private data.
 *
 * On any failure the request is rejected with kibnal_reject().  If a conn
 * already exists it is completed with the error via kibnal_connreq_done();
 * otherwise the CEP is destroyed here.
 *
 * cep    CM endpoint the request arrived on
 * cmreq  CM request data, including the private-data payload
 */
void
kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
{
        /* static scratch buffers; see the connd-only assertion below */
        static kib_msg_t        txmsg;
        static kib_msg_t        rxmsg;
        static cm_reply_data_t  reply;

        kib_conn_t         *conn = NULL;
        int                 rc = 0;
        int                 reason;
        int                 rxmsgnob;
        rwlock_t           *g_lock = &kibnal_data.kib_global_lock;
        kib_peer_t         *peer;
        kib_peer_t         *peer2;
        unsigned long       flags;
        kib_connvars_t     *cv;
        cm_return_t         cmrc;
        vv_return_t         vvrc;
        
        /* I'm the connd executing in thread context
         * No concurrency problems with static data! */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
                CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
                       cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* copy into rxmsg to avoid alignment issues */
        rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
        memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);

        rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
        if (rc != 0) {
                /* SILENT! kibnal_unpack_msg() complains if required */
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* an older protocol version is only warned about, not rejected */
        if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
                CWARN("Connection from %s: old protocol version 0x%x\n",
                      libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);

        if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
                CERROR("Unexpected connreq msg type: %x from %s\n",
                       rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
                                     rxmsg.ibm_dstnid)) {
                CERROR("Can't accept %s: bad dst nid %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), 
                       libcfs_nid2str(rxmsg.ibm_dstnid));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* queue depth must match exactly; the sizes below are upper bounds */
        if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), 
                       rxmsg.ibm_u.connparams.ibcp_queue_depth, 
                       IBNAL_MSG_QUEUE_SIZE);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
                CERROR("Can't accept %s: message size %d too big (%d max)\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), 
                       rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
                       IBNAL_MSG_SIZE);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }
                
        if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
                CERROR("Can't accept %s: max frags %d too big (%d max)\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), 
                       rxmsg.ibm_u.connparams.ibcp_max_frags, 
                       IBNAL_MAX_RDMA_FRAGS);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }
        
        /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
        rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
        if (rc != 0) {
                CERROR("Can't create peer for %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                reason = IBNAL_REJECT_NO_RESOURCES;
                goto reject;
        }

        write_lock_irqsave(g_lock, flags);

        /* a NULL listen handle is treated as "shutdown in progress" */
        if (kibnal_data.kib_listen_handle == NULL) {
                write_unlock_irqrestore(g_lock, flags);

                CWARN ("Shutdown has started, rejecting connreq from %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                kibnal_peer_decref(peer);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
        if (peer2 != NULL) {
                /* tie-break connection race in favour of the higher NID */                
                if (peer2->ibp_connecting != 0 &&
                    rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
                        write_unlock_irqrestore(g_lock, flags);

                        CWARN("Conn race %s\n",
                              libcfs_nid2str(peer2->ibp_nid));

                        kibnal_peer_decref(peer);
                        reason = IBNAL_REJECT_CONN_RACE;
                        goto reject;
                }

                /* use the peer already in the table: count this accept on
                 * it, take a ref on it and drop the speculative new peer */
                peer2->ibp_accepting++;
                kibnal_peer_addref(peer2);

                write_unlock_irqrestore(g_lock, flags);
                kibnal_peer_decref(peer);
                peer = peer2;
        } else {
                /* Brand new peer */
                LASSERT (peer->ibp_accepting == 0);
                peer->ibp_accepting = 1;

                /* extra ref taken before linking the peer into the table */
                kibnal_peer_addref(peer);
                list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));

                write_unlock_irqrestore(g_lock, flags);
        }
                
        conn = kibnal_create_conn(cep);
        if (conn == NULL) {
                CERROR("Can't create conn for %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                kibnal_peer_connect_failed(peer, 0, -ENOMEM);
                kibnal_peer_decref(peer);
                reason = IBNAL_REJECT_NO_RESOURCES;
                goto reject;
        }

        /* From here on failures leave 'conn' non-NULL, so the reject path
         * completes it through kibnal_connreq_done() and must see rc != 0 */
        conn->ibc_version = rxmsg.ibm_version;

        conn->ibc_peer = peer;              /* conn takes over my ref */
        conn->ibc_incarnation = rxmsg.ibm_srcstamp;
        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
        conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                 <= IBNAL_RX_MSGS);

        cv = conn->ibc_connvars;

        /* record the requester's connection parameters */
        cv->cv_txpsn          = cmreq->cep_data.start_psn;
        cv->cv_remote_qpn     = cmreq->cep_data.qpn;
        cv->cv_path           = cmreq->path_data.path;
        cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
        // XXX                  cmreq->cep_data.retry_cnt;
        cv->cv_port           = cmreq->cep_data.local_port_num;

        vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                             &cv->cv_path.sgid, &cv->cv_sgid_index);
        if (vvrc != vv_return_ok) {
                CERROR("gid2gid_index failed for %s: %d\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
                rc = -EIO;
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }
        
        vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                               cv->cv_path.pkey, &cv->cv_pkey_index);
        if (vvrc != vv_return_ok) {
                CERROR("pkey2pkey_index failed for %s: %d\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
                rc = -EIO;
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* QP: INIT, post receive buffers, then RTR */
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        rc = kibnal_post_receives(conn);
        if (rc != 0) {
                CERROR("Can't post receives for %s\n", 
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
        if (rc != 0) {
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }
        
        /* build the CM reply describing the local end of the connection */
        memset(&reply, 0, sizeof(reply));
        reply.qpn                 = cv->cv_local_qpn;
        reply.qkey                = IBNAL_QKEY;
        reply.start_psn           = cv->cv_rxpsn;
        reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
        reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
        reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
        reply.rnr_retry_count     = cv->cv_rnr_count;
        reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
        
        /* setup txmsg... */
        memset(&txmsg, 0, sizeof(txmsg));
        kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, 
                        sizeof(txmsg.ibm_u.connparams));
        LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
        txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
        txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
        txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
        kibnal_pack_msg(&txmsg, conn->ibc_version,
                        0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);

        /* ...and copy into reply to avoid alignment issues */
        memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);

        /* state is set before cm_accept() and backed out below if the
         * accept fails */
        kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
        
        cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
                         kibnal_cm_callback, conn);

        if (cmrc == cm_stat_success)
                return;                         /* callback has got my ref on conn */

        /* back out state change (no callback happening) */
        kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
        rc = -EIO;
        reason = IBNAL_REJECT_FATAL;
                
 reject:
        /* NOTE(review): 'rxmsg' is static and is only filled in after the
         * service-ID check, so on that early reject path the NID printed
         * here is whatever an earlier call left behind -- confirm whether
         * that matters for debugging. */
        CDEBUG(D_NET, "Rejecting connreq from %s\n",
               libcfs_nid2str(rxmsg.ibm_srcnid));

        kibnal_reject(cep, reason);

        if (conn != NULL) {
                /* conn exists: complete it with the error */
                LASSERT (rc != 0);
                kibnal_connreq_done(conn, 0, rc);
        } else {
                /* no conn was created, so the CEP is destroyed here */
                cm_destroy_cep(cep);
        }
}
2624
2625 void
2626 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2627 {
2628         cm_request_data_t  *cmreq = &data->data.request;
2629         kib_pcreq_t        *pcr;
2630         unsigned long       flags;
2631         
2632         LASSERT (arg == NULL);
2633
2634         if (data->status != cm_event_conn_request) {
2635                 CERROR("status %d is not cm_event_conn_request\n",
2636                        data->status);
2637                 return;
2638         }
2639
2640         LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2641         if (pcr == NULL) {
2642                 CERROR("Can't allocate passive connreq\n");
2643
2644                 kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
2645                 cm_destroy_cep(cep);
2646                 return;
2647         }
2648
2649         pcr->pcr_cep = cep;
2650         pcr->pcr_cmreq = *cmreq;
2651         
2652         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2653
2654         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2655         wake_up(&kibnal_data.kib_connd_waitq);
2656         
2657         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2658 }
2659
2660
2661 void
2662 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, 
2663                                 void *arg)
2664 {
2665         /* CAVEAT EMPTOR: tasklet context */
2666         kib_conn_t       *conn = (kib_conn_t *)arg;
2667         kib_connvars_t   *cv = conn->ibc_connvars;
2668
2669         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2670         cv->cv_conndata = *cd;
2671
2672         kibnal_schedule_conn(conn);
2673         kibnal_conn_decref(conn);
2674 }
2675
2676 void
2677 kibnal_connect_conn (kib_conn_t *conn)
2678 {
2679         static cm_request_data_t  cmreq;
2680         static kib_msg_t          msg;
2681         
2682         kib_connvars_t           *cv = conn->ibc_connvars;
2683         kib_peer_t               *peer = conn->ibc_peer;
2684         cm_return_t               cmrc;
2685
2686         /* Only called by connd => statics OK */
2687         LASSERT (!in_interrupt());
2688         LASSERT (current == kibnal_data.kib_connd);
2689         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2690
2691         memset(&cmreq, 0, sizeof(cmreq));
2692         
2693         cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
2694
2695         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2696         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2697         cmreq.cep_data.retry_cnt            = *kibnal_tunables.kib_retry_cnt;
2698         cmreq.cep_data.rtr_retry_cnt        = *kibnal_tunables.kib_rnr_cnt;
2699         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2700         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2701         // XXX ack_timeout?
2702         // offered_resp_res
2703         // offered_initiator_depth
2704
2705         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2706         cmreq.path_data.path        = cv->cv_path;
2707         
2708         /* setup msg... */
2709         memset(&msg, 0, sizeof(msg));
2710         kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2711         LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2712         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2713         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2714         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2715         kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
2716
2717         if (the_lnet.ln_testprotocompat != 0) {
2718                 /* single-shot proto check */
2719                 LNET_LOCK();
2720                 if ((the_lnet.ln_testprotocompat & 1) != 0) {
2721                         msg.ibm_version++;
2722                         the_lnet.ln_testprotocompat &= ~1;
2723                 }
2724                 if ((the_lnet.ln_testprotocompat & 2) != 0) {
2725                         msg.ibm_magic = LNET_PROTO_MAGIC;
2726                         the_lnet.ln_testprotocompat &= ~2;
2727                 }
2728                 LNET_UNLOCK();
2729         }
2730
2731         /* ...and copy into cmreq to avoid alignment issues */
2732         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2733         
2734         CDEBUG(D_NET, "Connecting %p to %s\n", conn,
2735                libcfs_nid2str(peer->ibp_nid));
2736
2737         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2738         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2739
2740         cmrc = cm_connect(conn->ibc_cep, &cmreq, 
2741                           kibnal_active_connect_callback, conn);
2742         if (cmrc == cm_stat_success) {
2743                 CDEBUG(D_NET, "connection REQ sent to %s\n",
2744                        libcfs_nid2str(peer->ibp_nid));
2745                 return;
2746         }
2747
2748         CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
2749         kibnal_conn_decref(conn);       /* drop callback's ref */
2750         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2751 }
2752
2753 void
2754 kibnal_reconnect (kib_conn_t *conn, int why)
2755 {
2756         kib_peer_t      *peer = conn->ibc_peer;
2757         int              retry;
2758         unsigned long    flags;
2759         cm_return_t      cmrc;
2760         cm_cep_handle_t  cep;
2761         
2762         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2763
2764         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2765
2766         LASSERT (peer->ibp_connecting > 0);          /* 'conn' at least */
2767
2768         /* retry connection if it's still needed and no other connection
2769          * attempts (active or passive) are in progress.
2770          * Immediate reconnect is required, so I don't even look at the
2771          * reconnection timeout etc */
2772
2773         retry = (!list_empty(&peer->ibp_tx_queue) &&
2774                  peer->ibp_connecting == 1 &&
2775                  peer->ibp_accepting == 0);
2776         
2777         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2778
2779         if (!retry) {
2780                 kibnal_connreq_done(conn, 1, why);
2781                 return;
2782         }
2783
2784         cep = cm_create_cep(cm_cep_transp_rc);
2785         if (cep == NULL) {
2786                 CERROR("Can't create new CEP\n");
2787                 kibnal_connreq_done(conn, 1, -ENOMEM);
2788                 return;
2789         }
2790
2791         cmrc = cm_cancel(conn->ibc_cep);
2792         LASSERT (cmrc == cm_stat_success);
2793         cmrc = cm_destroy_cep(conn->ibc_cep);
2794         LASSERT (cmrc == cm_stat_success);
2795
2796         conn->ibc_cep = cep;
2797
2798         /* reuse conn; no need to peer->ibp_connecting++ */
2799         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2800         kibnal_connect_conn(conn);
2801 }
2802
2803 void
2804 kibnal_check_connreply (kib_conn_t *conn)
2805 {
2806         static cm_rtu_data_t  rtu;
2807         static kib_msg_t      msg;
2808
2809         kib_connvars_t   *cv = conn->ibc_connvars;
2810         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2811         kib_peer_t       *peer = conn->ibc_peer;
2812         int               msgnob;
2813         cm_return_t       cmrc;
2814         unsigned long     flags;
2815         int               rc;
2816
2817         /* Only called by connd => statics OK */
2818         LASSERT (!in_interrupt());
2819         LASSERT (current == kibnal_data.kib_connd);
2820         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2821
2822         if (cv->cv_conndata.status == cm_event_conn_reply) {
2823                 cv->cv_remote_qpn = reply->qpn;
2824                 cv->cv_txpsn      = reply->start_psn;
2825                 // XXX              reply->targ_ack_delay;
2826                 cv->cv_rnr_count  = reply->rnr_retry_count;
2827
2828                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2829
2830                 /* copy into msg to avoid alignment issues */
2831                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2832                 memcpy(&msg, &reply->priv_data, msgnob);
2833
2834                 rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
2835                 if (rc != 0) {
2836                         CERROR("Can't unpack reply from %s\n",
2837                                libcfs_nid2str(peer->ibp_nid));
2838                         kibnal_connreq_done(conn, 1, rc);
2839                         return;
2840                 }
2841
2842                 if (msg.ibm_type != IBNAL_MSG_CONNACK ) {
2843                         CERROR("Unexpected message type %d from %s\n",
2844                                msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
2845                         kibnal_connreq_done(conn, 1, -EPROTO);
2846                         return;
2847                 }
2848
2849                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2850                         CERROR("%s has incompatible queue depth %d(%d wanted)\n",
2851                                libcfs_nid2str(peer->ibp_nid), 
2852                                msg.ibm_u.connparams.ibcp_queue_depth,
2853                                IBNAL_MSG_QUEUE_SIZE);
2854                         kibnal_connreq_done(conn, 1, -EPROTO);
2855                         return;
2856                 }
2857                 
2858                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2859                         CERROR("%s max message size %d too big (%d max)\n",
2860                                libcfs_nid2str(peer->ibp_nid), 
2861                                msg.ibm_u.connparams.ibcp_max_msg_size, 
2862                                IBNAL_MSG_SIZE);
2863                         kibnal_connreq_done(conn, 1, -EPROTO);
2864                         return;
2865                 }
2866
2867                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2868                         CERROR("%s max frags %d too big (%d max)\n",
2869                                libcfs_nid2str(peer->ibp_nid),
2870                                msg.ibm_u.connparams.ibcp_max_frags, 
2871                                IBNAL_MAX_RDMA_FRAGS);
2872                         kibnal_connreq_done(conn, 1, -EPROTO);
2873                         return;
2874                 }
2875                 
2876                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2877                 if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2878                                             msg.ibm_dstnid) &&
2879                     msg.ibm_dststamp == kibnal_data.kib_incarnation)
2880                         rc = 0;
2881                 else
2882                         rc = -ESTALE;
2883                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2884                 if (rc != 0) {
2885                         CERROR("Stale connection reply from %s\n",
2886                                libcfs_nid2str(peer->ibp_nid));
2887                         kibnal_connreq_done(conn, 1, rc);
2888                         return;
2889                 }
2890
2891                 conn->ibc_incarnation = msg.ibm_srcstamp;
2892                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2893                 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2894                 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2895                          <= IBNAL_RX_MSGS);
2896                 
2897                 rc = kibnal_post_receives(conn);
2898                 if (rc != 0) {
2899                         CERROR("Can't post receives for %s\n",
2900                                libcfs_nid2str(peer->ibp_nid));
2901                         kibnal_connreq_done(conn, 1, rc);
2902                         return;
2903                 }
2904                 
2905                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2906                 if (rc != 0) {
2907                         kibnal_connreq_done(conn, 1, rc);
2908                         return;
2909                 }
2910                 
2911                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2912                 if (rc != 0) {
2913                         kibnal_connreq_done(conn, 1, rc);
2914                         return;
2915                 }
2916                 
2917                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2918                 kibnal_conn_addref(conn);       /* ++for CM callback */
2919                 
2920                 memset(&rtu, 0, sizeof(rtu));
2921                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2922                                  kibnal_cm_callback, conn);
2923                 if (cmrc == cm_stat_success) {
2924                         /* Now I'm racing with disconnect signalled by
2925                          * kibnal_cm_callback */
2926                         kibnal_connreq_done(conn, 1, 0);
2927                         return;
2928                 }
2929
2930                 CERROR("cm_accept %s failed: %d\n", 
2931                        libcfs_nid2str(peer->ibp_nid), cmrc);
2932                 /* Back out of RTU: no callback coming */
2933                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2934                 kibnal_conn_decref(conn);
2935                 kibnal_connreq_done(conn, 1, -EIO);
2936                 return;
2937         }
2938
2939         if (cv->cv_conndata.status == cm_event_conn_reject) {
2940
2941                 if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
2942                         unsigned char *bytes =
2943                                 cv->cv_conndata.data.reject.priv_data;
2944                         int   magic   = (bytes[0]) |
2945                                         (bytes[1] << 8) |
2946                                         (bytes[2] << 16) |
2947                                         (bytes[3] << 24);
2948                         int   version = (bytes[4]) |
2949                                         (bytes[5] << 8);
2950                         int   why     = (bytes[6]);
2951
2952                         /* Expected proto/version: she just doesn't like me (or
2953                          * ran out of resources) */
2954                         if (magic == IBNAL_MSG_MAGIC &&
2955                             version == conn->ibc_version) {
2956                                 CERROR("conn -> %s rejected: fatal error %d\n",
2957                                        libcfs_nid2str(peer->ibp_nid), why);
2958
2959                                 if (why == IBNAL_REJECT_CONN_RACE) 
2960                                         kibnal_reconnect(conn, -EALREADY);
2961                                 else
2962                                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2963                                 return;
2964                         }
2965                         
2966                         /* Fail unless it's worth retrying with an old proto
2967                          * version */
2968                         if (!(magic == IBNAL_MSG_MAGIC &&
2969                               version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
2970                               conn->ibc_version == IBNAL_MSG_VERSION)) {
2971                                 CERROR("conn -> %s rejected: bad protocol "
2972                                        "magic/ver %08x/%x why %d\n",
2973                                        libcfs_nid2str(peer->ibp_nid),
2974                                        magic, version, why);
2975
2976                                 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2977                                 return;
2978                         }
2979
2980                         conn->ibc_version = version;
2981                         CWARN ("Connection to %s refused: "
2982                                "retrying with old protocol version 0x%x\n", 
2983                                libcfs_nid2str(peer->ibp_nid), version);
2984
2985                         kibnal_reconnect(conn, -ECONNREFUSED);
2986                         return;
2987                 } else if (cv->cv_conndata.data.reject.reason == 
2988                            cm_rej_code_stale_conn) {
2989                         
2990                         CWARN ("conn -> %s stale: retrying\n", 
2991                                libcfs_nid2str(peer->ibp_nid));
2992
2993                         kibnal_reconnect(conn, -ESTALE);
2994                         return;
2995                 } else {
2996                         CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
2997                                libcfs_nid2str(peer->ibp_nid),
2998                                cv->cv_conndata.data.reject.reason);
2999                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
3000                         return;
3001                 }
3002                 /* NOT REACHED */
3003         }
3004
3005         CDEBUG(D_NETERROR, "conn -> %s failed: %d\n", 
3006                libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
3007         kibnal_connreq_done(conn, 1, -ECONNABORTED);
3008 }
3009
3010 void
3011 kibnal_arp_done (kib_conn_t *conn)
3012 {
3013         kib_peer_t           *peer = conn->ibc_peer;
3014         kib_connvars_t       *cv = conn->ibc_connvars;
3015         ibat_arp_data_t      *arp = &cv->cv_arp;
3016         ib_path_record_v2_t  *path = &cv->cv_path;
3017         vv_return_t           vvrc;
3018         int                   rc;
3019         unsigned long         flags;
3020
3021         LASSERT (!in_interrupt());
3022         LASSERT (current == kibnal_data.kib_connd);
3023         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3024         LASSERT (peer->ibp_arp_count > 0);
3025         
3026         if (cv->cv_arprc != ibat_stat_ok) {
3027                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n", 
3028                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3029                        cv->cv_arprc);
3030                 goto failed;
3031         }
3032
3033         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
3034                 CDEBUG(D_NET, "Got valid path for %s\n",
3035                        libcfs_nid2str(peer->ibp_nid));
3036
3037                 *path = *arp->primary_path;
3038
3039                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
3040                                          &cv->cv_port);
3041                 if (vvrc != vv_return_ok) {
3042                         CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n", 
3043                               libcfs_nid2str(peer->ibp_nid),
3044                               HIPQUAD(peer->ibp_ip), vvrc);
3045                         goto failed;
3046                 }
3047
3048                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
3049                                      &path->sgid, &cv->cv_sgid_index);
3050                 if (vvrc != vv_return_ok) {
3051                         CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n", 
3052                               libcfs_nid2str(peer->ibp_nid),
3053                               HIPQUAD(peer->ibp_ip), vvrc);
3054                         goto failed;
3055                 }
3056
3057                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
3058                                        path->pkey, &cv->cv_pkey_index);
3059                 if (vvrc != vv_return_ok) {
3060                         CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n", 
3061                               libcfs_nid2str(peer->ibp_nid), 
3062                               HIPQUAD(peer->ibp_ip), vvrc);
3063                         goto failed;
3064                 }
3065
3066                 path->mtu = IBNAL_IB_MTU;
3067
3068         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
3069                 CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
3070                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3071
3072                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
3073                 cv->cv_sgid_index = IBNAL_SGID_IDX;
3074                 cv->cv_port = arp->local_port_num;
3075
3076                 memset(path, 0, sizeof(*path));
3077
3078                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
3079                                          &path->sgid);
3080                 if (vvrc != vv_return_ok) {
3081                         CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n", 
3082                               libcfs_nid2str(peer->ibp_ip),
3083                               HIPQUAD(peer->ibp_ip), vvrc);
3084                         goto failed;
3085                 }
3086
3087                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
3088                                          &path->slid);
3089                 if (vvrc != vv_return_ok) {
3090                         CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n", 
3091                               libcfs_nid2str(peer->ibp_ip), 
3092                               HIPQUAD(peer->ibp_ip), vvrc);
3093                         goto failed;
3094                 }
3095
3096                 path->dgid          = arp->gid;
3097                 path->sl            = IBNAL_SERVICE_LEVEL;
3098                 path->dlid          = arp->lid;
3099                 path->mtu           = IBNAL_IB_MTU;
3100                 path->rate          = IBNAL_STATIC_RATE;
3101                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
3102                 path->pkey          = IBNAL_PKEY;
3103                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
3104         } else {
3105                 CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
3106                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3107                 goto failed;
3108         }
3109
3110         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
3111         if (rc != 0) {
3112                 kibnal_connreq_done(conn, 1, rc);
3113         }
3114
3115         /* do the actual connection request */
3116         kibnal_connect_conn(conn);
3117         return;
3118
3119  failed:
3120         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3121         peer->ibp_arp_count--;
3122         if (peer->ibp_arp_count == 0) {
3123                 /* final ARP attempt failed */
3124                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
3125                                         flags);
3126                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n", 
3127                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3128         } else {
3129                 /* Retry ARP: ibp_connecting++ so terminating conn
3130                  * doesn't end peer's connection attempt */
3131                 peer->ibp_connecting++;
3132                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
3133                                         flags);
3134                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
3135                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), 
3136                        peer->ibp_arp_count);
3137                 
3138                 kibnal_schedule_peer_arp(peer);
3139         }
3140         kibnal_connreq_done(conn, 1, -ENETUNREACH);
3141 }
3142
3143 void
3144 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
3145 {
3146         /* CAVEAT EMPTOR: tasklet context */
3147         kib_peer_t *peer;
3148         kib_conn_t *conn = (kib_conn_t *)arg;
3149
3150         LASSERT (conn != NULL);
3151         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3152
3153         peer = conn->ibc_peer;
3154
3155         if (arprc != ibat_stat_ok)
3156                 CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
3157                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
3158         else
3159                 CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
3160                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), 
3161                        (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
3162                        (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
3163
3164         conn->ibc_connvars->cv_arprc = arprc;
3165         if (arprc == ibat_stat_ok)
3166                 conn->ibc_connvars->cv_arp = *arp_data;
3167         
3168         kibnal_schedule_conn(conn);
3169         kibnal_conn_decref(conn);
3170 }
3171
3172 void
3173 kibnal_arp_peer (kib_peer_t *peer)
3174 {
3175         cm_cep_handle_t  cep;
3176         kib_conn_t      *conn;
3177         int              ibatrc;
3178
3179         /* Only the connd does this (i.e. single threaded) */
3180         LASSERT (current == kibnal_data.kib_connd);
3181         LASSERT (peer->ibp_connecting != 0);
3182         LASSERT (peer->ibp_arp_count > 0);
3183
3184         cep = cm_create_cep(cm_cep_transp_rc);
3185         if (cep == NULL) {
3186                 CERROR ("Can't create cep for conn->%s\n",
3187                         libcfs_nid2str(peer->ibp_nid));
3188                 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3189                 return;
3190         }
3191
3192         conn = kibnal_create_conn(cep);
3193         if (conn == NULL) {
3194                 CERROR ("Can't allocate conn->%s\n",
3195                         libcfs_nid2str(peer->ibp_nid));
3196                 cm_destroy_cep(cep);
3197                 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3198                 return;
3199         }
3200
3201         conn->ibc_peer = peer;
3202         kibnal_peer_addref(peer);
3203
3204         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
3205
3206         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, 
3207                                   ibat_paths_primary,
3208                                   &conn->ibc_connvars->cv_arp, 
3209                                   kibnal_arp_callback, conn, 0);
3210         CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
3211         switch (ibatrc) {
3212         default:
3213                 LBUG();
3214                 
3215         case ibat_stat_pending:
3216                 /* NB callback has my ref on conn */
3217                 break;
3218                 
3219         case ibat_stat_ok:
3220         case ibat_stat_error:
3221         case ibat_stat_timeout:
3222         case ibat_stat_not_found:
3223                 /* Immediate return (ARP cache hit or failure) == no callback. 
3224                  * Do the next stage directly... */
3225                 conn->ibc_connvars->cv_arprc = ibatrc;
3226                 kibnal_arp_done(conn);
3227                 kibnal_conn_decref(conn);
3228                 break;
3229         }
3230 }
3231
3232 int
3233 kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
3234 {
3235         kib_tx_t          *tx;
3236         struct list_head  *ttmp;
3237         int                timed_out = 0;
3238
3239         spin_lock(&conn->ibc_lock);
3240
3241         list_for_each (ttmp, txs) {
3242                 tx = list_entry (ttmp, kib_tx_t, tx_list);
3243
3244                 if (txs == &conn->ibc_active_txs) {
3245                         LASSERT (!tx->tx_queued);
3246                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
3247                 } else {
3248                         LASSERT (tx->tx_queued);
3249                 }
3250
3251                 if (time_after_eq (jiffies, tx->tx_deadline)) {
3252                         timed_out = 1;
3253                         break;
3254                 }
3255         }
3256
3257         spin_unlock(&conn->ibc_lock);
3258         return timed_out;
3259 }
3260
3261 int
3262 kibnal_conn_timed_out (kib_conn_t *conn)
3263 {
3264         return  kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
3265                 kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
3266                 kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
3267                 kibnal_check_txs(conn, &conn->ibc_active_txs);
3268 }
3269
3270 void
3271 kibnal_check_conns (int idx)
3272 {
3273         struct list_head  *peers = &kibnal_data.kib_peers[idx];
3274         struct list_head  *ptmp;
3275         kib_peer_t        *peer;
3276         kib_conn_t        *conn;
3277         struct list_head  *ctmp;
3278         unsigned long      flags;
3279
3280  again:
3281         /* NB. We expect to have a look at all the peers and not find any
3282          * rdmas to time out, so we just use a shared lock while we
3283          * take a look... */
3284         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3285
3286         list_for_each (ptmp, peers) {
3287                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
3288
3289                 list_for_each (ctmp, &peer->ibp_conns) {
3290                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
3291
3292                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
3293
3294                         /* In case we have enough credits to return via a
3295                          * NOOP, but there were no non-blocking tx descs
3296                          * free to do it last time... */
3297                         kibnal_check_sends(conn);
3298
3299                         if (!kibnal_conn_timed_out(conn))
3300                                 continue;
3301
3302                         /* Handle timeout by closing the whole connection.  We
3303                          * can only be sure RDMA activity has ceased once the
3304                          * QP has been modified. */
3305                         
3306                         kibnal_conn_addref(conn); /* 1 ref for me... */
3307
3308                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
3309                                                flags);
3310
3311                         CERROR("Timed out RDMA with %s\n",
3312                                libcfs_nid2str(peer->ibp_nid));
3313
3314                         kibnal_close_conn (conn, -ETIMEDOUT);
3315                         kibnal_conn_decref(conn); /* ...until here */
3316
3317                         /* start again now I've dropped the lock */
3318                         goto again;
3319                 }
3320         }
3321
3322         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3323 }
3324
3325 void
3326 kibnal_disconnect_conn (kib_conn_t *conn)
3327 {
3328         static cm_drequest_data_t dreq;         /* just for the space */
3329         
3330         cm_return_t    cmrc;
3331         unsigned long  flags;
3332
3333         LASSERT (!in_interrupt());
3334         LASSERT (current == kibnal_data.kib_connd);
3335         
3336         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3337
3338         if (conn->ibc_disconnect) {
3339                 /* Had the CM callback already */
3340                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3341                                         flags);
3342                 kibnal_conn_disconnected(conn);
3343                 return;
3344         }
3345                 
3346         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3347
3348         /* active disconnect */
3349         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3350         if (cmrc == cm_stat_success) {
3351                 /* waiting for CM */
3352                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3353                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3354                 return;
3355         }
3356
3357         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3358
3359         cm_cancel(conn->ibc_cep);
3360         cfs_pause(cfs_time_seconds(1)/10);
3361
3362         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
3363                 kibnal_conn_decref(conn);
3364         
3365         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3366         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3367
3368         kibnal_conn_disconnected(conn);
3369 }
3370
3371 int
3372 kibnal_connd (void *arg)
3373 {
3374         wait_queue_t       wait;
3375         unsigned long      flags;
3376         kib_pcreq_t       *pcr;
3377         kib_conn_t        *conn;
3378         kib_peer_t        *peer;
3379         int                timeout;
3380         int                i;
3381         int                dropped_lock;
3382         int                peer_index = 0;
3383         unsigned long      deadline = jiffies;
3384         
3385         cfs_daemonize ("kibnal_connd");
3386         cfs_block_allsigs ();
3387
3388         init_waitqueue_entry (&wait, current);
3389         kibnal_data.kib_connd = current;
3390
3391         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3392
3393         while (!kibnal_data.kib_shutdown) {
3394
3395                 dropped_lock = 0;
3396
3397                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3398                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
3399                                            kib_conn_t, ibc_list);
3400                         list_del (&conn->ibc_list);
3401                         
3402                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3403                         dropped_lock = 1;
3404
3405                         kibnal_destroy_conn(conn);
3406
3407                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3408                 }
3409
3410                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3411                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3412                                          kib_pcreq_t, pcr_list);
3413                         list_del(&pcr->pcr_list);
3414                         
3415                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3416                         dropped_lock = 1;
3417
3418                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3419                         LIBCFS_FREE(pcr, sizeof(*pcr));
3420
3421                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3422                 }
3423                         
3424                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3425                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3426                                            kib_peer_t, ibp_connd_list);
3427                         
3428                         list_del_init (&peer->ibp_connd_list);
3429                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3430                         dropped_lock = 1;
3431
3432                         kibnal_arp_peer (peer);
3433                         kibnal_peer_decref (peer);
3434
3435                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3436                 }
3437
3438                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3439                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3440                                            kib_conn_t, ibc_list);
3441                         list_del (&conn->ibc_list);
3442                         
3443                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3444                         dropped_lock = 1;
3445
3446                         switch (conn->ibc_state) {
3447                         default:
3448                                 LBUG();
3449                                 
3450                         case IBNAL_CONN_ACTIVE_ARP:
3451                                 kibnal_arp_done(conn);
3452                                 break;
3453
3454                         case IBNAL_CONN_ACTIVE_CONNECT:
3455                                 kibnal_check_connreply(conn);
3456                                 break;
3457
3458                         case IBNAL_CONN_PASSIVE_WAIT:
3459                                 kibnal_check_passive_wait(conn);
3460                                 break;
3461
3462                         case IBNAL_CONN_DISCONNECT1:
3463                         case IBNAL_CONN_DISCONNECT2:
3464                                 kibnal_disconnect_conn(conn);
3465                                 break;
3466                         }
3467                         kibnal_conn_decref(conn);
3468
3469                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3470                 }
3471
3472                 /* careful with the jiffy wrap... */
3473                 timeout = (int)(deadline - jiffies);
3474                 if (timeout <= 0) {
3475                         const int n = 4;
3476                         const int p = 1;
3477                         int       chunk = kibnal_data.kib_peer_hash_size;
3478                         
3479                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3480                         dropped_lock = 1;
3481
3482                         /* Time to check for RDMA timeouts on a few more
3483                          * peers: I do checks every 'p' seconds on a
3484                          * proportion of the peer table and I need to check
3485                          * every connection 'n' times within a timeout
3486                          * interval, to ensure I detect a timeout on any
3487                          * connection within (n+1)/n times the timeout
3488                          * interval. */
3489
3490                         if (*kibnal_tunables.kib_timeout > n * p)
3491                                 chunk = (chunk * n * p) / 
3492                                         *kibnal_tunables.kib_timeout;
3493                         if (chunk == 0)
3494                                 chunk = 1;
3495
3496                         for (i = 0; i < chunk; i++) {
3497                                 kibnal_check_conns (peer_index);
3498                                 peer_index = (peer_index + 1) % 
3499                                              kibnal_data.kib_peer_hash_size;
3500                         }
3501
3502                         deadline += p * HZ;
3503                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3504                 }
3505
3506                 if (dropped_lock)
3507                         continue;
3508                 
3509                 /* Nothing to do for 'timeout'  */
3510                 set_current_state (TASK_INTERRUPTIBLE);
3511                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3512                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3513
3514                 schedule_timeout (timeout);
3515
3516                 set_current_state (TASK_RUNNING);
3517                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3518                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3519         }
3520
3521         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3522
3523         kibnal_thread_fini ();
3524         return (0);
3525 }
3526
3527 void 
3528 kibnal_async_callback(vv_event_record_t ev)
3529 {
3530         CERROR("type: %d, port: %d, data: "LPX64"\n", 
3531                ev.event_type, ev.port_num, ev.type.data);
3532 }
3533
3534 void
3535 kibnal_cq_callback (unsigned long unused_context)
3536 {
3537         unsigned long    flags;
3538
3539         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3540         kibnal_data.kib_ready = 1;
3541         wake_up(&kibnal_data.kib_sched_waitq);
3542         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3543 }
3544
3545 int
3546 kibnal_scheduler(void *arg)
3547 {
3548         long            id = (long)arg;
3549         wait_queue_t    wait;
3550         char            name[16];
3551         vv_wc_t         wc;
3552         vv_return_t     vvrc;
3553         vv_return_t     vvrc2;
3554         unsigned long   flags;
3555         kib_rx_t       *rx;
3556         __u64           rxseq = 0;
3557         int             busy_loops = 0;
3558
3559         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3560         cfs_daemonize(name);
3561         cfs_block_allsigs();
3562
3563         init_waitqueue_entry(&wait, current);
3564
3565         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3566
3567         while (!kibnal_data.kib_shutdown) {
3568                 if (busy_loops++ >= IBNAL_RESCHED) {
3569                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3570                                                flags);
3571
3572                         our_cond_resched();
3573                         busy_loops = 0;
3574                         
3575                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3576                 }
3577
3578                 if (kibnal_data.kib_ready &&
3579                     !kibnal_data.kib_checking_cq) {
3580                         /* take ownership of completion polling */
3581                         kibnal_data.kib_checking_cq = 1;
3582                         /* Assume I'll exhaust the CQ */
3583                         kibnal_data.kib_ready = 0;
3584                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, 
3585                                                flags);
3586                         
3587                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca, 
3588                                                       kibnal_data.kib_cq, &wc);
3589                         if (vvrc == vv_return_err_cq_empty) {
3590                                 vvrc2 = vv_request_completion_notification(
3591                                         kibnal_data.kib_hca, 
3592                                         kibnal_data.kib_cq, 
3593                                         vv_next_solicit_unsolicit_event);
3594                                 LASSERT (vvrc2 == vv_return_ok);
3595                         }
3596
3597                         if (vvrc == vv_return_ok &&
3598                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3599                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3600
3601                                 /* Grab the RX sequence number NOW before
3602                                  * anyone else can get an RX completion */
3603                                 rxseq = rx->rx_conn->ibc_rxseq++;
3604                         }
3605
3606                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3607                         /* give up ownership of completion polling */
3608                         kibnal_data.kib_checking_cq = 0;
3609
3610                         if (vvrc == vv_return_err_cq_empty)
3611                                 continue;
3612
3613                         LASSERT (vvrc == vv_return_ok);
3614                         /* Assume there's more: get another scheduler to check
3615                          * while I handle this completion... */
3616
3617                         kibnal_data.kib_ready = 1;
3618                         wake_up(&kibnal_data.kib_sched_waitq);
3619
3620                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3621                                                flags);
3622
3623                         switch (kibnal_wreqid2type(wc.wr_id)) {
3624                         case IBNAL_WID_RX:
3625                                 kibnal_rx_complete(
3626                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3627                                         wc.completion_status,
3628                                         wc.num_bytes_transfered,
3629                                         rxseq);
3630                                 break;
3631
3632                         case IBNAL_WID_TX:
3633                                 kibnal_tx_complete(
3634                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3635                                         wc.completion_status);
3636                                 break;
3637
3638                         case IBNAL_WID_RDMA:
3639                                 /* We only get RDMA completion notification if
3640                                  * it fails.  So we just ignore them completely
3641                                  * because...
3642                                  *
3643                                  * 1) If an RDMA fails, all subsequent work
3644                                  * items, including the final SEND will fail
3645                                  * too, so I'm still guaranteed to notice that
3646                                  * this connection is hosed.
3647                                  *
3648                                  * 2) It's positively dangerous to look inside
3649                                  * the tx descriptor obtained from an RDMA work
3650                                  * item.  As soon as I drop the kib_sched_lock,
3651                                  * I give a scheduler on another CPU a chance
3652                                  * to get the final SEND completion, so the tx
3653                                  * descriptor can get freed as I inspect it. */
3654                                 CDEBUG(D_NETERROR, "RDMA failed: %d\n", 
3655                                        wc.completion_status);
3656                                 break;
3657
3658                         default:
3659                                 LBUG();
3660                         }
3661                         
3662                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3663                         continue;
3664                 }
3665
3666                 /* Nothing to do; sleep... */
3667
3668                 set_current_state(TASK_INTERRUPTIBLE);
3669                 add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
3670                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3671                                        flags);
3672
3673                 schedule();
3674
3675                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3676                 set_current_state(TASK_RUNNING);
3677                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3678         }
3679
3680         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3681
3682         kibnal_thread_fini();
3683         return (0);
3684 }