lnet/klnds/viblnd/viblnd_cb.c
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *   Author: Eric Barton <eric@bartonsoftware.com>
 *   Author: Frank Zago <fzago@systemfabricworks.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include "vibnal.h"

void
kibnal_tx_done (kib_tx_t *tx)
{
        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
        int              i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */

#if !IBNAL_WHOLE_MEM
        switch (tx->tx_mapped) {
        default:
                LBUG();

        case KIB_TX_UNMAPPED:
                break;

        case KIB_TX_MAPPED: {
                vv_return_t      vvrc;

                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
                                             tx->tx_md.md_handle);
                LASSERT (vvrc == vv_return_ok);
                tx->tx_mapped = KIB_TX_UNMAPPED;
                break;
        }
        }
#endif
        for (i = 0; i < 2; i++) {
                /* tx may have up to 2 libmsgs to finalise */
                if (tx->tx_libmsg[i] == NULL)
                        continue;

                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                tx->tx_libmsg[i] = NULL;
        }

        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);

        if (tx->tx_isnblk) {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
        } else {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
                wake_up (&kibnal_data.kib_idle_tx_waitq);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);
}
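
/* NB kibnal_tx_done() is the single point at which a tx returns to the
 * idle pools: every path that retires one must leave tx_queued, tx_sending
 * and tx_waiting clear (as asserted on entry) before calling it. */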

kib_tx_t *
kibnal_get_idle_tx (int may_block)
{
        kib_tx_t      *tx = NULL;
        ENTRY;

        for (;;) {
                spin_lock(&kibnal_data.kib_tx_lock);

                /* "normal" descriptor is free */
                if (!list_empty (&kibnal_data.kib_idle_txs)) {
                        tx = list_entry (kibnal_data.kib_idle_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                if (!may_block) {
                        /* may dip into reserve pool */
                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                CERROR ("reserved tx desc pool exhausted\n");
                                break;
                        }

                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                /* block for idle tx */
                spin_unlock(&kibnal_data.kib_tx_lock);

                wait_event (kibnal_data.kib_idle_tx_waitq,
                            !list_empty (&kibnal_data.kib_idle_txs) ||
                            kibnal_data.kib_shutdown);
        }

        if (tx != NULL) {
                list_del (&tx->tx_list);

                /* Allocate a new completion cookie.  It might not be needed,
                 * but we've got a lock right now and we're unlikely to
                 * wrap... */
                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
#if !IBNAL_WHOLE_MEM            /* tx_mapped only exists in mapping builds */
                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
#endif
                LASSERT (tx->tx_nwrq == 0);
                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending == 0);
                LASSERT (!tx->tx_waiting);
                LASSERT (tx->tx_status == 0);
                LASSERT (tx->tx_conn == NULL);
                LASSERT (tx->tx_libmsg[0] == NULL);
                LASSERT (tx->tx_libmsg[1] == NULL);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);

        RETURN(tx);
}
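
/* Usage note (illustrative): callers in thread context pass may_block != 0
 * and can assume success; completion/scheduler paths pass 0, may dip into
 * the reserved kib_idle_nblk_txs pool, and must cope with a NULL return:
 *
 *      kib_tx_t *tx = kibnal_get_idle_tx(0);
 *      if (tx == NULL)
 *              return;         -- drop; the peer will retry or time out
 */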

int
kibnal_post_rx (kib_rx_t *rx, int credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());

        rx->rx_gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(KIBNAL_RX_VADDR(rx)),
                .l_key     = KIBNAL_RX_LKEY(rx),
                .length    = IBNAL_MSG_SIZE,
        };

        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (!rx->rx_posted);

        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }

        rx->rx_posted = 1;

        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);
        spin_unlock(&conn->ibc_lock);
        if (vvrc == vv_return_ok) {
                if (credit) {
                        spin_lock(&conn->ibc_lock);
                        conn->ibc_outstanding_credits++;
                        spin_unlock(&conn->ibc_lock);

                        kibnal_check_sends(conn);
                }
                return 0;
        }

        CERROR ("post rx -> "LPX64" failed %d\n",
                conn->ibc_peer->ibp_nid, vvrc);
        rc = -EIO;
        kibnal_close_conn(rx->rx_conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}
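
/* The 64-bit work request id posted above packs the descriptor pointer
 * together with a type tag (IBNAL_WID_RX/IBNAL_WID_TX/IBNAL_WID_RDMA) via
 * kibnal_ptr2wreqid(), so the CQ handler can recover both the descriptor
 * and the kind of work that completed from the completion entry alone and
 * dispatch to kibnal_rx_complete() or kibnal_tx_complete().  (Sketch only;
 * the actual encoding lives with kibnal_ptr2wreqid().) */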

int
kibnal_post_receives (kib_conn_t *conn)
{
        int    i;
        int    rc;

        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (conn->ibc_comms_error == 0);

        for (i = 0; i < IBNAL_RX_MSGS; i++) {
                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                 * fails (i.e. actual failure or we're disconnecting) */
                kibnal_conn_addref(conn);
                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
                if (rc != 0)
                        return rc;
        }

        return 0;
}

kib_tx_t *
kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
        struct list_head   *tmp;

        list_for_each(tmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);

                if (tx->tx_cookie != cookie)
                        continue;

                if (tx->tx_waiting &&
                    tx->tx_msg->ibm_type == txtype)
                        return tx;

                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
                      tx->tx_waiting ? "" : "NOT ",
                      tx->tx_msg->ibm_type, txtype);
        }
        return NULL;
}

void
kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
        kib_tx_t    *tx;
        int          idle;

        spin_lock(&conn->ibc_lock);

        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);

                CWARN("Unmatched completion type %x cookie "LPX64
                      " from "LPX64"\n",
                      txtype, cookie, conn->ibc_peer->ibp_nid);
                kibnal_close_conn (conn, -EPROTO);
                return;
        }

        if (tx->tx_status == 0) {               /* success so far */
                if (status < 0) {               /* failed? */
                        tx->tx_status = status;
                } else if (txtype == IBNAL_MSG_GET_REQ) {
                        /* XXX layering violation: set REPLY data length */
                        LASSERT (tx->tx_libmsg[1] != NULL);
                        LASSERT (tx->tx_libmsg[1]->ev.type ==
                                 PTL_EVENT_REPLY_END);

                        tx->tx_libmsg[1]->ev.mlength = status;
                }
        }

        tx->tx_waiting = 0;

        idle = !tx->tx_queued && (tx->tx_sending == 0);
        if (idle)
                list_del(&tx->tx_list);

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done(tx);
}

void
kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
{
        kib_tx_t    *tx = kibnal_get_idle_tx(0);

        if (tx == NULL) {
                CERROR("Can't get tx for completion %x for "LPX64"\n",
                       type, conn->ibc_peer->ibp_nid);
                return;
        }

        tx->tx_msg->ibm_u.completion.ibcm_status = status;
        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));

        kibnal_queue_tx(tx, conn);
}

void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);

        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from "LPX64"\n",
                       msg->ibm_type, conn->ibc_peer->ibp_nid);
                break;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                break;

        case IBNAL_MSG_PUT_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
                if (rx->rx_responded)
                        break;

                /* I wasn't asked to transfer any payload data.  This happens
                 * if the PUT didn't match, or got truncated. */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
                                       msg->ibm_u.putreq.ibprm_cookie);
                break;

        case IBNAL_MSG_PUT_NAK:
                CWARN ("PUT_NACK from "LPX64"\n", conn->ibc_peer->ibp_nid);
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_PUT_ACK:
                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
                               conn->ibc_peer->ibp_nid);
                        kibnal_close_conn(conn, -EPROTO);
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                      &msg->ibm_u.putack.ibpam_rd,
                                      msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc < 0)
                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
                               conn->ibc_peer->ibp_nid, rc);

                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc < 0)
                        tx->tx_status = rc;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;

        case IBNAL_MSG_PUT_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_GET_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
                if (rx->rx_responded)           /* I responded to the GET_REQ */
                        break;
                /* NB GET didn't match (I'd have responded even with no payload
                 * data) */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
                                       msg->ibm_u.get.ibgm_cookie);
                break;

        case IBNAL_MSG_GET_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        kibnal_post_rx(rx, 1);
}
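
/* Message exchange summary, as implemented above (sketch):
 *
 *   PUT:  PUT_REQ -> peer replies PUT_ACK carrying its sink RDMA
 *         descriptor -> initiator RDMA-writes the payload -> PUT_DONE
 *         (PUT_NAK instead of PUT_ACK if the PUT didn't match or was
 *         truncated).
 *   GET:  GET_REQ carries the sink RDMA descriptor -> peer RDMA-writes
 *         the REPLY payload -> GET_DONE (-ENODATA if the GET didn't
 *         match).
 *
 * Payloads that fit in IBNAL_MSG_SIZE travel inline as IBNAL_MSG_IMMEDIATE
 * and skip the RDMA handshake entirely. */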

void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_posted);
        rx->rx_posted = 0;

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from "LPX64" failed: %d\n",
                       conn->ibc_peer->ibp_nid, vvrc);
                goto failed;
        }

        rc = kibnal_unpack_msg(msg, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from "LPX64"\n",
                        rc, conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from "LPX64"\n",
                        conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from "LPX64
                        ": got "LPD64" but expected "LPD64"\n",
                        conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
                goto failed;
        }

        /* racing with connection establishment/teardown! */

        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                        flags);
        }
        kibnal_handle_rx(rx);
        return;

 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}

#if IBNAL_WHOLE_MEM
int
kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                     unsigned long page_offset, unsigned long len)
{
        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
        vv_l_key_t       l_key;
        vv_r_key_t       r_key;
        __u64            addr;
        __u64            frag_addr;
        void            *ptr;
        vv_mem_reg_h_t   mem_h;
        vv_return_t      vvrc;

        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
                CERROR ("Too many RDMA fragments\n");
                return -EMSGSIZE;
        }

        /* Try to create an address that adapter-tavor will munge into a valid
         * network address, given how it maps all phys mem into 1 region */
        addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;

        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                    (void *)((unsigned long)addr),
                                    len, &mem_h, &l_key, &r_key);
        LASSERT (vvrc == vv_return_ok);

        if (active) {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = l_key;
                } else if (l_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = addr;
        } else {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = r_key;
                } else if (r_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }

                frag_addr = kibnal_addr2net(addr);
        }

        kibnal_rf_set(frag, frag_addr, len);

        CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
               rd->rd_nfrag, frag->rf_nob, rd->rd_key,
               frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);

        rd->rd_nfrag++;
        return 0;
}
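
/* NB the key/address asymmetry above: the "active" side (the one that will
 * post vv_wr_rdma_write) describes its local buffer with the l_key and a
 * local address, while the passive side advertises its buffer with the
 * r_key and a network-mapped address, since it is the peer's HCA that will
 * dereference it. */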

struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END)
                page = vmalloc_to_page ((void *)vaddr);
#ifdef CONFIG_HIGHMEM
        else if (vaddr >= PKMAP_BASE &&
                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
                page = vmalloc_to_page ((void *)vaddr);
        /* in 2.4 ^ just walks the page tables */
#endif
        else
                page = virt_to_page (vaddr);

        return VALID_PAGE(page) ? page : NULL;
}

int
kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
                    vv_access_con_bit_mask_t access,
                    int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           fragnob;
        int           rc;
        unsigned long vaddr;
        struct page  *page;
        int           page_offset;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (niov > 0);

                vaddr = ((unsigned long)iov->iov_base) + offset;
                page_offset = vaddr & (PAGE_SIZE - 1);
                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR ("Can't find page\n");
                        return -EFAULT;
                }

                fragnob = min((int)(iov->iov_len - offset), nob);
                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

                rc = kibnal_append_rdfrag(rd, active, page,
                                          page_offset, fragnob);
                if (rc != 0)
                        return rc;

                if (offset + fragnob < iov->iov_len) {
                        offset += fragnob;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
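
/* Worked example of the walk above (illustrative, PAGE_SIZE 4096): with a
 * page-aligned iov_base, iov_len 8192, offset 6000 and nob 4096, the first
 * pass takes min(8192 - 6000, 4096, 4096 - 1904) = 2192 bytes (ending the
 * iov entry exactly on a page boundary), and the second pass maps the
 * remaining 1904 bytes from the start of the next iov entry. */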

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            fragnob;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (nkiov > 0);
                fragnob = min((int)(kiov->kiov_len - offset), nob);

                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
                                          kiov->kiov_offset + offset,
                                          fragnob);
                if (rc != 0)
                        return rc;

                offset = 0;
                kiov++;
                nkiov--;
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
#else
int
kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                     vv_access_con_bit_mask_t access,
                     int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int         active = ((access & vv_acc_r_mem_write) == 0);
        void       *vaddr;
        vv_return_t vvrc;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);

        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
                                      kibnal_data.kib_pd, access,
                                      &tx->tx_md.md_handle,
                                      &tx->tx_md.md_lkey,
                                      &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
                return -EFAULT;
        }

        tx->tx_mapped = KIB_TX_MAPPED;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

        return (0);
}

int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        vv_return_t    vvrc;
        vv_phy_list_t  phys_pages;
        vv_phy_buf_t  *phys;
        int            page_offset;
        int            nphys;
        int            resid;
        int            phys_size;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        phys_size = nkiov * sizeof (*phys);
        PORTAL_ALLOC(phys, phys_size);
        if (phys == NULL) {
                CERROR ("Can't allocate tmp phys\n");
                return (-ENOMEM);
        }

        page_offset = kiov->kiov_offset + offset;

        phys[0].start = kibnal_page2phys(kiov->kiov_page);
        phys[0].size = PAGE_SIZE;

        nphys = 1;
        resid = nob - (kiov->kiov_len - offset);

        while (resid > 0) {
                kiov++;
                nkiov--;
                LASSERT (nkiov > 0);

                if (kiov->kiov_offset != 0 ||
                    ((resid > PAGE_SIZE) &&
                     kiov->kiov_len < PAGE_SIZE)) {
                        int i;
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM: "
                                "page %d, offset %d, len %d\n", nphys,
                                kiov->kiov_offset, kiov->kiov_len);

                        for (i = -nphys; i < nkiov; i++)
                                CERROR("kiov[%d] %p +%d for %d\n",
                                       i, kiov[i].kiov_page,
                                       kiov[i].kiov_offset,
                                       kiov[i].kiov_len);

                        rc = -EINVAL;
                        goto out;
                }

                LASSERT (nphys * sizeof (*phys) < phys_size);
                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
                phys[nphys].size = PAGE_SIZE;

                nphys++;
                resid -= PAGE_SIZE;
        }

#if 0
        {
                int i;
                CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
                for (i = 0; i < nphys; i++)
                        CWARN ("   [%d] "LPX64"\n", i, phys[i].start);
        }
#endif

        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &phys_pages,
                                          IBNAL_RDMA_BASE,
                                          nphys,
                                          page_offset,
                                          kibnal_data.kib_pd,
                                          access,
                                          &tx->tx_md.md_handle,
                                          &tx->tx_md.md_addr,
                                          &tx->tx_md.md_lkey,
                                          &tx->tx_md.md_rkey);

        if (vvrc != vv_return_ok) {
                CERROR ("Can't map phys: %d\n", vvrc);
                rc = -EFAULT;
                goto out;
        }

        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
               "lkey %x, rkey %x, addr "LPX64"\n",
               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
               tx->tx_md.md_addr);

        tx->tx_mapped = KIB_TX_MAPPED;
        rc = 0;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

 out:
        PORTAL_FREE(phys, phys_size);
        return (rc);
}
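
/* NB unlike the IBNAL_WHOLE_MEM fragment path, this mapping path requires
 * the kiov pages to tile one contiguous region: only the first page may
 * start at a non-zero offset and only the last may be partial, hence the
 * "can't have gaps" check above. */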
#endif

kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}

void
kibnal_check_sends (kib_conn_t *conn)
{
        kib_tx_t       *tx;
        vv_return_t     vvrc;
        int             rc;
        int             i;
        int             done;

        /* Don't send anything until after the connection is established */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                CDEBUG(D_NET, LPX64": too soon\n", conn->ibc_peer->ibp_nid);
                return;
        }

        spin_lock(&conn->ibc_lock);

        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

        if (list_empty(&conn->ibc_tx_queue) &&
            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                spin_unlock(&conn->ibc_lock);

                tx = kibnal_get_idle_tx(0);     /* don't block */
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock(&conn->ibc_lock);

                if (tx != NULL)
                        kibnal_queue_tx_locked(tx, conn);
        }

        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);
                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
                        CDEBUG(D_NET, LPX64": posted enough\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 0) {   /* no credits */
                        CDEBUG(D_NET, LPX64": no credits\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
                        CDEBUG(D_NET, LPX64": not using last credit\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                list_del (&tx->tx_list);
                tx->tx_queued = 0;

                /* NB don't drop ibc_lock before bumping tx_sending */

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                        /* redundant NOOP */
                        spin_unlock(&conn->ibc_lock);
                        kibnal_tx_done(tx);
                        spin_lock(&conn->ibc_lock);
                        CDEBUG(D_NET, LPX64": redundant noop\n",
                               conn->ibc_peer->ibp_nid);
                        continue;
                }

                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                conn->ibc_txseq);

                conn->ibc_txseq++;
                conn->ibc_outstanding_credits = 0;
                conn->ibc_nsends_posted++;
                conn->ibc_credits--;

                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
                 * and then re-queued here.  It's (just) possible that
                 * tx_sending is non-zero if we've not done the tx_complete() from
                 * the first send; hence the ++ rather than = below. */
                tx->tx_sending++;

                list_add (&tx->tx_list, &conn->ibc_active_txs);

                /* Keep holding ibc_lock while posting sends on this
                 * connection; vv_post_send() isn't re-entrant on the same
                 * QP!! */

                LASSERT (tx->tx_nwrq > 0);

                rc = -ECONNABORTED;
                vvrc = vv_return_ok;
                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        tx->tx_status = 0;
                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
                                                 conn->ibc_qp,
                                                 tx->tx_nwrq,
                                                 tx->tx_wrq,
                                                 vv_operation_type_send_rc);
                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                }

                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_waiting = 0;
                        tx->tx_sending--;

                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock(&conn->ibc_lock);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to "LPX64"\n",
                                        vvrc, conn->ibc_peer->ibp_nid);
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to "
                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock(&conn->ibc_lock);
}
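
/* Credit flow example (illustrative): each side starts with
 * IBNAL_MSG_QUEUE_SIZE credits, one per receive buffer it has posted.
 * Sending any message consumes one ibc_credits and piggy-backs
 * ibc_outstanding_credits (buffers we have since re-posted) back to the
 * peer in ibm_credits.  The "last credit" check above only spends the
 * final credit on a message that also returns credits, and the NOOP at
 * the top drains credits back when there is nothing else to send, so the
 * two sides cannot both stall waiting for credits. */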

void
kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
        kib_conn_t   *conn = tx->tx_conn;
        int           failed = (vvrc != vv_comp_status_success);
        int           idle;

        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
               tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);

        LASSERT (tx->tx_sending > 0);

        if (failed &&
            tx->tx_status == 0 &&
            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                CERROR("tx -> "LPX64" type %x cookie "LPX64
                       " sending %d waiting %d: failed %d\n",
                       conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type,
                       tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);

        spin_lock(&conn->ibc_lock);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'. */

        tx->tx_sending--;
        conn->ibc_nsends_posted--;

        if (failed) {
                tx->tx_waiting = 0;
                tx->tx_status = -EIO;
        }

        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               !tx->tx_waiting &&               /* Not waiting for peer */
               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
        if (idle)
                list_del(&tx->tx_list);

        kibnal_conn_addref(conn);               /* 1 ref for me.... */

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done (tx);

        if (failed)
                kibnal_close_conn (conn, -EIO);
        else
                kibnal_check_sends(conn);

        kibnal_conn_decref(conn);               /* ...until here */
}

void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;

        LASSERT (tx->tx_nwrq >= 0 &&
                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        kibnal_init_msg(tx->tx_msg, type, body_nob);

        *gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(KIBNAL_TX_VADDR(tx)),
                .l_key     = KIBNAL_TX_LKEY(tx),
                .length    = nob,
        };

        memset(wrq, 0, sizeof(*wrq));

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
        wrq->wr_type = vv_wr_send;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->completion_notification = 1;
        wrq->type.send.solicited_event = 1;
        /* NB "immidiate" and "fance" are the vverbs API's own spellings */
        wrq->type.send.immidiate_data_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;

        tx->tx_nwrq++;
}

int
kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
        int              resid = nob;
        kib_msg_t       *ibmsg = tx->tx_msg;
        kib_rdma_desc_t *srcrd = tx->tx_rd;
        kib_rdma_frag_t *srcfrag;
        int              srcidx;
        kib_rdma_frag_t *dstfrag;
        int              dstidx;
        vv_scatgat_t    *gl;
        vv_wr_t         *wrq;
        int              wrknob;
        int              rc;

        /* Called by scheduler */
        LASSERT (!in_interrupt());

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        srcidx = dstidx = 0;
        srcfrag = &srcrd->rd_frags[0];
        dstfrag = &dstrd->rd_frags[0];
        rc = resid;

        while (resid > 0) {
                if (srcidx >= srcrd->rd_nfrag) {
                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
                        rc = -EPROTO;
                        break;
                }

                if (dstidx == dstrd->rd_nfrag) {
                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
                        rc = -EPROTO;
                        break;
                }

                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
                               srcidx, srcrd->rd_nfrag,
                               dstidx, dstrd->rd_nfrag);
                        rc = -EMSGSIZE;
                        break;
                }

                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);

                gl = &tx->tx_gl[tx->tx_nwrq];
                gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
                gl->length    = wrknob;
                gl->l_key     = srcrd->rd_key;

                wrq = &tx->tx_wrq[tx->tx_nwrq];

                wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
                wrq->completion_notification = 0;
                wrq->scatgat_list = gl;
                wrq->num_of_data_segments = 1;
                wrq->wr_type = vv_wr_rdma_write;
                wrq->type.send.solicited_event = 0;
                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

                resid -= wrknob;
                if (wrknob < srcfrag->rf_nob) {
                        /* advance by the bytes just consumed (wrknob), not
                         * by the total remaining (resid) */
                        kibnal_rf_set(srcfrag,
                                      kibnal_rf_addr(srcfrag) + wrknob,
                                      srcfrag->rf_nob - wrknob);
                } else {
                        srcfrag++;
                        srcidx++;
                }

                if (wrknob < dstfrag->rf_nob) {
                        kibnal_rf_set(dstfrag,
                                      kibnal_rf_addr(dstfrag) + wrknob,
                                      dstfrag->rf_nob - wrknob);
                } else {
                        dstfrag++;
                        dstidx++;
                }

                tx->tx_nwrq++;
        }

        if (rc < 0)                             /* no RDMA if completing with failure */
                tx->tx_nwrq = 0;

        ibmsg->ibm_u.completion.ibcm_status = rc;
        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        return rc;
}
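
/* Example (illustrative): RDMA-writing nob = 8192 from a single 8192-byte
 * source fragment into two 4096-byte sink fragments generates two
 * vv_wr_rdma_write work requests of 4096 bytes each: after the first, the
 * source fragment is advanced by wrknob (4096) and the first sink fragment
 * is exhausted, so dstfrag/dstidx move on. */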

void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked (tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}

void
kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
        kib_peer_t      *peer;
        kib_conn_t      *conn;
        unsigned long    flags;
        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;

        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems */

        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */

        read_lock_irqsave(g_lock, flags);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                read_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                read_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...to here */
                return;
        }

        /* Making one or more connections; I'll need a write lock... */
        read_unlock(g_lock);
        write_lock(g_lock);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                write_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                /* Connection exists; queue message on it */
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                write_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...until here */
                return;
        }

        if (peer->ibp_connecting == 0) {
                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                        write_unlock_irqrestore(g_lock, flags);
                        tx->tx_status = -EHOSTUNREACH;
                        tx->tx_waiting = 0;
                        kibnal_tx_done (tx);
                        return;
                }

                peer->ibp_connecting = 1;
                kibnal_peer_addref(peer); /* extra ref for connd */

                /* interrupts are already disabled (g_lock held with
                 * irqsave), so don't clobber 'flags' here */
                spin_lock(&kibnal_data.kib_connd_lock);

                list_add_tail (&peer->ibp_connd_list,
                               &kibnal_data.kib_connd_peers);
                wake_up (&kibnal_data.kib_connd_waitq);

                spin_unlock(&kibnal_data.kib_connd_lock);
        }

        /* A connection is being established; queue the message... */
        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

        write_unlock_irqrestore(g_lock, flags);
}
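
/* NB the lock dance in kibnal_launch_tx(): read_lock_irqsave() followed by
 * read_unlock() leaves interrupts disabled, so the subsequent write_lock()
 * needs no fresh irqsave and the final write_unlock_irqrestore() restores
 * the originally saved flags.  The peer/conn lookups are repeated after
 * taking the write lock because another thread may have created or torn
 * down a connection in the window between the locks. */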

int
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
        /* I would guess that if kibnal_get_peer (nid) == NULL,
           and we're not routing, then 'nid' is very distant :) */
        if ( nal->libnal_ni.ni_pid.nid == nid ) {
                *dist = 0;
        } else {
                *dist = 1;
        }

        return 0;
}

ptl_err_t
kibnal_sendmsg(lib_nal_t    *nal,
               void         *private,
               lib_msg_t    *libmsg,
               ptl_hdr_t    *hdr,
               int           type,
               ptl_nid_t     nid,
               ptl_pid_t     pid,
               unsigned int  payload_niov,
               struct iovec *payload_iov,
               ptl_kiov_t   *payload_kiov,
               int           payload_offset,
               int           payload_nob)
{
        kib_msg_t  *ibmsg;
        kib_tx_t   *tx;
        int         nob;
        int         rc;
        int         n;

        /* NB 'private' is different depending on what we're sending.... */

        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
               " pid %d\n", payload_nob, payload_niov, nid, pid);

        LASSERT (payload_nob == 0 || payload_niov > 0);
        LASSERT (payload_niov <= PTL_MD_MAX_IOV);

        /* Thread context */
        LASSERT (!in_interrupt());
        /* payload is either all vaddrs or all pages */
        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));

        switch (type) {
        default:
                LBUG();
                return (PTL_FAIL);

        case PTL_MSG_REPLY: {
                /* reply's 'private' is the incoming receive */
                kib_rx_t *rx = private;

                LASSERT(rx != NULL);

                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
                        /* RDMA not expected */
                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                        if (nob > IBNAL_MSG_SIZE) {
                                CERROR("REPLY for "LPX64" too big "
                                       "(RDMA not requested): %d "
                                       "(max for message is %d)\n",
                                       nid, payload_nob, IBNAL_MSG_SIZE);
                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
                                       nob, nid);
                                return PTL_FAIL;
                        }
                        break;
                }

                /* Incoming message consistent with RDMA? */
                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
                               nid, rx->rx_msg->ibm_type);
                        return PTL_FAIL;
                }

                /* NB rx_complete() will send GET_NAK when I return to it from
                 * here, unless I set rx_responded! */

                tx = kibnal_get_idle_tx(0);
                if (tx == NULL) {
                        CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
                        return PTL_FAIL;
                }

                if (payload_nob == 0)
                        rc = 0;
                else if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
                                                 payload_niov, payload_iov,
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
                        CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
                if (rc < 0) {
                        CERROR("Can't setup rdma for GET from "LPX64": %d\n",
                               nid, rc);
                } else if (rc == 0) {
                        /* No RDMA: local completion may happen now! */
                        lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
                } else {
                        /* RDMA: lib_finalize(libmsg) when it completes */
                        tx->tx_libmsg[0] = libmsg;
                }

                kibnal_queue_tx(tx, rx->rx_conn);
                rx->rx_responded = 1;
                return (rc >= 0) ? PTL_OK : PTL_FAIL;
        }

        case PTL_MSG_GET:
                /* will the REPLY message be small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;

                tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
                LASSERT (tx != NULL);

                ibmsg = tx->tx_msg;
                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;

                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                 vv_acc_r_mem_write,
                                                 libmsg->md->md_niov,
                                                 libmsg->md->md_iov.iov,
                                                 0, libmsg->md->length);
                else
                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                  vv_acc_r_mem_write,
                                                  libmsg->md->md_niov,
                                                  libmsg->md->md_iov.kiov,
                                                  0, libmsg->md->length);
                if (rc != 0) {
                        CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
                kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);

                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
                if (tx->tx_libmsg[1] == NULL) {
                        CERROR("Can't create reply for GET -> "LPX64"\n", nid);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
                tx->tx_waiting = 1;             /* waiting for GET_DONE */
                kibnal_launch_tx(tx, nid);
                return PTL_OK;

        case PTL_MSG_ACK:
                LASSERT (payload_nob == 0);
                break;

        case PTL_MSG_PUT:
                /* Is the payload small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;

                tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
                LASSERT (tx != NULL);

                if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
                                                 payload_niov, payload_iov,
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
                        CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                ibmsg = tx->tx_msg;
                ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1504                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1505                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1506
1507                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1508                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1509                 kibnal_launch_tx(tx, nid);
1510                 return PTL_OK;
1511         }
1512
1513         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1514                  <= IBNAL_MSG_SIZE);
1515
1516         tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
1517                                   type == PTL_MSG_REPLY));
1518         if (tx == NULL) {
1519                 CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
1520                 return PTL_NO_SPACE;
1521         }
1522
1523         ibmsg = tx->tx_msg;
1524         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1525
1526         if (payload_nob > 0) {
1527                 if (payload_kiov != NULL)
1528                         lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1529                                           payload_niov, payload_kiov,
1530                                           payload_offset, payload_nob);
1531                 else
1532                         lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1533                                          payload_niov, payload_iov,
1534                                          payload_offset, payload_nob);
1535         }
1536
1537         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1538         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1539
1540         tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
1541         kibnal_launch_tx(tx, nid);
1542         return PTL_OK;
1543 }
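
     /* Dispatch summary for kibnal_sendmsg() above: a message goes out as a
      * single pre-registered IBNAL_MSG_IMMEDIATE buffer whenever
      *
      *     offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
      *             <= IBNAL_MSG_SIZE
      *
      * Larger PUTs send a header-only PUT_REQ and wait for the peer's PUT_ACK
      * (which carries the sink's RDMA descriptor); larger GET replies are
      * RDMA'd straight into the sink descriptor carried by the GET_REQ.  Note
      * tx allocation only blocks for PUT/GET sent from app threads; ACK and
      * REPLY use kibnal_get_idle_tx(0), presumably because they can be
      * generated from contexts that must not block. */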
1544
1545 ptl_err_t
1546 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1547                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1548                unsigned int payload_niov, struct iovec *payload_iov,
1549                size_t payload_offset, size_t payload_len)
1550 {
1551         CDEBUG(D_NET, "  pid = %d, nid="LPU64"\n",
1552                pid, nid);
1553         return (kibnal_sendmsg(nal, private, cookie,
1554                                hdr, type, nid, pid,
1555                                payload_niov, payload_iov, NULL,
1556                                payload_offset, payload_len));
1557 }
1558
1559 ptl_err_t
1560 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
1561                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1562                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
1563                      size_t payload_offset, size_t payload_len)
1564 {
1565         return (kibnal_sendmsg(nal, private, cookie,
1566                                hdr, type, nid, pid,
1567                                payload_niov, NULL, payload_kiov,
1568                                payload_offset, payload_len));
1569 }
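
     /* kibnal_send() and kibnal_send_pages() are thin wrappers that select
      * the payload flavour: exactly one of payload_iov (virtual addresses)
      * and payload_kiov (pages) is non-NULL, matching the "all pages or all
      * vaddrs" assertion in kibnal_recvmsg() below. */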
1570
1571 ptl_err_t
1572 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
1573                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
1574                  size_t offset, int mlen, int rlen)
1575 {
1576         kib_rx_t    *rx = private;
1577         kib_msg_t   *rxmsg = rx->rx_msg;
1578         kib_conn_t  *conn = rx->rx_conn;
1579         kib_tx_t    *tx;
1580         kib_msg_t   *txmsg;
1581         int          nob;
1582         int          rc;
1583         int          n;
1584         
1585         LASSERT (mlen <= rlen);
1586         LASSERT (mlen >= 0);
1587         LASSERT (!in_interrupt());
1588         /* Either all pages or all vaddrs */
1589         LASSERT (!(kiov != NULL && iov != NULL));
1590
1591         switch (rxmsg->ibm_type) {
1592         default:
1593                 LBUG();
1594                 
1595         case IBNAL_MSG_IMMEDIATE:
1596                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1597                 if (nob > IBNAL_MSG_SIZE) {
1598                         CERROR ("Immediate message from "LPX64" too big: %d\n",
1599                                 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
1600                         return (PTL_FAIL);
1601                 }
1602
1603                 if (kiov != NULL)
1604                         lib_copy_buf2kiov(niov, kiov, offset,
1605                                           rxmsg->ibm_u.immediate.ibim_payload,
1606                                           mlen);
1607                 else
1608                         lib_copy_buf2iov(niov, iov, offset,
1609                                          rxmsg->ibm_u.immediate.ibim_payload,
1610                                          mlen);
1611
1612                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1613                 return (PTL_OK);
1614
1615         case IBNAL_MSG_PUT_REQ:
1616                 /* NB rx_complete() will send PUT_NAK when I return to it from
1617                  * here, unless I set rx_responded!  */
1618
1619                 if (mlen == 0) { /* No payload to RDMA */
1620                         lib_finalize(nal, NULL, libmsg, PTL_OK);
1621                         return PTL_OK;
1622                 }
1623
1624                 tx = kibnal_get_idle_tx(0);
1625                 if (tx == NULL) {
1626                         CERROR("Can't allocate tx for "LPX64"\n",
1627                                conn->ibc_peer->ibp_nid);
1628                         return PTL_FAIL;
1629                 }
1630
1631                 txmsg = tx->tx_msg;
1632                 if (kiov == NULL)
1633                         rc = kibnal_setup_rd_iov(tx, 
1634                                                  &txmsg->ibm_u.putack.ibpam_rd,
1635                                                  vv_acc_r_mem_write,
1636                                                  niov, iov, offset, mlen);
1637                 else
1638                         rc = kibnal_setup_rd_kiov(tx,
1639                                                   &txmsg->ibm_u.putack.ibpam_rd,
1640                                                   vv_acc_r_mem_write,
1641                                                   niov, kiov, offset, mlen);
1642                 if (rc != 0) {
1643                         CERROR("Can't setup PUT sink for "LPX64": %d\n",
1644                                conn->ibc_peer->ibp_nid, rc);
1645                         kibnal_tx_done(tx);
1646                         return PTL_FAIL;
1647                 }
1648
1649                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1650                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1651
1652                 n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1653                 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1654                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1655
1656                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
1657                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1658                 kibnal_queue_tx(tx, conn);
1659
1660                 LASSERT (!rx->rx_responded);
1661                 rx->rx_responded = 1;
1662                 return PTL_OK;
1663
1664         case IBNAL_MSG_GET_REQ:
1665                 /* We get called here just to discard any junk after the
1666                  * GET hdr; lib_finalize() below ignores the NULL libmsg. */
1667                 LASSERT (libmsg == NULL);
1668                 lib_finalize (nal, NULL, libmsg, PTL_OK);
1669                 return (PTL_OK);
1670         }
1671 }
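
     /* Rendezvous flows implemented by kibnal_sendmsg()/kibnal_recvmsg():
      *
      *   PUT:  A -> B  PUT_REQ  (portals header only)
      *         B -> A  PUT_ACK  (sink RDMA descriptor + matched cookies)
      *         A -> B  RDMA write of the payload, then PUT_DONE
      *
      *   GET:  A -> B  GET_REQ  (header + sink RDMA descriptor for the reply)
      *         B -> A  RDMA write of the reply payload, then GET_DONE
      *
      * rx_complete() sends PUT_NAK/GET_NAK when control returns to it, unless
      * rx_responded has been set as above. */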
1672
1673 ptl_err_t
1674 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
1675               unsigned int niov, struct iovec *iov, 
1676               size_t offset, size_t mlen, size_t rlen)
1677 {
1678         return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
1679                                 offset, mlen, rlen));
1680 }
1681
1682 ptl_err_t
1683 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
1684                      unsigned int niov, ptl_kiov_t *kiov, 
1685                      size_t offset, size_t mlen, size_t rlen)
1686 {
1687         return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
1688                                 offset, mlen, rlen));
1689 }
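
     /* In all the receive paths 'rlen' is the size of the incoming message
      * and 'mlen' (<= rlen) is how much the matched memory descriptor
      * actually accepts; bytes beyond mlen are simply never copied.  As on
      * the send side, kiov vs iov selects pages vs virtual addresses. */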
1690
1691 int
1692 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1693 {
1694         long    pid = kernel_thread (fn, arg, 0);
1695
1696         if (pid < 0)
1697                 return ((int)pid);
1698
1699         atomic_inc (&kibnal_data.kib_nthreads);
1700         return (0);
1701 }
1702
1703 void
1704 kibnal_thread_fini (void)
1705 {
1706         atomic_dec (&kibnal_data.kib_nthreads);
1707 }
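
     /* kib_nthreads counts live daemon threads (the connd among them); each
      * exits through kibnal_thread_fini(), and shutdown presumably polls the
      * count down to zero before freeing the state the threads share. */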
1708
1709 void
1710 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1711 {
1712         /* This just does the immediate housekeeping.  'error' is zero for a
1713          * normal shutdown which can happen only after the connection has been
1714          * established.  If the connection is established, schedule the
1715          * connection to be finished off by the connd.  Otherwise the connd is
1716          * already dealing with it (either to set it up or tear it down).
1717          * Caller holds kib_global_lock exclusively in irq context */
1718         kib_peer_t       *peer = conn->ibc_peer;
1719         struct list_head *tmp;
1720         
1721         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1722
1723         if (error != 0 && conn->ibc_comms_error == 0)
1724                 conn->ibc_comms_error = error;
1725
1726         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1727                 return; /* already being handled  */
1728
1729         spin_lock(&conn->ibc_lock);
1730         
1731         if (error == 0 &&
1732             list_empty(&conn->ibc_tx_queue) &&
1733             list_empty(&conn->ibc_active_txs)) {
1734                 CDEBUG(D_NET, "closing conn to "LPX64
1735                        " rx# "LPD64" tx# "LPD64"\n", 
1736                        peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq);
1737         } else {
1738                 CERROR("Closing conn to "LPX64": error %d%s%s"
1739                        " rx# "LPD64" tx# "LPD64"\n",
1740                        peer->ibp_nid, error,
1741                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1742                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1743                        conn->ibc_txseq, conn->ibc_rxseq);
1744
1745                 list_for_each(tmp, &conn->ibc_tx_queue) {
1746                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1747                         
1748                         CERROR("   queued tx type %x cookie "LPX64
1749                                " sending %d waiting %d ticks %ld/%d\n", 
1750                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1751                                tx->tx_sending, tx->tx_waiting,
1752                                (long)(tx->tx_deadline - jiffies), HZ);
1753                 }
1754
1755                 list_for_each(tmp, &conn->ibc_active_txs) {
1756                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1757                         
1758                         CERROR("   active tx type %x cookie "LPX64
1759                                " sending %d waiting %d ticks %ld/%d\n", 
1760                                tx->tx_msg->ibm_type, tx->tx_cookie, 
1761                                tx->tx_sending, tx->tx_waiting,
1762                                (long)(tx->tx_deadline - jiffies), HZ);
1763                 }
1764         }
1765
1766         spin_unlock(&conn->ibc_lock);
1767
1768         /* connd takes ibc_list's ref */
1769         list_del (&conn->ibc_list);
1770         
1771         if (list_empty (&peer->ibp_conns) &&    /* no more conns */
1772             peer->ibp_persistence == 0 &&       /* non-persistent peer */
1773             kibnal_peer_active(peer)) {         /* still in peer table */
1774                 kibnal_unlink_peer_locked (peer);
1775         }
1776
1777         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1778
1779         spin_lock(&kibnal_data.kib_connd_lock);
1780
1781         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1782         wake_up (&kibnal_data.kib_connd_waitq);
1783                 
1784         spin_unlock(&kibnal_data.kib_connd_lock);
1785 }
1786
1787 void
1788 kibnal_close_conn (kib_conn_t *conn, int error)
1789 {
1790         unsigned long flags;
1791         
1792         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1793
1794         kibnal_close_conn_locked (conn, error);
1795         
1796         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1797 }
1798
1799 void
1800 kibnal_handle_early_rxs(kib_conn_t *conn)
1801 {
1802         unsigned long    flags;
1803         kib_rx_t        *rx;
1804
1805         LASSERT (!in_interrupt());
1806         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1807         
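             /* Take each early rx off the list under the global lock, but
              * drop the lock around kibnal_handle_rx(), which may queue txs
              * and take other locks. */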
1808         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1809         while (!list_empty(&conn->ibc_early_rxs)) {
1810                 rx = list_entry(conn->ibc_early_rxs.next,
1811                                 kib_rx_t, rx_list);
1812                 list_del(&rx->rx_list);
1813                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1814                 
1815                 kibnal_handle_rx(rx);
1816                 
1817                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1818         }
1819         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1820 }
1821
1822 void
1823 kibnal_conn_disconnected(kib_conn_t *conn)
1824 {
1825         LIST_HEAD        (zombies); 
1826         struct list_head *tmp;
1827         struct list_head *nxt;
1828         kib_tx_t         *tx;
1829
1830         /* I'm the connd */
1831         LASSERT (!in_interrupt());
1832         LASSERT (current == kibnal_data.kib_connd);
1833         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
1834         
1835         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
1836
1837         /* move QP to error state to make posted work items complete */
1838         kibnal_set_qp_state(conn, vv_qp_state_error);
1839
1840         spin_lock(&conn->ibc_lock);
1841
1842         /* Complete all tx descs not waiting for sends to complete.
1843          * NB we should be safe from RDMA now that the QP has changed state */
1844
1845         list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
1846                 tx = list_entry (tmp, kib_tx_t, tx_list);
1847
1848                 LASSERT (tx->tx_queued);
1849
1850                 tx->tx_status = -ECONNABORTED;
1851                 tx->tx_queued = 0;
1852                 tx->tx_waiting = 0;
1853                 
1854                 if (tx->tx_sending != 0)
1855                         continue;
1856
1857                 list_del (&tx->tx_list);
1858                 list_add (&tx->tx_list, &zombies);
1859         }
1860
1861         list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
1862                 tx = list_entry (tmp, kib_tx_t, tx_list);
1863
1864                 LASSERT (!tx->tx_queued);
1865                 LASSERT (tx->tx_waiting ||
1866                          tx->tx_sending != 0);
1867
1868                 tx->tx_status = -ECONNABORTED;
1869                 tx->tx_waiting = 0;
1870                 
1871                 if (tx->tx_sending != 0)
1872                         continue;
1873
1874                 list_del (&tx->tx_list);
1875                 list_add (&tx->tx_list, &zombies);
1876         }
1877         
1878         spin_unlock(&conn->ibc_lock);
1879
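             /* Finalise the zombies with no spinlocks held: kibnal_tx_done()
              * takes kib_tx_lock and runs lib_finalize() completions. */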
1880         while (!list_empty(&zombies)) {
1881                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1882
1883                 list_del(&tx->tx_list);
1884                 kibnal_tx_done (tx);
1885         }
1886
1887         kibnal_handle_early_rxs(conn);
1888 }
1889
1890 void
1891 kibnal_peer_connect_failed (kib_peer_t *peer, int active)
1892 {
1893         LIST_HEAD         (zombies);    /* NB initialised: list_empty() tested below */
1894         kib_tx_t         *tx;
1895         unsigned long     flags;
1896
1897         /* Only the connd creates conns => single threaded */
1898         LASSERT (!in_interrupt());
1899         LASSERT (current == kibnal_data.kib_connd);
1900         LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
1901
1902         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1903
1904         if (active) {
1905                 LASSERT (peer->ibp_connecting != 0);
1906                 peer->ibp_connecting--;
1907         } else {
1908                 LASSERT (!kibnal_peer_active(peer));
1909         }
1910         
1911         if (peer->ibp_connecting != 0) {
1912                 /* another connection attempt under way (loopback?)... */
1913                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1914                 return;
1915         }
1916
1917         if (list_empty(&peer->ibp_conns)) {
1918                 /* Say when active connection can be re-attempted */
1919                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
1920                 /* Increase reconnection interval */
1921                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
1922                                                     IBNAL_MAX_RECONNECT_INTERVAL);
1923         
1924                 /* Take peer's blocked transmits to complete with error */
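                 /* NB the list_add() + list_del_init() pair below splices the
                  * whole ibp_tx_queue onto the local 'zombies' head, i.e. it
                  * is equivalent to list_splice_init(&peer->ibp_tx_queue,
                  * &zombies): after the add, 'zombies' sits just behind the
                  * old head, which the del_init then unhooks from the ring. */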
1925                 list_add(&zombies, &peer->ibp_tx_queue);
1926                 list_del_init(&peer->ibp_tx_queue);
1927                 
1928                 if (kibnal_peer_active(peer) &&
1929                     (peer->ibp_persistence == 0)) {
1930                         /* failed connection attempt on non-persistent peer */
1931                         kibnal_unlink_peer_locked (peer);
1932                 }
1933         } else {
1934                 /* Can't have blocked transmits if there are connections */
1935                 LASSERT (list_empty(&peer->ibp_tx_queue));
1936         }
1937         
1938         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1939
1940         if (list_empty (&zombies)) 
1941                 return;
1942         
1943         CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
1944         do {
1945                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1946
1947                 list_del (&tx->tx_list);
1948                 /* complete now */
1949                 tx->tx_status = -EHOSTUNREACH;
1950                 kibnal_tx_done (tx);
1951         } while (!list_empty (&zombies));
1952 }
1953
1954 void
1955 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
1956 {
1957         static cm_reject_data_t   rej;
1958
1959         struct list_head   txs;
1960         kib_peer_t        *peer = conn->ibc_peer;
1961         kib_peer_t        *peer2;
1962         unsigned long      flags;
1963         kib_tx_t          *tx;
1964
1965         /* Only the connd creates conns => single threaded */
1966         LASSERT (!in_interrupt());
1967         LASSERT (current == kibnal_data.kib_connd);
1968         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
1969
1970         if (active) {
1971                 LASSERT (peer->ibp_connecting > 0);
1972         } else {
1973                 LASSERT (!kibnal_peer_active(peer));
1974         }
1975         
1976         PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
1977         conn->ibc_connvars = NULL;
1978
1979         if (status != 0) {
1980                 /* failed to establish connection */
1981                 switch (conn->ibc_state) {
1982                 default:
1983                         LBUG();
1984                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
1985                         /* got a connection reply but failed checks */
1986                         LASSERT (active);
1987                         memset(&rej, 0, sizeof(rej));
1988                         rej.reason = cm_rej_code_usr_rej;
1989                         cm_reject(conn->ibc_cep, &rej);
1990                         break;
1991
1992                 case IBNAL_CONN_ACTIVE_CONNECT:
1993                         LASSERT (active);
1994                         cm_cancel(conn->ibc_cep);
1995                         kibnal_pause(HZ/10);
1996                         /* cm_connect() failed immediately or
1997                          * callback returned failure */
1998                         break;
1999
2000                 case IBNAL_CONN_ACTIVE_ARP:
2001                         LASSERT (active);
2002                         /* ibat_get_ib_data() failed immediately 
2003                          * or callback returned failure */
2004                         break;
2005
2006                 case IBNAL_CONN_INIT:
2007                         break;
2008
2009                 case IBNAL_CONN_PASSIVE_WAIT:
2010                         LASSERT (!active);
2011                         /* cm_accept callback returned failure */
2012                         break;
2013                 }
2014
2015                 kibnal_peer_connect_failed(conn->ibc_peer, active);
2016                 kibnal_conn_disconnected(conn);
2017                 return;
2018         }
2019
2020         /* connection established */
2021         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2022
2023         if (active) {
2024                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2025         } else {
2026                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2027         }
2028         
2029         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2030
2031         if (!active) {
2032                 peer2 = kibnal_find_peer_locked(peer->ibp_nid);
2033                 if (peer2 != NULL) {
2034                         /* already in the peer table; swap */
2035                         conn->ibc_peer = peer2;
2036                         kibnal_peer_addref(peer2);
2037                         kibnal_peer_decref(peer);
2038                         peer = conn->ibc_peer;
2039                 } else {
2040                         /* add 'peer' to the peer table */
2041                         kibnal_peer_addref(peer);
2042                         list_add_tail(&peer->ibp_list,
2043                                       kibnal_nid2peerlist(peer->ibp_nid));
2044                 }
2045         }
2046         
2047         /* Add conn to peer's list and nuke any dangling conns from a different
2048          * peer instance... */
2049         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2050         list_add(&conn->ibc_list, &peer->ibp_conns);
2051         kibnal_close_stale_conns_locked (conn->ibc_peer,
2052                                          conn->ibc_incarnation);
2053
2054         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2055             conn->ibc_comms_error != 0 ||       /* comms error */
2056             conn->ibc_disconnect) {             /* need to disconnect */
2057                 
2058                 /* start to shut down connection */
2059                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2060
2061                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2062                 kibnal_peer_connect_failed(peer, active);
2063                 return;
2064         }
2065
2066         if (active)
2067                 peer->ibp_connecting--;
2068
2069         /* grab pending txs while I have the lock */
2070         list_add(&txs, &peer->ibp_tx_queue);
2071         list_del_init(&peer->ibp_tx_queue);
2072         
2073         /* reset reconnect interval for next attempt */
2074         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
2075         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2076
2077         /* Schedule blocked txs */
2078         spin_lock (&conn->ibc_lock);
2079         while (!list_empty (&txs)) {
2080                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2081                 list_del (&tx->tx_list);
2082
2083                 kibnal_queue_tx_locked (tx, conn);
2084         }
2085         spin_unlock (&conn->ibc_lock);
2086         kibnal_check_sends (conn);
2087
2088         /* schedule blocked rxs */
2089         kibnal_handle_early_rxs(conn);
2090 }
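
     /* Connection state machine, as driven by the kibnal_set_conn_state()
      * calls in this file (kibnal_terminate_conn, not shown here, advances
      * DISCONNECT1 to DISCONNECT2):
      *
      *   active:   INIT -> ACTIVE_ARP -> ACTIVE_CONNECT ->
      *             ACTIVE_CHECK_REPLY -> ACTIVE_RTU -> ESTABLISHED
      *   passive:  INIT -> PASSIVE_WAIT -> ESTABLISHED
      *   teardown: ESTABLISHED -> DISCONNECT1 -> DISCONNECT2 -> DISCONNECTED
      *
      * kibnal_connreq_done() is the single funnel through which both sides
      * reach, or fail to reach, ESTABLISHED. */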
2091
2092 void
2093 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2094 {
2095         static cm_dreply_data_t drep;           /* just zeroed space */
2096         
2097         kib_conn_t             *conn = (kib_conn_t *)arg;
2098         unsigned long           flags;
2099         
2100         /* CAVEAT EMPTOR: tasklet context */
2101
2102         switch (cmdata->status) {
2103         default:
2104                 LBUG();
2105                 
2106         case cm_event_disconn_request:
2107                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2108                  * IBNAL_CONN_ESTABLISHED: I start it closing
2109                  * otherwise:              it's closing anyway */
2110                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2111                 cm_cancel(conn->ibc_cep);
2112
2113                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2114                 LASSERT (!conn->ibc_disconnect);
2115                 conn->ibc_disconnect = 1;
2116
2117                 switch (conn->ibc_state) {
2118                 default:
2119                         LBUG();
2120
2121                 case IBNAL_CONN_ACTIVE_RTU:
2122                         /* kibnal_connreq_done is getting there; it'll see
2123                          * ibc_disconnect set... */
2124                         kibnal_conn_decref(conn); /* lose my ref */
2125                         break;
2126
2127                 case IBNAL_CONN_ESTABLISHED:
2128                         /* kibnal_connreq_done got there already; get
2129                          * disconnect going... */
2130                         kibnal_close_conn_locked(conn, 0);
2131                         kibnal_conn_decref(conn); /* lose my ref */
2132                         break;
2133
2134                 case IBNAL_CONN_DISCONNECT1:
2135                         /* kibnal_terminate_conn is getting there; it'll see
2136                          * ibc_disconnect set... */
2137                         kibnal_conn_decref(conn); /* lose my ref */
2138                         break;
2139
2140                 case IBNAL_CONN_DISCONNECT2:
2141                         /* kibnal_terminate_conn got there already; complete
2142                          * the disconnect.  NB kib_connd_conns takes my ref */
2143                         spin_lock(&kibnal_data.kib_connd_lock);
2144                         list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2145                         wake_up(&kibnal_data.kib_connd_waitq);
2146                         spin_unlock(&kibnal_data.kib_connd_lock);
2147                         break;
2148                 }
2149                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2150                 return;
2151                 
2152         case cm_event_disconn_timeout:
2153         case cm_event_disconn_reply:
2154                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2155                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2156                 LASSERT (!conn->ibc_disconnect);
2157                 conn->ibc_disconnect = 1;
2158
2159                 /* kibnal_terminate_conn sent the disconnect request.  
2160                  * NB kib_connd_conns takes my ref */
2161                 spin_lock(&kibnal_data.kib_connd_lock);
2162                 list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2163                 wake_up(&kibnal_data.kib_connd_waitq);
2164                 spin_unlock(&kibnal_data.kib_connd_lock);
2165
2166                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2167                 break;
2168                 
2169         case cm_event_connected:
2170         case cm_event_conn_timeout:
2171         case cm_event_conn_reject:
2172                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2173                 conn->ibc_connvars->cv_conndata = *cmdata;
2174                 
2175                 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2176                 list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2177                 wake_up(&kibnal_data.kib_connd_waitq);
2178                 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2179                 break;
2180         }
2181 }
2182
2183 void
2184 kibnal_check_passive_wait(kib_conn_t *conn)
2185 {
2186         int     rc;
2187
2188         switch (conn->ibc_connvars->cv_conndata.status) {
2189         default:
2190                 LBUG();
2191                 
2192         case cm_event_connected:
2193                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2194                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2195                 if (rc != 0)
2196                         conn->ibc_comms_error = rc;
2197                 /* connection _has_ been established; it's just that we've had
2198                  * an error immediately... */
2199                 kibnal_connreq_done(conn, 0, 0);
2200                 break;
2201                 
2202         case cm_event_conn_timeout:
2203                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2204                 break;
2205                 
2206         case cm_event_conn_reject:
2207                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2208                 break;
2209         }
2210 }
2211
2212 void
2213 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2214 {
2215         static kib_msg_t        txmsg;
2216         static kib_msg_t        rxmsg;
2217         static cm_reply_data_t  reply;
2218         static cm_reject_data_t reject;
2219
2220         kib_conn_t         *conn = NULL;
2221         int                 rc = 0;
2222         int                 rxmsgnob;
2223         kib_connvars_t     *cv;
2224         kib_peer_t         *tmp_peer;
2225         cm_return_t         cmrc;
2226         vv_return_t         vvrc;
2227         
2228         /* I'm the connd executing in thread context
2229          * No concurrency problems with static data! */
2230         LASSERT (!in_interrupt());
2231         LASSERT (current == kibnal_data.kib_connd);
2232
2233         if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
2234                 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2235                        cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
2236                 goto reject;
2237         }
2238
2239         /* copy into rxmsg to avoid alignment issues */
2240         rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2241         memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2242
2243         rc = kibnal_unpack_msg(&rxmsg, rxmsgnob);
2244         if (rc != 0) {
2245                 CERROR("Can't parse connection request: %d\n", rc);
2246                 goto reject;
2247         }
2248
2249         if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2250                 CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
2251                        rxmsg.ibm_type, rxmsg.ibm_srcnid);
2252                 goto reject;
2253         }
2254
2255         if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
2256                 CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
2257                        rxmsg.ibm_srcnid, rxmsg.ibm_dstnid);
2258                 goto reject;
2259         }
2260
2261         if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2262                 CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
2263                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth, 
2264                        IBNAL_MSG_QUEUE_SIZE);
2265                 goto reject;
2266         }
2267
2268         if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2269                 CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
2270                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
2271                        IBNAL_MSG_SIZE);
2272                 goto reject;
2273         }
2274                 
2275         if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2276                 CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
2277                        rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags, 
2278                        IBNAL_MAX_RDMA_FRAGS);
2279                 goto reject;
2280         }
2281                 
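         /* Note the asymmetry in the checks above: the queue depth must match
          * exactly (the credit protocol presumably assumes symmetric windows),
          * while max message size and max frags merely have to fit within our
          * limits.  kibnal_check_connreply() applies the same tests to our
          * CONNACK on the active side. */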
2282         conn = kibnal_create_conn(cep);
2283         if (conn == NULL) {
2284                 CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid);
2285                 goto reject;
2286         }
2287         
2288         /* assume 'rxmsg.ibm_srcnid' is a new peer */
2289         tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
2290         if (tmp_peer == NULL) {
2291                 CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
2292                 kibnal_conn_decref(conn);
2293                 conn = NULL;
2294                 goto reject;
2295         }
2296
2297         conn->ibc_peer = tmp_peer;              /* conn takes over my ref */
2298         conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2299         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2300
2301         cv = conn->ibc_connvars;
2302
2303         cv->cv_txpsn          = cmreq->cep_data.start_psn;
2304         cv->cv_remote_qpn     = cmreq->cep_data.qpn;
2305         cv->cv_path           = cmreq->path_data.path;
2306         cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
2307         // XXX                  cmreq->cep_data.retry_cnt;
2308         cv->cv_port           = cmreq->cep_data.local_port_num;
2309
2310         vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2311                              &cv->cv_path.sgid, &cv->cv_sgid_index);
2312         LASSERT (vvrc == vv_return_ok);
2313         
2314         vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2315                                cv->cv_path.pkey, &cv->cv_pkey_index);
2316         LASSERT (vvrc == vv_return_ok);
2317
2318         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2319         if (rc != 0)
2320                 goto reject;
2321
2322         rc = kibnal_post_receives(conn);
2323         if (rc != 0) {
2324                 CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid);
2325                 goto reject;
2326         }
2327
2328         rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2329         if (rc != 0)
2330                 goto reject;
2331         
2332         memset(&reply, 0, sizeof(reply));
2333         reply.qpn                 = cv->cv_local_qpn;
2334         reply.qkey                = IBNAL_QKEY;
2335         reply.start_psn           = cv->cv_rxpsn;
2336         reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2337         reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
2338         reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
2339         reply.rnr_retry_count     = cv->cv_rnr_count;
2340         reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
2341         
2342         /* setup txmsg... */
2343         memset(&txmsg, 0, sizeof(txmsg));
2344         kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, 
2345                         sizeof(txmsg.ibm_u.connparams));
2346         LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2347         txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2348         txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2349         txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2350         kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2351
2352         /* ...and copy into reply to avoid alignment issues */
2353         memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
2354
2355         kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2356         
2357         cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2358                          kibnal_cm_callback, conn);
2359
2360         if (cmrc == cm_stat_success)
2361                 return;                         /* callback has got my ref on conn */
2362
2363         /* back out state change (no callback happening) */
2364         kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2365         rc = -EIO;
2366                 
2367  reject:
2368         CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid);
2369
2370         memset(&reject, 0, sizeof(reject));
2371         reject.reason = cm_rej_code_usr_rej;
2372         cm_reject(cep, &reject);
2373
2374         if (conn != NULL) {
2375                 LASSERT (rc != 0);
2376                 kibnal_connreq_done(conn, 0, rc);
2377         } else {
2378                 cm_destroy_cep(cep);
2379         }
2380 }
2381
2382 void
2383 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2384 {
2385         cm_request_data_t  *cmreq = &data->data.request;
2386         kib_pcreq_t        *pcr;
2387         unsigned long       flags;
2388         
2389         LASSERT (arg == NULL);
2390
2391         if (data->status != cm_event_conn_request) {
2392                 CERROR("status %d is not cm_event_conn_request\n",
2393                        data->status);
2394                 return;
2395         }
2396
2397         PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2398         if (pcr == NULL) {
2399                 CERROR("Can't allocate passive connreq\n");
2400
2401                 cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */
2402                                  {.reason = cm_rej_code_no_res,}));
2403                 cm_destroy_cep(cep);
2404                 return;
2405         }
2406
2407         pcr->pcr_cep = cep;
2408         pcr->pcr_cmreq = *cmreq;
2409         
2410         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2411
2412         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2413         wake_up(&kibnal_data.kib_connd_waitq);
2414         
2415         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2416 }
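
     /* The listen callback can run in tasklet context (hence the atomic
      * allocation): the connection request is copied into a kib_pcreq_t and
      * queued on kib_connd_pcreqs, so that the connd can process it via
      * kibnal_recv_connreq() above in thread context, where it can block. */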
2417
2418
2419 void
2420 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, 
2421                                 void *arg)
2422 {
2423         /* CAVEAT EMPTOR: tasklet context */
2424         kib_conn_t       *conn = (kib_conn_t *)arg;
2425         kib_connvars_t   *cv = conn->ibc_connvars;
2426         unsigned long     flags;
2427
2428         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2429         cv->cv_conndata = *cd;
2430
2431         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2432         /* connd takes my ref */
2433         list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2434         wake_up(&kibnal_data.kib_connd_waitq);
2435         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2436 }
2437
2438 void
2439 kibnal_connect_conn (kib_conn_t *conn)
2440 {
2441         static cm_request_data_t  cmreq;
2442         static kib_msg_t          msg;
2443         
2444         kib_connvars_t           *cv = conn->ibc_connvars;
2445         kib_peer_t               *peer = conn->ibc_peer;
2446         cm_return_t               cmrc;
2447         
2448         /* Only called by connd => statics OK */
2449         LASSERT (!in_interrupt());
2450         LASSERT (current == kibnal_data.kib_connd);
2451         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2452
2453         memset(&cmreq, 0, sizeof(cmreq));
2454         
2455         cmreq.sid = IBNAL_SERVICE_NUMBER;
2456
2457         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2458         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2459         cmreq.cep_data.retry_cnt            = IBNAL_RETRY_CNT;
2460         cmreq.cep_data.rtr_retry_cnt        = IBNAL_RNR_CNT;
2461         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2462         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2463         // XXX ack_timeout?
2464         // offered_resp_res
2465         // offered_initiator_depth
2466
2467         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2468         cmreq.path_data.path        = cv->cv_path;
2469         
2470         /* setup msg... */
2471         memset(&msg, 0, sizeof(msg));
2472         kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2473         LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2474         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2475         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2476         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2477         kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0);
2478
2479         /* ...and copy into cmreq to avoid alignment issues */
2480         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2481         
2482         CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
2483
2484         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2485         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2486
2487         cmrc = cm_connect(conn->ibc_cep, &cmreq, 
2488                           kibnal_active_connect_callback, conn);
2489         if (cmrc == cm_stat_success) {
2490                 CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
2491                        peer->ibp_nid);
2492                 return;
2493         }
2494
2495         CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2496         kibnal_conn_decref(conn);       /* drop callback's ref */
2497         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2498 }
2499
2500 void
2501 kibnal_check_connreply (kib_conn_t *conn)
2502 {
2503         static cm_rtu_data_t  rtu;
2504         static kib_msg_t      msg;
2505
2506         kib_connvars_t   *cv = conn->ibc_connvars;
2507         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2508         kib_peer_t       *peer = conn->ibc_peer;
2509         int               msgnob;
2510         cm_return_t       cmrc;
2511         cm_cep_handle_t   cep;
2512         unsigned long     flags;
2513         int               rc;
2514
2515         /* Only called by connd => statics OK */
2516         LASSERT (!in_interrupt());
2517         LASSERT (current == kibnal_data.kib_connd);
2518         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2519
2520         if (cv->cv_conndata.status == cm_event_conn_reply) {
2521                 cv->cv_remote_qpn = reply->qpn;
2522                 cv->cv_txpsn      = reply->start_psn;
2523                 // XXX              reply->targ_ack_delay;
2524                 cv->cv_rnr_count  = reply->rnr_retry_count;
2525
2526                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2527
2528                 /* copy into msg to avoid alignment issues */
2529                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2530                 memcpy(&msg, &reply->priv_data, msgnob);
2531
2532                 rc = kibnal_unpack_msg(&msg, msgnob);
2533                 if (rc != 0) {
2534                         CERROR("Can't unpack reply from "LPX64"\n",
2535                                peer->ibp_nid);
2536                         kibnal_connreq_done(conn, 1, rc);
2537                         return;
2538                 }
2539
2540                 if (msg.ibm_type != IBNAL_MSG_CONNACK) {
2541                         CERROR("Unexpected message type %d from "LPX64"\n",
2542                                msg.ibm_type, peer->ibp_nid);
2543                         kibnal_connreq_done(conn, 1, -EPROTO);
2544                         return;
2545                 }
2546
2547                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2548                         CERROR(LPX64" has incompatible queue depth %d (%d wanted)\n",
2549                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth,
2550                                IBNAL_MSG_QUEUE_SIZE);
2551                         kibnal_connreq_done(conn, 1, -EPROTO);
2552                         return;
2553                 }
2554                 
2555                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2556                         CERROR(LPX64" max message size %d too big (%d max)\n",
2557                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size, 
2558                                IBNAL_MSG_SIZE);
2559                         kibnal_connreq_done(conn, 1, -EPROTO);
2560                         return;
2561                 }
2562
2563                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2564                         CERROR(LPX64" max frags %d too big (%d max)\n",
2565                                peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags, 
2566                                IBNAL_MAX_RDMA_FRAGS);
2567                         kibnal_connreq_done(conn, 1, -EPROTO);
2568                         return;
2569                 }
2570                 
2571                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2572                 rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
2573                       msg.ibm_dststamp != kibnal_data.kib_incarnation) ?
2574                      -ESTALE : 0;
2575                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2576                 if (rc != 0) {
2577                         CERROR("Stale connection reply from "LPX64"\n",
2578                                peer->ibp_nid);
2579                         kibnal_connreq_done(conn, 1, rc);
2580                         return;
2581                 }
2582
2583                 conn->ibc_incarnation = msg.ibm_srcstamp;
2584                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2585                 
2586                 rc = kibnal_post_receives(conn);
2587                 if (rc != 0) {
2588                         CERROR("Can't post receives for "LPX64"\n",
2589                                peer->ibp_nid);
2590                         kibnal_connreq_done(conn, 1, rc);
2591                         return;
2592                 }
2593                 
2594                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2595                 if (rc != 0) {
2596                         kibnal_connreq_done(conn, 1, rc);
2597                         return;
2598                 }
2599                 
2600                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2601                 if (rc != 0) {
2602                         kibnal_connreq_done(conn, 1, rc);
2603                         return;
2604                 }
2605                 
2606                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2607                 kibnal_conn_addref(conn);       /* ++for CM callback */
2608                 
2609                 memset(&rtu, 0, sizeof(rtu));
2610                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2611                                  kibnal_cm_callback, conn);
2612                 if (cmrc == cm_stat_success) {
2613                         /* Now I'm racing with disconnect signalled by
2614                          * kibnal_cm_callback */
2615                         kibnal_connreq_done(conn, 1, 0);
2616                         return;
2617                 }
2618
2619                 CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2620                 /* Back out of RTU: no callback coming */
2621                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2622                 kibnal_conn_decref(conn);
2623                 kibnal_connreq_done(conn, 1, -EIO);
2624                 return;
2625         }
2626
2627         if (cv->cv_conndata.status == cm_event_conn_reject) {
2628
2629                 if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
2630                         CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
2631                                cv->cv_conndata.data.reject.reason);
2632                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2633                         return;
2634                 }
2635
2636                 CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);
2637
2638                 cep = cm_create_cep(cm_cep_transp_rc);
2639                 if (cep == NULL) {
2640                         CERROR("Can't create new CEP\n");
2641                         kibnal_connreq_done(conn, 1, -ENOMEM);
2642                         return;
2643                 }
2644
2645                 cmrc = cm_cancel(conn->ibc_cep);
2646                 LASSERT (cmrc == cm_stat_success);
2647                 cmrc = cm_destroy_cep(conn->ibc_cep);
2648                 LASSERT (cmrc == cm_stat_success);
2649
2650                 conn->ibc_cep = cep;
2651
2652                 /* retry connect */
2653                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2654                 kibnal_connect_conn(conn);
2655                 return;
2656         }
2657
2658         CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
2659                cv->cv_conndata.status);
2660         kibnal_connreq_done(conn, 1, -ECONNABORTED);
2661 }
2662
2663 void
2664 kibnal_send_connreq (kib_conn_t *conn)
2665 {
2666         kib_peer_t           *peer = conn->ibc_peer;
2667         kib_connvars_t       *cv = conn->ibc_connvars;
2668         ibat_arp_data_t      *arp = &cv->cv_arp;
2669         ib_path_record_v2_t  *path = &cv->cv_path;
2670         vv_return_t           vvrc;
2671         int                   rc;
2672
2673         /* Only called by connd => statics OK */
2674         LASSERT (!in_interrupt());
2675         LASSERT (current == kibnal_data.kib_connd);
2676         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2677         
2678         if (cv->cv_arprc != ibat_stat_ok) {
2679                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: %d\n", peer->ibp_nid,
2680                        HIPQUAD(peer->ibp_ip), cv->cv_arprc);
2681                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2682                 return;
2683         }
2684
2685         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
2686                 CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
2687
2688                 *path = *arp->primary_path;
2689
2690                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
2691                                          &cv->cv_port);
2692                 LASSERT (vvrc == vv_return_ok);
2693
2694                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2695                                      &path->sgid, &cv->cv_sgid_index);
2696                 LASSERT (vvrc == vv_return_ok);
2697
2698                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2699                                        path->pkey, &cv->cv_pkey_index);
2700                 LASSERT (vvrc == vv_return_ok);
2701
2702                 path->mtu = IBNAL_IB_MTU;
2703
2704         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
2705                 CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
2706                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2707
2708                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
2709                 cv->cv_sgid_index = IBNAL_SGID_IDX;
2710                 cv->cv_port = arp->local_port_num;
2711
2712                 memset(path, 0, sizeof(*path));
2713
2714                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
2715                                          &path->sgid);
2716                 LASSERT (vvrc == vv_return_ok);
2717
2718                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
2719                                          &path->slid);
2720                 LASSERT (vvrc == vv_return_ok);
2721
2722                 path->dgid          = arp->gid;
2723                 path->sl            = IBNAL_SERVICE_LEVEL;
2724                 path->dlid          = arp->lid;
2725                 path->mtu           = IBNAL_IB_MTU;
2726                 path->rate          = IBNAL_STATIC_RATE;
2727                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
2728                 path->pkey          = IBNAL_PKEY;
2729                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
2730         } else {
2731                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n", 
2732                        peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2733                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2734                 return;
2735         }
2736
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                kibnal_connreq_done(conn, 1, rc);
                return;                 /* mustn't connect with an unusable QP */
        }
2741
2742         /* do the actual connection request */
2743         kibnal_connect_conn(conn);
2744 }
2745
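/* ARP completion runs in tasklet context, where I can't block or make
 * CM/verbs calls, so I just stash the result in the connvars and hand the
 * conn over to the connd, which calls kibnal_send_connreq() in thread
 * context. */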
2746 void
2747 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
2748 {
2749         /* CAVEAT EMPTOR: tasklet context */
2750         kib_conn_t      *conn = (kib_conn_t *)arg;
2751         kib_peer_t      *peer = conn->ibc_peer;
2752         unsigned long    flags;
2753
2754         CDEBUG(arprc == ibat_stat_ok ? D_NET : D_ERROR,
2755                "Arp "LPX64"@%u.%u.%u.%u rc %d LID %s PATH %s\n",
2756                peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc,
2757                (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
2758                (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
2759         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2760
2761         conn->ibc_connvars->cv_arprc = arprc;
2762         if (arprc == ibat_stat_ok)
2763                 conn->ibc_connvars->cv_arp = *arp_data;
2764         
2765         /* connd takes over my ref on conn */
2766         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2767         
2768         list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
2769         wake_up(&kibnal_data.kib_connd_waitq);
2770         
2771         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2772 }
2773
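/* Start a connection attempt to 'peer': create the CM endpoint and the
 * conn, then kick off an asynchronous ARP lookup on the peer's IP.  Note
 * the ref accounting: on ibat_stat_pending the callback inherits my ref
 * on the conn, but on any immediate return I handle the result
 * synchronously and must drop the ref myself. */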
2774 void
2775 kibnal_arp_peer (kib_peer_t *peer)
2776 {
2777         cm_cep_handle_t  cep;
2778         kib_conn_t      *conn;
2779         int              ibatrc;
2780
2781         /* Only the connd does this (i.e. single threaded) */
2782         LASSERT (current == kibnal_data.kib_connd);
2783         LASSERT (peer->ibp_connecting != 0);
2784
2785         cep = cm_create_cep(cm_cep_transp_rc);
2786         if (cep == NULL) {
2787                 CERROR ("Can't create cep for conn->"LPX64"\n",
2788                         peer->ibp_nid);
2789                 kibnal_peer_connect_failed(peer, 1);
2790                 return;
2791         }
2792
2793         conn = kibnal_create_conn(cep);
2794         if (conn == NULL) {
2795                 CERROR ("Can't allocate conn->"LPX64"\n",
2796                         peer->ibp_nid);
2797                 cm_destroy_cep(cep);
2798                 kibnal_peer_connect_failed(peer, 1);
2799                 return;
2800         }
2801
2802         conn->ibc_peer = peer;
2803         kibnal_peer_addref(peer);
2804
2805         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2806
2807         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, 
2808                                   ibat_paths_primary,
2809                                   &conn->ibc_connvars->cv_arp, 
2810                                   kibnal_arp_callback, conn, 0);
2811         CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
2812         switch (ibatrc) {
2813         default:
2814                 LBUG();
2815                 
2816         case ibat_stat_pending:
2817                 /* NB callback has my ref on conn */
2818                 break;
2819                 
2820         case ibat_stat_ok:
2821                 /* Immediate return (ARP cache hit) == no callback. */
2822                 conn->ibc_connvars->cv_arprc = ibat_stat_ok;
2823                 kibnal_send_connreq(conn);
2824                 kibnal_conn_decref(conn);
2825                 break;
2826
2827         case ibat_stat_error:
2828         case ibat_stat_timeout:
2829         case ibat_stat_not_found:
2830                 CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", peer->ibp_nid,
2831                        HIPQUAD(peer->ibp_ip), ibatrc);
2832                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2833                 kibnal_conn_decref(conn);
2834                 break;
2835         }
2836 }
2837
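/* Return non-zero if any tx on 'conn' has passed its deadline; both txs
 * still queued for sending and active txs awaiting completions or peer
 * responses are checked under ibc_lock. */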
2838 int
2839 kibnal_conn_timed_out (kib_conn_t *conn)
2840 {
2841         kib_tx_t          *tx;
2842         struct list_head  *ttmp;
2843
2844         spin_lock(&conn->ibc_lock);
2845
2846         list_for_each (ttmp, &conn->ibc_tx_queue) {
2847                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2848
2849                 LASSERT (tx->tx_queued);
2850
2851                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2852                         spin_unlock(&conn->ibc_lock);
2853                         return 1;
2854                 }
2855         }
2856
2857         list_for_each (ttmp, &conn->ibc_active_txs) {
2858                 tx = list_entry (ttmp, kib_tx_t, tx_list);
2859
2860                 LASSERT (!tx->tx_queued);
2861                 LASSERT (tx->tx_waiting ||
2862                          tx->tx_sending != 0);
2863
2864                 if (time_after_eq (jiffies, tx->tx_deadline)) {
2865                         spin_unlock(&conn->ibc_lock);
2866                         return 1;
2867                 }
2868         }
2869
2870         spin_unlock(&conn->ibc_lock);
2871         return 0;
2872 }
2873
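/* Scan peer hash bucket 'idx' for connections with timed-out txs.  The
 * scan restarts from the top of the bucket whenever the global lock has
 * to be dropped to close a connection. */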
2874 void
2875 kibnal_check_conns (int idx)
2876 {
2877         struct list_head  *peers = &kibnal_data.kib_peers[idx];
2878         struct list_head  *ptmp;
2879         kib_peer_t        *peer;
2880         kib_conn_t        *conn;
2881         struct list_head  *ctmp;
2882         unsigned long      flags;
2883
2884  again:
        /* NB. I expect to look at all the peers and find no rdmas timing
         * out, so a shared (read) lock suffices while I take a look... */
2888         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2889
2890         list_for_each (ptmp, peers) {
2891                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2892
2893                 list_for_each (ctmp, &peer->ibp_conns) {
2894                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
2895
2896                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2897
2898                         /* In case we have enough credits to return via a
2899                          * NOOP, but there were no non-blocking tx descs
2900                          * free to do it last time... */
2901                         kibnal_check_sends(conn);
2902
2903                         if (!kibnal_conn_timed_out(conn))
2904                                 continue;
2905
2906                         /* Handle timeout by closing the whole connection.  We
2907                          * can only be sure RDMA activity has ceased once the
2908                          * QP has been modified. */
2909                         
2910                         kibnal_conn_addref(conn); /* 1 ref for me... */
2911
2912                         read_unlock_irqrestore(&kibnal_data.kib_global_lock, 
2913                                                flags);
2914
2915                         CERROR("Timed out RDMA with "LPX64"\n",
2916                                peer->ibp_nid);
2917
2918                         kibnal_close_conn (conn, -ETIMEDOUT);
2919                         kibnal_conn_decref(conn); /* ...until here */
2920
2921                         /* start again now I've dropped the lock */
2922                         goto again;
2923                 }
2924         }
2925
2926         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2927 }
2928
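/* Complete the disconnect handshake for 'conn' in thread context.  If the
 * CM has already delivered its disconnect callback there is nothing left
 * to wait for; otherwise I issue an active disconnect and either wait for
 * the CM (DISCONNECT2) or cancel and clean up directly.  The static
 * 'dreq' is safe because only the single connd thread runs here. */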
2929 void
2930 kibnal_disconnect_conn (kib_conn_t *conn)
2931 {
2932         static cm_drequest_data_t dreq;         /* just for the space */
2933         
2934         cm_return_t    cmrc;
2935         unsigned long  flags;
2936
2937         LASSERT (!in_interrupt());
2938         LASSERT (current == kibnal_data.kib_connd);
2939         
2940         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2941
2942         if (conn->ibc_disconnect) {
2943                 /* Had the CM callback already */
2944                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
2945                                         flags);
2946                 kibnal_conn_disconnected(conn);
2947                 return;
2948         }
2949                 
2950         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
2951
2952         /* active disconnect */
2953         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
2954         if (cmrc == cm_stat_success) {
2955                 /* waiting for CM */
2956                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
2957                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2958                 return;
2959         }
2960
2961         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2962
2963         cm_cancel(conn->ibc_cep);
2964         kibnal_pause(HZ/10);
2965
2966         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
2967                 kibnal_conn_decref(conn);
2968         
2969         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
2970         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
2971
2972         kibnal_conn_disconnected(conn);
2973 }
2974
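/* The connection daemon: a single thread that reaps zombie conns,
 * services passive connection requests, starts ARP for new peers, steps
 * active and passive connections through their state machines, and
 * periodically sweeps the peer table for RDMA timeouts. */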
2975 int
2976 kibnal_connd (void *arg)
2977 {
2978         wait_queue_t       wait;
2979         unsigned long      flags;
2980         kib_pcreq_t       *pcr;
2981         kib_conn_t        *conn;
2982         kib_peer_t        *peer;
2983         int                timeout;
2984         int                i;
2985         int                dropped_lock;
2986         int                peer_index = 0;
2987         unsigned long      deadline = jiffies;
2988         
2989         kportal_daemonize ("kibnal_connd");
2990         kportal_blockallsigs ();
2991
2992         init_waitqueue_entry (&wait, current);
2993         kibnal_data.kib_connd = current;
2994
2995         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2996
2997         while (!kibnal_data.kib_shutdown) {
2998
2999                 dropped_lock = 0;
3000
3001                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3002                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
3003                                            kib_conn_t, ibc_list);
3004                         list_del (&conn->ibc_list);
3005                         
3006                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3007                         dropped_lock = 1;
3008
3009                         kibnal_destroy_conn(conn);
3010
3011                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3012                 }
3013
3014                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3015                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3016                                          kib_pcreq_t, pcr_list);
3017                         list_del(&pcr->pcr_list);
3018                         
3019                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3020                         dropped_lock = 1;
3021
3022                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3023                         PORTAL_FREE(pcr, sizeof(*pcr));
3024
3025                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3026                 }
3027                         
3028                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3029                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3030                                            kib_peer_t, ibp_connd_list);
3031                         
3032                         list_del_init (&peer->ibp_connd_list);
3033                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3034                         dropped_lock = 1;
3035
3036                         kibnal_arp_peer (peer);
3037                         kibnal_peer_decref (peer);
3038
3039                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3040                 }
3041
3042                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3043                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3044                                            kib_conn_t, ibc_list);
3045                         list_del (&conn->ibc_list);
3046                         
3047                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3048                         dropped_lock = 1;
3049
3050                         switch (conn->ibc_state) {
3051                         default:
3052                                 LBUG();
3053                                 
3054                         case IBNAL_CONN_ACTIVE_ARP:
3055                                 kibnal_send_connreq(conn);
3056                                 break;
3057
3058                         case IBNAL_CONN_ACTIVE_CONNECT:
3059                                 kibnal_check_connreply(conn);
3060                                 break;
3061
3062                         case IBNAL_CONN_PASSIVE_WAIT:
3063                                 kibnal_check_passive_wait(conn);
3064                                 break;
3065
3066                         case IBNAL_CONN_DISCONNECT1:
3067                         case IBNAL_CONN_DISCONNECT2:
3068                                 kibnal_disconnect_conn(conn);
3069                                 break;
3070                         }
3071                         kibnal_conn_decref(conn);
3072
3073                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3074                 }
3075
3076                 /* careful with the jiffy wrap... */
3077                 timeout = (int)(deadline - jiffies);
3078                 if (timeout <= 0) {
3079                         const int n = 4;
3080                         const int p = 1;
3081                         int       chunk = kibnal_data.kib_peer_hash_size;
3082                         
3083                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3084                         dropped_lock = 1;
3085
3086                         /* Time to check for RDMA timeouts on a few more
3087                          * peers: I do checks every 'p' seconds on a
3088                          * proportion of the peer table and I need to check
3089                          * every connection 'n' times within a timeout
3090                          * interval, to ensure I detect a timeout on any
3091                          * connection within (n+1)/n times the timeout
3092                          * interval. */
3093
3094                         if (kibnal_tunables.kib_io_timeout > n * p)
3095                                 chunk = (chunk * n * p) / 
3096                                         kibnal_tunables.kib_io_timeout;
3097                         if (chunk == 0)
3098                                 chunk = 1;
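                        /* e.g. a 101-bucket peer table with a 60s
                         * timeout gives chunk = (101 * 4 * 1) / 60 == 6
                         * buckets scanned on each 1 second pass */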
3099
3100                         for (i = 0; i < chunk; i++) {
3101                                 kibnal_check_conns (peer_index);
3102                                 peer_index = (peer_index + 1) % 
3103                                              kibnal_data.kib_peer_hash_size;
3104                         }
3105
3106                         deadline += p * HZ;
3107                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3108                 }
3109
3110                 if (dropped_lock)
3111                         continue;
3112                 
                /* Nothing to do: sleep for 'timeout' jiffies */
3114                 set_current_state (TASK_INTERRUPTIBLE);
3115                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3116                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3117
3118                 schedule_timeout (timeout);
3119
3120                 set_current_state (TASK_RUNNING);
3121                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3122                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3123         }
3124
3125         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3126
3127         kibnal_thread_fini ();
3128         return (0);
3129 }
3130
3131 void 
3132 kibnal_async_callback(vv_event_record_t ev)
3133 {
3134         CERROR("type: %d, port: %d, data: "LPX64"\n", 
3135                ev.event_type, ev.port_num, ev.type.data);
3136 }
3137
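/* Completion event dispatch: just note that the CQ needs attention and
 * wake a scheduler; all actual polling happens in thread context in
 * kibnal_scheduler() below. */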
3138 void
3139 kibnal_cq_callback (unsigned long unused_context)
3140 {
3141         unsigned long    flags;
3142
3143         CDEBUG(D_NET, "!!\n");
3144
3145         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3146         kibnal_data.kib_ready = 1;
3147         wake_up(&kibnal_data.kib_sched_waitq);
3148         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3149 }
3150
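/* CQ polling thread.  kib_checking_cq lets exactly one scheduler poll the
 * shared CQ at a time; on finding it empty the poller re-arms the
 * completion notification, and on finding work it wakes a peer thread to
 * keep polling while it handles the completion.  The RX sequence number
 * is snapshotted while still "owning" the CQ, so receives are finalised
 * in completion order. */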
3151 int
3152 kibnal_scheduler(void *arg)
3153 {
3154         long            id = (long)arg;
3155         wait_queue_t    wait;
3156         char            name[16];
3157         vv_wc_t         wc;
3158         vv_return_t     vvrc;
3159         vv_return_t     vvrc2;
3160         unsigned long   flags;
3161         kib_rx_t       *rx;
3162         __u64           rxseq = 0;
3163         int             busy_loops = 0;
3164
3165         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3166         kportal_daemonize(name);
3167         kportal_blockallsigs();
3168
3169         init_waitqueue_entry(&wait, current);
3170
3171         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3172
3173         while (!kibnal_data.kib_shutdown) {
3174                 if (busy_loops++ >= IBNAL_RESCHED) {
3175                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3176                                                flags);
3177
3178                         our_cond_resched();
3179                         busy_loops = 0;
3180                         
3181                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3182                 }
3183
3184                 if (kibnal_data.kib_ready &&
3185                     !kibnal_data.kib_checking_cq) {
3186                         /* take ownership of completion polling */
3187                         kibnal_data.kib_checking_cq = 1;
3188                         /* Assume I'll exhaust the CQ */
3189                         kibnal_data.kib_ready = 0;
3190                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, 
3191                                                flags);
3192                         
3193                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca, 
3194                                                       kibnal_data.kib_cq, &wc);
3195                         if (vvrc == vv_return_err_cq_empty) {
3196                                 vvrc2 = vv_request_completion_notification(
3197                                         kibnal_data.kib_hca, 
3198                                         kibnal_data.kib_cq, 
3199                                         vv_next_solicit_unsolicit_event);
3200                                 LASSERT (vvrc2 == vv_return_ok);
3201                         }
3202
3203                         if (vvrc == vv_return_ok &&
3204                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3205                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3206
3207                                 /* Grab the RX sequence number NOW before
3208                                  * anyone else can get an RX completion */
3209                                 rxseq = rx->rx_conn->ibc_rxseq++;
3210                         }
3211
3212                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3213                         /* give up ownership of completion polling */
3214                         kibnal_data.kib_checking_cq = 0;
3215
3216                         if (vvrc == vv_return_err_cq_empty)
3217                                 continue;
3218
3219                         LASSERT (vvrc == vv_return_ok);
3220                         /* Assume there's more: get another scheduler to check
3221                          * while I handle this completion... */
3222
3223                         kibnal_data.kib_ready = 1;
3224                         wake_up(&kibnal_data.kib_sched_waitq);
3225
3226                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3227                                                flags);
3228
3229                         switch (kibnal_wreqid2type(wc.wr_id)) {
3230                         case IBNAL_WID_RX:
3231                                 kibnal_rx_complete(
3232                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3233                                         wc.completion_status,
3234                                         wc.num_bytes_transfered,
3235                                         rxseq);
3236                                 break;
3237
3238                         case IBNAL_WID_TX:
3239                                 kibnal_tx_complete(
3240                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3241                                         wc.completion_status);
3242                                 break;
3243
3244                         case IBNAL_WID_RDMA:
3245                                 /* We only get RDMA completion notification if
3246                                  * it fails.  So we just ignore them completely
3247                                  * because...
3248                                  *
3249                                  * 1) If an RDMA fails, all subsequent work
3250                                  * items, including the final SEND will fail
3251                                  * too, so I'm still guaranteed to notice that
3252                                  * this connection is hosed.
3253                                  *
3254                                  * 2) It's positively dangerous to look inside
3255                                  * the tx descriptor obtained from an RDMA work
3256                                  * item.  As soon as I drop the kib_sched_lock,
3257                                  * I give a scheduler on another CPU a chance
3258                                  * to get the final SEND completion, so the tx
3259                                  * descriptor can get freed as I inspect it. */
3260                                 CERROR ("RDMA failed: %d\n", 
3261                                         wc.completion_status);
3262                                 break;
3263
3264                         default:
3265                                 LBUG();
3266                         }
3267                         
3268                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3269                         continue;
3270                 }
3271
3272                 /* Nothing to do; sleep... */
3273
3274                 set_current_state(TASK_INTERRUPTIBLE);
3275                 add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3276                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3277                                        flags);
3278
3279                 schedule();
3280
3281                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3282                 set_current_state(TASK_RUNNING);
3283                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3284         }
3285
3286         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3287
3288         kibnal_thread_fini();
3289         return (0);
3290 }
3291
3292
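/* Method table hooking this NAL into the portals library */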
3293 lib_nal_t kibnal_lib = {
3294         .libnal_data = &kibnal_data,      /* NAL private data */
3295         .libnal_send = kibnal_send,
3296         .libnal_send_pages = kibnal_send_pages,
3297         .libnal_recv = kibnal_recv,
3298         .libnal_recv_pages = kibnal_recv_pages,
3299         .libnal_dist = kibnal_dist
3300 };