Whamcloud - gitweb
- removed trailing spaces and converted tabs.
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd_cb.c
index c13c4c0..7881b49 100644 (file)
@@ -90,8 +90,8 @@ kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
         }
 #else
         if (tx->tx_nfrags != 0) {
-                dma_unmap_sg(net->ibn_dev->ibd_cmid->device->dma_device,
-                             tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+                kiblnd_dma_unmap_sg(net->ibn_dev->ibd_cmid->device,
+                                    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
                 tx->tx_nfrags = 0;
         }
 #endif
@@ -376,6 +376,10 @@ kiblnd_handle_rx (kib_rx_t *rx)
 
                 conn->ibc_credits += credits;
 
+                /* This ensures the credit taken by NOOP can be returned */
+                if (msg->ibm_type == IBLND_MSG_NOOP)
+                        conn->ibc_outstanding_credits++;
+
                 spin_unlock(&conn->ibc_lock);
                 kiblnd_check_sends(conn);
         }
@@ -389,7 +393,10 @@ kiblnd_handle_rx (kib_rx_t *rx)
                 break;
 
         case IBLND_MSG_NOOP:
-                post_credit = IBLND_POSTRX_PEER_CREDIT;
+                if (credits != 0) /* credit already posted */
+                        post_credit = IBLND_POSTRX_NO_CREDIT;
+                else              /* a keepalive NOOP */
+                        post_credit = IBLND_POSTRX_PEER_CREDIT;
                 break;
 
         case IBLND_MSG_IMMEDIATE:
@@ -498,7 +505,7 @@ kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
         LASSERT (net != NULL);
         LASSERT (rx->rx_nob < 0);               /* was posted */
         rx->rx_nob = 0;                         /* isn't now */
-        
+
         if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
                 goto ignore;
 
@@ -566,7 +573,7 @@ kiblnd_kvaddr_to_page (unsigned long vaddr)
                 LASSERT (page != NULL);
                 return page;
         }
-#if CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
         if (vaddr >= PKMAP_BASE &&
             vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
                 /* No highmem pages only used for bulk (kiov) I/O */
@@ -639,14 +646,17 @@ kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
         tx->tx_nfrags = sg - tx->tx_frags;
         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 
-        rd->rd_nfrags = dma_map_sg(net->ibn_dev->ibd_cmid->device->dma_device,
-                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+        rd->rd_nfrags = kiblnd_dma_map_sg(net->ibn_dev->ibd_cmid->device,
+                                          tx->tx_frags, tx->tx_nfrags,
+                                          tx->tx_dmadir);
         rd->rd_key    = (rd != tx->tx_rd) ? 
                         net->ibn_dev->ibd_mr->rkey : net->ibn_dev->ibd_mr->lkey;
 
         for (i = 0; i < rd->rd_nfrags; i++) {
-                rd->rd_frags[i].rf_nob  = sg_dma_len(&tx->tx_frags[i]);
-                rd->rd_frags[i].rf_addr = sg_dma_address(&tx->tx_frags[i]);
+                rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
+                        net->ibn_dev->ibd_cmid->device, &tx->tx_frags[i]);
+                rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+                        net->ibn_dev->ibd_cmid->device, &tx->tx_frags[i]);
         }
         
         return 0;
@@ -697,14 +707,16 @@ kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
         tx->tx_nfrags = sg - tx->tx_frags;
         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 
-        rd->rd_nfrags = dma_map_sg(net->ibn_dev->ibd_cmid->device->dma_device,
-                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+        rd->rd_nfrags = kiblnd_dma_map_sg(net->ibn_dev->ibd_cmid->device,
+                                          tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
         rd->rd_key    = (rd != tx->tx_rd) ? 
                         net->ibn_dev->ibd_mr->rkey : net->ibn_dev->ibd_mr->lkey;
 
         for (i = 0; i < tx->tx_nfrags; i++) {
-                rd->rd_frags[i].rf_nob  = sg_dma_len(&tx->tx_frags[i]);
-                rd->rd_frags[i].rf_addr = sg_dma_address(&tx->tx_frags[i]);
+                rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
+                        net->ibn_dev->ibd_cmid->device, &tx->tx_frags[i]);
+                rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+                        net->ibn_dev->ibd_cmid->device, &tx->tx_frags[i]);
 #if 0
                 CDEBUG(D_WARNING,"frag[%d]: "LPX64" for %d\n",
                        i, rd->rd_frags[i].rf_addr, rd->rd_frags[i].rf_nob);
@@ -882,10 +894,7 @@ kiblnd_check_sends (kib_conn_t *conn)
                 conn->ibc_reserved_credits--;
         }
 
-        if (list_empty(&conn->ibc_tx_queue) &&
-            list_empty(&conn->ibc_tx_queue_nocred) &&
-            (conn->ibc_outstanding_credits >= IBLND_CREDIT_HIGHWATER ||
-             kiblnd_send_keepalive(conn))) {
+        if (kiblnd_send_noop(conn)) {
                 spin_unlock(&conn->ibc_lock);
 
                 tx = kiblnd_get_idle_tx(ni);
@@ -899,13 +908,17 @@ kiblnd_check_sends (kib_conn_t *conn)
         }
 
         for (;;) {
-                if (!list_empty (&conn->ibc_tx_queue_nocred)) {
-                        tx = list_entry (conn->ibc_tx_queue_nocred.next, 
-                                         kib_tx_t, tx_list);
+                if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                        tx = list_entry(conn->ibc_tx_queue_nocred.next, 
+                                        kib_tx_t, tx_list);
                         consume_cred = 0;
-                } else if (!list_empty (&conn->ibc_tx_queue)) {
-                        tx = list_entry (conn->ibc_tx_queue.next,
-                                         kib_tx_t, tx_list);
+                } else if (!list_empty(&conn->ibc_tx_noops)) {
+                        tx = list_entry(conn->ibc_tx_noops.next,
+                                        kib_tx_t, tx_list);
+                        consume_cred = 1;
+                } else if (!list_empty(&conn->ibc_tx_queue)) {
+                        tx = list_entry(conn->ibc_tx_queue.next,
+                                        kib_tx_t, tx_list);
                         consume_cred = 1;
                 } else {
                         /* nothing to send right now */
@@ -934,27 +947,25 @@ kiblnd_check_sends (kib_conn_t *conn)
                         if (conn->ibc_credits == 0) {   /* no credits */
                                 CDEBUG(D_NET, "%s: no credits\n",
                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                                break;
+                                break; /* NB ibc_tx_queue_nocred checked */
                         }
 
-                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                            conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                        /* Last credit reserved for NOOP */
+                        if (conn->ibc_credits == 1 &&
+                            tx->tx_msg->ibm_type != IBLND_MSG_NOOP) {
                                 CDEBUG(D_NET, "%s: not using last credit\n",
                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                                break;
+                                break; /* NB ibc_tx_noops checked */
                         }
                 }
 
-                list_del (&tx->tx_list);
+                list_del(&tx->tx_list);
                 tx->tx_queued = 0;
 
                 /* NB don't drop ibc_lock before bumping tx_sending */
 
                 if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP &&
-                    (!list_empty(&conn->ibc_tx_queue) ||
-                     !list_empty(&conn->ibc_tx_queue_nocred) ||
-                     (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER &&
-                      !kiblnd_send_keepalive(conn)))) {
+                    !kiblnd_send_noop(conn)) {
                         /* redundant NOOP */
                         spin_unlock(&conn->ibc_lock);
                         kiblnd_tx_done(ni, tx);
@@ -1247,7 +1258,7 @@ kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type,
                         dstfrag++;
                         dstidx++;
                 }
-                
+
                 tx->tx_nwrq++;
         }
 
@@ -1299,6 +1310,9 @@ kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
                 break;
 
         case IBLND_MSG_NOOP:
+                q = &conn->ibc_tx_noops;
+                break;
+
         case IBLND_MSG_IMMEDIATE:
                 q = &conn->ibc_tx_queue;
                 break;
@@ -1321,9 +1335,12 @@ void
 kiblnd_connect_peer (kib_peer_t *peer)
 {
         struct rdma_cm_id *cmid;
-        struct sockaddr_in sockaddr;
+        kib_net_t         *net = peer->ibp_ni->ni_data;
+        struct sockaddr_in srcaddr;
+        struct sockaddr_in dstaddr;
         int                rc;
 
+        LASSERT (net != NULL);
         LASSERT (peer->ibp_connecting > 0);
 
         cmid = rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP);
@@ -1334,14 +1351,20 @@ kiblnd_connect_peer (kib_peer_t *peer)
                 goto failed;
         }
 
-        memset(&sockaddr, 0, sizeof(sockaddr));
-        sockaddr.sin_family = AF_INET;
-        sockaddr.sin_port = htons(*kiblnd_tunables.kib_service);
-        sockaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+        memset(&srcaddr, 0, sizeof(srcaddr));
+        srcaddr.sin_family = AF_INET;
+        srcaddr.sin_addr.s_addr = htonl(net->ibn_dev->ibd_ifip);
+
+        memset(&dstaddr, 0, sizeof(dstaddr));
+        dstaddr.sin_family = AF_INET;
+        dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+        dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
 
         kiblnd_peer_addref(peer);               /* cmid's ref */
 
-        rc = rdma_resolve_addr(cmid, NULL, (struct sockaddr *)&sockaddr,
+        rc = rdma_resolve_addr(cmid,
+                               (struct sockaddr *)&srcaddr,
+                               (struct sockaddr *)&dstaddr,
                                *kiblnd_tunables.kib_timeout * 1000);
         if (rc == 0)
                 return;
@@ -1453,6 +1476,9 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
         LASSERT (peer->ibp_connecting == 0);
         peer->ibp_connecting = 1;
 
+        /* always called with a ref on ni, which prevents ni being shut down */
+        LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
         list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
 
         kiblnd_peer_addref(peer);
@@ -1846,7 +1872,7 @@ kiblnd_peer_notify (kib_peer_t *peer)
         time_t        last_alive = 0;
         int           error = 0;
         unsigned long flags;
-        
+
         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
         if (list_empty(&peer->ibp_conns) &&
@@ -1855,14 +1881,14 @@ kiblnd_peer_notify (kib_peer_t *peer)
             peer->ibp_error != 0) {
                 error = peer->ibp_error;
                 peer->ibp_error = 0;
-                
+
                 last_alive = cfs_time_current_sec() -
                              cfs_duration_sec(cfs_time_current() -
                                               peer->ibp_last_alive);
         }
-        
+
         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-        
+
         if (error != 0)
                 lnet_notify(peer->ibp_ni,
                             peer->ibp_nid, 0, last_alive);
@@ -1889,6 +1915,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                 return; /* already being handled  */
 
         if (error == 0 &&
+            list_empty(&conn->ibc_tx_noops) &&
             list_empty(&conn->ibc_tx_queue) &&
             list_empty(&conn->ibc_tx_queue_rsrvd) &&
             list_empty(&conn->ibc_tx_queue_nocred) &&
@@ -1896,9 +1923,10 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                 CDEBUG(D_NET, "closing conn to %s\n", 
                        libcfs_nid2str(peer->ibp_nid));
         } else {
-                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s\n",
+                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s%s\n",
                        libcfs_nid2str(peer->ibp_nid), error,
                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                       list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
                        list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
                        list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
@@ -2013,6 +2041,7 @@ kiblnd_finalise_conn (kib_conn_t *conn)
         /* Complete all tx descs not waiting for sends to complete.
          * NB we should be safe from RDMA now that the QP has changed state */
 
+        kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
@@ -2100,7 +2129,7 @@ kiblnd_connreq_done(kib_conn_t *conn, int status)
 
         if (status != 0) {
                 /* failed to establish connection */
-                kiblnd_peer_connect_failed(conn->ibc_peer, active, status);
+                kiblnd_peer_connect_failed(peer, active, status);
                 kiblnd_finalise_conn(conn);
                 return;
         }
@@ -2121,22 +2150,25 @@ kiblnd_connreq_done(kib_conn_t *conn, int status)
         else
                 peer->ibp_accepting--;
 
-        kiblnd_close_stale_conns_locked(conn->ibc_peer,
-                                        conn->ibc_incarnation);
+        kiblnd_close_stale_conns_locked(peer, conn->ibc_incarnation);
+
+        /* grab pending txs while I have the lock */
+        list_add(&txs, &peer->ibp_tx_queue);
+        list_del_init(&peer->ibp_tx_queue);
 
         if (!kiblnd_peer_active(peer) ||        /* peer has been deleted */
             conn->ibc_comms_error != 0) {       /* error has happened already */
+                lnet_ni_t *ni = peer->ibp_ni;
 
                 /* start to shut down connection */
                 kiblnd_close_conn_locked(conn, -ECONNABORTED);
                 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+                kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
                 return;
         }
 
-        /* grab pending txs while I have the lock */
-        list_add(&txs, &peer->ibp_tx_queue);
-        list_del_init(&peer->ibp_tx_queue);
-
         write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
         /* Schedule blocked txs */
@@ -2237,8 +2269,8 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
         if (reqmsg->ibm_u.connparams.ibcp_max_frags != IBLND_MAX_RDMA_FRAGS) {
                 CERROR("Can't accept %s: incompatible max_frags %d (%d wanted)\n",
                        libcfs_nid2str(nid),
-                       reqmsg->ibm_u.connparams.ibcp_queue_depth,
-                       IBLND_MSG_QUEUE_SIZE);
+                       reqmsg->ibm_u.connparams.ibcp_max_frags,
+                       IBLND_MAX_RDMA_FRAGS);
                 goto failed;
         }
 
@@ -2297,6 +2329,9 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
                 LASSERT (peer->ibp_accepting == 0);
                 peer->ibp_accepting = 1;
 
+                /* I have a ref on ni that prevents it being shut down */
+                LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
                 kiblnd_peer_addref(peer);
                 list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
 
@@ -2314,8 +2349,8 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
         /* conn now "owns" cmid, so I return success from here on to ensure the
          * CM callback doesn't destroy cmid. */
 
-        conn->ibc_incarnation = reqmsg->ibm_srcstamp;
-        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+        conn->ibc_credits          = IBLND_MSG_QUEUE_SIZE;
         conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                  <= IBLND_RX_MSGS);
@@ -2495,8 +2530,8 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
         if (msg->ibm_u.connparams.ibcp_max_frags != IBLND_MAX_RDMA_FRAGS) {
                 CERROR("%s has incompatible max_frags %d (%d wanted)\n",
                        libcfs_nid2str(peer->ibp_nid),
-                       msg->ibm_u.connparams.ibcp_queue_depth,
-                       IBLND_MSG_QUEUE_SIZE);
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBLND_MAX_RDMA_FRAGS);
                 rc = -EPROTO;
                 goto failed;
         }
@@ -2524,8 +2559,8 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
                 goto failed;
         }
 
-        conn->ibc_incarnation = msg->ibm_srcstamp;
-        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_incarnation      = msg->ibm_srcstamp;
+        conn->ibc_credits          = IBLND_MSG_QUEUE_SIZE;
         conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                  <= IBLND_RX_MSGS);
@@ -2610,8 +2645,8 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
        case RDMA_CM_EVENT_CONNECT_REQUEST:
                 /* destroy cmid on failure */
                rc = kiblnd_passive_connect(cmid, 
-                                            event->private_data,
-                                            event->private_data_len);
+                                            (void *)KIBLND_CONN_PARAM(event),
+                                            KIBLND_CONN_PARAM_LEN(event));
                 CDEBUG(D_NET, "connreq: %d\n", rc);
                 return rc;
                 
@@ -2703,8 +2738,8 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 
                 case IBLND_CONN_ACTIVE_CONNECT:
                         kiblnd_rejected(conn, event->status,
-                                        event->private_data,
-                                        event->private_data_len);
+                                        (void *)KIBLND_CONN_PARAM(event),
+                                        KIBLND_CONN_PARAM_LEN(event));
                         break;
                 }
                 kiblnd_conn_decref(conn);
@@ -2726,8 +2761,8 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                         CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         kiblnd_check_connreply(conn,
-                                               event->private_data,
-                                               event->private_data_len);
+                                               (void *)KIBLND_CONN_PARAM(event),
+                                               KIBLND_CONN_PARAM_LEN(event));
                         break;
                 }
                 /* net keeps its ref on conn! */
@@ -2745,13 +2780,14 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                 kiblnd_conn_decref(conn);
                 return 0;
 
-       case RDMA_CM_EVENT_DEVICE_REMOVAL:
-                LCONSOLE_ERROR(0x131, "Received notification of device removal\n"
-                        "Please shutdown LNET to allow this to proceed\n");
+        case RDMA_CM_EVENT_DEVICE_REMOVAL:
+                LCONSOLE_ERROR_MSG(0x131,
+                                   "Received notification of device removal\n"
+                                   "Please shutdown LNET to allow this to proceed\n");
                 /* Can't remove network from underneath LNET for now, so I have
                  * to ignore this */
-               return 0;
-       }
+                return 0;
+        }
 }
 
 int
@@ -2771,7 +2807,7 @@ kiblnd_check_txs (kib_conn_t *conn, struct list_head *txs)
                 } else {
                         LASSERT (!tx->tx_queued);
                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
-                }                        
+                }
 
                 if (time_after_eq (jiffies, tx->tx_deadline)) {
                         timed_out = 1;
@@ -2787,6 +2823,7 @@ int
 kiblnd_conn_timed_out (kib_conn_t *conn)
 {
         return  kiblnd_check_txs(conn, &conn->ibc_tx_queue) ||
+                kiblnd_check_txs(conn, &conn->ibc_tx_noops) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
                 kiblnd_check_txs(conn, &conn->ibc_active_txs);