Whamcloud - gitweb
Land b_release_1_4_3 onto HEAD (20050619_0305)
[fs/lustre-release.git] / lnet / klnds / ralnd / ralnd_cb.c
index 38f1b77..dd910ce 100644 (file)
@@ -77,7 +77,7 @@ kranal_schedule_conn(kra_conn_t *conn)
         if (!conn->rac_scheduled) {
                 kranal_conn_addref(conn);       /* +1 ref for scheduler */
                 conn->rac_scheduled = 1;
-                list_add_tail(&conn->rac_schedlist, &dev->rad_connq);
+                list_add_tail(&conn->rac_schedlist, &dev->rad_ready_conns);
                 wake_up(&dev->rad_waitq);
         }
 
@@ -296,7 +296,7 @@ kranal_setup_rdma_buffer (kra_tx_t *tx, int niov,
         return kranal_setup_virt_buffer(tx, niov, iov, offset, nob);
 }
 
-void
+int
 kranal_map_buffer (kra_tx_t *tx)
 {
         kra_conn_t     *conn = tx->tx_conn;
@@ -313,23 +313,45 @@ kranal_map_buffer (kra_tx_t *tx)
         case RANAL_BUF_IMMEDIATE:
         case RANAL_BUF_PHYS_MAPPED:
         case RANAL_BUF_VIRT_MAPPED:
-                break;
+                return 0;
 
         case RANAL_BUF_PHYS_UNMAPPED:
                 rrc = RapkRegisterPhys(dev->rad_handle,
                                        tx->tx_phys, tx->tx_phys_npages,
                                        &tx->tx_map_key);
-                LASSERT (rrc == RAP_SUCCESS);
+                if (rrc != RAP_SUCCESS) {
+                        CERROR ("Can't map %d pages: dev %d "
+                                "phys %u pp %u, virt %u nob %lu\n",
+                                tx->tx_phys_npages, dev->rad_id, 
+                                dev->rad_nphysmap, dev->rad_nppphysmap,
+                                dev->rad_nvirtmap, dev->rad_nobvirtmap);
+                        return -ENOMEM; /* assume insufficient resources */
+                }
+
+                dev->rad_nphysmap++;
+                dev->rad_nppphysmap += tx->tx_phys_npages;
+
                 tx->tx_buftype = RANAL_BUF_PHYS_MAPPED;
-                break;
+                return 0;
 
         case RANAL_BUF_VIRT_UNMAPPED:
                 rrc = RapkRegisterMemory(dev->rad_handle,
                                          tx->tx_buffer, tx->tx_nob,
                                          &tx->tx_map_key);
-                LASSERT (rrc == RAP_SUCCESS);
+                if (rrc != RAP_SUCCESS) {
+                        CERROR ("Can't map %d bytes: dev %d "
+                                "phys %u pp %u, virt %u nob %lu\n",
+                                tx->tx_nob, dev->rad_id, 
+                                dev->rad_nphysmap, dev->rad_nppphysmap,
+                                dev->rad_nvirtmap, dev->rad_nobvirtmap);
+                        return -ENOMEM; /* assume insufficient resources */
+                }
+
+                dev->rad_nvirtmap++;
+                dev->rad_nobvirtmap += tx->tx_nob;
+
                 tx->tx_buftype = RANAL_BUF_VIRT_MAPPED;
-                break;
+                return 0;
         }
 }
 
@@ -356,6 +378,10 @@ kranal_unmap_buffer (kra_tx_t *tx)
                 rrc = RapkDeregisterMemory(dev->rad_handle, NULL,
                                            &tx->tx_map_key);
                 LASSERT (rrc == RAP_SUCCESS);
+
+                dev->rad_nphysmap--;
+                dev->rad_nppphysmap -= tx->tx_phys_npages;
+
                 tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED;
                 break;
 
@@ -366,6 +392,10 @@ kranal_unmap_buffer (kra_tx_t *tx)
                 rrc = RapkDeregisterMemory(dev->rad_handle, tx->tx_buffer,
                                            &tx->tx_map_key);
                 LASSERT (rrc == RAP_SUCCESS);
+
+                dev->rad_nvirtmap--;
+                dev->rad_nobvirtmap -= tx->tx_nob;
+
                 tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED;
                 break;
         }
@@ -630,7 +660,7 @@ kranal_do_send (lib_nal_t    *nal,
                         break;                  /* RDMA not expected */
                 }
 
-                /* Incoming message consistent with immediate reply? */
+                /* Incoming message consistent with RDMA? */
                 if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) {
                         CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
                                nid, conn->rac_rxmsg->ram_type);
@@ -650,7 +680,12 @@ kranal_do_send (lib_nal_t    *nal,
                 tx->tx_conn = conn;
                 tx->tx_libmsg[0] = libmsg;
 
-                kranal_map_buffer(tx);
+                rc = kranal_map_buffer(tx);
+                if (rc != 0) {
+                        kranal_tx_done(tx, rc);
+                        return PTL_FAIL;
+                }
+
                 kranal_rdma(tx, RANAL_MSG_GET_DONE,
                             &conn->rac_rxmsg->ram_u.get.ragm_desc, nob,
                             conn->rac_rxmsg->ram_u.get.ragm_cookie);
@@ -843,7 +878,11 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                 }
 
                 tx->tx_conn = conn;
-                kranal_map_buffer(tx);
+                rc = kranal_map_buffer(tx);
+                if (rc != 0) {
+                        kranal_tx_done(tx, rc);
+                        return PTL_FAIL;
+                }
 
                 tx->tx_msg.ram_u.putack.rapam_src_cookie =
                         conn->rac_rxmsg->ram_u.putreq.raprm_cookie;
@@ -1397,6 +1436,10 @@ kranal_sendmsg(kra_conn_t *conn, kra_msg_t *msg,
                 return 0;
 
         case RAP_NOT_DONE:
+                if (time_after_eq(jiffies,
+                                  conn->rac_last_tx + conn->rac_keepalive*HZ))
+                        CWARN("EAGAIN sending %02x (idle %lu secs)\n",
+                               msg->ram_type, (jiffies - conn->rac_last_tx)/HZ);
                 return -EAGAIN;
         }
 }
@@ -1411,7 +1454,7 @@ kranal_process_fmaq (kra_conn_t *conn)
         int           expect_reply;
 
         /* NB 1. kranal_sendmsg() may fail if I'm out of credits right now.
-         *       However I will be rescheduled some by an FMA completion event
+         *       However I will be rescheduled by an FMA completion event
          *       when I eventually get some.
          * NB 2. Sampling rac_state here races with setting it elsewhere.
          *       But it doesn't matter if I try to send a "real" message just
@@ -1466,7 +1509,9 @@ kranal_process_fmaq (kra_conn_t *conn)
 
                 if (time_after_eq(jiffies,
                                   conn->rac_last_tx + conn->rac_keepalive * HZ)) {
-                        CDEBUG(D_NET, "sending NOOP (idle)\n");
+                        CDEBUG(D_NET, "sending NOOP -> "LPX64" (%p idle %lu(%ld))\n",
+                               conn->rac_peer->rap_nid, conn,
+                               (jiffies - conn->rac_last_tx)/HZ, conn->rac_keepalive);
                         kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP);
                         kranal_sendmsg(conn, &conn->rac_msg, NULL, 0);
                 }
@@ -1489,7 +1534,6 @@ kranal_process_fmaq (kra_conn_t *conn)
         case RANAL_MSG_IMMEDIATE:
                 rc = kranal_sendmsg(conn, &tx->tx_msg,
                                     tx->tx_buffer, tx->tx_nob);
-                expect_reply = 0;
                 break;
 
         case RANAL_MSG_PUT_NAK:
@@ -1497,13 +1541,16 @@ kranal_process_fmaq (kra_conn_t *conn)
         case RANAL_MSG_GET_NAK:
         case RANAL_MSG_GET_DONE:
                 rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
-                expect_reply = 0;
                 break;
 
         case RANAL_MSG_PUT_REQ:
+                rc = kranal_map_buffer(tx);
+                LASSERT (rc != -EAGAIN);
+                if (rc != 0)
+                        break;
+
                 tx->tx_msg.ram_u.putreq.raprm_cookie = tx->tx_cookie;
                 rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
-                kranal_map_buffer(tx);
                 expect_reply = 1;
                 break;
 
@@ -1513,7 +1560,11 @@ kranal_process_fmaq (kra_conn_t *conn)
                 break;
 
         case RANAL_MSG_GET_REQ:
-                kranal_map_buffer(tx);
+                rc = kranal_map_buffer(tx);
+                LASSERT (rc != -EAGAIN);
+                if (rc != 0)
+                        break;
+
                 tx->tx_msg.ram_u.get.ragm_cookie = tx->tx_cookie;
                 tx->tx_msg.ram_u.get.ragm_desc.rard_key = tx->tx_map_key;
                 tx->tx_msg.ram_u.get.ragm_desc.rard_addr.AddressBits =
@@ -1534,10 +1585,8 @@ kranal_process_fmaq (kra_conn_t *conn)
                 return;
         }
 
-        LASSERT (rc == 0);
-
-        if (!expect_reply) {
-                kranal_tx_done(tx, 0);
+        if (!expect_reply || rc != 0) {
+                kranal_tx_done(tx, rc);
         } else {
                 /* LASSERT(current) above ensures this doesn't race with reply
                  * processing */
@@ -1829,12 +1878,14 @@ void
 kranal_complete_closed_conn (kra_conn_t *conn)
 {
         kra_tx_t   *tx;
+        int         nfma;
+        int         nreplies;
 
         LASSERT (conn->rac_state == RANAL_CONN_CLOSED);
         LASSERT (list_empty(&conn->rac_list));
         LASSERT (list_empty(&conn->rac_hashlist));
 
-        while (!list_empty(&conn->rac_fmaq)) {
+        for (nfma = 0; !list_empty(&conn->rac_fmaq); nfma++) {
                 tx = list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list);
 
                 list_del(&tx->tx_list);
@@ -1843,23 +1894,54 @@ kranal_complete_closed_conn (kra_conn_t *conn)
 
         LASSERT (list_empty(&conn->rac_rdmaq));
 
-        while (!list_empty(&conn->rac_replyq)) {
+        for (nreplies = 0; !list_empty(&conn->rac_replyq); nreplies++) {
                 tx = list_entry(conn->rac_replyq.next, kra_tx_t, tx_list);
 
                 list_del(&tx->tx_list);
                 kranal_tx_done(tx, -ECONNABORTED);
         }
+
+        CWARN("Closed conn %p -> "LPX64": nmsg %d nreplies %d\n",
+              conn, conn->rac_peer->rap_nid, nfma, nreplies);
+}
+
+int
+kranal_process_new_conn (kra_conn_t *conn)
+{
+        RAP_RETURN   rrc;
+        
+        rrc = RapkCompleteSync(conn->rac_rihandle, 1);
+        if (rrc == RAP_SUCCESS)
+                return 0;
+
+        LASSERT (rrc == RAP_NOT_DONE);
+        if (!time_after_eq(jiffies, conn->rac_last_tx + 
+                           conn->rac_timeout * HZ))
+                return -EAGAIN;
+
+        /* Too late */
+        rrc = RapkCompleteSync(conn->rac_rihandle, 0);
+        LASSERT (rrc == RAP_SUCCESS);
+        return -ETIMEDOUT;
 }
 
 int
 kranal_scheduler (void *arg)
 {
-        kra_device_t   *dev = (kra_device_t *)arg;
-        wait_queue_t    wait;
-        char            name[16];
-        kra_conn_t     *conn;
-        unsigned long   flags;
-        int             busy_loops = 0;
+        kra_device_t     *dev = (kra_device_t *)arg;
+        wait_queue_t      wait;
+        char              name[16];
+        kra_conn_t       *conn;
+        unsigned long     flags;
+        unsigned long     deadline;
+        unsigned long     soonest;
+        int               nsoonest;
+        long              timeout;
+        struct list_head *tmp;
+        struct list_head *nxt;
+        int               rc;
+        int               dropped_lock;
+        int               busy_loops = 0;
 
         snprintf(name, sizeof(name), "kranal_sd_%02d", dev->rad_idx);
         kportal_daemonize(name);
@@ -1882,10 +1964,13 @@ kranal_scheduler (void *arg)
                         spin_lock_irqsave(&dev->rad_lock, flags);
                 }
 
+                dropped_lock = 0;
+
                 if (dev->rad_ready) {
                         /* Device callback fired since I last checked it */
                         dev->rad_ready = 0;
                         spin_unlock_irqrestore(&dev->rad_lock, flags);
+                        dropped_lock = 1;
 
                         kranal_check_rdma_cq(dev);
                         kranal_check_fma_cq(dev);
@@ -1893,14 +1978,14 @@ kranal_scheduler (void *arg)
                         spin_lock_irqsave(&dev->rad_lock, flags);
                 }
 
-                if (!list_empty(&dev->rad_connq)) {
-                        /* Connection needs attention */
-                        conn = list_entry(dev->rad_connq.next,
-                                          kra_conn_t, rac_schedlist);
+                list_for_each_safe(tmp, nxt, &dev->rad_ready_conns) {
+                        conn = list_entry(tmp, kra_conn_t, rac_schedlist);
+
                         list_del_init(&conn->rac_schedlist);
                         LASSERT (conn->rac_scheduled);
                         conn->rac_scheduled = 0;
                         spin_unlock_irqrestore(&dev->rad_lock, flags);
+                        dropped_lock = 1;
 
                         kranal_check_fma_rx(conn);
                         kranal_process_fmaq(conn);
@@ -1909,26 +1994,71 @@ kranal_scheduler (void *arg)
                                 kranal_complete_closed_conn(conn);
 
                         kranal_conn_decref(conn);
-
                         spin_lock_irqsave(&dev->rad_lock, flags);
-                        continue;
                 }
 
-                /* recheck device callback fired before sleeping */
-                if (dev->rad_ready)
+                nsoonest = 0;
+                soonest = jiffies;
+
+                list_for_each_safe(tmp, nxt, &dev->rad_new_conns) {
+                        conn = list_entry(tmp, kra_conn_t, rac_schedlist);
+                        
+                        deadline = conn->rac_last_tx + conn->rac_keepalive;
+                        if (time_after_eq(jiffies, deadline)) {
+                                /* Time to process this new conn */
+                                spin_unlock_irqrestore(&dev->rad_lock, flags);
+                                dropped_lock = 1;
+
+                                rc = kranal_process_new_conn(conn);
+                                if (rc != -EAGAIN) {
+                                        /* All done with this conn */
+                                        spin_lock_irqsave(&dev->rad_lock, flags);
+                                        list_del_init(&conn->rac_schedlist);
+                                        spin_unlock_irqrestore(&dev->rad_lock, flags);
+
+                                        kranal_conn_decref(conn);
+                                        spin_lock_irqsave(&dev->rad_lock, flags);
+                                        continue;
+                                }
+
+                                /* retry with exponential backoff until HZ */
+                                if (conn->rac_keepalive == 0)
+                                        conn->rac_keepalive = 1;
+                                else if (conn->rac_keepalive <= HZ)
+                                        conn->rac_keepalive *= 2;
+                                else
+                                        conn->rac_keepalive += HZ;
+                                
+                                deadline = conn->rac_last_tx + conn->rac_keepalive;
+                                spin_lock_irqsave(&dev->rad_lock, flags);
+                        }
+
+                        /* Does this conn need attention soonest? */
+                        if (nsoonest++ == 0 ||
+                            !time_after_eq(deadline, soonest))
+                                soonest = deadline;
+                }
+
+                if (dropped_lock)               /* may sleep iff I didn't drop the lock */
                         continue;
 
-                add_wait_queue(&dev->rad_waitq, &wait);
                 set_current_state(TASK_INTERRUPTIBLE);
-
+                add_wait_queue(&dev->rad_waitq, &wait);
                 spin_unlock_irqrestore(&dev->rad_lock, flags);
 
-                busy_loops = 0;
-                schedule();
+                if (nsoonest == 0) {
+                        busy_loops = 0;
+                        schedule();
+                } else {
+                        timeout = (long)(soonest - jiffies);
+                        if (timeout > 0) {
+                                busy_loops = 0;
+                                schedule_timeout(timeout);
+                        }
+                }
 
-                set_current_state(TASK_RUNNING);
                 remove_wait_queue(&dev->rad_waitq, &wait);
-
+                set_current_state(TASK_RUNNING);
                 spin_lock_irqsave(&dev->rad_lock, flags);
         }