Whamcloud - gitweb
- merge 0.7rc1 from b_devel to HEAD (20030612 merge point)
[fs/lustre-release.git] / lustre / ptlrpc / events.c
index b819c29..167898a 100644 (file)
@@ -1,7 +1,7 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
  *
  *   This file is part of Lustre, http://www.lustre.org.
  *
 
 #define DEBUG_SUBSYSTEM S_RPC
 
+#ifdef __KERNEL__
 #include <linux/module.h>
-#include <linux/obd_support.h>
+#else
+#include <liblustre.h>
+#endif
+#include <linux/obd_class.h>
 #include <linux/lustre_net.h>
 
-ptl_handle_eq_t request_out_eq, reply_in_eq, reply_out_eq, bulk_source_eq,
-        bulk_sink_eq;
-static const ptl_handle_ni_t *socknal_nip = NULL, *qswnal_nip = NULL;
+struct ptlrpc_ni  ptlrpc_interfaces[NAL_MAX_NR]; /* per-interface state; slots
+                                                  * are filled in order by
+                                                  * ptlrpc_init_portals() */
+int               ptlrpc_ninterfaces;            /* how many leading slots of
+                                                  * ptlrpc_interfaces[] are
+                                                  * initialised */
 
 /*
- *  Free the packet when it has gone out
+ *  Event callback for the request-out event queue.
+ *
+ *  The MD's user_ptr is the ptlrpc_request that was sent.  The only event
+ *  expected is PTL_EVENT_SENT; anything else is logged and hits LBUG().
+ *  Once the request has gone out, the reference held on it for the send
+ *  is dropped with ptlrpc_req_finished().  Finishes with RETURN(1).
  */
 static int request_out_callback(ptl_event_t *ev)
 {
+        struct ptlrpc_request *req = ev->mem_desc.user_ptr;
         ENTRY;
 
+        /* requests always contiguous: the MD must be neither an IOV nor a
+         * KIOV descriptor */
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
+
         if (ev->type != PTL_EVENT_SENT) {
                 // XXX make sure we understand all events, including ACK's
                 CERROR("Unknown event %d\n", ev->type);
                 LBUG();
         }
 
+        /* this balances the atomic_inc in ptl_send_rpc() */
+        ptlrpc_req_finished(req);
         RETURN(1);
 }
 
-
 /*
  *  Free the packet when it has gone out
  */
 static int reply_out_callback(ptl_event_t *ev)
 {
+        struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+        unsigned long          flags;
         ENTRY;
 
+        /* replies always contiguous */
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
+
         if (ev->type == PTL_EVENT_SENT) {
+                /* NB don't even know if this is the current reply! In fact
+                 * we can't touch any state in the request, since the
+                 * service handler zeros it on each incoming request. */
                 OBD_FREE(ev->mem_desc.start, ev->mem_desc.length);
+        } else if (ev->type == PTL_EVENT_ACK) {
+                LASSERT(req->rq_want_ack);
+                spin_lock_irqsave(&req->rq_lock, flags);
+                req->rq_want_ack = 0;
+                wake_up(&req->rq_wait_for_rep);
+                spin_unlock_irqrestore(&req->rq_lock, flags);
         } else {
-                // XXX make sure we understand all events, including ACK's
+                // XXX make sure we understand all events
                 CERROR("Unknown event %d\n", ev->type);
                 LBUG();
         }
@@ -68,11 +90,15 @@ static int reply_out_callback(ptl_event_t *ev)
 /*
  * Wake up the thread waiting for the reply once it comes in.
  */
-static int reply_in_callback(ptl_event_t *ev)
+int reply_in_callback(ptl_event_t *ev)
 {
         struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+        unsigned long flags;
         ENTRY;
 
+        /* replies always contiguous */
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
+
         if (req->rq_xid == 0x5a5a5a5a5a5a5a5a) {
                 CERROR("Reply received for freed request!  Probably a missing "
                        "ptlrpc_abort()\n");
@@ -81,15 +107,25 @@ static int reply_in_callback(ptl_event_t *ev)
 
         if (req->rq_xid != ev->match_bits) {
                 CERROR("Reply packet for wrong request\n");
-                LBUG(); 
+                LBUG();
         }
 
         if (ev->type == PTL_EVENT_PUT) {
-                req->rq_repmsg = ev->mem_desc.start + ev->offset;
-                barrier();
-                wake_up(&req->rq_wait_for_rep);
+                /* Bug 1190: should handle non-zero offset as a protocol
+                 * error  */
+                LASSERT (ev->offset == 0);
+
+                spin_lock_irqsave (&req->rq_lock, flags);
+                LASSERT (req->rq_receiving_reply);
+                req->rq_receiving_reply = 0;
+                req->rq_replied = 1;
+                if (req->rq_set != NULL)
+                        wake_up(&req->rq_set->set_waitq);
+                else
+                        wake_up(&req->rq_wait_for_rep);
+                spin_unlock_irqrestore (&req->rq_lock, flags);
         } else {
-                // XXX make sure we understand all events, including ACK's
+                // XXX make sure we understand all events, including ACKs
                 CERROR("Unknown event %d\n", ev->type);
                 LBUG();
         }
@@ -99,122 +135,363 @@ static int reply_in_callback(ptl_event_t *ev)
 
 int request_in_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_service *service = ev->mem_desc.user_ptr;
+        struct ptlrpc_request_buffer_desc *rqbd = ev->mem_desc.user_ptr;
+        struct ptlrpc_srv_ni  *srv_ni = rqbd->rqbd_srv_ni;
+        struct ptlrpc_service *service = srv_ni->sni_service;
+
+        /* Callback for a request arriving in a request buffer.  The MD's
+         * user_ptr is the request buffer descriptor; from it we reach the
+         * per-interface service state and the service itself.  After the
+         * sanity checks below, the buffer's bookkeeping is updated and the
+         * service's waitq is woken.  Always returns 0.
+         *
+         * requests always contiguous */
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
+        /* we only enable puts */
+        LASSERT(ev->type == PTL_EVENT_PUT);
+        LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0);
+        LASSERT(atomic_read(&rqbd->rqbd_refcount) > 0);
 
         if (ev->rlength != ev->mlength)
                 CERROR("Warning: Possibly truncated rpc (%d/%d)\n",
                        ev->mlength, ev->rlength);
 
-        if (ev->type == PTL_EVENT_PUT)
-                wake_up(&service->srv_waitq);
-        else
-                CERROR("Unexpected event type: %d\n", ev->type);
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) {
+                /* This is the last request to be received into this
+                 * request buffer.  We don't bump the refcount, since the
+                 * thread servicing this event is effectively taking over
+                 * portals' reference.
+                 */
+                /* NB ev->unlinked_me.nal_idx is not set properly in a callback,
+                 * so only the cookie is compared against the buffer's ME */
+                LASSERT(ev->unlinked_me.cookie==rqbd->rqbd_me_h.cookie);
+
+                /* we're off the air: one fewer buffer is receiving on this
+                 * interface */
+                /* we'll probably start dropping packets in portals soon
+                 * (the count of receiving buffers has just reached zero) */
+                if (atomic_dec_and_test(&srv_ni->sni_nrqbds_receiving))
+                        CERROR("All request buffers busy\n");
+        } else {
+                /* +1 ref for service thread */
+                atomic_inc(&rqbd->rqbd_refcount);
+        }
+
+        wake_up(&service->srv_waitq);
 
         return 0;
 }
 
-static int bulk_source_callback(ptl_event_t *ev)
+static int bulk_put_source_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_bulk_page *bulk = ev->mem_desc.user_ptr;
-        struct ptlrpc_bulk_desc *desc = bulk->b_desc;
+        unsigned long            flags;
+        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
         ENTRY;
 
-        if (ev->type == PTL_EVENT_SENT) {
-                CDEBUG(D_NET, "got SENT event\n");
-        } else if (ev->type == PTL_EVENT_ACK) {
-                CDEBUG(D_NET, "got ACK event\n");
-                if (bulk->b_cb != NULL)
-                        bulk->b_cb(bulk);
-                if (atomic_dec_and_test(&desc->b_pages_remaining)) {
-                        desc->b_flags |= PTL_BULK_FL_SENT;
-                        wake_up(&desc->b_waitq);
-                        if (desc->b_cb != NULL)
-                                desc->b_cb(desc, desc->b_cb_data);
-                }
-        } else {
-                CERROR("Unexpected event type!\n");
-                LBUG();
+        CDEBUG(D_NET, "got %s event %d\n",
+               (ev->type == PTL_EVENT_SENT) ? "SENT" :
+               (ev->type == PTL_EVENT_ACK)  ? "ACK"  : "UNEXPECTED", ev->type);
+
+        LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_ACK);
+
+        /* Callback for the sending (source) end of a bulk PUT; the MD's
+         * user_ptr is the bulk descriptor.  SENT and ACK are the only
+         * events accepted.  Each event consumes one unit of
+         * bd_callback_count under bd_lock; when the count reaches zero
+         * the descriptor is marked complete and bd_waitq is woken.
+         * Finishes with RETURN(0).
+         *
+         * 1 fragment for each page always */
+        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        
+        LASSERT(desc->bd_callback_count > 0 &&
+                desc->bd_callback_count <= 2);
+        
+        if (--desc->bd_callback_count == 0) { /* no more events outstanding */
+                desc->bd_network_rw = 0;
+                desc->bd_complete = 1;
+                wake_up(&desc->bd_waitq);
         }
 
-        RETURN(1);
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
+        RETURN(0);
 }
 
-static int bulk_sink_callback(ptl_event_t *ev)
+/* Debug aid: copies of the most recent event and bulk descriptor that
+ * failed the consistency check in bulk_put_sink_callback(), kept so they
+ * can be inspected afterwards. */
+struct ptlrpc_bulk_desc ptlrpc_bad_desc;
+ptl_event_t ptlrpc_bad_event;
+
+/*
+ * Event callback for the receiving (sink) end of a bulk PUT.
+ *
+ * The MD's user_ptr is the bulk descriptor.  Only PTL_EVENT_PUT on a KIOV
+ * memory descriptor is expected.
+ *
+ * If the descriptor looks freed (0x5a poison in bd_page_count) or no
+ * longer agrees with the MD (fragment count or start address), the event
+ * and descriptor are copied to ptlrpc_bad_event / ptlrpc_bad_desc, logged,
+ * and the callback finishes with RETURN(0) without touching the
+ * descriptor any further.
+ *
+ * Otherwise the bulk is marked complete under bd_lock, the waiter for the
+ * owning request (its request set if it has one, else the request's own
+ * reply waitq) is woken, and the callback finishes with RETURN(1).
+ */
+static int bulk_put_sink_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_bulk_page *bulk = ev->mem_desc.user_ptr;
-        struct ptlrpc_bulk_desc *desc = bulk->b_desc;
+        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
+        unsigned long            flags;
         ENTRY;
 
-        if (ev->type == PTL_EVENT_PUT) {
-                if (bulk->b_buf != ev->mem_desc.start + ev->offset)
-                        CERROR("bulkbuf != mem_desc -- why?\n");
-                if (bulk->b_cb != NULL)
-                        bulk->b_cb(bulk);
-                if (atomic_dec_and_test(&desc->b_pages_remaining)) {
-                        desc->b_flags |= PTL_BULK_FL_RCVD;
-                        wake_up(&desc->b_waitq);
-                        if (desc->b_cb != NULL)
-                                desc->b_cb(desc, desc->b_cb_data);
-                }
-        } else {
-                CERROR("Unexpected event type!\n");
-                LBUG();
+        LASSERT(ev->type == PTL_EVENT_PUT);
+
+        /* used iovs */
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
+                PTL_MD_KIOV);
+        /* Honestly, it's best to find out early.
+         * The poison value must be the same 4-byte 0x5a pattern that the
+         * LASSERT further down tests for; a longer constant would never
+         * compare equal and the freed-descriptor case would go unnoticed. */
+        if (desc->bd_page_count == 0x5a5a5a5a ||
+            desc->bd_page_count != ev->mem_desc.niov ||
+            ev->mem_desc.start != &desc->bd_iov) {
+                /* not guaranteed (don't LASSERT) but good for this bug hunt */
+                ptlrpc_bad_event = *ev;
+                ptlrpc_bad_desc = *desc;
+                CERROR ("XXX ev %p type %d portal %d match "LPX64", seq %ld\n",
+                        ev, ev->type, ev->portal, ev->match_bits, ev->sequence);
+                CERROR ("XXX desc %p, export %p import %p gen %d "
+                        " portal %d\n",
+                        desc, desc->bd_export,
+                        desc->bd_import, desc->bd_import_generation,
+                        desc->bd_portal);
+                RETURN (0);
+        }
+
+        LASSERT(desc->bd_page_count != 0x5a5a5a5a);
+        /* 1 fragment for each page always */
+        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+        LASSERT(ev->match_bits == desc->bd_req->rq_xid);
+
+        /* peer must put with zero offset */
+        if (ev->offset != 0) {
+                /* Bug 1190: handle this as a protocol failure */
+                CERROR ("Bad offset %d\n", ev->offset);
+                LBUG ();
         }
 
+        /* No check for total # bytes; this could be a short read */
+
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        desc->bd_network_rw = 0;
+        desc->bd_complete = 1;
+        /* wake whoever is waiting on the owning request */
+        if (desc->bd_req->rq_set != NULL)
+                wake_up (&desc->bd_req->rq_set->set_waitq);
+        else
+                wake_up (&desc->bd_req->rq_wait_for_rep);
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
+
         RETURN(1);
 }
 
-int ptlrpc_init_portals(void)
+static int bulk_get_source_callback(ptl_event_t *ev)
 {
-        int rc;
-        ptl_handle_ni_t ni;
+        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
+        struct ptlrpc_bulk_page *bulk;
+        struct list_head        *tmp;
+        unsigned long            flags;
+        ptl_size_t               total = 0;
+        ENTRY;
 
-        socknal_nip = inter_module_get_request("ksocknal_ni", "ksocknal");
-        qswnal_nip = inter_module_get_request("kqswnal_ni", "kqswnal");
-        if (socknal_nip == NULL && qswnal_nip == NULL) {
-                CERROR("get_ni failed: is a NAL module loaded?\n");
-                return -EIO;
+        LASSERT(ev->type == PTL_EVENT_GET);
+
+        /* Callback for the source end of a bulk GET (the peer fetches our
+         * pages); the MD's user_ptr is the bulk descriptor.  After the
+         * checks below the bulk is marked complete under bd_lock and the
+         * waiter for the owning request is woken.  Finishes with
+         * RETURN(1).
+         *
+         * used iovs */
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
+                PTL_MD_KIOV);
+        /* 1 fragment for each page always */
+        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+        LASSERT(ev->match_bits == desc->bd_req->rq_xid);
+
+        /* peer must get with zero offset */
+        if (ev->offset != 0) {
+                /* Bug 1190: handle this as a protocol failure */
+                CERROR ("Bad offset %d\n", ev->offset);
+                LBUG ();
         }
+        
+        list_for_each (tmp, &desc->bd_page_list) {
+                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
 
-        /* Use the qswnal if it's there */
-        if (qswnal_nip != NULL)
-                ni = *qswnal_nip;
+                total += bulk->bp_buflen;
+        }
+
+        /* peer must get everything: total is the sum of bp_buflen over
+         * the descriptor's page list.
+         * NOTE(review): the comparison is against ev->mem_desc.length (the
+         * MD's length), not the length moved by this event (ev->mlength?)
+         * -- confirm this really detects a short get. */
+        if (ev->mem_desc.length != total) {
+                /* Bug 1190: handle this as a protocol failure */
+                CERROR ("Bad length/total %d/%d\n", ev->mem_desc.length, total);
+                LBUG ();
+        }
+
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        desc->bd_network_rw = 0;
+        desc->bd_complete = 1;
+        if (desc->bd_req->rq_set != NULL)
+                wake_up (&desc->bd_req->rq_set->set_waitq);
         else
-                ni = *socknal_nip;
+                wake_up (&desc->bd_req->rq_wait_for_rep);
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
 
-        rc = PtlEQAlloc(ni, 128, request_out_callback, &request_out_eq);
+        RETURN(1);
+}
+
+/*
+ * Event callback for the receiving (sink) end of a bulk GET.
+ *
+ * The MD's user_ptr is the bulk descriptor.  Only SENT and REPLY events
+ * are legal.  Every event uses up one unit of desc->bd_callback_count
+ * (under bd_lock); once nothing is left outstanding the descriptor is
+ * flagged complete and anyone sleeping on bd_waitq is woken.
+ * Finishes with RETURN(0).
+ */
+static int bulk_get_sink_callback(ptl_event_t *ev)
+{
+        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
+        const char              *evname;
+        unsigned long            irqflags;
+        ENTRY;
+
+        if (ev->type == PTL_EVENT_SENT)
+                evname = "SENT";
+        else if (ev->type == PTL_EVENT_REPLY)
+                evname = "REPLY";
+        else
+                evname = "UNEXPECTED";
+
+        CDEBUG(D_NET, "got %s event %d\n", evname, ev->type);
+
+        LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY);
+
+        /* the MD carries exactly one fragment per bulk page */
+        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+
+        spin_lock_irqsave (&desc->bd_lock, irqflags);
+        LASSERT(desc->bd_callback_count > 0 &&
+                desc->bd_callback_count <= 2);
+
+        desc->bd_callback_count--;
+        if (desc->bd_callback_count == 0) {
+                /* that was the last event we were waiting for */
+                desc->bd_network_rw = 0;
+                desc->bd_complete = 1;
+                wake_up(&desc->bd_waitq);
+        }
+        spin_unlock_irqrestore (&desc->bd_lock, irqflags);
+
+        RETURN(0);
+}
+
+/*
+ * Translate a UUID into a ptlrpc peer.
+ *
+ * lustre_uuid_to_peer() resolves the UUID to a NID plus a portals NI
+ * handle; that handle is then compared bytewise against the NI handle of
+ * each initialised entry of ptlrpc_interfaces[].
+ *
+ * Returns 0 with peer->peer_nid and peer->peer_ni filled in, the non-zero
+ * code from lustre_uuid_to_peer() if that lookup fails, or -ENOENT (after
+ * logging) when no initialised interface owns the handle.  *peer is left
+ * untouched on failure.
+ */
+int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer)
+{
+        struct ptlrpc_ni   *pni;
+        struct lustre_peer  lpeer;
+        int                 i;
+        int                 rc = lustre_uuid_to_peer (uuid->uuid, &lpeer);
+
+        /* This function has no ENTRY, so every exit is a plain return;
+         * RETURN() is the counterpart of ENTRY and must not be used
+         * without it. */
+        if (rc != 0)
+                return (rc);
+
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                pni = &ptlrpc_interfaces[i];
+
+                if (!memcmp(&lpeer.peer_ni, &pni->pni_ni_h,
+                            sizeof (lpeer.peer_ni))) {
+                        peer->peer_nid = lpeer.peer_nid;
+                        peer->peer_ni = pni;
+                        return (0);
+                }
+        }
+
+        CERROR("Can't find ptlrpc interface for "LPX64" ni handle %08lx."LPX64"\n",
+               lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.cookie);
+        return (-ENOENT);
+}
+
+/*
+ * Tear down one ptlrpc network interface: PtlEQFree() each of its seven
+ * event queues (request out, reply out, reply in, bulk put source/sink,
+ * bulk get source/sink -- in that order), then call kportal_put_ni() to
+ * pair with the kportal_get_ni() done in ptlrpc_ni_init().  The results
+ * of PtlEQFree() are not checked.
+ */
+void ptlrpc_ni_fini(struct ptlrpc_ni *pni)
+{
+        ptl_handle_eq_t *eq_handles[] = {
+                &pni->pni_request_out_eq_h,
+                &pni->pni_reply_out_eq_h,
+                &pni->pni_reply_in_eq_h,
+                &pni->pni_bulk_put_source_eq_h,
+                &pni->pni_bulk_put_sink_eq_h,
+                &pni->pni_bulk_get_source_eq_h,
+                &pni->pni_bulk_get_sink_eq_h,
+        };
+        const int        n_handles = sizeof (eq_handles) /
+                                     sizeof (eq_handles[0]);
+        int              idx;
+
+        for (idx = 0; idx < n_handles; idx++)
+                PtlEQFree(*eq_handles[idx]);
+
+        kportal_put_ni (pni->pni_number);
+}
+/* Bring up one portals network interface for ptlrpc.
+ *
+ * number  NAL number handed to kportal_get_ni()
+ * name    human-readable interface name, stored in *pni and used in logs
+ * pni     interface slot to fill in
+ *
+ * Records the name, number and NI handle in *pni and allocates the seven
+ * event queues, each bound to its callback.  Returns 0 on success,
+ * -ENOENT if kportal_get_ni() returns NULL, or -ENOMEM if any
+ * PtlEQAlloc() fails, in which case ptlrpc_ni_fini() is run on *pni
+ * before returning. */
+int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni)
+{
+        int              rc;
+        ptl_handle_ni_t *nip = kportal_get_ni (number);
+
+        if (nip == NULL) {
+                CDEBUG (D_NET, "Network interface %s not loaded\n", name);
+                return (-ENOENT);
+        }
+
+        CDEBUG (D_NET, "init %d %s: nal_idx %ld\n", number, name, nip->nal_idx);
+
+        pni->pni_name = name;
+        pni->pni_number = number;
+        pni->pni_ni_h = *nip;
+
+        pni->pni_request_out_eq_h = PTL_HANDLE_NONE;
+        pni->pni_reply_out_eq_h = PTL_HANDLE_NONE;
+        pni->pni_reply_in_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_put_source_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_put_sink_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_get_source_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_get_sink_eq_h = PTL_HANDLE_NONE;
+
+        /* NB We never actually PtlEQGet() out of these events queues since
+         * we're only interested in the event callback, so we can just let
+         * them wrap.  Their sizes aren't a big deal, apart from providing
+         * a little history for debugging...
+         *
+         * Any PtlEQAlloc() failure below is reported to the caller as
+         * -ENOMEM, whatever code portals returned. */
+
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback,
+                        &pni->pni_request_out_eq_h);
+        if (rc != PTL_OK)
+                GOTO (fail, rc = -ENOMEM);
+
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback,
+                        &pni->pni_reply_out_eq_h);
+        if (rc != PTL_OK)
+                GOTO (fail, rc = -ENOMEM);
+
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback,
+                        &pni->pni_reply_in_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+                GOTO (fail, rc = -ENOMEM);
 
-        rc = PtlEQAlloc(ni, 128, reply_out_callback, &reply_out_eq);
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback,
+                        &pni->pni_bulk_put_source_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+                GOTO (fail, rc = -ENOMEM);
 
-        rc = PtlEQAlloc(ni, 128, reply_in_callback, &reply_in_eq);
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback,
+                        &pni->pni_bulk_put_sink_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+                GOTO (fail, rc = -ENOMEM);
 
-        rc = PtlEQAlloc(ni, 128, bulk_source_callback, &bulk_source_eq);
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback,
+                        &pni->pni_bulk_get_source_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+                GOTO (fail, rc = -ENOMEM);
 
-        rc = PtlEQAlloc(ni, 128, bulk_sink_callback, &bulk_sink_eq);
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback,
+                        &pni->pni_bulk_get_sink_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+                GOTO (fail, rc = -ENOMEM);
+
+        return (0);
+ fail:
+        CERROR ("Failed to initialise network interface %s: %d\n",
+                name, rc);
+
+        /* OK to do complete teardown since we invalidated the handles above:
+         * queues that were never allocated still hold PTL_HANDLE_NONE */
+        ptlrpc_ni_fini (pni);
+        return (rc);
+}
 
-        return rc;
+/*
+ * Try to bring up every portals network interface we know about.
+ *
+ * Each table entry is handed to ptlrpc_ni_init(); the ones that succeed
+ * occupy consecutive slots of ptlrpc_interfaces[] and are counted in
+ * ptlrpc_ninterfaces, the ones that fail are skipped.  Returns 0 if at
+ * least one interface came up, -EIO (after logging) if none did.
+ */
+int ptlrpc_init_portals(void)
+{
+        /* To support another portals network interface, add a row.
+         * Row order is irrelevant. */
+        static struct {
+                int   number;
+                char *name;
+        } known_nals[] = {
+                {QSWNAL,    "qswnal"},
+                {SOCKNAL,   "socknal"},
+                {GMNAL,     "gmnal"},
+                {TOENAL,    "toenal"},
+                {TCPNAL,    "tcpnal"},
+                {SCIMACNAL, "scimacnal"}};
+        const int n_known = sizeof (known_nals) / sizeof (known_nals[0]);
+        int       idx;
+
+        LASSERT(ptlrpc_ninterfaces == 0);
+
+        for (idx = 0; idx < n_known; idx++) {
+                /* there must still be a free slot to initialise into */
+                LASSERT(ptlrpc_ninterfaces < (sizeof(ptlrpc_interfaces) /
+                                              sizeof(ptlrpc_interfaces[0])));
+
+                /* the slot only counts as used if initialisation worked */
+                if (ptlrpc_ni_init(known_nals[idx].number,
+                                   known_nals[idx].name,
+                                   &ptlrpc_interfaces[ptlrpc_ninterfaces]) == 0)
+                        ptlrpc_ninterfaces++;
+        }
+
+        if (ptlrpc_ninterfaces != 0)
+                return 0;
+
+        CERROR("network initialisation failed: is a NAL module "
+               "loaded?\n");
+        return -EIO;
 }
 
 void ptlrpc_exit_portals(void)
 {
-        PtlEQFree(request_out_eq);
-        PtlEQFree(reply_out_eq);
-        PtlEQFree(reply_in_eq);
-        PtlEQFree(bulk_source_eq);
-        PtlEQFree(bulk_sink_eq);
-
-        if (qswnal_nip != NULL)
-                inter_module_put("kqswnal_ni");
-        if (socknal_nip != NULL)
-                inter_module_put("ksocknal_ni");
+        while (ptlrpc_ninterfaces > 0) /* run ptlrpc_ni_fini() on every
+                                        * initialised interface, highest
+                                        * slot first, leaving
+                                        * ptlrpc_ninterfaces at 0 */
+                ptlrpc_ni_fini (&ptlrpc_interfaces[--ptlrpc_ninterfaces]);
 }