Whamcloud - gitweb
b=16186,i=liangzhen,i=maxim:
[fs/lustre-release.git] / lnet / klnds / ptllnd / ptllnd_peer.c
index c47fd04..1a5c383 100644 (file)
@@ -1,20 +1,42 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
- *   Author: PJ Kirner <pjkirner@clusterfs.com>
- *           E Barton <eeb@bartonsoftware.com>
+ * GPL HEADER START
  *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   This file is confidential source code owned by Cluster File Systems.
- *   No viewing, modification, compilation, redistribution, or any other
- *   form of use is permitted except through a signed license agreement.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   If you have not signed such an agreement, then you have no rights to
- *   this file.  Please destroy it immediately and contact CFS.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/ptllnd/ptllnd_peer.c
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ * Author: E Barton <eeb@bartonsoftware.com>
  */
 
 #include "ptllnd.h"
@@ -90,6 +112,7 @@ kptllnd_get_peer_info(int index,
 void
 kptllnd_peer_add_peertable_locked (kptl_peer_t *peer)
 {
+        LASSERT (!kptllnd_data.kptl_shutdown);
         LASSERT (kptllnd_data.kptl_n_active_peers <
                  kptllnd_data.kptl_expected_peers);
 
@@ -157,13 +180,14 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid)
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
+        INIT_LIST_HEAD (&peer->peer_noops);
         INIT_LIST_HEAD (&peer->peer_sendq);
         INIT_LIST_HEAD (&peer->peer_activeq);
         spin_lock_init (&peer->peer_lock);
 
         peer->peer_state = PEER_STATE_ALLOCATED;
         peer->peer_error = 0;
-        peer->peer_last_alive = cfs_time_current();
+        peer->peer_last_alive = 0;
         peer->peer_id = lpid;
         peer->peer_ptlid = ppid;
         peer->peer_credits = 1;                 /* enough for HELLO */
@@ -204,6 +228,7 @@ kptllnd_peer_destroy (kptl_peer_t *peer)
         LASSERT (atomic_read(&peer->peer_refcount) == 0);
         LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
                  peer->peer_state == PEER_STATE_ZOMBIE);
+        LASSERT (list_empty(&peer->peer_noops));
         LASSERT (list_empty(&peer->peer_sendq));
         LASSERT (list_empty(&peer->peer_activeq));
 
@@ -244,6 +269,7 @@ kptllnd_peer_cancel_txs(kptl_peer_t *peer, struct list_head *txs)
 
         spin_lock_irqsave(&peer->peer_lock, flags);
 
+        kptllnd_cancel_txlist(&peer->peer_noops, txs);
         kptllnd_cancel_txlist(&peer->peer_sendq, txs);
         kptllnd_cancel_txlist(&peer->peer_activeq, txs);
                 
@@ -465,9 +491,7 @@ void
 kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag)
 {
         /* CAVEAT EMPTOR: I take over caller's ref on 'tx' */
-        ptl_handle_md_t  rdma_mdh = PTL_INVALID_HANDLE;
-        ptl_handle_md_t  msg_mdh = PTL_INVALID_HANDLE;
-        ptl_handle_me_t  meh;
+        ptl_handle_md_t  msg_mdh;
         ptl_md_t         md;
         ptl_err_t        prc;
         unsigned long    flags;
@@ -482,48 +506,6 @@ kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag)
 
         kptllnd_set_tx_peer(tx, peer);
 
-        if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
-            tx->tx_type == TX_TYPE_GET_REQUEST) {
-
-                spin_lock_irqsave(&peer->peer_lock, flags);
-
-                /* Assume 64-bit matchbits can't wrap */
-                LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS);
-                tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
-                        peer->peer_next_matchbits++;
-                        
-                spin_unlock_irqrestore(&peer->peer_lock, flags);
-
-                prc = PtlMEAttach(kptllnd_data.kptl_nih,
-                                  *kptllnd_tunables.kptl_portal,
-                                  peer->peer_ptlid,
-                                  tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits,
-                                  0,             /* ignore bits */
-                                  PTL_UNLINK,
-                                  PTL_INS_BEFORE,
-                                  &meh);
-                if (prc != PTL_OK) {
-                        CERROR("PtlMEAttach(%s) failed: %d\n",
-                               libcfs_id2str(peer->peer_id), prc);
-                        goto failed;
-                }
-
-                prc = PtlMDAttach(meh, tx->tx_rdma_md, PTL_UNLINK, &rdma_mdh);
-                if (prc != PTL_OK) {
-                        CERROR("PtlMDAttach(%s) failed: %d\n",
-                               libcfs_id2str(tx->tx_peer->peer_id), prc);
-                        prc = PtlMEUnlink(meh);
-                        LASSERT(prc == PTL_OK);
-                        rdma_mdh = PTL_INVALID_HANDLE;
-                        goto failed;
-                }
-
-                /* I'm not racing with the event callback here.  It's a bug if
-                 * there's an event on the MD I just attached before I actually
-                 * send the RDMA request message which the event callback
-                 * catches by asserting 'rdma_mdh' is valid. */
-        }
-
         memset(&md, 0, sizeof(md));
 
         md.threshold = tx->tx_acked ? 2 : 1;    /* SEND END + ACK? */
@@ -547,44 +529,51 @@ kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag)
 
         prc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &msg_mdh);
         if (prc != PTL_OK) {
-                msg_mdh = PTL_INVALID_HANDLE;
-                goto failed;
+                CERROR("PtlMDBind(%s) failed: %s(%d)\n",
+                       libcfs_id2str(peer->peer_id),
+                       kptllnd_errtype2str(prc), prc);
+                tx->tx_status = -EIO;
+                kptllnd_tx_decref(tx);
+                return;
         }
-        
+
         spin_lock_irqsave(&peer->peer_lock, flags);
 
         tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
         tx->tx_active = 1;
-        tx->tx_rdma_mdh = rdma_mdh;
         tx->tx_msg_mdh = msg_mdh;
 
        /* Ensure HELLO is sent first */
-       if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
+        if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP)
+               list_add(&tx->tx_list, &peer->peer_noops);
+       else if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
                list_add(&tx->tx_list, &peer->peer_sendq);
        else
                list_add_tail(&tx->tx_list, &peer->peer_sendq);
 
         spin_unlock_irqrestore(&peer->peer_lock, flags);
-        return;
-        
- failed:
-        spin_lock_irqsave(&peer->peer_lock, flags);
-
-        tx->tx_status = -EIO;
-        tx->tx_rdma_mdh = rdma_mdh;
-        tx->tx_msg_mdh = msg_mdh;
+}
 
-        spin_unlock_irqrestore(&peer->peer_lock, flags);
+static inline int /* non-zero: caller should post a NOOP to return credits */
+kptllnd_peer_send_noop (kptl_peer_t *peer)
+{
+        if (!peer->peer_sent_hello ||           /* HELLO not sent yet */
+            peer->peer_credits == 0 ||          /* no credit to send anything */
+            !list_empty(&peer->peer_noops) ||   /* a NOOP is already queued */
+            peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)
+                return 0;       /* ...or still below the credit high-water mark */
 
-        kptllnd_tx_decref(tx);
+        /* No tx to piggyback NOOP onto or no credit to send a tx */
+        return (list_empty(&peer->peer_sendq) || peer->peer_credits == 1);
 }
 
 void
 kptllnd_peer_check_sends (kptl_peer_t *peer)
 {
-
+        ptl_handle_me_t  meh;
         kptl_tx_t       *tx;
         int              rc;
+        int              msg_type;
         unsigned long    flags;
 
         LASSERT(!in_interrupt());
@@ -593,10 +582,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
 
         peer->peer_retry_noop = 0;
 
-        if (list_empty(&peer->peer_sendq) &&
-            peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER &&
-            peer->peer_credits != 0) {
-
+        if (kptllnd_peer_send_noop(peer)) {
                 /* post a NOOP to return credits */
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
@@ -613,13 +599,22 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                 peer->peer_retry_noop = (tx == NULL);
         }
 
-        while (!list_empty(&peer->peer_sendq)) {
-                tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list);
+        for (;;) {
+                if (!list_empty(&peer->peer_noops)) {
+                        LASSERT (peer->peer_sent_hello);
+                        tx = list_entry(peer->peer_noops.next,
+                                        kptl_tx_t, tx_list);
+                } else if (!list_empty(&peer->peer_sendq)) {
+                        tx = list_entry(peer->peer_sendq.next,
+                                        kptl_tx_t, tx_list);
+                } else {
+                        /* nothing to send right now */
+                        break;
+                }
 
                 LASSERT (tx->tx_active);
                 LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
-                LASSERT (tx->tx_type == TX_TYPE_SMALL_MESSAGE ||
-                         !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+                LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
 
                 LASSERT (peer->peer_outstanding_credits >= 0);
                 LASSERT (peer->peer_sent_credits >= 0);
@@ -628,32 +623,37 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                          *kptllnd_tunables.kptl_peercredits);
                 LASSERT (peer->peer_credits >= 0);
 
-               /* Ensure HELLO is sent first */
-               if (!peer->peer_sent_hello) {
-                       if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO)
-                               break;
-                       peer->peer_sent_hello = 1;
-               }
+                msg_type = tx->tx_msg->ptlm_type;
+
+                /* Ensure HELLO is sent first */
+                if (!peer->peer_sent_hello) {
+                        LASSERT (list_empty(&peer->peer_noops));
+                        if (msg_type != PTLLND_MSG_TYPE_HELLO)
+                                break;
+                        peer->peer_sent_hello = 1;
+                }
 
                 if (peer->peer_credits == 0) {
-                        CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %p\n",
+                        CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %s[%p]\n",
                                libcfs_id2str(peer->peer_id), 
                                peer->peer_credits,
                                peer->peer_outstanding_credits, 
-                               peer->peer_sent_credits, tx);
+                               peer->peer_sent_credits, 
+                               kptllnd_msgtype2str(msg_type), tx);
                         break;
                 }
 
-                /* Don't use the last credit unless I've got credits to
-                 * return */
+                /* Last/Initial credit reserved for NOOP/HELLO */
                 if (peer->peer_credits == 1 &&
-                    peer->peer_outstanding_credits == 0) {
+                    msg_type != PTLLND_MSG_TYPE_HELLO &&
+                    msg_type != PTLLND_MSG_TYPE_NOOP) {
                         CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: "
-                               "not using last credit for %p\n",
+                               "not using last credit for %s[%p]\n",
                                libcfs_id2str(peer->peer_id), 
                                peer->peer_credits,
                                peer->peer_outstanding_credits,
-                               peer->peer_sent_credits, tx);
+                               peer->peer_sent_credits,
+                               kptllnd_msgtype2str(msg_type), tx);
                         break;
                 }
 
@@ -661,10 +661,8 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
 
                 /* Discard any NOOP I queued if I'm not at the high-water mark
                  * any more or more messages have been queued */
-                if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
-                    (!list_empty(&peer->peer_sendq) ||
-                     peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
-
+                if (msg_type == PTLLND_MSG_TYPE_NOOP &&
+                    !kptllnd_peer_send_noop(peer)) {
                         tx->tx_active = 0;
 
                         spin_unlock_irqrestore(&peer->peer_lock, flags);
@@ -677,9 +675,19 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                         continue;
                 }
 
-                /* fill last-minute msg header fields */
+                /* fill last-minute msg fields */
                 kptllnd_msg_pack(tx->tx_msg, peer);
 
+                if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
+                    tx->tx_type == TX_TYPE_GET_REQUEST) {
+                        /* peer_next_matchbits must be known good */
+                        LASSERT (peer->peer_state >= PEER_STATE_ACTIVE);
+                        /* Assume 64-bit matchbits can't wrap */
+                        LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS);
+                        tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
+                                peer->peer_next_matchbits++;
+                }
+
                 peer->peer_sent_credits += peer->peer_outstanding_credits;
                 peer->peer_outstanding_credits = 0;
                 peer->peer_credits--;
@@ -687,8 +695,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                 CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: %s tx=%p nob=%d cred=%d\n",
                        libcfs_id2str(peer->peer_id), peer->peer_credits,
                        peer->peer_outstanding_credits, peer->peer_sent_credits,
-                       kptllnd_msgtype2str(tx->tx_msg->ptlm_type),
-                       tx, tx->tx_msg->ptlm_nob,
+                       kptllnd_msgtype2str(msg_type), tx, tx->tx_msg->ptlm_nob,
                        tx->tx_msg->ptlm_credits);
 
                 list_add_tail(&tx->tx_list, &peer->peer_activeq);
@@ -697,6 +704,41 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
 
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
+                if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
+                    tx->tx_type == TX_TYPE_GET_REQUEST) {
+                        /* Post bulk now we have safe matchbits */
+                        rc = PtlMEAttach(kptllnd_data.kptl_nih,
+                                         *kptllnd_tunables.kptl_portal,
+                                         peer->peer_ptlid,
+                                         tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits,
+                                         0,             /* ignore bits */
+                                         PTL_UNLINK,
+                                         PTL_INS_BEFORE,
+                                         &meh);
+                        if (rc != PTL_OK) {
+                                CERROR("PtlMEAttach(%s) failed: %s(%d)\n",
+                                       libcfs_id2str(peer->peer_id),
+                                       kptllnd_errtype2str(rc), rc);
+                                goto failed;
+                        }
+
+                        rc = PtlMDAttach(meh, tx->tx_rdma_md, PTL_UNLINK,
+                                         &tx->tx_rdma_mdh);
+                        if (rc != PTL_OK) {
+                                CERROR("PtlMDAttach(%s) failed: %s(%d)\n",
+                                       libcfs_id2str(tx->tx_peer->peer_id),
+                                       kptllnd_errtype2str(rc), rc);
+                                rc = PtlMEUnlink(meh);
+                                LASSERT(rc == PTL_OK);
+                                tx->tx_rdma_mdh = PTL_INVALID_HANDLE;
+                                goto failed;
+                        }
+                        /* I'm not racing with the event callback here.  It's a
+                         * bug if there's an event on the MD I just attached
+                         * before I actually send the RDMA request message -
+                         * probably matchbits re-used in error. */
+                }
+
                 tx->tx_tposted = jiffies;       /* going on the wire */
 
                 rc = PtlPut (tx->tx_msg_mdh,
@@ -708,12 +750,10 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                              0,                 /* offset */
                              0);                /* header data */
                 if (rc != PTL_OK) {
-                        CERROR("PtlPut %s error %d\n",
-                               libcfs_id2str(peer->peer_id), rc);
-
-                        /* Nuke everything (including this tx) */
-                        kptllnd_peer_close(peer, -EIO);
-                        return;
+                        CERROR("PtlPut %s error %s(%d)\n",
+                               libcfs_id2str(peer->peer_id),
+                               kptllnd_errtype2str(rc), rc);
+                        goto failed;
                 }
 
                 kptllnd_tx_decref(tx);          /* drop my ref */
@@ -722,16 +762,22 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
         }
 
         spin_unlock_irqrestore(&peer->peer_lock, flags);
+        return;
+
+ failed:
+        /* Nuke everything (including tx we were trying) */
+        kptllnd_peer_close(peer, -EIO);
+        kptllnd_tx_decref(tx);
 }
 
 kptl_tx_t *
 kptllnd_find_timed_out_tx(kptl_peer_t *peer)
 {
         kptl_tx_t         *tx;
-        struct list_head  *tmp;
+        struct list_head  *ele;
 
-        list_for_each(tmp, &peer->peer_sendq) {
-                tx = list_entry(peer->peer_sendq.next, kptl_tx_t, tx_list);
+        list_for_each(ele, &peer->peer_sendq) {
+                tx = list_entry(ele, kptl_tx_t, tx_list);
 
                 if (time_after_eq(jiffies, tx->tx_deadline)) {
                         kptllnd_tx_addref(tx);
@@ -739,8 +785,8 @@ kptllnd_find_timed_out_tx(kptl_peer_t *peer)
                 }
         }
 
-        list_for_each(tmp, &peer->peer_activeq) {
-                tx = list_entry(peer->peer_activeq.next, kptl_tx_t, tx_list);
+        list_for_each(ele, &peer->peer_activeq) {
+                tx = list_entry(ele, kptl_tx_t, tx_list);
 
                 if (time_after_eq(jiffies, tx->tx_deadline)) {
                         kptllnd_tx_addref(tx);
@@ -811,10 +857,24 @@ kptllnd_peer_check_bucket (int idx, int stamp)
                 nactive = kptllnd_count_queue(&peer->peer_activeq);
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
-                LCONSOLE_ERROR("Timing out %s: %s\n",
-                               libcfs_id2str(peer->peer_id),
-                               (tx->tx_tposted == 0) ? 
-                               "no free peer buffers" : "please check Portals");
+                LCONSOLE_ERROR_MSG(0x126, "Timing out %s: %s\n",
+                                   libcfs_id2str(peer->peer_id),
+                                   (tx->tx_tposted == 0) ? 
+                                   "no free peer buffers" : 
+                                   "please check Portals");
+
+               if (tx->tx_tposted) {
+                       CERROR("Could not send to %s after %ds (sent %lds ago); "
+                               "check Portals for possible issues\n",
+                               libcfs_id2str(peer->peer_id),
+                               *kptllnd_tunables.kptl_timeout,
+                               cfs_duration_sec(jiffies - tx->tx_tposted));
+               } else {
+                       CERROR("Could not get credits for %s after %ds; "
+                               "possible Lustre networking issues\n",
+                       libcfs_id2str(peer->peer_id),
+                       *kptllnd_tunables.kptl_timeout);
+               }
 
                 CERROR("%s timed out: cred %d outstanding %d, sent %d, "
                        "sendq %d, activeq %d Tx %p %s (%s%s%s) status %d "
@@ -879,12 +939,12 @@ kptllnd_id2peer_locked (lnet_process_id_t id)
 void
 kptllnd_peertable_overflow_msg(char *str, lnet_process_id_t id)
 {
-        LCONSOLE_ERROR("%s %s overflows the peer table[%d]: "
-                       "messages may be dropped\n",
-                       str, libcfs_id2str(id),
-                       kptllnd_data.kptl_n_active_peers);
-        LCONSOLE_ERROR("Please correct by increasing "
-                       "'max_nodes' or 'max_procs_per_node'\n");
+        LCONSOLE_ERROR_MSG(0x127, "%s %s overflows the peer table[%d]: " /* console error: peer table overflow */
+                           "messages may be dropped\n",
+                           str, libcfs_id2str(id),      /* caller's label, then the offending id */
+                           kptllnd_data.kptl_n_active_peers); /* value printed inside "[%d]" */
+        LCONSOLE_ERROR_MSG(0x128, "Please correct by increasing "
+                           "'max_nodes' or 'max_procs_per_node'\n"); /* second console line: remediation hint */
 }
 
 __u64
@@ -970,7 +1030,7 @@ kptllnd_peer_handle_hello (ptl_process_id_t  initiator,
                 CERROR("%s: max message size %d < MIN %d",
                        libcfs_id2str(lpid),
                        msg->ptlm_u.hello.kptlhm_max_msg_size,
-                       *kptllnd_tunables.kptl_max_msg_size);
+                       PTLLND_MIN_BUFFER_SIZE);
                 return NULL;
         }
 
@@ -1069,7 +1129,19 @@ kptllnd_peer_handle_hello (ptl_process_id_t  initiator,
         }
 
         write_lock_irqsave(g_lock, flags);
+
  again:
+        if (kptllnd_data.kptl_shutdown) {
+                write_unlock_irqrestore(g_lock, flags);
+
+                CERROR ("Shutdown started, refusing connection from %s\n",
+                        libcfs_id2str(lpid));
+                kptllnd_peer_unreserve_buffers();
+                kptllnd_peer_decref(new_peer);
+                kptllnd_tx_decref(hello_tx);
+                return NULL;
+        }
+
         peer = kptllnd_id2peer_locked(lpid);
         if (peer != NULL) {
                 if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
@@ -1215,6 +1287,12 @@ kptllnd_find_target(kptl_peer_t **peerp, lnet_process_id_t target)
 
         write_lock_irqsave(g_lock, flags);
  again:
+        if (kptllnd_data.kptl_shutdown) {
+                write_unlock_irqrestore(g_lock, flags);
+                rc = -ESHUTDOWN;
+                goto unwind_2;
+        }
+
         *peerp = kptllnd_id2peer_locked(target);
         if (*peerp != NULL) {
                 write_unlock_irqrestore(g_lock, flags);