/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
- * Author: PJ Kirner <pjkirner@clusterfs.com>
- * E Barton <eeb@bartonsoftware.com>
+ * GPL HEADER START
*
- * This file is part of the Lustre file system, http://www.lustre.org
- * Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * This file is confidential source code owned by Cluster File Systems.
- * No viewing, modification, compilation, redistribution, or any other
- * form of use is permitted except through a signed license agreement.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * If you have not signed such an agreement, then you have no rights to
- * this file. Please destroy it immediately and contact CFS.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/ptllnd/ptllnd_peer.c
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ * Author: E Barton <eeb@bartonsoftware.com>
*/
#include "ptllnd.h"
void
kptllnd_peer_add_peertable_locked (kptl_peer_t *peer)
{
+ LASSERT (!kptllnd_data.kptl_shutdown);
LASSERT (kptllnd_data.kptl_n_active_peers <
kptllnd_data.kptl_expected_peers);
memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+ INIT_LIST_HEAD (&peer->peer_noops);
INIT_LIST_HEAD (&peer->peer_sendq);
INIT_LIST_HEAD (&peer->peer_activeq);
spin_lock_init (&peer->peer_lock);
peer->peer_state = PEER_STATE_ALLOCATED;
peer->peer_error = 0;
- peer->peer_last_alive = cfs_time_current();
+ peer->peer_last_alive = 0;
peer->peer_id = lpid;
peer->peer_ptlid = ppid;
peer->peer_credits = 1; /* enough for HELLO */
LASSERT (atomic_read(&peer->peer_refcount) == 0);
LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
peer->peer_state == PEER_STATE_ZOMBIE);
+ LASSERT (list_empty(&peer->peer_noops));
LASSERT (list_empty(&peer->peer_sendq));
LASSERT (list_empty(&peer->peer_activeq));
spin_lock_irqsave(&peer->peer_lock, flags);
+ kptllnd_cancel_txlist(&peer->peer_noops, txs);
kptllnd_cancel_txlist(&peer->peer_sendq, txs);
kptllnd_cancel_txlist(&peer->peer_activeq, txs);
kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag)
{
/* CAVEAT EMPTOR: I take over caller's ref on 'tx' */
- ptl_handle_md_t rdma_mdh = PTL_INVALID_HANDLE;
- ptl_handle_md_t msg_mdh = PTL_INVALID_HANDLE;
- ptl_handle_me_t meh;
+ ptl_handle_md_t msg_mdh;
ptl_md_t md;
ptl_err_t prc;
unsigned long flags;
kptllnd_set_tx_peer(tx, peer);
- if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
- tx->tx_type == TX_TYPE_GET_REQUEST) {
-
- spin_lock_irqsave(&peer->peer_lock, flags);
-
- /* Assume 64-bit matchbits can't wrap */
- LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS);
- tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
- peer->peer_next_matchbits++;
-
- spin_unlock_irqrestore(&peer->peer_lock, flags);
-
- prc = PtlMEAttach(kptllnd_data.kptl_nih,
- *kptllnd_tunables.kptl_portal,
- peer->peer_ptlid,
- tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits,
- 0, /* ignore bits */
- PTL_UNLINK,
- PTL_INS_BEFORE,
- &meh);
- if (prc != PTL_OK) {
- CERROR("PtlMEAttach(%s) failed: %d\n",
- libcfs_id2str(peer->peer_id), prc);
- goto failed;
- }
-
- prc = PtlMDAttach(meh, tx->tx_rdma_md, PTL_UNLINK, &rdma_mdh);
- if (prc != PTL_OK) {
- CERROR("PtlMDAttach(%s) failed: %d\n",
- libcfs_id2str(tx->tx_peer->peer_id), prc);
- prc = PtlMEUnlink(meh);
- LASSERT(prc == PTL_OK);
- rdma_mdh = PTL_INVALID_HANDLE;
- goto failed;
- }
-
- /* I'm not racing with the event callback here. It's a bug if
- * there's an event on the MD I just attached before I actually
- * send the RDMA request message which the event callback
- * catches by asserting 'rdma_mdh' is valid. */
- }
-
memset(&md, 0, sizeof(md));
md.threshold = tx->tx_acked ? 2 : 1; /* SEND END + ACK? */
prc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &msg_mdh);
if (prc != PTL_OK) {
- msg_mdh = PTL_INVALID_HANDLE;
- goto failed;
+ CERROR("PtlMDBind(%s) failed: %s(%d)\n",
+ libcfs_id2str(peer->peer_id),
+ kptllnd_errtype2str(prc), prc);
+ tx->tx_status = -EIO;
+ kptllnd_tx_decref(tx);
+ return;
}
-
+
spin_lock_irqsave(&peer->peer_lock, flags);
tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
tx->tx_active = 1;
- tx->tx_rdma_mdh = rdma_mdh;
tx->tx_msg_mdh = msg_mdh;
/* Ensure HELLO is sent first */
- if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
+ if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP)
+ list_add(&tx->tx_list, &peer->peer_noops);
+ else if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
list_add(&tx->tx_list, &peer->peer_sendq);
else
list_add_tail(&tx->tx_list, &peer->peer_sendq);
spin_unlock_irqrestore(&peer->peer_lock, flags);
- return;
-
- failed:
- spin_lock_irqsave(&peer->peer_lock, flags);
-
- tx->tx_status = -EIO;
- tx->tx_rdma_mdh = rdma_mdh;
- tx->tx_msg_mdh = msg_mdh;
+}
- spin_unlock_irqrestore(&peer->peer_lock, flags);
/*
 * Decide whether a NOOP message should be (or should still be) sent to
 * 'peer' in order to return credits.
 *
 * Returns 0 when any of the following holds: HELLO has not been sent yet,
 * the peer has no send credits, a NOOP is already queued on peer_noops,
 * or fewer than PTLLND_CREDIT_HIGHWATER credits are waiting to be
 * returned.  Otherwise returns non-zero only if the send queue is empty
 * or exactly one send credit remains.
 *
 * NOTE(review): peer state is read here without taking peer_lock;
 * presumably the callers hold it -- confirm against the call sites.
 */
+static inline int
+kptllnd_peer_send_noop (kptl_peer_t *peer)
+{
/* Any one of these conditions rules out sending a NOOP right now */
+ if (!peer->peer_sent_hello ||
+ peer->peer_credits == 0 ||
+ !list_empty(&peer->peer_noops) ||
+ peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)
+ return 0;
- kptllnd_tx_decref(tx);
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&peer->peer_sendq) || peer->peer_credits == 1);
}
void
kptllnd_peer_check_sends (kptl_peer_t *peer)
{
-
+ ptl_handle_me_t meh;
kptl_tx_t *tx;
int rc;
+ int msg_type;
unsigned long flags;
LASSERT(!in_interrupt());
peer->peer_retry_noop = 0;
- if (list_empty(&peer->peer_sendq) &&
- peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER &&
- peer->peer_credits != 0) {
-
+ if (kptllnd_peer_send_noop(peer)) {
/* post a NOOP to return credits */
spin_unlock_irqrestore(&peer->peer_lock, flags);
peer->peer_retry_noop = (tx == NULL);
}
- while (!list_empty(&peer->peer_sendq)) {
- tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list);
+ for (;;) {
+ if (!list_empty(&peer->peer_noops)) {
+ LASSERT (peer->peer_sent_hello);
+ tx = list_entry(peer->peer_noops.next,
+ kptl_tx_t, tx_list);
+ } else if (!list_empty(&peer->peer_sendq)) {
+ tx = list_entry(peer->peer_sendq.next,
+ kptl_tx_t, tx_list);
+ } else {
+ /* nothing to send right now */
+ break;
+ }
LASSERT (tx->tx_active);
LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
- LASSERT (tx->tx_type == TX_TYPE_SMALL_MESSAGE ||
- !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+ LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
LASSERT (peer->peer_outstanding_credits >= 0);
LASSERT (peer->peer_sent_credits >= 0);
*kptllnd_tunables.kptl_peercredits);
LASSERT (peer->peer_credits >= 0);
- /* Ensure HELLO is sent first */
- if (!peer->peer_sent_hello) {
- if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO)
- break;
- peer->peer_sent_hello = 1;
- }
+ msg_type = tx->tx_msg->ptlm_type;
+
+ /* Ensure HELLO is sent first */
+ if (!peer->peer_sent_hello) {
+ LASSERT (list_empty(&peer->peer_noops));
+ if (msg_type != PTLLND_MSG_TYPE_HELLO)
+ break;
+ peer->peer_sent_hello = 1;
+ }
if (peer->peer_credits == 0) {
- CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %p\n",
+ CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %s[%p]\n",
libcfs_id2str(peer->peer_id),
peer->peer_credits,
peer->peer_outstanding_credits,
- peer->peer_sent_credits, tx);
+ peer->peer_sent_credits,
+ kptllnd_msgtype2str(msg_type), tx);
break;
}
- /* Don't use the last credit unless I've got credits to
- * return */
+ /* Last/Initial credit reserved for NOOP/HELLO */
if (peer->peer_credits == 1 &&
- peer->peer_outstanding_credits == 0) {
+ msg_type != PTLLND_MSG_TYPE_HELLO &&
+ msg_type != PTLLND_MSG_TYPE_NOOP) {
CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: "
- "not using last credit for %p\n",
+ "not using last credit for %s[%p]\n",
libcfs_id2str(peer->peer_id),
peer->peer_credits,
peer->peer_outstanding_credits,
- peer->peer_sent_credits, tx);
+ peer->peer_sent_credits,
+ kptllnd_msgtype2str(msg_type), tx);
break;
}
/* Discard any NOOP I queued if I'm not at the high-water mark
* any more or more messages have been queued */
- if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
- (!list_empty(&peer->peer_sendq) ||
- peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
-
+ if (msg_type == PTLLND_MSG_TYPE_NOOP &&
+ !kptllnd_peer_send_noop(peer)) {
tx->tx_active = 0;
spin_unlock_irqrestore(&peer->peer_lock, flags);
continue;
}
- /* fill last-minute msg header fields */
+ /* fill last-minute msg fields */
kptllnd_msg_pack(tx->tx_msg, peer);
+ if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
+ tx->tx_type == TX_TYPE_GET_REQUEST) {
+ /* peer_next_matchbits must be known good */
+ LASSERT (peer->peer_state >= PEER_STATE_ACTIVE);
+ /* Assume 64-bit matchbits can't wrap */
+ LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS);
+ tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
+ peer->peer_next_matchbits++;
+ }
+
peer->peer_sent_credits += peer->peer_outstanding_credits;
peer->peer_outstanding_credits = 0;
peer->peer_credits--;
CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: %s tx=%p nob=%d cred=%d\n",
libcfs_id2str(peer->peer_id), peer->peer_credits,
peer->peer_outstanding_credits, peer->peer_sent_credits,
- kptllnd_msgtype2str(tx->tx_msg->ptlm_type),
- tx, tx->tx_msg->ptlm_nob,
+ kptllnd_msgtype2str(msg_type), tx, tx->tx_msg->ptlm_nob,
tx->tx_msg->ptlm_credits);
list_add_tail(&tx->tx_list, &peer->peer_activeq);
spin_unlock_irqrestore(&peer->peer_lock, flags);
+ if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
+ tx->tx_type == TX_TYPE_GET_REQUEST) {
+ /* Post bulk now we have safe matchbits */
+ rc = PtlMEAttach(kptllnd_data.kptl_nih,
+ *kptllnd_tunables.kptl_portal,
+ peer->peer_ptlid,
+ tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits,
+ 0, /* ignore bits */
+ PTL_UNLINK,
+ PTL_INS_BEFORE,
+ &meh);
+ if (rc != PTL_OK) {
+ CERROR("PtlMEAttach(%s) failed: %s(%d)\n",
+ libcfs_id2str(peer->peer_id),
+ kptllnd_errtype2str(rc), rc);
+ goto failed;
+ }
+
+ rc = PtlMDAttach(meh, tx->tx_rdma_md, PTL_UNLINK,
+ &tx->tx_rdma_mdh);
+ if (rc != PTL_OK) {
+ CERROR("PtlMDAttach(%s) failed: %s(%d)\n",
+ libcfs_id2str(tx->tx_peer->peer_id),
+ kptllnd_errtype2str(rc), rc);
+ rc = PtlMEUnlink(meh);
+ LASSERT(rc == PTL_OK);
+ tx->tx_rdma_mdh = PTL_INVALID_HANDLE;
+ goto failed;
+ }
+ /* I'm not racing with the event callback here. It's a
+ * bug if there's an event on the MD I just attached
+ * before I actually send the RDMA request message -
+ * probably matchbits re-used in error. */
+ }
+
tx->tx_tposted = jiffies; /* going on the wire */
rc = PtlPut (tx->tx_msg_mdh,
0, /* offset */
0); /* header data */
if (rc != PTL_OK) {
- CERROR("PtlPut %s error %d\n",
- libcfs_id2str(peer->peer_id), rc);
-
- /* Nuke everything (including this tx) */
- kptllnd_peer_close(peer, -EIO);
- return;
+ CERROR("PtlPut %s error %s(%d)\n",
+ libcfs_id2str(peer->peer_id),
+ kptllnd_errtype2str(rc), rc);
+ goto failed;
}
kptllnd_tx_decref(tx); /* drop my ref */
}
spin_unlock_irqrestore(&peer->peer_lock, flags);
+ return;
+
+ failed:
+ /* Nuke everything (including tx we were trying) */
+ kptllnd_peer_close(peer, -EIO);
+ kptllnd_tx_decref(tx);
}
kptl_tx_t *
kptllnd_find_timed_out_tx(kptl_peer_t *peer)
{
kptl_tx_t *tx;
- struct list_head *tmp;
+ struct list_head *ele;
- list_for_each(tmp, &peer->peer_sendq) {
- tx = list_entry(peer->peer_sendq.next, kptl_tx_t, tx_list);
+ list_for_each(ele, &peer->peer_sendq) {
+ tx = list_entry(ele, kptl_tx_t, tx_list);
if (time_after_eq(jiffies, tx->tx_deadline)) {
kptllnd_tx_addref(tx);
}
}
- list_for_each(tmp, &peer->peer_activeq) {
- tx = list_entry(peer->peer_activeq.next, kptl_tx_t, tx_list);
+ list_for_each(ele, &peer->peer_activeq) {
+ tx = list_entry(ele, kptl_tx_t, tx_list);
if (time_after_eq(jiffies, tx->tx_deadline)) {
kptllnd_tx_addref(tx);
nactive = kptllnd_count_queue(&peer->peer_activeq);
spin_unlock_irqrestore(&peer->peer_lock, flags);
- LCONSOLE_ERROR("Timing out %s: %s\n",
- libcfs_id2str(peer->peer_id),
- (tx->tx_tposted == 0) ?
- "no free peer buffers" : "please check Portals");
+ LCONSOLE_ERROR_MSG(0x126, "Timing out %s: %s\n",
+ libcfs_id2str(peer->peer_id),
+ (tx->tx_tposted == 0) ?
+ "no free peer buffers" :
+ "please check Portals");
+
+ if (tx->tx_tposted) {
+ CERROR("Could not send to %s after %ds (sent %lds ago); "
+ "check Portals for possible issues\n",
+ libcfs_id2str(peer->peer_id),
+ *kptllnd_tunables.kptl_timeout,
+ cfs_duration_sec(jiffies - tx->tx_tposted));
+ } else {
+ CERROR("Could not get credits for %s after %ds; "
+ "possible Lustre networking issues\n",
+ libcfs_id2str(peer->peer_id),
+ *kptllnd_tunables.kptl_timeout);
+ }
CERROR("%s timed out: cred %d outstanding %d, sent %d, "
"sendq %d, activeq %d Tx %p %s (%s%s%s) status %d "
/*
 * Report on the console that process 'id' overflows the peer table and
 * that messages may be dropped, then advise which tunables to increase.
 *
 * str: text printed immediately before the formatted process id
 * id:  LNET process id, rendered with libcfs_id2str()
 *
 * The first message also prints kptllnd_data.kptl_n_active_peers.
 */
void
kptllnd_peertable_overflow_msg(char *str, lnet_process_id_t id)
{
- LCONSOLE_ERROR("%s %s overflows the peer table[%d]: "
- "messages may be dropped\n",
- str, libcfs_id2str(id),
- kptllnd_data.kptl_n_active_peers);
- LCONSOLE_ERROR("Please correct by increasing "
- "'max_nodes' or 'max_procs_per_node'\n");
/* First message: who overflowed and the current active-peer count */
+ LCONSOLE_ERROR_MSG(0x127, "%s %s overflows the peer table[%d]: "
+ "messages may be dropped\n",
+ str, libcfs_id2str(id),
+ kptllnd_data.kptl_n_active_peers);
/* Second message: remediation hint naming the tunables to raise */
+ LCONSOLE_ERROR_MSG(0x128, "Please correct by increasing "
+ "'max_nodes' or 'max_procs_per_node'\n");
}
__u64
CERROR("%s: max message size %d < MIN %d",
libcfs_id2str(lpid),
msg->ptlm_u.hello.kptlhm_max_msg_size,
- *kptllnd_tunables.kptl_max_msg_size);
+ PTLLND_MIN_BUFFER_SIZE);
return NULL;
}
}
write_lock_irqsave(g_lock, flags);
+
again:
+ if (kptllnd_data.kptl_shutdown) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CERROR ("Shutdown started, refusing connection from %s\n",
+ libcfs_id2str(lpid));
+ kptllnd_peer_unreserve_buffers();
+ kptllnd_peer_decref(new_peer);
+ kptllnd_tx_decref(hello_tx);
+ return NULL;
+ }
+
peer = kptllnd_id2peer_locked(lpid);
if (peer != NULL) {
if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
write_lock_irqsave(g_lock, flags);
again:
+ if (kptllnd_data.kptl_shutdown) {
+ write_unlock_irqrestore(g_lock, flags);
+ rc = -ESHUTDOWN;
+ goto unwind_2;
+ }
+
*peerp = kptllnd_id2peer_locked(target);
if (*peerp != NULL) {
write_unlock_irqrestore(g_lock, flags);