* Performance Tuning defines
* NB no mention of PAGE_SIZE for interoperability
*/
-#define KQSW_MAXPAYLOAD PTL_MTU
-#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
-
#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */
#define KQSW_NTXMSGS 8 /* # normal transmit messages */
#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */
#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */
-#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */
-
#define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */
#define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */
+/* fixed constants */
+#define KQSW_MAXPAYLOAD PTL_MTU
+#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
+#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */
+
/*
* derived constants
*/
MODULES := kvibnal
-kvibnal-objs := vibnal.o vibnal_cb.o
+kvibnal-objs := vibnal.o vibnal_cb.o vibnal_modparams.o
EXTRA_POST_CFLAGS := @VIBCPPFLAGS@
};
kib_data_t kibnal_data;
-kib_tunables_t kibnal_tunables;
-
-#ifdef CONFIG_SYSCTL
-#define IBNAL_SYSCTL 202
-
-#define IBNAL_SYSCTL_TIMEOUT 1
-
-static ctl_table kibnal_ctl_table[] = {
- {IBNAL_SYSCTL_TIMEOUT, "timeout",
- &kibnal_tunables.kib_io_timeout, sizeof (int),
- 0644, NULL, &proc_dointvec},
- { 0 }
-};
-
-static ctl_table kibnal_top_ctl_table[] = {
- {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
- { 0 }
-};
-#endif
void vibnal_assert_wire_constants (void)
{
CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
}
-void
-kibnal_pause(int ticks)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(ticks);
-}
-
__u32
kibnal_cksum (void *ptr, int nob)
{
msg->ibm_dstnid = dstnid;
msg->ibm_dststamp = dststamp;
msg->ibm_seq = seq;
-#if IBNAL_CKSUM
- /* NB ibm_cksum zero while computing cksum */
- msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+ if (*kibnal_tunables.kib_cksum) {
+ /* NB ibm_cksum zero while computing cksum */
+ msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+ }
}
int
}
int
-kibnal_set_mynid(ptl_nid_t nid)
+kibnal_start_listener (ptl_ni_t *ni)
{
- static cm_listen_data_t info; /* protected by kib_nid_mutex */
+ static cm_listen_data_t info;
- ptl_ni_t *ni = kibnal_data.kib_ni;
- int rc;
cm_return_t cmrc;
- CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
- nid, ni->ni_nid);
+ LASSERT (kibnal_data.kib_listen_handle == NULL);
- down (&kibnal_data.kib_nid_mutex);
-
- if (nid == ni->ni_nid) {
- /* no change of NID */
- up (&kibnal_data.kib_nid_mutex);
- return (0);
+ kibnal_data.kib_listen_handle =
+ cm_create_cep(cm_cep_transp_rc);
+ if (kibnal_data.kib_listen_handle == NULL) {
+ CERROR ("Can't create listen CEP\n");
+ return -ENOMEM;
}
- CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_nid, nid);
+ CDEBUG(D_NET, "Created CEP %p for listening\n",
+ kibnal_data.kib_listen_handle);
- if (kibnal_data.kib_listen_handle != NULL) {
- cmrc = cm_cancel(kibnal_data.kib_listen_handle);
- if (cmrc != cm_stat_success)
- CERROR ("Error %d stopping listener\n", cmrc);
+ memset(&info, 0, sizeof(info));
+ info.listen_addr.end_pt.sid =
+ (__u64)(*kibnal_tunables.kib_service_number);
- kibnal_pause(HZ/10); /* ensure no more callbacks */
+ cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
+ kibnal_listen_callback, NULL);
+ if (cmrc == cm_stat_success)
+ return 0;
- cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
- if (cmrc != vv_return_ok)
- CERROR ("Error %d destroying CEP\n", cmrc);
-
- kibnal_data.kib_listen_handle = NULL;
- }
-
- /* Change NID. NB queued passive connection requests (if any) will be
- * rejected with an incorrect destination NID */
- ni->ni_nid = nid;
- kibnal_data.kib_incarnation++;
- mb();
-
- /* Delete all existing peers and their connections after new
- * NID/incarnation set to ensure no old connections in our brave
- * new world. */
- kibnal_del_peer (PTL_NID_ANY, 0);
-
- if (ni->ni_nid != PTL_NID_ANY) { /* got a new NID to install */
- kibnal_data.kib_listen_handle =
- cm_create_cep(cm_cep_transp_rc);
- if (kibnal_data.kib_listen_handle == NULL) {
- CERROR ("Can't create listen CEP\n");
- rc = -ENOMEM;
- goto failed_0;
- }
+ CERROR ("cm_listen error: %d\n", cmrc);
- CDEBUG(D_NET, "Created CEP %p for listening\n",
- kibnal_data.kib_listen_handle);
+ cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
+ LASSERT (cmrc == cm_stat_success);
- memset(&info, 0, sizeof(info));
- info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
+ kibnal_data.kib_listen_handle = NULL;
+ return -EINVAL;
+}
- cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
- kibnal_listen_callback, NULL);
- if (cmrc != 0) {
- CERROR ("cm_listen error: %d\n", cmrc);
- rc = -EINVAL;
- goto failed_1;
- }
- }
+void
+kibnal_stop_listener(ptl_ni_t *ni)
+{
+ cm_return_t cmrc;
- up (&kibnal_data.kib_nid_mutex);
- return (0);
+ LASSERT (kibnal_data.kib_listen_handle != NULL);
+
+ cmrc = cm_cancel(kibnal_data.kib_listen_handle);
+ if (cmrc != cm_stat_success)
+ CERROR ("Error %d stopping listener\n", cmrc);
- failed_1:
+ libcfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */
+
cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
- LASSERT (cmrc == cm_stat_success);
+ if (cmrc != vv_return_ok)
+ CERROR ("Error %d destroying CEP\n", cmrc);
+
kibnal_data.kib_listen_handle = NULL;
- failed_0:
- ni->ni_nid = PTL_NID_ANY;
- kibnal_data.kib_incarnation++;
- mb();
- kibnal_del_peer (PTL_NID_ANY, 0);
- up (&kibnal_data.kib_nid_mutex);
- return rc;
}
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
+int
+kibnal_create_peer (kib_peer_t **peerp, ptl_nid_t nid)
{
- kib_peer_t *peer;
+ kib_peer_t *peer;
+ unsigned long flags;
+ int rc;
LASSERT (nid != PTL_NID_ANY);
PORTAL_ALLOC(peer, sizeof (*peer));
if (peer == NULL) {
CERROR("Canot allocate perr\n");
- return (NULL);
+ return -ENOMEM;
}
memset(peer, 0, sizeof(*peer)); /* zero flags etc */
INIT_LIST_HEAD (&peer->ibp_tx_queue);
peer->ibp_reconnect_time = jiffies;
- peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_reconnect_interval =
+ *kibnal_tunables.kib_min_reconnect_interval * HZ;
- atomic_inc (&kibnal_data.kib_npeers);
- if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
- return peer;
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- CERROR("Too many peers: CQ will overflow\n");
- kibnal_peer_decref(peer);
- return NULL;
+        /* Admit the new peer only while under the concurrency limit AND the
+         * listener still exists (i.e. shutdown hasn't started).  NB the test
+         * must be ">=": we fail with -EOVERFLOW when the table is already
+         * full, matching the "too many peers" message reported below. */
+        if (kibnal_data.kib_npeers >=
+            *kibnal_tunables.kib_concurrent_peers) {
+                rc = -EOVERFLOW;   /* !! but at least it distinguishes */
+        } else if (kibnal_data.kib_listen_handle == NULL) {
+                rc = -ESHUTDOWN;   /* shutdown has started */
+        } else {
+                rc = 0;
+                kibnal_data.kib_npeers++;
+        }
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ if (rc != 0) {
+ CERROR("Can't create peer: %s\n",
+ (rc == -ESHUTDOWN) ? "shutting down" :
+ "too many peers");
+ PORTAL_FREE(peer, sizeof(*peer));
+ } else {
+ *peerp = peer;
+ }
+
+ return rc;
}
void
kibnal_destroy_peer (kib_peer_t *peer)
{
+ unsigned long flags;
LASSERT (atomic_read (&peer->ibp_refcount) == 0);
LASSERT (peer->ibp_persistence == 0);
PORTAL_FREE (peer, sizeof (*peer));
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
/* NB a peer's connections keep a reference on their peer until
* they are destroyed, so we can be assured that _all_ state to do
* with this peer has been cleaned up when its refcount drops to
* zero. */
- atomic_dec (&kibnal_data.kib_npeers);
+ kibnal_data.kib_npeers--;
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
-/* the caller is responsible for accounting for the additional reference
- * that this creates */
kib_peer_t *
kibnal_find_peer_locked (ptl_nid_t nid)
{
+ /* the caller is responsible for accounting the additional reference
+ * that this creates */
struct list_head *peer_list = kibnal_nid2peerlist (nid);
struct list_head *tmp;
kib_peer_t *peer;
kib_peer_t *peer;
kib_peer_t *peer2;
unsigned long flags;
+ int rc;
CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
if (nid == PTL_NID_ANY)
return (-EINVAL);
- peer = kibnal_create_peer (nid);
- if (peer == NULL)
- return (-ENOMEM);
+ rc = kibnal_create_peer(&peer, nid);
+ if (rc != 0)
+ return rc;
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
LASSERT (ni == kibnal_data.kib_ni);
- switch(data->ioc_command) {
+ switch(cmd) {
case IOC_PORTAL_GET_PEER: {
ptl_nid_t nid = 0;
__u32 ip = 0;
data->ioc_nid = nid;
data->ioc_count = share_count;
data->ioc_u32[0] = ip;
- data->ioc_u32[1] = IBNAL_SERVICE_NUMBER; /* port */
+ data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */
break;
}
case IOC_PORTAL_ADD_PEER: {
break;
}
case IOC_PORTAL_REGISTER_MYNID: {
- if (data->ioc_nid == PTL_NID_ANY)
+ if (ni->ni_nid == data->ioc_nid) {
+ rc = 0;
+ } else {
+ CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n",
+ libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(ni->ni_nid));
rc = -EINVAL;
- else
- rc = kibnal_set_mynid (data->ioc_nid);
+ }
break;
}
}
int i;
PORTAL_ALLOC (kibnal_data.kib_tx_descs,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
if (kibnal_data.kib_tx_descs == NULL)
return -ENOMEM;
memset(kibnal_data.kib_tx_descs, 0,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
PORTAL_ALLOC(tx->tx_wrq,
if (kibnal_data.kib_tx_descs == NULL)
return;
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
if (tx->tx_wrq != NULL)
}
PORTAL_FREE(kibnal_data.kib_tx_descs,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
}
int
/* No fancy arithmetic when we do the buffer calculations */
CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
- rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
- 0);
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+ IBNAL_TX_MSG_PAGES(), 0);
if (rc != 0)
return (rc);
/* ignored for the whole_mem case */
vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
tx = &kibnal_data.kib_tx_descs[i];
#else
tx->tx_vaddr = vaddr;
#endif
- tx->tx_isnblk = (i >= IBNAL_NTX);
+ tx->tx_isnblk = (i >= *kibnal_tunables.kib_ntx);
tx->tx_mapped = KIB_TX_UNMAPPED;
CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx,
&kibnal_data.kib_idle_txs);
vaddr += IBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+ LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
}
}
void
kibnal_shutdown (ptl_ni_t *ni)
{
- int i;
- vv_return_t vvrc;
+ unsigned long flags;
+ int i;
+ vv_return_t vvrc;
LASSERT (ni == kibnal_data.kib_ni);
LASSERT (ni->ni_data == &kibnal_data);
switch (kibnal_data.kib_init) {
case IBNAL_INIT_ALL:
- /* resetting my NID removes my listener and nukes all current
- * peers and their connections */
- kibnal_set_mynid (PTL_NID_ANY);
+ /* stop accepting connections and prevent new peers */
+ kibnal_stop_listener(ni);
+
+ /* nuke all existing peers */
+ kibnal_del_peer(PTL_NID_ANY);
/* Wait for all peer state to clean up */
i = 2;
- while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ while (kibnal_data.kib_npeers != 0) {
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
"waiting for %d peers to disconnect\n",
- atomic_read (&kibnal_data.kib_npeers));
+ kibnal_data.kib_npeers);
set_current_state (TASK_UNINTERRUPTIBLE);
schedule_timeout (HZ);
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
}
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* fall through */
case IBNAL_INIT_CQ:
/* fall through */
case IBNAL_INIT_DATA:
- LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (kibnal_data.kib_npeers == 0);
LASSERT (kibnal_data.kib_peers != NULL);
for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
LASSERT (list_empty (&kibnal_data.kib_peers[i]));
CERROR("Explicit interface config not supported\n");
return PTL_FAIL;
}
+
+#warning discover IPoIB IP address here
PORTAL_MODULE_USE;
memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
do_gettimeofday(&tv);
kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
- kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
-
- init_MUTEX (&kibnal_data.kib_nid_mutex);
rwlock_init(&kibnal_data.kib_global_lock);
/* flag TX descs initialised */
kibnal_data.kib_init = IBNAL_INIT_TXD;
/*****************************************************/
+
{
uint32_t nentries;
- vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+ vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
kibnal_cq_callback,
NULL, /* context */
&kibnal_data.kib_cq, &nentries);
/* flag CQ initialised */
kibnal_data.kib_init = IBNAL_INIT_CQ;
- if (nentries < IBNAL_CQ_ENTRIES) {
+ if (nentries < IBNAL_CQ_ENTRIES()) {
CERROR ("CQ only has %d entries, need %d\n",
- nentries, IBNAL_CQ_ENTRIES);
+ nentries, IBNAL_CQ_ENTRIES());
goto failed;
}
goto failed;
}
}
+
+ rc = kibnal_start_listener(ni);
+ if (rc != 0) {
+ CERROR("Can't start listener: %d\n", rc);
+ goto failed;
+ }
/* flag everything initialised */
kibnal_data.kib_init = IBNAL_INIT_ALL;
void __exit
kibnal_module_fini (void)
{
-#ifdef CONFIG_SYSCTL
- if (kibnal_tunables.kib_sysctl != NULL)
- unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-#endif
ptl_unregister_nal(&kibnal_nal);
+ kibnal_tunables_fini();
}
int __init
<= IBNAL_MSG_SIZE);
CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
<= IBNAL_MSG_SIZE);
-
- /* the following must be sizeof(int) for proc_dointvec() */
- CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
- /* Initialise dynamic tunables to defaults once only */
- kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+ rc = kibnal_tunables_init();
+ if (rc != 0)
+ return rc;
ptl_register_nal(&kibnal_nal);
-
-#ifdef CONFIG_SYSCTL
- /* Press on regardless even if registering sysctl doesn't work */
- kibnal_tunables.kib_sysctl =
- register_sysctl_table (kibnal_top_ctl_table, 0);
-#endif
- return (0);
+
+ return 0;
}
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
# define IBNAL_N_SCHED 1 /* # schedulers */
#endif
+#define IBNAL_WHOLE_MEM 1
+#if !IBNAL_WHOLE_MEM
+# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)"
+#endif
+
+/* defaults for modparams/tunables */
+#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* Fixed service number */
+#define IBNAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */
+#define IBNAL_CONCURRENT_PEERS 1024 /* # nodes all talking at once to me */
+#define IBNAL_CKSUM 0 /* checksum kib_msg_t? */
+#define IBNAL_TIMEOUT 50 /* default comms timeout (seconds) */
+#define IBNAL_NTX 64 /* # tx descs */
+#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */
+
+/* tunables fixed at compile time */
+#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
+#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
+#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */
+#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+
/* sdp-connection.c */
#define IBNAL_QKEY 0
#define IBNAL_PKEY 0xffff
#define IBNAL_ARB_INITIATOR_DEPTH 0
#define IBNAL_ARB_RESP_RES 0
#define IBNAL_FAILOVER_ACCEPTED 0
-#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* Fixed service number */
-
-#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
-#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
-
-#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
-
-#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */
-
-#define IBNAL_NTX 64 /* # tx descs */
-#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */
-/* reduced from 256 to ensure we register < 255 pages per region.
- * this can change if we register all memory. */
-
-#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
-
-#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
-
-#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
-
-#define IBNAL_RDMA_BASE 0x0eeb0000
-#define IBNAL_CKSUM 0
-#define IBNAL_WHOLE_MEM 1
-#if !IBNAL_WHOLE_MEM
-# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)"
-#endif
-
-/* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
/************************/
/* derived constants... */
/* TX messages (shared by all connections) */
-#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK)
-#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx + \
+ *kibnal_tunables.kib_ntx_nblk)
+#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
#if IBNAL_WHOLE_MEM
# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
#else
+# define IBNAL_RDMA_BASE 0x0eeb0000
# define IBNAL_MAX_RDMA_FRAGS 1
#endif
/* RX messages (per connection) */
-#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-#define IBNAL_CQ_ENTRIES (IBNAL_TX_MSGS * (1 + IBNAL_MAX_RDMA_FRAGS) + \
- IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)
+#define IBNAL_CQ_ENTRIES() (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) + \
+ IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers)
typedef struct
{
- int kib_io_timeout; /* comms timeout (seconds) */
+ unsigned int *kib_service_number; /* IB service number */
+ int *kib_min_reconnect_interval; /* first failed connection retry... */
+ int *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+ int *kib_concurrent_peers; /* max # nodes all talking to me */
+ int *kib_cksum; /* checksum kib_msg_t? */
+ int *kib_timeout; /* comms timeout (seconds) */
+ int *kib_ntx; /* # tx descs */
+ int *kib_ntx_nblk; /* # reserved tx descs */
+
struct ctl_table_header *kib_sysctl; /* sysctl interface */
} kib_tunables_t;
atomic_t kib_nthreads; /* # live threads */
ptl_ni_t *kib_ni; /* _the_ nal instance */
- __u64 kib_svc_id; /* service number I listen on */
vv_gid_t kib_port_gid; /* device/port GID */
vv_p_key_t kib_port_pkey; /* device/port pkey */
- struct semaphore kib_nid_mutex; /* serialise NID ops */
cm_cep_handle_t kib_listen_handle; /* IB listen handle */
rwlock_t kib_global_lock; /* stabilize peer/conn ops */
struct list_head *kib_peers; /* hash table of all my known peers */
int kib_peer_hash_size; /* size of kib_peers */
- atomic_t kib_npeers; /* # peers extant */
+ int kib_npeers; /* # peers extant */
atomic_t kib_nconns; /* # connections extant */
void *kib_connd; /* the connd task (serialisation assertions) */
extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob);
extern void kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid,
__u64 dststamp, __u64 seq);
-extern int kibnal_unpack_msg(kib_msg_t *msg, int nob);
-extern kib_peer_t *kibnal_create_peer(ptl_nid_t nid);
+extern int kibnal_unpack_msg(kib_msg_t *msg, int nob);
+extern int kibnal_create_peer(kib_peer_t **peerp, ptl_nid_t nid);
extern void kibnal_destroy_peer(kib_peer_t *peer);
-extern int kibnal_del_peer(ptl_nid_t nid);
+extern int kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip);
+extern int kibnal_del_peer(ptl_nid_t nid);
extern kib_peer_t *kibnal_find_peer_locked(ptl_nid_t nid);
extern void kibnal_unlink_peer_locked(kib_peer_t *peer);
extern int kibnal_close_stale_conns_locked(kib_peer_t *peer,
extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep);
extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
-extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access);
+extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access);
extern void kibnal_free_pages(kib_pages_t *p);
extern void kibnal_check_sends(kib_conn_t *conn);
extern void kibnal_async_callback(vv_event_record_t ev);
extern void kibnal_cq_callback(unsigned long context);
extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject);
-extern void kibnal_pause(int ticks);
extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn);
extern int kibnal_init_rdma(kib_tx_t *tx, int type, int nob,
kib_rdma_desc_t *dstrd, __u64 dstcookie);
+extern int kibnal_tunables_init(void);
+extern void kibnal_tunables_fini(void);
static inline int
wrq_signals_completion (vv_wr_t *wrq)
LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE);
}
tx->tx_queued = 1;
- tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+ tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ);
list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
}
* network address, given how it maps all phys mem into 1 region */
addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;
+ /* NB this relies entirely on there being a single region for the whole
+ * of memory, since "high" memory will wrap in the (void *) cast! */
vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
(void *)((unsigned long)addr),
len, &mem_h, &l_key, &r_key);
#if CONFIG_HIGHMEM
if (vaddr >= PKMAP_BASE &&
vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
- /* No highmem pages only used for bulk (kiov) I/O */
+ /* Highmem pages only used for bulk (kiov) I/O */
CERROR("find page for address in highmem\n");
LBUG();
}
int niov, struct iovec *iov, int offset, int nob)
{
+#error "check this thoroughly before enabling"
/* active if I'm sending */
int active = ((access & vv_acc_r_mem_write) == 0);
void *vaddr;
vv_access_con_bit_mask_t access,
int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
+#error "check this thoroughly before enabling"
/* active if I'm sending */
int active = ((access & vv_acc_r_mem_write) == 0);
vv_return_t vvrc;
kib_conn_t *conn;
unsigned long flags;
rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+ int retry;
+ int rc;
/* If I get here, I've committed to send, so I complete the tx with
* failure on any problems */
LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
LASSERT (tx->tx_nwrq > 0); /* work items have been set up */
- read_lock_irqsave(g_lock, flags);
+ for (retry = 0; ; retry = 1) {
+ read_lock_irqsave(g_lock, flags);
- peer = kibnal_find_peer_locked (nid);
- if (peer == NULL) {
- read_unlock_irqrestore(g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- tx->tx_waiting = 0;
- kibnal_tx_done (tx);
- return;
- }
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL) {
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ kibnal_conn_addref(conn); /* 1 ref for me... */
+ read_unlock_irqrestore(g_lock, flags);
- conn = kibnal_find_conn_locked (peer);
- if (conn != NULL) {
- kibnal_conn_addref(conn); /* 1 ref for me... */
- read_unlock_irqrestore(g_lock, flags);
+ kibnal_queue_tx (tx, conn);
+ kibnal_conn_decref(conn); /* ...to here */
+ return;
+ }
+ }
- kibnal_queue_tx (tx, conn);
- kibnal_conn_decref(conn); /* ...to here */
- return;
- }
-
- /* Making one or more connections; I'll need a write lock... */
- read_unlock(g_lock);
- write_lock(g_lock);
+ /* Making one or more connections; I'll need a write lock... */
+ read_unlock(g_lock);
+ write_lock(g_lock);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL)
+ break;
- peer = kibnal_find_peer_locked (nid);
- if (peer == NULL) {
write_unlock_irqrestore(g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- tx->tx_waiting = 0;
- kibnal_tx_done (tx);
- return;
+
+ if (retry) {
+ CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ rc = kibnal_add_persistent_peer(nid, PTL_NIDADDR(nid));
+ if (rc != 0) {
+ CERROR("Can't add peer %s: %d\n",
+ libcfs_nid2str(nid), rc);
+
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
}
conn = kibnal_find_conn_locked (peer);
/* Only the connd creates conns => single threaded */
LASSERT (!in_interrupt());
LASSERT (current == kibnal_data.kib_connd);
- LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
/* Say when active connection can be re-attempted */
peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
/* Increase reconnection interval */
- peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
- IBNAL_MAX_RECONNECT_INTERVAL);
+ peer->ibp_reconnect_interval =
+ MIN (peer->ibp_reconnect_interval * 2,
+ *kibnal_tunables.kib_max_reconnect_interval * HZ);
/* Take peer's blocked transmits to complete with error */
list_add(&zombies, &peer->ibp_tx_queue);
case IBNAL_CONN_ACTIVE_CONNECT:
LASSERT (active);
cm_cancel(conn->ibc_cep);
- kibnal_pause(HZ/10);
+ libcfs_pause(cfs_time_seconds(1)/10);
/* cm_connect() failed immediately or
* callback returned failure */
break;
list_del_init(&peer->ibp_tx_queue);
/* reset reconnect interval for next attempt */
- peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_reconnect_interval =
+ *kibnal_tunables.kib_min_reconnect_interval * HZ;
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Schedule blocked txs */
LASSERT (!in_interrupt());
LASSERT (current == kibnal_data.kib_connd);
- if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
+ if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
- cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
+ cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
goto reject;
}
}
/* assume 'rxmsg.ibm_srcnid' is a new peer */
- tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
- if (tmp_peer == NULL) {
+ rc = kibnal_create_peer (&tmp_peer, rxmsg.ibm_srcnid);
+ if (rc != 0) {
CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
kibnal_conn_decref(conn);
conn = NULL;
memset(&cmreq, 0, sizeof(cmreq));
- cmreq.sid = IBNAL_SERVICE_NUMBER;
+ cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid;
cmreq.cep_data.qpn = cv->cv_local_qpn;
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
cm_cancel(conn->ibc_cep);
- kibnal_pause(HZ/10);
+ libcfs_pause(cfs_time_seconds(1)/10);
if (!conn->ibc_disconnect) /* CM callback will never happen now */
kibnal_conn_decref(conn);
* connection within (n+1)/n times the timeout
* interval. */
- if (kibnal_tunables.kib_io_timeout > n * p)
+ if (*kibnal_tunables.kib_timeout > n * p)
chunk = (chunk * n * p) /
- kibnal_tunables.kib_io_timeout;
+ *kibnal_tunables.kib_timeout;
if (chunk == 0)
chunk = 1;
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "vibnal.h"
+
+static int service_number = IBNAL_SERVICE_NUMBER;
+CFS_MODULE_PARM(service_number, "i", int, 0444,
+ "IB service number");
+
+static int min_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+ "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = IBNAL_MAX_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+ "maximum connection retry interval (seconds)");
+
+static int concurrent_peers = IBNAL_CONCURRENT_PEERS;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+ "maximum number of peers that may connect");
+
+static int cksum = IBNAL_CKSUM;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+ "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = IBNAL_TIMEOUT;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+ "timeout (seconds)");
+
+static int ntx = IBNAL_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+ "# of 'normal' message descriptors");
+
+static int ntx_nblk = IBNAL_NTX_NBLK;
+CFS_MODULE_PARM(ntx_nblk, "i", int, 0444,
+ "# of 'reserved' message descriptors");
+
+kib_tunables_t kibnal_tunables = {
+ .kib_service_number = &service_number,
+ .kib_min_reconnect_interval = &min_reconnect_interval,
+ .kib_max_reconnect_interval = &max_reconnect_interval,
+ .kib_concurrent_peers = &concurrent_peers,
+ .kib_cksum = &cksum,
+ .kib_timeout = &timeout,
+ .kib_ntx = &ntx,
+ .kib_ntx_nblk = &ntx_nblk,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table kibnal_ctl_table[] = {
+ {1, "service_number", &service_number,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {2, "min_reconnect_interval", &min_reconnect_interval,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {3, "max_reconnect_interval", &max_reconnect_interval,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {4, "concurrent_peers", &concurrent_peers,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {5, "cksum", &cksum,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {6, "timeout", &timeout,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {7, "ntx", &ntx,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {8, "ntx_nblk", &ntx_nblk,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+ {203, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
+ {0}
+};
+
+int
+kibnal_tunables_init (void)
+{
+        /* Press on regardless even if registering sysctl doesn't work */
+        kibnal_tunables.kib_sysctl =
+                register_sysctl_table(kibnal_top_ctl_table, 0);
+
+        if (kibnal_tunables.kib_sysctl == NULL)
+                CWARN("Can't setup /proc tunables\n");
+
+        return 0;
+}
+
+void
+kibnal_tunables_fini (void)
+{
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table(kibnal_tunables.kib_sysctl);
+}
+
+#else
+
+/* No sysctl support configured: module parameters alone control tunables */
+int
+kibnal_tunables_init (void)
+{
+        return 0;
+}
+
+void
+kibnal_tunables_fini (void)
+{
+}
+
+#endif
+
+
+
+
+
+