From 9116cc42da75f0d4ea9ee4ccdf9201028c77835d Mon Sep 17 00:00:00 2001 From: eeb Date: Thu, 2 Jun 2005 07:09:25 +0000 Subject: [PATCH] * moved #defines in qswnal.h around to separate fixed constants from ones that can be set via modparams * placeholder vibnal for newconfig; still need to get the IPoIB IP address somehow. --- lnet/klnds/qswlnd/qswlnd.h | 10 +- lnet/klnds/viblnd/Makefile.in | 2 +- lnet/klnds/viblnd/viblnd.c | 302 ++++++++++++++++------------------- lnet/klnds/viblnd/viblnd.h | 98 ++++++------ lnet/klnds/viblnd/viblnd_cb.c | 102 +++++++----- lnet/klnds/viblnd/viblnd_modparams.c | 133 +++++++++++++++ 6 files changed, 391 insertions(+), 256 deletions(-) create mode 100644 lnet/klnds/viblnd/viblnd_modparams.c diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 767fce8..8269d29 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -92,9 +92,6 @@ typedef unsigned long kqsw_csum_t; * Performance Tuning defines * NB no mention of PAGE_SIZE for interoperability */ -#define KQSW_MAXPAYLOAD PTL_MTU -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ - #define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ #define KQSW_NTXMSGS 8 /* # normal transmit messages */ @@ -106,11 +103,14 @@ typedef unsigned long kqsw_csum_t; #define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ #define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ -#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ - #define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */ #define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ +/* fixed constants */ +#define KQSW_MAXPAYLOAD PTL_MTU +#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ +#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ + /* * derived constants */ diff --git a/lnet/klnds/viblnd/Makefile.in 
b/lnet/klnds/viblnd/Makefile.in index 5287e70..5fe1630 100644 --- a/lnet/klnds/viblnd/Makefile.in +++ b/lnet/klnds/viblnd/Makefile.in @@ -1,5 +1,5 @@ MODULES := kvibnal -kvibnal-objs := vibnal.o vibnal_cb.o +kvibnal-objs := vibnal.o vibnal_cb.o vibnal_modparams.o EXTRA_POST_CFLAGS := @VIBCPPFLAGS@ diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 32e1ff1..0486ca9 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -36,25 +36,6 @@ ptl_nal_t kibnal_nal = { }; kib_data_t kibnal_data; -kib_tunables_t kibnal_tunables; - -#ifdef CONFIG_SYSCTL -#define IBNAL_SYSCTL 202 - -#define IBNAL_SYSCTL_TIMEOUT 1 - -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; -#endif void vibnal_assert_wire_constants (void) { @@ -181,13 +162,6 @@ void vibnal_assert_wire_constants (void) CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12); } -void -kibnal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} - __u32 kibnal_cksum (void *ptr, int nob) { @@ -225,10 +199,11 @@ kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, msg->ibm_dstnid = dstnid; msg->ibm_dststamp = dststamp; msg->ibm_seq = seq; -#if IBNAL_CKSUM - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); -#endif + + if (*kibnal_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); + } } int @@ -421,103 +396,75 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) } int -kibnal_set_mynid(ptl_nid_t nid) +kibnal_start_listener (ptl_ni_t *ni) { - static cm_listen_data_t info; /* protected by kib_nid_mutex */ + static cm_listen_data_t info; - ptl_ni_t *ni = kibnal_data.kib_ni; - int rc; 
cm_return_t cmrc; - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_nid); + LASSERT (kibnal_data.kib_listen_handle == NULL); - down (&kibnal_data.kib_nid_mutex); - - if (nid == ni->ni_nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); + kibnal_data.kib_listen_handle = + cm_create_cep(cm_cep_transp_rc); + if (kibnal_data.kib_listen_handle == NULL) { + CERROR ("Can't create listen CEP\n"); + return -ENOMEM; } - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_nid, nid); + CDEBUG(D_NET, "Created CEP %p for listening\n", + kibnal_data.kib_listen_handle); - if (kibnal_data.kib_listen_handle != NULL) { - cmrc = cm_cancel(kibnal_data.kib_listen_handle); - if (cmrc != cm_stat_success) - CERROR ("Error %d stopping listener\n", cmrc); + memset(&info, 0, sizeof(info)); + info.listen_addr.end_pt.sid = + (__u64)(*kibnal_tunables.kib_service_number); - kibnal_pause(HZ/10); /* ensure no more callbacks */ + cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, + kibnal_listen_callback, NULL); + if (cmrc == cm_stat_success) + return 0; - cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); - if (cmrc != vv_return_ok) - CERROR ("Error %d destroying CEP\n", cmrc); - - kibnal_data.kib_listen_handle = NULL; - } - - /* Change NID. NB queued passive connection requests (if any) will be - * rejected with an incorrect destination NID */ - ni->ni_nid = nid; - kibnal_data.kib_incarnation++; - mb(); - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. 
*/ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (ni->ni_nid != PTL_NID_ANY) { /* got a new NID to install */ - kibnal_data.kib_listen_handle = - cm_create_cep(cm_cep_transp_rc); - if (kibnal_data.kib_listen_handle == NULL) { - CERROR ("Can't create listen CEP\n"); - rc = -ENOMEM; - goto failed_0; - } + CERROR ("cm_listen error: %d\n", cmrc); - CDEBUG(D_NET, "Created CEP %p for listening\n", - kibnal_data.kib_listen_handle); + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); + LASSERT (cmrc == cm_stat_success); - memset(&info, 0, sizeof(info)); - info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id; + kibnal_data.kib_listen_handle = NULL; + return -EINVAL; +} - cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, - kibnal_listen_callback, NULL); - if (cmrc != 0) { - CERROR ("cm_listen error: %d\n", cmrc); - rc = -EINVAL; - goto failed_1; - } - } +void +kibnal_stop_listener(ptl_ni_t *ni) +{ + cm_return_t cmrc; - up (&kibnal_data.kib_nid_mutex); - return (0); + LASSERT (kibnal_data.kib_listen_handle != NULL); + + cmrc = cm_cancel(kibnal_data.kib_listen_handle); + if (cmrc != cm_stat_success) + CERROR ("Error %d stopping listener\n", cmrc); - failed_1: + libcfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */ + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); - LASSERT (cmrc == cm_stat_success); + if (cmrc != cm_stat_success) + CERROR ("Error %d destroying CEP\n", cmrc); + kibnal_data.kib_listen_handle = NULL; - failed_0: - ni->ni_nid = PTL_NID_ANY; - kibnal_data.kib_incarnation++; - mb(); - kibnal_del_peer (PTL_NID_ANY, 0); - up (&kibnal_data.kib_nid_mutex); - return rc; } -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) +int +kibnal_create_peer (kib_peer_t **peerp, ptl_nid_t nid) { - kib_peer_t *peer; + kib_peer_t *peer; + unsigned long flags; + int rc; LASSERT (nid != PTL_NID_ANY); PORTAL_ALLOC(peer, sizeof (*peer)); if (peer == NULL) { CERROR("Canot allocate perr\n"); - return (NULL); + return -ENOMEM; } memset(peer, 0, sizeof(*peer)); /* zero 
flags etc */ @@ -530,20 +477,39 @@ kibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_tx_queue); peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = + *kibnal_tunables.kib_min_reconnect_interval * HZ; - atomic_inc (&kibnal_data.kib_npeers); - if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS) - return peer; + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - CERROR("Too many peers: CQ will overflow\n"); - kibnal_peer_decref(peer); - return NULL; + if (kibnal_data.kib_npeers >= + *kibnal_tunables.kib_concurrent_peers) { + rc = -EOVERFLOW; /* !! but at least it distinguishes */ + } else if (kibnal_data.kib_listen_handle == NULL) { + rc = -ESHUTDOWN; /* shutdown has started */ + } else { + rc = 0; + kibnal_data.kib_npeers++; + } + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Can't create peer: %s\n", + (rc == -ESHUTDOWN) ? "shutting down" : + "too many peers"); + PORTAL_FREE(peer, sizeof(*peer)); + } else { + *peerp = peer; + } + + return rc; } void kibnal_destroy_peer (kib_peer_t *peer) { + unsigned long flags; LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); @@ -554,18 +520,22 @@ kibnal_destroy_peer (kib_peer_t *peer) PORTAL_FREE (peer, sizeof (*peer)); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. 
*/ - atomic_dec (&kibnal_data.kib_npeers); + kibnal_data.kib_npeers--; + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } -/* the caller is responsible for accounting for the additional reference - * that this creates */ kib_peer_t * kibnal_find_peer_locked (ptl_nid_t nid) { + /* the caller is responsible for accounting the additional reference + * that this creates */ struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; kib_peer_t *peer; @@ -643,15 +613,16 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip) kib_peer_t *peer; kib_peer_t *peer2; unsigned long flags; + int rc; CDEBUG(D_NET, LPX64"@%08x\n", nid, ip); if (nid == PTL_NID_ANY) return (-EINVAL); - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = kibnal_create_peer(&peer, nid); + if (rc != 0) + return rc; write_lock_irqsave(&kibnal_data.kib_global_lock, flags); @@ -1176,7 +1147,7 @@ kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) LASSERT (ni == kibnal_data.kib_ni); - switch(data->ioc_command) { + switch(cmd) { case IOC_PORTAL_GET_PEER: { ptl_nid_t nid = 0; __u32 ip = 0; @@ -1187,7 +1158,7 @@ kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) data->ioc_nid = nid; data->ioc_count = share_count; data->ioc_u32[0] = ip; - data->ioc_u32[1] = IBNAL_SERVICE_NUMBER; /* port */ + data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */ break; } case IOC_PORTAL_ADD_PEER: { @@ -1216,10 +1187,14 @@ kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) break; } case IOC_PORTAL_REGISTER_MYNID: { - if (data->ioc_nid == PTL_NID_ANY) + if (ni->ni_nid == data->ioc_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; - else - rc = kibnal_set_mynid (data->ioc_nid); + } break; } } @@ -1331,14 +1306,14 @@ kibnal_alloc_tx_descs (void) int i; PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + 
IBNAL_TX_MSGS() * sizeof(kib_tx_t)); if (kibnal_data.kib_tx_descs == NULL) return -ENOMEM; memset(kibnal_data.kib_tx_descs, 0, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; PORTAL_ALLOC(tx->tx_wrq, @@ -1371,7 +1346,7 @@ kibnal_free_tx_descs (void) if (kibnal_data.kib_tx_descs == NULL) return; - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; if (tx->tx_wrq != NULL) @@ -1391,7 +1366,7 @@ kibnal_free_tx_descs (void) } PORTAL_FREE(kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); } int @@ -1412,15 +1387,15 @@ kibnal_setup_tx_descs (void) /* No fancy arithmetic when we do the buffer calculations */ CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, - 0); + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES(), 0); if (rc != 0) return (rc); /* ignored for the whole_mem case */ vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; @@ -1445,7 +1420,7 @@ kibnal_setup_tx_descs (void) #else tx->tx_vaddr = vaddr; #endif - tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_isnblk = (i >= *kibnal_tunables.kib_ntx); tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, @@ -1459,7 +1434,7 @@ kibnal_setup_tx_descs (void) &kibnal_data.kib_idle_txs); vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES()); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1467,7 +1442,7 @@ kibnal_setup_tx_descs (void) if (page_offset == 
PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } @@ -1477,8 +1452,9 @@ kibnal_setup_tx_descs (void) void kibnal_shutdown (ptl_ni_t *ni) { - int i; - vv_return_t vvrc; + unsigned long flags; + int i; + vv_return_t vvrc; LASSERT (ni == kibnal_data.kib_ni); LASSERT (ni->ni_data == &kibnal_data); @@ -1489,20 +1465,28 @@ kibnal_shutdown (ptl_ni_t *ni) switch (kibnal_data.kib_init) { case IBNAL_INIT_ALL: - /* resetting my NID removes my listener and nukes all current - * peers and their connections */ - kibnal_set_mynid (PTL_NID_ANY); + /* stop accepting connections and prevent new peers */ + kibnal_stop_listener(ni); + + /* nuke all existing peers */ + kibnal_del_peer(PTL_NID_ANY); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + while (kibnal_data.kib_npeers != 0) { + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? 
*/ "waiting for %d peers to disconnect\n", - atomic_read (&kibnal_data.kib_npeers)); + kibnal_data.kib_npeers); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* fall through */ case IBNAL_INIT_CQ: @@ -1539,7 +1523,7 @@ kibnal_shutdown (ptl_ni_t *ni) /* fall through */ case IBNAL_INIT_DATA: - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_npeers == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { LASSERT (list_empty (&kibnal_data.kib_peers[i])); @@ -1610,6 +1594,8 @@ kibnal_startup (ptl_ni_t *ni) CERROR("Explicit interface config not supported\n"); return PTL_FAIL; } + +#warning discover IPoIB IP address here PORTAL_MODULE_USE; memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ @@ -1619,9 +1605,6 @@ kibnal_startup (ptl_ni_t *ni) do_gettimeofday(&tv); kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER; - - init_MUTEX (&kibnal_data.kib_nid_mutex); rwlock_init(&kibnal_data.kib_global_lock); @@ -1801,10 +1784,11 @@ kibnal_startup (ptl_ni_t *ni) /* flag TX descs initialised */ kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ + { uint32_t nentries; - vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), kibnal_cq_callback, NULL, /* context */ &kibnal_data.kib_cq, &nentries); @@ -1816,9 +1800,9 @@ kibnal_startup (ptl_ni_t *ni) /* flag CQ initialised */ kibnal_data.kib_init = IBNAL_INIT_CQ; - if (nentries < IBNAL_CQ_ENTRIES) { + if (nentries < IBNAL_CQ_ENTRIES()) { CERROR ("CQ only has %d entries, need %d\n", - nentries, IBNAL_CQ_ENTRIES); + nentries, IBNAL_CQ_ENTRIES()); goto failed; } @@ -1830,6 +1814,12 @@ kibnal_startup (ptl_ni_t 
*ni) goto failed; } } + + rc = kibnal_start_listener(ni); + if (rc != 0) { + CERROR("Can't start listener: %d\n", rc); + goto failed; + } /* flag everything initialised */ kibnal_data.kib_init = IBNAL_INIT_ALL; @@ -1849,11 +1839,8 @@ kibnal_startup (ptl_ni_t *ni) void __exit kibnal_module_fini (void) { -#ifdef CONFIG_SYSCTL - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); -#endif ptl_unregister_nal(&kibnal_nal); + kibnal_tunables_fini(); } int __init @@ -1871,21 +1858,14 @@ kibnal_module_init (void) <= IBNAL_MSG_SIZE); CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) <= IBNAL_MSG_SIZE); - - /* the following must be sizeof(int) for proc_dointvec() */ - CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + rc = kibnal_tunables_init(); + if (rc != 0) + return rc; ptl_register_nal(&kibnal_nal); - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); -#endif - return (0); + + return 0; } MODULE_AUTHOR("Cluster File Systems, Inc. "); diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index f3fbacd..bc1790c 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -87,6 +87,28 @@ # define IBNAL_N_SCHED 1 /* # schedulers */ #endif +#define IBNAL_WHOLE_MEM 1 +#if !IBNAL_WHOLE_MEM +# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)" +#endif + +/* defaults for modparams/tunables */ +#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* Fixed service number */ +#define IBNAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry... 
*/ +#define IBNAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ +#define IBNAL_CONCURRENT_PEERS 1024 /* # nodes all talking at once to me */ +#define IBNAL_CKSUM 0 /* checksum kib_msg_t? */ +#define IBNAL_TIMEOUT 50 /* default comms timeout (seconds) */ +#define IBNAL_NTX 64 /* # tx descs */ +#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ + +/* tunables fixed at compile time */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + /* sdp-connection.c */ #define IBNAL_QKEY 0 #define IBNAL_PKEY 0xffff @@ -115,62 +137,42 @@ #define IBNAL_ARB_INITIATOR_DEPTH 0 #define IBNAL_ARB_RESP_RES 0 #define IBNAL_FAILOVER_ACCEPTED 0 -#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* Fixed service number */ - -#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ -#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ - -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ - -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ - -#define IBNAL_NTX 64 /* # tx descs */ -#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ -/* reduced from 256 to ensure we register < 255 pages per region. - * this can change if we register all memory. 
*/ - -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ - -#define IBNAL_RDMA_BASE 0x0eeb0000 -#define IBNAL_CKSUM 0 -#define IBNAL_WHOLE_MEM 1 -#if !IBNAL_WHOLE_MEM -# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)" -#endif - -/* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ /************************/ /* derived constants... */ /* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) -#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx + \ + *kibnal_tunables.kib_ntx_nblk) +#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) #if IBNAL_WHOLE_MEM # define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV #else +# define IBNAL_RDMA_BASE 0x0eeb0000 # define IBNAL_MAX_RDMA_FRAGS 1 #endif /* RX messages (per connection) */ -#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -#define IBNAL_CQ_ENTRIES (IBNAL_TX_MSGS * (1 + IBNAL_MAX_RDMA_FRAGS) + \ - IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS) +#define IBNAL_CQ_ENTRIES() (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) + \ + IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers) typedef struct { - int kib_io_timeout; /* comms timeout (seconds) */ + unsigned int *kib_service_number; /* IB service number */ + int 
*kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_concurrent_peers; /* max # nodes all talking to me */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_ntx_nblk; /* # reserved tx descs */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ } kib_tunables_t; @@ -201,11 +203,9 @@ typedef struct atomic_t kib_nthreads; /* # live threads */ ptl_ni_t *kib_ni; /* _the_ nal instance */ - __u64 kib_svc_id; /* service number I listen on */ vv_gid_t kib_port_gid; /* device/port GID */ vv_p_key_t kib_port_pkey; /* device/port pkey */ - struct semaphore kib_nid_mutex; /* serialise NID ops */ cm_cep_handle_t kib_listen_handle; /* IB listen handle */ rwlock_t kib_global_lock; /* stabilize peer/conn ops */ @@ -215,7 +215,7 @@ typedef struct struct list_head *kib_peers; /* hash table of all my known peers */ int kib_peer_hash_size; /* size of kib_peers */ - atomic_t kib_npeers; /* # peers extant */ + int kib_npeers; /* # peers extant */ atomic_t kib_nconns; /* # connections extant */ void *kib_connd; /* the connd task (serialisation assertions) */ @@ -434,10 +434,11 @@ ptl_err_t kibnal_recv_pages(ptl_ni_t *ni, void *private, extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); extern void kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp, __u64 seq); -extern int kibnal_unpack_msg(kib_msg_t *msg, int nob); -extern kib_peer_t *kibnal_create_peer(ptl_nid_t nid); +extern int kibnal_unpack_msg(kib_msg_t *msg, int nob); +extern int kibnal_create_peer(kib_peer_t **peerp, ptl_nid_t nid); extern void kibnal_destroy_peer(kib_peer_t *peer); -extern int kibnal_del_peer(ptl_nid_t nid); +extern int kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip); +extern int kibnal_del_peer(ptl_nid_t nid); extern kib_peer_t *kibnal_find_peer_locked(ptl_nid_t 
nid); extern void kibnal_unlink_peer_locked(kib_peer_t *peer); extern int kibnal_close_stale_conns_locked(kib_peer_t *peer, @@ -445,7 +446,7 @@ extern int kibnal_close_stale_conns_locked(kib_peer_t *peer, extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep); extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); -extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access); +extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access); extern void kibnal_free_pages(kib_pages_t *p); extern void kibnal_check_sends(kib_conn_t *conn); @@ -460,10 +461,11 @@ extern int kibnal_set_qp_state(kib_conn_t *conn, vv_qp_state_t new_state); extern void kibnal_async_callback(vv_event_record_t ev); extern void kibnal_cq_callback(unsigned long context); extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject); -extern void kibnal_pause(int ticks); extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn); extern int kibnal_init_rdma(kib_tx_t *tx, int type, int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); +extern int kibnal_tunables_init(void); +extern void kibnal_tunables_fini(void); static inline int wrq_signals_completion (vv_wr_t *wrq) @@ -545,7 +547,7 @@ kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); } tx->tx_queued = 1; - tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; + tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ); list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); } diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index 6cea29c..c55671fa 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -528,6 +528,8 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, * network address, given how it maps all phys mem into 1 region */ addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET; + /* NB this relies entirely on there being a 
single region for the whole + * of memory, since "high" memory will wrap in the (void *) cast! */ vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, (void *)((unsigned long)addr), len, &mem_h, &l_key, &r_key); @@ -576,7 +578,7 @@ kibnal_kvaddr_to_page (unsigned long vaddr) #if CONFIG_HIGHMEM if (vaddr >= PKMAP_BASE && vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ + /* Highmem pages only used for bulk (kiov) I/O */ CERROR("find page for address in highmem\n"); LBUG(); } @@ -693,6 +695,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int niov, struct iovec *iov, int offset, int nob) { +#error "check this thoroughly before enabling" /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); void *vaddr; @@ -742,6 +745,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, int nkiov, ptl_kiov_t *kiov, int offset, int nob) { +#error "check this thoroughly before enabling" /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); vv_return_t vvrc; @@ -1225,6 +1229,8 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) kib_conn_t *conn; unsigned long flags; rwlock_t *g_lock = &kibnal_data.kib_global_lock; + int retry; + int rc; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ @@ -1232,38 +1238,51 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ - read_lock_irqsave(g_lock, flags); + for (retry = 0; ; retry = 1) { + read_lock_irqsave(g_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) { + conn = kibnal_find_conn_locked (peer); + if (conn != 
NULL) { + kibnal_conn_addref(conn); /* 1 ref for me... */ + read_unlock_irqrestore(g_lock, flags); - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - kibnal_conn_addref(conn); /* 1 ref for me... */ - read_unlock_irqrestore(g_lock, flags); + kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...to here */ + return; + } + } - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...to here */ - return; - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); + /* Making one or more connections; I'll need a write lock... */ + read_unlock(g_lock); + write_lock(g_lock); + + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) + break; - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { write_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } + + rc = kibnal_add_persistent_peer(nid, PTL_NIDADDR(nid)); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } } conn = kibnal_find_conn_locked (peer); @@ -1888,7 +1907,6 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active) /* Only the connd creates conns => single threaded */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); @@ -1909,8 +1927,9 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active) /* Say when active connection can be re-attempted */ peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; /* Increase reconnection interval */ - peer->ibp_reconnect_interval = MIN 
(peer->ibp_reconnect_interval * 2, - IBNAL_MAX_RECONNECT_INTERVAL); + peer->ibp_reconnect_interval = + MIN (peer->ibp_reconnect_interval * 2, + *kibnal_tunables.kib_max_reconnect_interval * HZ); /* Take peer's blocked transmits to complete with error */ list_add(&zombies, &peer->ibp_tx_queue); @@ -1983,7 +2002,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) case IBNAL_CONN_ACTIVE_CONNECT: LASSERT (active); cm_cancel(conn->ibc_cep); - kibnal_pause(HZ/10); + libcfs_pause(cfs_time_seconds(1)/10); /* cm_connect() failed immediately or * callback returned failure */ break; @@ -2062,7 +2081,8 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) list_del_init(&peer->ibp_tx_queue); /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = + *kibnal_tunables.kib_min_reconnect_interval * HZ; write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* Schedule blocked txs */ @@ -2221,9 +2241,9 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); - if (cmreq->sid != IBNAL_SERVICE_NUMBER) { + if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) { CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n", - cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER); + cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number)); goto reject; } @@ -2277,8 +2297,8 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) } /* assume 'rxmsg.ibm_srcnid' is a new peer */ - tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid); - if (tmp_peer == NULL) { + rc = kibnal_create_peer (&tmp_peer, rxmsg.ibm_srcnid); + if (rc != 0) { CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid); kibnal_conn_decref(conn); conn = NULL; @@ -2443,7 +2463,7 @@ kibnal_connect_conn (kib_conn_t *conn) memset(&cmreq, 0, sizeof(cmreq)); - cmreq.sid = IBNAL_SERVICE_NUMBER; + cmreq.sid = 
(__u64)(*kibnal_tunables.kib_service_number); cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid; cmreq.cep_data.qpn = cv->cv_local_qpn; @@ -2957,7 +2977,7 @@ kibnal_disconnect_conn (kib_conn_t *conn) write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); cm_cancel(conn->ibc_cep); - kibnal_pause(HZ/10); + libcfs_pause(cfs_time_seconds(1)/10); if (!conn->ibc_disconnect) /* CM callback will never happen now */ kibnal_conn_decref(conn); @@ -3087,9 +3107,9 @@ kibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. */ - if (kibnal_tunables.kib_io_timeout > n * p) + if (*kibnal_tunables.kib_timeout > n * p) chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; + *kibnal_tunables.kib_timeout; if (chunk == 0) chunk = 1; diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c new file mode 100644 index 0000000..b084d48 --- /dev/null +++ b/lnet/klnds/viblnd/viblnd_modparams.c @@ -0,0 +1,133 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "vibnal.h" +/* Tunables: each default is the matching IBNAL_ constant and each is handed to CFS_MODULE_PARM with a mode (0444 or 0644) and a description */ +static int service_number = IBNAL_SERVICE_NUMBER; +CFS_MODULE_PARM(service_number, "i", int, 0444, + "IB service number"); + +static int min_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; /* seconds; visible users multiply by HZ */ +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +static int max_reconnect_interval = IBNAL_MAX_RECONNECT_INTERVAL; /* seconds; visible users multiply by HZ */ +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int concurrent_peers = IBNAL_CONCURRENT_PEERS; +CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, + "maximum number of peers that may connect"); + +static int cksum = IBNAL_CKSUM; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = IBNAL_TIMEOUT; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int ntx = IBNAL_NTX; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of 'normal' message descriptors"); + +static int ntx_nblk = IBNAL_NTX_NBLK; +CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, + "# of 'reserved' message descriptors"); +/* Pointers to the statics above; the driver dereferences them to read a tunable */ +kib_tunables_t kibnal_tunables = { + .kib_service_number = &service_number, + .kib_min_reconnect_interval = &min_reconnect_interval, + .kib_max_reconnect_interval = &max_reconnect_interval, + .kib_concurrent_peers = &concurrent_peers, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_ntx = &ntx, + .kib_ntx_nblk = &ntx_nblk, +}; +/* The sysctl tables below are compiled only when CONFIG_SYSCTL is true and CFS_SYSFS_MODULE_PARM is false */ +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table kibnal_ctl_table[] = { /* one int-sized entry per tunable, all using proc_dointvec; ended by {0} */ + {1, "service_number", &service_number, + sizeof(int), 0444, NULL, &proc_dointvec}, + {2, "min_reconnect_interval", &min_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {3, "max_reconnect_interval", &max_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {4, "concurrent_peers", &concurrent_peers, + sizeof(int), 0444, NULL, &proc_dointvec}, + {5, "cksum", &cksum, + sizeof(int), 0644, NULL,
&proc_dointvec}, + {6, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {7, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {8, "ntx_nblk", &ntx_nblk, + sizeof(int), 0444, NULL, &proc_dointvec}, + {0} +}; +/* Top-level entry named vibnal (id 203, mode 0555) that references kibnal_ctl_table */ +static ctl_table kibnal_top_ctl_table[] = { + {203, "vibnal", NULL, 0, 0555, kibnal_ctl_table}, + {0} +}; +/* Register kibnal_top_ctl_table and keep the handle in kib_sysctl; a NULL handle only logs a warning and 0 is still returned */ +int +kibnal_tunables_init () +{ + kibnal_tunables.kib_sysctl = + register_sysctl_table(kibnal_top_ctl_table, 0); + + if (kibnal_tunables.kib_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} +/* Unregister the sysctl table, but only if kib_sysctl is non-NULL */ +void +kibnal_tunables_fini () +{ + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table(kibnal_tunables.kib_sysctl); +} + +#else +/* Otherwise init and fini are empty stubs; init returns 0 */ +int +kibnal_tunables_init () +{ + return 0; +} + +void +kibnal_tunables_fini () +{ +} + +#endif + + + + + + -- 1.8.3.1