From 46331f150d15a53db41683f191eef405d040f407 Mon Sep 17 00:00:00 2001 From: ericm Date: Tue, 9 Sep 2003 14:37:10 +0000 Subject: [PATCH] merge b_devel -> b_eq: 20030909 only kernel passes sanity, liblustre still broken --- lnet/archdep.m4 | 3 +- lnet/include/linux/kp30.h | 7 +- lnet/include/lnet/list.h | 2 + lnet/include/lnet/lnetctl.h | 5 + lnet/include/lnet/ptlctl.h | 5 + lnet/klnds/qswlnd/qswlnd.c | 50 +- lnet/klnds/qswlnd/qswlnd.h | 87 +- lnet/klnds/qswlnd/qswlnd_cb.c | 66 +- lnet/klnds/socklnd/Makefile.am | 4 + lnet/klnds/socklnd/socklnd.c | 1231 ++++++++++++++++++++++------ lnet/klnds/socklnd/socklnd.h | 201 ++++- lnet/klnds/socklnd/socklnd_cb.c | 1271 +++++++++++++++++++++++------ lnet/libcfs/debug.c | 126 ++- lnet/lnet/lib-move.c | 37 +- lnet/lnet/lib-msg.c | 1 + lnet/utils/debug.c | 32 +- lnet/utils/portals.c | 789 ++++++++++++------ lnet/utils/ptlctl.c | 11 +- lustre/portals/archdep.m4 | 3 +- lustre/portals/include/linux/kp30.h | 7 +- lustre/portals/include/portals/list.h | 2 + lustre/portals/include/portals/ptlctl.h | 5 + lustre/portals/knals/qswnal/qswnal.c | 50 +- lustre/portals/knals/qswnal/qswnal.h | 87 +- lustre/portals/knals/qswnal/qswnal_cb.c | 66 +- lustre/portals/knals/socknal/Makefile.am | 4 + lustre/portals/knals/socknal/socknal.c | 1231 ++++++++++++++++++++++------ lustre/portals/knals/socknal/socknal.h | 201 ++++- lustre/portals/knals/socknal/socknal_cb.c | 1271 +++++++++++++++++++++++------ lustre/portals/libcfs/debug.c | 126 ++- lustre/portals/portals/lib-move.c | 37 +- lustre/portals/portals/lib-msg.c | 1 + lustre/portals/utils/debug.c | 32 +- lustre/portals/utils/portals.c | 789 ++++++++++++------ lustre/portals/utils/ptlctl.c | 11 +- 35 files changed, 5997 insertions(+), 1854 deletions(-) diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index db14c29..428fe42 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -225,7 +225,8 @@ fi #AC_SUBST(usrprefix) AC_MSG_CHECKING(if kernel has CPU affinity support) -if test "$target_cpu" != ia64 ; then +SET_CPUS_ALLOW="`grep -c set_cpus_allowed $LINUX/kernel/softirq.c`" +if test "$SET_CPUS_ALLOW" != 0 ; then enable_affinity_temp="-DCPU_AFFINITY=1" AC_MSG_RESULT(yes) else diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index e7b850d..a0626cc 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -574,7 +574,7 @@ __s32 portals_debug_copy_to_user(char *buf, unsigned long len); #endif void portals_debug_msg(int subsys, int mask, char *file, const char *fn, const int line, unsigned long stack, - const char *format, ...) + char *format, ...) 
__attribute__ ((format (printf, 7, 8))); #else void portals_debug_msg(int subsys, int mask, char *file, const char *fn, @@ -885,6 +885,11 @@ extern ptl_handle_ni_t kscimacnal_ni; #define NAL_CMD_CLOSE_CONNECTION 101 #define NAL_CMD_REGISTER_MYNID 102 #define NAL_CMD_PUSH_CONNECTION 103 +#define NAL_CMD_GET_CONN 104 +#define NAL_CMD_DEL_AUTOCONN 105 +#define NAL_CMD_ADD_AUTOCONN 106 +#define NAL_CMD_GET_AUTOCONN 107 +#define NAL_CMD_GET_TXDESC 108 enum { DEBUG_DAEMON_START = 1, diff --git a/lnet/include/lnet/list.h b/lnet/include/lnet/list.h index 78a1e2d..57713cb 100644 --- a/lnet/include/lnet/list.h +++ b/lnet/include/lnet/list.h @@ -16,6 +16,8 @@ struct list_head { struct list_head *next, *prev; }; +typedef struct list_head list_t; + #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index dc02780..8278111 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -34,9 +34,14 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); +int jt_ptl_print_autoconnects (int argc, char **argv); +int jt_ptl_add_autoconnect (int argc, char **argv); +int jt_ptl_del_autoconnect (int argc, char **argv); +int jt_ptl_print_connections (int argc, char **argv); int jt_ptl_connect(int argc, char **argv); int jt_ptl_disconnect(int argc, char **argv); int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_print_active_txs(int argc, char **argv); int jt_ptl_ping(int argc, char **argv); int jt_ptl_shownid(int argc, char **argv); int jt_ptl_mynid(int argc, char **argv); diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h index dc02780..8278111 100644 --- a/lnet/include/lnet/ptlctl.h +++ b/lnet/include/lnet/ptlctl.h @@ -34,9 +34,14 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); +int jt_ptl_print_autoconnects (int argc, char **argv); +int jt_ptl_add_autoconnect (int argc, char **argv); +int jt_ptl_del_autoconnect (int argc, char **argv); +int jt_ptl_print_connections (int argc, char **argv); int jt_ptl_connect(int argc, char **argv); int jt_ptl_disconnect(int argc, char **argv); int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_print_active_txs(int argc, char **argv); int jt_ptl_ping(int argc, char **argv); int jt_ptl_shownid(int argc, char **argv); int jt_ptl_mynid(int argc, char **argv); diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index 7cfc80e..d4ee960 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -112,11 +112,48 @@ kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, } int +kqswnal_get_tx_desc (struct portal_ioctl_data *data) +{ + unsigned long flags; + struct list_head *tmp; + kqswnal_tx_t *ktx; + int index = data->ioc_count; + int rc = -ENOENT; + + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + list_for_each (tmp, &kqswnal_data.kqn_activetxds) { + if (index-- != 0) + continue; + + ktx = list_entry (tmp, kqswnal_tx_t, ktx_list); + + data->ioc_pbuf1 = (char *)ktx; + data->ioc_count = NTOH__u32(ktx->ktx_wire_hdr->type); + data->ioc_size = NTOH__u32(PTL_HDR_LENGTH(ktx->ktx_wire_hdr)); + data->ioc_nid = NTOH__u64(ktx->ktx_wire_hdr->dest_nid); + data->ioc_nid2 = ktx->ktx_nid; + data->ioc_misc = ktx->ktx_launcher; + data->ioc_flags = (list_empty (&ktx->ktx_delayed_list) ? 
0 : 1) | + (!ktx->ktx_isnblk ? 0 : 2) | + (ktx->ktx_state << 2); + rc = 0; + break; + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + return (rc); +} + +int kqswnal_cmd (struct portal_ioctl_data *data, void *private) { LASSERT (data != NULL); switch (data->ioc_nal_cmd) { + case NAL_CMD_GET_TXDESC: + return (kqswnal_get_tx_desc (data)); + case NAL_CMD_REGISTER_MYNID: CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n", data->ioc_nid - kqswnal_data.kqn_elanid, @@ -149,6 +186,7 @@ kqswnal_finalise (void) /* fall through */ case KQN_INIT_DATA: + LASSERT(list_empty(&kqswnal_data.kqn_activetxds)); break; case KQN_INIT_NOTHING: @@ -318,6 +356,7 @@ kqswnal_initialise (void) INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); spin_lock_init (&kqswnal_data.kqn_idletxd_lock); init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq); @@ -460,12 +499,13 @@ kqswnal_initialise (void) ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ - if (i < KQSW_NTXMSGS) - ktx->ktx_idle = &kqswnal_data.kqn_idletxds; - else - ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds; + INIT_LIST_HEAD (&ktx->ktx_delayed_list); - list_add_tail (&ktx->ktx_list, ktx->ktx_idle); + ktx->ktx_state = KTX_IDLE; + ktx->ktx_isnblk = (i >= KQSW_NTXMSGS); + list_add_tail (&ktx->ktx_list, + ktx->ktx_isnblk ? &kqswnal_data.kqn_nblk_idletxds : + &kqswnal_data.kqn_idletxds); } /**********************************************************************/ diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 5cbbb9a..a27239c 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -77,20 +77,20 @@ #define KQSW_OPTIMIZE_GETS 1 -#define KQSW_CHECKSUM 0 +#define KQSW_CHECKSUM 0 #if KQSW_CHECKSUM typedef unsigned long kqsw_csum_t; -#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) +#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) #else -#define KQSW_CSUM_SIZE 0 +#define KQSW_CSUM_SIZE 0 #endif -#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) +#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) /* * Elan NAL */ -#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ -#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ +#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ +#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ /* NB small/large message sizes are GLOBAL constants */ /* @@ -98,39 +98,39 @@ typedef unsigned long kqsw_csum_t; * NB no mention of PAGE_SIZE for interoperability */ #define KQSW_MAXPAYLOAD PTL_MTU -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ +#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ -#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ +#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ -#define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */ +#define KQSW_NTXMSGS 8 /* # normal transmit messages */ +#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't 
block */ -#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ +#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ +#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ -#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ -#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ +#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ +#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ -#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ +#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ /* * derived constants */ -#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) /* The pre-allocated tx buffer (hdr + small payload) */ -#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) /* Reserve elan address space for pre-allocated and pre-mapped transmit * buffer and a full payload too. Extra pages allow for page alignment */ -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) /* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) +#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) -#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) /* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) +#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) /* biggest complete packet we can receive (or transmit) */ /* Remote memory descriptor */ @@ -143,7 +143,7 @@ typedef struct typedef struct { struct list_head krx_list; /* enqueue -> thread */ - EP_RCVR *krx_eprx; /* port to post receives to */ + EP_RCVR *krx_eprx; /* port to post receives to */ EP_RXD *krx_rxd; /* receive descriptor (for repost) */ E3_Addr krx_elanaddr; /* Elan address of buffer (contiguous in elan vm) */ int krx_npages; /* # pages in receive buffer */ @@ -157,17 +157,23 @@ typedef struct typedef struct { - struct list_head ktx_list; /* enqueue idle/delayed */ - struct list_head *ktx_idle; /* where to put when idle */ - char ktx_state; /* What I'm doing */ + struct list_head ktx_list; /* enqueue idle/active */ + struct list_head ktx_delayed_list; /* enqueue delayedtxds */ + int ktx_isnblk:1; /* reserved descriptor? 
*/ + int ktx_state:7; /* What I'm doing */ uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ int ktx_npages; /* pages reserved for mapping messages */ int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ ptl_nid_t ktx_nid; /* destination node */ void *ktx_args[2]; /* completion passthru */ - E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ + E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ + + /* debug/info fields */ + pid_t ktx_launcher; /* pid of launching process */ + ptl_hdr_t *ktx_wire_hdr; /* portals header (wire endian) */ + int ktx_nfrag; /* # message frags */ union { EP_IOVEC iov[EP_MAXFRAG]; /* msg frags (elan vaddrs) */ @@ -175,9 +181,9 @@ typedef struct } ktx_frags; } kqswnal_tx_t; -#define KTX_IDLE 0 /* MUST BE ZERO (so zeroed ktx is idle) */ -#define KTX_SENDING 1 /* local send */ -#define KTX_FORWARDING 2 /* routing a packet */ +#define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ +#define KTX_SENDING 1 /* local send */ +#define KTX_FORWARDING 2 /* routing a packet */ #define KTX_GETTING 3 /* local optimised get */ typedef struct @@ -190,7 +196,8 @@ typedef struct kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ - struct list_head kqn_nblk_idletxds; /* reserve of */ + struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ + struct list_head kqn_activetxds; /* transmit descriptors being used */ spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ @@ -204,12 +211,12 @@ typedef struct spinlock_t kqn_statelock; /* cb_cli/cb_sti */ nal_cb_t *kqn_cb; /* -> kqswnal_lib */ - EP_DEV *kqn_epdev; /* elan device */ - EP_XMTR *kqn_eptx; /* elan transmitter */ - EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ - EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ - ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ - ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ + EP_DEV *kqn_epdev; /* elan device */ + EP_XMTR *kqn_eptx; /* elan transmitter */ + EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ + EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ + ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ + ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ @@ -218,10 +225,10 @@ typedef struct } kqswnal_data_t; /* kqn_init state */ -#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ -#define KQN_INIT_DATA 1 -#define KQN_INIT_PTL 2 -#define KQN_INIT_ALL 3 +#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ +#define KQN_INIT_DATA 1 +#define KQN_INIT_PTL 2 +#define KQN_INIT_ALL 3 extern nal_cb_t kqswnal_lib; extern nal_t kqswnal_api; diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 7032f6b..6390137 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -290,22 +290,24 @@ void kqswnal_put_idle_tx (kqswnal_tx_t *ktx) { kpr_fwd_desc_t *fwd = NULL; - struct list_head *idle = ktx->ktx_idle; unsigned 
long flags; - kqswnal_unmap_tx (ktx); /* release temporary mappings */ + kqswnal_unmap_tx (ktx); /* release temporary mappings */ ktx->ktx_state = KTX_IDLE; spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - list_add (&ktx->ktx_list, idle); + list_del (&ktx->ktx_list); /* take off active list */ - /* reserved for non-blocking tx */ - if (idle == &kqswnal_data.kqn_nblk_idletxds) { + if (ktx->ktx_isnblk) { + /* reserved for non-blocking tx */ + list_add (&ktx->ktx_list, &kqswnal_data.kqn_nblk_idletxds); spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); return; } + list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); + /* anything blocking for a tx descriptor? */ if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ { @@ -351,7 +353,6 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) if (!list_empty (&kqswnal_data.kqn_idletxds)) { ktx = list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t, ktx_list); - list_del (&ktx->ktx_list); break; } @@ -373,7 +374,6 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, kqswnal_tx_t, ktx_list); - list_del (&ktx->ktx_list); break; } @@ -386,6 +386,12 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) !list_empty (&kqswnal_data.kqn_idletxds)); } + if (ktx != NULL) { + list_del (&ktx->ktx_list); + list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); + ktx->ktx_launcher = current->pid; + } + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ @@ -490,7 +496,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds); + list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) wake_up (&kqswnal_data.kqn_sched_waitq); @@ -648,7 +654,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", - krx->krx_nob, ((char *)(rmd + 1)) - buffer); + krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); return (-EINVAL); } @@ -656,7 +662,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, /* rmd doesn't fit in the incoming message */ CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", krx->krx_nob, rmd->kqrmd_neiov, - ((char *)&rmd->kqrmd_eiov[rmd->kqrmd_neiov]) - buffer); + (int)(((char *)&rmd->kqrmd_eiov[rmd->kqrmd_neiov]) - buffer)); return (-EINVAL); } @@ -723,11 +729,6 @@ kqswnal_sendmsg (nal_cb_t *nal, int sumnob; #endif - /* NB, the return code from this procedure is ignored. - * If we can't send, we must still complete with lib_finalize(). - * We'll have to wait for 3.2 to return an error event. 
- */ - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 " pid %u\n", payload_nob, payload_niov, nid, pid); @@ -742,8 +743,7 @@ kqswnal_sendmsg (nal_cb_t *nal, if (payload_nob > KQSW_MAXPAYLOAD) { CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", payload_nob, KQSW_MAXPAYLOAD); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } targetnid = nid; @@ -752,14 +752,12 @@ kqswnal_sendmsg (nal_cb_t *nal, if (rc != 0) { CERROR("Can't route to "LPX64": router error %d\n", nid, rc); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } if (kqswnal_nid2elanid (targetnid) < 0) { CERROR("Bad gateway "LPX64" for "LPX64"\n", targetnid, nid); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } } @@ -770,8 +768,7 @@ kqswnal_sendmsg (nal_cb_t *nal, in_interrupt())); if (ktx == NULL) { kqswnal_cerror_hdr (hdr); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_NOSPACE); } ktx->ktx_args[0] = private; @@ -789,12 +786,12 @@ kqswnal_sendmsg (nal_cb_t *nal, CERROR ("Can't DMA reply to "LPX64": %d\n", nid, rc); kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } #endif memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ + ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; #if KQSW_CHECKSUM csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); @@ -853,8 +850,7 @@ kqswnal_sendmsg (nal_cb_t *nal, if (rc < 0) { kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } rmd->kqrmd_neiov = ktx->ktx_nfrag - 1; @@ -864,9 +860,8 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_nfrag = 1; ktx->ktx_frags.iov[0].Len += offsetof (kqswnal_remotemd_t, kqrmd_eiov[rmd->kqrmd_neiov]); + payload_nob = ktx->ktx_frags.iov[0].Len; ktx->ktx_state = KTX_GETTING; - payload_nob = rc; - } else #endif if (payload_nob > 0) { /* got some payload (something more to do) */ @@ -890,8 +885,7 @@ kqswnal_sendmsg (nal_cb_t *nal, payload_niov, payload_iov); if (rc != 0) { kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } } } @@ -904,12 +898,11 @@ kqswnal_sendmsg (nal_cb_t *nal, if (rc != 0) { /* failed? */ CERROR ("Failed to send packet to "LPX64": %d\n", targetnid, rc); kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, targetnid); - return (0); + return (PTL_OK); } static int @@ -995,6 +988,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) ktx->ktx_frags.iov[0].Base = ktx->ktx_ebuffer; /* already mapped */ ktx->ktx_frags.iov[0].Len = nob; ktx->ktx_nfrag = 1; + ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; } else { @@ -1003,6 +997,8 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); if (rc != 0) goto failed; + + ktx->ktx_wire_hdr = (ptl_hdr_t *)iov[0].iov_base; } ktx->ktx_port = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ? 
@@ -1482,7 +1478,7 @@ kqswnal_scheduler (void *arg) { ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, kqswnal_tx_t, ktx_list); - list_del (&ktx->ktx_list); + list_del_init (&ktx->ktx_delayed_list); spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); diff --git a/lnet/klnds/socklnd/Makefile.am b/lnet/klnds/socklnd/Makefile.am index 437d7fc..acdba5e 100644 --- a/lnet/klnds/socklnd/Makefile.am +++ b/lnet/klnds/socklnd/Makefile.am @@ -9,5 +9,9 @@ MODULE = ksocknal modulenet_DATA = ksocknal.o EXTRA_PROGRAMS = ksocknal +# If you don't build with -O2, your modules won't insert, because htonl is +# just special that way. +AM_CFLAGS = -O2 + DEFS = ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index c9681a4..e7232a0 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -59,7 +59,9 @@ ksocknal_api_shutdown(nal_t *nal, int ni) { CDEBUG (D_NET, "closing all connections\n"); - return ksocknal_close_sock(0); /* close all sockets */ + ksocknal_del_route (PTL_NID_ANY, 0, 0, 0); + ksocknal_close_conn (PTL_NID_ANY, 0); + return PTL_OK; } void @@ -104,15 +106,6 @@ ksocknal_init(int interface, ptl_pt_index_t ptl_size, * EXTRA functions follow */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#define SOCKET_I(inode) (&(inode)->u.socket_i) -#endif -static __inline__ struct socket * -socki_lookup(struct inode *inode) -{ - return SOCKET_I(inode); -} - int ksocknal_set_mynid(ptl_nid_t nid) { @@ -132,23 +125,43 @@ ksocknal_set_mynid(ptl_nid_t nid) } void -ksocknal_bind_irq (unsigned int irq, int cpu) +ksocknal_bind_irq (unsigned int irq) { #if (defined(CONFIG_SMP) && CPU_AFFINITY) - char cmdline[64]; - char *argv[] = {"/bin/sh", - "-c", - cmdline, - NULL}; - char *envp[] = {"HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL}; + int bind; + unsigned long flags; + char cmdline[64]; + ksock_irqinfo_t *info; + char *argv[] = {"/bin/sh", + "-c", + cmdline, + NULL}; + char *envp[] = {"HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + LASSERT (irq < NR_IRQS); + if (irq == 0) /* software NIC */ + return; + + info = &ksocknal_data.ksnd_irqinfo[irq]; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + LASSERT (info->ksni_valid); + bind = !info->ksni_bound; + info->ksni_bound = 1; + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + if (!bind) /* bound already */ + return; snprintf (cmdline, sizeof (cmdline), - "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); + "echo %d > /proc/irq/%u/smp_affinity", 1 << info->ksni_sched, irq); printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); + irq, info->ksni_sched, cmdline); /* FIXME: Find a better method of setting IRQ affinity... 
*/ @@ -157,57 +170,432 @@ ksocknal_bind_irq (unsigned int irq, int cpu) #endif } +ksock_route_t * +ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, + int irq_affinity, int xchange_nids, int nonagel) +{ + ksock_route_t *route; + + PORTAL_ALLOC (route, sizeof (*route)); + if (route == NULL) + return (NULL); + + atomic_set (&route->ksnr_refcount, 1); + route->ksnr_sharecount = 0; + route->ksnr_peer = NULL; + route->ksnr_timeout = jiffies_64; + route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_buffer_size = buffer_size; + route->ksnr_irq_affinity = irq_affinity; + route->ksnr_xchange_nids = xchange_nids; + route->ksnr_nonagel = nonagel; + route->ksnr_connecting = 0; + route->ksnr_deleted = 0; + route->ksnr_generation = 0; + route->ksnr_conn = NULL; + + return (route); +} + +void +ksocknal_destroy_route (ksock_route_t *route) +{ + LASSERT (route->ksnr_sharecount == 0); + LASSERT (route->ksnr_conn == NULL); + + if (route->ksnr_peer != NULL) + ksocknal_put_peer (route->ksnr_peer); + + PORTAL_FREE (route, sizeof (*route)); +} + +void +ksocknal_put_route (ksock_route_t *route) +{ + CDEBUG (D_OTHER, "putting route[%p] -> "LPX64" (%d)\n", + route, route->ksnr_peer->ksnp_nid, + atomic_read (&route->ksnr_refcount)); + + LASSERT (atomic_read (&route->ksnr_refcount) > 0); + if (!atomic_dec_and_test (&route->ksnr_refcount)) + return; + + ksocknal_destroy_route (route); +} + +ksock_peer_t * +ksocknal_create_peer (ptl_nid_t nid) +{ + ksock_peer_t *peer; + + LASSERT (nid != PTL_NID_ANY); + + PORTAL_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) + return (NULL); + + memset (peer, 0, sizeof (*peer)); + + peer->ksnp_nid = nid; + atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_closing = 0; + INIT_LIST_HEAD (&peer->ksnp_conns); + INIT_LIST_HEAD (&peer->ksnp_routes); + INIT_LIST_HEAD (&peer->ksnp_tx_queue); + + /* Can't unload while peers exist; ensures all I/O has terminated + * before unload attempts */ + PORTAL_MODULE_USE; + atomic_inc (&ksocknal_data.ksnd_npeers); + return (peer); +} + +void +ksocknal_destroy_peer (ksock_peer_t *peer) +{ + CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer); + + LASSERT (atomic_read (&peer->ksnp_refcount) == 0); + LASSERT (list_empty (&peer->ksnp_conns)); + LASSERT (list_empty (&peer->ksnp_routes)); + LASSERT (list_empty (&peer->ksnp_tx_queue)); + + PORTAL_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections and autoconnect routes keep a reference + * on their peer until they are destroyed, so we can be assured + * that _all_ state to do with this peer has been cleaned up when + * its refcount drops to zero. 
*/ + atomic_dec (&ksocknal_data.ksnd_npeers); + PORTAL_MODULE_UNUSE; +} + +void +ksocknal_put_peer (ksock_peer_t *peer) +{ + CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", + peer, peer->ksnp_nid, + atomic_read (&peer->ksnp_refcount)); + + LASSERT (atomic_read (&peer->ksnp_refcount) > 0); + if (!atomic_dec_and_test (&peer->ksnp_refcount)) + return; + + ksocknal_destroy_peer (peer); +} + +ksock_peer_t * +ksocknal_find_peer_locked (ptl_nid_t nid) +{ + struct list_head *peer_list = ksocknal_nid2peerlist (nid); + struct list_head *tmp; + ksock_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, ksock_peer_t, ksnp_list); + + LASSERT (!peer->ksnp_closing); + LASSERT (!(list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns))); + + if (peer->ksnp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", + peer, nid, atomic_read (&peer->ksnp_refcount)); + return (peer); + } + return (NULL); +} + +ksock_peer_t * +ksocknal_get_peer (ptl_nid_t nid) +{ + ksock_peer_t *peer; + + read_lock (&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked (nid); + if (peer != NULL) /* +1 ref for caller? */ + atomic_inc (&peer->ksnp_refcount); + read_unlock (&ksocknal_data.ksnd_global_lock); + + return (peer); +} + +void +ksocknal_unlink_peer_locked (ksock_peer_t *peer) +{ + LASSERT (!peer->ksnp_closing); + peer->ksnp_closing = 1; + list_del (&peer->ksnp_list); + /* lose peerlist's ref */ + ksocknal_put_peer (peer); +} + +ksock_route_t * +ksocknal_get_route_by_idx (int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + + read_lock (&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + LASSERT (!(list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns))); + + list_for_each (rtmp, &peer->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry (rtmp, ksock_route_t, ksnr_list); + atomic_inc (&route->ksnr_refcount); + read_unlock (&ksocknal_data.ksnd_global_lock); + return (route); + } + } + } + + read_unlock (&ksocknal_data.ksnd_global_lock); + return (NULL); +} + int -ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) +ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, + int nonagle, int xchange_nids, int bind_irq, int share) { unsigned long flags; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_route_t *route; + struct list_head *rtmp; + ksock_route_t *route2; + + if (nid == PTL_NID_ANY) + return (-EINVAL); + + /* Have a brand new peer ready... 
*/ + peer = ksocknal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + + route = ksocknal_create_route (ipaddr, port, bufnob, + nonagle, xchange_nids, bind_irq); + if (route == NULL) { + ksocknal_put_peer (peer); + return (-ENOMEM); + } + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + peer2 = ksocknal_find_peer_locked (nid); + if (peer2 != NULL) { + ksocknal_put_peer (peer); + peer = peer2; + } else { + /* peer table takes existing ref on peer */ + list_add (&peer->ksnp_list, + ksocknal_nid2peerlist (nid)); + } + + route2 = NULL; + if (share) { + /* check for existing route to this NID via this ipaddr */ + list_for_each (rtmp, &peer->ksnp_routes) { + route2 = list_entry (rtmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + } + + if (route2 != NULL) { + ksocknal_put_route (route); + route = route2; + } else { + /* route takes a ref on peer */ + route->ksnr_peer = peer; + atomic_inc (&peer->ksnp_refcount); + /* peer's route list takes existing ref on route */ + list_add (&route->ksnr_list, &peer->ksnp_routes); + } + + route->ksnr_sharecount++; + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (0); +} + +void +ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_conn_t *conn = route->ksnr_conn; + + if (!share) + route->ksnr_sharecount = 0; + else { + route->ksnr_sharecount--; + if (route->ksnr_sharecount != 0) + return; + } + + if (conn != NULL) { + if (!keep_conn) + ksocknal_close_conn_locked (conn); + else { + /* keeping the conn; just dissociate it and route... */ + conn->ksnc_route = NULL; + route->ksnr_conn = NULL; + ksocknal_put_route (route); /* drop conn's ref on route */ + ksocknal_put_conn (conn); /* drop route's ref on conn */ + } + } + + route->ksnr_deleted = 1; + list_del (&route->ksnr_list); + ksocknal_put_route (route); /* drop peer's ref */ + + if (list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns)) { + /* I've just removed the last autoconnect route of a peer + * with no active connections */ + ksocknal_unlink_peer_locked (peer); + } +} + +int +ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int share, int keep_conn) +{ + unsigned long flags; + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_t *peer; + struct list_head *rtmp; + struct list_head *rnxt; + ksock_route_t *route; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid)) + continue; + + list_for_each_safe (rtmp, rnxt, &peer->ksnp_routes) { + route = list_entry (rtmp, ksock_route_t, + ksnr_list); + + if (!(ipaddr == 0 || + route->ksnr_ipaddr == ipaddr)) + continue; + + ksocknal_del_route_locked (route, share, keep_conn); + rc = 0; /* matched something */ + if (share) + goto out; + } + } + } + out: + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (rc); +} + +ksock_conn_t * +ksocknal_get_conn_by_idx (int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; ksock_conn_t *conn; - struct file *file = NULL; - struct socket *sock = NULL; - 
ksock_sched_t *sched = NULL; - unsigned int irq = 0; - struct dst_entry *dst; - int ret; - int idx; - ENTRY; + struct list_head *ctmp; + int i; - LASSERT (!in_interrupt()); + read_lock (&ksocknal_data.ksnd_global_lock); - file = fget(fd); - if (file == NULL) - RETURN(-EINVAL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - ret = -EINVAL; - sock = socki_lookup(file->f_dentry->d_inode); - if (sock == NULL) - GOTO(error, ret); + LASSERT (!(list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns))); - ret = -ENOMEM; - PORTAL_ALLOC(conn, sizeof(*conn)); - if (!conn) - GOTO(error, ret); + list_for_each (ctmp, &peer->ksnp_conns) { + if (index-- > 0) + continue; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - sock->sk->allocation = GFP_NOFS; /* don't call info fs for alloc */ -#else -#warning Zach, Eric: fix me!! -#endif + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); + read_unlock (&ksocknal_data.ksnd_global_lock); + return (conn); + } + } + } - conn->ksnc_file = file; - conn->ksnc_sock = sock; - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; - conn->ksnc_peernid = nid; - atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + read_unlock (&ksocknal_data.ksnd_global_lock); + return (NULL); +} - conn->ksnc_rx_ready = 0; - conn->ksnc_rx_scheduled = 0; - ksocknal_new_packet (conn, 0); +void +ksocknal_get_peer_addr (ksock_conn_t *conn) +{ + struct sockaddr_in sin; + int len = sizeof (sin); + int rc; - INIT_LIST_HEAD (&conn->ksnc_tx_queue); - conn->ksnc_tx_ready = 0; - conn->ksnc_tx_scheduled = 0; + rc = ksocknal_getconnsock (conn); + LASSERT (rc == 0); + + rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, + (struct sockaddr *)&sin, &len, 2); + LASSERT (len <= sizeof (sin)); + ksocknal_putconnsock (conn); + if (rc != 0) { + CERROR ("Error %d getting sock peer IP\n", rc); + return; + } + + conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr); + conn->ksnc_port = ntohs (sin.sin_port); +} + +unsigned int +ksocknal_conn_irq (ksock_conn_t *conn) +{ + int irq = 0; + int rc; + struct dst_entry *dst; + + rc = ksocknal_getconnsock (conn); + LASSERT (rc == 0); + dst = sk_dst_get (conn->ksnc_sock->sk); if (dst != NULL) { if (dst->dev != NULL) { @@ -219,139 +607,417 @@ ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) } dst_release (dst); } + + ksocknal_putconnsock (conn); + return (irq); +} + +ksock_sched_t * +ksocknal_choose_scheduler_locked (unsigned int irq) +{ + ksock_sched_t *sched; + ksock_irqinfo_t *info; + int i; + + LASSERT (irq < NR_IRQS); + info = &ksocknal_data.ksnd_irqinfo[irq]; + + if (irq != 0 && /* hardware NIC */ + info->ksni_valid) { /* already set up */ + return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]); + } + + /* software NIC (irq == 0) || not associated with a scheduler yet. + * Choose the CPU with the fewest connections... */ + sched = &ksocknal_data.ksnd_schedulers[0]; + for (i = 1; i < SOCKNAL_N_SCHED; i++) + if (sched->kss_nconns > + ksocknal_data.ksnd_schedulers[i].kss_nconns) + sched = &ksocknal_data.ksnd_schedulers[i]; + + if (irq != 0) { /* Hardware NIC */ + info->ksni_valid = 1; + info->ksni_sched = sched - ksocknal_data.ksnd_schedulers; + + /* no overflow... 
*/ + LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers); + } + + return (sched); +} + +int +ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, + struct socket *sock, int bind_irq) +{ + unsigned long flags; + ksock_conn_t *conn; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_sched_t *sched; + unsigned int irq; + ksock_tx_t *tx; + int rc; + + /* NB, sock has an associated file since (a) this connection might + * have been created in userland and (b) we need the refcounting so + * that we don't close the socket while I/O is being done on it. */ + LASSERT (sock->file != NULL); + + rc = ksocknal_set_linger (sock); + if (rc != 0) + return (rc); + + peer = NULL; + if (route == NULL) { /* not autoconnect */ + /* Assume this socket connects to a brand new peer */ + peer = ksocknal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + } - write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + PORTAL_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + if (peer != NULL) + ksocknal_put_peer (peer); + return (-ENOMEM); + } - if (irq == 0 || - ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) { - /* This is a software NIC, or we haven't associated it with - * a CPU yet */ + memset (conn, 0, sizeof (*conn)); + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ksocknal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); +#if SOCKNAL_ZC + INIT_LIST_HEAD (&conn->ksnc_tx_pending); +#endif + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + atomic_set (&conn->ksnc_tx_nob, 0); - /* Choose the CPU with the fewest connections */ - sched = ksocknal_data.ksnd_schedulers; - for (idx = 1; idx < SOCKNAL_N_SCHED; idx++) - if (sched->kss_nconns > - ksocknal_data.ksnd_schedulers[idx].kss_nconns) - sched = &ksocknal_data.ksnd_schedulers[idx]; + ksocknal_get_peer_addr (conn); - if (irq != 0) { /* Hardware NIC */ - /* Remember which scheduler we chose */ - idx = sched - ksocknal_data.ksnd_schedulers; + irq = ksocknal_conn_irq (conn); - LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK); + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - if (bind_irq) /* remember if we will bind below */ - idx |= SOCKNAL_IRQ_BOUND; + if (route != NULL) { + /* Autoconnected! */ + LASSERT (route->ksnr_conn == NULL && route->ksnr_connecting); - ksocknal_data.ksnd_irq_info[irq] = idx; + if (route->ksnr_deleted) { + /* This conn was autoconnected, but the autoconnect + * route got deleted while it was being + * established! 
*/ + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, + flags); + PORTAL_FREE (conn, sizeof (*conn)); + return (-ESTALE); } - } else { - /* This is a hardware NIC, associated with a CPU */ - idx = ksocknal_data.ksnd_irq_info[irq]; - /* Don't bind again if we've bound already */ - if ((idx & SOCKNAL_IRQ_BOUND) != 0) - bind_irq = 0; - - sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK]; + + /* associate conn/route for auto-reconnect */ + route->ksnr_conn = conn; + atomic_inc (&conn->ksnc_refcount); + conn->ksnc_route = route; + atomic_inc (&route->ksnr_refcount); + route->ksnr_connecting = 0; + + route->ksnr_generation++; + route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + + peer = route->ksnr_peer; + } else { + /* Not an autoconnected connection; see if there is an + * existing peer for this NID */ + peer2 = ksocknal_find_peer_locked (nid); + if (peer2 != NULL) { + ksocknal_put_peer (peer); + peer = peer2; + } else { + list_add (&peer->ksnp_list, + ksocknal_nid2peerlist (nid)); + /* peer list takes over existing ref */ + } } - sched->kss_nconns++; - conn->ksnc_scheduler = sched; + LASSERT (!peer->ksnp_closing); - list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist); + conn->ksnc_peer = peer; + atomic_inc (&peer->ksnp_refcount); - write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + list_add (&conn->ksnc_list, &peer->ksnp_conns); + atomic_inc (&conn->ksnc_refcount); - if (bind_irq && /* irq binding required */ - irq != 0) /* hardware NIC */ - ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers); + sched = ksocknal_choose_scheduler_locked (irq); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; - /* NOW it's safe to get called back when socket is ready... */ + /* NB my callbacks block while I hold ksnd_global_lock */ sock->sk->sk_user_data = conn; sock->sk->sk_data_ready = ksocknal_data_ready; sock->sk->sk_write_space = ksocknal_write_space; - /* ...which I call right now to get things going */ + /* Take all the packets blocking for a connection. + * NB, it might be nicer to share these blocked packets among any + * other connections that are becoming established, however that + * confuses the normal packet launching operation, which selects a + * connection and queues the packet on it without needing an + * exclusive lock on ksnd_global_lock. */ + while (!list_empty (&peer->ksnp_tx_queue)) { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + list_del (&tx->tx_list); + ksocknal_queue_tx_locked (tx, conn); + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + if (bind_irq) /* irq binding required */ + ksocknal_bind_irq (irq); + + /* Call the callbacks right now to get things going. */ ksocknal_data_ready (sock->sk, 0); ksocknal_write_space (sock->sk); CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", - conn, conn->ksnc_peernid); + conn, conn->ksnc_peer->ksnp_nid); - /* Can't unload while connection active */ - PORTAL_MODULE_USE; - RETURN(0); + ksocknal_put_conn (conn); + return (0); +} -error: - fput(file); - return (ret); +void +ksocknal_close_conn_locked (ksock_conn_t *conn) +{ + /* This just does the immediate housekeeping, and queues the 
+ * Caller holds ksnd_global_lock exclusively in irq context */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + + LASSERT (!conn->ksnc_closing); + conn->ksnc_closing = 1; + atomic_inc (&ksocknal_data.ksnd_nclosing_conns); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... */ + LASSERT (!route->ksnr_connecting && + !route->ksnr_deleted); + + route->ksnr_conn = NULL; + conn->ksnc_route = NULL; + + ksocknal_put_route (route); /* drop conn's ref on route */ + ksocknal_put_conn (conn); /* drop route's ref on conn */ + } + + /* ksnd_deathrow_conns takes over peer's ref */ + list_del (&conn->ksnc_list); + + if (list_empty (&peer->ksnp_conns) && + list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * non-autoconnecting peer */ + ksocknal_unlink_peer_locked (peer); + } + + spin_lock (&ksocknal_data.ksnd_reaper_lock); + + list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); + if (waitqueue_active (&ksocknal_data.ksnd_reaper_waitq)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock (&ksocknal_data.ksnd_reaper_lock); } -/* Passing in a zero nid will close all connections */ int -ksocknal_close_sock(ptl_nid_t nid) +ksocknal_close_conn_unlocked (ksock_conn_t *conn) { - unsigned long flags; - ksock_conn_t *conn; - LIST_HEAD (death_row); - struct list_head *tmp; + unsigned long flags; + int did_it = 0; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + if (!conn->ksnc_closing) { + did_it = 1; + ksocknal_close_conn_locked (conn); + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - LASSERT (!in_interrupt()); - write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + return (did_it); +} - if (nid == 0) { /* close ALL connections */ - /* insert 'death row' into the socket list... */ - list_add (&death_row, &ksocknal_data.ksnd_socklist); - /* ...extract and reinitialise the socket list itself... */ - list_del_init (&ksocknal_data.ksnd_socklist); - /* ...and voila, death row is the proud owner of all conns */ - } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) { +void +ksocknal_terminate_conn (ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + unsigned long flags; + + /* serialise with callbacks */ + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + LASSERT (conn->ksnc_closing); + + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + conn->ksnc_sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + conn->ksnc_sock->sk->sk_user_data = NULL; + + conn->ksnc_scheduler->kss_nconns--; + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. 
*/ + ksocknal_putconnsock (conn); +} - conn = list_entry (tmp, ksock_conn_t, ksnc_list); +void +ksocknal_destroy_conn (ksock_conn_t *conn) +{ + /* Final coup-de-grace of the reaper */ + CDEBUG (D_NET, "connection %p\n", conn); - if (conn->ksnc_peernid == nid) { - list_del (&conn->ksnc_list); - list_add (&conn->ksnc_list, &death_row); - break; - } + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (conn->ksnc_route == NULL); + LASSERT (!conn->ksnc_tx_scheduled); + LASSERT (!conn->ksnc_rx_scheduled); +#if SOCKNAL_ZC + LASSERT (list_empty (&conn->ksnc_tx_pending)); +#endif + /* complete queued packets */ + while (!list_empty (&conn->ksnc_tx_queue)) { + ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + CERROR ("Deleting packet type %d len %d ("LPX64"->"LPX64")\n", + NTOH__u32 (tx->tx_hdr->type), + NTOH__u32 (PTL_HDR_LENGTH(tx->tx_hdr)), + NTOH__u64 (tx->tx_hdr->src_nid), + NTOH__u64 (tx->tx_hdr->dest_nid)); + + list_del (&tx->tx_list); + ksocknal_tx_done (tx, 0); } - write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_BODY: + lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie); + break; + case SOCKNAL_RX_BODY_FWD: + ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); + break; + case SOCKNAL_RX_HEADER: + case SOCKNAL_RX_SLOP: + break; + default: + LBUG (); + break; + } - if (nid && list_empty (&death_row)) - return (-ENOENT); + ksocknal_put_peer (conn->ksnc_peer); - while (!list_empty (&death_row)) { - conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); - list_del (&conn->ksnc_list); + PORTAL_FREE (conn, sizeof (*conn)); + atomic_dec (&ksocknal_data.ksnd_nclosing_conns); +} - /* NB I _have_ to restore the callback, rather than storing - * a noop, since the socket could survive past this module - * being unloaded!! */ - conn->ksnc_sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space; +void +ksocknal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; - /* OK; no more callbacks, but they could be in progress now, - * so wait for them to complete... 
*/ - write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peer->ksnp_nid, + atomic_read (&conn->ksnc_refcount)); - /* ...however if I get the lock before a callback gets it, - * this will make them noop - */ - conn->ksnc_sock->sk->sk_user_data = NULL; + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + if (!atomic_dec_and_test (&conn->ksnc_refcount)) + return; + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - /* And drop the scheduler's connection count while I've got - * the exclusive lock */ - conn->ksnc_scheduler->kss_nconns--; + list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + if (waitqueue_active (&ksocknal_data.ksnd_reaper_waitq)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); - write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock, - flags); + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); +} - ksocknal_put_conn (conn); /* drop ref for ksnd_socklist */ +int +ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) +{ + unsigned long flags; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + ksock_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; } - return (0); + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { + + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid)) + continue; + + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + + conn = list_entry (ctmp, ksock_conn_t, + ksnc_list); + + if (!(ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr)) + continue; + + rc = 0; + ksocknal_close_conn_locked (conn); + } + } + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (rc); } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) @@ -370,13 +1036,20 @@ struct tcp_opt *sock2tcp_opt(struct sock *sk) void ksocknal_push_conn (ksock_conn_t *conn) { - struct sock *sk = conn->ksnc_sock->sk; - struct tcp_opt *tp = sock2tcp_opt(sk); + struct sock *sk; + struct tcp_opt *tp; int nonagle; int val = 1; int rc; mm_segment_t oldmm; + rc = ksocknal_getconnsock (conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = sock2tcp_opt(sk); + lock_sock (sk); nonagle = tp->nonagle; tp->nonagle = 1; @@ -394,47 +1067,33 @@ ksocknal_push_conn (ksock_conn_t *conn) lock_sock (sk); tp->nonagle = nonagle; release_sock (sk); + + ksocknal_putconnsock (conn); } -/* Passing in a zero nid pushes all connections */ -int -ksocknal_push_sock (ptl_nid_t nid) +void +ksocknal_push_peer (ksock_peer_t *peer) { - ksock_conn_t *conn; - struct list_head *tmp; - int index; - int i; - - if (nid != 0) { - conn = ksocknal_get_conn (nid); - - if (conn == NULL) - return (-ENOENT); - - ksocknal_push_conn (conn); - ksocknal_put_conn (conn); - - return (0); - } + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; - /* NB we can't remove connections from the socket list so we have to - * cope with them being removed from under us... 
- */ for (index = 0; ; index++) { - read_lock (&ksocknal_data.ksnd_socklist_lock); + read_lock (&ksocknal_data.ksnd_global_lock); i = 0; conn = NULL; - list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + list_for_each (tmp, &peer->ksnp_conns) { if (i++ == index) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); // take a ref + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); break; } } - read_unlock (&ksocknal_data.ksnd_socklist_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); if (conn == NULL) break; @@ -442,80 +1101,57 @@ ksocknal_push_sock (ptl_nid_t nid) ksocknal_push_conn (conn); ksocknal_put_conn (conn); } - - return (0); } -ksock_conn_t * -ksocknal_get_conn (ptl_nid_t nid) +int +ksocknal_push (ptl_nid_t nid) { - struct list_head *tmp; - ksock_conn_t *conn; - - PROF_START(conn_list_walk); - - read_lock (&ksocknal_data.ksnd_socklist_lock); - - list_for_each(tmp, &ksocknal_data.ksnd_socklist) { - - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - - if (conn->ksnc_peernid == nid) { - /* caller is referencing */ - atomic_inc (&conn->ksnc_refcount); - - read_unlock (&ksocknal_data.ksnd_socklist_lock); + ksock_peer_t *peer; + struct list_head *tmp; + int index; + int i; + int j; + int rc = -ENOENT; - CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", - conn, nid, atomic_read (&conn->ksnc_refcount)); + if (nid != PTL_NID_ANY) { + peer = ksocknal_get_peer (nid); - PROF_FINISH(conn_list_walk); - return (conn); + if (peer != NULL) { + rc = 0; + ksocknal_push_peer (peer); + ksocknal_put_peer (peer); } + return (rc); } - read_unlock (&ksocknal_data.ksnd_socklist_lock); - - CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", - nid); - PROF_FINISH(conn_list_walk); - return (NULL); -} - -void -ksocknal_close_conn (ksock_conn_t *conn) -{ - CDEBUG (D_NET, "connection [%p] closed \n", conn); - - fput (conn->ksnc_file); - PORTAL_FREE (conn, sizeof (*conn)); - - /* One less connection keeping us hanging on */ - PORTAL_MODULE_UNUSE; -} + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + for (j = 0; ; j++) { + read_lock (&ksocknal_data.ksnd_global_lock); -void -_ksocknal_put_conn (ksock_conn_t *conn) -{ - unsigned long flags; + index = 0; + peer = NULL; - CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + if (index++ == j) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + atomic_inc (&peer->ksnp_refcount); + break; + } + } - /* "But what is the black spot, captain?" I asked. - * "That's a summons, mate..." 
*/ + read_unlock (&ksocknal_data.ksnd_global_lock); - LASSERT (atomic_read (&conn->ksnc_refcount) == 0); - LASSERT (conn->ksnc_sock->sk->sk_data_ready != ksocknal_data_ready); - LASSERT (conn->ksnc_sock->sk->sk_write_space != ksocknal_write_space); - LASSERT (conn->ksnc_sock->sk->sk_user_data == NULL); - LASSERT (!conn->ksnc_rx_scheduled); - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + if (peer != NULL) { + rc = 0; + ksocknal_push_peer (peer); + ksocknal_put_peer (peer); + } + } - list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list); - wake_up (&ksocknal_data.ksnd_reaper_waitq); + } - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + return (rc); } int @@ -526,13 +1162,68 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) LASSERT (data != NULL); switch(data->ioc_nal_cmd) { + case NAL_CMD_GET_AUTOCONN: { + ksock_route_t *route = ksocknal_get_route_by_idx (data->ioc_count); + + if (route == NULL) + rc = -ENOENT; + else { + rc = 0; + data->ioc_nid = route->ksnr_peer->ksnp_nid; + data->ioc_id = route->ksnr_ipaddr; + data->ioc_misc = route->ksnr_port; + data->ioc_count = route->ksnr_generation; + data->ioc_size = route->ksnr_buffer_size; + data->ioc_wait = route->ksnr_sharecount; + data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) | + (route->ksnr_xchange_nids ? 2 : 0) | + (route->ksnr_irq_affinity ? 4 : 0); + ksocknal_put_route (route); + } + break; + } + case NAL_CMD_ADD_AUTOCONN: { + rc = ksocknal_add_route (data->ioc_nid, data->ioc_id, + data->ioc_misc, data->ioc_size, + (data->ioc_flags & 1) != 0, + (data->ioc_flags & 2) != 0, + (data->ioc_flags & 4) != 0, + (data->ioc_flags & 8) != 0); + break; + } + case NAL_CMD_DEL_AUTOCONN: { + rc = ksocknal_del_route (data->ioc_nid, data->ioc_id, + (data->ioc_flags & 1) != 0, + (data->ioc_flags & 2) != 0); + break; + } + case NAL_CMD_GET_CONN: { + ksock_conn_t *conn = ksocknal_get_conn_by_idx (data->ioc_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + data->ioc_nid = conn->ksnc_peer->ksnp_nid; + data->ioc_id = conn->ksnc_ipaddr; + data->ioc_misc = conn->ksnc_port; + ksocknal_put_conn (conn); + } + break; + } case NAL_CMD_REGISTER_PEER_FD: { - rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd, - data->ioc_flags); + struct socket *sock = sockfd_lookup (data->ioc_fd, &rc); + + if (sock != NULL) { + rc = ksocknal_create_conn (data->ioc_nid, NULL, + sock, data->ioc_flags); + if (rc != 0) + fput (sock->file); + } break; } case NAL_CMD_CLOSE_CONNECTION: { - rc = ksocknal_close_sock(data->ioc_nid); + rc = ksocknal_close_conn (data->ioc_nid, data->ioc_id); break; } case NAL_CMD_REGISTER_MYNID: { @@ -540,7 +1231,7 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) break; } case NAL_CMD_PUSH_CONNECTION: { - rc = ksocknal_push_sock (data->ioc_nid); + rc = ksocknal_push (data->ioc_nid); break; } } @@ -568,6 +1259,7 @@ ksocknal_free_buffers (void) SOCKNAL_LARGE_FWD_NMSGS)); } + LASSERT (ksocknal_data.ksnd_active_ltxs == 0); if (ksocknal_data.ksnd_ltxs != NULL) PORTAL_FREE (ksocknal_data.ksnd_ltxs, sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + @@ -576,9 +1268,13 @@ ksocknal_free_buffers (void) if (ksocknal_data.ksnd_schedulers != NULL) PORTAL_FREE (ksocknal_data.ksnd_schedulers, sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); + + PORTAL_FREE (ksocknal_data.ksnd_peers, + sizeof (struct list_head) * + ksocknal_data.ksnd_peer_hash_size); } -void __exit +void /*__exit*/ ksocknal_module_fini (void) { int i; @@ -601,10 +1297,15 @@ ksocknal_module_fini (void) /* fall through */ case 
SOCKNAL_INIT_DATA: - /* Module refcount only gets to zero when all connections + /* Module refcount only gets to zero when all peers * have been closed so all lists must be empty */ - LASSERT (list_empty (&ksocknal_data.ksnd_socklist)); - LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list)); + LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0); + LASSERT (ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); + } + LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); @@ -623,6 +1324,7 @@ ksocknal_module_fini (void) /* flag threads to terminate; wake and wait for them to die */ ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); wake_up_all (&ksocknal_data.ksnd_reaper_waitq); for (i = 0; i < SOCKNAL_N_SCHED; i++) @@ -677,8 +1379,16 @@ ksocknal_module_init (void) memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ - INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist); - rwlock_init(&ksocknal_data.ksnd_socklist_lock); + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (ksocknal_data.ksnd_peers, + sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + RETURN (-ENOMEM); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); ksocknal_data.ksnd_nal_cb = &ksocknal_lib; spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); @@ -697,19 +1407,26 @@ ksocknal_module_init (void) init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); spin_lock_init (&ksocknal_data.ksnd_reaper_lock); - INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list); + INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); - memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED, - sizeof (ksocknal_data.ksnd_irq_info)); + spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_autoconnectd_waitq); + + /* NB memset above zeros whole of ksocknal_data, including + * ksocknal_data.ksnd_irqinfo[all].ksni_valid */ /* flag lists/ptrs/locks initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * SOCKNAL_N_SCHED); - if (ksocknal_data.ksnd_schedulers == NULL) + if (ksocknal_data.ksnd_schedulers == NULL) { + ksocknal_module_fini (); RETURN(-ENOMEM); + } for (i = 0; i < SOCKNAL_N_SCHED; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; @@ -723,7 +1440,7 @@ ksocknal_module_init (void) init_waitqueue_head (&kss->kss_waitq); } - CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), + CDEBUG (D_MALLOC, "ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, @@ -740,6 +1457,7 @@ ksocknal_module_init (void) for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; + ltx->ltx_tx.tx_hdr = &ltx->ltx_hdr; ltx->ltx_idle = i < SOCKNAL_NLTXS ?
&ksocknal_data.ksnd_idle_ltx_list : &ksocknal_data.ksnd_idle_nblk_ltx_list; @@ -767,9 +1485,18 @@ ksocknal_module_init (void) } } + for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) { + rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); + if (rc != 0) { + CERROR("Can't spawn socknal autoconnectd: %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + } + rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { - CERROR("Can't spawn socknal reaper: %d\n", rc); + CERROR ("Can't spawn socknal reaper: %d\n", rc); ksocknal_module_fini (); RETURN (rc); } diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 47ccd62..7a13396 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -25,9 +25,7 @@ */ #define DEBUG_PORTAL_ALLOC -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif +#define EXPORT_SYMTAB #include #include @@ -52,6 +50,7 @@ #include #include #include +#include #define DEBUG_SUBSYSTEM S_SOCKNAL @@ -59,9 +58,27 @@ #include #include -#define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ +#if CONFIG_SMP +# define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ +#else +# define SOCKNAL_N_SCHED 1 /* # socknal schedulers */ +#endif +#define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ + +#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ +#define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ + +#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */ + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ + +#if PTL_LARGE_MTU +# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */ +#else +# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ +#endif -#define SOCKNAL_MAX_FWD_PAYLOAD PTL_MTU /* biggest payload I can forward */ +#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ #define SOCKNAL_NLTXS 128 /* # normal transmit messages */ #define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ @@ -78,6 +95,10 @@ #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +# define jiffies_64 jiffies +#endif + #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) # define sk_data_ready data_ready # define sk_write_space write_space @@ -108,10 +129,17 @@ typedef struct /* per scheduler state */ } ksock_sched_t; typedef struct { + int ksni_valid:1; /* been set yet? */ + int ksni_bound:1; /* bound to a cpu yet? 
*/ + int ksni_sched:6; /* which scheduler (assumes < 64) */ +} ksock_irqinfo_t; + +typedef struct { int ksnd_init; /* initialisation state */ - struct list_head ksnd_socklist; /* all my connections */ - rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ + struct list_head *ksnd_peers; /* hash table of all my known peers */ + int ksnd_peer_hash_size; /* size of ksnd_peers */ nal_cb_t *ksnd_nal_cb; spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ @@ -119,7 +147,10 @@ typedef struct { atomic_t ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ ksock_sched_t *ksnd_schedulers; /* scheduler state */ - + + atomic_t ksnd_npeers; /* total # peers extant */ + atomic_t ksnd_nclosing_conns; /* # closed conns extant */ + kpr_router_t ksnd_router; /* THE router */ void *ksnd_fmbs; /* all the pre-allocated FMBs */ @@ -131,11 +162,21 @@ typedef struct { struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + int ksnd_active_ltxs; /* #active ltxs */ - struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ - wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + struct list_head ksnd_deathrow_conns; /* conns to be closed */ + struct list_head ksnd_zombie_conns; /* conns to be freed */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ spinlock_t ksnd_reaper_lock; /* serialise */ - unsigned char ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */ + + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + struct list_head ksnd_autoconnectd_routes; /* routes waiting to be connected */ + wait_queue_head_t ksnd_autoconnectd_waitq; /* autoconnectds sleep here */ + spinlock_t ksnd_autoconnectd_lock; /* serialise */ + + ksock_irqinfo_t ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */ } ksock_nal_data_t; #define SOCKNAL_INIT_NOTHING 0 @@ -143,10 +184,6 @@ typedef struct { #define SOCKNAL_INIT_PTL 2 #define SOCKNAL_INIT_ALL 3 -#define SOCKNAL_IRQ_BOUND 0x80 /* flag we _did_ bind already */ -#define SOCKNAL_IRQ_SCHED_MASK 0x7f /* we assume < 127 CPUs */ -#define SOCKNAL_IRQ_UNASSIGNED 0xff /* flag unassigned */ - /* A packet just assembled for transmission is represented by 1 or more * struct iovec fragments and 0 or more ptl_kiov_t fragments. Forwarded * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 @@ -161,17 +198,24 @@ typedef struct { * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used.
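 * (so in the ksock_tx_t below, tx_niov counts the struct iovec frags and tx_nkiov counts the ptl_kiov_t page frags)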
*/ +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ + typedef struct /* transmit packet */ { struct list_head tx_list; /* queue on conn for transmission etc */ + __u64 tx_deadline; /* when (in jiffies) tx times out */ char tx_isfwd; /* forwarding / sourced here */ int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ int tx_niov; /* # packet iovec frags */ struct iovec *tx_iov; /* packet iovec frags */ int tx_nkiov; /* # packet page frags */ ptl_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + ptl_hdr_t *tx_hdr; /* packet header (for debug only) */ #if SOCKNAL_ZC - ksock_sched_t *tx_sched; /* who to wake on callback */ zccd_t tx_zccd; /* zero copy callback descriptor */ #endif } ksock_tx_t; @@ -207,8 +251,7 @@ typedef struct /* locally transmitted packet */ /* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ /* NB list_entry() is used here as convenient macro for calculating a - * pointer to a struct from the address of a member. - */ + * pointer to a struct from the address of a member. */ typedef struct /* Kernel portals Socket Forwarding message buffer */ { /* (socknal->router) */ @@ -216,6 +259,7 @@ typedef struct /* Kernel portals Socket Forward kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ int fmb_npages; /* # pages allocated */ ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct ksock_peer *fmb_peer; /* peer received from */ struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; } ksock_fmb_t; @@ -234,19 +278,23 @@ typedef union { #define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ #define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ -typedef struct +typedef struct ksock_conn { - struct list_head ksnc_list; /* stash on global socket list */ - struct file *ksnc_file; /* socket filp */ + struct ksock_peer *ksnc_peer; /* owning peer */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer's conn list */ struct socket *ksnc_sock; /* actual socket */ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ void *ksnc_saved_write_space; /* socket's original write_space() callback */ - ptl_nid_t ksnc_peernid; /* who's on the other end */ atomic_t ksnc_refcount; /* # users */ ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ - + __u32 ksnc_ipaddr; /* peer's IP */ + int ksnc_port; /* peer's port */ + int ksnc_closing; /* being shut down */ + /* READER */ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + __u64 ksnc_rx_deadline; /* when receive times out */ int ksnc_rx_ready; /* data ready to read */ int ksnc_rx_scheduled; /* being progressed */ int ksnc_rx_state; /* what is being read */ @@ -263,37 +311,104 @@ typedef struct /* WRITER */ struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ +#if SOCKNAL_ZC + struct list_head ksnc_tx_pending; /* zc packets pending callback */ +#endif + atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ - } ksock_conn_t; -extern int ksocknal_add_sock (ptl_nid_t nid, int fd, int client); -extern int ksocknal_close_sock(ptl_nid_t nid); -extern int ksocknal_set_mynid(ptl_nid_t nid); -extern int ksocknal_push_sock(ptl_nid_t nid); -extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t 
nid); -extern void _ksocknal_put_conn (ksock_conn_t *conn); -extern void ksocknal_close_conn (ksock_conn_t *conn); +typedef struct ksock_route +{ + struct list_head ksnr_list; /* chain on peer route list */ + struct list_head ksnr_connect_list; /* chain on autoconnect list */ + struct ksock_peer *ksnr_peer; /* owning peer */ + atomic_t ksnr_refcount; /* # users */ + int ksnr_sharecount; /* lconf usage counter */ + __u64 ksnr_timeout; /* when reconnection can happen next */ + unsigned int ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_ipaddr; /* an IP address for this peer */ + int ksnr_port; /* port to connect to */ + int ksnr_buffer_size; /* size of socket buffers */ + unsigned int ksnr_irq_affinity:1; /* set affinity? */ + unsigned int ksnr_xchange_nids:1; /* do hello protocol? */ + unsigned int ksnr_nonagel:1; /* disable nagle? */ + unsigned int ksnr_connecting; /* autoconnect in progress? */ + unsigned int ksnr_deleted; /* been removed from peer? */ + int ksnr_generation; /* connection incarnation # */ + ksock_conn_t *ksnr_conn; /* NULL/active connection */ +} ksock_route_t; + +typedef struct ksock_peer +{ + struct list_head ksnp_list; /* stash on global peer list */ + ptl_nid_t ksnp_nid; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_closing; /* being closed */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ +} ksock_peer_t; -static inline void -ksocknal_put_conn (ksock_conn_t *conn) + + +extern nal_cb_t ksocknal_lib; +extern ksock_nal_data_t ksocknal_data; + +static inline struct list_head * +ksocknal_nid2peerlist (ptl_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return (&ksocknal_data.ksnd_peers [hash]); +} + +static inline int +ksocknal_getconnsock (ksock_conn_t *conn) { - CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", - conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + int rc = -ESHUTDOWN; + + read_lock (&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + rc = 0; + get_file (conn->ksnc_sock->file); + } + read_unlock (&ksocknal_data.ksnd_global_lock); + + return (rc); +} - if (atomic_dec_and_test (&conn->ksnc_refcount)) - _ksocknal_put_conn (conn); +static inline void +ksocknal_putconnsock (ksock_conn_t *conn) +{ + fput (conn->ksnc_sock->file); } +extern void ksocknal_put_route (ksock_route_t *route); +extern void ksocknal_put_peer (ksock_peer_t *peer); +extern ksock_peer_t *ksocknal_find_peer_locked (ptl_nid_t nid); +extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid); +extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, + int single, int keep_conn); +extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, + struct socket *sock, int bind_irq); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn); +extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn); +extern void ksocknal_terminate_conn (ksock_conn_t *conn); +extern void ksocknal_destroy_conn (ksock_conn_t *conn); +extern void ksocknal_put_conn (ksock_conn_t *conn); +extern int ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr); + +extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); +extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern void ksocknal_fmb_callback (void *arg, int error); extern int ksocknal_thread_start (int (*fn)(void 
*arg), void *arg); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); -extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); extern int ksocknal_scheduler (void *arg); -extern int ksocknal_reaper (void *arg); extern void ksocknal_data_ready(struct sock *sk, int n); extern void ksocknal_write_space(struct sock *sk); - - -extern nal_cb_t ksocknal_lib; -extern ksock_nal_data_t ksocknal_data; +extern int ksocknal_autoconnectd (void *arg); +extern int ksocknal_reaper (void *arg); +extern int ksocknal_set_linger (struct socket *sock); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 552ea50..656a0c5 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -25,13 +25,10 @@ #include "socknal.h" -atomic_t ksocknal_packets_received; -atomic_t ksocknal_packets_launched; -atomic_t ksocknal_packets_being_sent; - +int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT; #if SOCKNAL_ZC int ksocknal_do_zc = 1; -int ksocknal_zc_min_frag = 2048; +int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; #endif /* @@ -127,7 +124,7 @@ ksocknal_sti(nal_cb_t *nal, unsigned long *flags) int ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) { - /* I would guess that if ksocknal_get_conn(nid) == NULL, + /* I would guess that if ksocknal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ if ( nal->ni.nid == nid ) { *dist = 0; @@ -151,6 +148,7 @@ ksocknal_get_ltx (int may_block) ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); list_del (&ltx->ltx_tx.tx_list); + ksocknal_data.ksnd_active_ltxs++; break; } @@ -159,6 +157,7 @@ ksocknal_get_ltx (int may_block) ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); list_del (&ltx->ltx_tx.tx_list); + ksocknal_data.ksnd_active_ltxs++; } break; } @@ -175,6 +174,24 @@ ksocknal_get_ltx (int may_block) return (ltx); } +void +ksocknal_put_ltx (ksock_ltx_t *ltx) +{ + unsigned long flags; + + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + ksocknal_data.ksnd_active_ltxs--; + list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && + waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) + wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); +} + #if SOCKNAL_ZC struct page * ksocknal_kvaddr_to_page (unsigned long vaddr) @@ -202,11 +219,15 @@ ksocknal_kvaddr_to_page (unsigned long vaddr) #endif int -ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) { + struct socket *sock = conn->ksnc_sock; struct iovec *iov = tx->tx_iov; int fragsize = iov->iov_len; unsigned long vaddr = (unsigned long)iov->iov_base; + int more = !list_empty (&conn->ksnc_tx_queue) | + (tx->tx_niov > 1) | + (tx->tx_nkiov > 1); #if SOCKNAL_ZC int offset = vaddr & (PAGE_SIZE - 1); int zcsize = MIN (fragsize, PAGE_SIZE - offset); @@ -216,9 +237,8 @@ ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) /* NB we can't trust socket ops to either consume our iovs * or leave them alone, so we only send 1 frag at a time.
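 * (the caller, ksocknal_sendmsg(), loops over the frags and watches tx_resid to see how far the send got)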
*/ - LASSERT (fragsize <= tx->tx_nob); + LASSERT (fragsize <= tx->tx_resid); LASSERT (tx->tx_niov > 0); - more |= (tx->tx_niov > 1); #if SOCKNAL_ZC if (ksocknal_do_zc && @@ -230,7 +250,10 @@ ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", (void *)vaddr, page, page_address(page), offset, zcsize); - more |= (zcsize < fragsize); + if (fragsize > zcsize) { + more = 1; + fragsize = zcsize; + } rc = tcp_sendpage_zccd(sock, page, offset, zcsize, more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, @@ -260,37 +283,39 @@ ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) if (rc <= 0) return (rc); - tx->tx_nob -= rc; + tx->tx_resid -= rc; - if (rc < fragsize) { - /* didn't send whole frag */ + if (rc < iov->iov_len) { + /* didn't send whole iov entry... */ iov->iov_base = (void *)(vaddr + rc); - iov->iov_len = fragsize - rc; - return (-EAGAIN); + iov->iov_len -= rc; + /* ...but did we send everything we tried to send? */ + return ((rc == fragsize) ? 1 : -EAGAIN); } - /* everything went */ - LASSERT (rc == fragsize); tx->tx_iov++; tx->tx_niov--; return (1); } int -ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { + struct socket *sock = conn->ksnc_sock; ptl_kiov_t *kiov = tx->tx_kiov; int fragsize = kiov->kiov_len; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; + int more = !list_empty (&conn->ksnc_tx_queue) | + (tx->tx_nkiov > 1); int rc; /* NB we can't trust socket ops to either consume our iovs * or leave them alone, so we only send 1 frag at a time. */ - LASSERT (fragsize <= tx->tx_nob); + LASSERT (fragsize <= tx->tx_resid); LASSERT (offset + fragsize <= PAGE_SIZE); + LASSERT (tx->tx_niov == 0); LASSERT (tx->tx_nkiov > 0); - more |= (tx->tx_nkiov > 1); #if SOCKNAL_ZC if (ksocknal_do_zc && @@ -330,8 +355,8 @@ ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) if (rc <= 0) return (rc); - tx->tx_nob -= rc; - + tx->tx_resid -= rc; + if (rc < fragsize) { /* didn't send whole frag */ kiov->kiov_offset = offset + rc; @@ -347,29 +372,52 @@ ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) } int -ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more) +ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) { - int rc; - int sent_some = 0; + /* Return 0 on success, < 0 on error. + * caller checks tx_resid to determine progress/completion */ + int rc; ENTRY; - LASSERT (!in_interrupt()); + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (ksocknal_data.ksnd_stall_tx * HZ); + } + + rc = ksocknal_getconnsock (conn); + if (rc != 0) + return (rc); for (;;) { + LASSERT (tx->tx_resid != 0); + + if (conn->ksnc_closing) { + rc = -ESHUTDOWN; + break; + } + if (tx->tx_niov != 0) - rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0); + rc = ksocknal_send_iov (conn, tx); else - rc = ksocknal_send_kiov (sock, tx, more); - - /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */ - if (rc <= 0) /* error or partial send */ - RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc); - - if (tx->tx_nob == 0) /* sent everything */ - RETURN (0); + rc = ksocknal_send_kiov (conn, tx); + + if (rc <= 0) { /* error or socket full? 
*/ + /* NB: rc == 0 and rc == -EAGAIN both mean try + * again later (linux stack returns -EAGAIN for + * this, but Adaptec TOE returns 0) */ + if (rc == -EAGAIN) + rc = 0; + break; + } - sent_some = 1; + if (tx->tx_resid == 0) { /* sent everything */ + rc = 0; + break; + } } + + ksocknal_putconnsock (conn); + RETURN (rc); } int @@ -414,7 +462,6 @@ ksocknal_recv_iov (ksock_conn_t *conn) return (-EAGAIN); } - LASSERT (rc == fragsize); conn->ksnc_rx_iov++; conn->ksnc_rx_niov--; return (1); @@ -466,7 +513,6 @@ ksocknal_recv_kiov (ksock_conn_t *conn) return (-EAGAIN); } - LASSERT (rc == fragsize); conn->ksnc_rx_kiov++; conn->ksnc_rx_nkiov--; return (1); @@ -475,31 +521,47 @@ int ksocknal_recvmsg (ksock_conn_t *conn) { - int rc; - int got_some = 0; + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; ENTRY; - LASSERT (!in_interrupt ()); + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (ksocknal_data.ksnd_stall_rx * HZ); + } + + rc = ksocknal_getconnsock (conn); + if (rc != 0) + return (rc); for (;;) { - LASSERT (conn->ksnc_rx_nob_wanted > 0); + if (conn->ksnc_closing) { + rc = -ESHUTDOWN; + break; + } if (conn->ksnc_rx_niov != 0) rc = ksocknal_recv_iov (conn); else rc = ksocknal_recv_kiov (conn); - - /* CAVEAT EMPTOR: we return... * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */ - - if (rc <= 0) /* error/EOF or partial receive */ - RETURN ((got_some || rc == -EAGAIN) ? 1 : rc); - if (conn->ksnc_rx_nob_wanted == 0) - RETURN (1); + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) + rc = 1; + break; + } - got_some = 0; + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } } + + ksocknal_putconnsock (conn); + RETURN (rc); } #if SOCKNAL_ZC @@ -507,7 +569,7 void ksocknal_zc_callback (zccd_t *zcd) { ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); - ksock_sched_t *sched = tx->tx_sched; + ksock_sched_t *sched = tx->tx_conn->ksnc_scheduler; unsigned long flags; ENTRY; @@ -515,6 +577,7 @@ ksocknal_zc_callback (zccd_t *zcd) spin_lock_irqsave (&sched->kss_lock, flags); + list_del (&tx->tx_list); /* remove from kss_zctxpending_list */ list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); @@ -525,13 +588,24 @@ #endif void -ksocknal_tx_done (ksock_tx_t *tx) +ksocknal_tx_done (ksock_tx_t *tx, int asynch) { - unsigned long flags; ksock_ltx_t *ltx; ENTRY; - atomic_dec (&ksocknal_packets_being_sent); + if (tx->tx_conn != NULL) { + /* This tx got queued on a conn; do the accounting... */ + atomic_sub (tx->tx_nob, &tx->tx_conn->ksnc_tx_nob); +#if SOCKNAL_ZC + /* zero copy completion isn't always from + * process_transmit() so it needs to keep a ref on + * tx_conn... */ + if (asynch) + ksocknal_put_conn (tx->tx_conn); +#else + LASSERT (!asynch); +#endif + } if (tx->tx_isfwd) { /* was a forwarded packet?
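If so, it is handed back to the router via kpr_fwd_done() just below.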
*/ kpr_fwd_done (&ksocknal_data.ksnd_router, @@ -545,20 +619,44 @@ ksocknal_tx_done (ksock_tx_t *tx) lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); - spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); - - list_add_tail (<x->ltx_tx.tx_list, ltx->ltx_idle); - - /* normal tx desc => wakeup anyone blocking for one */ - if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && - waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) - wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + ksocknal_put_ltx (ltx); EXIT; } void +ksocknal_tx_launched (ksock_tx_t *tx) +{ +#if SOCKNAL_ZC + if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { + unsigned long flags; + ksock_conn_t *conn = tx->tx_conn; + ksock_sched_t *sched = conn->ksnc_scheduler; + + /* zccd skbufs are still in-flight. First take a ref on + * conn, so it hangs about for ksocknal_tx_done... */ + atomic_inc (&conn->ksnc_refcount); + + /* Stash it for timeout... + * NB We have to hold a lock to stash the tx, and we have + * stash it before we zcc_put(), but we have to _not_ hold + * this lock when we zcc_put(), otherwise we could deadlock + * if it turns out to be the last put. Aaaaarrrrggghhh! */ + spin_lock_irqsave (&sched->kss_lock, flags); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending); + spin_unlock_irqrestore (&sched->kss_lock, flags); + + /* ...then drop the initial ref on zccd, so the zero copy + * callback can occur */ + zccd_put (&tx->tx_zccd); + return; + } +#endif + /* Any zero-copy-ness (if any) has completed; I can complete the + * transmit now, avoiding an extra schedule */ + ksocknal_tx_done (tx, 0); +} + +void ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) { ksock_conn_t *conn; @@ -578,41 +676,32 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); - LASSERT (tx->tx_nob > 0); + LASSERT (tx->tx_resid > 0); conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */ mb(); /* => clear BEFORE trying to write */ - rc = ksocknal_sendmsg (conn->ksnc_sock, tx, - !list_empty (&conn->ksnc_tx_queue)); /* more to come? */ + rc = ksocknal_sendmsg (conn, tx); - CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); if (rc != 0) { - CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); - /* kid on for now the whole packet went. - * NB when we handle the error better, we'll still need to - * block for zccd completion. - */ - tx->tx_nob = 0; - } + if (ksocknal_close_conn_unlocked (conn)) { + /* I'm the first to close */ + CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n", + conn, rc, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); + } + ksocknal_tx_launched (tx); + spin_lock_irqsave (&sched->kss_lock, *irq_flags); - if (tx->tx_nob == 0) /* nothing left to send */ - { - /* everything went; assume more can go, so prevent write_space locking */ - conn->ksnc_tx_ready = 1; + } else if (tx->tx_resid == 0) { - ksocknal_put_conn (conn); /* release packet's ref */ - atomic_inc (&ksocknal_packets_being_sent); -#if SOCKNAL_ZC - if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - /* zccd skbufs are still in-flight. 
Release my - * initial ref on zccd, so callback can occur */ - zccd_put (&tx->tx_zccd); - } else -#endif - ksocknal_tx_done (tx); + /* everything went; assume more can go, and avoid + * write_space locking */ + conn->ksnc_tx_ready = 1; + ksocknal_tx_launched (tx); spin_lock_irqsave (&sched->kss_lock, *irq_flags); } else { spin_lock_irqsave (&sched->kss_lock, *irq_flags); @@ -621,84 +710,247 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) list_add (&tx->tx_list, &conn->ksnc_tx_queue); } - if (!conn->ksnc_tx_ready || /* no space to write now */ - list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */ - conn->ksnc_tx_scheduled = 0; /* not being scheduled */ - ksocknal_put_conn (conn); /* release scheduler's ref */ - } else /* let scheduler call me again */ + /* no space to write, or nothing to write? */ + if (!conn->ksnc_tx_ready || + list_empty (&conn->ksnc_tx_queue)) { + /* mark not scheduled */ + conn->ksnc_tx_scheduled = 0; + /* drop scheduler's ref */ + ksocknal_put_conn (conn); + } else { + /* stay scheduled */ list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + } +} + +void +ksocknal_launch_autoconnect_locked (ksock_route_t *route) +{ + unsigned long flags; + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (route->ksnr_conn == NULL); + LASSERT (!route->ksnr_deleted && !route->ksnr_connecting); + + route->ksnr_connecting = 1; + atomic_inc (&route->ksnr_refcount); /* extra ref for asynchd */ + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + list_add_tail (&route->ksnr_connect_list, + &ksocknal_data.ksnd_autoconnectd_routes); + + if (waitqueue_active (&ksocknal_data.ksnd_autoconnectd_waitq)) + wake_up (&ksocknal_data.ksnd_autoconnectd_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); +} + +ksock_peer_t * +ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) +{ + ptl_nid_t target_nid; + int rc; + ksock_peer_t *peer = ksocknal_find_peer_locked (nid); + + if (peer != NULL) + return (peer); + + if (tx->tx_isfwd) { + CERROR ("Can't send packet to "LPX64 + ": routed target is not a peer\n", nid); + return (NULL); + } + + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid); + if (rc != 0) { + CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); + return (NULL); + } + + peer = ksocknal_find_peer_locked (target_nid); + if (peer != NULL) + return (peer); + + CERROR ("Can't send packet to "LPX64": no peer entry\n", target_nid); + return (NULL); +} + +ksock_conn_t * +ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_conn_t *conn = NULL; + + /* Find the conn with the shortest tx queue */ + list_for_each (tmp, &peer->ksnp_conns) { + ksock_conn_t *c = list_entry (tmp, ksock_conn_t, ksnc_list); + + LASSERT (!c->ksnc_closing); + + if (conn == NULL || + atomic_read (&conn->ksnc_tx_nob) > + atomic_read (&c->ksnc_tx_nob)) + conn = c; + } + + return (conn); } void -ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) { unsigned long flags; ksock_sched_t *sched = conn->ksnc_scheduler; - /* Ensure the frags we've been given EXACTLY match the number of - * bytes we want to send. Many TCP/IP stacks disregard any total - * size parameters passed to them and just look at the frags. - * - * We always expect at least 1 mapped fragment containing the - * complete portals header. 
- */ - LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + - lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); - LASSERT (tx->tx_niov >= 1); - LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); - - CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n", - ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, - tx->tx_niov, tx->tx_nkiov); + /* called holding global lock (read or irq-write) */ + + CDEBUG (D_NET, "Sending to "LPX64" on port %d\n", + conn->ksnc_peer->ksnp_nid, conn->ksnc_port); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + tx->tx_resid = tx->tx_nob; + tx->tx_conn = conn; #if SOCKNAL_ZC zccd_init (&tx->tx_zccd, ksocknal_zc_callback); - /* NB this sets 1 ref on zccd, so the callback can only occur - * after I've released this ref */ - tx->tx_sched = sched; + /* NB this sets 1 ref on zccd, so the callback can only occur after + * I've released this ref. */ #endif - spin_lock_irqsave (&sched->kss_lock, flags); + spin_lock_irqsave (&sched->kss_lock, flags); + + tx->tx_deadline = jiffies_64 + ksocknal_io_timeout; list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); - - if (conn->ksnc_tx_ready && /* able to send */ - !conn->ksnc_tx_scheduled) { /* not scheduled to send */ - list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); conn->ksnc_tx_scheduled = 1; - atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); } spin_unlock_irqrestore (&sched->kss_lock, flags); +} - atomic_inc (&ksocknal_packets_launched); +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_conn == NULL && /* not connected */ + !route->ksnr_connecting && /* not connecting */ + route->ksnr_timeout <= jiffies_64) /* OK to retry */ + return (route); + } + + return (NULL); } -ksock_conn_t * -ksocknal_send_target (ptl_nid_t nid) +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_t *peer) { - ptl_nid_t gatewaynid; - ksock_conn_t *conn; - int rc; + struct list_head *tmp; + ksock_route_t *route; - if ((conn = ksocknal_get_conn (nid)) == NULL) { - /* It's not a peer; try to find a gateway */ - rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid); - if (rc != 0) { - CERROR("Can't route to "LPX64": router error %d\n", - nid, rc); - return (NULL); - } + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_connecting) + return (route); + } + + return (NULL); +} - if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) { - CERROR ("Can't route to "LPX64": gateway "LPX64 - " is not a peer\n", nid, gatewaynid); - return (NULL); +int +ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) +{ + unsigned long flags; + ksock_peer_t *peer; + ksock_conn_t *conn; + ksock_route_t *route; + rwlock_t *g_lock; + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete portals header. 
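+ * (e.g. ksocknal_send_pages() gets here with tx_niov == 1 for the ptl_hdr_t, tx_nkiov == payload_niov and tx_nob == sizeof (ptl_hdr_t) + payload_len: exactly what the LASSERTs below verify)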
*/ + LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + + lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); + + CDEBUG (D_NET, "packet %p type %d, nob %d niov %d nkiov %d\n", + tx, ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + tx->tx_conn = NULL; /* only set when assigned a conn */ + + g_lock = &ksocknal_data.ksnd_global_lock; + read_lock (g_lock); + + peer = ksocknal_find_target_peer_locked (tx, nid); + if (peer == NULL) { + read_unlock (g_lock); + return (PTL_FAIL); + } + + /* Any routes need to be connected? (need write lock if so) */ + if (ksocknal_find_connectable_route_locked (peer) == NULL) { + conn = ksocknal_find_conn_locked (tx, peer); + if (conn != NULL) { + ksocknal_queue_tx_locked (tx, conn); + read_unlock (g_lock); + return (PTL_OK); } } + + /* need a write lock now to change peer state... */ - return (conn); + atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */ + read_unlock (g_lock); + write_lock_irqsave (g_lock, flags); + + if (peer->ksnp_closing) { /* peer deleted as I blocked! */ + write_unlock_irqrestore (g_lock, flags); + ksocknal_put_peer (peer); + return (PTL_FAIL); + } + ksocknal_put_peer (peer); /* drop ref I got above */ + + /* I may launch autoconnects, now we're write locked... */ + while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL) + ksocknal_launch_autoconnect_locked (route); + + conn = ksocknal_find_conn_locked (tx, peer); + if (conn != NULL) { + ksocknal_queue_tx_locked (tx, conn); + write_unlock_irqrestore (g_lock, flags); + return (PTL_OK); + } + + if (ksocknal_find_connecting_route_locked (peer) == NULL) { + /* no routes actually connecting now */ + write_unlock_irqrestore (g_lock, flags); + return (PTL_FAIL); + } + + list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); + + write_unlock_irqrestore (g_lock, flags); + return (PTL_OK); } ksock_ltx_t * @@ -745,32 +997,19 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, size_t payload_len) { ksock_ltx_t *ltx; - ksock_conn_t *conn; + int rc; /* NB 'private' is different depending on what we're sending. * Just ignore it until we can rely on it - * - * Also, the return code from this procedure is ignored. - * If we can't send, we must still complete with lib_finalize(). - * We'll have to wait for 3.2 to return an error event. 
*/ CDEBUG(D_NET, "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64 " pid %d\n", payload_len, payload_niov, nid, pid); - conn = ksocknal_send_target (nid); - if (conn == NULL) { - lib_finalize (&ksocknal_lib, private, cookie); - return (-1); - } - ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); - if (ltx == NULL) { - ksocknal_put_conn (conn); - lib_finalize (&ksocknal_lib, private, cookie); - return (-1); - } + if (ltx == NULL) + return (PTL_FAIL); /* append the payload_iovs to the one pointing at the header */ LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); @@ -781,8 +1020,11 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, ltx->ltx_tx.tx_niov = 1 + payload_niov; ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; - ksocknal_launch_packet (conn, &ltx->ltx_tx); - return (0); + rc = ksocknal_launch_packet (&ltx->ltx_tx, nid); + if (rc != PTL_OK) + ksocknal_put_ltx (ltx); + + return (rc); } int @@ -791,8 +1033,8 @@ ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len) { ksock_ltx_t *ltx; - ksock_conn_t *conn; - + int rc; + /* NB 'private' is different depending on what we're sending. * Just ignore it until we can rely on it */ @@ -800,15 +1042,9 @@ ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n", payload_len, payload_niov, nid, pid); - conn = ksocknal_send_target (nid); - if (conn == NULL) - return (-1); - ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); - if (ltx == NULL) { - ksocknal_put_conn (conn); - return (-1); - } + if (ltx == NULL) + return (PTL_FAIL); LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); LASSERT (payload_niov <= PTL_MD_MAX_IOV); @@ -819,17 +1055,20 @@ ltx->ltx_tx.tx_nkiov = payload_niov; ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; - ksocknal_launch_packet (conn, &ltx->ltx_tx); - return (0); + rc = ksocknal_launch_packet (&ltx->ltx_tx, nid); + if (rc != PTL_OK) + ksocknal_put_ltx (ltx); + + return (rc); } void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { - ksock_conn_t *conn; ptl_nid_t nid = fwd->kprfd_gateway_nid; ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; - + int rc; + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); @@ -837,23 +1076,19 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (nid == ksocknal_lib.ni.nid) nid = fwd->kprfd_target_nid; - conn = ksocknal_get_conn (nid); - if (conn == NULL) { - CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); - kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH); - return; - } - - /* This forward has now got a ref on conn */ - tx->tx_isfwd = 1; /* This is a forwarding packet */ tx->tx_nob = fwd->kprfd_nob; tx->tx_niov = fwd->kprfd_niov; tx->tx_iov = fwd->kprfd_iov; tx->tx_nkiov = 0; tx->tx_kiov = NULL; - - ksocknal_launch_packet (conn, tx); + tx->tx_hdr = (ptl_hdr_t *)fwd->kprfd_iov[0].iov_base; + + rc = ksocknal_launch_packet (tx, nid); + if (rc != 0) { + /* FIXME, could pass a better completion error */ + kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH); + } } int @@ -904,6 +1139,9 @@ ksocknal_fmb_callback (void *arg, int error) spin_unlock_irqrestore (&fmp->fmp_lock, flags); + /* drop peer ref taken on init */ + ksocknal_put_peer (fmb->fmb_peer); + if (conn == NULL) return; @@ -964,7 +1202,6 @@
ksocknal_get_idle_fmb (ksock_conn_t *conn) return (NULL); } - int ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) { @@ -982,22 +1219,26 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); /* Got a forwarding buffer; copy the header we just read into the - * forwarding buffer. If there's payload start reading reading it + * forwarding buffer. If there's payload, start reading it * into the buffer, otherwise the forwarding buffer can be kicked * off immediately. * * NB fmb->fmb_iov spans the WHOLE packet. * conn->ksnc_rx_iov spans just the payload. */ - fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); /* copy header */ memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); - if (payload_nob == 0) { /* got complete packet already */ - atomic_inc (&ksocknal_packets_received); + /* Take a ref on the conn's peer to prevent module unload before + * forwarding completes. NB we ref peer and not conn because + * all refs on conn after it has been closed must remove themselves + * in finite time */ + fmb->fmb_peer = conn->ksnc_peer; + atomic_inc (&conn->ksnc_peer->ksnp_refcount); + if (payload_nob == 0) { /* got complete packet already */ CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, packet_nob); @@ -1036,11 +1277,10 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) packet_nob, niov, fmb->fmb_iov, ksocknal_fmb_callback, fmb); - /* stash router's descriptor ready for call to kpr_fwd_start */ - conn->ksnc_cookie = &fmb->fmb_fwd; - + conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - + conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */ + /* payload is desc's iov-ed buffer, but skipping the hdr */ LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / sizeof (struct iovec)); @@ -1066,7 +1306,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) void ksocknal_fwd_parse (ksock_conn_t *conn) { - ksock_conn_t *conn2; + ksock_peer_t *peer; ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); int body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr)); @@ -1081,7 +1321,9 @@ ksocknal_fwd_parse (ksock_conn_t *conn) CERROR("dropping packet from "LPX64" for "LPX64": packet " "size %d illegal\n", NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, body_len); - ksocknal_new_packet (conn, 0); /* on to new packet */ + + ksocknal_new_packet (conn, 0); /* on to new packet */ + ksocknal_close_conn_unlocked (conn); /* give up on conn */ return; } @@ -1104,12 +1346,12 @@ } /* should have gone direct */ - conn2 = ksocknal_get_conn (conn->ksnc_hdr.dest_nid); - if (conn2 != NULL) { + peer = ksocknal_get_peer (conn->ksnc_hdr.dest_nid); + if (peer != NULL) { CERROR ("dropping packet from "LPX64" for "LPX64 ": target is a peer\n", conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); - ksocknal_put_conn (conn2); /* drop ref from get above */ + ksocknal_put_peer (peer); /* drop ref from get above */ /* on to next packet (skip this one's body) */ ksocknal_new_packet (conn, body_len); @@ -1221,10 +1463,18 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) rc = ksocknal_recvmsg(conn); - if (rc == 0) - goto out; - if (rc < 0) { - CERROR ("Error socknal read %p: %d\n", conn, rc); + if (rc <= 0) { + if (ksocknal_close_conn_unlocked (conn)) { + /* I'm the first to close */ + if (rc < 0) + CERROR
("[%p] Error %d on read from "LPX64" ip %08x:%d\n", + conn, rc, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); + else + CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n", + conn, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); + } goto out; } @@ -1236,9 +1486,9 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: - /* It's not for me */ - if (conn->ksnc_hdr.type != PTL_MSG_HELLO && + if (conn->ksnc_hdr.type != HTON__u32(PTL_MSG_HELLO) && NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + /* This packet isn't for me */ ksocknal_fwd_parse (conn); switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ @@ -1253,10 +1503,11 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* Not Reached */ } - PROF_START(lib_parse); /* sets wanted_len, iovs etc */ lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); - PROF_FINISH(lib_parse); + + /* start timeout (lib is waiting for finalize) */ + conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; @@ -1265,8 +1516,8 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* Fall through (completed packet for me) */ case SOCKNAL_RX_BODY: - atomic_inc (&ksocknal_packets_received); - /* packet is done now */ + /* payload all received */ + conn->ksnc_rx_deadline = 0; /* cancel timeout */ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); /* Fall through */ @@ -1277,16 +1528,19 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) goto try_read; /* try to finish reading slop now */ case SOCKNAL_RX_BODY_FWD: + /* payload all received */ CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn, NTOH__u64 (conn->ksnc_hdr.src_nid), NTOH__u64 (conn->ksnc_hdr.dest_nid), conn->ksnc_rx_nob_left); - atomic_inc (&ksocknal_packets_received); + /* cancel timeout (only needed it while fmb allocated) */ + conn->ksnc_rx_deadline = 0; - /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */ - kpr_fwd_start (&ksocknal_data.ksnd_router, - (kpr_fwd_desc_t *)conn->ksnc_cookie); + /* forward the packet. 
NB ksocknal_init_fmb() put fmb into + * conn->ksnc_cookie */ + fmb = (ksock_fmb_t *)conn->ksnc_cookie; + kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); /* no slop in forwarded packets */ LASSERT (conn->ksnc_rx_nob_left == 0); @@ -1308,9 +1562,12 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) if (!conn->ksnc_rx_ready) { /* let socket callback schedule again */ conn->ksnc_rx_scheduled = 0; - ksocknal_put_conn (conn); /* release scheduler's ref */ - } else /* let scheduler call me again */ + /* drop scheduler's ref */ + ksocknal_put_conn (conn); + } else { + /* stay scheduled */ list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); + } } int @@ -1384,7 +1641,7 @@ int ksocknal_scheduler (void *arg) else CERROR ("Can't set CPU affinity for %s\n", name); #endif /* CONFIG_SMP && CPU_AFFINITY */ - + spin_lock_irqsave (&sched->kss_lock, flags); while (!ksocknal_data.ksnd_shuttingdown) { @@ -1413,7 +1670,7 @@ int ksocknal_scheduler (void *arg) list_del (&tx->tx_list); spin_unlock_irqrestore (&sched->kss_lock, flags); - ksocknal_tx_done (tx); + ksocknal_tx_done (tx, 1); spin_lock_irqsave (&sched->kss_lock, flags); } @@ -1459,7 +1716,7 @@ ksocknal_data_ready (struct sock *sk, int n) ENTRY; /* interleave correctly with closing sockets... */ - read_lock (&ksocknal_data.ksnd_socklist_lock); + read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; if (conn == NULL) { /* raced with ksocknal_close_sock */ @@ -1490,7 +1747,7 @@ ksocknal_data_ready (struct sock *sk, int n) spin_unlock_irqrestore (&sched->kss_lock, flags); } - read_unlock (&ksocknal_data.ksnd_socklist_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; } @@ -1503,7 +1760,7 @@ ksocknal_write_space (struct sock *sk) ksock_sched_t *sched; /* interleave correctly with closing sockets... 
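 * (as in ksocknal_data_ready() above, sk_user_data may already have been cleared by a conn that is closing)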
*/ - read_lock (&ksocknal_data.ksnd_socklist_lock); + read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; @@ -1550,42 +1807,594 @@ ksocknal_write_space (struct sock *sk) } } - read_unlock (&ksocknal_data.ksnd_socklist_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_sock_write (struct socket *sock, void *buffer, int nob) +{ + int rc; + mm_segment_t oldmm = get_fs(); + + while (nob > 0) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + set_fs (KERNEL_DS); + rc = sock_sendmsg (sock, &msg, iov.iov_len); + set_fs (oldmm); + + if (rc < 0) + return (rc); + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return (-ECONNABORTED); + } + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} + +int +ksocknal_sock_read (struct socket *sock, void *buffer, int nob) +{ + int rc; + mm_segment_t oldmm = get_fs(); + + while (nob > 0) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + set_fs (KERNEL_DS); + rc = sock_recvmsg (sock, &msg, iov.iov_len, 0); + set_fs (oldmm); + + if (rc < 0) + return (rc); + + if (rc == 0) + return (-ECONNABORTED); + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} + +int +ksocknal_exchange_nids (struct socket *sock, ptl_nid_t nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le32 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le32 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (ksocknal_lib.ni.nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume sufficient socket buffering for this message */ + rc = ksocknal_sock_write (sock, &hdr, sizeof (hdr)); + if (rc != 0) { + CERROR ("Error %d sending HELLO to "LPX64"\n", rc, nid); + return (rc); + } + + rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv)); + if (rc != 0) { + CERROR ("Error %d reading HELLO from "LPX64"\n", rc, nid); + return (rc); + } + + if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) { + CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, nid); + return (-EINVAL); + } + + if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || + hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { + CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" + " from "LPX64"\n", + __le16_to_cpu (hmv->version_major), + __le16_to_cpu (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR, + nid); + return (-EINVAL); + } + + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... 
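+ * (i.e. the remaining sizeof (hdr) - sizeof (*hmv) bytes that follow the magic/version fields)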
*/ + + rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n", + rc, nid); + return (rc); + } + + /* ...and check we got what we expected */ + if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) || + PTL_HDR_LENGTH (&hdr) != __cpu_to_le32 (0)) { + CERROR ("Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload from "LPX64"\n", + __le32_to_cpu (hdr.type), + __le32_to_cpu (PTL_HDR_LENGTH (&hdr)), nid); + return (-EINVAL); + } + + if (__le64_to_cpu (hdr.src_nid) != nid) { + CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n", + __le64_to_cpu (hdr.src_nid), nid); + return (-EINVAL); + } + + return (0); +} + +int +ksocknal_set_linger (struct socket *sock) +{ + mm_segment_t oldmm = get_fs (); + int rc; + int option; + struct linger linger; + + /* Ensure this socket aborts active sends immediately when we close + * it. */ + + linger.l_onoff = 0; + linger.l_linger = 0; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof (linger)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_LINGER: %d\n", rc); + return (rc); + } + + option = -1; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_LINGER2: %d\n", rc); + return (rc); + } + + return (0); +} + +int +ksocknal_connect_peer (ksock_route_t *route) +{ + struct sockaddr_in peer_addr; + mm_segment_t oldmm = get_fs(); + __u64 n; + struct timeval tv; + int fd; + struct socket *sock; + int rc; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create autoconnect socket: %d\n", rc); + return (rc); + } + + /* Ugh; have to map_fd for compatibility with sockets passed in
And we actually need the refcounting that + * this gives you :) */ + + fd = sock_map_fd (sock); + if (fd < 0) { + sock_release (sock); + CERROR ("sock_map_fd error %d\n", fd); + return (fd); + } + + /* NB the fd now owns the ref on sock->file */ + LASSERT (sock->file != NULL); + LASSERT (file_count(sock->file) == 1); + + /* Set the socket timeouts, so our connection attempt completes in + * finite time */ + tv.tv_sec = ksocknal_io_timeout / HZ; + n = ksocknal_io_timeout % HZ; + n = n * 1000000 + HZ - 1; + do_div (n, HZ); + tv.tv_usec = n; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof (tv)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send timeout %d (in HZ): %d\n", + ksocknal_io_timeout, rc); + goto out; + } + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof (tv)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive timeout %d (in HZ): %d\n", + ksocknal_io_timeout, rc); + goto out; + } + + if (route->ksnr_nonagel) { + int option = 1; + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't disable nagel: %d\n", rc); + goto out; + } + } + + if (route->ksnr_buffer_size != 0) { + int option = route->ksnr_buffer_size; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + route->ksnr_buffer_size, rc); + goto out; + } + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + route->ksnr_buffer_size, rc); + goto out; + } + } + + memset (&peer_addr, 0, sizeof (peer_addr)); + peer_addr.sin_family = AF_INET; + peer_addr.sin_port = htons (route->ksnr_port); + peer_addr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); + + rc = sock->ops->connect (sock, (struct sockaddr *)&peer_addr, + sizeof (peer_addr), sock->file->f_flags); + if (rc != 0) { + CERROR ("Error %d connecting to "LPX64"\n", rc, + route->ksnr_peer->ksnp_nid); + goto out; + } + + if (route->ksnr_xchange_nids) { + rc = ksocknal_exchange_nids (sock, route->ksnr_peer->ksnp_nid); + if (rc != 0) + goto out; + } + + rc = ksocknal_create_conn (route->ksnr_peer->ksnp_nid, + route, sock, route->ksnr_irq_affinity); + if (rc == 0) { + /* Take an extra ref on sock->file to compensate for the + * upcoming close which will lose fd's ref on it. */ + get_file (sock->file); + } + + out: + sys_close (fd); + return (rc); +} + +void +ksocknal_autoconnect (ksock_route_t *route) +{ + LIST_HEAD (zombies); + ksock_tx_t *tx; + ksock_peer_t *peer; + unsigned long flags; + int rc; + + rc = ksocknal_connect_peer (route); + if (rc == 0) { + /* successfully autoconnected: create_conn did the + * route/conn binding and scheduled any blocked packets, + * so there's nothing left to do now. 
*/ + return; + } + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + peer = route->ksnr_peer; + route->ksnr_connecting = 0; + + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval; + route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, + SOCKNAL_MAX_RECONNECT_INTERVAL); + + if (!list_empty (&peer->ksnp_tx_queue) && + ksocknal_find_connecting_route_locked (peer) == NULL) { + LASSERT (list_empty (&peer->ksnp_conns)); + + /* None of the connections that the blocked packets are + * waiting for have been successful. Complete them now... */ + do { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &zombies); + } while (!list_empty (&peer->ksnp_tx_queue)); + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + while (!list_empty (&zombies)) { + tx = list_entry (zombies.next, ksock_tx_t, tx_list); + + CERROR ("Deleting packet type %d len %d ("LPX64"->"LPX64")\n", + NTOH__u32 (tx->tx_hdr->type), + NTOH__u32 (PTL_HDR_LENGTH(tx->tx_hdr)), + NTOH__u64 (tx->tx_hdr->src_nid), + NTOH__u64 (tx->tx_hdr->dest_nid)); + + list_del (&tx->tx_list); + /* complete now */ + ksocknal_tx_done (tx, 0); + } +} + +int +ksocknal_autoconnectd (void *arg) +{ + long id = (long)arg; + char name[16]; + unsigned long flags; + ksock_route_t *route; + int rc; + + snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id); + kportal_daemonize (name); + kportal_blockallsigs (); + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) { + route = list_entry (ksocknal_data.ksnd_autoconnectd_routes.next, + ksock_route_t, ksnr_connect_list); + + list_del (&route->ksnr_connect_list); + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + ksocknal_autoconnect (route); + ksocknal_put_route (route); + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + continue; + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + rc = wait_event_interruptible (ksocknal_data.ksnd_autoconnectd_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + ksocknal_thread_fini (); + return (0); +} + +ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_t *peer) +{ + /* We're called with a shared lock on ksnd_global_lock */ + unsigned long flags; + ksock_conn_t *conn; + struct list_head *ctmp; + ksock_tx_t *tx; + struct list_head *ttmp; + ksock_sched_t *sched; + + list_for_each (ctmp, &peer->ksnp_conns) { + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + sched = conn->ksnc_scheduler; + + if (conn->ksnc_rx_deadline != 0 && + conn->ksnc_rx_deadline <= jiffies_64) + goto timed_out; + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_for_each (ttmp, &conn->ksnc_tx_queue) { + tx = list_entry (ttmp, ksock_tx_t, tx_list); + LASSERT (tx->tx_deadline != 0); + + if (tx->tx_deadline <= jiffies_64) + goto timed_out_locked; + } +#if SOCKNAL_ZC + list_for_each (ttmp, &conn->ksnc_tx_pending) { + tx = list_entry (ttmp, ksock_tx_t, tx_list); + LASSERT (tx->tx_deadline != 0); + + if (tx->tx_deadline <= jiffies_64) + goto timed_out_locked; + } +#endif + spin_unlock_irqrestore (&sched->kss_lock, flags); + 
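+ /* Neither the rx deadline nor any queued tx deadline on this
+ * conn has expired; the scheduler lock is dropped above and we
+ * move on to the peer's next connection. */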
continue; + + timed_out_locked: + spin_unlock_irqrestore (&sched->kss_lock, flags); + timed_out: + atomic_inc (&conn->ksnc_refcount); + return (conn); + } + + return (NULL); +} + +void +ksocknal_check_peer_timeouts (struct list_head *peers) +{ + struct list_head *ptmp; + ksock_peer_t *peer; + ksock_conn_t *conn; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock (&ksocknal_data.ksnd_global_lock); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + conn = ksocknal_find_timed_out_conn (peer); + + if (conn != NULL) { + read_unlock (&ksocknal_data.ksnd_global_lock); + + if (ksocknal_close_conn_unlocked (conn)) { + /* I actually closed... */ + CERROR ("Timeout out conn->"LPX64" ip %x:%d\n", + peer->ksnp_nid, conn->ksnc_ipaddr, + conn->ksnc_port); + } + + /* NB we won't find this one again, but we can't + * just proceed with the next peer, since we dropped + * ksnd_global_lock and it might be dead already! */ + ksocknal_put_conn (conn); + goto again; + } + } + + read_unlock (&ksocknal_data.ksnd_global_lock); } int ksocknal_reaper (void *arg) { + wait_queue_t wait; unsigned long flags; ksock_conn_t *conn; - int rc; + int timeout; + int peer_index = 0; + __u64 deadline = jiffies_64; kportal_daemonize ("ksocknal_reaper"); kportal_blockallsigs (); + init_waitqueue_entry (&wait, current); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + while (!ksocknal_data.ksnd_shuttingdown) { - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - if (list_empty (&ksocknal_data.ksnd_reaper_list)) { - conn = NULL; - } else { - conn = list_entry (ksocknal_data.ksnd_reaper_list.next, + if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next, ksock_conn_t, ksnc_list); list_del (&conn->ksnc_list); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + ksocknal_terminate_conn (conn); + ksocknal_put_conn (conn); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + continue; } + if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry (ksocknal_data.ksnd_zombie_conns.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + ksocknal_destroy_conn (conn); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + continue; + } + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - if (conn != NULL) - ksocknal_close_conn (conn); - else { - rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq, - ksocknal_data.ksnd_shuttingdown || - !list_empty(&ksocknal_data.ksnd_reaper_list)); - LASSERT (rc == 0); + while ((timeout = deadline - jiffies_64) <= 0) { + /* Time to check for timeouts on a few more peers */ + ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]); + + peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE; + deadline += HZ; } + + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + set_current_state (TASK_INTERRUPTIBLE); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty (&ksocknal_data.ksnd_deathrow_conns) && + list_empty (&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); } + spin_unlock_irqrestore 
(&ksocknal_data.ksnd_reaper_lock, flags); + ksocknal_thread_fini (); return (0); } diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index e9b0e12..16ef401 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -636,7 +636,7 @@ int portals_debug_mark_buffer(char *text) return -EINVAL; CDEBUG(0, "********************************************************\n"); - CDEBUG(0, "DEBUG MARKER: %s\n", text); + CERROR("DEBUG MARKER: %s\n", text); CDEBUG(0, "********************************************************\n"); return 0; @@ -644,39 +644,104 @@ int portals_debug_mark_buffer(char *text) #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_PORTALS +/* this copies a snapshot of the debug buffer into an array of pages + * before doing the potentially blocking copy into userspace. it could + * be warning userspace if things wrap heavily while its off copying. */ __s32 portals_debug_copy_to_user(char *buf, unsigned long len) { int rc; - unsigned long debug_off; + unsigned long debug_off, i, off, copied; unsigned long flags; + struct page *page; + LIST_HEAD(my_pages); + struct list_head *pos, *n; if (len < debug_size) return -ENOSPC; - debug_off = atomic_read(&debug_off_a); + for (i = 0 ; i < debug_size; i += PAGE_SIZE) { + page = alloc_page(GFP_NOFS); + if (page == NULL) { + rc = -ENOMEM; + goto cleanup; + } + list_add(&page->list, &my_pages); + } + spin_lock_irqsave(&portals_debug_lock, flags); - if (debug_wrapped) { - /* All of this juggling with the 1s is to keep the trailing nul - * (which falls at debug_buf + debug_off) at the end of what we - * copy into user space */ - copy_to_user(buf, debug_buf + debug_off + 1, - debug_size - debug_off - 1); - copy_to_user(buf + debug_size - debug_off - 1, - debug_buf, debug_off + 1); - rc = debug_size; - } else { - copy_to_user(buf, debug_buf, debug_off); - rc = debug_off; + debug_off = atomic_read(&debug_off_a); + + /* Sigh. If the buffer is empty, then skip to the end. */ + if (debug_off == 0 && !debug_wrapped) { + spin_unlock_irqrestore(&portals_debug_lock, flags); + rc = 0; + goto cleanup; } + + if (debug_wrapped) + off = debug_off + 1; + else + off = 0; + copied = 0; + list_for_each(pos, &my_pages) { + unsigned long to_copy; + page = list_entry(pos, struct page, list); + + to_copy = min(debug_size - off, PAGE_SIZE); + if (to_copy == 0) { + off = 0; + to_copy = min(debug_size - off, PAGE_SIZE); + } +finish_partial: + memcpy(kmap(page), debug_buf + off, to_copy); + kunmap(page); + copied += to_copy; + if (copied >= (debug_wrapped ? debug_size : debug_off)) + break; + + off += to_copy; + if (off >= debug_size) { + off = 0; + if (to_copy != PAGE_SIZE) { + to_copy = PAGE_SIZE - to_copy; + goto finish_partial; + } + } + } + spin_unlock_irqrestore(&portals_debug_lock, flags); + off = 0; + list_for_each(pos, &my_pages) { + unsigned long to_copy; + page = list_entry(pos, struct page, list); + + to_copy = min(copied - off, PAGE_SIZE); + rc = copy_to_user(buf + off, kmap(page), to_copy); + kunmap(page); + if (rc) { + rc = -EFAULT; + goto cleanup; + } + off += to_copy; + if (off >= copied) + break; + } + rc = copied; + +cleanup: + list_for_each_safe(pos, n, &my_pages) { + page = list_entry(pos, struct page, list); + list_del(&page->list); + __free_page(page); + } return rc; } /* FIXME: I'm not very smart; someone smarter should make this better. */ void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, const char *format, ...) + const int line, unsigned long stack, char *format, ...) 
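/* NB 'format' loses its const qualifier because the code below now
 * rewrites a trailing '\n' in the caller's format string to ':'
 * in place, before appending the file/line/function suffix. */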
{ va_list ap; unsigned long flags; @@ -731,33 +796,34 @@ portals_debug_msg(int subsys, int mask, char *file, const char *fn, do_gettimeofday(&tv); prefix_nob = snprintf(debug_buf + debug_off, max_nob, - "%06x:%06x:%d:%lu.%06lu ", + "%06x:%06x:%d:%lu.%06lu :", subsys, mask, smp_processor_id(), tv.tv_sec, tv.tv_usec); max_nob -= prefix_nob; + if(*(format + strlen(format) - 1) == '\n') + *(format + strlen(format) - 1) = ':'; + + va_start(ap, format); + msg_nob = vsnprintf(debug_buf + debug_off + prefix_nob , + max_nob, format, ap); + max_nob -= msg_nob; + va_end(ap); #if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) - msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, - "(%s:%d:%s() %d | %d+%lu): ", + msg_nob += snprintf(debug_buf + debug_off + prefix_nob + msg_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu)\n", file, line, fn, current->pid, current->thread.extern_pid, stack); #elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, - "(%s:%d:%s() %d | %d+%lu): ", + msg_nob += snprintf(debug_buf + debug_off + prefix_nob + msg_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu)\n", file, line, fn, current->pid, current->thread.mode.tt.extern_pid, stack); #else - msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, - "(%s:%d:%s() %d+%lu): ", + msg_nob += snprintf(debug_buf + debug_off + prefix_nob + msg_nob, max_nob, + "(%s:%d:%s() %d+%lu)\n", file, line, fn, current->pid, stack); #endif - max_nob -= msg_nob; - - va_start(ap, format); - msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob, - max_nob, format, ap); - max_nob -= msg_nob; - va_end(ap); /* Print to console, while msg is contiguous in debug_buf */ /* NB safely terminated see above */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 23527ce..62db766 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -560,7 +560,6 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md) return (msg); } - /* * Incoming messages have a ptl_msg_t object associated with them * by the library. This object encapsulates the state of the @@ -756,9 +755,13 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, hdr->src_nid, hdr->src_pid, md, offset, mlength); - if (rc != 0) { + if (rc != PTL_OK) { CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", ni->nid, hdr->src_nid); + /* Hmm, this will create a GET event and make believe + * the reply completed, which it kind of did, only the + * source won't get her reply */ + lib_finalize (nal, private, msg); state_lock (nal, &flags); goto drop; } @@ -1099,7 +1102,8 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) lib_msg_t *msg = NULL; ptl_process_id_t *id = &args->target_in; unsigned long flags; - + int rc; + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ { @@ -1177,9 +1181,15 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) state_unlock(nal, &flags); - lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, - id->nid, id->pid, md, 0, md->length); - + rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + id->nid, id->pid, md, 0, md->length); + if (rc != PTL_OK) { + /* get_new_msg() committed us to sending by decrementing + * md->threshold, so we have to act like we did send, but + * the network dropped it. 
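+ * The caller still sees PTL_OK below, and the MD completes as
+ * if the PUT had gone out on the wire; the drop itself is
+ * invisible at this interface.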
*/ + lib_finalize (nal, private, msg); + } + return ret->rc = PTL_OK; } @@ -1268,7 +1278,8 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) ptl_process_id_t *id = &args->target_in; lib_md_t *md; unsigned long flags; - + int rc; + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ { @@ -1342,9 +1353,15 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) state_unlock(nal, &flags); - lib_send (nal, private, msg, &hdr, PTL_MSG_GET, - id->nid, id->pid, NULL, 0, 0); - + rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + id->nid, id->pid, NULL, 0, 0); + if (rc != PTL_OK) { + /* get_new_msg() committed us to sending by decrementing + * md->threshold, so we have to act like we did send, but + * the network dropped it. */ + lib_finalize (nal, private, msg); + } + return ret->rc = PTL_OK; } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index f10892c..9363251 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -67,6 +67,7 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, msg->nid, msg->pid, NULL, 0, 0); + /* If this send fails, there's nothing else to clean up */ } md = msg->md; diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index a2a8e77..462fd28 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -59,6 +59,8 @@ static int max = 8192; static int subsystem_mask = ~0; static int debug_mask = ~0; +#define MAX_MARK_SIZE 100 + static const char *portal_debug_subsystems[] = {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", "rpc", "mgmt", "portals", "socknal", "qswnal", "pinger", "filter", @@ -488,22 +490,29 @@ int jt_dbg_clear_debug_buf(int argc, char **argv) int jt_dbg_mark_debug_buf(int argc, char **argv) { - int rc; + int rc, max_size = MAX_MARK_SIZE-1; struct portal_ioctl_data data; char *text; time_t now = time(NULL); - if (argc > 2) { - fprintf(stderr, "usage: %s [marker text]\n", argv[0]); - return 0; - } - - if (argc == 2) { - text = argv[1]; + if (argc > 1) { + int counter; + text = malloc(MAX_MARK_SIZE); + strncpy(text, argv[1], max_size); + max_size-=strlen(argv[1]); + for(counter = 2; (counter < argc) && (max_size > 0) ; counter++){ + strncat(text, " ", 1); + max_size-=1; + strncat(text, argv[counter], max_size); + max_size-=strlen(argv[counter]); + } } else { text = ctime(&now); text[strlen(text) - 1] = '\0'; /* stupid \n */ } + if (!max_size) { + text[MAX_MARK_SIZE - 1] = '\0'; + } memset(&data, 0, sizeof(data)); data.ioc_inllen1 = strlen(text) + 1; @@ -530,7 +539,9 @@ int jt_dbg_modules(int argc, char **argv) } *mp, mod_paths[] = { {"portals", "lustre/portals/libcfs"}, {"ksocknal", "lustre/portals/knals/socknal"}, + {"lvfs", "lustre/lvfs"}, {"obdclass", "lustre/obdclass"}, + {"llog_test", "lustre/obdclass"}, {"ptlrpc", "lustre/ptlrpc"}, {"obdext2", "lustre/obdext2"}, {"ost", "lustre/ost"}, @@ -543,8 +554,9 @@ int jt_dbg_modules(int argc, char **argv) {"obdfilter", "lustre/obdfilter"}, {"extN", "lustre/extN"}, {"lov", "lustre/lov"}, - {"fsfilt_ext3", "lustre/obdclass"}, - {"fsfilt_extN", "lustre/obdclass"}, + {"fsfilt_ext3", "lustre/lvfs"}, + {"fsfilt_extN", "lustre/lvfs"}, + {"fsfilt_reiserfs", "lustre/lvfs"}, {"mds_ext2", "lustre/mds"}, {"mds_ext3", "lustre/mds"}, {"mds_extN", "lustre/mds"}, diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index c87b0952..b68bf34 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -33,6 +33,7 @@ 
#include #include #include +#include #include #ifdef __CYGWIN__ @@ -61,9 +62,7 @@ unsigned int portal_printk; unsigned int portal_stack; -static ptl_nid_t g_nid = 0; static unsigned int g_nal = 0; -static unsigned short g_port = 0; static int g_socket_txmem = 0; static int g_socket_rxmem = 0; @@ -144,21 +143,25 @@ ptl_gethostbyname(char * hname) { } int -ptl_parse_nid (ptl_nid_t *nidp, char *str) +ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { struct hostent *he; int a; int b; int c; int d; - + + if (!strcmp (str, "_all_")) + { + *ipaddrp = 0; + return (0); + } + if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && (a & ~0xff) == 0 && (b & ~0xff) == 0 && (c & ~0xff) == 0 && (d & ~0xff) == 0) { - __u32 addr = (a<<24)|(b<<16)|(c<<8)|d; - - *nidp = (ptl_nid_t)addr; + *ipaddrp = (a<<24)|(b<<16)|(c<<8)|d; return (0); } @@ -168,19 +171,55 @@ ptl_parse_nid (ptl_nid_t *nidp, char *str) { __u32 addr = *(__u32 *)he->h_addr; - *nidp = (ptl_nid_t)ntohl(addr); /* HOST byte order */ + *ipaddrp = ntohl(addr); /* HOST byte order */ return (0); } - if (sscanf (str, "%i", &a) == 1) + return (-1); +} + +char * +ptl_ipaddr_2_str (__u32 ipaddr, char *str) +{ + __u32 net_ip; + struct hostent *he; + + net_ip = htonl (ipaddr); + he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET); + if (he != NULL) + return (he->h_name); + + sprintf (str, "%d.%d.%d.%d", + (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff, + (ipaddr >> 8) & 0xff, ipaddr & 0xff); + return (str); +} + +int +ptl_parse_nid (ptl_nid_t *nidp, char *str) +{ + __u32 ipaddr; + long lval; + + if (!strcmp (str, "_all_")) { + *nidp = PTL_NID_ANY; + return (0); + } + + if (ptl_parse_ipaddr (&ipaddr, str) == 0) { + *nidp = (ptl_nid_t)ipaddr; + return (0); + } + + if (sscanf (str, "%li", &lval) == 1) { - *nidp = (ptl_nid_t)a; + *nidp = (ptl_nid_t)lval; return (0); } - if (sscanf (str, "%x", &a) == 1) + if (sscanf (str, "%lx", &lval) == 1) { - *nidp = (ptl_nid_t) a; + *nidp = (ptl_nid_t)lval; return (0); } @@ -201,6 +240,32 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) return (buffer); } +int g_nal_is_compatible (char *cmd, ...) +{ + va_list ap; + int nal; + + if (g_nal == 0) { + fprintf (stderr, "Error: you must run the 'network' command first.\n"); + return (0); + } + + va_start (ap, cmd); + + do { + nal = va_arg (ap, int); + } while (nal != 0 && nal != g_nal); + + va_end (ap); + + if (g_nal == nal) + return (1); + + fprintf (stderr, "Command %s not compatible with nal %s\n", + cmd, nal2name (g_nal)); + return (0); +} + int sock_write (int cfd, void *buffer, int nob) { @@ -266,22 +331,231 @@ int ptl_initialize(int argc, char **argv) int jt_ptl_network(int argc, char **argv) { - int nal; + name2num_t *entry; + int nal; - if (argc != 2 || - (nal = ptl_name2nal (argv[1])) == 0) - { - name2num_t *entry; + if (argc == 2 && + (nal = ptl_name2nal (argv[1])) != 0) { + g_nal = nal; + return (0); + } - fprintf(stderr, "usage: %s \n", argv[0]); - for (entry = nalnames; entry->name != NULL; entry++) - fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name); - fprintf(stderr, ">\n"); + fprintf(stderr, "usage: %s \n", argv[0]); + for (entry = nalnames; entry->name != NULL; entry++) + fprintf (stderr, "%s%s", entry == nalnames ? 
"<" : "|", entry->name); + fprintf(stderr, ">\n"); + return (-1); +} + +int +jt_ptl_print_autoconnects (int argc, char **argv) +{ + struct portal_ioctl_data data; + char buffer[64]; + int index; + int rc; + + if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + return -1; + + for (index = 0;;index++) { + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_GET_AUTOCONN; + data.ioc_count = index; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) + break; + + printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n", + data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), + data.ioc_misc, data.ioc_count, data.ioc_size, + (data.ioc_flags & 1) ? "on" : "off", + (data.ioc_flags & 2) ? "on" : "off", + (data.ioc_flags & 4) ? "on" : "off", + data.ioc_wait); } - else - g_nal = nal; - return (0); + if (index == 0) + printf ("\n"); + return 0; +} + +int +jt_ptl_add_autoconnect (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid; + __u32 ip; + int port; + int xchange_nids = 0; + int irq_affinity = 0; + int share = 0; + int rc; + + if (argc < 4 || argc > 5) { + fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]); + return 0; + } + + if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + return -1; + + if (ptl_parse_nid (&nid, argv[1]) != 0 || + nid == PTL_NID_ANY) { + fprintf (stderr, "Can't parse NID: %s\n", argv[1]); + return -1; + } + + if (ptl_parse_ipaddr (&ip, argv[2]) != 0) { + fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); + return -1; + } + + port = atol (argv[3]); + + if (argc > 4) { + char *opts = argv[4]; + + while (*opts != 0) + switch (*opts++) { + case 'x': + xchange_nids = 1; + break; + case 'i': + irq_affinity = 1; + break; + case 's': + share = 1; + break; + default: + fprintf (stderr, "Can't parse options: %s\n", + argv[4]); + return -1; + } + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_ADD_AUTOCONN; + data.ioc_nid = nid; + data.ioc_id = ip; + data.ioc_misc = port; + /* only passing one buffer size! */ + data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem); + data.ioc_flags = (g_socket_nonagle ? 1 : 0) | + (xchange_nids ? 2 : 0) | + (irq_affinity ? 4 : 0) | + (share ? 8 : 0); + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) { + fprintf (stderr, "failed to enable autoconnect: %s\n", + strerror (errno)); + return -1; + } + + return 0; +} + +int +jt_ptl_del_autoconnect (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid = PTL_NID_ANY; + __u32 ip = 0; + int share = 0; + int keep_conn = 0; + int rc; + + if (argc > 4) { + fprintf (stderr, "usage: %s [nid] [ipaddr] [sk]\n", + argv[0]); + return 0; + } + + if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + return -1; + + if (argc > 1 && + ptl_parse_nid (&nid, argv[1]) != 0) { + fprintf (stderr, "Can't parse nid: %s\n", argv[1]); + return -1; + } + + if (argc > 2 && + ptl_parse_ipaddr (&ip, argv[2]) != 0) { + fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); + return -1; + } + + if (argc > 3) { + char *opts = argv[3]; + + while (*opts != 0) + switch (*opts++) { + case 's': + share = 1; + break; + case 'k': + keep_conn = 1; + break; + default: + fprintf (stderr, "Can't parse flags: %s\n", + argv[3]); + return -1; + } + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_DEL_AUTOCONN; + data.ioc_nid = nid; + data.ioc_id = ip; + data.ioc_flags = (share ? 1 : 0) | + (keep_conn ? 
2 : 0); + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) { + fprintf (stderr, "failed to remove autoconnect route: %s\n", + strerror (errno)); + return -1; + } + + return 0; +} + +int +jt_ptl_print_connections (int argc, char **argv) +{ + struct portal_ioctl_data data; + char buffer[64]; + int index; + int rc; + + if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + return -1; + + for (index = 0;;index++) { + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_GET_CONN; + data.ioc_count = index; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) + break; + + printf (LPD64"@%s:%d\n", + data.ioc_nid, + ptl_ipaddr_2_str (data.ioc_id, buffer), + data.ioc_misc); + } + + if (index == 0) + printf ("\n"); + return 0; } int @@ -318,17 +592,17 @@ exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) return (-1); } - if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + if (hmv->magic != __cpu_to_le32 (PORTALS_PROTO_MAGIC)) { fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", - __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + __le32_to_cpu (hmv->magic), PORTALS_PROTO_MAGIC); return (-1); } - if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || - __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || + hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", - __cpu_to_le16 (hmv->version_major), - __cpu_to_le16 (hmv->version_minor), + __le16_to_cpu (hmv->version_major), + __le16_to_cpu (hmv->version_minor), PORTALS_PROTO_VERSION_MAJOR, PORTALS_PROTO_VERSION_MINOR); } @@ -343,12 +617,12 @@ exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) } /* ...and check we got what we expected */ - if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || - __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) || + PTL_HDR_LENGTH (&hdr) != __cpu_to_le32 (0)) { fprintf (stderr, "Expecting a HELLO hdr with 0 payload," " but got type %d with %d payload\n", - __cpu_to_le32 (hdr.type), - __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + __le32_to_cpu (hdr.type), + __le32_to_cpu (PTL_HDR_LENGTH (&hdr))); return (-1); } @@ -358,237 +632,189 @@ exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) int jt_ptl_connect(int argc, char **argv) { - if (argc < 2) { - usage: - fprintf(stderr, "usage: %s or \n", - argv[0]); + ptl_nid_t peer_nid; + struct portal_ioctl_data data; + struct sockaddr_in srvaddr; + __u32 ipaddr; + char *flag; + int fd, rc; + int nonagle = 0; + int rxmem = 0; + int txmem = 0; + int bind_irq = 0; + int xchange_nids = 0; + int port; + int o; + int olen; + + if (argc < 3) { + fprintf(stderr, "usage: %s ip port [xi]\n", argv[0]); return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + + if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0)) + return -1; + + rc = ptl_parse_ipaddr (&ipaddr, argv[1]); + if (rc != 0) { + fprintf(stderr, "Can't parse hostname: %s\n", argv[1]); return -1; } - if (g_nal == SOCKNAL || g_nal == TOENAL) { - ptl_nid_t peer_nid; - struct hostent *he; - struct portal_ioctl_data data; - struct sockaddr_in srvaddr; - char *flag; - int fd, rc; - int nonagle = 0; - int rxmem = 0; - int txmem = 0; - int bind_irq = 0; - int xchange_nids = 0; - int o; - int olen; - - if (argc < 3) { - goto usage; - } - he = 
ptl_gethostbyname(argv[1]); - if (!he) - return -1; - - g_port = atol(argv[2]); - - if (argc > 3) - for (flag = argv[3]; *flag != 0; flag++) - switch (*flag) - { - case 'i': - bind_irq = 1; - break; - - case 'x': - xchange_nids = 1; - break; - - default: - fprintf (stderr, "unrecognised flag '%c'\n", - *flag); - return (-1); - } - - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(g_port); - srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr; - - fd = socket(PF_INET, SOCK_STREAM, 0); - if ( fd < 0 ) { - fprintf(stderr, "socket() failed: %s\n", - strerror(errno)); - return -1; - } - - if (g_socket_nonagle) - { - o = 1; - if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0) - { - fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno)); + port = atol(argv[2]); + if (argc > 3) + for (flag = argv[3]; *flag != 0; flag++) + switch (*flag) + { + case 'i': + bind_irq = 1; + break; + + case 'x': + xchange_nids = 1; + break; + + default: + fprintf (stderr, "unrecognised flag '%c'\n", + *flag); return (-1); } - } - if (g_socket_rxmem != 0) - { - o = g_socket_rxmem; - if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) - { - fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno)); - return (-1); - } - } + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(port); + srvaddr.sin_addr.s_addr = htonl(ipaddr); - if (g_socket_txmem != 0) - { - o = g_socket_txmem; - if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) - { - fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); - return (-1); - } + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", strerror(errno)); + return -1; + } + + if (g_socket_nonagle) + { + o = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0) { + fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno)); + return (-1); } + } - rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if ( rc == -1 ) { - fprintf(stderr, "connect() failed: %s\n", - strerror(errno)); - return -1; + if (g_socket_rxmem != 0) { + o = g_socket_rxmem; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) { + fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno)); + return (-1); } + } - olen = sizeof (txmem); - if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) - fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); - olen = sizeof (rxmem); - if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) - fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); - olen = sizeof (nonagle); - if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) - fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); - - if (xchange_nids) { - - PORTAL_IOC_INIT (data); - data.ioc_nal = g_nal; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); - if (rc != 0) - { - fprintf (stderr, "failed to get my nid: %s\n", - strerror (errno)); - close (fd); - return (-1); - } - - rc = exchange_nids (fd, data.ioc_nid, &peer_nid); - if (rc != 0) - { - close (fd); - return (-1); - } + if (g_socket_txmem != 0) { + o = g_socket_txmem; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) { + fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); + return (-1); } - else - peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */ + } - printf("Connected host: %s 
NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1], - peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled"); + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + fprintf(stderr, "connect() failed: %s\n", strerror(errno)); + return -1; + } - PORTAL_IOC_INIT(data); - data.ioc_fd = fd; + olen = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) + fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); + olen = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) + fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); + olen = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) + fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); + + if (!xchange_nids) + peer_nid = ipaddr; + else { + PORTAL_IOC_INIT (data); data.ioc_nal = g_nal; - data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; - data.ioc_nid = peer_nid; - data.ioc_flags = bind_irq; - - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); - if (rc) { - fprintf(stderr, "failed to register fd with portals: " - "%s\n", strerror(errno)); + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc != 0) { + fprintf (stderr, "failed to get my nid: %s\n", + strerror (errno)); close (fd); - return -1; + return (-1); } - g_nid = peer_nid; - printf("Connection to "LPX64" registered with socknal\n", g_nid); - - rc = close(fd); - if (rc) { - fprintf(stderr, "close failed: %d\n", rc); - } - } else if (g_nal == QSWNAL) { - g_nid = atoi(argv[1]); - } else if (g_nal == GMNAL) { - g_nid = atoi(argv[1]); - } else if (g_nal == SCIMACNAL) { - unsigned int tmpnid; - if(sscanf(argv[1], "%x", &tmpnid) == 1) { - g_nid=tmpnid; - } - else { - fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]); + rc = exchange_nids (fd, data.ioc_nid, &peer_nid); + if (rc != 0) { + close (fd); + return (-1); } + } + printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1], + peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled"); + PORTAL_IOC_INIT(data); + data.ioc_fd = fd; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; - } else { - fprintf(stderr, "This should never happen. 
Also it is very " - "bad.\n"); + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to register fd with portals: %s\n", + strerror(errno)); + close (fd); + return -1; } + printf("Connection to "LPX64" registered with socknal\n", peer_nid); + + rc = close(fd); + if (rc) + fprintf(stderr, "close failed: %d\n", rc); + return 0; } int jt_ptl_disconnect(int argc, char **argv) { - if (argc > 2) { - fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + struct portal_ioctl_data data; + ptl_nid_t nid = PTL_NID_ANY; + __u32 ipaddr = 0; + int rc; + + if (argc > 3) { + fprintf(stderr, "usage: %s [nid] [ipaddr]\n", argv[0]); return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + + if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0)) + return -1; + + if (argc >= 2 && + ptl_parse_nid (&nid, argv[1]) != 0) { + fprintf (stderr, "Can't parse nid %s\n", argv[1]); return -1; } - if (g_nal == SOCKNAL || g_nal == TOENAL) { - struct hostent *he; - struct portal_ioctl_data data; - int rc; - PORTAL_IOC_INIT(data); - if (argc == 2) { - he = ptl_gethostbyname(argv[1]); - if (!he) - return -1; - - data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + if (argc >= 3 && + ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { + fprintf (stderr, "Can't parse ip addr %s\n", argv[2]); + return -1; + } - } else { - printf("Disconnecting ALL connections.\n"); - /* leave ioc_nid zeroed == disconnect all */ - } - data.ioc_nal = g_nal; - data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); - if (rc) { - fprintf(stderr, "failed to remove connection: %s\n", - strerror(errno)); - return -1; - } - } else if (g_nal == QSWNAL) { - printf("'disconnect' doesn't make any sense for " - "elan.\n"); - } else if (g_nal == GMNAL) { - printf("'disconnect' doesn't make any sense for " - "GM.\n"); - } else if (g_nal == SCIMACNAL) { - printf("'disconnect' doesn't make any sense for " - "SCI.\n"); - } else { - fprintf(stderr, "This should never happen. 
Also it is very " - "bad.\n"); + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; + data.ioc_nid = nid; + data.ioc_id = ipaddr; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to remove connection: %s\n", + strerror(errno)); return -1; } @@ -597,55 +823,86 @@ int jt_ptl_disconnect(int argc, char **argv) int jt_ptl_push_connection (int argc, char **argv) { - if (argc > 2) { - fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + struct portal_ioctl_data data; + int rc; + ptl_nid_t nid = PTL_NID_ANY; + __u32 ipaddr = 0; + + if (argc > 3) { + fprintf(stderr, "usage: %s [nid] [ip]\n", argv[0]); return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + + if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0)) + return -1; + + if (argc > 1 && + ptl_parse_nid (&nid, argv[1]) != 0) { + fprintf(stderr, "Can't parse nid: %s\n", argv[1]); return -1; } - if (g_nal == SOCKNAL || g_nal == TOENAL) { - struct hostent *he; - struct portal_ioctl_data data; - int rc; - - PORTAL_IOC_INIT(data); - if (argc == 2) { - he = ptl_gethostbyname(argv[1]); - if (!he) - return -1; - data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + if (argc > 2 && + ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { + fprintf(stderr, "Can't parse ipaddr: %s\n", argv[2]); + } - } else { - printf("Pushing ALL connections.\n"); - /* leave ioc_nid zeroed == disconnect all */ - } - data.ioc_nal = g_nal; - data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); - if (rc) { - fprintf(stderr, "failed to push connection: %s\n", - strerror(errno)); - return -1; - } - } else if (g_nal == QSWNAL) { - printf("'push' doesn't make any sense for elan.\n"); - } else if (g_nal == GMNAL) { - printf("'push' doesn't make any sense for GM.\n"); - } else if (g_nal == SCIMACNAL) { - printf("'push' doesn't make any sense for SCI.\n"); - } else { - fprintf(stderr, "This should never happen. Also it is very " - "bad.\n"); + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION; + data.ioc_nid = nid; + data.ioc_id = ipaddr; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to push connection: %s\n", + strerror(errno)); return -1; } return 0; } +int +jt_ptl_print_active_txs (int argc, char **argv) +{ + struct portal_ioctl_data data; + int index; + int rc; + + if (!g_nal_is_compatible (argv[0], QSWNAL, 0)) + return -1; + + for (index = 0;;index++) { + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_GET_TXDESC; + data.ioc_count = index; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) + break; + + printf ("%p: %5s payload %6d bytes to "LPX64" via "LPX64" by pid %6d: %s, %s, state %d\n", + data.ioc_pbuf1, + data.ioc_count == PTL_MSG_ACK ? "ACK" : + data.ioc_count == PTL_MSG_PUT ? "PUT" : + data.ioc_count == PTL_MSG_GET ? "GET" : + data.ioc_count == PTL_MSG_REPLY ? "REPLY" : "", + data.ioc_size, + data.ioc_nid, + data.ioc_nid2, + data.ioc_misc, + (data.ioc_flags & 1) ? "delayed" : "immediate", + (data.ioc_flags & 2) ? 
"nblk" : "normal", + data.ioc_flags >> 2); + } + + if (index == 0) + printf ("\n"); + return 0; +} + int jt_ptl_ping(int argc, char **argv) { int rc; diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c index 8c56d93..c083e48 100644 --- a/lnet/utils/ptlctl.c +++ b/lnet/utils/ptlctl.c @@ -30,9 +30,14 @@ command_t list[] = { {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, - {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: | for tcp/elan respectively)"}, - {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname]"}, - {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname]"}, + {"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"}, + {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixs])"}, + {"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"}, + {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"}, + {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [xi])"}, + {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"}, + {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [nid]"}, + {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits (no args)"}, {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, {"shownid", jt_ptl_shownid, 0, "print the local NID"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index db14c29..428fe42 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -225,7 +225,8 @@ fi #AC_SUBST(usrprefix) AC_MSG_CHECKING(if kernel has CPU affinity support) -if test "$target_cpu" != ia64 ; then +SET_CPUS_ALLOW="`grep -c set_cpus_allowed $LINUX/kernel/softirq.c`" +if test "$SET_CPUS_ALLOW" != 0 ; then enable_affinity_temp="-DCPU_AFFINITY=1" AC_MSG_RESULT(yes) else diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index e7b850d..a0626cc 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -574,7 +574,7 @@ __s32 portals_debug_copy_to_user(char *buf, unsigned long len); #endif void portals_debug_msg(int subsys, int mask, char *file, const char *fn, const int line, unsigned long stack, - const char *format, ...) + char *format, ...) 
__attribute__ ((format (printf, 7, 8))); #else void portals_debug_msg(int subsys, int mask, char *file, const char *fn, @@ -885,6 +885,11 @@ extern ptl_handle_ni_t kscimacnal_ni; #define NAL_CMD_CLOSE_CONNECTION 101 #define NAL_CMD_REGISTER_MYNID 102 #define NAL_CMD_PUSH_CONNECTION 103 +#define NAL_CMD_GET_CONN 104 +#define NAL_CMD_DEL_AUTOCONN 105 +#define NAL_CMD_ADD_AUTOCONN 106 +#define NAL_CMD_GET_AUTOCONN 107 +#define NAL_CMD_GET_TXDESC 108 enum { DEBUG_DAEMON_START = 1, diff --git a/lustre/portals/include/portals/list.h b/lustre/portals/include/portals/list.h index 78a1e2d..57713cb 100644 --- a/lustre/portals/include/portals/list.h +++ b/lustre/portals/include/portals/list.h @@ -16,6 +16,8 @@ struct list_head { struct list_head *next, *prev; }; +typedef struct list_head list_t; + #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h index dc02780..8278111 100644 --- a/lustre/portals/include/portals/ptlctl.h +++ b/lustre/portals/include/portals/ptlctl.h @@ -34,9 +34,14 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); +int jt_ptl_print_autoconnects (int argc, char **argv); +int jt_ptl_add_autoconnect (int argc, char **argv); +int jt_ptl_del_autoconnect (int argc, char **argv); +int jt_ptl_print_connections (int argc, char **argv); int jt_ptl_connect(int argc, char **argv); int jt_ptl_disconnect(int argc, char **argv); int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_print_active_txs(int argc, char **argv); int jt_ptl_ping(int argc, char **argv); int jt_ptl_shownid(int argc, char **argv); int jt_ptl_mynid(int argc, char **argv); diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index 7cfc80e..d4ee960 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -112,11 +112,48 @@ kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, } int +kqswnal_get_tx_desc (struct portal_ioctl_data *data) +{ + unsigned long flags; + struct list_head *tmp; + kqswnal_tx_t *ktx; + int index = data->ioc_count; + int rc = -ENOENT; + + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + list_for_each (tmp, &kqswnal_data.kqn_activetxds) { + if (index-- != 0) + continue; + + ktx = list_entry (tmp, kqswnal_tx_t, ktx_list); + + data->ioc_pbuf1 = (char *)ktx; + data->ioc_count = NTOH__u32(ktx->ktx_wire_hdr->type); + data->ioc_size = NTOH__u32(PTL_HDR_LENGTH(ktx->ktx_wire_hdr)); + data->ioc_nid = NTOH__u64(ktx->ktx_wire_hdr->dest_nid); + data->ioc_nid2 = ktx->ktx_nid; + data->ioc_misc = ktx->ktx_launcher; + data->ioc_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) | + (!ktx->ktx_isnblk ? 
0 : 2) | + (ktx->ktx_state << 2); + rc = 0; + break; + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + return (rc); +} + +int kqswnal_cmd (struct portal_ioctl_data *data, void *private) { LASSERT (data != NULL); switch (data->ioc_nal_cmd) { + case NAL_CMD_GET_TXDESC: + return (kqswnal_get_tx_desc (data)); + case NAL_CMD_REGISTER_MYNID: CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n", data->ioc_nid - kqswnal_data.kqn_elanid, @@ -149,6 +186,7 @@ kqswnal_finalise (void) /* fall through */ case KQN_INIT_DATA: + LASSERT(list_empty(&kqswnal_data.kqn_activetxds)); break; case KQN_INIT_NOTHING: @@ -318,6 +356,7 @@ kqswnal_initialise (void) INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); spin_lock_init (&kqswnal_data.kqn_idletxd_lock); init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq); @@ -460,12 +499,13 @@ kqswnal_initialise (void) ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ - if (i < KQSW_NTXMSGS) - ktx->ktx_idle = &kqswnal_data.kqn_idletxds; - else - ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds; + INIT_LIST_HEAD (&ktx->ktx_delayed_list); - list_add_tail (&ktx->ktx_list, ktx->ktx_idle); + ktx->ktx_state = KTX_IDLE; + ktx->ktx_isnblk = (i >= KQSW_NTXMSGS); + list_add_tail (&ktx->ktx_list, + ktx->ktx_isnblk ? &kqswnal_data.kqn_nblk_idletxds : + &kqswnal_data.kqn_idletxds); } /**********************************************************************/ diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index 5cbbb9a..a27239c 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -77,20 +77,20 @@ #define KQSW_OPTIMIZE_GETS 1 -#define KQSW_CHECKSUM 0 +#define KQSW_CHECKSUM 0 #if KQSW_CHECKSUM typedef unsigned long kqsw_csum_t; -#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) +#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) #else -#define KQSW_CSUM_SIZE 0 +#define KQSW_CSUM_SIZE 0 #endif -#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) +#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) /* * Elan NAL */ -#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ -#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ +#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ +#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ /* NB small/large message sizes are GLOBAL constants */ /* @@ -98,39 +98,39 @@ typedef unsigned long kqsw_csum_t; * NB no mention of PAGE_SIZE for interoperability */ #define KQSW_MAXPAYLOAD PTL_MTU -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ +#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ -#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ +#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ -#define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */ +#define KQSW_NTXMSGS 8 /* # normal transmit messages */ +#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages 
if can't block */ -#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ +#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ +#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ -#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ -#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ +#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ +#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ -#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ +#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ /* * derived constants */ -#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) /* The pre-allocated tx buffer (hdr + small payload) */ -#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) /* Reserve elan address space for pre-allocated and pre-mapped transmit * buffer and a full payload too. Extra pages allow for page alignment */ -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) /* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) +#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) -#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) /* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) +#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) /* biggest complete packet we can receive (or transmit) */ /* Remote memory descriptor */ @@ -143,7 +143,7 @@ typedef struct typedef struct { struct list_head krx_list; /* enqueue -> thread */ - EP_RCVR *krx_eprx; /* port to post receives to */ + EP_RCVR *krx_eprx; /* port to post receives to */ EP_RXD *krx_rxd; /* receive descriptor (for repost) */ E3_Addr krx_elanaddr; /* Elan address of buffer (contiguous in elan vm) */ int krx_npages; /* # pages in receive buffer */ @@ -157,17 +157,23 @@ typedef struct typedef struct { - struct list_head ktx_list; /* enqueue idle/delayed */ - struct list_head *ktx_idle; /* where to put when idle */ - char ktx_state; /* What I'm doing */ + struct list_head ktx_list; /* enqueue idle/active */ + struct list_head ktx_delayed_list; /* enqueue delayedtxds */ + int ktx_isnblk:1; /* reserved descriptor? 
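+ * (this 1-bit flag and the 7-bit ktx_state below replace the
+ * old 'char ktx_state' field, packing flag and state together)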
*/ + int ktx_state:7; /* What I'm doing */ uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ int ktx_npages; /* pages reserved for mapping messages */ int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ ptl_nid_t ktx_nid; /* destination node */ void *ktx_args[2]; /* completion passthru */ - E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ + E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ + + /* debug/info fields */ + pid_t ktx_launcher; /* pid of launching process */ + ptl_hdr_t *ktx_wire_hdr; /* portals header (wire endian) */ + int ktx_nfrag; /* # message frags */ union { EP_IOVEC iov[EP_MAXFRAG]; /* msg frags (elan vaddrs) */ @@ -175,9 +181,9 @@ typedef struct } ktx_frags; } kqswnal_tx_t; -#define KTX_IDLE 0 /* MUST BE ZERO (so zeroed ktx is idle) */ -#define KTX_SENDING 1 /* local send */ -#define KTX_FORWARDING 2 /* routing a packet */ +#define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ +#define KTX_SENDING 1 /* local send */ +#define KTX_FORWARDING 2 /* routing a packet */ #define KTX_GETTING 3 /* local optimised get */ typedef struct @@ -190,7 +196,8 @@ typedef struct kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ - struct list_head kqn_nblk_idletxds; /* reserve of */ + struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ + struct list_head kqn_activetxds; /* transmit descriptors being used */ spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ @@ -204,12 +211,12 @@ typedef struct spinlock_t kqn_statelock; /* cb_cli/cb_sti */ nal_cb_t *kqn_cb; /* -> kqswnal_lib */ - EP_DEV *kqn_epdev; /* elan device */ - EP_XMTR *kqn_eptx; /* elan transmitter */ - EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ - EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ - ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ - ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ + EP_DEV *kqn_epdev; /* elan device */ + EP_XMTR *kqn_eptx; /* elan transmitter */ + EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ + EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ + ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ + ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ @@ -218,10 +225,10 @@ typedef struct } kqswnal_data_t; /* kqn_init state */ -#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ -#define KQN_INIT_DATA 1 -#define KQN_INIT_PTL 2 -#define KQN_INIT_ALL 3 +#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ +#define KQN_INIT_DATA 1 +#define KQN_INIT_PTL 2 +#define KQN_INIT_ALL 3 extern nal_cb_t kqswnal_lib; extern nal_t kqswnal_api; diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 7032f6b..6390137 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -290,22 +290,24 @@ void kqswnal_put_idle_tx (kqswnal_tx_t *ktx) { kpr_fwd_desc_t *fwd = NULL; - struct 
list_head *idle = ktx->ktx_idle; unsigned long flags; - kqswnal_unmap_tx (ktx); /* release temporary mappings */ + kqswnal_unmap_tx (ktx); /* release temporary mappings */ ktx->ktx_state = KTX_IDLE; spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - list_add (&ktx->ktx_list, idle); + list_del (&ktx->ktx_list); /* take off active list */ - /* reserved for non-blocking tx */ - if (idle == &kqswnal_data.kqn_nblk_idletxds) { + if (ktx->ktx_isnblk) { + /* reserved for non-blocking tx */ + list_add (&ktx->ktx_list, &kqswnal_data.kqn_nblk_idletxds); spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); return; } + list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); + /* anything blocking for a tx descriptor? */ if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ { @@ -351,7 +353,6 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) if (!list_empty (&kqswnal_data.kqn_idletxds)) { ktx = list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t, ktx_list); - list_del (&ktx->ktx_list); break; } @@ -373,7 +374,6 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, kqswnal_tx_t, ktx_list); - list_del (&ktx->ktx_list); break; } @@ -386,6 +386,12 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) !list_empty (&kqswnal_data.kqn_idletxds)); } + if (ktx != NULL) { + list_del (&ktx->ktx_list); + list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); + ktx->ktx_launcher = current->pid; + } + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ @@ -490,7 +496,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds); + list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) wake_up (&kqswnal_data.kqn_sched_waitq); @@ -648,7 +654,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", - krx->krx_nob, ((char *)(rmd + 1)) - buffer); + krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); return (-EINVAL); } @@ -656,7 +662,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, /* rmd doesn't fit in the incoming message */ CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", krx->krx_nob, rmd->kqrmd_neiov, - ((char *)&rmd->kqrmd_eiov[rmd->kqrmd_neiov]) - buffer); + (int)(((char *)&rmd->kqrmd_eiov[rmd->kqrmd_neiov]) - buffer)); return (-EINVAL); } @@ -723,11 +729,6 @@ kqswnal_sendmsg (nal_cb_t *nal, int sumnob; #endif - /* NB, the return code from this procedure is ignored. - * If we can't send, we must still complete with lib_finalize(). - * We'll have to wait for 3.2 to return an error event. 
- */ - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 " pid %u\n", payload_nob, payload_niov, nid, pid); @@ -742,8 +743,7 @@ kqswnal_sendmsg (nal_cb_t *nal, if (payload_nob > KQSW_MAXPAYLOAD) { CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", payload_nob, KQSW_MAXPAYLOAD); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } targetnid = nid; @@ -752,14 +752,12 @@ kqswnal_sendmsg (nal_cb_t *nal, if (rc != 0) { CERROR("Can't route to "LPX64": router error %d\n", nid, rc); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } if (kqswnal_nid2elanid (targetnid) < 0) { CERROR("Bad gateway "LPX64" for "LPX64"\n", targetnid, nid); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } } @@ -770,8 +768,7 @@ kqswnal_sendmsg (nal_cb_t *nal, in_interrupt())); if (ktx == NULL) { kqswnal_cerror_hdr (hdr); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_NOSPACE); } ktx->ktx_args[0] = private; @@ -789,12 +786,12 @@ kqswnal_sendmsg (nal_cb_t *nal, CERROR ("Can't DMA reply to "LPX64": %d\n", nid, rc); kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } #endif memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ + ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; #if KQSW_CHECKSUM csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); @@ -853,8 +850,7 @@ kqswnal_sendmsg (nal_cb_t *nal, if (rc < 0) { kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } rmd->kqrmd_neiov = ktx->ktx_nfrag - 1; @@ -864,9 +860,8 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_nfrag = 1; ktx->ktx_frags.iov[0].Len += offsetof (kqswnal_remotemd_t, kqrmd_eiov[rmd->kqrmd_neiov]); + payload_nob = ktx->ktx_frags.iov[0].Len; ktx->ktx_state = KTX_GETTING; - payload_nob = rc; - } else #endif if (payload_nob > 0) { /* got some payload (something more to do) */ @@ -890,8 +885,7 @@ kqswnal_sendmsg (nal_cb_t *nal, payload_niov, payload_iov); if (rc != 0) { kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } } } @@ -904,12 +898,11 @@ kqswnal_sendmsg (nal_cb_t *nal, if (rc != 0) { /* failed? */ CERROR ("Failed to send packet to "LPX64": %d\n", targetnid, rc); kqswnal_put_idle_tx (ktx); - lib_finalize (&kqswnal_lib, private, libmsg); - return (-1); + return (PTL_FAIL); } CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, targetnid); - return (0); + return (PTL_OK); } static int @@ -995,6 +988,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) ktx->ktx_frags.iov[0].Base = ktx->ktx_ebuffer; /* already mapped */ ktx->ktx_frags.iov[0].Len = nob; ktx->ktx_nfrag = 1; + ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; } else { @@ -1003,6 +997,8 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); if (rc != 0) goto failed; + + ktx->ktx_wire_hdr = (ptl_hdr_t *)iov[0].iov_base; } ktx->ktx_port = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ? 
@@ -1482,7 +1478,7 @@ kqswnal_scheduler (void *arg) { ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, kqswnal_tx_t, ktx_list); - list_del (&ktx->ktx_list); + list_del_init (&ktx->ktx_delayed_list); spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); diff --git a/lustre/portals/knals/socknal/Makefile.am b/lustre/portals/knals/socknal/Makefile.am index 437d7fc..acdba5e 100644 --- a/lustre/portals/knals/socknal/Makefile.am +++ b/lustre/portals/knals/socknal/Makefile.am @@ -9,5 +9,9 @@ MODULE = ksocknal modulenet_DATA = ksocknal.o EXTRA_PROGRAMS = ksocknal +# If you don't build with -O2, your modules won't insert, because htonl is +# just special that way. +AM_CFLAGS = -O2 + DEFS = ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index c9681a4..e7232a0 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ -59,7 +59,9 @@ ksocknal_api_shutdown(nal_t *nal, int ni) { CDEBUG (D_NET, "closing all connections\n"); - return ksocknal_close_sock(0); /* close all sockets */ + ksocknal_del_route (PTL_NID_ANY, 0, 0, 0); + ksocknal_close_conn (PTL_NID_ANY, 0); + return PTL_OK; } void @@ -104,15 +106,6 @@ ksocknal_init(int interface, ptl_pt_index_t ptl_size, * EXTRA functions follow */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#define SOCKET_I(inode) (&(inode)->u.socket_i) -#endif -static __inline__ struct socket * -socki_lookup(struct inode *inode) -{ - return SOCKET_I(inode); -} - int ksocknal_set_mynid(ptl_nid_t nid) { @@ -132,23 +125,43 @@ ksocknal_set_mynid(ptl_nid_t nid) } void -ksocknal_bind_irq (unsigned int irq, int cpu) +ksocknal_bind_irq (unsigned int irq) { #if (defined(CONFIG_SMP) && CPU_AFFINITY) - char cmdline[64]; - char *argv[] = {"/bin/sh", - "-c", - cmdline, - NULL}; - char *envp[] = {"HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL}; + int bind; + unsigned long flags; + char cmdline[64]; + ksock_irqinfo_t *info; + char *argv[] = {"/bin/sh", + "-c", + cmdline, + NULL}; + char *envp[] = {"HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + LASSERT (irq < NR_IRQS); + if (irq == 0) /* software NIC */ + return; + + info = &ksocknal_data.ksnd_irqinfo[irq]; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + LASSERT (info->ksni_valid); + bind = !info->ksni_bound; + info->ksni_bound = 1; + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + if (!bind) /* bound already */ + return; snprintf (cmdline, sizeof (cmdline), - "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); + "echo %d > /proc/irq/%u/smp_affinity", 1 << info->ksni_sched, irq); printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); + irq, info->ksni_sched, cmdline); /* FIXME: Find a better method of setting IRQ affinity...
*/ @@ -157,57 +170,432 @@ ksocknal_bind_irq (unsigned int irq, int cpu) #endif } +ksock_route_t * +ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, + int irq_affinity, int xchange_nids, int nonagel) +{ + ksock_route_t *route; + + PORTAL_ALLOC (route, sizeof (*route)); + if (route == NULL) + return (NULL); + + atomic_set (&route->ksnr_refcount, 1); + route->ksnr_sharecount = 0; + route->ksnr_peer = NULL; + route->ksnr_timeout = jiffies_64; + route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_buffer_size = buffer_size; + route->ksnr_irq_affinity = irq_affinity; + route->ksnr_xchange_nids = xchange_nids; + route->ksnr_nonagel = nonagel; + route->ksnr_connecting = 0; + route->ksnr_deleted = 0; + route->ksnr_generation = 0; + route->ksnr_conn = NULL; + + return (route); +} + +void +ksocknal_destroy_route (ksock_route_t *route) +{ + LASSERT (route->ksnr_sharecount == 0); + LASSERT (route->ksnr_conn == NULL); + + if (route->ksnr_peer != NULL) + ksocknal_put_peer (route->ksnr_peer); + + PORTAL_FREE (route, sizeof (*route)); +} + +void +ksocknal_put_route (ksock_route_t *route) +{ + CDEBUG (D_OTHER, "putting route[%p] -> "LPX64" (%d)\n", + route, route->ksnr_peer->ksnp_nid, + atomic_read (&route->ksnr_refcount)); + + LASSERT (atomic_read (&route->ksnr_refcount) > 0); + if (!atomic_dec_and_test (&route->ksnr_refcount)) + return; + + ksocknal_destroy_route (route); +} + +ksock_peer_t * +ksocknal_create_peer (ptl_nid_t nid) +{ + ksock_peer_t *peer; + + LASSERT (nid != PTL_NID_ANY); + + PORTAL_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) + return (NULL); + + memset (peer, 0, sizeof (*peer)); + + peer->ksnp_nid = nid; + atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_closing = 0; + INIT_LIST_HEAD (&peer->ksnp_conns); + INIT_LIST_HEAD (&peer->ksnp_routes); + INIT_LIST_HEAD (&peer->ksnp_tx_queue); + + /* Can't unload while peers exist; ensures all I/O has terminated + * before unload attempts */ + PORTAL_MODULE_USE; + atomic_inc (&ksocknal_data.ksnd_npeers); + return (peer); +} + +void +ksocknal_destroy_peer (ksock_peer_t *peer) +{ + CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer); + + LASSERT (atomic_read (&peer->ksnp_refcount) == 0); + LASSERT (list_empty (&peer->ksnp_conns)); + LASSERT (list_empty (&peer->ksnp_routes)); + LASSERT (list_empty (&peer->ksnp_tx_queue)); + + PORTAL_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections and autoconnect routes keep a reference + * on their peer until they are destroyed, so we can be assured + * that _all_ state to do with this peer has been cleaned up when + * its refcount drops to zero. 
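ksocknal_put_route() above (and ksocknal_put_peer()/ksocknal_put_conn() after it) all use the same release discipline: whoever sees atomic_dec_and_test() fire holds the last reference, so the destroy path can never race a concurrent user. Boiled down to a hypothetical obj type:

        /* Sketch only: last-reference-frees, as in ksocknal_put_peer(). */
        struct obj {
                atomic_t o_refcount;
        };

        static void obj_destroy (struct obj *o);        /* assumed teardown */

        static void obj_put (struct obj *o)
        {
                LASSERT (atomic_read (&o->o_refcount) > 0);
                if (!atomic_dec_and_test (&o->o_refcount))
                        return;                 /* other holders remain */
                /* we dropped the last ref: nobody else can touch *o now */
                obj_destroy (o);
        }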
*/ + atomic_dec (&ksocknal_data.ksnd_npeers); + PORTAL_MODULE_UNUSE; +} + +void +ksocknal_put_peer (ksock_peer_t *peer) +{ + CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", + peer, peer->ksnp_nid, + atomic_read (&peer->ksnp_refcount)); + + LASSERT (atomic_read (&peer->ksnp_refcount) > 0); + if (!atomic_dec_and_test (&peer->ksnp_refcount)) + return; + + ksocknal_destroy_peer (peer); +} + +ksock_peer_t * +ksocknal_find_peer_locked (ptl_nid_t nid) +{ + struct list_head *peer_list = ksocknal_nid2peerlist (nid); + struct list_head *tmp; + ksock_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, ksock_peer_t, ksnp_list); + + LASSERT (!peer->ksnp_closing); + LASSERT (!(list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns))); + + if (peer->ksnp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", + peer, nid, atomic_read (&peer->ksnp_refcount)); + return (peer); + } + return (NULL); +} + +ksock_peer_t * +ksocknal_get_peer (ptl_nid_t nid) +{ + ksock_peer_t *peer; + + read_lock (&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked (nid); + if (peer != NULL) /* +1 ref for caller? */ + atomic_inc (&peer->ksnp_refcount); + read_unlock (&ksocknal_data.ksnd_global_lock); + + return (peer); +} + +void +ksocknal_unlink_peer_locked (ksock_peer_t *peer) +{ + LASSERT (!peer->ksnp_closing); + peer->ksnp_closing = 1; + list_del (&peer->ksnp_list); + /* lose peerlist's ref */ + ksocknal_put_peer (peer); +} + +ksock_route_t * +ksocknal_get_route_by_idx (int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + + read_lock (&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + LASSERT (!(list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns))); + + list_for_each (rtmp, &peer->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry (rtmp, ksock_route_t, ksnr_list); + atomic_inc (&route->ksnr_refcount); + read_unlock (&ksocknal_data.ksnd_global_lock); + return (route); + } + } + } + + read_unlock (&ksocknal_data.ksnd_global_lock); + return (NULL); +} + int -ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) +ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, + int nonagle, int xchange_nids, int bind_irq, int share) { unsigned long flags; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_route_t *route; + struct list_head *rtmp; + ksock_route_t *route2; + + if (nid == PTL_NID_ANY) + return (-EINVAL); + + /* Have a brand new peer ready... 
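ksocknal_find_peer_locked() above is a chained-hash lookup: the NID picks a bucket modulo the table size and the (short) chain is scanned linearly, with the caller holding ksnd_global_lock. A condensed restatement, locking elided:

        /* Sketch only: bucket selection + chain scan, no ref taken. */
        static ksock_peer_t *
        find_peer (struct list_head *table, int size, ptl_nid_t nid)
        {
                struct list_head *bucket = &table[(unsigned int)nid % size];
                struct list_head *tmp;
                ksock_peer_t     *peer;

                list_for_each (tmp, bucket) {
                        peer = list_entry (tmp, ksock_peer_t, ksnp_list);
                        if (peer->ksnp_nid == nid)
                                return (peer);  /* caller must take its own ref */
                }
                return (NULL);
        }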
*/ + peer = ksocknal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + + route = ksocknal_create_route (ipaddr, port, bufnob, + nonagle, xchange_nids, bind_irq); + if (route == NULL) { + ksocknal_put_peer (peer); + return (-ENOMEM); + } + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + peer2 = ksocknal_find_peer_locked (nid); + if (peer2 != NULL) { + ksocknal_put_peer (peer); + peer = peer2; + } else { + /* peer table takes existing ref on peer */ + list_add (&peer->ksnp_list, + ksocknal_nid2peerlist (nid)); + } + + route2 = NULL; + if (share) { + /* check for existing route to this NID via this ipaddr */ + list_for_each (rtmp, &peer->ksnp_routes) { + route2 = list_entry (rtmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + } + + if (route2 != NULL) { + ksocknal_put_route (route); + route = route2; + } else { + /* route takes a ref on peer */ + route->ksnr_peer = peer; + atomic_inc (&peer->ksnp_refcount); + /* peer's route list takes existing ref on route */ + list_add (&route->ksnr_list, &peer->ksnp_routes); + } + + route->ksnr_sharecount++; + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (0); +} + +void +ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_conn_t *conn = route->ksnr_conn; + + if (!share) + route->ksnr_sharecount = 0; + else { + route->ksnr_sharecount--; + if (route->ksnr_sharecount != 0) + return; + } + + if (conn != NULL) { + if (!keep_conn) + ksocknal_close_conn_locked (conn); + else { + /* keeping the conn; just dissociate it and route... */ + conn->ksnc_route = NULL; + route->ksnr_conn = NULL; + ksocknal_put_route (route); /* drop conn's ref on route */ + ksocknal_put_conn (conn); /* drop route's ref on conn */ + } + } + + route->ksnr_deleted = 1; + list_del (&route->ksnr_list); + ksocknal_put_route (route); /* drop peer's ref */ + + if (list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns)) { + /* I've just removed the last autoconnect route of a peer + * with no active connections */ + ksocknal_unlink_peer_locked (peer); + } +} + +int +ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int share, int keep_conn) +{ + unsigned long flags; + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_t *peer; + struct list_head *rtmp; + struct list_head *rnxt; + ksock_route_t *route; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid)) + continue; + + list_for_each_safe (rtmp, rnxt, &peer->ksnp_routes) { + route = list_entry (rtmp, ksock_route_t, + ksnr_list); + + if (!(ipaddr == 0 || + route->ksnr_ipaddr == ipaddr)) + continue; + + ksocknal_del_route_locked (route, share, keep_conn); + rc = 0; /* matched something */ + if (share) + goto out; + } + } + } + out: + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (rc); +} + +ksock_conn_t * +ksocknal_get_conn_by_idx (int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; ksock_conn_t *conn; - struct file *file = NULL; - struct socket *sock = NULL; - 
ksock_sched_t *sched = NULL; - unsigned int irq = 0; - struct dst_entry *dst; - int ret; - int idx; - ENTRY; + struct list_head *ctmp; + int i; - LASSERT (!in_interrupt()); + read_lock (&ksocknal_data.ksnd_global_lock); - file = fget(fd); - if (file == NULL) - RETURN(-EINVAL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - ret = -EINVAL; - sock = socki_lookup(file->f_dentry->d_inode); - if (sock == NULL) - GOTO(error, ret); + LASSERT (!(list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns))); - ret = -ENOMEM; - PORTAL_ALLOC(conn, sizeof(*conn)); - if (!conn) - GOTO(error, ret); + list_for_each (ctmp, &peer->ksnp_conns) { + if (index-- > 0) + continue; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - sock->sk->allocation = GFP_NOFS; /* don't call info fs for alloc */ -#else -#warning Zach, Eric: fix me!! -#endif + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); + read_unlock (&ksocknal_data.ksnd_global_lock); + return (conn); + } + } + } - conn->ksnc_file = file; - conn->ksnc_sock = sock; - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; - conn->ksnc_peernid = nid; - atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + read_unlock (&ksocknal_data.ksnd_global_lock); + return (NULL); +} - conn->ksnc_rx_ready = 0; - conn->ksnc_rx_scheduled = 0; - ksocknal_new_packet (conn, 0); +void +ksocknal_get_peer_addr (ksock_conn_t *conn) +{ + struct sockaddr_in sin; + int len = sizeof (sin); + int rc; - INIT_LIST_HEAD (&conn->ksnc_tx_queue); - conn->ksnc_tx_ready = 0; - conn->ksnc_tx_scheduled = 0; + rc = ksocknal_getconnsock (conn); + LASSERT (rc == 0); + + rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, + (struct sockaddr *)&sin, &len, 2); + LASSERT (len <= sizeof (sin)); + ksocknal_putconnsock (conn); + if (rc != 0) { + CERROR ("Error %d getting sock peer IP\n", rc); + return; + } + + conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr); + conn->ksnc_port = ntohs (sin.sin_port); +} + +unsigned int +ksocknal_conn_irq (ksock_conn_t *conn) +{ + int irq = 0; + int rc; + struct dst_entry *dst; + + rc = ksocknal_getconnsock (conn); + LASSERT (rc == 0); + dst = sk_dst_get (conn->ksnc_sock->sk); if (dst != NULL) { if (dst->dev != NULL) { @@ -219,139 +607,417 @@ ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) } dst_release (dst); } + + ksocknal_putconnsock (conn); + return (irq); +} + +ksock_sched_t * +ksocknal_choose_scheduler_locked (unsigned int irq) +{ + ksock_sched_t *sched; + ksock_irqinfo_t *info; + int i; + + LASSERT (irq < NR_IRQS); + info = &ksocknal_data.ksnd_irqinfo[irq]; + + if (irq != 0 && /* hardware NIC */ + info->ksni_valid) { /* already set up */ + return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]); + } + + /* software NIC (irq == 0) || not associated with a scheduler yet. + * Choose the CPU with the fewest connections... */ + sched = &ksocknal_data.ksnd_schedulers[0]; + for (i = 1; i < SOCKNAL_N_SCHED; i++) + if (sched->kss_nconns > + ksocknal_data.ksnd_schedulers[i].kss_nconns) + sched = &ksocknal_data.ksnd_schedulers[i]; + + if (irq != 0) { /* Hardware NIC */ + info->ksni_valid = 1; + info->ksni_sched = sched - ksocknal_data.ksnd_schedulers; + + /* no overflow... 
*/ + LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers); + } + + return (sched); +} + +int +ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, + struct socket *sock, int bind_irq) +{ + unsigned long flags; + ksock_conn_t *conn; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_sched_t *sched; + unsigned int irq; + ksock_tx_t *tx; + int rc; + + /* NB, sock has an associated file since (a) this connection might + * have been created in userland and (b) we need the refcounting so + * that we don't close the socket while I/O is being done on it. */ + LASSERT (sock->file != NULL); + + rc = ksocknal_set_linger (sock); + if (rc != 0) + return (rc); + + peer = NULL; + if (route == NULL) { /* not autoconnect */ + /* Assume this socket connects to a brand new peer */ + peer = ksocknal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + } - write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + PORTAL_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + if (peer != NULL) + ksocknal_put_peer (peer); + return (-ENOMEM); + } - if (irq == 0 || - ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) { - /* This is a software NIC, or we haven't associated it with - * a CPU yet */ + memset (conn, 0, sizeof (*conn)); + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ksocknal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); +#if SOCKNAL_ZC + INIT_LIST_HEAD (&conn->ksnc_tx_pending); +#endif + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + atomic_set (&conn->ksnc_tx_nob, 0); - /* Choose the CPU with the fewest connections */ - sched = ksocknal_data.ksnd_schedulers; - for (idx = 1; idx < SOCKNAL_N_SCHED; idx++) - if (sched->kss_nconns > - ksocknal_data.ksnd_schedulers[idx].kss_nconns) - sched = &ksocknal_data.ksnd_schedulers[idx]; + ksocknal_get_peer_addr (conn); - if (irq != 0) { /* Hardware NIC */ - /* Remember which scheduler we chose */ - idx = sched - ksocknal_data.ksnd_schedulers; + irq = ksocknal_conn_irq (conn); - LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK); + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - if (bind_irq) /* remember if we will bind below */ - idx |= SOCKNAL_IRQ_BOUND; + if (route != NULL) { + /* Autoconnected! */ + LASSERT (route->ksnr_conn == NULL && route->ksnr_connecting); - ksocknal_data.ksnd_irq_info[irq] = idx; + if (route->ksnr_deleted) { + /* This conn was autoconnected, but the autoconnect + * route got deleted while it was being + * established! 
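ksocknal_choose_scheduler_locked() above balances connections across CPUs with a plain linear minimum scan; with one scheduler per online CPU the array is tiny and nothing smarter is needed. The core of it, isolated:

        /* Sketch only: pick the scheduler with the fewest connections. */
        static int pick_least_loaded (const int *nconns, int nsched)
        {
                int best = 0;
                int i;

                for (i = 1; i < nsched; i++)
                        if (nconns[i] < nconns[best])
                                best = i;
                return (best);  /* small enough to fit the 6-bit ksni_sched field */
        }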
*/ + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, + flags); + PORTAL_FREE (conn, sizeof (*conn)); + return (-ESTALE); } - } else { - /* This is a hardware NIC, associated with a CPU */ - idx = ksocknal_data.ksnd_irq_info[irq]; - /* Don't bind again if we've bound already */ - if ((idx & SOCKNAL_IRQ_BOUND) != 0) - bind_irq = 0; - - sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK]; + + /* associate conn/route for auto-reconnect */ + route->ksnr_conn = conn; + atomic_inc (&conn->ksnc_refcount); + conn->ksnc_route = route; + atomic_inc (&route->ksnr_refcount); + route->ksnr_connecting = 0; + + route->ksnr_generation++; + route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + + peer = route->ksnr_peer; + } else { + /* Not an autoconnected connection; see if there is an + * existing peer for this NID */ + peer2 = ksocknal_find_peer_locked (nid); + if (peer2 != NULL) { + ksocknal_put_peer (peer); + peer = peer2; + } else { + list_add (&peer->ksnp_list, + ksocknal_nid2peerlist (nid)); + /* peer list takes over existing ref */ + } } - sched->kss_nconns++; - conn->ksnc_scheduler = sched; + LASSERT (!peer->ksnp_closing); - list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist); + conn->ksnc_peer = peer; + atomic_inc (&peer->ksnp_refcount); - write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + list_add (&conn->ksnc_list, &peer->ksnp_conns); + atomic_inc (&conn->ksnc_refcount); - if (bind_irq && /* irq binding required */ - irq != 0) /* hardware NIC */ - ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers); + sched = ksocknal_choose_scheduler_locked (irq); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; - /* NOW it's safe to get called back when socket is ready... */ + /* NB my callbacks block while I hold ksnd_global_lock */ sock->sk->sk_user_data = conn; sock->sk->sk_data_ready = ksocknal_data_ready; sock->sk->sk_write_space = ksocknal_write_space; - /* ...which I call right now to get things going */ + /* Take all the packets blocking for a connection. + * NB, it might be nicer to share these blocked packets among any + * other connections that are becoming established, however that + * confuses the normal packet launching operation, which selects a + * connection and queues the packet on it without needing an + * exclusive lock on ksnd_global_lock. */ + while (!list_empty (&peer->ksnp_tx_queue)) { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + list_del (&tx->tx_list); + ksocknal_queue_tx_locked (tx, conn); + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + if (bind_irq) /* irq binding required */ + ksocknal_bind_irq (irq); + + /* Call the callbacks right now to get things going. */ ksocknal_data_ready (sock->sk, 0); ksocknal_write_space (sock->sk); CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", - conn, conn->ksnc_peernid); + conn, conn->ksnc_peer->ksnp_nid); - /* Can't unload while connection active */ - PORTAL_MODULE_USE; - RETURN(0); + ksocknal_put_conn (conn); + return (0); +} -error: - fput(file); - return (ret); +void +ksocknal_close_conn_locked (ksock_conn_t *conn) +{ + /* This just does the immediate housekeeping, and queues the + * connection for the reaper to terminate.
+ * Caller holds ksnd_global_lock exclusively in irq context */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + + LASSERT (!conn->ksnc_closing); + conn->ksnc_closing = 1; + atomic_inc (&ksocknal_data.ksnd_nclosing_conns); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... */ + LASSERT (!route->ksnr_connecting && + !route->ksnr_deleted); + + route->ksnr_conn = NULL; + conn->ksnc_route = NULL; + + ksocknal_put_route (route); /* drop conn's ref on route */ + ksocknal_put_conn (conn); /* drop route's ref on conn */ + } + + /* ksnd_deathrow_conns takes over peer's ref */ + list_del (&conn->ksnc_list); + + if (list_empty (&peer->ksnp_conns) && + list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * non-autoconnecting peer */ + ksocknal_unlink_peer_locked (peer); + } + + spin_lock (&ksocknal_data.ksnd_reaper_lock); + + list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); + if (waitqueue_active (&ksocknal_data.ksnd_reaper_waitq)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock (&ksocknal_data.ksnd_reaper_lock); } -/* Passing in a zero nid will close all connections */ int -ksocknal_close_sock(ptl_nid_t nid) +ksocknal_close_conn_unlocked (ksock_conn_t *conn) { - unsigned long flags; - ksock_conn_t *conn; - LIST_HEAD (death_row); - struct list_head *tmp; + unsigned long flags; + int did_it = 0; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + if (!conn->ksnc_closing) { + did_it = 1; + ksocknal_close_conn_locked (conn); + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - LASSERT (!in_interrupt()); - write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + return (did_it); +} - if (nid == 0) { /* close ALL connections */ - /* insert 'death row' into the socket list... */ - list_add (&death_row, &ksocknal_data.ksnd_socklist); - /* ...extract and reinitialise the socket list itself... */ - list_del_init (&ksocknal_data.ksnd_socklist); - /* ...and voila, death row is the proud owner of all conns */ - } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) { +void +ksocknal_terminate_conn (ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + unsigned long flags; + + /* serialise with callbacks */ + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + LASSERT (conn->ksnc_closing); + + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + conn->ksnc_sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + conn->ksnc_sock->sk->sk_user_data = NULL; + + conn->ksnc_scheduler->kss_nconns--; + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. 
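ksocknal_terminate_conn() above is careful to *restore* the socket's saved sk_data_ready/sk_write_space rather than install no-ops, because the socket can outlive this module. The disarm sequence, condensed (same field names as the patch):

        /* Sketch only: detach a conn from its socket callbacks. */
        static void disarm_callbacks (ksock_conn_t *conn)
        {
                unsigned long flags;

                /* exclusive lock serialises with callbacks, which take it shared */
                write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);

                conn->ksnc_sock->sk->sk_data_ready  = conn->ksnc_saved_data_ready;
                conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space;
                conn->ksnc_sock->sk->sk_user_data   = NULL;  /* late callbacks NOOP */

                write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
        }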
*/ + ksocknal_putconnsock (conn); +} - conn = list_entry (tmp, ksock_conn_t, ksnc_list); +void +ksocknal_destroy_conn (ksock_conn_t *conn) +{ + /* Final coup-de-grace of the reaper */ + CDEBUG (D_NET, "connection %p\n", conn); - if (conn->ksnc_peernid == nid) { - list_del (&conn->ksnc_list); - list_add (&conn->ksnc_list, &death_row); - break; - } + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (conn->ksnc_route == NULL); + LASSERT (!conn->ksnc_tx_scheduled); + LASSERT (!conn->ksnc_rx_scheduled); +#if SOCKNAL_ZC + LASSERT (list_empty (&conn->ksnc_tx_pending)); +#endif + /* complete queued packets */ + while (!list_empty (&conn->ksnc_tx_queue)) { + ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + CERROR ("Deleting packet type %d len %d ("LPX64"->"LPX64")\n", + NTOH__u32 (tx->tx_hdr->type), + NTOH__u32 (PTL_HDR_LENGTH(tx->tx_hdr)), + NTOH__u64 (tx->tx_hdr->src_nid), + NTOH__u64 (tx->tx_hdr->dest_nid)); + + list_del (&tx->tx_list); + ksocknal_tx_done (tx, 0); } - write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_BODY: + lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie); + break; + case SOCKNAL_RX_BODY_FWD: + ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); + break; + case SOCKNAL_RX_HEADER: + case SOCKNAL_RX_SLOP: + break; + default: + LBUG (); + break; + } - if (nid && list_empty (&death_row)) - return (-ENOENT); + ksocknal_put_peer (conn->ksnc_peer); - while (!list_empty (&death_row)) { - conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); - list_del (&conn->ksnc_list); + PORTAL_FREE (conn, sizeof (*conn)); + atomic_dec (&ksocknal_data.ksnd_nclosing_conns); +} - /* NB I _have_ to restore the callback, rather than storing - * a noop, since the socket could survive past this module - * being unloaded!! */ - conn->ksnc_sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space; +void +ksocknal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; - /* OK; no more callbacks, but they could be in progress now, - * so wait for them to complete... 
*/ - write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peer->ksnp_nid, + atomic_read (&conn->ksnc_refcount)); - /* ...however if I get the lock before a callback gets it, - * this will make them noop - */ - conn->ksnc_sock->sk->sk_user_data = NULL; + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + if (!atomic_dec_and_test (&conn->ksnc_refcount)) + return; + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - /* And drop the scheduler's connection count while I've got - * the exclusive lock */ - conn->ksnc_scheduler->kss_nconns--; + list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + if (waitqueue_active (&ksocknal_data.ksnd_reaper_waitq)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); - write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock, - flags); + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); +} - ksocknal_put_conn (conn); /* drop ref for ksnd_socklist */ +int +ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) +{ + unsigned long flags; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + ksock_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; } - return (0); + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { + + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid)) + continue; + + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + + conn = list_entry (ctmp, ksock_conn_t, + ksnc_list); + + if (!(ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr)) + continue; + + rc = 0; + ksocknal_close_conn_locked (conn); + } + } + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (rc); } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) @@ -370,13 +1036,20 @@ struct tcp_opt *sock2tcp_opt(struct sock *sk) void ksocknal_push_conn (ksock_conn_t *conn) { - struct sock *sk = conn->ksnc_sock->sk; - struct tcp_opt *tp = sock2tcp_opt(sk); + struct sock *sk; + struct tcp_opt *tp; int nonagle; int val = 1; int rc; mm_segment_t oldmm; + rc = ksocknal_getconnsock (conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = sock2tcp_opt(sk); + lock_sock (sk); nonagle = tp->nonagle; tp->nonagle = 1; @@ -394,47 +1067,33 @@ ksocknal_push_conn (ksock_conn_t *conn) lock_sock (sk); tp->nonagle = nonagle; release_sock (sk); + + ksocknal_putconnsock (conn); } -/* Passing in a zero nid pushes all connections */ -int -ksocknal_push_sock (ptl_nid_t nid) +void +ksocknal_push_peer (ksock_peer_t *peer) { - ksock_conn_t *conn; - struct list_head *tmp; - int index; - int i; - - if (nid != 0) { - conn = ksocknal_get_conn (nid); - - if (conn == NULL) - return (-ENOENT); - - ksocknal_push_conn (conn); - ksocknal_put_conn (conn); - - return (0); - } + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; - /* NB we can't remove connections from the socket list so we have to - * cope with them being removed from under us... 
- */ for (index = 0; ; index++) { - read_lock (&ksocknal_data.ksnd_socklist_lock); + read_lock (&ksocknal_data.ksnd_global_lock); i = 0; conn = NULL; - list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + list_for_each (tmp, &peer->ksnp_conns) { if (i++ == index) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); // take a ref + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); break; } } - read_unlock (&ksocknal_data.ksnd_socklist_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); if (conn == NULL) break; @@ -442,80 +1101,57 @@ ksocknal_push_sock (ptl_nid_t nid) ksocknal_push_conn (conn); ksocknal_put_conn (conn); } - - return (0); } -ksock_conn_t * -ksocknal_get_conn (ptl_nid_t nid) +int +ksocknal_push (ptl_nid_t nid) { - struct list_head *tmp; - ksock_conn_t *conn; - - PROF_START(conn_list_walk); - - read_lock (&ksocknal_data.ksnd_socklist_lock); - - list_for_each(tmp, &ksocknal_data.ksnd_socklist) { - - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - - if (conn->ksnc_peernid == nid) { - /* caller is referencing */ - atomic_inc (&conn->ksnc_refcount); - - read_unlock (&ksocknal_data.ksnd_socklist_lock); + ksock_peer_t *peer; + struct list_head *tmp; + int index; + int i; + int j; + int rc = -ENOENT; - CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", - conn, nid, atomic_read (&conn->ksnc_refcount)); + if (nid != PTL_NID_ANY) { + peer = ksocknal_get_peer (nid); - PROF_FINISH(conn_list_walk); - return (conn); + if (peer != NULL) { + rc = 0; + ksocknal_push_peer (peer); + ksocknal_put_peer (peer); } + return (rc); } - read_unlock (&ksocknal_data.ksnd_socklist_lock); - - CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", - nid); - PROF_FINISH(conn_list_walk); - return (NULL); -} - -void -ksocknal_close_conn (ksock_conn_t *conn) -{ - CDEBUG (D_NET, "connection [%p] closed \n", conn); - - fput (conn->ksnc_file); - PORTAL_FREE (conn, sizeof (*conn)); - - /* One less connection keeping us hanging on */ - PORTAL_MODULE_UNUSE; -} + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + for (j = 0; ; j++) { + read_lock (&ksocknal_data.ksnd_global_lock); -void -_ksocknal_put_conn (ksock_conn_t *conn) -{ - unsigned long flags; + index = 0; + peer = NULL; - CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + if (index++ == j) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + atomic_inc (&peer->ksnp_refcount); + break; + } + } - /* "But what is the black spot, captain?" I asked. - * "That's a summons, mate..." 
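The rewritten ksocknal_push_peer() above is the pattern this patch uses whenever it must walk a list that can change underneath it: under the read lock, count down to the index'th element, pin it with a reference, drop the lock, operate, then rescan from index+1. Entries may vanish between passes but the walk still terminates. Schematically:

        /* Sketch only: ref-and-drop-lock iteration over a mutable list. */
        static void push_all_conns (ksock_peer_t *peer)
        {
                struct list_head *tmp;
                ksock_conn_t     *conn;
                int               index;
                int               i;

                for (index = 0; ; index++) {
                        conn = NULL;
                        i = 0;

                        read_lock (&ksocknal_data.ksnd_global_lock);
                        list_for_each (tmp, &peer->ksnp_conns)
                                if (i++ == index) {
                                        conn = list_entry (tmp, ksock_conn_t, ksnc_list);
                                        atomic_inc (&conn->ksnc_refcount); /* pin */
                                        break;
                                }
                        read_unlock (&ksocknal_data.ksnd_global_lock);

                        if (conn == NULL)       /* walked off the end */
                                return;

                        ksocknal_push_conn (conn);      /* safe: we hold a ref */
                        ksocknal_put_conn (conn);
                }
        }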
*/ + read_unlock (&ksocknal_data.ksnd_global_lock); - LASSERT (atomic_read (&conn->ksnc_refcount) == 0); - LASSERT (conn->ksnc_sock->sk->sk_data_ready != ksocknal_data_ready); - LASSERT (conn->ksnc_sock->sk->sk_write_space != ksocknal_write_space); - LASSERT (conn->ksnc_sock->sk->sk_user_data == NULL); - LASSERT (!conn->ksnc_rx_scheduled); - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + if (peer != NULL) { + rc = 0; + ksocknal_push_peer (peer); + ksocknal_put_peer (peer); + } + } - list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list); - wake_up (&ksocknal_data.ksnd_reaper_waitq); + } - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + return (rc); } int @@ -526,13 +1162,68 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) LASSERT (data != NULL); switch(data->ioc_nal_cmd) { + case NAL_CMD_GET_AUTOCONN: { + ksock_route_t *route = ksocknal_get_route_by_idx (data->ioc_count); + + if (route == NULL) + rc = -ENOENT; + else { + rc = 0; + data->ioc_nid = route->ksnr_peer->ksnp_nid; + data->ioc_id = route->ksnr_ipaddr; + data->ioc_misc = route->ksnr_port; + data->ioc_count = route->ksnr_generation; + data->ioc_size = route->ksnr_buffer_size; + data->ioc_wait = route->ksnr_sharecount; + data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) | + (route->ksnr_xchange_nids ? 2 : 0) | + (route->ksnr_irq_affinity ? 4 : 0); + ksocknal_put_route (route); + } + break; + } + case NAL_CMD_ADD_AUTOCONN: { + rc = ksocknal_add_route (data->ioc_nid, data->ioc_id, + data->ioc_misc, data->ioc_size, + (data->ioc_flags & 1) != 0, + (data->ioc_flags & 2) != 0, + (data->ioc_flags & 4) != 0, + (data->ioc_flags & 8) != 0); + break; + } + case NAL_CMD_DEL_AUTOCONN: { + rc = ksocknal_del_route (data->ioc_nid, data->ioc_id, + (data->ioc_flags & 1) != 0, + (data->ioc_flags & 2) != 0); + break; + } + case NAL_CMD_GET_CONN: { + ksock_conn_t *conn = ksocknal_get_conn_by_idx (data->ioc_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + data->ioc_nid = conn->ksnc_peer->ksnp_nid; + data->ioc_id = conn->ksnc_ipaddr; + data->ioc_misc = conn->ksnc_port; + ksocknal_put_conn (conn); + } + break; + } case NAL_CMD_REGISTER_PEER_FD: { - rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd, - data->ioc_flags); + struct socket *sock = sockfd_lookup (data->ioc_fd, &rc); + + if (sock != NULL) { + rc = ksocknal_create_conn (data->ioc_nid, NULL, + sock, data->ioc_flags); + if (rc != 0) + fput (sock->file); + } break; } case NAL_CMD_CLOSE_CONNECTION: { - rc = ksocknal_close_sock(data->ioc_nid); + rc = ksocknal_close_conn (data->ioc_nid, data->ioc_id); break; } case NAL_CMD_REGISTER_MYNID: { @@ -540,7 +1231,7 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) break; } case NAL_CMD_PUSH_CONNECTION: { - rc = ksocknal_push_sock (data->ioc_nid); + rc = ksocknal_push (data->ioc_nid); break; } } @@ -568,6 +1259,7 @@ ksocknal_free_buffers (void) SOCKNAL_LARGE_FWD_NMSGS)); } + LASSERT (ksocknal_data.ksnd_active_ltxs == 0); if (ksocknal_data.ksnd_ltxs != NULL) PORTAL_FREE (ksocknal_data.ksnd_ltxs, sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + @@ -576,9 +1268,13 @@ ksocknal_free_buffers (void) if (ksocknal_data.ksnd_schedulers != NULL) PORTAL_FREE (ksocknal_data.ksnd_schedulers, sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); + + PORTAL_FREE (ksocknal_data.ksnd_peers, + sizeof (struct list_head) * + ksocknal_data.ksnd_peer_hash_size); } -void __exit +void /*__exit*/ ksocknal_module_fini (void) { int i; @@ -601,10 +1297,15 @@ ksocknal_module_fini (void) /* fall through */ case 
SOCKNAL_INIT_DATA: - /* Module refcount only gets to zero when all connections + /* Module refcount only gets to zero when all peers * have been closed so all lists must be empty */ - LASSERT (list_empty (&ksocknal_data.ksnd_socklist)); - LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list)); + LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0); + LASSERT (ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); + } + LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); @@ -623,6 +1324,7 @@ ksocknal_module_fini (void) /* flag threads to terminate; wake and wait for them to die */ ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); wake_up_all (&ksocknal_data.ksnd_reaper_waitq); for (i = 0; i < SOCKNAL_N_SCHED; i++) @@ -677,8 +1379,16 @@ ksocknal_module_init (void) memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ - INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist); - rwlock_init(&ksocknal_data.ksnd_socklist_lock); + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (ksocknal_data.ksnd_peers, + sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + RETURN (-ENOMEM); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); ksocknal_data.ksnd_nal_cb = &ksocknal_lib; spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); @@ -697,19 +1407,26 @@ ksocknal_module_init (void) init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); spin_lock_init (&ksocknal_data.ksnd_reaper_lock); - INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list); + INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); - memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED, - sizeof (ksocknal_data.ksnd_irq_info)); + spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_autoconnectd_waitq); + + /* NB memset above zeros whole of ksocknal_data, including + * ksocknal_data.ksnd_irqinfo[all].ksni_valid */ /* flag lists/ptrs/locks initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * SOCKNAL_N_SCHED); - if (ksocknal_data.ksnd_schedulers == NULL) + if (ksocknal_data.ksnd_schedulers == NULL) { + ksocknal_module_fini (); RETURN(-ENOMEM); + } for (i = 0; i < SOCKNAL_N_SCHED; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; @@ -723,7 +1440,7 @@ ksocknal_module_init (void) init_waitqueue_head (&kss->kss_waitq); } - CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), + CDEBUG (D_MALLOC, "ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, @@ -740,6 +1457,7 @@ ksocknal_module_init (void) for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; + ltx->ltx_tx.tx_hdr = <x->ltx_hdr; ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
&ksocknal_data.ksnd_idle_ltx_list : &ksocknal_data.ksnd_idle_nblk_ltx_list; @@ -767,9 +1485,18 @@ ksocknal_module_init (void) } } + for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) { + rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); + if (rc != 0) { + CERROR("Can't spawn socknal autoconnectd: %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + } + rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { - CERROR("Can't spawn socknal reaper: %d\n", rc); + CERROR ("Can't spawn socknal reaper: %d\n", rc); ksocknal_module_fini (); RETURN (rc); } diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index 47ccd62..7a13396 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -25,9 +25,7 @@ */ #define DEBUG_PORTAL_ALLOC -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif +#define EXPORT_SYMTAB #include #include @@ -52,6 +50,7 @@ #include #include #include +#include #define DEBUG_SUBSYSTEM S_SOCKNAL @@ -59,9 +58,27 @@ #include #include -#define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ +#if CONFIG_SMP +# define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ +#else +# define SOCKNAL_N_SCHED 1 /* # socknal schedulers */ +#endif +#define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ + +#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ +#define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ + +#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */ + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ + +#if PTL_LARGE_MTU +# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */ +#else +# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ +#endif -#define SOCKNAL_MAX_FWD_PAYLOAD PTL_MTU /* biggest payload I can forward */ +#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ #define SOCKNAL_NLTXS 128 /* # normal transmit messages */ #define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ @@ -78,6 +95,10 @@ #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +# define jiffies_64 jiffies +#endif + #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) # define sk_data_ready data_ready # define sk_write_space write_space @@ -108,10 +129,17 @@ typedef struct /* per scheduler state */ } ksock_sched_t; typedef struct { + int ksni_valid:1; /* been set yet? */ + int ksni_bound:1; /* bound to a cpu yet? 
*/ + int ksni_sched:6; /* which scheduler (assumes < 64) */ +} ksock_irqinfo_t; + +typedef struct { int ksnd_init; /* initialisation state */ - struct list_head ksnd_socklist; /* all my connections */ - rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ + struct list_head *ksnd_peers; /* hash table of all my known peers */ + int ksnd_peer_hash_size; /* size of ksnd_peers */ nal_cb_t *ksnd_nal_cb; spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ @@ -119,7 +147,10 @@ typedef struct { atomic_t ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ ksock_sched_t *ksnd_schedulers; /* scheduler state */ - + + atomic_t ksnd_npeers; /* total # peers extant */ + atomic_t ksnd_nclosing_conns; /* # closed conns extant */ + kpr_router_t ksnd_router; /* THE router */ void *ksnd_fmbs; /* all the pre-allocated FMBs */ @@ -131,11 +162,21 @@ typedef struct { struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + int ksnd_active_ltxs; /* #active ltxs */ - struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ - wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + struct list_head ksnd_deathrow_conns; /* conns to be closed */ + struct list_head ksnd_zombie_conns; /* conns to be freed */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleep here */ spinlock_t ksnd_reaper_lock; /* serialise */ - unsigned char ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */ + + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + struct list_head ksnd_autoconnectd_routes; /* routes waiting to be connected */ + wait_queue_head_t ksnd_autoconnectd_waitq; /* autoconnectds sleep here */ + spinlock_t ksnd_autoconnectd_lock; /* serialise */ + + ksock_irqinfo_t ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */ } ksock_nal_data_t; #define SOCKNAL_INIT_NOTHING 0 @@ -143,10 +184,6 @@ typedef struct { #define SOCKNAL_INIT_PTL 2 #define SOCKNAL_INIT_ALL 3 -#define SOCKNAL_IRQ_BOUND 0x80 /* flag we _did_ bind already */ -#define SOCKNAL_IRQ_SCHED_MASK 0x7f /* we assume < 127 CPUs */ -#define SOCKNAL_IRQ_UNASSIGNED 0xff /* flag unassigned */ - /* A packet just assembled for transmission is represented by 1 or more * struct iovec fragments and 0 or more ptl_kiov_t fragments. Forwarded * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 @@ -161,17 +198,24 @@ typedef struct { * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. 
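ksock_irqinfo_t above retires the old hand-packed byte (SOCKNAL_IRQ_BOUND / SOCKNAL_IRQ_SCHED_MASK / SOCKNAL_IRQ_UNASSIGNED) in favour of C bit-fields, and relies on the containing structure being zeroed so every entry starts out invalid. The idea in isolation, with illustrative names:

        /* Sketch only: per-IRQ scheduler binding, one small struct per IRQ. */
        typedef struct {
                int valid:1;    /* scheduler assigned yet? */
                int bound:1;    /* smp_affinity written yet? */
                int sched:6;    /* scheduler index; assumes < 64 CPUs */
        } irqinfo_t;

        static irqinfo_t irqinfo[NR_IRQS];      /* zeroed => all invalid */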
*/ +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ + typedef struct /* transmit packet */ { struct list_head tx_list; /* queue on conn for transmission etc */ + __u64 tx_deadline; /* when (in jiffies) tx times out */ char tx_isfwd; /* forwarding / sourced here */ int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ int tx_niov; /* # packet iovec frags */ struct iovec *tx_iov; /* packet iovec frags */ int tx_nkiov; /* # packet page frags */ ptl_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + ptl_hdr_t *tx_hdr; /* packet header (for debug only) */ #if SOCKNAL_ZC - ksock_sched_t *tx_sched; /* who to wake on callback */ zccd_t tx_zccd; /* zero copy callback descriptor */ #endif } ksock_tx_t; @@ -207,8 +251,7 @@ typedef struct /* locally transmitted packet */ /* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ /* NB list_entry() is used here as convenient macro for calculating a - * pointer to a struct from the address of a member. - */ + * pointer to a struct from the address of a member. */ typedef struct /* Kernel portals Socket Forwarding message buffer */ { /* (socknal->router) */ @@ -216,6 +259,7 @@ typedef struct /* Kernel portals Socket Forward kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ int fmb_npages; /* # pages allocated */ ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct ksock_peer *fmb_peer; /* peer received from */ struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; } ksock_fmb_t; @@ -234,19 +278,23 @@ typedef union { #define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ #define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ -typedef struct +typedef struct ksock_conn { - struct list_head ksnc_list; /* stash on global socket list */ - struct file *ksnc_file; /* socket filp */ + struct ksock_peer *ksnc_peer; /* owning peer */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer's conn list */ struct socket *ksnc_sock; /* actual socket */ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ void *ksnc_saved_write_space; /* socket's original write_space() callback */ - ptl_nid_t ksnc_peernid; /* who's on the other end */ atomic_t ksnc_refcount; /* # users */ ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ - + __u32 ksnc_ipaddr; /* peer's IP */ + int ksnc_port; /* peer's port */ + int ksnc_closing; /* being shut down */ + /* READER */ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + __u64 ksnc_rx_deadline; /* when receive times out */ int ksnc_rx_ready; /* data ready to read */ int ksnc_rx_scheduled; /* being progressed */ int ksnc_rx_state; /* what is being read */ @@ -263,37 +311,104 @@ typedef struct /* WRITER */ struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ +#if SOCKNAL_ZC + struct list_head ksnc_tx_pending; /* zc packets pending callback */ +#endif + atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ - } ksock_conn_t; -extern int ksocknal_add_sock (ptl_nid_t nid, int fd, int client); -extern int ksocknal_close_sock(ptl_nid_t nid); -extern int ksocknal_set_mynid(ptl_nid_t nid); -extern int ksocknal_push_sock(ptl_nid_t nid); -extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t 
nid); -extern void _ksocknal_put_conn (ksock_conn_t *conn); -extern void ksocknal_close_conn (ksock_conn_t *conn); +typedef struct ksock_route +{ + struct list_head ksnr_list; /* chain on peer route list */ + struct list_head ksnr_connect_list; /* chain on autoconnect list */ + struct ksock_peer *ksnr_peer; /* owning peer */ + atomic_t ksnr_refcount; /* # users */ + int ksnr_sharecount; /* lconf usage counter */ + __u64 ksnr_timeout; /* when reconnection can happen next */ + unsigned int ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_ipaddr; /* an IP address for this peer */ + int ksnr_port; /* port to connect to */ + int ksnr_buffer_size; /* size of socket buffers */ + unsigned int ksnr_irq_affinity:1; /* set affinity? */ + unsigned int ksnr_xchange_nids:1; /* do hello protocol? */ + unsigned int ksnr_nonagel:1; /* disable nagle? */ + unsigned int ksnr_connecting; /* autoconnect in progress? */ + unsigned int ksnr_deleted; /* been removed from peer? */ + int ksnr_generation; /* connection incarnation # */ + ksock_conn_t *ksnr_conn; /* NULL/active connection */ +} ksock_route_t; + +typedef struct ksock_peer +{ + struct list_head ksnp_list; /* stash on global peer list */ + ptl_nid_t ksnp_nid; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_closing; /* being closed */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ +} ksock_peer_t; -static inline void -ksocknal_put_conn (ksock_conn_t *conn) + + +extern nal_cb_t ksocknal_lib; +extern ksock_nal_data_t ksocknal_data; + +static inline struct list_head * +ksocknal_nid2peerlist (ptl_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return (&ksocknal_data.ksnd_peers [hash]); +} + +static inline int +ksocknal_getconnsock (ksock_conn_t *conn) { - CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", - conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + int rc = -ESHUTDOWN; + + read_lock (&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + rc = 0; + get_file (conn->ksnc_sock->file); + } + read_unlock (&ksocknal_data.ksnd_global_lock); + + return (rc); +} - if (atomic_dec_and_test (&conn->ksnc_refcount)) - _ksocknal_put_conn (conn); +static inline void +ksocknal_putconnsock (ksock_conn_t *conn) +{ + fput (conn->ksnc_sock->file); } +extern void ksocknal_put_route (ksock_route_t *route); +extern void ksocknal_put_peer (ksock_peer_t *peer); +extern ksock_peer_t *ksocknal_find_peer_locked (ptl_nid_t nid); +extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid); +extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, + int single, int keep_conn); +extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, + struct socket *sock, int bind_irq); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn); +extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn); +extern void ksocknal_terminate_conn (ksock_conn_t *conn); +extern void ksocknal_destroy_conn (ksock_conn_t *conn); +extern void ksocknal_put_conn (ksock_conn_t *conn); +extern int ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr); + +extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); +extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern void ksocknal_fmb_callback (void *arg, int error); extern int ksocknal_thread_start (int (*fn)(void 
*arg), void *arg); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); -extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); extern int ksocknal_scheduler (void *arg); -extern int ksocknal_reaper (void *arg); extern void ksocknal_data_ready(struct sock *sk, int n); extern void ksocknal_write_space(struct sock *sk); - - -extern nal_cb_t ksocknal_lib; -extern ksock_nal_data_t ksocknal_data; +extern int ksocknal_autoconnectd (void *arg); +extern int ksocknal_reaper (void *arg); +extern int ksocknal_set_linger (struct socket *sock); diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index 552ea50..656a0c5 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -25,13 +25,10 @@ #include "socknal.h" -atomic_t ksocknal_packets_received; -atomic_t ksocknal_packets_launched; -atomic_t ksocknal_packets_being_sent; - +int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT; #if SOCKNAL_ZC int ksocknal_do_zc = 1; -int ksocknal_zc_min_frag = 2048; +int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; #endif /* @@ -127,7 +124,7 @@ ksocknal_sti(nal_cb_t *nal, unsigned long *flags) int ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) { - /* I would guess that if ksocknal_get_conn(nid) == NULL, + /* I would guess that if ksocknal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ if ( nal->ni.nid == nid ) { *dist = 0; @@ -151,6 +148,7 @@ ksocknal_get_ltx (int may_block) ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); list_del (<x->ltx_tx.tx_list); + ksocknal_data.ksnd_active_ltxs++; break; } @@ -159,6 +157,7 @@ ksocknal_get_ltx (int may_block) ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); list_del (<x->ltx_tx.tx_list); + ksocknal_data.ksnd_active_ltxs++; } break; } @@ -175,6 +174,24 @@ ksocknal_get_ltx (int may_block) return (ltx); } +void +ksocknal_put_ltx (ksock_ltx_t *ltx) +{ + unsigned long flags; + + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + ksocknal_data.ksnd_active_ltxs--; + list_add_tail (<x->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && + waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) + wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); +} + #if SOCKNAL_ZC struct page * ksocknal_kvaddr_to_page (unsigned long vaddr) @@ -202,11 +219,15 @@ ksocknal_kvaddr_to_page (unsigned long vaddr) #endif int -ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) { + struct socket *sock = conn->ksnc_sock; struct iovec *iov = tx->tx_iov; int fragsize = iov->iov_len; unsigned long vaddr = (unsigned long)iov->iov_base; + int more = !list_empty (&conn->ksnc_tx_queue) | + (tx->tx_niov > 1) | + (tx->tx_nkiov > 1); #if SOCKNAL_ZC int offset = vaddr & (PAGE_SIZE - 1); int zcsize = MIN (fragsize, PAGE_SIZE - offset); @@ -216,9 +237,8 @@ ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) /* NB we can't trust socket ops to either consume our iovs * or leave them alone, so we only send 1 frag at a time. 
*/ - LASSERT (fragsize <= tx->tx_nob); + LASSERT (fragsize <= tx->tx_resid); LASSERT (tx->tx_niov > 0); - more |= (tx->tx_niov > 1); #if SOCKNAL_ZC if (ksocknal_do_zc && @@ -230,7 +250,10 @@ ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", (void *)vaddr, page, page_address(page), offset, zcsize); - more |= (zcsize < fragsize); + if (fragsize > zcsize) { + more = 1; + fragsize = zcsize; + } rc = tcp_sendpage_zccd(sock, page, offset, zcsize, more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, @@ -260,37 +283,39 @@ ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) if (rc <= 0) return (rc); - tx->tx_nob -= rc; + tx->tx_resid -= rc; - if (rc < fragsize) { - /* didn't send whole frag */ + if (rc < iov->iov_len) { + /* didn't send whole iov entry... */ iov->iov_base = (void *)(vaddr + rc); - iov->iov_len = fragsize - rc; - return (-EAGAIN); + iov->iov_len -= rc; + /* ...but did we send everything we tried to send? */ + return ((rc == fragsize) ? 1 : -EAGAIN); } - /* everything went */ - LASSERT (rc == fragsize); tx->tx_iov++; tx->tx_niov--; return (1); } int -ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { + struct socket *sock = conn->ksnc_sock; ptl_kiov_t *kiov = tx->tx_kiov; int fragsize = kiov->kiov_len; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; + int more = !list_empty (&conn->ksnc_tx_queue) | + (tx->tx_nkiov > 1); int rc; /* NB we can't trust socket ops to either consume our iovs * or leave them alone, so we only send 1 frag at a time. */ - LASSERT (fragsize <= tx->tx_nob); + LASSERT (fragsize <= tx->tx_resid); LASSERT (offset + fragsize <= PAGE_SIZE); + LASSERT (tx->tx_niov == 0); LASSERT (tx->tx_nkiov > 0); - more |= (tx->tx_nkiov > 1); #if SOCKNAL_ZC if (ksocknal_do_zc && @@ -330,8 +355,8 @@ ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) if (rc <= 0) return (rc); - tx->tx_nob -= rc; - + tx->tx_resid -= rc; + if (rc < fragsize) { /* didn't send whole frag */ kiov->kiov_offset = offset + rc; @@ -347,29 +372,52 @@ ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) } int -ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more) +ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) { - int rc; - int sent_some = 0; + /* Return 0 on success, < 0 on error. + * caller checks tx_resid to determine progress/completion */ + int rc; ENTRY; - LASSERT (!in_interrupt()); + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (ksocknal_data.ksnd_stall_tx * HZ); + } + + rc = ksocknal_getconnsock (conn); + if (rc != 0) + return (rc); for (;;) { + LASSERT (tx->tx_resid != 0); + + if (conn->ksnc_closing) { + rc = -ESHUTDOWN; + break; + } + if (tx->tx_niov != 0) - rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0); + rc = ksocknal_send_iov (conn, tx); else - rc = ksocknal_send_kiov (sock, tx, more); - - /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */ - if (rc <= 0) /* error or partial send */ - RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc); - - if (tx->tx_nob == 0) /* sent everything */ - RETURN (0); + rc = ksocknal_send_kiov (conn, tx); + + if (rc <= 0) { /* error or socket full? 
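 * Either way we stop and let the caller sort it out: in
 * ksocknal_process_transmit() the outcome maps roughly to
 *
 *      rc != 0                  => close the conn (first closer logs)
 *      rc == 0, tx_resid != 0   => requeue tx, wait for write_space
 *      rc == 0, tx_resid == 0   => tx fully launched
 *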
*/ + /* NB: rc == 0 and rc == -EAGAIN both mean try + * again later (linux stack returns -EAGAIN for + * this, but Adaptech TOE returns 0) */ + if (rc == -EAGAIN) + rc = 0; + break; + } - sent_some = 1; + if (tx->tx_resid == 0) { /* sent everything */ + rc = 0; + break; + } } + + ksocknal_putconnsock (conn); + RETURN (rc); } int @@ -414,7 +462,6 @@ ksocknal_recv_iov (ksock_conn_t *conn) return (-EAGAIN); } - LASSERT (rc == fragsize); conn->ksnc_rx_iov++; conn->ksnc_rx_niov--; return (1); @@ -466,7 +513,6 @@ ksocknal_recv_kiov (ksock_conn_t *conn) return (-EAGAIN); } - LASSERT (rc == fragsize); conn->ksnc_rx_kiov++; conn->ksnc_rx_nkiov--; return (1); @@ -475,31 +521,47 @@ ksocknal_recv_kiov (ksock_conn_t *conn) int ksocknal_recvmsg (ksock_conn_t *conn) { - int rc; - int got_some = 0; + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; ENTRY; - LASSERT (!in_interrupt ()); + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (ksocknal_data.ksnd_stall_rx * HZ); + } + + rc = ksocknal_getconnsock (conn); + if (rc != 0) + return (rc); for (;;) { - LASSERT (conn->ksnc_rx_nob_wanted > 0); + if (conn->ksnc_closing) { + rc = -ESHUTDOWN; + break; + } if (conn->ksnc_rx_niov != 0) rc = ksocknal_recv_iov (conn); else rc = ksocknal_recv_kiov (conn); - - /* CAVEAT EMPTOR: we return... - * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */ - - if (rc <= 0) /* error/EOF or partial receive */ - RETURN ((got_some || rc == -EAGAIN) ? 1 : rc); - if (conn->ksnc_rx_nob_wanted == 0) - RETURN (1); + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) + rc = 1; + break; + } - got_some = 0; + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } } + + ksocknal_putconnsock (conn); + RETURN (rc); } #if SOCKNAL_ZC @@ -507,7 +569,7 @@ void ksocknal_zc_callback (zccd_t *zcd) { ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); - ksock_sched_t *sched = tx->tx_sched; + ksock_sched_t *sched = tx->tx_conn->ksnc_scheduler; unsigned long flags; ENTRY; @@ -515,6 +577,7 @@ ksocknal_zc_callback (zccd_t *zcd) spin_lock_irqsave (&sched->kss_lock, flags); + list_del (&tx->tx_list); /* remove from kss_zctxpending_list */ list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); @@ -525,13 +588,24 @@ ksocknal_zc_callback (zccd_t *zcd) #endif void -ksocknal_tx_done (ksock_tx_t *tx) +ksocknal_tx_done (ksock_tx_t *tx, int asynch) { - unsigned long flags; ksock_ltx_t *ltx; ENTRY; - atomic_dec (&ksocknal_packets_being_sent); + if (tx->tx_conn != NULL) { + /* This tx got queued on a conn; do the accounting... */ + atomic_sub (tx->tx_nob, &tx->tx_conn->ksnc_tx_nob); +#if SOCKNAL_ZC + /* zero copy completion isn't always from + * process_transmit() so it needs to keep a ref on + * tx_conn... */ + if (asynch) + ksocknal_put_conn (tx->tx_conn); +#else + LASSERT (!asynch); +#endif + } if (tx->tx_isfwd) { /* was a forwarded packet? 
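 * If so, completion is reported to the router with kpr_fwd_done()
 * below rather than to lib_finalize(): the router owns the forwarding
 * descriptor and we only borrowed it for the wire.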
*/ kpr_fwd_done (&ksocknal_data.ksnd_router, @@ -545,20 +619,44 @@ ksocknal_tx_done (ksock_tx_t *tx) lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); - spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); - - list_add_tail (<x->ltx_tx.tx_list, ltx->ltx_idle); - - /* normal tx desc => wakeup anyone blocking for one */ - if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && - waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) - wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + ksocknal_put_ltx (ltx); EXIT; } void +ksocknal_tx_launched (ksock_tx_t *tx) +{ +#if SOCKNAL_ZC + if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { + unsigned long flags; + ksock_conn_t *conn = tx->tx_conn; + ksock_sched_t *sched = conn->ksnc_scheduler; + + /* zccd skbufs are still in-flight. First take a ref on + * conn, so it hangs about for ksocknal_tx_done... */ + atomic_inc (&conn->ksnc_refcount); + + /* Stash it for timeout... + * NB We have to hold a lock to stash the tx, and we have + * stash it before we zcc_put(), but we have to _not_ hold + * this lock when we zcc_put(), otherwise we could deadlock + * if it turns out to be the last put. Aaaaarrrrggghhh! */ + spin_lock_irqsave (&sched->kss_lock, flags); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending); + spin_unlock_irqrestore (&sched->kss_lock, flags); + + /* ...then drop the initial ref on zccd, so the zero copy + * callback can occur */ + zccd_put (&tx->tx_zccd); + return; + } +#endif + /* Any zero-copy-ness (if any) has completed; I can complete the + * transmit now, avoiding an extra schedule */ + ksocknal_tx_done (tx, 0); +} + +void ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) { ksock_conn_t *conn; @@ -578,41 +676,32 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); - LASSERT (tx->tx_nob > 0); + LASSERT (tx->tx_resid > 0); conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */ mb(); /* => clear BEFORE trying to write */ - rc = ksocknal_sendmsg (conn->ksnc_sock, tx, - !list_empty (&conn->ksnc_tx_queue)); /* more to come? */ + rc = ksocknal_sendmsg (conn, tx); - CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); if (rc != 0) { - CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); - /* kid on for now the whole packet went. - * NB when we handle the error better, we'll still need to - * block for zccd completion. - */ - tx->tx_nob = 0; - } + if (ksocknal_close_conn_unlocked (conn)) { + /* I'm the first to close */ + CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n", + conn, rc, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); + } + ksocknal_tx_launched (tx); + spin_lock_irqsave (&sched->kss_lock, *irq_flags); - if (tx->tx_nob == 0) /* nothing left to send */ - { - /* everything went; assume more can go, so prevent write_space locking */ - conn->ksnc_tx_ready = 1; + } else if (tx->tx_resid == 0) { - ksocknal_put_conn (conn); /* release packet's ref */ - atomic_inc (&ksocknal_packets_being_sent); -#if SOCKNAL_ZC - if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - /* zccd skbufs are still in-flight. 
Release my - * initial ref on zccd, so callback can occur */ - zccd_put (&tx->tx_zccd); - } else -#endif - ksocknal_tx_done (tx); + /* everything went; assume more can go, and avoid + * write_space locking */ + conn->ksnc_tx_ready = 1; + ksocknal_tx_launched (tx); spin_lock_irqsave (&sched->kss_lock, *irq_flags); } else { spin_lock_irqsave (&sched->kss_lock, *irq_flags); @@ -621,84 +710,247 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) list_add (&tx->tx_list, &conn->ksnc_tx_queue); } - if (!conn->ksnc_tx_ready || /* no space to write now */ - list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */ - conn->ksnc_tx_scheduled = 0; /* not being scheduled */ - ksocknal_put_conn (conn); /* release scheduler's ref */ - } else /* let scheduler call me again */ + /* no space to write, or nothing to write? */ + if (!conn->ksnc_tx_ready || + list_empty (&conn->ksnc_tx_queue)) { + /* mark not scheduled */ + conn->ksnc_tx_scheduled = 0; + /* drop scheduler's ref */ + ksocknal_put_conn (conn); + } else { + /* stay scheduled */ list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + } +} + +void +ksocknal_launch_autoconnect_locked (ksock_route_t *route) +{ + unsigned long flags; + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (route->ksnr_conn == NULL); + LASSERT (!route->ksnr_deleted && !route->ksnr_connecting); + + route->ksnr_connecting = 1; + atomic_inc (&route->ksnr_refcount); /* extra ref for asynchd */ + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + list_add_tail (&route->ksnr_connect_list, + &ksocknal_data.ksnd_autoconnectd_routes); + + if (waitqueue_active (&ksocknal_data.ksnd_autoconnectd_waitq)) + wake_up (&ksocknal_data.ksnd_autoconnectd_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); +} + +ksock_peer_t * +ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) +{ + ptl_nid_t target_nid; + int rc; + ksock_peer_t *peer = ksocknal_find_peer_locked (nid); + + if (peer != NULL) + return (peer); + + if (tx->tx_isfwd) { + CERROR ("Can't send packet to "LPX64 + ": routed target is not a peer\n", nid); + return (NULL); + } + + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid); + if (rc != 0) { + CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); + return (NULL); + } + + peer = ksocknal_find_peer_locked (target_nid); + if (peer != NULL) + return (peer); + + CERROR ("Can't send packet to "LPX64": no peer entry\n", target_nid); + return (NULL); +} + +ksock_conn_t * +ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_conn_t *conn = NULL; + + /* Find the conn with the shortest tx queue */ + list_for_each (tmp, &peer->ksnp_conns) { + ksock_conn_t *c = list_entry (tmp, ksock_conn_t, ksnc_list); + + LASSERT (!c->ksnc_closing); + + if (conn == NULL || + atomic_read (&conn->ksnc_tx_nob) > + atomic_read (&c->ksnc_tx_nob)) + conn = c; + } + + return (conn); } void -ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) { unsigned long flags; ksock_sched_t *sched = conn->ksnc_scheduler; - /* Ensure the frags we've been given EXACTLY match the number of - * bytes we want to send. Many TCP/IP stacks disregard any total - * size parameters passed to them and just look at the frags. - * - * We always expect at least 1 mapped fragment containing the - * complete portals header. 
- */ - LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + - lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); - LASSERT (tx->tx_niov >= 1); - LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); - - CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n", - ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, - tx->tx_niov, tx->tx_nkiov); + /* called holding global lock (read or irq-write) */ + + CDEBUG (D_NET, "Sending to "LPX64" on port %d\n", + conn->ksnc_peer->ksnp_nid, conn->ksnc_port); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + tx->tx_resid = tx->tx_nob; + tx->tx_conn = conn; #if SOCKNAL_ZC zccd_init (&tx->tx_zccd, ksocknal_zc_callback); - /* NB this sets 1 ref on zccd, so the callback can only occur - * after I've released this ref */ - tx->tx_sched = sched; + /* NB this sets 1 ref on zccd, so the callback can only occur after + * I've released this ref. */ #endif - spin_lock_irqsave (&sched->kss_lock, flags); + spin_lock_irqsave (&sched->kss_lock, flags); + + tx->tx_deadline = jiffies_64 + ksocknal_io_timeout; list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); - - if (conn->ksnc_tx_ready && /* able to send */ - !conn->ksnc_tx_scheduled) { /* not scheduled to send */ - list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); conn->ksnc_tx_scheduled = 1; - atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); } spin_unlock_irqrestore (&sched->kss_lock, flags); +} - atomic_inc (&ksocknal_packets_launched); +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_conn == NULL && /* not connected */ + !route->ksnr_connecting && /* not connecting */ + route->ksnr_timeout <= jiffies_64) /* OK to retry */ + return (route); + } + + return (NULL); } -ksock_conn_t * -ksocknal_send_target (ptl_nid_t nid) +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_t *peer) { - ptl_nid_t gatewaynid; - ksock_conn_t *conn; - int rc; + struct list_head *tmp; + ksock_route_t *route; - if ((conn = ksocknal_get_conn (nid)) == NULL) { - /* It's not a peer; try to find a gateway */ - rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid); - if (rc != 0) { - CERROR("Can't route to "LPX64": router error %d\n", - nid, rc); - return (NULL); - } + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_connecting) + return (route); + } + + return (NULL); +} - if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) { - CERROR ("Can't route to "LPX64": gateway "LPX64 - " is not a peer\n", nid, gatewaynid); - return (NULL); +int +ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) +{ + unsigned long flags; + ksock_peer_t *peer; + ksock_conn_t *conn; + ksock_route_t *route; + rwlock_t *g_lock; + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete portals header. 
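 *
 * For example (illustrative numbers only): a PUT with a 4K payload in
 * a single kiov arrives here with tx_niov == 1 (the mapped header
 * fragment), tx_nkiov == 1 and tx_nob == sizeof (ptl_hdr_t) + 4096,
 * so
 *
 *      lib_iov_nob (tx->tx_niov, tx->tx_iov) +
 *      lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob
 *
 * must hold exactly, as asserted below.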
*/ + LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + + lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); + + CDEBUG (D_NET, "packet %p type %d, nob %d niov %d nkiov %d\n", + tx, ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + tx->tx_conn = NULL; /* only set when assigned a conn */ + + g_lock = &ksocknal_data.ksnd_global_lock; + read_lock (g_lock); + + peer = ksocknal_find_target_peer_locked (tx, nid); + if (peer == NULL) { + read_unlock (g_lock); + return (PTL_FAIL); + } + + /* Any routes need to be connected? (need write lock if so) */ + if (ksocknal_find_connectable_route_locked (peer) == NULL) { + conn = ksocknal_find_conn_locked (tx, peer); + if (conn != NULL) { + ksocknal_queue_tx_locked (tx, conn); + read_unlock (g_lock); + return (PTL_OK); } } + + /* need a write lock now to change peer state... */ - return (conn); + atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */ + read_unlock (g_lock); + write_lock_irqsave (g_lock, flags); + + if (peer->ksnp_closing) { /* peer deleted as I blocked! */ + write_unlock_irqrestore (g_lock, flags); + ksocknal_put_peer (peer); + return (PTL_FAIL); + } + ksocknal_put_peer (peer); /* drop ref I got above */ + + /* I may launch autoconnects, now we're write locked... */ + while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL) + ksocknal_launch_autoconnect_locked (route); + + conn = ksocknal_find_conn_locked (tx, peer); + if (conn != NULL) { + ksocknal_queue_tx_locked (tx, conn); + write_unlock_irqrestore (g_lock, flags); + return (PTL_OK); + } + + if (ksocknal_find_connecting_route_locked (peer) == NULL) { + /* no routes actually connecting now */ + write_unlock_irqrestore (g_lock, flags); + return (PTL_FAIL); + } + + list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); + + write_unlock_irqrestore (g_lock, flags); + return (PTL_OK); } ksock_ltx_t * @@ -745,32 +997,19 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, size_t payload_len) { ksock_ltx_t *ltx; - ksock_conn_t *conn; + int rc; /* NB 'private' is different depending on what we're sending. * Just ignore it until we can rely on it - * - * Also, the return code from this procedure is ignored. - * If we can't send, we must still complete with lib_finalize(). - * We'll have to wait for 3.2 to return an error event. 
*/ CDEBUG(D_NET, "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64 " pid %d\n", payload_len, payload_niov, nid, pid); - conn = ksocknal_send_target (nid); - if (conn == NULL) { - lib_finalize (&ksocknal_lib, private, cookie); - return (-1); - } - ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); - if (ltx == NULL) { - ksocknal_put_conn (conn); - lib_finalize (&ksocknal_lib, private, cookie); - return (-1); - } + if (ltx == NULL) + return (PTL_FAIL); /* append the payload_iovs to the one pointing at the header */ LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); @@ -781,8 +1020,11 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, ltx->ltx_tx.tx_niov = 1 + payload_niov; ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; - ksocknal_launch_packet (conn, <x->ltx_tx); - return (0); + rc = ksocknal_launch_packet (<x->ltx_tx, nid); + if (rc != PTL_OK) + ksocknal_put_ltx (ltx); + + return (rc); } int @@ -791,8 +1033,8 @@ ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len) { ksock_ltx_t *ltx; - ksock_conn_t *conn; - + int rc; + /* NB 'private' is different depending on what we're sending. * Just ignore it until we can rely on it */ @@ -800,15 +1042,9 @@ ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n", payload_len, payload_niov, nid, pid); - conn = ksocknal_send_target (nid); - if (conn == NULL) - return (-1); - ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); - if (ltx == NULL) { - ksocknal_put_conn (conn); - return (-1); - } + if (ltx == NULL) + return (PTL_FAIL); LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); LASSERT (payload_niov <= PTL_MD_MAX_IOV); @@ -819,17 +1055,20 @@ ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, ltx->ltx_tx.tx_nkiov = payload_niov; ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; - ksocknal_launch_packet (conn, <x->ltx_tx); - return (0); + rc = ksocknal_launch_packet (<x->ltx_tx, nid); + if (rc != PTL_OK) + ksocknal_put_ltx (ltx); + + return (rc); } void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { - ksock_conn_t *conn; ptl_nid_t nid = fwd->kprfd_gateway_nid; ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; - + int rc; + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); @@ -837,23 +1076,19 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (nid == ksocknal_lib.ni.nid) nid = fwd->kprfd_target_nid; - conn = ksocknal_get_conn (nid); - if (conn == NULL) { - CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); - kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH); - return; - } - - /* This forward has now got a ref on conn */ - tx->tx_isfwd = 1; /* This is a forwarding packet */ tx->tx_nob = fwd->kprfd_nob; tx->tx_niov = fwd->kprfd_niov; tx->tx_iov = fwd->kprfd_iov; tx->tx_nkiov = 0; tx->tx_kiov = NULL; - - ksocknal_launch_packet (conn, tx); + tx->tx_hdr = (ptl_hdr_t *)fwd->kprfd_iov[0].iov_base; + + rc = ksocknal_launch_packet (tx, nid); + if (rc != 0) { + /* FIXME, could pass a better completion error */ + kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH); + } } int @@ -904,6 +1139,9 @@ ksocknal_fmb_callback (void *arg, int error) spin_unlock_irqrestore (&fmp->fmp_lock, flags); + /* drop peer ref taken on init */ + ksocknal_put_peer (fmb->fmb_peer); + if (conn == NULL) return; @@ -964,7 +1202,6 @@ 
ksocknal_get_idle_fmb (ksock_conn_t *conn) return (NULL); } - int ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) { @@ -982,22 +1219,26 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); /* Got a forwarding buffer; copy the header we just read into the - * forwarding buffer. If there's payload start reading reading it + * forwarding buffer. If there's payload, start reading it * into the buffer, otherwise the forwarding buffer can be kicked * off immediately. * * NB fmb->fmb_iov spans the WHOLE packet. * conn->ksnc_rx_iov spans just the payload. */ - fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); /* copy header */ memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); - if (payload_nob == 0) { /* got complete packet already */ - atomic_inc (&ksocknal_packets_received); + /* Take a ref on the conn's peer to prevent module unload before + * forwarding completes. NB we ref peer and not conn because + * all refs on conn after it has been closed must remove themselves + * in finite time */ + fmb->fmb_peer = conn->ksnc_peer; + atomic_inc (&conn->ksnc_peer->ksnp_refcount); + if (payload_nob == 0) { /* got complete packet already */ CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, packet_nob); @@ -1036,11 +1277,10 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) packet_nob, niov, fmb->fmb_iov, ksocknal_fmb_callback, fmb); - /* stash router's descriptor ready for call to kpr_fwd_start */ - conn->ksnc_cookie = &fmb->fmb_fwd; - + conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - + conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */ + /* payload is desc's iov-ed buffer, but skipping the hdr */ LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / sizeof (struct iovec)); @@ -1066,7 +1306,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) void ksocknal_fwd_parse (ksock_conn_t *conn) { - ksock_conn_t *conn2; + ksock_peer_t *peer; ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); int body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr)); @@ -1081,7 +1321,9 @@ ksocknal_fwd_parse (ksock_conn_t *conn) CERROR("dropping packet from "LPX64" for "LPX64": packet " "size %d illegal\n", NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, body_len); - ksocknal_new_packet (conn, 0); /* on to new packet */ + + ksocknal_new_packet (conn, 0); /* on to new packet */ + ksocknal_close_conn_unlocked (conn); /* give up on conn */ return; } @@ -1104,12 +1346,12 @@ ksocknal_fwd_parse (ksock_conn_t *conn) } /* should have gone direct */ - conn2 = ksocknal_get_conn (conn->ksnc_hdr.dest_nid); - if (conn2 != NULL) { + peer = ksocknal_get_peer (conn->ksnc_hdr.dest_nid); + if (peer != NULL) { CERROR ("dropping packet from "LPX64" for "LPX64 ": target is a peer\n", conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); - ksocknal_put_conn (conn2); /* drop ref from get above */ + ksocknal_put_peer (peer); /* drop ref from get above */ /* on to next packet (skip this one's body) */ ksocknal_new_packet (conn, body_len); @@ -1221,10 +1463,18 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) rc = ksocknal_recvmsg(conn); - if (rc == 0) - goto out; - if (rc < 0) { - CERROR ("Error socknal read %p: %d\n", conn, rc); + if (rc <= 0) { + if (ksocknal_close_conn_unlocked (conn)) { + /* I'm the first to close */ + if (rc < 0) + CERROR
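/* NB ksocknal_close_conn_unlocked() returns non-zero only to the
 * caller that actually initiated the close, so however many threads
 * race through here, exactly one logs the failure. */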
("[%p] Error %d on read from "LPX64" ip %08x:%d\n", + conn, rc, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); + else + CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n", + conn, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); + } goto out; } @@ -1236,9 +1486,9 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: - /* It's not for me */ - if (conn->ksnc_hdr.type != PTL_MSG_HELLO && + if (conn->ksnc_hdr.type != HTON__u32(PTL_MSG_HELLO) && NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + /* This packet isn't for me */ ksocknal_fwd_parse (conn); switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ @@ -1253,10 +1503,11 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* Not Reached */ } - PROF_START(lib_parse); /* sets wanted_len, iovs etc */ lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); - PROF_FINISH(lib_parse); + + /* start timeout (lib is waiting for finalize) */ + conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; @@ -1265,8 +1516,8 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* Fall through (completed packet for me) */ case SOCKNAL_RX_BODY: - atomic_inc (&ksocknal_packets_received); - /* packet is done now */ + /* payload all received */ + conn->ksnc_rx_deadline = 0; /* cancel timeout */ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); /* Fall through */ @@ -1277,16 +1528,19 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) goto try_read; /* try to finish reading slop now */ case SOCKNAL_RX_BODY_FWD: + /* payload all received */ CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn, NTOH__u64 (conn->ksnc_hdr.src_nid), NTOH__u64 (conn->ksnc_hdr.dest_nid), conn->ksnc_rx_nob_left); - atomic_inc (&ksocknal_packets_received); + /* cancel timeout (only needed it while fmb allocated) */ + conn->ksnc_rx_deadline = 0; - /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */ - kpr_fwd_start (&ksocknal_data.ksnd_router, - (kpr_fwd_desc_t *)conn->ksnc_cookie); + /* forward the packet. 
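 * To recap the receive state machine this switch implements:
 *
 *      RX_HEADER   -> RX_BODY      (payload is for me)
 *                  -> RX_BODY_FWD  (payload must be forwarded)
 *                  -> RX_SLOP      (body being skipped)
 *      RX_BODY     -> lib_finalize(), then slop/new packet
 *      RX_BODY_FWD -> kpr_fwd_start(), fmb callback finishes it
 *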
NB ksocknal_init_fmb() put fmb into + * conn->ksnc_cookie */ + fmb = (ksock_fmb_t *)conn->ksnc_cookie; + kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); /* no slop in forwarded packets */ LASSERT (conn->ksnc_rx_nob_left == 0); @@ -1308,9 +1562,12 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) if (!conn->ksnc_rx_ready) { /* let socket callback schedule again */ conn->ksnc_rx_scheduled = 0; - ksocknal_put_conn (conn); /* release scheduler's ref */ - } else /* let scheduler call me again */ + /* drop scheduler's ref */ + ksocknal_put_conn (conn); + } else { + /* stay scheduled */ list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); + } } int @@ -1384,7 +1641,7 @@ int ksocknal_scheduler (void *arg) else CERROR ("Can't set CPU affinity for %s\n", name); #endif /* CONFIG_SMP && CPU_AFFINITY */ - + spin_lock_irqsave (&sched->kss_lock, flags); while (!ksocknal_data.ksnd_shuttingdown) { @@ -1413,7 +1670,7 @@ int ksocknal_scheduler (void *arg) list_del (&tx->tx_list); spin_unlock_irqrestore (&sched->kss_lock, flags); - ksocknal_tx_done (tx); + ksocknal_tx_done (tx, 1); spin_lock_irqsave (&sched->kss_lock, flags); } @@ -1459,7 +1716,7 @@ ksocknal_data_ready (struct sock *sk, int n) ENTRY; /* interleave correctly with closing sockets... */ - read_lock (&ksocknal_data.ksnd_socklist_lock); + read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; if (conn == NULL) { /* raced with ksocknal_close_sock */ @@ -1490,7 +1747,7 @@ ksocknal_data_ready (struct sock *sk, int n) spin_unlock_irqrestore (&sched->kss_lock, flags); } - read_unlock (&ksocknal_data.ksnd_socklist_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; } @@ -1503,7 +1760,7 @@ ksocknal_write_space (struct sock *sk) ksock_sched_t *sched; /* interleave correctly with closing sockets... 
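 * i.e. socket callbacks take ksnd_global_lock shared and re-check
 * sk->sk_user_data under it; closing is assumed to clear
 * sk_user_data with the same lock held exclusively, so a callback
 * either sees a live conn or sees NULL and backs off.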
*/ - read_lock (&ksocknal_data.ksnd_socklist_lock); + read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; @@ -1550,42 +1807,594 @@ ksocknal_write_space (struct sock *sk) } } - read_unlock (&ksocknal_data.ksnd_socklist_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_sock_write (struct socket *sock, void *buffer, int nob) +{ + int rc; + mm_segment_t oldmm = get_fs(); + + while (nob > 0) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + set_fs (KERNEL_DS); + rc = sock_sendmsg (sock, &msg, iov.iov_len); + set_fs (oldmm); + + if (rc < 0) + return (rc); + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return (-ECONNABORTED); + } + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} + +int +ksocknal_sock_read (struct socket *sock, void *buffer, int nob) +{ + int rc; + mm_segment_t oldmm = get_fs(); + + while (nob > 0) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + set_fs (KERNEL_DS); + rc = sock_recvmsg (sock, &msg, iov.iov_len, 0); + set_fs (oldmm); + + if (rc < 0) + return (rc); + + if (rc == 0) + return (-ECONNABORTED); + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} + +int +ksocknal_exchange_nids (struct socket *sock, ptl_nid_t nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le32 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le32 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (ksocknal_lib.ni.nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume sufficient socket buffering for this message */ + rc = ksocknal_sock_write (sock, &hdr, sizeof (hdr)); + if (rc != 0) { + CERROR ("Error %d sending HELLO to "LPX64"\n", rc, nid); + return (rc); + } + + rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv)); + if (rc != 0) { + CERROR ("Error %d reading HELLO from "LPX64"\n", rc, nid); + return (rc); + } + + if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) { + CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, nid); + return (-EINVAL); + } + + if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || + hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { + CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" + " from "LPX64"\n", + __le16_to_cpu (hmv->version_major), + __le16_to_cpu (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR, + nid); + return (-EINVAL); + } + + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... 
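 * The version-0 layout relied on here is roughly:
 *
 *      hdr.dest_nid   <- ptl_magicversion_t { magic, major, minor }
 *      hdr.src_nid    <- sender's NID
 *      hdr.type       <- PTL_MSG_HELLO
 *      payload        <- none (PTL_HDR_LENGTH == 0)
 *
 * which is why sizeof (*hmv) bytes are read first and the remainder
 * of the header afterwards.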
*/ + + rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n", + rc, nid); + return (rc); + } + + /* ...and check we got what we expected */ + if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) || + PTL_HDR_LENGTH (&hdr) != __cpu_to_le32 (0)) { + CERROR ("Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload from "LPX64"\n", + __le32_to_cpu (hdr.type), + __le32_to_cpu (PTL_HDR_LENGTH (&hdr)), nid); + return (-EINVAL); + } + + if (__le64_to_cpu (hdr.src_nid) != nid) { + CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n", + __le64_to_cpu (hdr.src_nid), nid); + return (-EINVAL); + } + + return (0); +} + +int +ksocknal_set_linger (struct socket *sock) +{ + mm_segment_t oldmm = get_fs (); + int rc; + int option; + struct linger linger; + + /* Ensure this socket aborts active sends immediately when we close + * it. */ + + linger.l_onoff = 0; + linger.l_linger = 0; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof (linger)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_LINGER: %d\n", rc); + return (rc); + } + + option = -1; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_LINGER2: %d\n", rc); + return (rc); + } + + return (0); +} + +int +ksocknal_connect_peer (ksock_route_t *route) +{ + struct sockaddr_in peer_addr; + mm_segment_t oldmm = get_fs(); + __u64 n; + struct timeval tv; + int fd; + struct socket *sock; + int rc; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create autoconnect socket: %d\n", rc); + return (rc); + } + + /* Ugh; have to map_fd for compatibility with sockets passed in + * from userspace. 
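 * The resulting lifetime dance (as used below): sock_map_fd() makes
 * the fd own the single ref on sock->file; on success an extra
 * get_file() ref is taken before sys_close() eats the fd's ref,
 * leaving the new conn holding the socket; on failure sys_close()
 * alone tears everything down.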
And we actually need the refcounting that + * this gives you :) */ + + fd = sock_map_fd (sock); + if (fd < 0) { + sock_release (sock); + CERROR ("sock_map_fd error %d\n", fd); + return (fd); + } + + /* NB the fd now owns the ref on sock->file */ + LASSERT (sock->file != NULL); + LASSERT (file_count(sock->file) == 1); + + /* Set the socket timeouts, so our connection attempt completes in + * finite time */ + tv.tv_sec = ksocknal_io_timeout / HZ; + n = ksocknal_io_timeout % HZ; + n = n * 1000000 + HZ - 1; + do_div (n, HZ); + tv.tv_usec = n; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof (tv)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send timeout %d (in HZ): %d\n", + ksocknal_io_timeout, rc); + goto out; + } + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof (tv)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive timeout %d (in HZ): %d\n", + ksocknal_io_timeout, rc); + goto out; + } + + if (route->ksnr_nonagel) { + int option = 1; + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't disable nagel: %d\n", rc); + goto out; + } + } + + if (route->ksnr_buffer_size != 0) { + int option = route->ksnr_buffer_size; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + route->ksnr_buffer_size, rc); + goto out; + } + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + route->ksnr_buffer_size, rc); + goto out; + } + } + + memset (&peer_addr, 0, sizeof (peer_addr)); + peer_addr.sin_family = AF_INET; + peer_addr.sin_port = htons (route->ksnr_port); + peer_addr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); + + rc = sock->ops->connect (sock, (struct sockaddr *)&peer_addr, + sizeof (peer_addr), sock->file->f_flags); + if (rc != 0) { + CERROR ("Error %d connecting to "LPX64"\n", rc, + route->ksnr_peer->ksnp_nid); + goto out; + } + + if (route->ksnr_xchange_nids) { + rc = ksocknal_exchange_nids (sock, route->ksnr_peer->ksnp_nid); + if (rc != 0) + goto out; + } + + rc = ksocknal_create_conn (route->ksnr_peer->ksnp_nid, + route, sock, route->ksnr_irq_affinity); + if (rc == 0) { + /* Take an extra ref on sock->file to compensate for the + * upcoming close which will lose fd's ref on it. */ + get_file (sock->file); + } + + out: + sys_close (fd); + return (rc); +} + +void +ksocknal_autoconnect (ksock_route_t *route) +{ + LIST_HEAD (zombies); + ksock_tx_t *tx; + ksock_peer_t *peer; + unsigned long flags; + int rc; + + rc = ksocknal_connect_peer (route); + if (rc == 0) { + /* successfully autoconnected: create_conn did the + * route/conn binding and scheduled any blocked packets, + * so there's nothing left to do now. 
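 * On failure we fall through to the backoff below: the retry
 * interval doubles after every unsuccessful attempt and is clamped,
 * i.e. (sketch)
 *
 *      ksnr_timeout        = jiffies_64 + ksnr_retry_interval;
 *      ksnr_retry_interval = MIN (ksnr_retry_interval * 2,
 *                                 SOCKNAL_MAX_RECONNECT_INTERVAL);
 *
 * so an unreachable peer costs progressively less to keep retrying.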
*/ + return; + } + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + peer = route->ksnr_peer; + route->ksnr_connecting = 0; + + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval; + route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, + SOCKNAL_MAX_RECONNECT_INTERVAL); + + if (!list_empty (&peer->ksnp_tx_queue) && + ksocknal_find_connecting_route_locked (peer) == NULL) { + LASSERT (list_empty (&peer->ksnp_conns)); + + /* None of the connections that the blocked packets are + * waiting for have been successful. Complete them now... */ + do { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &zombies); + } while (!list_empty (&peer->ksnp_tx_queue)); + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + while (!list_empty (&zombies)) { + tx = list_entry (zombies.next, ksock_tx_t, tx_list); + + CERROR ("Deleting packet type %d len %d ("LPX64"->"LPX64")\n", + NTOH__u32 (tx->tx_hdr->type), + NTOH__u32 (PTL_HDR_LENGTH(tx->tx_hdr)), + NTOH__u64 (tx->tx_hdr->src_nid), + NTOH__u64 (tx->tx_hdr->dest_nid)); + + list_del (&tx->tx_list); + /* complete now */ + ksocknal_tx_done (tx, 0); + } +} + +int +ksocknal_autoconnectd (void *arg) +{ + long id = (long)arg; + char name[16]; + unsigned long flags; + ksock_route_t *route; + int rc; + + snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id); + kportal_daemonize (name); + kportal_blockallsigs (); + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) { + route = list_entry (ksocknal_data.ksnd_autoconnectd_routes.next, + ksock_route_t, ksnr_connect_list); + + list_del (&route->ksnr_connect_list); + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + ksocknal_autoconnect (route); + ksocknal_put_route (route); + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + continue; + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + rc = wait_event_interruptible (ksocknal_data.ksnd_autoconnectd_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); + + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); + + ksocknal_thread_fini (); + return (0); +} + +ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_t *peer) +{ + /* We're called with a shared lock on ksnd_global_lock */ + unsigned long flags; + ksock_conn_t *conn; + struct list_head *ctmp; + ksock_tx_t *tx; + struct list_head *ttmp; + ksock_sched_t *sched; + + list_for_each (ctmp, &peer->ksnp_conns) { + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + sched = conn->ksnc_scheduler; + + if (conn->ksnc_rx_deadline != 0 && + conn->ksnc_rx_deadline <= jiffies_64) + goto timed_out; + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_for_each (ttmp, &conn->ksnc_tx_queue) { + tx = list_entry (ttmp, ksock_tx_t, tx_list); + LASSERT (tx->tx_deadline != 0); + + if (tx->tx_deadline <= jiffies_64) + goto timed_out_locked; + } +#if SOCKNAL_ZC + list_for_each (ttmp, &conn->ksnc_tx_pending) { + tx = list_entry (ttmp, ksock_tx_t, tx_list); + LASSERT (tx->tx_deadline != 0); + + if (tx->tx_deadline <= jiffies_64) + goto timed_out_locked; + } +#endif + spin_unlock_irqrestore (&sched->kss_lock, flags); + 
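/* NB the deadline convention used throughout: 0 means no timeout
 * armed, otherwise a 64-bit jiffies value stamped when the I/O was
 * queued, e.g. (from the tx path above)
 *
 *      tx->tx_deadline = jiffies_64 + ksocknal_io_timeout;
 *
 * using jiffies_64 rather than jiffies so a 32-bit jiffies wrap
 * can't fake a timeout. */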
continue; + + timed_out_locked: + spin_unlock_irqrestore (&sched->kss_lock, flags); + timed_out: + atomic_inc (&conn->ksnc_refcount); + return (conn); + } + + return (NULL); +} + +void +ksocknal_check_peer_timeouts (struct list_head *peers) +{ + struct list_head *ptmp; + ksock_peer_t *peer; + ksock_conn_t *conn; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock (&ksocknal_data.ksnd_global_lock); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + conn = ksocknal_find_timed_out_conn (peer); + + if (conn != NULL) { + read_unlock (&ksocknal_data.ksnd_global_lock); + + if (ksocknal_close_conn_unlocked (conn)) { + /* I actually closed... */ + CERROR ("Timed out conn->"LPX64" ip %x:%d\n", + peer->ksnp_nid, conn->ksnc_ipaddr, + conn->ksnc_port); + } + + /* NB we won't find this one again, but we can't + * just proceed with the next peer, since we dropped + * ksnd_global_lock and it might be dead already! */ + ksocknal_put_conn (conn); + goto again; + } + } + + read_unlock (&ksocknal_data.ksnd_global_lock); } int ksocknal_reaper (void *arg) { + wait_queue_t wait; unsigned long flags; ksock_conn_t *conn; - int rc; + int timeout; + int peer_index = 0; + __u64 deadline = jiffies_64; kportal_daemonize ("ksocknal_reaper"); kportal_blockallsigs (); + init_waitqueue_entry (&wait, current); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + while (!ksocknal_data.ksnd_shuttingdown) { - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - if (list_empty (&ksocknal_data.ksnd_reaper_list)) { - conn = NULL; - } else { - conn = list_entry (ksocknal_data.ksnd_reaper_list.next, + if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next, ksock_conn_t, ksnc_list); list_del (&conn->ksnc_list); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + ksocknal_terminate_conn (conn); + ksocknal_put_conn (conn); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + continue; } + if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry (ksocknal_data.ksnd_zombie_conns.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + ksocknal_destroy_conn (conn); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + continue; + } + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - if (conn != NULL) - ksocknal_close_conn (conn); - else { - rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq, - ksocknal_data.ksnd_shuttingdown || - !list_empty(&ksocknal_data.ksnd_reaper_list)); - LASSERT (rc == 0); + while ((timeout = deadline - jiffies_64) <= 0) { + /* Time to check for timeouts on a few more peers */ + ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]); + + peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE; + deadline += HZ; } + + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + set_current_state (TASK_INTERRUPTIBLE); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty (&ksocknal_data.ksnd_deathrow_conns) && + list_empty (&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); } + spin_unlock_irqrestore
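/* (Recap of the reaper loop above: deathrow conns get
 * ksocknal_terminate_conn() and drop the list's ref; zombie conns
 * get ksocknal_destroy_conn(); and one peer hash bucket is swept
 * for timed-out conns every HZ ticks, so each peer is examined
 * roughly once per SOCKNAL_PEER_HASH_SIZE seconds.) */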
(&ksocknal_data.ksnd_reaper_lock, flags); + ksocknal_thread_fini (); return (0); } diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c index e9b0e12..16ef401 100644 --- a/lustre/portals/libcfs/debug.c +++ b/lustre/portals/libcfs/debug.c @@ -636,7 +636,7 @@ int portals_debug_mark_buffer(char *text) return -EINVAL; CDEBUG(0, "********************************************************\n"); - CDEBUG(0, "DEBUG MARKER: %s\n", text); + CERROR("DEBUG MARKER: %s\n", text); CDEBUG(0, "********************************************************\n"); return 0; @@ -644,39 +644,104 @@ int portals_debug_mark_buffer(char *text) #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_PORTALS +/* this copies a snapshot of the debug buffer into an array of pages + * before doing the potentially blocking copy into userspace. it could + * be warning userspace if things wrap heavily while its off copying. */ __s32 portals_debug_copy_to_user(char *buf, unsigned long len) { int rc; - unsigned long debug_off; + unsigned long debug_off, i, off, copied; unsigned long flags; + struct page *page; + LIST_HEAD(my_pages); + struct list_head *pos, *n; if (len < debug_size) return -ENOSPC; - debug_off = atomic_read(&debug_off_a); + for (i = 0 ; i < debug_size; i += PAGE_SIZE) { + page = alloc_page(GFP_NOFS); + if (page == NULL) { + rc = -ENOMEM; + goto cleanup; + } + list_add(&page->list, &my_pages); + } + spin_lock_irqsave(&portals_debug_lock, flags); - if (debug_wrapped) { - /* All of this juggling with the 1s is to keep the trailing nul - * (which falls at debug_buf + debug_off) at the end of what we - * copy into user space */ - copy_to_user(buf, debug_buf + debug_off + 1, - debug_size - debug_off - 1); - copy_to_user(buf + debug_size - debug_off - 1, - debug_buf, debug_off + 1); - rc = debug_size; - } else { - copy_to_user(buf, debug_buf, debug_off); - rc = debug_off; + debug_off = atomic_read(&debug_off_a); + + /* Sigh. If the buffer is empty, then skip to the end. */ + if (debug_off == 0 && !debug_wrapped) { + spin_unlock_irqrestore(&portals_debug_lock, flags); + rc = 0; + goto cleanup; } + + if (debug_wrapped) + off = debug_off + 1; + else + off = 0; + copied = 0; + list_for_each(pos, &my_pages) { + unsigned long to_copy; + page = list_entry(pos, struct page, list); + + to_copy = min(debug_size - off, PAGE_SIZE); + if (to_copy == 0) { + off = 0; + to_copy = min(debug_size - off, PAGE_SIZE); + } +finish_partial: + memcpy(kmap(page), debug_buf + off, to_copy); + kunmap(page); + copied += to_copy; + if (copied >= (debug_wrapped ? debug_size : debug_off)) + break; + + off += to_copy; + if (off >= debug_size) { + off = 0; + if (to_copy != PAGE_SIZE) { + to_copy = PAGE_SIZE - to_copy; + goto finish_partial; + } + } + } + spin_unlock_irqrestore(&portals_debug_lock, flags); + off = 0; + list_for_each(pos, &my_pages) { + unsigned long to_copy; + page = list_entry(pos, struct page, list); + + to_copy = min(copied - off, PAGE_SIZE); + rc = copy_to_user(buf + off, kmap(page), to_copy); + kunmap(page); + if (rc) { + rc = -EFAULT; + goto cleanup; + } + off += to_copy; + if (off >= copied) + break; + } + rc = copied; + +cleanup: + list_for_each_safe(pos, n, &my_pages) { + page = list_entry(pos, struct page, list); + list_del(&page->list); + __free_page(page); + } return rc; } /* FIXME: I'm not very smart; someone smarter should make this better. */ void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, const char *format, ...) 
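/* ('format' loses its const qualifier here because the code below
 * now rewrites the format string in place, replacing a trailing
 * '\n' with ':' so the file/line/function suffix can share the
 * line -- see the strlen()/assignment pair added further down.) */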
+ const int line, unsigned long stack, char *format, ...) { va_list ap; unsigned long flags; @@ -731,33 +796,34 @@ portals_debug_msg(int subsys, int mask, char *file, const char *fn, do_gettimeofday(&tv); prefix_nob = snprintf(debug_buf + debug_off, max_nob, - "%06x:%06x:%d:%lu.%06lu ", + "%06x:%06x:%d:%lu.%06lu :", subsys, mask, smp_processor_id(), tv.tv_sec, tv.tv_usec); max_nob -= prefix_nob; + if(*(format + strlen(format) - 1) == '\n') + *(format + strlen(format) - 1) = ':'; + + va_start(ap, format); + msg_nob = vsnprintf(debug_buf + debug_off + prefix_nob , + max_nob, format, ap); + max_nob -= msg_nob; + va_end(ap); #if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) - msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, - "(%s:%d:%s() %d | %d+%lu): ", + msg_nob += snprintf(debug_buf + debug_off + prefix_nob + msg_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu)\n", file, line, fn, current->pid, current->thread.extern_pid, stack); #elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, - "(%s:%d:%s() %d | %d+%lu): ", + msg_nob += snprintf(debug_buf + debug_off + prefix_nob + msg_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu)\n", file, line, fn, current->pid, current->thread.mode.tt.extern_pid, stack); #else - msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, - "(%s:%d:%s() %d+%lu): ", + msg_nob += snprintf(debug_buf + debug_off + prefix_nob + msg_nob, max_nob, + "(%s:%d:%s() %d+%lu)\n", file, line, fn, current->pid, stack); #endif - max_nob -= msg_nob; - - va_start(ap, format); - msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob, - max_nob, format, ap); - max_nob -= msg_nob; - va_end(ap); /* Print to console, while msg is contiguous in debug_buf */ /* NB safely terminated see above */ diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c index 23527ce..62db766 100644 --- a/lustre/portals/portals/lib-move.c +++ b/lustre/portals/portals/lib-move.c @@ -560,7 +560,6 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md) return (msg); } - /* * Incoming messages have a ptl_msg_t object associated with them * by the library. This object encapsulates the state of the @@ -756,9 +755,13 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, hdr->src_nid, hdr->src_pid, md, offset, mlength); - if (rc != 0) { + if (rc != PTL_OK) { CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", ni->nid, hdr->src_nid); + /* Hmm, this will create a GET event and make believe + * the reply completed, which it kind of did, only the + * source won't get her reply */ + lib_finalize (nal, private, msg); state_lock (nal, &flags); goto drop; } @@ -1099,7 +1102,8 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) lib_msg_t *msg = NULL; ptl_process_id_t *id = &args->target_in; unsigned long flags; - + int rc; + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ { @@ -1177,9 +1181,15 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) state_unlock(nal, &flags); - lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, - id->nid, id->pid, md, 0, md->length); - + rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + id->nid, id->pid, md, 0, md->length); + if (rc != PTL_OK) { + /* get_new_msg() committed us to sending by decrementing + * md->threshold, so we have to act like we did send, but + * the network dropped it. */ + lib_finalize (nal, private, msg); + } + return ret->rc = PTL_OK; } @@ -1268,7 +1278,8 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) ptl_process_id_t *id = &args->target_in; lib_md_t *md; unsigned long flags; - + int rc; + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ { @@ -1342,9 +1353,15 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) state_unlock(nal, &flags); - lib_send (nal, private, msg, &hdr, PTL_MSG_GET, - id->nid, id->pid, NULL, 0, 0); - + rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + id->nid, id->pid, NULL, 0, 0); + if (rc != PTL_OK) { + /* get_new_msg() committed us to sending by decrementing + * md->threshold, so we have to act like we did send, but + * the network dropped it. */ + lib_finalize (nal, private, msg); + } + return ret->rc = PTL_OK; } diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c index f10892c..9363251 100644 --- a/lustre/portals/portals/lib-msg.c +++ b/lustre/portals/portals/lib-msg.c @@ -67,6 +67,7 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, msg->nid, msg->pid, NULL, 0, 0); + /* If this send fails, there's nothing else to clean up */ } md = msg->md; diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c index a2a8e77..462fd28 100644 --- a/lustre/portals/utils/debug.c +++ b/lustre/portals/utils/debug.c @@ -59,6 +59,8 @@ static int max = 8192; static int subsystem_mask = ~0; static int debug_mask = ~0; +#define MAX_MARK_SIZE 100 + static const char *portal_debug_subsystems[] = {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", "rpc", "mgmt", "portals", "socknal", "qswnal", "pinger", "filter", @@ -488,22 +490,29 @@ int jt_dbg_clear_debug_buf(int argc, char **argv) int jt_dbg_mark_debug_buf(int argc, char **argv) { - int rc; + int rc, max_size = MAX_MARK_SIZE-1; struct portal_ioctl_data data; char *text; time_t now = time(NULL); - if (argc > 2) { - fprintf(stderr, "usage: %s [marker text]\n", argv[0]); - return 0; - } - - if (argc == 2) { - text = argv[1]; + if (argc > 1) { + int counter; + text = malloc(MAX_MARK_SIZE); + strncpy(text, argv[1], max_size); + max_size-=strlen(argv[1]); + for(counter = 2; (counter < argc) && (max_size > 0) ; counter++){ + strncat(text, " ", 1); + max_size-=1; + strncat(text, argv[counter], max_size); + max_size-=strlen(argv[counter]); + } } else { text = ctime(&now); text[strlen(text) - 1] = '\0'; /* stupid \n */ } + if (!max_size) { + text[MAX_MARK_SIZE - 1] = '\0'; + } memset(&data, 0, sizeof(data)); data.ioc_inllen1 = strlen(text) + 1; @@ -530,7 +539,9 @@ int jt_dbg_modules(int argc, char **argv) } *mp, mod_paths[] = { {"portals", "lustre/portals/libcfs"}, {"ksocknal", "lustre/portals/knals/socknal"}, + {"lvfs", "lustre/lvfs"}, {"obdclass", "lustre/obdclass"}, + {"llog_test", "lustre/obdclass"}, {"ptlrpc", "lustre/ptlrpc"}, {"obdext2", "lustre/obdext2"}, {"ost", "lustre/ost"}, @@ 
-543,8 +554,9 @@ int jt_dbg_modules(int argc, char **argv) {"obdfilter", "lustre/obdfilter"}, {"extN", "lustre/extN"}, {"lov", "lustre/lov"}, - {"fsfilt_ext3", "lustre/obdclass"}, - {"fsfilt_extN", "lustre/obdclass"}, + {"fsfilt_ext3", "lustre/lvfs"}, + {"fsfilt_extN", "lustre/lvfs"}, + {"fsfilt_reiserfs", "lustre/lvfs"}, {"mds_ext2", "lustre/mds"}, {"mds_ext3", "lustre/mds"}, {"mds_extN", "lustre/mds"}, diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c index c87b0952..b68bf34 100644 --- a/lustre/portals/utils/portals.c +++ b/lustre/portals/utils/portals.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #ifdef __CYGWIN__ @@ -61,9 +62,7 @@ unsigned int portal_printk; unsigned int portal_stack; -static ptl_nid_t g_nid = 0; static unsigned int g_nal = 0; -static unsigned short g_port = 0; static int g_socket_txmem = 0; static int g_socket_rxmem = 0; @@ -144,21 +143,25 @@ ptl_gethostbyname(char * hname) { } int -ptl_parse_nid (ptl_nid_t *nidp, char *str) +ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { struct hostent *he; int a; int b; int c; int d; - + + if (!strcmp (str, "_all_")) + { + *ipaddrp = 0; + return (0); + } + if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && (a & ~0xff) == 0 && (b & ~0xff) == 0 && (c & ~0xff) == 0 && (d & ~0xff) == 0) { - __u32 addr = (a<<24)|(b<<16)|(c<<8)|d; - - *nidp = (ptl_nid_t)addr; + *ipaddrp = (a<<24)|(b<<16)|(c<<8)|d; return (0); } @@ -168,19 +171,55 @@ ptl_parse_nid (ptl_nid_t *nidp, char *str) { __u32 addr = *(__u32 *)he->h_addr; - *nidp = (ptl_nid_t)ntohl(addr); /* HOST byte order */ + *ipaddrp = ntohl(addr); /* HOST byte order */ return (0); } - if (sscanf (str, "%i", &a) == 1) + return (-1); +} + +char * +ptl_ipaddr_2_str (__u32 ipaddr, char *str) +{ + __u32 net_ip; + struct hostent *he; + + net_ip = htonl (ipaddr); + he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET); + if (he != NULL) + return (he->h_name); + + sprintf (str, "%d.%d.%d.%d", + (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff, + (ipaddr >> 8) & 0xff, ipaddr & 0xff); + return (str); +} + +int +ptl_parse_nid (ptl_nid_t *nidp, char *str) +{ + __u32 ipaddr; + long lval; + + if (!strcmp (str, "_all_")) { + *nidp = PTL_NID_ANY; + return (0); + } + + if (ptl_parse_ipaddr (&ipaddr, str) == 0) { + *nidp = (ptl_nid_t)ipaddr; + return (0); + } + + if (sscanf (str, "%li", &lval) == 1) { - *nidp = (ptl_nid_t)a; + *nidp = (ptl_nid_t)lval; return (0); } - if (sscanf (str, "%x", &a) == 1) + if (sscanf (str, "%lx", &lval) == 1) { - *nidp = (ptl_nid_t) a; + *nidp = (ptl_nid_t)lval; return (0); } @@ -201,6 +240,32 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) return (buffer); } +int g_nal_is_compatible (char *cmd, ...) 
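/* Check that a NAL-specific command applies to the currently
 * selected network: the variadic tail is a 0-terminated list of
 * acceptable NAL numbers, e.g.
 *
 *      if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
 *              return -1;
 *
 * as the socknal-only commands below use it. */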
@@ -201,6 +240,32 @@ ptl_nid2str (char *buffer, ptl_nid_t nid)
         return (buffer);
 }
 
+int g_nal_is_compatible (char *cmd, ...)
+{
+        va_list ap;
+        int nal;
+
+        if (g_nal == 0) {
+                fprintf (stderr, "Error: you must run the 'network' command first.\n");
+                return (0);
+        }
+
+        va_start (ap, cmd);
+
+        do {
+                nal = va_arg (ap, int);
+        } while (nal != 0 && nal != g_nal);
+
+        va_end (ap);
+
+        if (g_nal == nal)
+                return (1);
+
+        fprintf (stderr, "Command %s not compatible with nal %s\n",
+                 cmd, nal2name (g_nal));
+        return (0);
+}
+
 int
 sock_write (int cfd, void *buffer, int nob)
 {
@@ -266,22 +331,231 @@ int ptl_initialize(int argc, char **argv)
 
 int jt_ptl_network(int argc, char **argv)
 {
-        int nal;
+        name2num_t *entry;
+        int nal;
 
-        if (argc != 2 ||
-            (nal = ptl_name2nal (argv[1])) == 0)
-        {
-                name2num_t *entry;
+        if (argc == 2 &&
+            (nal = ptl_name2nal (argv[1])) != 0) {
+                g_nal = nal;
+                return (0);
+        }
 
-                fprintf(stderr, "usage: %s \n", argv[0]);
-                for (entry = nalnames; entry->name != NULL; entry++)
-                        fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
-                fprintf(stderr, ">\n");
+        fprintf(stderr, "usage: %s \n", argv[0]);
+        for (entry = nalnames; entry->name != NULL; entry++)
+                fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
+        fprintf(stderr, ">\n");
+        return (-1);
+}
+
+int
+jt_ptl_print_autoconnects (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        char buffer[64];
+        int index;
+        int rc;
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+                return -1;
+
+        for (index = 0;;index++) {
+                PORTAL_IOC_INIT (data);
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_GET_AUTOCONN;
+                data.ioc_count = index;
+
+                rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc != 0)
+                        break;
+
+                printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n",
+                        data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer),
+                        data.ioc_misc, data.ioc_count, data.ioc_size,
+                        (data.ioc_flags & 1) ? "on" : "off",
+                        (data.ioc_flags & 2) ? "on" : "off",
+                        (data.ioc_flags & 4) ? "on" : "off",
+                        data.ioc_wait);
+        }
-        else
-                g_nal = nal;
 
-        return (0);
+        if (index == 0)
+                printf ("\n");
+        return 0;
 }
+
+int
+jt_ptl_add_autoconnect (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t nid;
+        __u32 ip;
+        int port;
+        int xchange_nids = 0;
+        int irq_affinity = 0;
+        int share = 0;
+        int rc;
+
+        if (argc < 4 || argc > 5) {
+                fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]);
+                return 0;
+        }
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+                return -1;
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0 ||
+            nid == PTL_NID_ANY) {
+                fprintf (stderr, "Can't parse NID: %s\n", argv[1]);
+                return -1;
+        }
+
+        if (ptl_parse_ipaddr (&ip, argv[2]) != 0) {
+                fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]);
+                return -1;
+        }
+
+        port = atol (argv[3]);
+
+        if (argc > 4) {
+                char *opts = argv[4];
+
+                while (*opts != 0)
+                        switch (*opts++) {
+                        case 'x':
+                                xchange_nids = 1;
+                                break;
+                        case 'i':
+                                irq_affinity = 1;
+                                break;
+                        case 's':
+                                share = 1;
+                                break;
+                        default:
+                                fprintf (stderr, "Can't parse options: %s\n",
+                                         argv[4]);
+                                return -1;
+                        }
+        }
+
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_ADD_AUTOCONN;
+        data.ioc_nid = nid;
+        data.ioc_id = ip;
+        data.ioc_misc = port;
+        /* only passing one buffer size! */
+        data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem);
+        data.ioc_flags = (g_socket_nonagle ? 1 : 0) |
+                         (xchange_nids     ? 2 : 0) |
+                         (irq_affinity     ? 4 : 0) |
+                         (share            ? 8 : 0);
+
+        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc != 0) {
+                fprintf (stderr, "failed to enable autoconnect: %s\n",
+                         strerror (errno));
+                return -1;
+        }
+
+        return 0;
+}
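
NAL_CMD_ADD_AUTOCONN packs four booleans into ioc_flags as bits 1, 2, 4 and 8, and
jt_ptl_print_autoconnects() above decodes the low three the same way; the magic
numbers have to stay in step on both sides of the ioctl. Named masks would make that
pairing explicit. A stand-alone sketch (hypothetical names, not defined by the patch):

#include <stdio.h>

/* hypothetical masks mirroring the encoding above */
#define CONN_FLAG_NONAGLE 0x1
#define CONN_FLAG_XCHG    0x2
#define CONN_FLAG_IRQ     0x4
#define CONN_FLAG_SHARE   0x8

static int
pack_conn_flags (int nonagle, int xchg, int irq, int share)
{
        return (nonagle ? CONN_FLAG_NONAGLE : 0) |
               (xchg    ? CONN_FLAG_XCHG    : 0) |
               (irq     ? CONN_FLAG_IRQ     : 0) |
               (share   ? CONN_FLAG_SHARE   : 0);
}

int
main (void)
{
        /* nonagle + irq affinity -> 0x5, matching the ad-hoc encoding */
        printf ("0x%x\n", pack_conn_flags (1, 0, 1, 0));
        return 0;
}
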
+
+int
+jt_ptl_del_autoconnect (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t nid = PTL_NID_ANY;
+        __u32 ip = 0;
+        int share = 0;
+        int keep_conn = 0;
+        int rc;
+
+        if (argc > 4) {
+                fprintf (stderr, "usage: %s [nid] [ipaddr] [sk]\n",
+                         argv[0]);
+                return 0;
+        }
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+                return -1;
+
+        if (argc > 1 &&
+            ptl_parse_nid (&nid, argv[1]) != 0) {
+                fprintf (stderr, "Can't parse nid: %s\n", argv[1]);
+                return -1;
+        }
+
+        if (argc > 2 &&
+            ptl_parse_ipaddr (&ip, argv[2]) != 0) {
+                fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]);
+                return -1;
+        }
+
+        if (argc > 3) {
+                char *opts = argv[3];
+
+                while (*opts != 0)
+                        switch (*opts++) {
+                        case 's':
+                                share = 1;
+                                break;
+                        case 'k':
+                                keep_conn = 1;
+                                break;
+                        default:
+                                fprintf (stderr, "Can't parse flags: %s\n",
+                                         argv[3]);
+                                return -1;
+                        }
+        }
+
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_DEL_AUTOCONN;
+        data.ioc_nid = nid;
+        data.ioc_id = ip;
+        data.ioc_flags = (share     ? 1 : 0) |
+                         (keep_conn ? 2 : 0);
+
+        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc != 0) {
+                fprintf (stderr, "failed to remove autoconnect route: %s\n",
+                         strerror (errno));
+                return -1;
+        }
+
+        return 0;
+}
+
+int
+jt_ptl_print_connections (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        char buffer[64];
+        int index;
+        int rc;
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+                return -1;
+
+        for (index = 0;;index++) {
+                PORTAL_IOC_INIT (data);
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_GET_CONN;
+                data.ioc_count = index;
+
+                rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc != 0)
+                        break;
+
+                printf (LPD64"@%s:%d\n",
+                        data.ioc_nid,
+                        ptl_ipaddr_2_str (data.ioc_id, buffer),
+                        data.ioc_misc);
+        }
+
+        if (index == 0)
+                printf ("\n");
+        return 0;
 }
 
 int
@@ -318,17 +592,17 @@ exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
                 return (-1);
         }
 
-        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+        if (hmv->magic != __cpu_to_le32 (PORTALS_PROTO_MAGIC)) {
                 fprintf (stderr, "Bad magic %#08x (%#08x expected)\n",
-                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                         __le32_to_cpu (hmv->magic), PORTALS_PROTO_MAGIC);
                 return (-1);
         }
 
-        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
-            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+        if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
+            hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
                 fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
-                         __cpu_to_le16 (hmv->version_major),
-                         __cpu_to_le16 (hmv->version_minor),
+                         __le16_to_cpu (hmv->version_major),
+                         __le16_to_cpu (hmv->version_minor),
                          PORTALS_PROTO_VERSION_MAJOR,
                          PORTALS_PROTO_VERSION_MINOR);
         }
@@ -343,12 +617,12 @@ exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
         }
 
         /* ...and check we got what we expected */
-        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
-            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+        if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) ||
+            PTL_HDR_LENGTH (&hdr) != __cpu_to_le32 (0)) {
                 fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
                          " but got type %d with %d payload\n",
-                         __cpu_to_le32 (hdr.type),
-                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                         __le32_to_cpu (hdr.type),
+                         __le32_to_cpu (PTL_HDR_LENGTH (&hdr)));
                 return (-1);
         }
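
The exchange_nids() change above is not cosmetic: the old code byte-swapped the wire
value and compared it against a host-order constant, which works only because the
conversion is its own inverse, and it then printed the unconverted garble in the error
message. The new form keeps wire fields in wire (little-endian) order, converts each
constant once, and applies __le32_to_cpu() only where a value is shown to a human. A
stand-alone illustration (swab32() stands in for the conversion on a big-endian host;
the magic value is illustrative):

#include <stdio.h>
#include <stdint.h>

static uint32_t
swab32 (uint32_t x)
{
        return ((x & 0x000000ffU) << 24) | ((x & 0x0000ff00U) << 8) |
               ((x & 0x00ff0000U) >> 8)  | ((x & 0xff000000U) >> 24);
}

int
main (void)
{
        uint32_t magic = 0xeebc0dedU;    /* illustrative protocol magic   */
        uint32_t wire  = swab32 (magic); /* little-endian field as seen   */
                                         /* on a big-endian host          */

        /* old style: swap the wire value, compare with the constant */
        printf ("%d\n", swab32 (wire) == magic);
        /* new style: swap the constant; same verdict, but 'wire' is
         * never turned into a half-converted temporary */
        printf ("%d\n", wire == swab32 (magic));
        return 0;
}
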
@@ -358,237 +632,189 @@ int jt_ptl_connect(int argc, char **argv)
 
 int jt_ptl_connect(int argc, char **argv)
 {
-        if (argc < 2) {
-        usage:
-                fprintf(stderr, "usage: %s or \n",
-                        argv[0]);
+        ptl_nid_t peer_nid;
+        struct portal_ioctl_data data;
+        struct sockaddr_in srvaddr;
+        __u32 ipaddr;
+        char *flag;
+        int fd, rc;
+        int nonagle = 0;
+        int rxmem = 0;
+        int txmem = 0;
+        int bind_irq = 0;
+        int xchange_nids = 0;
+        int port;
+        int o;
+        int olen;
+
+        if (argc < 3) {
+                fprintf(stderr, "usage: %s ip port [xi]\n", argv[0]);
                 return 0;
         }
-        if (g_nal == 0) {
-                fprintf(stderr, "Error: you must run the 'network' command "
-                        "first.\n");
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0))
+                return -1;
+
+        rc = ptl_parse_ipaddr (&ipaddr, argv[1]);
+        if (rc != 0) {
+                fprintf(stderr, "Can't parse hostname: %s\n", argv[1]);
                 return -1;
         }
-        if (g_nal == SOCKNAL || g_nal == TOENAL) {
-                ptl_nid_t peer_nid;
-                struct hostent *he;
-                struct portal_ioctl_data data;
-                struct sockaddr_in srvaddr;
-                char *flag;
-                int fd, rc;
-                int nonagle = 0;
-                int rxmem = 0;
-                int txmem = 0;
-                int bind_irq = 0;
-                int xchange_nids = 0;
-                int o;
-                int olen;
-
-                if (argc < 3) {
-                        goto usage;
-                }
-                he = ptl_gethostbyname(argv[1]);
-                if (!he)
-                        return -1;
-
-                g_port = atol(argv[2]);
-
-                if (argc > 3)
-                        for (flag = argv[3]; *flag != 0; flag++)
-                                switch (*flag)
-                                {
-                                case 'i':
-                                        bind_irq = 1;
-                                        break;
-
-                                case 'x':
-                                        xchange_nids = 1;
-                                        break;
-
-                                default:
-                                        fprintf (stderr, "unrecognised flag '%c'\n",
-                                                 *flag);
-                                        return (-1);
-                                }
-
-                memset(&srvaddr, 0, sizeof(srvaddr));
-                srvaddr.sin_family = AF_INET;
-                srvaddr.sin_port = htons(g_port);
-                srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr;
-
-                fd = socket(PF_INET, SOCK_STREAM, 0);
-                if ( fd < 0 ) {
-                        fprintf(stderr, "socket() failed: %s\n",
-                                strerror(errno));
-                        return -1;
-                }
-
-                if (g_socket_nonagle)
-                {
-                        o = 1;
-                        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0)
-                        {
-                                fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno));
+        port = atol(argv[2]);
+        if (argc > 3)
+                for (flag = argv[3]; *flag != 0; flag++)
+                        switch (*flag)
+                        {
+                        case 'i':
+                                bind_irq = 1;
+                                break;
+
+                        case 'x':
+                                xchange_nids = 1;
+                                break;
+
+                        default:
+                                fprintf (stderr, "unrecognised flag '%c'\n",
+                                         *flag);
                                 return (-1);
                         }
-                }
 
-                if (g_socket_rxmem != 0)
-                {
-                        o = g_socket_rxmem;
-                        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0)
-                        {
-                                fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno));
-                                return (-1);
-                        }
-                }
+        memset(&srvaddr, 0, sizeof(srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(port);
+        srvaddr.sin_addr.s_addr = htonl(ipaddr);
 
-                if (g_socket_txmem != 0)
-                {
-                        o = g_socket_txmem;
-                        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0)
-                        {
-                                fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
-                                return (-1);
-                        }
+        fd = socket(PF_INET, SOCK_STREAM, 0);
+        if ( fd < 0 ) {
+                fprintf(stderr, "socket() failed: %s\n", strerror(errno));
+                return -1;
+        }
+
+        if (g_socket_nonagle)
+        {
+                o = 1;
+                if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0) {
+                        fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno));
+                        return (-1);
                 }
+        }
 
-                rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
-                if ( rc == -1 ) {
-                        fprintf(stderr, "connect() failed: %s\n",
-                                strerror(errno));
-                        return -1;
+        if (g_socket_rxmem != 0) {
+                o = g_socket_rxmem;
+                if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) {
+                        fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno));
+                        return (-1);
                 }
+        }
 
-                olen = sizeof (txmem);
-                if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
-                        fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
-                olen = sizeof (rxmem);
-                if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
-                        fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
-                olen = sizeof (nonagle);
-                if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
-                        fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
-
-                if (xchange_nids) {
-
-                        PORTAL_IOC_INIT (data);
-                        data.ioc_nal = g_nal;
-                        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
-                        if (rc != 0)
-                        {
-                                fprintf (stderr, "failed to get my nid: %s\n",
-                                         strerror (errno));
-                                close (fd);
-                                return (-1);
-                        }
-
-                        rc = exchange_nids (fd, data.ioc_nid, &peer_nid);
-                        if (rc != 0)
-                        {
-                                close (fd);
-                                return (-1);
-                        }
-                }
-                else
-                        peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */
+        if (g_socket_txmem != 0) {
+                o = g_socket_txmem;
+                if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) {
+                        fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
+                        return (-1);
+                }
+        }
 
-                printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1],
-                       peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled");
+        rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+        if ( rc == -1 ) {
+                fprintf(stderr, "connect() failed: %s\n", strerror(errno));
+                return -1;
+        }
 
-                PORTAL_IOC_INIT(data);
-                data.ioc_fd = fd;
+        olen = sizeof (txmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
+                fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
+        olen = sizeof (rxmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
+                fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
+        olen = sizeof (nonagle);
+        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
+                fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
+
+        if (!xchange_nids)
+                peer_nid = ipaddr;
+        else {
+                PORTAL_IOC_INIT (data);
                 data.ioc_nal = g_nal;
-                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
-                data.ioc_nid = peer_nid;
-                data.ioc_flags = bind_irq;
-
-                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
-                if (rc) {
-                        fprintf(stderr, "failed to register fd with portals: "
-                                "%s\n", strerror(errno));
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+                if (rc != 0) {
+                        fprintf (stderr, "failed to get my nid: %s\n",
+                                 strerror (errno));
                         close (fd);
-                        return -1;
+                        return (-1);
                 }
 
-                g_nid = peer_nid;
-                printf("Connection to "LPX64" registered with socknal\n", g_nid);
-
-                rc = close(fd);
-                if (rc) {
-                        fprintf(stderr, "close failed: %d\n", rc);
-                }
-        } else if (g_nal == QSWNAL) {
-                g_nid = atoi(argv[1]);
-        } else if (g_nal == GMNAL) {
-                g_nid = atoi(argv[1]);
-        } else if (g_nal == SCIMACNAL) {
-                unsigned int tmpnid;
-                if(sscanf(argv[1], "%x", &tmpnid) == 1) {
-                        g_nid=tmpnid;
-                }
-                else {
-                        fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]);
+                rc = exchange_nids (fd, data.ioc_nid, &peer_nid);
+                if (rc != 0) {
+                        close (fd);
+                        return (-1);
                 }
+        }
 
-        } else {
-                fprintf(stderr, "This should never happen. Also it is very "
-                        "bad.\n");
+        printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1],
+               peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled");
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_fd = fd;
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+        data.ioc_nid = peer_nid;
+        data.ioc_flags = bind_irq;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc) {
+                fprintf(stderr, "failed to register fd with portals: %s\n",
+                        strerror(errno));
+                close (fd);
+                return -1;
         }
+
+        printf("Connection to "LPX64" registered with socknal\n", peer_nid);
+
+        rc = close(fd);
+        if (rc)
+                fprintf(stderr, "close failed: %d\n", rc);
+
         return 0;
 }
"Disabled" : "Enabled"); + PORTAL_IOC_INIT(data); + data.ioc_fd = fd; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; - } else { - fprintf(stderr, "This should never happen. Also it is very " - "bad.\n"); + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to register fd with portals: %s\n", + strerror(errno)); + close (fd); + return -1; } + printf("Connection to "LPX64" registered with socknal\n", peer_nid); + + rc = close(fd); + if (rc) + fprintf(stderr, "close failed: %d\n", rc); + return 0; } int jt_ptl_disconnect(int argc, char **argv) { - if (argc > 2) { - fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + struct portal_ioctl_data data; + ptl_nid_t nid = PTL_NID_ANY; + __u32 ipaddr = 0; + int rc; + + if (argc > 3) { + fprintf(stderr, "usage: %s [nid] [ipaddr]\n", argv[0]); return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + + if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0)) + return -1; + + if (argc >= 2 && + ptl_parse_nid (&nid, argv[1]) != 0) { + fprintf (stderr, "Can't parse nid %s\n", argv[1]); return -1; } - if (g_nal == SOCKNAL || g_nal == TOENAL) { - struct hostent *he; - struct portal_ioctl_data data; - int rc; - PORTAL_IOC_INIT(data); - if (argc == 2) { - he = ptl_gethostbyname(argv[1]); - if (!he) - return -1; - - data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + if (argc >= 3 && + ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { + fprintf (stderr, "Can't parse ip addr %s\n", argv[2]); + return -1; + } - } else { - printf("Disconnecting ALL connections.\n"); - /* leave ioc_nid zeroed == disconnect all */ - } - data.ioc_nal = g_nal; - data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); - if (rc) { - fprintf(stderr, "failed to remove connection: %s\n", - strerror(errno)); - return -1; - } - } else if (g_nal == QSWNAL) { - printf("'disconnect' doesn't make any sense for " - "elan.\n"); - } else if (g_nal == GMNAL) { - printf("'disconnect' doesn't make any sense for " - "GM.\n"); - } else if (g_nal == SCIMACNAL) { - printf("'disconnect' doesn't make any sense for " - "SCI.\n"); - } else { - fprintf(stderr, "This should never happen. 
@@ -597,55 +823,86 @@ int jt_ptl_disconnect(int argc, char **argv)
 
 int jt_ptl_push_connection (int argc, char **argv)
 {
-        if (argc > 2) {
-                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+        struct portal_ioctl_data data;
+        int rc;
+        ptl_nid_t nid = PTL_NID_ANY;
+        __u32 ipaddr = 0;
+
+        if (argc > 3) {
+                fprintf(stderr, "usage: %s [nid] [ip]\n", argv[0]);
                 return 0;
         }
-        if (g_nal == 0) {
-                fprintf(stderr, "Error: you must run the 'network' command "
-                        "first.\n");
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0))
+                return -1;
+
+        if (argc > 1 &&
+            ptl_parse_nid (&nid, argv[1]) != 0) {
+                fprintf(stderr, "Can't parse nid: %s\n", argv[1]);
                 return -1;
         }
-        if (g_nal == SOCKNAL || g_nal == TOENAL) {
-                struct hostent *he;
-                struct portal_ioctl_data data;
-                int rc;
-
-                PORTAL_IOC_INIT(data);
-                if (argc == 2) {
-                        he = ptl_gethostbyname(argv[1]);
-                        if (!he)
-                                return -1;
-
-                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+        if (argc > 2 &&
+            ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) {
+                fprintf(stderr, "Can't parse ipaddr: %s\n", argv[2]);
+                return -1;
+        }
 
-                } else {
-                        printf("Pushing ALL connections.\n");
-                        /* leave ioc_nid zeroed == disconnect all */
-                }
-                data.ioc_nal = g_nal;
-                data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION;
-                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
-                if (rc) {
-                        fprintf(stderr, "failed to push connection: %s\n",
-                                strerror(errno));
-                        return -1;
-                }
-        } else if (g_nal == QSWNAL) {
-                printf("'push' doesn't make any sense for elan.\n");
-        } else if (g_nal == GMNAL) {
-                printf("'push' doesn't make any sense for GM.\n");
-        } else if (g_nal == SCIMACNAL) {
-                printf("'push' doesn't make any sense for SCI.\n");
-        } else {
-                fprintf(stderr, "This should never happen. Also it is very "
-                        "bad.\n");
+        PORTAL_IOC_INIT(data);
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION;
+        data.ioc_nid = nid;
+        data.ioc_id = ipaddr;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc) {
+                fprintf(stderr, "failed to push connection: %s\n",
+                        strerror(errno));
                 return -1;
         }
 
         return 0;
 }
 
+int
+jt_ptl_print_active_txs (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        int index;
+        int rc;
+
+        if (!g_nal_is_compatible (argv[0], QSWNAL, 0))
+                return -1;
+
+        for (index = 0;;index++) {
+                PORTAL_IOC_INIT (data);
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_GET_TXDESC;
+                data.ioc_count = index;
+
+                rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc != 0)
+                        break;
+
+                printf ("%p: %5s payload %6d bytes to "LPX64" via "LPX64" by pid %6d: %s, %s, state %d\n",
+                        data.ioc_pbuf1,
+                        data.ioc_count == PTL_MSG_ACK   ? "ACK"   :
+                        data.ioc_count == PTL_MSG_PUT   ? "PUT"   :
+                        data.ioc_count == PTL_MSG_GET   ? "GET"   :
+                        data.ioc_count == PTL_MSG_REPLY ? "REPLY" : "",
+                        data.ioc_size,
+                        data.ioc_nid,
+                        data.ioc_nid2,
+                        data.ioc_misc,
+                        (data.ioc_flags & 1) ? "delayed" : "immediate",
+                        (data.ioc_flags & 2) ? "nblk" : "normal",
+                        data.ioc_flags >> 2);
+        }
+
+        if (index == 0)
+                printf ("\n");
+        return 0;
+}
+
 int jt_ptl_ping(int argc, char **argv)
 {
         int rc;
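
jt_ptl_print_autoconnects(), jt_ptl_print_connections() and jt_ptl_print_active_txs()
all share one enumeration idiom: pass an increasing index down in ioc_count and stop
at the first ioctl failure, which the kernel produces once the index walks off the end
of its list. (The list can change between calls, so the listing is only a best-effort
snapshot.) The shape of the idiom as a stand-alone sketch, with a stub in place of the
real l_ioctl():

#include <errno.h>
#include <stdio.h>

/* stub standing in for a NAL_CMD_GET_* ioctl: three entries, then ENOENT */
static int
get_entry (int index, int *value)
{
        if (index >= 3)
                return -ENOENT;
        *value = 100 + index;
        return 0;
}

int
main (void)
{
        int index;
        int value;

        for (index = 0;; index++) {
                if (get_entry (index, &value) != 0)
                        break;          /* first failure == end of list */
                printf ("#%d = %d\n", index, value);
        }
        return 0;
}
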
"nblk" : "normal", + data.ioc_flags >> 2); + } + + if (index == 0) + printf ("\n"); + return 0; +} + int jt_ptl_ping(int argc, char **argv) { int rc; diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c index 8c56d93..c083e48 100644 --- a/lustre/portals/utils/ptlctl.c +++ b/lustre/portals/utils/ptlctl.c @@ -30,9 +30,14 @@ command_t list[] = { {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, - {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: | for tcp/elan respectively)"}, - {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname]"}, - {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname]"}, + {"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"}, + {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixs])"}, + {"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"}, + {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"}, + {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [xi])"}, + {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"}, + {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [nid]"}, + {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits (no args)"}, {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, {"shownid", jt_ptl_shownid, 0, "print the local NID"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, -- 1.8.3.1