*/
#define DEBUG_PORTAL_ALLOC
-#define EXPORT_SYMTAB
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
#include <linux/config.h>
#include <linux/module.h>
#include <asm/system.h>
#include <asm/uaccess.h>
+#include <linux/init.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/list.h>
#include <linux/kmod.h>
+#include <linux/sysctl.h>
#include <asm/uaccess.h>
#include <asm/segment.h>
#include <asm/div64.h>
#include <linux/kp30.h>
#include <portals/p30.h>
#include <portals/lib-p30.h>
+#include <portals/socknal.h>
#if CONFIG_SMP
-# define SOCKNAL_N_SCHED smp_num_cpus /* # socknal schedulers */
+# define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */
#else
# define SOCKNAL_N_SCHED 1 /* # socknal schedulers */
#endif
#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
#define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
-#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */
+/* default vals for runtime tunables */
+#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+#define SOCKNAL_EAGER_ACK 0 /* default eager ack (boolean) */
+#define SOCKNAL_TYPED_CONNS 1 /* unidirectional large, bidirectional small? */
+#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */
+#define SOCKNAL_MIN_BULK (1<<10) /* smallest "large" message */
+#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */
#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
-#if PTL_LARGE_MTU
-# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */
-#else
-# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */
-#endif
-
-#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */
-
-#define SOCKNAL_NLTXS 128 /* # normal transmit messages */
-#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */
-
#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */
#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */
#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */
-#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN(PTL_MTU) >> PAGE_SHIFT)
/* # pages in a large message fwd buffer */
#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */
+#define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */
#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10)
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# define jiffies_64 jiffies
-#endif
-
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
# define sk_data_ready data_ready
# define sk_write_space write_space
# define sk_socket socket
#endif
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
+# define sk_wmem_queued wmem_queued
+#endif
+
typedef struct /* pool of forwarding buffers */
{
spinlock_t fmp_lock; /* serialise */
- struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */
+ struct list_head fmp_idle_fmbs; /* idle (free) buffers */
struct list_head fmp_blocked_conns; /* connections waiting for a buffer */
+ int fmp_nactive_fmbs; /* # buffers currently in use */
+ int fmp_buff_pages; /* # pages per buffer in this pool */
} ksock_fmb_pool_t;
typedef struct {
int ksnd_init; /* initialisation state */
+ int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */
+ int ksnd_eager_ack; /* make TCP ack eagerly? */
+ int ksnd_typed_conns; /* drive sockets by type? */
+ int ksnd_min_bulk; /* smallest "large" message */
+#if SOCKNAL_ZC
+ unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */
+#endif
+ struct ctl_table_header *ksnd_sysctl; /* sysctl interface */
+ __u64 ksnd_incarnation; /* my epoch */
rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */
struct list_head *ksnd_peers; /* hash table of all my known peers */
kpr_router_t ksnd_router; /* THE router */
- void *ksnd_fmbs; /* all the pre-allocated FMBs */
ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */
ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */
- void *ksnd_ltxs; /* all the pre-allocated LTXs */
- spinlock_t ksnd_idle_ltx_lock; /* serialise ltx alloc/free */
- struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */
- struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
- wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */
- int ksnd_active_ltxs; /* #active ltxs */
+ atomic_t ksnd_nactive_ltxs; /* #active ltxs */
struct list_head ksnd_deathrow_conns; /* conns to be closed */
struct list_head ksnd_zombie_conns; /* conns to be freed */
- wait_queue_head_t ksnd_reaper_waitq; /* reaper sleep here */
+ struct list_head ksnd_enomem_conns; /* conns to be retried */
+ wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */
+ unsigned long ksnd_reaper_waketime; /* when reaper will wake */
spinlock_t ksnd_reaper_lock; /* serialise */
+ int ksnd_enomem_tx; /* test ENOMEM sender */
int ksnd_stall_tx; /* test sluggish sender */
int ksnd_stall_rx; /* test sluggish receiver */
#define SOCKNAL_INIT_ALL 3
/* A packet just assembled for transmission is represented by 1 or more
- * struct iovec fragments and 0 or more ptl_kiov_t fragments. Forwarded
- * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0
- * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1
- * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t
- * fragments.
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more ptl_kiov_t fragments.
*
* On the receive side, initially 1 struct iovec fragment is posted for
- * receive (the header). Once the header has been received, if the message
- * requires forwarding or will be received into mapped memory, up to
- * PTL_MD_MAX_IOV struct iovec fragments describe the target memory.
- * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used.
- */
+ * receive (the header). Once the header has been received, the payload is
+ * received into either struct iovec or ptl_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
struct ksock_conn; /* forward ref */
struct ksock_peer; /* forward ref */
typedef struct /* transmit packet */
{
struct list_head tx_list; /* queue on conn for transmission etc */
- __u64 tx_deadline; /* when (in jiffies) tx times out */
char tx_isfwd; /* forwarding / sourced here */
int tx_nob; /* # packet bytes */
int tx_resid; /* residual bytes */
#endif
} ksock_tx_t;
+typedef struct /* transmit descriptor for a forwarded packet */
+{
+ ksock_tx_t ftx_tx; /* send info */
+ struct iovec ftx_iov; /* iovec for the header */
+} ksock_ftx_t;
+
#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd)
/* network zero copy callback descriptor embedded in ksock_tx_t */
-/* space for the tx frag descriptors: hdr is always 1 iovec
- * and payload is PTL_MD_MAX of either type. */
-typedef struct
-{
- struct iovec hdr;
- union {
- struct iovec iov[PTL_MD_MAX_IOV];
- ptl_kiov_t kiov[PTL_MD_MAX_IOV];
- } payload;
-} ksock_txiovspace_t;
-
typedef struct /* locally transmitted packet */
{
ksock_tx_t ltx_tx; /* send info */
- struct list_head *ltx_idle; /* where to put when idle */
void *ltx_private; /* lib_finalize() callback arg */
void *ltx_cookie; /* lib_finalize() callback arg */
- ksock_txiovspace_t ltx_iov_space; /* where to stash frag descriptors */
ptl_hdr_t ltx_hdr; /* buffer for packet header */
+ int ltx_desc_size; /* bytes allocated for this descriptor */
+ struct iovec ltx_iov[1]; /* iov for hdr + payload */
+ ptl_kiov_t ltx_kiov[0]; /* kiov for payload (zero-length trailing array) */
} ksock_ltx_t;
#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch)
{ /* (socknal->router) */
struct list_head fmb_list; /* queue idle */
kpr_fwd_desc_t fmb_fwd; /* router's descriptor */
- int fmb_npages; /* # pages allocated */
ksock_fmb_pool_t *fmb_pool; /* owning pool */
struct ksock_peer *fmb_peer; /* peer received from */
- struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
- struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+ ptl_hdr_t fmb_hdr; /* message header */
+ ptl_kiov_t fmb_kiov[0]; /* payload frags */
} ksock_fmb_t;
/* space for the rx frag descriptors; we either read a single contiguous
- * header, or PTL_MD_MAX_IOV frags of payload of either type. */
+ * header, or up to PTL_MD_MAX_IOV frags of payload of either type. */
typedef union {
struct iovec iov[PTL_MD_MAX_IOV];
ptl_kiov_t kiov[PTL_MD_MAX_IOV];
__u32 ksnc_ipaddr; /* peer's IP */
int ksnc_port; /* peer's port */
int ksnc_closing; /* being shut down */
-
- /* READER */
+ int ksnc_type; /* type of connection */
+ __u64 ksnc_incarnation; /* peer's incarnation */
+
+ /* reader */
struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */
- __u64 ksnc_rx_deadline; /* when receive times out */
+ unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */
+ int ksnc_rx_started; /* started receiving a message */
int ksnc_rx_ready; /* data ready to read */
int ksnc_rx_scheduled; /* being progressed */
int ksnc_rx_state; /* what is being read */
/* WRITER */
struct list_head ksnc_tx_list; /* where I enq waiting for output space */
struct list_head ksnc_tx_queue; /* packets waiting to be sent */
-#if SOCKNAL_ZC
- struct list_head ksnc_tx_pending; /* zc packets pending callback */
-#endif
+ unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */
atomic_t ksnc_tx_nob; /* # bytes queued */
int ksnc_tx_ready; /* write space */
int ksnc_tx_scheduled; /* being progressed */
} ksock_conn_t;
+#define KSNR_TYPED_ROUTES ((1 << SOCKNAL_CONN_CONTROL) | \
+ (1 << SOCKNAL_CONN_BULK_IN) | \
+ (1 << SOCKNAL_CONN_BULK_OUT))
+
typedef struct ksock_route
{
struct list_head ksnr_list; /* chain on peer route list */
struct ksock_peer *ksnr_peer; /* owning peer */
atomic_t ksnr_refcount; /* # users */
int ksnr_sharecount; /* lconf usage counter */
- __u64 ksnr_timeout; /* when reconnection can happen next */
+ unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */
unsigned int ksnr_retry_interval; /* how long between retries */
__u32 ksnr_ipaddr; /* an IP address for this peer */
int ksnr_port; /* port to connect to */
int ksnr_buffer_size; /* size of socket buffers */
unsigned int ksnr_irq_affinity:1; /* set affinity? */
- unsigned int ksnr_xchange_nids:1; /* do hello protocol? */
- unsigned int ksnr_nonagel:1; /* disable nagle? */
- unsigned int ksnr_connecting; /* autoconnect in progress? */
- unsigned int ksnr_deleted; /* been removed from peer? */
- int ksnr_generation; /* connection incarnation # */
- ksock_conn_t *ksnr_conn; /* NULL/active connection */
+ unsigned int ksnr_eager:1; /* connect eagerly? */
+ unsigned int ksnr_connecting:4; /* autoconnects in progress by type */
+ unsigned int ksnr_connected:4; /* connections established by type */
+ unsigned int ksnr_deleted:1; /* been removed from peer? */
+ int ksnr_conn_count; /* # conns established by this route */
} ksock_route_t;
typedef struct ksock_peer
ptl_nid_t ksnp_nid; /* who's on the other end(s) */
atomic_t ksnp_refcount; /* # users */
int ksnp_closing; /* being closed */
+ int ksnp_error; /* errno on closing last conn */
struct list_head ksnp_conns; /* all active connections */
struct list_head ksnp_routes; /* routes */
struct list_head ksnp_tx_queue; /* waiting packets */
+ unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */
} ksock_peer_t;
-
extern nal_cb_t ksocknal_lib;
extern ksock_nal_data_t ksocknal_data;
extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid);
extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr,
int single, int keep_conn);
-extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route,
- struct socket *sock, int bind_irq);
-extern void ksocknal_close_conn_locked (ksock_conn_t *conn);
-extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn);
+extern int ksocknal_create_conn (ksock_route_t *route,
+ struct socket *sock, int bind_irq, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
extern void ksocknal_terminate_conn (ksock_conn_t *conn);
extern void ksocknal_destroy_conn (ksock_conn_t *conn);
extern void ksocknal_put_conn (ksock_conn_t *conn);
-extern int ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr);
+extern int ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr);
extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch);
extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
extern void ksocknal_fmb_callback (void *arg, int error);
+extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive);
extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
extern int ksocknal_scheduler (void *arg);
extern void ksocknal_write_space(struct sock *sk);
extern int ksocknal_autoconnectd (void *arg);
extern int ksocknal_reaper (void *arg);
-extern int ksocknal_set_linger (struct socket *sock);
+extern int ksocknal_setup_sock (struct socket *sock);
+extern int ksocknal_hello (struct socket *sock,
+ ptl_nid_t *nid, int *type, __u64 *incarnation);