X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fralnd%2Fralnd.h;h=300cf40b92c725351d3d8cf4df1b9e8893a0ec96;hb=ed88907a96ba81d3558e71ade9def98bdc785169;hp=7e437056100aa5120f39c833e6efcce048d4b7dd;hpb=4ab1d51e7bbd98006a21a1655f7e5bffec3cf0d4;p=fs%2Flustre-release.git diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index 7e43705..300cf40 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -51,56 +51,44 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND -#include -#include -#include -#include -#include +#include +#include +#include #include -#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +/* tunables determined at compile time */ +#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define RANAL_N_CONND 4 /* # connection daemons */ +#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ -#define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry (seconds)... */ -#define RANAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ +#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ +#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ -#define RANAL_FMA_MAX_PREFIX 232 /* max size of FMA "Prefix" */ +/* fixed constants */ +#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +#define RANAL_FMA_MAX_PREFIX 232 /* max bytes in FMA "Prefix" we can use */ #define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ -#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ - -#define RANAL_NTX 64 /* # tx descs */ -#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ - * (overflow is a performance hit) */ -#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ -#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ - -/* default vals for runtime tunables */ -#define RANAL_TIMEOUT 30 /* comms timeout (seconds) */ -#define RANAL_LISTENER_TIMEOUT 5 /* listener timeout (seconds) */ -#define RANAL_BACKLOG 127 /* listener's backlog */ -#define RANAL_PORT 988 /* listener's port */ -#define RANAL_MAX_IMMEDIATE (2<<10) /* immediate payload breakpoint */ - -typedef struct +typedef struct { - int kra_timeout; /* comms timeout (seconds) */ - int kra_listener_timeout; /* max time the listener can block */ - int kra_backlog; /* listener's backlog */ - int kra_port; /* listener's TCP/IP port */ - int kra_max_immediate; /* immediate payload breakpoint */ - + int *kra_n_connd; /* # connection daemons */ + int *kra_min_reconnect_interval; /* first failed connection retry... */ + int *kra_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kra_ntx; /* # tx descs */ + int *kra_credits; /* # concurrent sends */ + int *kra_peercredits; /* # concurrent sends to 1 peer */ + int *kra_fma_cq_size; /* # entries in receive CQ */ + int *kra_timeout; /* comms timeout (seconds) */ + int *kra_max_immediate; /* immediate payload breakpoint */ + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM struct ctl_table_header *kra_sysctl; /* sysctl interface */ +#endif } kra_tunables_t; typedef struct @@ -111,24 +99,24 @@ typedef struct int rad_id; /* device id */ int rad_idx; /* index in kra_devices */ int rad_ready; /* set by device callback */ - struct list_head rad_connq; /* connections requiring attention */ - struct list_head rad_zombies; /* connections to free */ + struct list_head rad_ready_conns;/* connections ready to tx/rx */ + struct list_head rad_new_conns; /* new connections to complete */ wait_queue_head_t rad_waitq; /* scheduler waits here */ spinlock_t rad_lock; /* serialise */ void *rad_scheduler; /* scheduling thread */ + unsigned int rad_nphysmap; /* # phys mappings */ + unsigned int rad_nppphysmap; /* # phys pages mapped */ + unsigned int rad_nvirtmap; /* # virt mappings */ + unsigned long rad_nobvirtmap; /* # virt bytes mapped */ } kra_device_t; - -typedef struct + +typedef struct { int kra_init; /* initialisation state */ int kra_shutdown; /* shut down? */ atomic_t kra_nthreads; /* # live threads */ - - struct semaphore kra_nid_mutex; /* serialise NID/listener ops */ - struct semaphore kra_listener_signal; /* block for listener startup/shutdown */ - struct socket *kra_listener_sock; /* listener's socket */ - int kra_listener_shutdown; /* ask listener to close */ - + lnet_ni_t *kra_ni; /* _the_ nal instance */ + kra_device_t kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq etc */ int kra_ndevs; /* # devices */ @@ -137,6 +125,7 @@ typedef struct struct list_head *kra_peers; /* hash table of all my known peers */ int kra_peer_hash_size; /* size of kra_peers */ atomic_t kra_npeers; /* # peers extant */ + int kra_nonewpeers; /* prevent new peers */ struct list_head *kra_conns; /* conns hashed by cqid */ int kra_conn_hash_size; /* size of kra_conns */ @@ -148,23 +137,20 @@ typedef struct long kra_new_min_timeout; /* minimum timeout on any new conn */ wait_queue_head_t kra_reaper_waitq; /* reaper sleeps here */ spinlock_t kra_reaper_lock; /* serialise */ - + struct list_head kra_connd_peers; /* peers waiting for a connection */ struct list_head kra_connd_acceptq; /* accepted sockets to handshake */ wait_queue_head_t kra_connd_waitq; /* connection daemons sleep here */ spinlock_t kra_connd_lock; /* serialise */ struct list_head kra_idle_txs; /* idle tx descriptors */ - struct list_head kra_idle_nblk_txs; /* idle reserved tx descriptors */ __u64 kra_next_tx_cookie; /* RDMA completion cookie */ - wait_queue_head_t kra_idle_tx_waitq; /* block here for tx descriptor */ spinlock_t kra_tx_lock; /* serialise */ } kra_data_t; #define RANAL_INIT_NOTHING 0 #define RANAL_INIT_DATA 1 -#define RANAL_INIT_LIB 2 -#define RANAL_INIT_ALL 3 +#define RANAL_INIT_ALL 2 typedef struct kra_acceptsock /* accepted socket queued for connd */ { @@ -199,13 +185,13 @@ typedef struct typedef struct { - ptl_hdr_t raim_hdr; /* portals header */ + lnet_hdr_t raim_hdr; /* portals header */ /* Portals payload is in FMA "Message Data" */ } kra_immediate_msg_t; typedef struct { - ptl_hdr_t raprm_hdr; /* portals header */ + lnet_hdr_t raprm_hdr; /* portals header */ __u64 raprm_cookie; /* opaque completion cookie */ } kra_putreq_msg_t; @@ -218,7 +204,7 @@ typedef struct typedef struct { - ptl_hdr_t ragm_hdr; /* portals header */ + lnet_hdr_t ragm_hdr; /* portals header */ __u64 ragm_cookie; /* opaque completion cookie */ kra_rdma_desc_t ragm_desc; /* sender's sink buffer */ } kra_get_msg_t; @@ -245,7 +231,7 @@ typedef struct /* NB must fit in FMA "Prefix" * __u32 ram_seq; /* incrementing sequence number */ } kra_msg_t; -#define RANAL_MSG_MAGIC 0x0be91b92 /* unique magic */ +#define RANAL_MSG_MAGIC LNET_PROTO_RA_MAGIC /* unique magic */ #define RANAL_MSG_VERSION 1 /* current protocol version */ #define RANAL_MSG_FENCE 0x80 /* fence RDMA */ @@ -268,9 +254,8 @@ typedef struct kra_tx /* message descriptor */ { struct list_head tx_list; /* queue on idle_txs/rac_sendq/rac_waitq */ struct kra_conn *tx_conn; /* owning conn */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - unsigned long tx_qtime; /* when tx started to wait for something */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ + lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ + unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ int tx_nob; /* # bytes of payload */ int tx_buftype; /* payload buffer type */ void *tx_buffer; /* source/sink buffer */ @@ -290,37 +275,22 @@ typedef struct kra_tx /* message descriptor */ #define RANAL_BUF_VIRT_UNMAPPED 4 /* virtual: not mapped yet */ #define RANAL_BUF_VIRT_MAPPED 5 /* virtual: mapped already */ -#define RANAL_TX_IDLE 0x00 /* on freelist */ -#define RANAL_TX_SIMPLE 0x10 /* about to send a simple message */ -#define RANAL_TX_PUTI_REQ 0x20 /* PUT initiator about to send PUT_REQ */ -#define RANAL_TX_PUTI_WAIT_ACK 0x21 /* PUT initiator waiting for PUT_ACK */ -#define RANAL_TX_PUTI_RDMA 0x22 /* PUT initiator waiting for RDMA to complete */ -#define RANAL_TX_PUTI_DONE 0x23 /* PUT initiator about to send PUT_DONE */ -#define RANAL_TX_PUTT_NAK 0x30 /* PUT target about to send PUT_NAK */ -#define RANAL_TX_PUTT_ACK 0x30 /* PUT target about to send PUT_ACK */ -#define RANAL_TX_PUTT_WAIT_DONE 0x31 /* PUT target waiting for PUT_DONE */ -#define RANAL_TX_GETI_REQ 0x40 /* GET initiator about to send GET_REQ */ -#define RANAL_TX_GETI_WAIT_DONE 0x41 /* GET initiator waiting for GET_DONE */ -#define RANAL_TX_GETT_NAK 0x50 /* GET target about to send PUT_NAK */ -#define RANAL_TX_GETT_RDMA 0x51 /* GET target waiting for RDMA to complete */ -#define RANAL_TX_GETT_DONE 0x52 /* GET target about to send GET_DONE */ - typedef struct kra_conn -{ +{ struct kra_peer *rac_peer; /* owning peer */ struct list_head rac_list; /* stash on peer's conn list */ struct list_head rac_hashlist; /* stash in connection hash table */ - struct list_head rac_schedlist; /* schedule (on rad_connq) for attention */ + struct list_head rac_schedlist; /* schedule (on rad_???_conns) for attention */ struct list_head rac_fmaq; /* txs queued for FMA */ struct list_head rac_rdmaq; /* txs awaiting RDMA completion */ struct list_head rac_replyq; /* txs awaiting replies */ __u64 rac_peerstamp; /* peer's unique stamp */ __u64 rac_peer_connstamp; /* peer's unique connection stamp */ __u64 rac_my_connstamp; /* my unique connection stamp */ - unsigned long rac_last_tx; /* when I last sent an FMA message */ - unsigned long rac_last_rx; /* when I last received an FMA messages */ - long rac_keepalive; /* keepalive interval */ - long rac_timeout; /* infer peer death on (last_rx + timout > now) */ + unsigned long rac_last_tx; /* when I last sent an FMA message (jiffies) */ + unsigned long rac_last_rx; /* when I last received an FMA messages (jiffies) */ + long rac_keepalive; /* keepalive interval (seconds) */ + long rac_timeout; /* infer peer death if no rx for this many seconds */ __u32 rac_cqid; /* my completion callback id (non-unique) */ __u32 rac_tx_seq; /* tx msg sequence number */ __u32 rac_rx_seq; /* rx msg sequence number */ @@ -346,7 +316,7 @@ typedef struct kra_peer struct list_head rap_connd_list; /* schedule on kra_connd_peers */ struct list_head rap_conns; /* all active connections */ struct list_head rap_tx_queue; /* msgs waiting for a conn */ - ptl_nid_t rap_nid; /* who's on the other end(s) */ + lnet_nid_t rap_nid; /* who's on the other end(s) */ __u32 rap_ip; /* IP address of peer */ int rap_port; /* port on which peer listens */ atomic_t rap_refcount; /* # users */ @@ -356,20 +326,6 @@ typedef struct kra_peer unsigned long rap_reconnect_interval; /* exponential backoff */ } kra_peer_t; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep -#endif - -extern lib_nal_t kranal_lib; extern kra_data_t kranal_data; extern kra_tunables_t kranal_tunables; @@ -379,7 +335,7 @@ extern void kranal_destroy_conn(kra_conn_t *conn); static inline void kranal_peer_addref(kra_peer_t *peer) { - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); LASSERT(atomic_read(&peer->rap_refcount) > 0); atomic_inc(&peer->rap_refcount); } @@ -387,17 +343,17 @@ kranal_peer_addref(kra_peer_t *peer) static inline void kranal_peer_decref(kra_peer_t *peer) { - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); LASSERT(atomic_read(&peer->rap_refcount) > 0); if (atomic_dec_and_test(&peer->rap_refcount)) kranal_destroy_peer(peer); } static inline struct list_head * -kranal_nid2peerlist (ptl_nid_t nid) +kranal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size; - + return (&kranal_data.kra_peers[hash]); } @@ -411,7 +367,8 @@ kranal_peer_active(kra_peer_t *peer) static inline void kranal_conn_addref(kra_conn_t *conn) { - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid)); LASSERT(atomic_read(&conn->rac_refcount) > 0); atomic_inc(&conn->rac_refcount); } @@ -419,34 +376,35 @@ kranal_conn_addref(kra_conn_t *conn) static inline void kranal_conn_decref(kra_conn_t *conn) { - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid)); LASSERT(atomic_read(&conn->rac_refcount) > 0); if (atomic_dec_and_test(&conn->rac_refcount)) kranal_destroy_conn(conn); } static inline struct list_head * -kranal_cqid2connlist (__u32 cqid) +kranal_cqid2connlist (__u32 cqid) { unsigned int hash = cqid % kranal_data.kra_conn_hash_size; - + return (&kranal_data.kra_conns [hash]); } static inline kra_conn_t * -kranal_cqid2conn_locked (__u32 cqid) +kranal_cqid2conn_locked (__u32 cqid) { struct list_head *conns = kranal_cqid2connlist(cqid); struct list_head *tmp; kra_conn_t *conn; - + list_for_each(tmp, conns) { conn = list_entry(tmp, kra_conn_t, rac_hashlist); - + if (conn->rac_cqid == cqid) return conn; } - + return NULL; } @@ -457,30 +415,41 @@ kranal_tx_mapped (kra_tx_t *tx) tx->tx_buftype == RANAL_BUF_PHYS_MAPPED); } -static inline __u64 -kranal_page2phys (struct page *p) -{ - return page_to_phys(p); -} +int kranal_startup (lnet_ni_t *ni); +void kranal_shutdown (lnet_ni_t *ni); +int kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kranal_eager_recv(lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kranal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int kranal_accept(lnet_ni_t *ni, struct socket *sock); extern void kranal_free_acceptsock (kra_acceptsock_t *ras); -extern int kranal_listener_procint (ctl_table *table, - int write, struct file *filp, +extern int kranal_listener_procint (ctl_table *table, + int write, struct file *filp, void *buffer, size_t *lenp); extern void kranal_update_reaper_timeout (long timeout); extern void kranal_tx_done (kra_tx_t *tx, int completion); extern void kranal_unlink_peer_locked (kra_peer_t *peer); extern void kranal_schedule_conn (kra_conn_t *conn); -extern kra_peer_t *kranal_create_peer (ptl_nid_t nid); -extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid); +extern int kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid); +extern int kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port); +extern kra_peer_t *kranal_find_peer_locked (lnet_nid_t nid); extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx); -extern int kranal_del_peer (ptl_nid_t nid, int single_share); +extern int kranal_del_peer (lnet_nid_t nid); extern void kranal_device_callback (RAP_INT32 devid, RAP_PVOID arg); extern int kranal_thread_start (int(*fn)(void *arg), void *arg); extern int kranal_connd (void *arg); extern int kranal_reaper (void *arg); extern int kranal_scheduler (void *arg); extern void kranal_close_conn_locked (kra_conn_t *conn, int error); +extern void kranal_close_conn (kra_conn_t *conn, int error); extern void kranal_terminate_conn_locked (kra_conn_t *conn); extern void kranal_connect (kra_peer_t *peer); extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer); +extern int kranal_tunables_init(void); +extern void kranal_tunables_fini(void); +extern void kranal_init_msg(kra_msg_t *msg, int type);