From 0fa1ef29bd36f0ec08e4d9958180604eba32c88e Mon Sep 17 00:00:00 2001 From: pjkirner Date: Tue, 20 Sep 2005 17:00:04 +0000 Subject: [PATCH] b=7981 * Landing of b_newconfig_rdmarouting * Passed sanity.sh * 9348 is still open, but this landing hasn't introduced it. --- lnet/autoconf/lustre-lnet.m4 | 6 +- lnet/include/lnet/lib-lnet.h | 247 +++--- lnet/include/lnet/lib-types.h | 201 +++-- lnet/klnds/gmlnd/gmlnd.h | 10 +- lnet/klnds/gmlnd/gmlnd_api.c | 4 +- lnet/klnds/gmlnd/gmlnd_cb.c | 9 +- lnet/klnds/gmlnd/gmlnd_comm.c | 52 +- lnet/klnds/gmlnd/gmlnd_module.c | 12 +- lnet/klnds/gmlnd/gmlnd_utils.c | 13 +- lnet/klnds/iiblnd/iiblnd.c | 21 +- lnet/klnds/iiblnd/iiblnd.h | 10 +- lnet/klnds/iiblnd/iiblnd_cb.c | 102 +-- lnet/klnds/openiblnd/openiblnd.c | 23 +- lnet/klnds/openiblnd/openiblnd.h | 14 +- lnet/klnds/openiblnd/openiblnd_cb.c | 113 +-- lnet/klnds/openiblnd/openiblnd_modparams.c | 19 +- lnet/klnds/qswlnd/qswlnd.c | 37 +- lnet/klnds/qswlnd/qswlnd.h | 17 +- lnet/klnds/qswlnd/qswlnd_cb.c | 210 +++--- lnet/klnds/qswlnd/qswlnd_modparams.c | 29 +- lnet/klnds/ralnd/ralnd.c | 32 +- lnet/klnds/ralnd/ralnd.h | 11 +- lnet/klnds/ralnd/ralnd_cb.c | 100 +-- lnet/klnds/ralnd/ralnd_modparams.c | 25 +- lnet/klnds/socklnd/socklnd.c | 13 +- lnet/klnds/socklnd/socklnd.h | 4 + lnet/klnds/socklnd/socklnd_cb.c | 8 +- lnet/klnds/socklnd/socklnd_lib-linux.c | 6 + lnet/klnds/socklnd/socklnd_modparams.c | 10 + lnet/klnds/viblnd/viblnd.c | 21 +- lnet/klnds/viblnd/viblnd.h | 14 +- lnet/klnds/viblnd/viblnd_cb.c | 119 ++- lnet/klnds/viblnd/viblnd_modparams.c | 35 +- lnet/libcfs/nidstrings.c | 44 +- lnet/libcfs/tracefile.c | 8 +- lnet/lnet/Makefile.in | 2 +- lnet/lnet/api-ni.c | 140 ++-- lnet/lnet/autoMakefile.am | 2 +- lnet/lnet/config.c | 3 +- lnet/lnet/lib-eq.c | 29 +- lnet/lnet/lib-md.c | 23 +- lnet/lnet/lib-me.c | 17 +- lnet/lnet/lib-move.c | 1114 +++++++++++++++++++++------- lnet/lnet/lib-msg.c | 107 ++- lnet/lnet/lo.c | 6 +- lnet/lnet/peer.c | 210 ++++++ lnet/lnet/router.c | 881 
++++++++++------------ lnet/lnet/router.h | 87 --- lnet/lnet/router_proc.c | 775 +++++++++++++++---- lnet/ulnds/socklnd/tcplnd.c | 6 +- lnet/utils/lbstats | 14 + lnet/utils/portals.c | 10 +- lnet/utils/routerstat.c | 128 ++-- 53 files changed, 3219 insertions(+), 1934 deletions(-) create mode 100644 lnet/lnet/peer.c delete mode 100644 lnet/lnet/router.h create mode 100755 lnet/utils/lbstats diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 8a4f281..696a269 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -83,9 +83,9 @@ fi # configure support for Portals # AC_DEFUN([LN_CONFIG_PTLLND], -[AC_MSG_CHECKING([for ptllnd]) +[AC_MSG_CHECKING([for Portals API headers]) -if test ! "x$PORTALS" = "x" -o ! "x$LUSTRE_PORTALS" = "x" ; then +if test $PORTALS -o test $LUSTRE_PORTALS ; then AC_MSG_RESULT([yes]) PTLLND="ptllnd" if test $PORTALS ; then @@ -93,10 +93,12 @@ if test ! "x$PORTALS" = "x" -o ! "x$LUSTRE_PORTALS" = "x" ; then else PTLLNDCPPFLAGS="-I$LUSTRE_PORTALS/include" fi + AC_MSG_RESULT([$LUSTRE_PORTALS]) else AC_MSG_RESULT([no]) PTLLND="" PTLLNDCPPFLAGS="" + AC_MSG_RESULT([no]) fi AC_SUBST(PTLLNDCPPFLAGS) AC_SUBST(PTLLND) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index b480333..5065258 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -38,16 +38,16 @@ static inline int lnet_md_exhausted (lnet_libmd_t *md) } #ifdef __KERNEL__ -#define LNET_LOCK(flags) \ - spin_lock_irqsave(&the_lnet.ln_lock, flags) -#define LNET_UNLOCK(flags) \ - spin_unlock_irqrestore(&the_lnet.ln_lock, flags) +#define LNET_LOCK() \ + spin_lock(&the_lnet.ln_lock) +#define LNET_UNLOCK() \ + spin_unlock(&the_lnet.ln_lock) #define LNET_MUTEX_DOWN(m) mutex_down(m) #define LNET_MUTEX_UP(m) mutex_up(m) #else -#define LNET_LOCK(flags) \ - (pthread_mutex_lock(&the_lnet.ln_mutex), (flags) = 0) -#define LNET_UNLOCK(flags) \ +#define LNET_LOCK() \ + pthread_mutex_lock(&the_lnet.ln_mutex) 
+#define LNET_UNLOCK() \ pthread_mutex_unlock(&the_lnet.ln_mutex) #define LNET_MUTEX_DOWN(m) pthread_mutex_lock(m) #define LNET_MUTEX_UP(m) pthread_mutex_unlock(m) @@ -90,12 +90,11 @@ static inline lnet_eq_t * lnet_eq_alloc (void) { /* NEVER called with liblock held */ - unsigned long flags; lnet_eq_t *eq; - LNET_LOCK(flags); + LNET_LOCK(); eq = (lnet_eq_t *)lnet_freelist_alloc(&the_lnet.ln_free_eqs); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (eq); } @@ -111,12 +110,11 @@ static inline lnet_libmd_t * lnet_md_alloc (lnet_md_t *umd) { /* NEVER called with liblock held */ - unsigned long flags; lnet_libmd_t *md; - LNET_LOCK(flags); + LNET_LOCK(); md = (lnet_libmd_t *)lnet_freelist_alloc(&the_lnet.ln_free_mds); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (md); } @@ -132,12 +130,11 @@ static inline lnet_me_t * lnet_me_alloc (void) { /* NEVER called with liblock held */ - unsigned long flags; lnet_me_t *me; - LNET_LOCK(flags); + LNET_LOCK(); me = (lnet_me_t *)lnet_freelist_alloc(&the_lnet.ln_free_mes); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (me); } @@ -153,17 +150,15 @@ static inline lnet_msg_t * lnet_msg_alloc (void) { /* NEVER called with liblock held */ - unsigned long flags; lnet_msg_t *msg; - LNET_LOCK(flags); + LNET_LOCK(); msg = (lnet_msg_t *)lnet_freelist_alloc(&the_lnet.ln_free_msgs); - LNET_UNLOCK(flags); + LNET_UNLOCK(); if (msg != NULL) { /* NULL pointers, clear flags etc */ memset (msg, 0, sizeof (*msg)); - msg->msg_ack_wmd = LNET_WIRE_HANDLE_NONE; } return(msg); } @@ -172,6 +167,7 @@ static inline void lnet_msg_free (lnet_msg_t *msg) { /* ALWAYS called with liblock held */ + LASSERT (!msg->msg_onactivelist); lnet_freelist_free(&the_lnet.ln_free_msgs, msg); } @@ -259,15 +255,11 @@ lnet_msg_alloc(void) /* NEVER called with liblock held; may be in interrupt... 
*/ lnet_msg_t *msg; - if (in_interrupt()) - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - else - PORTAL_ALLOC(msg, sizeof(*msg)); + PORTAL_ALLOC(msg, sizeof(*msg)); if (msg != NULL) { /* NULL pointers, clear flags etc */ memset (msg, 0, sizeof (*msg)); - msg->msg_ack_wmd = LNET_WIRE_HANDLE_NONE; } return (msg); } @@ -276,6 +268,7 @@ static inline void lnet_msg_free(lnet_msg_t *msg) { /* ALWAYS called with liblock held */ + LASSERT (!msg->msg_onactivelist); PORTAL_FREE(msg, sizeof(*msg)); } #endif @@ -360,22 +353,23 @@ lnet_handle2me (lnet_handle_me_t *handle) return (lh_entry (lh, lnet_me_t, me_lh)); } -/******************************************************************************/ -/* Portals Router */ - -/* NI APIs */ -int lnet_forwarding(void); -lnet_nid_t lnet_lookup(lnet_ni_t **ni, lnet_nid_t nid, int nob); -int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when); +static inline void +lnet_peer_addref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + lp->lp_refcount++; +} -/* internal APIs */ -int kpr_distance(lnet_nid_t nid, int *order); -int kpr_ctl(unsigned int cmd, void *arg); -int kpr_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid); -int kpr_initialise(void); -void kpr_finalise(void); +extern void lnet_destroy_peer_locked(lnet_peer_t *lp); -/******************************************************************************/ +static inline void +lnet_peer_decref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + lp->lp_refcount--; + if (lp->lp_refcount == 0) + lnet_destroy_peer_locked(lp); +} static inline void lnet_ni_addref_locked(lnet_ni_t *ni) @@ -387,11 +381,9 @@ lnet_ni_addref_locked(lnet_ni_t *ni) static inline void lnet_ni_addref(lnet_ni_t *ni) { - unsigned long flags; - - LNET_LOCK(flags); + LNET_LOCK(); lnet_ni_addref_locked(ni); - LNET_UNLOCK(flags); + LNET_UNLOCK(); } static inline void @@ -406,11 +398,9 @@ lnet_ni_decref_locked(lnet_ni_t *ni) static inline void lnet_ni_decref(lnet_ni_t *ni) { - 
unsigned long flags; - - LNET_LOCK(flags); + LNET_LOCK(); lnet_ni_decref_locked(ni); - LNET_UNLOCK(flags); + LNET_UNLOCK(); } static inline lnet_nid_t @@ -433,46 +423,87 @@ lnet_ptlcompat_matchnid(lnet_nid_t lnet_nid, lnet_nid_t ptl_nid) PTL_NIDADDR(ptl_nid) == PTL_NIDADDR(lnet_nid))); } +static inline struct list_head * +lnet_nid2peerhash (lnet_nid_t nid) +{ + unsigned int idx = PTL_NIDADDR(nid) % LNET_PEER_HASHSIZE; + + return &the_lnet.ln_peer_hash[idx]; +} + extern lnd_t the_lolnd; extern lnet_ni_t *lnet_loni; -extern lnet_ni_t *lnet_net2ni (__u32 net); -extern int lnet_islocalnid (lnet_nid_t nid); -extern int lnet_islocalnet (__u32 net, int *orderp); -extern void lnet_enq_event_locked (void *private, - lnet_eq_t *eq, lnet_event_t *ev); -extern void lnet_finalize (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int rc); -extern int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr, void *private); -extern lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg); -extern void lnet_print_hdr (lnet_hdr_t * hdr); -extern int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); - -extern unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov); -extern int lnet_extract_iov (int dst_niov, struct iovec *dst, - int src_niov, struct iovec *src, - unsigned int offset, unsigned int len); - -extern unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov); -extern int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, - unsigned int offset, unsigned int len); - -extern void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, - unsigned int doffset, - unsigned int nsiov, struct iovec *siov, - unsigned int soffset, unsigned int nob); -extern void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, - unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, - unsigned int kiovoffset, unsigned int nob); -extern void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, - unsigned int kiovoffset, 
- unsigned int niov, struct iovec *iov, - unsigned int iovoffset, unsigned int nob); -extern void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, - unsigned int doffset, - unsigned int nskiov, lnet_kiov_t *skiov, - unsigned int soffset, unsigned int nob); +extern lnet_ni_t *lnet_net2ni_locked (__u32 net); +static inline lnet_ni_t * +lnet_net2ni (__u32 net) +{ + lnet_ni_t *ni; + + LNET_LOCK(); + ni = lnet_net2ni_locked(net); + LNET_UNLOCK(); + + return ni; +} + +int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when); +int lnet_distance(lnet_nid_t nid, int *order); +int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid); +int lnet_del_route (__u32 net, lnet_nid_t gw_nid); +int lnet_get_route (int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive); +void lnet_proc_init(void); +void lnet_proc_fini(void); +int lnet_router_init(void); +void lnet_router_fini(void); +lnet_remotenet_t *lnet_find_net_locked (__u32 net); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net, int *orderp); + +void lnet_enq_event_locked(lnet_eq_t *eq, lnet_event_t *ev); +void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len); +int lnet_send(lnet_ni_t *ni, lnet_msg_t *msg); +void lnet_return_credits_locked (lnet_msg_t *msg); +int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr, + lnet_nid_t fromnid, void *private); +void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen); +lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg); +void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc); + +char *lnet_msgtyp2str (int type); +void lnet_print_hdr (lnet_hdr_t * hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov); +int lnet_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct 
iovec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, + unsigned int doffset, + unsigned int nsiov, struct iovec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, + unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); static inline void lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, @@ -512,35 +543,35 @@ lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset 1, &siov, soffset, nob); } -extern int lnet_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg, - int type, lnet_process_id_t target, - lnet_libmd_t *md, unsigned int offset, unsigned int len); +void lnet_me_unlink(lnet_me_t *me); -extern void lnet_me_unlink(lnet_me_t *me); +void lnet_md_unlink(lnet_libmd_t *md); +void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); -extern void lnet_md_unlink(lnet_libmd_t *md); -extern void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); - -extern void lnet_register_lnd(lnd_t *lnd); -extern void lnet_unregister_lnd(lnd_t *lnd); -extern int lnet_set_ip_niaddr (lnet_ni_t *ni); +void lnet_register_lnd(lnd_t *lnd); +void lnet_unregister_lnd(lnd_t *lnd); +int lnet_set_ip_niaddr (lnet_ni_t *ni); #ifdef __KERNEL__ -extern int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, - __u32 
local_ip, __u32 peer_ip, int peer_port); -extern void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, - __u32 peer_ip, int port); - -extern int lnet_count_acceptor_nis(lnet_ni_t **first_ni); - -extern int lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic); -extern int lnet_acceptor_timeout(void); -extern int lnet_acceptor_port(void); +int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nis(lnet_ni_t **first_ni); +int lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); #endif -extern int lnet_acceptor_start(void); -extern void lnet_acceptor_stop(void); +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +int lnet_parse_routes (char *route_str); +int lnet_parse_networks (struct list_head *nilist, char *networks); -extern int lnet_parse_routes (char *route_str); -extern int lnet_parse_networks (struct list_head *nilist, char *networks); +int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid); +lnet_peer_t *lnet_find_peer_locked (lnet_nid_t nid); +void lnet_clear_peer_table(void); +void lnet_destroy_peer_table(void); +int lnet_create_peer_table(void); #endif diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 4f52907..cfb830e 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -135,23 +135,41 @@ struct lnet_libmd; typedef struct lnet_msg { struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits */ - __u32 msg_type; lnet_process_id_t msg_target; - int msg_target_is_router:1; - int msg_routing:1; - lnet_hdr_t msg_hdr; + __u32 msg_type; + + int msg_target_is_router:1; /* sending to a router */ + int msg_routing:1; /* being forwarded */ + int msg_ack:1; /* ack on finalize (PUT) */ + int msg_sending:1; 
/* outgoing message */ + int msg_receiving:1; /* being received */ + int msg_recvaftersend:1; /* lnd_recv() outstanding */ + int msg_delayed:1; /* had to Q for buffer or tx credit */ + int msg_txcredit:1; /* taken an NI send credit */ + int msg_peertxcredit:1; /* taken a peer send credit */ + int msg_rtrcredit:1; /* taken a globel router credit */ + int msg_peerrtrcredit:1; /* taken a peer router credit */ + int msg_onactivelist:1; /* on the activelist */ + + struct lnet_peer *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer *msg_rxpeer; /* peer I received from (routed only) */ + + void *msg_private; + struct lnet_libmd *msg_md; + unsigned int msg_len; unsigned int msg_offset; unsigned int msg_niov; struct iovec *msg_iov; lnet_kiov_t *msg_kiov; - struct lnet_libmd *msg_md; - lnet_handle_wire_t msg_ack_wmd; lnet_event_t msg_ev; + lnet_hdr_t msg_hdr; } lnet_msg_t; + typedef struct lnet_libhandle { struct list_head lh_hash_chain; __u64 lh_cookie; @@ -272,12 +290,22 @@ typedef struct lnet_lnd /* Start receiving 'mlen' bytes of payload data, skipping the following * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to * lnet_parse(). Return non-zero for immedaite failure, otherwise - * complete later with lnet_finalize() */ + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, int delayed, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen); + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called. 
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + /* notification of peer health */ void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); @@ -291,79 +319,146 @@ typedef struct lnet_lnd typedef struct lnet_ni { struct list_head ni_list; /* chain on ln_nis */ + struct list_head ni_txq; /* messages waiting for tx credits */ + int ni_maxtxcredits; /* # tx credits */ + int ni_txcredits; /* # tx credits free */ + int ni_mintxcredits; /* lowest it's been */ + int ni_peertxcredits; /* # per-peer send credits */ lnet_nid_t ni_nid; /* interface's NID */ void *ni_data; /* instance-specific data */ lnd_t *ni_lnd; /* procedural interface */ - int ni_shutdown; /* shutting down? */ int ni_refcount; /* reference count */ char *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */ } lnet_ni_t; +typedef struct lnet_peer { + struct list_head lp_hashlist; /* chain on peer hash */ + struct list_head lp_txq; /* messages blocking for tx credits */ + struct list_head lp_rtrq; /* messages blocking for router credits */ + int lp_txcredits; /* # tx credits available */ + int lp_mintxcredits; /* low water mark */ + int lp_rtrcredits; /* # router credits */ + int lp_minrtrcredits; /* low water mark */ + int lp_alive; /* enabled? 
*/ + long lp_txqnob; /* bytes queued for sending */ + time_t lp_timestamp; /* time of last aliveness news */ + lnet_ni_t *lp_ni; /* interface peer is on */ + lnet_nid_t lp_nid; /* peer's NID */ + int lp_refcount; /* # refs */ +} lnet_peer_t; + +typedef struct { + struct list_head lr_list; /* chain on net */ + lnet_peer_t *lr_gateway; /* router node */ +} lnet_route_t; + +typedef struct { + struct list_head lrn_list; /* chain on ln_remote_nets */ + struct list_head lrn_routes; /* routes to me */ + __u32 lrn_net; /* my net number */ + unsigned int lrn_hops; /* how far I am */ + lnet_ni_t *lrn_ni; /* local net that sends to me */ +} lnet_remotenet_t; + +typedef struct { + struct list_head rbp_bufs; /* my free buffer pool */ + struct list_head rbp_msgs; /* messages blocking for a buffer */ + int rbp_npages; /* # pages in each buffer */ + int rbp_nbuffers; /* # buffers */ + int rbp_credits; /* # free buffers / blocked messages */ + int rbp_mincredits; /* low water mark */ +} lnet_rtrbufpool_t; + +typedef struct { + struct list_head rb_list; /* chain on rbp_bufs */ + lnet_rtrbufpool_t *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +} lnet_rtrbuf_t; + +typedef struct { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; + __u64 route_length; + __u64 drop_length; +} lnet_counters_t; + +#define LNET_PEER_HASHSIZE 503 /* prime! */ + +#define LNET_NRBPOOLS 3 /* # different router buffer pools */ + typedef struct { /* Stuff initialised at LNetInit() */ - int ln_init; /* LNetInit() called? */ - int ln_refcount; /* LNetNIInit/LNetNIFini counter */ - int ln_niinit_self; /* Have I called LNetNIInit myself? */ + int ln_init; /* LNetInit() called? */ + int ln_refcount; /* LNetNIInit/LNetNIFini counter */ + int ln_niinit_self; /* Have I called LNetNIInit myself? 
*/ - int ln_ptlcompat; /* support talking to portals */ + int ln_ptlcompat; /* do I support talking to portals? */ - struct list_head ln_lnds; /* registered NALs */ + struct list_head ln_lnds; /* registered NALs */ #ifdef __KERNEL__ - spinlock_t ln_lock; - cfs_waitq_t ln_waitq; - struct semaphore ln_api_mutex; - struct semaphore ln_lnd_mutex; + spinlock_t ln_lock; + cfs_waitq_t ln_waitq; + struct semaphore ln_api_mutex; + struct semaphore ln_lnd_mutex; #else - pthread_mutex_t ln_mutex; - pthread_cond_t ln_cond; - pthread_mutex_t ln_api_mutex; - pthread_mutex_t ln_lnd_mutex; + pthread_mutex_t ln_mutex; + pthread_cond_t ln_cond; + pthread_mutex_t ln_api_mutex; + pthread_mutex_t ln_lnd_mutex; #endif /* Stuff initialised at LNetNIInit() */ - int ln_nportals; /* # portals */ - struct list_head *ln_portals; /* the vector of portals */ + int ln_shutdown; /* shutdown in progress */ + int ln_nportals; /* # portals */ + struct list_head *ln_portals; /* the vector of portals */ + + lnet_pid_t ln_pid; /* requested pid */ - lnet_pid_t ln_pid; /* requested pid */ + struct list_head ln_nis; /* NAL instances */ + struct list_head ln_zombie_nis; /* dying NAL instances */ + int ln_nzombie_nis; /* # of NIS to wait for */ - struct list_head ln_nis; /* NAL instances */ - struct list_head ln_zombie_nis; /* dying NAL instances */ - int ln_nzombie_nis; /* # of NIS to wait for */ + struct list_head ln_remote_nets; /* remote networks with routes to them */ + __u64 ln_remote_nets_version; /* validity stamp */ - int ln_lh_hash_size; /* size of lib handle hash table */ - struct list_head *ln_lh_hash_table; /* all extant lib handles, this interface */ - __u64 ln_next_object_cookie; /* cookie generator */ - __u64 ln_interface_cookie; /* uniquely identifies this ni in this epoch */ + struct list_head *ln_peer_hash; /* NID->peer hash */ + int ln_npeers; /* # peers extant */ + int ln_peertable_version; /* /proc validity stamp */ + + int ln_routing; /* am I a router? 
*/ + lnet_rtrbufpool_t ln_rtrpools[LNET_NRBPOOLS]; /* router buffer pools */ + + int ln_lh_hash_size; /* size of lib handle hash table */ + struct list_head *ln_lh_hash_table; /* all extant lib handles, this interface */ + __u64 ln_next_object_cookie; /* cookie generator */ + __u64 ln_interface_cookie; /* uniquely identifies this ni in this epoch */ - char *ln_network_tokens; /* space for network names */ - int ln_network_tokens_nob; + char *ln_network_tokens; /* space for network names */ + int ln_network_tokens_nob; - struct list_head ln_test_peers; + struct list_head ln_test_peers; /* failure simulation */ #ifdef LNET_USE_LIB_FREELIST - lnet_freelist_t ln_free_mes; - lnet_freelist_t ln_free_msgs; - lnet_freelist_t ln_free_mds; - lnet_freelist_t ln_free_eqs; + lnet_freelist_t ln_free_mes; + lnet_freelist_t ln_free_msgs; + lnet_freelist_t ln_free_mds; + lnet_freelist_t ln_free_eqs; #endif - struct list_head ln_active_msgs; - struct list_head ln_active_mds; - struct list_head ln_active_eqs; - - struct { - long recv_count; - long recv_length; - long send_count; - long send_length; - long drop_count; - long drop_length; - long msgs_alloc; - long msgs_max; - } ln_counters; - + struct list_head ln_active_msgs; + struct list_head ln_active_mds; + struct list_head ln_active_eqs; + + lnet_counters_t ln_counters; } lnet_t; #endif diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index be7f8f3..54f5b22 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -82,8 +82,8 @@ /* Default Tunable Values */ #define GMNAL_PORT 4 /* which port to use */ -#define GMNAL_NTX 32 /* # tx descs */ -#define GMNAL_NTX_NBLK 256 /* # reserved tx descs */ +#define GMNAL_NTX 256 /* # tx descs */ +#define GMNAL_NTX_PEER 8 /* # concurrent sends per peer */ #define GMNAL_NRX_SMALL 128 /* # small receives to post */ #define GMNAL_NRX_LARGE 64 /* # large receives to post */ #define GMNAL_NLARGE_TX_BUFS 32 /* # large tx buffers */ @@ -133,7 +133,6 @@ typedef struct 
gmnal_txbuf { typedef struct gmnal_tx { struct list_head tx_list; /* queue */ - int tx_isnblk:1; /* reserved for non-blocking? */ int tx_credit:1; /* consumed a credit? */ int tx_large_iskiov:1; /* large is in kiovs? */ struct gmnal_ni *tx_gmni; /* owning NI */ @@ -187,7 +186,6 @@ typedef struct gmnal_ni { int gmni_shutdown; /* tell all threads to exit */ struct list_head gmni_idle_txs; /* idle tx's */ - struct list_head gmni_nblk_idle_txs; /* reserved for non-blocking callers */ wait_queue_head_t gmni_idle_tx_wait; /* block here for idle tx */ int gmni_tx_credits; /* # transmits still possible */ struct list_head gmni_idle_ltxbs; /* idle large tx buffers */ @@ -202,7 +200,7 @@ typedef struct gmnal_ni { typedef struct { int *gm_port; int *gm_ntx; - int *gm_ntx_nblk; + int *gm_ntx_peer; int *gm_nlarge_tx_bufs; int *gm_nrx_small; int *gm_nrx_large; @@ -240,7 +238,7 @@ void gmnal_yield(int delay); /* gmnal_comm.c */ void gmnal_post_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx); -gmnal_tx_t *gmnal_get_tx(gmnal_ni_t *gmni, int may_block); +gmnal_tx_t *gmnal_get_tx(gmnal_ni_t *gmni); void gmnal_tx_done(gmnal_tx_t *tx, int rc); void gmnal_pack_msg(gmnal_ni_t *gmni, gmnal_msg_t *msg, lnet_nid_t dstnid, int type); diff --git a/lnet/klnds/gmlnd/gmlnd_api.c b/lnet/klnds/gmlnd/gmlnd_api.c index a4785c0..936c0c1 100644 --- a/lnet/klnds/gmlnd/gmlnd_api.c +++ b/lnet/klnds/gmlnd/gmlnd_api.c @@ -122,6 +122,9 @@ gmnal_startup(lnet_ni_t *ni) int rc; LASSERT (ni->ni_lnd == &the_gmlnd); + + ni->ni_maxtxcredits = *gmnal_tunables.gm_ntx; + ni->ni_peertxcredits = *gmnal_tunables.gm_ntx_peer; if (the_gmni != NULL) { CERROR("Only 1 instance supported\n"); @@ -142,7 +145,6 @@ gmnal_startup(lnet_ni_t *ni) spin_lock_init(&gmni->gmni_gm_lock); init_waitqueue_head(&gmni->gmni_idle_tx_wait); INIT_LIST_HEAD(&gmni->gmni_idle_txs); - INIT_LIST_HEAD(&gmni->gmni_nblk_idle_txs); INIT_LIST_HEAD(&gmni->gmni_idle_ltxbs); INIT_LIST_HEAD(&gmni->gmni_buf_txq); INIT_LIST_HEAD(&gmni->gmni_cred_txq); diff --git 
a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index c7c7f38..b364a61 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -60,7 +60,7 @@ gmnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, npages, rx->rx_buf.nb_kiov, payload_offset, mlen); - lnet_finalize(ni, private, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); gmnal_post_rx(gmni, rx); return 0; } @@ -85,10 +85,7 @@ gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (iov == NULL || kiov == NULL); /* I may not block for a tx if I'm responding to an incoming message */ - tx = gmnal_get_tx(gmni, - !(routing || - type == LNET_MSG_ACK || - type == LNET_MSG_REPLY)); + tx = gmnal_get_tx(gmni); if (tx == NULL) { if (!gmni->gmni_shutdown) CERROR ("Can't get tx for msg type %d for %s\n", @@ -129,7 +126,7 @@ gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* We've copied everything... */ LASSERT(tx->tx_lntmsg == NULL); - lnet_finalize(ni, NULL, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); } else { /* stash payload pts to copy later */ tx->tx_large_nob = len; diff --git a/lnet/klnds/gmlnd/gmlnd_comm.c b/lnet/klnds/gmlnd/gmlnd_comm.c index fbb2acc..a4e19f0 100644 --- a/lnet/klnds/gmlnd/gmlnd_comm.c +++ b/lnet/klnds/gmlnd/gmlnd_comm.c @@ -120,43 +120,26 @@ gmnal_unpack_msg(gmnal_ni_t *gmni, gmnal_rx_t *rx) } gmnal_tx_t * -gmnal_get_tx(gmnal_ni_t *gmni, int may_block) +gmnal_get_tx(gmnal_ni_t *gmni) { gmnal_tx_t *tx = NULL; spin_lock(&gmni->gmni_tx_lock); - while (!gmni->gmni_shutdown) { - - if (!list_empty(&gmni->gmni_idle_txs)) { - tx = list_entry(gmni->gmni_idle_txs.next, - gmnal_tx_t, tx_list); - break; - } - - if (!may_block) { - if (!list_empty(&gmni->gmni_nblk_idle_txs)) - tx = list_entry(gmni->gmni_nblk_idle_txs.next, - gmnal_tx_t, tx_list); - break; - } - + if (gmni->gmni_shutdown || + list_empty(&gmni->gmni_idle_txs)) { spin_unlock(&gmni->gmni_tx_lock); - wait_event(gmni->gmni_idle_tx_wait, - gmni->gmni_shutdown || - 
!list_empty(&gmni->gmni_idle_txs)); - spin_lock(&gmni->gmni_tx_lock); - } - - if (tx != NULL) { - LASSERT (tx->tx_lntmsg == NULL); - LASSERT (tx->tx_ltxb == NULL); - LASSERT (!tx->tx_credit); - - list_del(&tx->tx_list); + return NULL; } + tx = list_entry(gmni->gmni_idle_txs.next, gmnal_tx_t, tx_list); + list_del(&tx->tx_list); + spin_unlock(&gmni->gmni_tx_lock); + + LASSERT (tx->tx_lntmsg == NULL); + LASSERT (tx->tx_ltxb == NULL); + LASSERT (!tx->tx_credit); return tx; } @@ -184,20 +167,12 @@ gmnal_tx_done(gmnal_tx_t *tx, int rc) tx->tx_credit = 0; } - if (tx->tx_isnblk) { - list_add_tail(&tx->tx_list, &gmni->gmni_nblk_idle_txs); - } else { - list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs); - wake_idle = 1; - } + list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs); if (wake_sched) gmnal_check_txqueues_locked(gmni); spin_unlock(&gmni->gmni_tx_lock); - - if (wake_idle) - wake_up(&gmni->gmni_idle_tx_wait); } void @@ -315,7 +290,7 @@ gmnal_check_txqueues_locked (gmnal_ni_t *gmni) tx->tx_msgnob += tx->tx_large_nob; /* We've copied everything... 
*/ - lnet_finalize(gmni->gmni_ni, NULL, tx->tx_lntmsg, 0); + lnet_finalize(gmni->gmni_ni, tx->tx_lntmsg, 0); tx->tx_lntmsg = NULL; spin_lock(&gmni->gmni_tx_lock); @@ -459,6 +434,7 @@ gmnal_rx_thread(void *arg) LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE); rc = lnet_parse(gmni->gmni_ni, &msg->gmm_u.immediate.gmim_hdr, + msg->gmm_srcnid, rx); } diff --git a/lnet/klnds/gmlnd/gmlnd_module.c b/lnet/klnds/gmlnd/gmlnd_module.c index c016d4f..b74e21d 100644 --- a/lnet/klnds/gmlnd/gmlnd_module.c +++ b/lnet/klnds/gmlnd/gmlnd_module.c @@ -28,11 +28,11 @@ CFS_MODULE_PARM(port, "i", int, 0444, static int ntx = GMNAL_NTX; CFS_MODULE_PARM(ntx, "i", int, 0444, - "# 'normal' tx descriptors"); + "# tx descriptors"); -static int ntx_nblk = GMNAL_NTX_NBLK; -CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, - "# 'reserved' tx descriptors"); +static int ntx_peer = GMNAL_NTX_PEER; +CFS_MODULE_PARM(ntx_peer, "i", int, 0444, + "# concurrent sends per peer"); static int nlarge_tx_bufs = GMNAL_NLARGE_TX_BUFS; CFS_MODULE_PARM(nlarge_tx_bufs, "i", int, 0444, @@ -49,7 +49,7 @@ CFS_MODULE_PARM(nrx_large, "i", int, 0444, gmnal_tunables_t gmnal_tunables = { .gm_port = &port, .gm_ntx = &ntx, - .gm_ntx_nblk = &ntx_nblk, + .gm_ntx_peer = &ntx_peer, .gm_nlarge_tx_bufs = &nlarge_tx_bufs, .gm_nrx_small = &nrx_small, .gm_nrx_large = &nrx_large, @@ -61,7 +61,7 @@ static ctl_table gmnal_ctl_table[] = { sizeof (int), 0444, NULL, &proc_dointvec}, {2, "ntx", &ntx, sizeof (int), 0444, NULL, &proc_dointvec}, - {3, "ntx_nblk", &ntx_nblk, + {3, "ntx_peer", &ntx_peer, sizeof (int), 0444, NULL, &proc_dointvec}, {4, "nlarge_tx_bufs", &nlarge_tx_bufs, sizeof (int), 0444, NULL, &proc_dointvec}, diff --git a/lnet/klnds/gmlnd/gmlnd_utils.c b/lnet/klnds/gmlnd/gmlnd_utils.c index 8afd98e..3e7bd32 100644 --- a/lnet/klnds/gmlnd/gmlnd_utils.c +++ b/lnet/klnds/gmlnd/gmlnd_utils.c @@ -127,7 +127,7 @@ gmnal_free_tx (gmnal_tx_t *tx) } int -gmnal_alloc_tx (gmnal_ni_t *gmni, int nblk) +gmnal_alloc_tx (gmnal_ni_t *gmni) { gmnal_tx_t *tx; 
int rc; @@ -147,12 +147,8 @@ gmnal_alloc_tx (gmnal_ni_t *gmni, int nblk) } tx->tx_gmni = gmni; - tx->tx_isnblk = nblk; - if (tx->tx_isnblk) - list_add_tail(&tx->tx_list, &gmni->gmni_nblk_idle_txs); - else - list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs); + list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs); tx->tx_next = gmni->gmni_txs; gmni->gmni_txs = tx; @@ -252,15 +248,14 @@ gmnal_alloc_txs(gmnal_ni_t *gmni) { int ntxcred = gm_num_send_tokens(gmni->gmni_port); int ntx = *gmnal_tunables.gm_ntx; - int ntx_nblk = *gmnal_tunables.gm_ntx_nblk; int i; int rc; CWARN("ntxcred: %d\n", ntxcred); gmni->gmni_tx_credits = ntxcred; - for (i = 0; i < ntx_nblk + ntx; i++) { - rc = gmnal_alloc_tx(gmni, i < ntx_nblk); + for (i = 0; i < ntx; i++) { + rc = gmnal_alloc_tx(gmni); if (rc != 0) return rc; } diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c index 3c4cd8e..3c040c1 100644 --- a/lnet/klnds/iiblnd/iiblnd.c +++ b/lnet/klnds/iiblnd/iiblnd.c @@ -1118,18 +1118,12 @@ kibnal_setup_tx_descs (void) else tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= IBNAL_NTX); tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", i, tx, tx->tx_msg, tx->tx_vaddr); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); vaddr += IBNAL_MSG_SIZE; LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); @@ -1324,6 +1318,15 @@ kibnal_startup (lnet_ni_t *ni) return -EPERM; } + if (IBNAL_CREDITS > IBNAL_NTX) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + IBNAL_CREDITS, IBNAL_NTX); + return -EINVAL; + } + + ni->ni_maxtxcredits = IBNAL_CREDITS; + ni->ni_peertxcredits = IBNAL_PEERCREDITS; + ni->ni_data = &kibnal_data; kibnal_data.kib_ni = ni; @@ -1364,8 +1367,6 @@ kibnal_startup (lnet_ni_t *ni) spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD 
(&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); PORTAL_ALLOC (kibnal_data.kib_tx_descs, IBNAL_TX_MSGS * sizeof(kib_tx_t)); @@ -1506,7 +1507,7 @@ kibnal_startup (lnet_ni_t *ni) #if IBNAL_FMR { - const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; + const int pool_size = IBNAL_NTX; struct ib_fmr_pool_param params = { .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, .access = (IB_ACCESS_LOCAL_WRITE | diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h index d58eb63..35a8f6f 100644 --- a/lnet/klnds/iiblnd/iiblnd.h +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -94,7 +94,8 @@ #define IBNAL_NTX 64 /* # tx descs */ /* this had to be dropped down so that we only register < 255 pages per * region. this will change if we register all memory. */ -#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ +#define IBNAL_CREDITS 32 +#define IBNAL_PEERCREDITS 8 #define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -109,7 +110,7 @@ /* derived constants... */ /* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSGS IBNAL_NTX #define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) #define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) @@ -202,8 +203,6 @@ typedef struct kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ __u64 kib_next_tx_cookie; /* RDMA completion cookie */ spinlock_t kib_tx_lock; /* serialise */ @@ -324,7 +323,6 @@ typedef struct kib_rx /* receive message */ typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? 
*/ int tx_sending; /* # tx callbacks outstanding */ @@ -420,7 +418,7 @@ typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - lnet_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ atomic_t ibp_refcount; /* # users */ int ibp_persistence; /* "known" peer refs */ struct list_head ibp_conns; /* all active connections */ diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index cb7f580..08474b5 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -91,7 +91,7 @@ kibnal_tx_done (kib_tx_t *tx) if (tx->tx_lntmsg[i] == NULL) continue; - lnet_finalize (kibnal_data.kib_ni, NULL, tx->tx_lntmsg[i], + lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i], tx->tx_status); tx->tx_lntmsg[i] = NULL; } @@ -107,73 +107,43 @@ kibnal_tx_done (kib_tx_t *tx) spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); } static kib_tx_t * -kibnal_get_idle_tx (int may_block) +kibnal_get_idle_tx (void) { unsigned long flags; kib_tx_t *tx = NULL; - ENTRY; - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } + spin_lock_irqsave 
(&kibnal_data.kib_tx_lock, flags); - /* block for idle tx */ + if (list_empty (&kibnal_data.kib_idle_txs)) { spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + return NULL; } + + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - if (tx != NULL) { - list_del (&tx->tx_list); - - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - } + /* Allocate a new passive RDMA completion cookie. It might not be + * needed, but we've got a lock right now and we're unlikely to + * wrap... 
*/ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); RETURN(tx); } @@ -548,15 +518,18 @@ kibnal_rx (kib_rx_t *rx) switch (msg->ibm_type) { case IBNAL_MSG_GET_RDMA: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, + rx->rx_conn->ibc_peer->ibp_nid, rx); break; case IBNAL_MSG_PUT_RDMA: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, + rx->rx_conn->ibc_peer->ibp_nid, rx); break; case IBNAL_MSG_IMMEDIATE: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + rx->rx_conn->ibc_peer->ibp_nid, rx); break; default: @@ -864,7 +837,7 @@ kibnal_check_sends (kib_conn_t *conn) conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { spin_unlock_irqrestore(&conn->ibc_lock, flags); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); @@ -1253,7 +1226,7 @@ kibnal_start_passive_rdma (int type, int may_block, lnet_msg_t *lntmsg) access.s.RdmaRead = 1; access.s.RdmaWrite = 1; - tx = kibnal_get_idle_tx (may_block); + tx = kibnal_get_idle_tx (); if (tx == NULL) { CERROR("Can't allocate %s txd for %s\n", (type == IBNAL_MSG_PUT_RDMA) ? 
"PUT/REPLY" : "GET", @@ -1369,12 +1342,12 @@ kibnal_start_active_rdma (int type, int status, LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); } - tx = kibnal_get_idle_tx (0); /* Mustn't block */ + tx = kibnal_get_idle_tx (); if (tx == NULL) { CERROR ("tx descs exhausted on RDMA from %s" " completing locally with failure\n", libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid)); - lnet_finalize (kibnal_data.kib_ni, NULL, lntmsg, -ENOMEM); + lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM); return; } LASSERT (tx->tx_nsp == 0); @@ -1472,7 +1445,7 @@ init_tx: LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ CDEBUG(D_WARNING,"No data: immediate completion\n"); - lnet_finalize (kibnal_data.kib_ni, NULL, lntmsg, + lnet_finalize (kibnal_data.kib_ni, lntmsg, status == 0 ? 0 : -EIO); } @@ -1574,10 +1547,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* Send IMMEDIATE */ - tx = kibnal_get_idle_tx(!(routing || - type == LNET_MSG_ACK || - type == LNET_MSG_REPLY || - in_interrupt())); + tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR ("Can't send %d to %s: tx descs exhausted%s\n", type, libcfs_nid2str(target.nid), @@ -1651,7 +1621,7 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); - lnet_finalize (ni, NULL, lntmsg, 0); + lnet_finalize (ni, lntmsg, 0); break; case IBNAL_MSG_GET_RDMA: diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index 5b395f6..c2d89df 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -1273,18 +1273,12 @@ kibnal_setup_tx_descs (void) tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= *kibnal_tunables.kib_ntx); tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", i, tx, tx->tx_msg, tx->tx_vaddr); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - 
list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); vaddr += IBNAL_MSG_SIZE; LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES()); @@ -1450,6 +1444,16 @@ kibnal_startup (lnet_ni_t *ni) return -EPERM; } + if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kibnal_tunables.kib_credits, + *kibnal_tunables.kib_ntx); + return -EINVAL; + } + + ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; + ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; + PORTAL_MODULE_USE; memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ @@ -1486,8 +1490,6 @@ kibnal_startup (lnet_ni_t *ni) spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); PORTAL_ALLOC (kibnal_data.kib_tx_descs, IBNAL_TX_MSGS() * sizeof(kib_tx_t)); @@ -1569,8 +1571,7 @@ kibnal_startup (lnet_ni_t *ni) /*****************************************************/ #if IBNAL_FMR { - const int pool_size = *kibnal_tunables.kib_ntx + - *kibnal_tunables.kib_ntx_nblk; + const int pool_size = *kibnal_tunables.kib_ntx; struct ib_fmr_pool_param params = { .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, .access = (IB_ACCESS_LOCAL_WRITE | diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h index a81a7d7..c364d90 100644 --- a/lnet/klnds/openiblnd/openiblnd.h +++ b/lnet/klnds/openiblnd/openiblnd.h @@ -79,8 +79,9 @@ #define IBNAL_CONCURRENT_PEERS 1024 /* # nodes all talking at once to me */ #define IBNAL_CKSUM 0 /* checksum kib_msg_t? 
*/ #define IBNAL_TIMEOUT 50 /* default comms timeout (seconds) */ -#define IBNAL_NTX 64 /* # tx descs */ -#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ +#define IBNAL_NTX 384 /* # tx descs */ +#define IBNAL_CREDITS 256 /* # reserved tx descs */ +#define IBNAL_PEERCREDITS 16 /* tunables fixed at compile time */ #define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -101,8 +102,7 @@ /* derived constants... */ /* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx + \ - *kibnal_tunables.kib_ntx_nblk) +#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) #define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) #define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) @@ -125,7 +125,8 @@ typedef struct int *kib_cksum; /* checksum kib_msg_t? */ int *kib_timeout; /* comms timeout (seconds) */ int *kib_ntx; /* # tx descs */ - int *kib_ntx_nblk; /* # reserved tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peercredits; /* # concurrent sends to 1 peer */ struct ctl_table_header *kib_sysctl; /* sysctl interface */ } kib_tunables_t; @@ -182,8 +183,6 @@ typedef struct kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ __u64 kib_next_tx_cookie; /* RDMA completion cookie */ spinlock_t kib_tx_lock; /* serialise */ @@ -324,7 +323,6 @@ typedef struct kib_rx /* receive message */ typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? 
*/ int tx_sending; /* # tx callbacks outstanding */ diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index 094b2a37..509a9f4 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -91,7 +91,7 @@ kibnal_tx_done (kib_tx_t *tx) if (tx->tx_lntmsg[i] == NULL) continue; - lnet_finalize (kibnal_data.kib_ni, NULL, tx->tx_lntmsg[i], + lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i], tx->tx_status); tx->tx_lntmsg[i] = NULL; } @@ -107,74 +107,45 @@ kibnal_tx_done (kib_tx_t *tx) spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); } kib_tx_t * -kibnal_get_idle_tx (int may_block) +kibnal_get_idle_tx (void) { unsigned long flags; - kib_tx_t *tx = NULL; + kib_tx_t *tx; - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - /* block for idle tx */ + if (list_empty (&kibnal_data.kib_idle_txs)) { spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + return NULL; } - if (tx != NULL) { - list_del (&tx->tx_list); - - /* Allocate a new passive RDMA completion cookie. 
It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - } + /* Allocate a new passive RDMA completion cookie. It might not be + * needed, but we've got a lock right now and we're unlikely to + * wrap... */ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - return (tx); + + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + + return tx; } void @@ -401,15 +372,18 @@ kibnal_rx (kib_rx_t *rx) switch (msg->ibm_type) { case IBNAL_MSG_GET_RDMA: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, + msg->ibm_srcnid, rx); break; case IBNAL_MSG_PUT_RDMA: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, + msg->ibm_srcnid, rx); break; case IBNAL_MSG_IMMEDIATE: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx); break; default: @@ -648,7 +622,7 @@ kibnal_check_sends (kib_conn_t *conn) conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { 
spin_unlock_irqrestore(&conn->ibc_lock, flags); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); /* don't block */ if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); @@ -1001,7 +975,7 @@ kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) } int -kibnal_start_passive_rdma (int type, int may_block, lnet_msg_t *lntmsg) +kibnal_start_passive_rdma (int type, lnet_msg_t *lntmsg) { lnet_nid_t nid = lntmsg->msg_target.nid; int nob = lntmsg->msg_md->md_length; @@ -1022,7 +996,7 @@ kibnal_start_passive_rdma (int type, int may_block, lnet_msg_t *lntmsg) IB_ACCESS_LOCAL_WRITE; } - tx = kibnal_get_idle_tx (may_block); + tx = kibnal_get_idle_tx (); if (tx == NULL) { CERROR("Can't allocate %s txd for %s\n", (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET", @@ -1133,12 +1107,12 @@ kibnal_start_active_rdma (int type, int status, LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); } - tx = kibnal_get_idle_tx (0); /* Mustn't block */ + tx = kibnal_get_idle_tx (); if (tx == NULL) { CERROR ("tx descs exhausted on RDMA from %s" " completing locally with failure\n", libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid)); - lnet_finalize (kibnal_data.kib_ni, NULL, lntmsg, -ENOMEM); + lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM); return; } LASSERT (tx->tx_nsp == 0); @@ -1205,7 +1179,7 @@ kibnal_start_active_rdma (int type, int status, LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ CDEBUG(D_NET, "No data: immediate completion\n"); - lnet_finalize (kibnal_data.kib_ni, NULL, lntmsg, + lnet_finalize (kibnal_data.kib_ni, lntmsg, status == 0 ? 
0 : -EIO); } @@ -1267,7 +1241,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) if (nob <= IBNAL_MSG_SIZE) break; /* send IMMEDIATE */ - return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1, lntmsg); + return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg); case LNET_MSG_REPLY: { /* reply's 'private' is the incoming receive */ @@ -1300,17 +1274,12 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) if (nob <= IBNAL_MSG_SIZE) break; /* send IMMEDIATE */ - return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - !(routing || type == LNET_MSG_REPLY), - lntmsg); + return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, lntmsg); } /* Send IMMEDIATE */ - tx = kibnal_get_idle_tx(!(routing || - type == LNET_MSG_ACK || - type == LNET_MSG_REPLY || - in_interrupt())); + tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR ("Can't send %d to %s: tx descs exhausted%s\n", type, libcfs_nid2str(target.nid), @@ -1386,7 +1355,7 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); - lnet_finalize (ni, NULL, lntmsg, 0); + lnet_finalize (ni, lntmsg, 0); break; case IBNAL_MSG_GET_RDMA: diff --git a/lnet/klnds/openiblnd/openiblnd_modparams.c b/lnet/klnds/openiblnd/openiblnd_modparams.c index 99ba6a4..6ae7a91 100644 --- a/lnet/klnds/openiblnd/openiblnd_modparams.c +++ b/lnet/klnds/openiblnd/openiblnd_modparams.c @@ -49,11 +49,15 @@ CFS_MODULE_PARM(timeout, "i", int, 0644, static int ntx = IBNAL_NTX; CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of 'normal' message descriptors"); + "# of message descriptors"); -static int ntx_nblk = IBNAL_NTX_NBLK; -CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, - "# of 'reserved' message descriptors"); +static int credits = IBNAL_CREDITS; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = IBNAL_PEERCREDITS; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); kib_tunables_t kibnal_tunables 
= { .kib_n_connd = &n_connd, @@ -63,7 +67,8 @@ kib_tunables_t kibnal_tunables = { .kib_cksum = &cksum, .kib_timeout = &timeout, .kib_ntx = &ntx, - .kib_ntx_nblk = &ntx_nblk, + .kib_credits = &credits, + .kib_peercredits = &peer_credits, }; #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM @@ -83,7 +88,9 @@ static ctl_table kibnal_ctl_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {7, "ntx", &ntx, sizeof(int), 0444, NULL, &proc_dointvec}, - {8, "ntx_nblk", &ntx_nblk, + {8, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "peer_credits", &peer_credits, sizeof(int), 0444, NULL, &proc_dointvec}, {0} }; diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index 6c91dbe..6ce966d 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -58,8 +58,7 @@ kqswnal_get_tx_desc (struct portal_ioctl_data *data) data->ioc_u64[0] = ktx->ktx_nid; data->ioc_u32[0] = le32_to_cpu(hdr->type); data->ioc_u32[1] = ktx->ktx_launcher; - data->ioc_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) | - (!ktx->ktx_isnblk ? 0 : 2) | + data->ioc_flags = (list_empty (&ktx->ktx_schedlist) ? 
0 : 1) | (ktx->ktx_state << 2); rc = 0; break; @@ -123,8 +122,6 @@ kqswnal_shutdown(lnet_ni_t *ni) kqswnal_data.kqn_shuttingdown = 1; spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - wake_up_all(&kqswnal_data.kqn_idletxd_waitq); - /**********************************************************************/ /* wait for sends that have allocated a tx desc to launch or give up */ while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) { @@ -190,6 +187,7 @@ kqswnal_shutdown(lnet_ni_t *ni) #if MULTIRAIL_EKC LASSERT (list_empty (&kqswnal_data.kqn_readyrxds)); + LASSERT (list_empty (&kqswnal_data.kqn_donetxds)); LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds)); #endif @@ -247,8 +245,7 @@ kqswnal_shutdown(lnet_ni_t *ni) { elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eptxdmahandle, 0, - KQSW_NTXMSGPAGES * (*kqswnal_tunables.kqn_ntxmsgs + - *kqswnal_tunables.kqn_nnblk_txmsgs)); + KQSW_NTXMSGPAGES * (*kqswnal_tunables.kqn_ntxmsgs)); elan3_dma_release(kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eptxdmahandle); @@ -311,6 +308,14 @@ kqswnal_startup (lnet_ni_t *ni) CERROR("Explicit interface config not supported\n"); return -EPERM; } + + if (*kqswnal_tunables.kqn_credits >= + *kqswnal_tunables.kqn_ntxmsgs) { + LCONSOLE_ERROR("Configuration error: please set " + "ntxmsgs(%d) > credits(%d)\n", + *kqswnal_tunables.kqn_ntxmsgs, + *kqswnal_tunables.kqn_credits); + } CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&libcfs_kmemory)); @@ -319,14 +324,15 @@ kqswnal_startup (lnet_ni_t *ni) kqswnal_data.kqn_ni = ni; ni->ni_data = &kqswnal_data; + ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits; + ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits; INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); - INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); spin_lock_init (&kqswnal_data.kqn_idletxd_lock); - init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); INIT_LIST_HEAD 
(&kqswnal_data.kqn_delayedtxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds); INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds); spin_lock_init (&kqswnal_data.kqn_sched_lock); @@ -417,8 +423,7 @@ kqswnal_startup (lnet_ni_t *ni) #if MULTIRAIL_EKC kqswnal_data.kqn_ep_tx_nmh = ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs+ - *kqswnal_tunables.kqn_nnblk_txmsgs), + KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs), EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve tx dma space\n"); @@ -432,8 +437,7 @@ kqswnal_startup (lnet_ni_t *ni) dmareq.Perm = ELAN_PERM_REMOTEWRITE; rc = elan3_dma_reserve(kqswnal_data.kqn_ep->DmaState, - KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs+ - *kqswnal_tunables.kqn_nnblk_txmsgs), + KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs), &dmareq, &kqswnal_data.kqn_eptxdmahandle); if (rc != DDI_SUCCESS) { @@ -480,7 +484,7 @@ kqswnal_startup (lnet_ni_t *ni) /* Allocate/Initialise transmit descriptors */ kqswnal_data.kqn_txds = NULL; - for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs + *kqswnal_tunables.kqn_nnblk_txmsgs); i++) + for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++) { int premapped_pages; int basepage = i * KQSW_NTXMSGPAGES; @@ -519,16 +523,13 @@ kqswnal_startup (lnet_ni_t *ni) ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ - INIT_LIST_HEAD (&ktx->ktx_delayed_list); + INIT_LIST_HEAD (&ktx->ktx_schedlist); ktx->ktx_state = KTX_IDLE; #if MULTIRAIL_EKC ktx->ktx_rail = -1; /* unset rail */ #endif - ktx->ktx_isnblk = (i >= *kqswnal_tunables.kqn_ntxmsgs); - list_add_tail (&ktx->ktx_list, - ktx->ktx_isnblk ? 
&kqswnal_data.kqn_nblk_idletxds : - &kqswnal_data.kqn_idletxds); + list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); } /**********************************************************************/ diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 410f6b2..ded5812 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -86,8 +86,9 @@ */ #define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ -#define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS (PAGE_SIZE == 4096 ? 512 : 256) /* # reserved transmit messages if can't block */ /* avoid qsnet crash b=5291 */ +#define KQSW_NTXMSGS 256 /* # message descriptors */ +#define KQSW_CREDITS 128 /* # concurrent sends */ +#define KQSW_PEERCREDITS 8 /* # concurrent sends to 1 node */ #define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ #define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */ @@ -161,9 +162,8 @@ typedef struct kqswnal_rx typedef struct kqswnal_tx { struct list_head ktx_list; /* enqueue idle/active */ - struct list_head ktx_delayed_list; /* enqueue delayedtxds */ + struct list_head ktx_schedlist; /* enqueue on scheduler */ struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ - unsigned int ktx_isnblk:1; /* reserved descriptor? */ unsigned int ktx_state:7; /* What I'm doing */ unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 
0 : 1 */ uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ @@ -174,6 +174,7 @@ typedef struct kqswnal_tx void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ + int ktx_status; /* completion status */ /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ @@ -189,7 +190,7 @@ typedef struct kqswnal_tx #endif } kqswnal_tx_t; -#define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ +#define KTX_IDLE 0 /* on kqn_idletxds */ #define KTX_SENDING 1 /* normal send */ #define KTX_GETTING 2 /* sending optimised get */ #define KTX_PUTTING 3 /* sending optimised put */ @@ -199,7 +200,8 @@ typedef struct { int *kqn_tx_maxcontig; /* maximum payload to defrag */ int *kqn_ntxmsgs; /* # normal tx msgs */ - int *kqn_nnblk_txmsgs; /* # reserved tx msgs */ + int *kqn_credits; /* # concurrent sends */ + int *kqn_peercredits; /* # concurrent sends to 1 peer */ int *kqn_nrxmsgs_large; /* # 'large' rx msgs */ int *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */ int *kqn_nrxmsgs_small; /* # 'small' rx msgs */ @@ -223,16 +225,15 @@ typedef struct kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ - struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ struct list_head kqn_activetxds; /* transmit descriptors being used */ spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ - wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ atomic_t kqn_pending_txs; /* # transmits being prepped */ spinlock_t kqn_sched_lock; /* serialise packet schedulers */ wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ struct list_head kqn_readyrxds; /* rxds full of data */ + struct list_head kqn_donetxds; /* completed transmits */ struct list_head 
kqn_delayedtxds; /* delayed transmits */ #if MULTIRAIL_EKC diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 81a2a30..86b9af9 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -32,7 +32,8 @@ kqswnal_notify_peer_down(kqswnal_tx_t *ktx) do_gettimeofday (&now); then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; - lnet_notify(kqswnal_data.kqn_ni, ktx->ktx_nid, 0, then); + /* no auto-down for now */ + // lnet_notify(kqswnal_data.kqn_ni, ktx->ktx_nid, 0, then); } void @@ -318,97 +319,66 @@ kqswnal_put_idle_tx (kqswnal_tx_t *ktx) spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); list_del (&ktx->ktx_list); /* take off active list */ - - if (ktx->ktx_isnblk) { - /* reserved for non-blocking tx */ - list_add (&ktx->ktx_list, &kqswnal_data.kqn_nblk_idletxds); - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - return; - } - list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); - wake_up (&kqswnal_data.kqn_idletxd_waitq); - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); } kqswnal_tx_t * -kqswnal_get_idle_tx (int may_block) +kqswnal_get_idle_tx (void) { unsigned long flags; - kqswnal_tx_t *ktx = NULL; - - for (;;) { - spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - - if (kqswnal_data.kqn_shuttingdown) - break; - - /* "normal" descriptor is free */ - if (!list_empty (&kqswnal_data.kqn_idletxds)) { - ktx = list_entry (kqswnal_data.kqn_idletxds.next, - kqswnal_tx_t, ktx_list); - break; - } - - if (!may_block) { - if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) { - CERROR ("intr tx desc pool exhausted\n"); - break; - } - - ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, - kqswnal_tx_t, ktx_list); - break; - } + kqswnal_tx_t *ktx; - /* block for idle tx */ + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + if (kqswnal_data.kqn_shuttingdown || + list_empty (&kqswnal_data.kqn_idletxds)) { spin_unlock_irqrestore 
(&kqswnal_data.kqn_idletxd_lock, flags); - CDEBUG (D_NET, "blocking for tx desc\n"); - wait_event (kqswnal_data.kqn_idletxd_waitq, - !list_empty (&kqswnal_data.kqn_idletxds) || - kqswnal_data.kqn_shuttingdown); + return NULL; } - if (ktx != NULL) { - list_del (&ktx->ktx_list); - list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); - ktx->ktx_launcher = current->pid; - atomic_inc(&kqswnal_data.kqn_pending_txs); - } + ktx = list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + + list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); + ktx->ktx_launcher = current->pid; + atomic_inc(&kqswnal_data.kqn_pending_txs); spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ - LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0); - + LASSERT (ktx->ktx_nmappedpages == 0); return (ktx); } void -kqswnal_tx_done (kqswnal_tx_t *ktx, int error) +kqswnal_tx_done_in_thread_context (kqswnal_tx_t *ktx) { + LASSERT (!in_interrupt()); + + if (ktx->ktx_status == -EHOSTDOWN) + kqswnal_notify_peer_down(ktx); + switch (ktx->ktx_state) { case KTX_RDMAING: /* optimized GET/PUT handled */ case KTX_PUTTING: /* optimized PUT sent */ case KTX_SENDING: /* normal send */ - lnet_finalize (kqswnal_data.kqn_ni, NULL, - (lnet_msg_t *)ktx->ktx_args[1], - (error == 0) ? 0 : -EIO); + lnet_finalize (kqswnal_data.kqn_ni, + (lnet_msg_t *)ktx->ktx_args[1], + ktx->ktx_status); break; case KTX_GETTING: /* optimized GET sent & REPLY received */ /* Complete the GET with success since we can't avoid * delivering a REPLY event; we committed to it when we * launched the GET */ - lnet_finalize (kqswnal_data.kqn_ni, NULL, - (lnet_msg_t *)ktx->ktx_args[1], 0); - lnet_finalize (kqswnal_data.kqn_ni, NULL, - (lnet_msg_t *)ktx->ktx_args[2], - (error == 0) ? 
0 : -EIO); + lnet_finalize (kqswnal_data.kqn_ni, + (lnet_msg_t *)ktx->ktx_args[1], 0); + lnet_finalize (kqswnal_data.kqn_ni, + (lnet_msg_t *)ktx->ktx_args[2], + ktx->ktx_status); break; default: @@ -418,6 +388,28 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error) kqswnal_put_idle_tx (ktx); } +void +kqswnal_tx_done (kqswnal_tx_t *ktx, int status) +{ + unsigned long flags; + + ktx->ktx_status = status; + + if (!in_interrupt()) { + kqswnal_tx_done_in_thread_context(ktx); + return; + } + + /* Complete the send in thread context */ + spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail(&ktx->ktx_schedlist, + &kqswnal_data.kqn_donetxds); + wake_up(&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); +} + static void kqswnal_txhandler(EP_TXD *txd, void *arg, int status) { @@ -433,7 +425,6 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) CERROR ("Tx completion to %s failed: %d\n", libcfs_nid2str(ktx->ktx_nid), status); - kqswnal_notify_peer_down(ktx); status = -EHOSTDOWN; } else switch (ktx->ktx_state) { @@ -458,7 +449,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) break; } - kqswnal_tx_done (ktx, status); + kqswnal_tx_done(ktx, status); } int @@ -520,7 +511,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) case EP_ENOMEM: /* can't allocate ep txd => queue for later */ spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); + list_add_tail (&ktx->ktx_schedlist, &kqswnal_data.kqn_delayedtxds); wake_up (&kqswnal_data.kqn_sched_waitq); spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); @@ -690,34 +681,25 @@ kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, #endif kqswnal_remotemd_t * -kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, lnet_nid_t expected_nid) +kqswnal_parse_rmd (kqswnal_rx_t *krx, int type) { char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); lnet_hdr_t *hdr = (lnet_hdr_t *)buffer; 
kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); - lnet_nid_t nid = kqswnal_rx_nid(krx); - /* Note (1) lnet_parse has already flipped hdr. - * (2) RDMA addresses are sent in native endian-ness. When + /* Note RDMA addresses are sent in native endian-ness. When * EKC copes with different endian nodes, I'll fix this (and * eat my hat :) */ LASSERT (krx->krx_nob >= sizeof(*hdr)); - if (hdr->type != type) { + if (le32_to_cpu(hdr->type) != type) { CERROR ("Unexpected optimized get/put type %d (%d expected)" - "from %s\n", hdr->type, type, libcfs_nid2str(nid)); + "from %s\n", le32_to_cpu(hdr->type), type, + libcfs_nid2str(kqswnal_rx_nid(krx))); return (NULL); } - if (!lnet_ptlcompat_matchnid(nid, hdr->src_nid)) { - CERROR ("Unexpected optimized get/put source NID %s from %s\n", - libcfs_nid2str(hdr->src_nid), libcfs_nid2str(nid)); - return (NULL); - } - - LASSERT (hdr->src_nid == expected_nid); - if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", @@ -827,13 +809,13 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, LASSERT (krx->krx_rpc_reply_needed); LASSERT (krx->krx_rpc_reply_status != 0); - rmd = kqswnal_parse_rmd(krx, type, lntmsg->msg_ev.initiator.nid); + rmd = kqswnal_parse_rmd(krx, type); if (rmd == NULL) return (-EPROTO); if (len == 0) { /* data got truncated to nothing. 
*/ - lnet_finalize(kqswnal_data.kqn_ni, krx, lntmsg, 0); + lnet_finalize(kqswnal_data.kqn_ni, lntmsg, 0); /* Let kqswnal_rx_done() complete the RPC with success */ krx->krx_rpc_reply_status = 0; return (0); @@ -841,7 +823,7 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not actually sending a portals message with it */ - ktx = kqswnal_get_idle_tx(0); + ktx = kqswnal_get_idle_tx(); if (ktx == NULL) { CERROR ("Can't get txd for RDMA with %s\n", libcfs_nid2str(lntmsg->msg_ev.initiator.nid)); @@ -1016,10 +998,7 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* I may not block for a transmit descriptor if I might block the * router, receiver, or an interrupt handler. */ - ktx = kqswnal_get_idle_tx(!(routing || - type == LNET_MSG_ACK || - type == LNET_MSG_REPLY || - in_interrupt())); + ktx = kqswnal_get_idle_tx(); if (ktx == NULL) { CERROR ("Can't get txd for msg type %d for %s\n", type, libcfs_nid2str(target.nid)); @@ -1061,14 +1040,24 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) * immediately after the header, and send that as my * message. */ - ktx->ktx_state = (type == LNET_MSG_GET) ? 
KTX_GETTING : KTX_PUTTING; + if (type == LNET_MSG_GET) { + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) != 0) + rc = kqswnal_map_tx_kiov (ktx, 0, md->md_length, + md->md_niov, md->md_iov.kiov); + else + rc = kqswnal_map_tx_iov (ktx, 0, md->md_length, + md->md_niov, md->md_iov.iov); + ktx->ktx_state = KTX_GETTING; + } else { + if (payload_kiov != NULL) + rc = kqswnal_map_tx_kiov(ktx, 0, payload_nob, + payload_niov, payload_kiov); + else + rc = kqswnal_map_tx_iov(ktx, 0, payload_nob, + payload_niov, payload_iov); + ktx->ktx_state = KTX_PUTTING; + } - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) != 0) - rc = kqswnal_map_tx_kiov (ktx, 0, md->md_length, - md->md_niov, md->md_iov.kiov); - else - rc = kqswnal_map_tx_iov (ktx, 0, md->md_length, - md->md_niov, md->md_iov.iov); if (rc != 0) goto out; @@ -1165,8 +1154,8 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) * pretend the GET succeeded but the REPLY * failed. */ rc = 0; - lnet_finalize (kqswnal_data.kqn_ni, private, lntmsg, 0); - lnet_finalize (kqswnal_data.kqn_ni, private, + lnet_finalize (kqswnal_data.kqn_ni, lntmsg, 0); + lnet_finalize (kqswnal_data.kqn_ni, (lnet_msg_t *)ktx->ktx_args[2], -EIO); } @@ -1274,12 +1263,16 @@ kqswnal_rx_done (kqswnal_rx_t *krx) void kqswnal_parse (kqswnal_rx_t *krx) { + lnet_ni_t *ni = kqswnal_data.kqn_ni; lnet_hdr_t *hdr = (lnet_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); + lnet_nid_t fromnid; int rc; LASSERT (atomic_read(&krx->krx_refcount) == 1); - rc = lnet_parse (kqswnal_data.kqn_ni, hdr, krx); + fromnid = PTL_MKNID(PTL_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd)); + + rc = lnet_parse(ni, hdr, kqswnal_rx_nid(krx), krx); if (rc < 0) { kqswnal_rx_decref(krx); return; @@ -1294,7 +1287,6 @@ kqswnal_rxhandler(EP_RXD *rxd) int nob = ep_rxd_len (rxd); int status = ep_rxd_status (rxd); kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); - CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", rxd, krx, nob, status); @@ -1361,15 +1353,16 
@@ kqswnal_recv (lnet_ni_t *ni, kqswnal_rx_t *krx = (kqswnal_rx_t *)private; char *buffer = page_address(krx->krx_kiov[0].kiov_page); lnet_hdr_t *hdr = (lnet_hdr_t *)buffer; + int hdrtype = le32_to_cpu(hdr->type); int rc; - /* NB lnet_parse() has already flipped *hdr */ + /* NB hdr still in network byte order */ if (krx->krx_rpc_reply_needed && - (hdr->type == LNET_MSG_PUT || - hdr->type == LNET_MSG_REPLY)) { + (hdrtype == LNET_MSG_PUT || + hdrtype == LNET_MSG_REPLY)) { /* This is an optimized PUT/REPLY */ - rc = kqswnal_rdma(krx, lntmsg, hdr->type, + rc = kqswnal_rdma(krx, lntmsg, hdrtype, niov, iov, kiov, offset, mlen); kqswnal_rx_decref(krx); return rc; @@ -1397,7 +1390,7 @@ kqswnal_recv (lnet_ni_t *ni, krx->krx_npages, krx->krx_kiov, KQSW_HDR_SIZE, mlen); - lnet_finalize(ni, private, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); kqswnal_rx_decref(krx); return 0; } @@ -1462,11 +1455,25 @@ kqswnal_scheduler (void *arg) spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); } + if (!list_empty (&kqswnal_data.kqn_donetxds)) + { + ktx = list_entry(kqswnal_data.kqn_donetxds.next, + kqswnal_tx_t, ktx_schedlist); + list_del_init (&ktx->ktx_schedlist); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + kqswnal_tx_done_in_thread_context(ktx); + + did_something = 1; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) { ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, - kqswnal_tx_t, ktx_delayed_list); - list_del_init (&ktx->ktx_delayed_list); + kqswnal_tx_t, ktx_schedlist); + list_del_init (&ktx->ktx_schedlist); spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); @@ -1498,6 +1505,7 @@ kqswnal_scheduler (void *arg) rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, kqswnal_data.kqn_shuttingdown == 2 || !list_empty(&kqswnal_data.kqn_readyrxds) || + !list_empty(&kqswnal_data.kqn_donetxds) || !list_empty(&kqswnal_data.kqn_delayedtxds)); LASSERT (rc == 0); } else if 
(need_resched()) diff --git a/lnet/klnds/qswlnd/qswlnd_modparams.c b/lnet/klnds/qswlnd/qswlnd_modparams.c index 3d5035f..30f2fb0 100644 --- a/lnet/klnds/qswlnd/qswlnd_modparams.c +++ b/lnet/klnds/qswlnd/qswlnd_modparams.c @@ -29,9 +29,13 @@ static int ntxmsgs = KQSW_NTXMSGS; CFS_MODULE_PARM(ntxmsgs, "i", int, 0444, "# 'normal' tx msg buffers"); -static int nnblk_txmsgs = KQSW_NNBLK_TXMSGS; -CFS_MODULE_PARM(nnblk_txmsgs, "i", int, 0444, - "# 'reserved' tx msg buffers"); +static int credits = KQSW_CREDITS; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = KQSW_PEERCREDITS; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# per-peer concurrent sends"); static int nrxmsgs_large = KQSW_NRXMSGS_LARGE; CFS_MODULE_PARM(nrxmsgs_large, "i", int, 0444, @@ -60,7 +64,8 @@ CFS_MODULE_PARM(optimized_gets, "i", int, 0644, kqswnal_tunables_t kqswnal_tunables = { .kqn_tx_maxcontig = &tx_maxcontig, .kqn_ntxmsgs = &ntxmsgs, - .kqn_nnblk_txmsgs = &nnblk_txmsgs, + .kqn_credits = &credits, + .kqn_peercredits = &peer_credits, .kqn_nrxmsgs_large = &nrxmsgs_large, .kqn_ep_envelopes_large = &ep_envelopes_large, .kqn_nrxmsgs_small = &nrxmsgs_small, @@ -75,19 +80,21 @@ static ctl_table kqswnal_ctl_table[] = { sizeof (int), 0444, NULL, &proc_dointvec}, {2, "ntxmsgs", &ntxmsgs, sizeof (int), 0444, NULL, &proc_dointvec}, - {3, "nnblk_txmsgs", &nnblk_txmsgs, + {3, "credits", &credits, + sizeof (int), 0444, NULL, &proc_dointvec}, + {4, "peer_credits", &peer_credits, sizeof (int), 0444, NULL, &proc_dointvec}, - {4, "nrxmsgs_large", &nrxmsgs_large, + {5, "nrxmsgs_large", &nrxmsgs_large, sizeof (int), 0444, NULL, &proc_dointvec}, - {5, "ep_envelopes_large", &ep_envelopes_large, + {6, "ep_envelopes_large", &ep_envelopes_large, sizeof (int), 0444, NULL, &proc_dointvec}, - {6, "nrxmsgs_small", &nrxmsgs_small, + {7, "nrxmsgs_small", &nrxmsgs_small, sizeof (int), 0444, NULL, &proc_dointvec}, - {7, "ep_envelopes_small", &ep_envelopes_small, + {8, 
"ep_envelopes_small", &ep_envelopes_small, sizeof (int), 0444, NULL, &proc_dointvec}, - {8, "optimized_puts", &optimized_puts, + {9, "optimized_puts", &optimized_puts, sizeof (int), 0644, NULL, &proc_dointvec}, - {9, "optimized_gets", &optimized_gets, + {10, "optimized_gets", &optimized_gets, sizeof (int), 0644, NULL, &proc_dointvec}, {0} }; diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index c24cb01..393b7257 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -1249,20 +1249,17 @@ kranal_free_txdescs(struct list_head *freelist) int kranal_alloc_txdescs(struct list_head *freelist, int n) { - int isnblk = (freelist == &kranal_data.kra_idle_nblk_txs); int i; kra_tx_t *tx; - LASSERT (freelist == &kranal_data.kra_idle_txs || - freelist == &kranal_data.kra_idle_nblk_txs); + LASSERT (freelist == &kranal_data.kra_idle_txs); LASSERT (list_empty(freelist)); for (i = 0; i < n; i++) { PORTAL_ALLOC(tx, sizeof(*tx)); if (tx == NULL) { - CERROR("Can't allocate %stx[%d]\n", - isnblk ? "nblk " : "", i); + CERROR("Can't allocate tx[%d]\n", i); kranal_free_txdescs(freelist); return -ENOMEM; } @@ -1270,15 +1267,13 @@ kranal_alloc_txdescs(struct list_head *freelist, int n) PORTAL_ALLOC(tx->tx_phys, PTL_MD_MAX_IOV * sizeof(*tx->tx_phys)); if (tx->tx_phys == NULL) { - CERROR("Can't allocate %stx[%d]->tx_phys\n", - isnblk ? 
"nblk " : "", i); + CERROR("Can't allocate tx[%d]->tx_phys\n", i); PORTAL_FREE(tx, sizeof(*tx)); kranal_free_txdescs(freelist); return -ENOMEM; } - tx->tx_isnblk = isnblk; tx->tx_buftype = RANAL_BUF_NONE; tx->tx_msg.ram_type = RANAL_MSG_NONE; @@ -1291,8 +1286,7 @@ kranal_alloc_txdescs(struct list_head *freelist, int n) int kranal_device_init(int id, kra_device_t *dev) { - int total_ntx = *kranal_tunables.kra_ntx + - *kranal_tunables.kra_ntx_nblk; + int total_ntx = *kranal_tunables.kra_ntx; RAP_RETURN rrc; dev->rad_id = id; @@ -1470,7 +1464,6 @@ kranal_shutdown (lnet_ni_t *ni) kranal_device_fini(&kranal_data.kra_devices[i]); kranal_free_txdescs(&kranal_data.kra_idle_txs); - kranal_free_txdescs(&kranal_data.kra_idle_nblk_txs); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read(&libcfs_kmemory)); @@ -1500,9 +1493,19 @@ kranal_startup (lnet_ni_t *ni) CERROR ("Can't determine my NID\n"); return -EPERM; } + + if (*kranal_tunables.kra_credits > *kranal_tunables.kra_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kranal_tunables.kra_credits, + *kranal_tunables.kra_ntx); + return -EINVAL; + } memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */ + ni->ni_maxtxcredits = *kranal_tunables.kra_credits; + ni->ni_peertxcredits = *kranal_tunables.kra_peercredits; + ni->ni_data = &kranal_data; kranal_data.kra_ni = ni; @@ -1538,8 +1541,6 @@ kranal_startup (lnet_ni_t *ni) spin_lock_init(&kranal_data.kra_connd_lock); INIT_LIST_HEAD(&kranal_data.kra_idle_txs); - INIT_LIST_HEAD(&kranal_data.kra_idle_nblk_txs); - init_waitqueue_head(&kranal_data.kra_idle_tx_waitq); spin_lock_init(&kranal_data.kra_tx_lock); /* OK to call kranal_api_shutdown() to cleanup now */ @@ -1569,11 +1570,6 @@ kranal_startup (lnet_ni_t *ni) if (rc != 0) goto failed; - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs, - *kranal_tunables.kra_ntx_nblk); - if (rc != 0) - goto failed; - rc = kranal_thread_start(kranal_reaper, NULL); if (rc != 0) { CERROR("Can't spawn 
ranal reaper: %d\n", rc); diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index 4b0eb7c..4e6ccbb 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -65,8 +65,9 @@ #define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry... */ #define RANAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ -#define RANAL_NTX 64 /* # tx descs */ -#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ +#define RANAL_NTX 256 /* # tx descs */ +#define RANAL_CREDITS 128 /* # concurrent sends */ +#define RANAL_PEERCREDITS 32 /* # concurrent sends to 1 peer */ #define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ * (overflow is a performance hit) */ @@ -94,7 +95,8 @@ typedef struct int *kra_min_reconnect_interval; /* first failed connection retry... */ int *kra_max_reconnect_interval; /* ...exponentially increasing to this */ int *kra_ntx; /* # tx descs */ - int *kra_ntx_nblk; /* # reserved tx descs */ + int *kra_credits; /* # concurrent sends */ + int *kra_peercredits; /* # concurrent sends to 1 peer */ int *kra_fma_cq_size; /* # entries in receive CQ */ int *kra_timeout; /* comms timeout (seconds) */ int *kra_max_immediate; /* immediate payload breakpoint */ @@ -157,9 +159,7 @@ typedef struct spinlock_t kra_connd_lock; /* serialise */ struct list_head kra_idle_txs; /* idle tx descriptors */ - struct list_head kra_idle_nblk_txs; /* idle reserved tx descriptors */ __u64 kra_next_tx_cookie; /* RDMA completion cookie */ - wait_queue_head_t kra_idle_tx_waitq; /* block here for tx descriptor */ spinlock_t kra_tx_lock; /* serialise */ } kra_data_t; @@ -271,7 +271,6 @@ typedef struct kra_tx /* message descriptor */ struct kra_conn *tx_conn; /* owning conn */ lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ int tx_nob; /* # bytes of payload */ int tx_buftype; /* payload 
buffer type */ void *tx_buffer; /* source/sink buffer */ diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c index 43b871b..1e79640 100644 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ b/lnet/klnds/ralnd/ralnd_cb.c @@ -71,56 +71,33 @@ kranal_schedule_conn(kra_conn_t *conn) } kra_tx_t * -kranal_get_idle_tx (int may_block) +kranal_get_idle_tx (void) { unsigned long flags; - kra_tx_t *tx = NULL; + kra_tx_t *tx; - for (;;) { - spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty(&kranal_data.kra_idle_txs)) { - tx = list_entry(kranal_data.kra_idle_txs.next, - kra_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty(&kranal_data.kra_idle_nblk_txs)) { - CERROR("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry(kranal_data.kra_idle_nblk_txs.next, - kra_tx_t, tx_list); - break; - } + spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - /* block for idle tx */ + if (list_empty(&kranal_data.kra_idle_txs)) { spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - - wait_event(kranal_data.kra_idle_tx_waitq, - !list_empty(&kranal_data.kra_idle_txs)); + return NULL; } - if (tx != NULL) { - list_del(&tx->tx_list); - - /* Allocate a new completion cookie. It might not be - * needed, but we've got a lock right now... */ - tx->tx_cookie = kranal_data.kra_next_tx_cookie++; + tx = list_entry(kranal_data.kra_idle_txs.next, kra_tx_t, tx_list); + list_del(&tx->tx_list); - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - } + /* Allocate a new completion cookie. It might not be needed, but we've + * got a lock right now... 
*/ + tx->tx_cookie = kranal_data.kra_next_tx_cookie++; spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); + LASSERT (tx->tx_buftype == RANAL_BUF_NONE); + LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + return tx; } @@ -135,14 +112,13 @@ kranal_init_msg(kra_msg_t *msg, int type) } kra_tx_t * -kranal_new_tx_msg (int may_block, int type) +kranal_new_tx_msg (int type) { - kra_tx_t *tx = kranal_get_idle_tx(may_block); + kra_tx_t *tx = kranal_get_idle_tx(); - if (tx == NULL) - return NULL; + if (tx != NULL) + kranal_init_msg(&tx->tx_msg, type); - kranal_init_msg(&tx->tx_msg, type); return tx; } @@ -404,7 +380,7 @@ kranal_tx_done (kra_tx_t *tx, int completion) if (tx->tx_lntmsg[i] == NULL) continue; - lnet_finalize(kranal_data.kra_ni, NULL, tx->tx_lntmsg[i], + lnet_finalize(kranal_data.kra_ni, tx->tx_lntmsg[i], completion); tx->tx_lntmsg[i] = NULL; } @@ -415,12 +391,7 @@ kranal_tx_done (kra_tx_t *tx, int completion) spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - if (tx->tx_isnblk) { - list_add_tail(&tx->tx_list, &kranal_data.kra_idle_nblk_txs); - } else { - list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs); - wake_up(&kranal_data.kra_idle_tx_waitq); - } + list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs); spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); } @@ -674,7 +645,7 @@ kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) lntmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate) break; /* send IMMEDIATE */ - tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ); + tx = kranal_new_tx_msg(RANAL_MSG_GET_REQ); if (tx == NULL) return -ENOMEM; @@ -722,7 +693,7 @@ kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) return -EIO; } - tx = kranal_get_idle_tx(0); + tx = kranal_get_idle_tx(); if (tx == NULL) return -EIO; @@ -759,10 +730,7 @@ kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) 
nob <= *kranal_tunables.kra_max_immediate) break; /* send IMMEDIATE */ - tx = kranal_new_tx_msg(!(routing || - type == LNET_MSG_REPLY || - in_interrupt()), - RANAL_MSG_PUT_REQ); + tx = kranal_new_tx_msg(RANAL_MSG_PUT_REQ); if (tx == NULL) return -ENOMEM; @@ -784,11 +752,7 @@ kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (kiov == NULL); LASSERT (nob <= RANAL_FMA_MAX_DATA); - tx = kranal_new_tx_msg(!(routing || - type == LNET_MSG_ACK || - type == LNET_MSG_REPLY || - in_interrupt()), - RANAL_MSG_IMMEDIATE); + tx = kranal_new_tx_msg(RANAL_MSG_IMMEDIATE); if (tx == NULL) return -ENOMEM; @@ -855,11 +819,11 @@ kranal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, buffer = ((char *)iov->iov_base) + offset; } rc = kranal_consume_rxmsg(conn, buffer, mlen); - lnet_finalize(ni, NULL, lntmsg, (rc == 0) ? 0 : -EIO); + lnet_finalize(ni, lntmsg, (rc == 0) ? 0 : -EIO); return 0; case RANAL_MSG_PUT_REQ: - tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_ACK); + tx = kranal_new_tx_msg(RANAL_MSG_PUT_ACK); if (tx == NULL) { kranal_consume_rxmsg(conn, NULL, 0); return -ENOMEM; @@ -896,7 +860,7 @@ kranal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, case RANAL_MSG_GET_REQ: /* This one didn't match anything */ - tx = kranal_new_tx_msg(0, RANAL_MSG_GET_NAK); + tx = kranal_new_tx_msg(RANAL_MSG_GET_NAK); if (tx != NULL) { tx->tx_msg.ram_u.completion.racm_cookie = rxmsg->ram_u.get.ragm_cookie; @@ -1775,14 +1739,14 @@ kranal_check_fma_rx (kra_conn_t *conn) case RANAL_MSG_IMMEDIATE: CDEBUG(D_NET, "RX IMMEDIATE on %p\n", conn); rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.immediate.raim_hdr, - conn); + msg->ram_srcnid, conn); repost = rc < 0; break; case RANAL_MSG_PUT_REQ: CDEBUG(D_NET, "RX PUT_REQ on %p\n", conn); rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.putreq.raprm_hdr, - conn); + msg->ram_srcnid, conn); repost = rc < 0; break; @@ -1826,7 +1790,7 @@ kranal_check_fma_rx (kra_conn_t *conn) case RANAL_MSG_GET_REQ: CDEBUG(D_NET, "RX GET_REQ on %p\n", 
conn); rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.get.ragm_hdr, - conn); + msg->ram_srcnid, conn); repost = rc < 0; break; diff --git a/lnet/klnds/ralnd/ralnd_modparams.c b/lnet/klnds/ralnd/ralnd_modparams.c index 526f287..9480934 100644 --- a/lnet/klnds/ralnd/ralnd_modparams.c +++ b/lnet/klnds/ralnd/ralnd_modparams.c @@ -37,11 +37,15 @@ CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, static int ntx = RANAL_NTX; CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of 'normal' transmit descriptors"); + "# of transmit descriptors"); -static int ntx_nblk = RANAL_NTX_NBLK; -CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, - "# of 'reserved' transmit descriptors"); +static int credits = RANAL_NTX; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = RANAL_NTX; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); static int fma_cq_size = RANAL_FMA_CQ_SIZE; CFS_MODULE_PARM(fma_cq_size, "i", int, 0444, @@ -60,7 +64,8 @@ kra_tunables_t kranal_tunables = { .kra_min_reconnect_interval = &min_reconnect_interval, .kra_max_reconnect_interval = &max_reconnect_interval, .kra_ntx = &ntx, - .kra_ntx_nblk = &ntx_nblk, + .kra_credits = &credits, + .kra_peercredits = &peer_credits, .kra_fma_cq_size = &fma_cq_size, .kra_timeout = &timeout, .kra_max_immediate = &max_immediate, @@ -76,13 +81,15 @@ static ctl_table kranal_ctl_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {4, "ntx", &ntx, sizeof(int), 0444, NULL, &proc_dointvec}, - {5, "ntx_nblk", &ntx_nblk, + {5, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {6, "peer_credits", &peer_credits, sizeof(int), 0444, NULL, &proc_dointvec}, - {6, "fma_cq_size", &fma_cq_size, + {7, "fma_cq_size", &fma_cq_size, sizeof(int), 0444, NULL, &proc_dointvec}, - {7, "timeout", &timeout, + {8, "timeout", &timeout, sizeof(int), 0644, NULL, &proc_dointvec}, - {8, "max_immediate", &max_immediate, + {9, "max_immediate", &max_immediate, sizeof(int), 0644, NULL, 
&proc_dointvec}, {0} }; diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index bc7309e..1c31f8d 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1325,8 +1325,9 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * zero-copy transmits will therefore complete in finite time. */ ksocknal_connsock_decref(conn); - if (notify) - lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0, then); + /* no auto-down for now */ + // if (notify) + // lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0, then); } void @@ -1365,8 +1366,8 @@ ksocknal_destroy_conn (ksock_conn_t *conn) ", ip %d.%d.%d.%d:%d, with error\n", libcfs_id2str(conn->ksnc_peer->ksnp_id), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - lnet_finalize (conn->ksnc_peer->ksnp_ni, NULL, - conn->ksnc_cookie, -EIO); + lnet_finalize (conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); break; case SOCKNAL_RX_HEADER: case SOCKNAL_RX_SLOP: @@ -2206,7 +2207,9 @@ ksocknal_startup (lnet_ni_t *ni) spin_lock_init(&net->ksnn_lock); net->ksnn_incarnation = ksocknal_new_incarnation(); ni->ni_data = net; - + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peercredits; + if (ni->ni_interfaces[0] == NULL) { rc = ksocknal_enumerate_interfaces(net); if (rc <= 0) diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 6fa24b1..99c0926 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -58,6 +58,8 @@ #define SOCKNAL_KEEPALIVE_IDLE 30 /* # seconds idle before 1st probe */ #define SOCKNAL_KEEPALIVE_COUNT 10 /* # unanswered probes to determine peer death */ #define SOCKNAL_KEEPALIVE_INTVL 2 /* seconds between probes */ +#define SOCKNAL_CREDITS 256 /* # concurrent sends */ +#define SOCKNAL_PEERCREDITS 8 /* # concurrent sends to 1 peer */ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -126,6 +128,8 @@ typedef struct int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ int 
*ksnd_keepalive_count; /* # probes */ int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peercredits; /* # concurrent sends to 1 peer */ #if SOCKNAL_ZC unsigned int *ksnd_zc_min_frag; /* minimum zero copy frag size */ #endif diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index c5b24e2..0266b68 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -376,8 +376,7 @@ ksocknal_tx_done (ksock_peer_t *peer, ksock_tx_t *tx, int asynch) ltx = KSOCK_TX_2_KSOCK_LTX (tx); - lnet_finalize (peer->ksnp_ni, - ltx->ltx_private, ltx->ltx_cookie, + lnet_finalize (peer->ksnp_ni, ltx->ltx_cookie, (tx->tx_resid == 0) ? 0 : -EIO); ksocknal_free_ltx (ltx); @@ -1014,7 +1013,8 @@ ksocknal_process_receive (ksock_conn_t *conn) conn->ksnc_rx_state = SOCKNAL_RX_PARSE; ksocknal_conn_addref(conn); /* ++ref while parsing */ - rc = lnet_parse(conn->ksnc_peer->ksnp_ni, &conn->ksnc_hdr, conn); + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, &conn->ksnc_hdr, + conn->ksnc_peer->ksnp_id.nid, conn); if (rc < 0) { /* I just received garbage: give up on this conn */ ksocknal_new_packet(conn, 0); @@ -1035,7 +1035,7 @@ ksocknal_process_receive (ksock_conn_t *conn) case SOCKNAL_RX_BODY: /* payload all received */ - lnet_finalize(conn->ksnc_peer->ksnp_ni, NULL, conn->ksnc_cookie, 0); + lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, 0); /* Fall through */ case SOCKNAL_RX_SLOP: diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 66f6d45..294edbf 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -18,6 +18,12 @@ ksocknal_lib_tunables_init () {j++, "timeout", ksocknal_tunables.ksnd_timeout, sizeof (int), 0644, NULL, &proc_dointvec}; ksocknal_ctl_table[i++] = (ctl_table) + {j++, "credits", ksocknal_tunables.ksnd_credits, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] 
= (ctl_table) + {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) {j++, "nconnds", ksocknal_tunables.ksnd_nconnds, sizeof (int), 0444, NULL, &proc_dointvec}; ksocknal_ctl_table[i++] = (ctl_table) diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c index 8aa1e3e..f413a42 100644 --- a/lnet/klnds/socklnd/socklnd_modparams.c +++ b/lnet/klnds/socklnd/socklnd_modparams.c @@ -24,6 +24,14 @@ static int timeout = SOCKNAL_TIMEOUT; CFS_MODULE_PARM(timeout, "i", int, 0644, "dead socket timeout (seconds)"); +static int credits = SOCKNAL_CREDITS; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = SOCKNAL_PEERCREDITS; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + static int nconnds = SOCKNAL_NCONND; CFS_MODULE_PARM(nconnds, "i", int, 0444, "# connection daemons"); @@ -82,6 +90,8 @@ CFS_MODULE_PARM(zc_min_frag, "i", int, 0644, ksock_tunables_t ksocknal_tunables = { .ksnd_timeout = &timeout, + .ksnd_credits = &credits, + .ksnd_peercredits = &peer_credits, .ksnd_nconnds = &nconnds, .ksnd_min_reconnectms = &min_reconnectms, .ksnd_max_reconnectms = &max_reconnectms, diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 527c198..d1cd5b3 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -1404,17 +1404,10 @@ kibnal_setup_tx_descs (void) &rkey); LASSERT (vvrc == vv_return_ok); - tx->tx_isnblk = (i >= *kibnal_tunables.kib_ntx); - CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, tx->tx_msg, tx->tx_lkey); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1570,6 +1563,16 @@ kibnal_startup (lnet_ni_t *ni) return -EPERM; } + 
if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kibnal_tunables.kib_credits, + *kibnal_tunables.kib_ntx); + return -EINVAL; + } + + ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; + ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; + CLASSERT (LNET_MAX_INTERFACES > 1); if (ni->ni_interfaces[0] != NULL) { @@ -1658,8 +1661,6 @@ kibnal_startup (lnet_ni_t *ni) spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); rc = kibnal_alloc_tx_descs(); if (rc != 0) { diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index 0cc7906..7d0e5c53 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -96,8 +96,9 @@ #define IBNAL_CONCURRENT_PEERS 1152 /* # nodes all talking at once to me */ #define IBNAL_CKSUM 0 /* checksum kib_msg_t? */ #define IBNAL_TIMEOUT 50 /* default comms timeout (seconds) */ -#define IBNAL_NTX 32 /* # tx descs */ -#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ +#define IBNAL_NTX 256 /* # tx descs */ +#define IBNAL_CREDITS 128 /* # concurrent sends */ +#define IBNAL_PEERCREDITS 8 /* # concurrent sends to 1 peer */ #define IBNAL_ARP_RETRIES 3 /* # times to retry ARP */ #define IBNAL_HCA_BASENAME "InfiniHost" /* HCA basename */ #define IBNAL_IPIF_BASENAME "ipoib" /* IPoIB interface basename */ @@ -146,8 +147,7 @@ /* derived constants... */ /* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx + \ - *kibnal_tunables.kib_ntx_nblk) +#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) #define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) #define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) @@ -174,7 +174,8 @@ typedef struct int *kib_cksum; /* checksum kib_msg_t? 
*/ int *kib_timeout; /* comms timeout (seconds) */ int *kib_ntx; /* # tx descs */ - int *kib_ntx_nblk; /* # reserved tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peercredits; /* # concurrent sends to 1 peer */ int *kib_arp_retries; /* # times to retry ARP */ char **kib_hca_basename; /* HCA base name */ char **kib_ipif_basename; /* IPoIB interface base name */ @@ -248,8 +249,6 @@ typedef struct kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ __u64 kib_next_tx_cookie; /* RDMA completion cookie */ spinlock_t kib_tx_lock; /* serialise */ @@ -292,7 +291,6 @@ typedef struct kib_rx /* receive message */ typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ int tx_sending; /* # tx callbacks outstanding */ int tx_queued; /* queued for sending */ diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index 17836e7..439d836 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -56,7 +56,7 @@ kibnal_tx_done (kib_tx_t *tx) if (tx->tx_lntmsg[i] == NULL) continue; - lnet_finalize (kibnal_data.kib_ni, NULL, tx->tx_lntmsg[i], rc); + lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i], rc); tx->tx_lntmsg[i] = NULL; } @@ -70,73 +70,44 @@ kibnal_tx_done (kib_tx_t *tx) spin_lock(&kibnal_data.kib_tx_lock); - if (tx->tx_isnblk) { - list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); spin_unlock(&kibnal_data.kib_tx_lock); } kib_tx_t * -kibnal_get_idle_tx (int may_block) 
+kibnal_get_idle_tx (void) { - kib_tx_t *tx = NULL; - ENTRY; + kib_tx_t *tx; - for (;;) { - spin_lock(&kibnal_data.kib_tx_lock); - - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } + spin_lock(&kibnal_data.kib_tx_lock); - /* block for idle tx */ + /* "normal" descriptor is free */ + if (list_empty (&kibnal_data.kib_idle_txs)) { spin_unlock(&kibnal_data.kib_tx_lock); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + return NULL; } - if (tx != NULL) { - list_del (&tx->tx_list); - - /* Allocate a new completion cookie. It might not be needed, - * but we've got a lock right now and we're unlikely to - * wrap... */ - tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - LASSERT (tx->tx_nwrq == 0); - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_sending == 0); - LASSERT (!tx->tx_waiting); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - } + /* Allocate a new completion cookie. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... 
*/ + tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; spin_unlock(&kibnal_data.kib_tx_lock); - - RETURN(tx); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + + return tx; } int @@ -297,7 +268,7 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) void kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) { - kib_tx_t *tx = kibnal_get_idle_tx(0); + kib_tx_t *tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't get tx for completion %x for %s\n", @@ -351,12 +322,14 @@ kibnal_handle_rx (kib_rx_t *rx) break; case IBNAL_MSG_IMMEDIATE: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx); repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_PUT_REQ: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx); repost = rc < 0; /* repost on error */ break; @@ -412,7 +385,8 @@ kibnal_handle_rx (kib_rx_t *rx) break; case IBNAL_MSG_GET_REQ: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx); repost = rc < 0; /* repost on error */ break; @@ -878,7 +852,7 @@ kibnal_check_sends (kib_conn_t *conn) conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { spin_unlock(&conn->ibc_lock); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); @@ -1424,9 +1398,13 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) if (nob <= IBNAL_MSG_SIZE) break; /* send IMMEDIATE */ - tx = 
kibnal_get_idle_tx(1); /* may block; caller is an app thread */ - LASSERT (tx != NULL); - + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + ibmsg = tx->tx_msg; ibmsg->ibm_u.get.ibgm_hdr = *hdr; ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; @@ -1493,7 +1471,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* NB handle_rx() will send GET_NAK when I return to * it from here, unless I set rx_responded! */ - tx = kibnal_get_idle_tx(0); + tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't get tx for REPLY to %s\n", libcfs_nid2str(target.nid)); @@ -1528,8 +1506,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) libcfs_nid2str(target.nid), rc); } else if (rc == 0) { /* No RDMA: local completion may happen now! */ - lnet_finalize (kibnal_data.kib_ni, NULL, - lntmsg, 0); + lnet_finalize (kibnal_data.kib_ni, lntmsg, 0); } else { /* RDMA: lnet_finalize(lntmsg) when it * completes */ @@ -1550,7 +1527,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) break; /* send IMMEDIATE */ /* may block if caller is app thread */ - tx = kibnal_get_idle_tx(!(routing || type == LNET_MSG_REPLY)); + tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't allocate %s txd for %s\n", type == LNET_MSG_PUT ? 
"PUT" : "REPLY", @@ -1589,9 +1566,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) <= IBNAL_MSG_SIZE); - tx = kibnal_get_idle_tx(!(routing || - type == LNET_MSG_ACK || - type == LNET_MSG_REPLY)); + tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR ("Can't send %d to %s: tx descs exhausted\n", type, libcfs_nid2str(target.nid)); @@ -1663,18 +1638,18 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, IBNAL_MSG_SIZE, rxmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); - lnet_finalize (ni, NULL, lntmsg, 0); + lnet_finalize (ni, lntmsg, 0); break; case IBNAL_MSG_PUT_REQ: if (mlen == 0) { - lnet_finalize(ni, NULL, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, rxmsg->ibm_u.putreq.ibprm_cookie); break; } - tx = kibnal_get_idle_tx(0); + tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't allocate tx for %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c index 5b33794..d830d07 100644 --- a/lnet/klnds/viblnd/viblnd_modparams.c +++ b/lnet/klnds/viblnd/viblnd_modparams.c @@ -49,11 +49,15 @@ CFS_MODULE_PARM(timeout, "i", int, 0644, static int ntx = IBNAL_NTX; CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of 'normal' message descriptors"); + "# of message descriptors"); -static int ntx_nblk = IBNAL_NTX_NBLK; -CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, - "# of 'reserved' message descriptors"); +static int credits = IBNAL_CREDITS; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = IBNAL_PEERCREDITS; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); static int arp_retries = IBNAL_ARP_RETRIES; CFS_MODULE_PARM(arp_retries, "i", int, 0644, @@ -97,7 +101,8 @@ kib_tunables_t kibnal_tunables = { .kib_cksum = &cksum, .kib_timeout = &timeout, 
.kib_ntx = &ntx, - .kib_ntx_nblk = &ntx_nblk, + .kib_credits = &credits, + .kib_peercredits = &peer_credits, .kib_arp_retries = &arp_retries, .kib_hca_basename = &hca_basename, .kib_ipif_basename = &ipif_basename, @@ -130,24 +135,26 @@ static ctl_table kibnal_ctl_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {7, "ntx", &ntx, sizeof(int), 0444, NULL, &proc_dointvec}, - {8, "ntx_nblk", &ntx_nblk, + {8, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "peer_credits", &peer_credits, sizeof(int), 0444, NULL, &proc_dointvec}, - {9, "arp_retries", &arp_retries, + {10, "arp_retries", &arp_retries, sizeof(int), 0644, NULL, &proc_dointvec}, - {10, "hca_basename", hca_basename_space, + {11, "hca_basename", hca_basename_space, sizeof(hca_basename_space), 0444, NULL, &proc_dostring}, - {11, "ipif_basename", ipif_basename_space, + {12, "ipif_basename", ipif_basename_space, sizeof(ipif_basename_space), 0444, NULL, &proc_dostring}, - {12, "local_ack_timeout", &local_ack_timeout, + {13, "local_ack_timeout", &local_ack_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, - {13, "retry_cnt", &retry_cnt, + {14, "retry_cnt", &retry_cnt, sizeof(int), 0644, NULL, &proc_dointvec}, - {14, "rnr_cnt", &rnr_cnt, + {15, "rnr_cnt", &rnr_cnt, sizeof(int), 0644, NULL, &proc_dointvec}, - {15, "rnr_nak_timer", &rnr_nak_timer, + {16, "rnr_nak_timer", &rnr_nak_timer, sizeof(int), 0644, NULL, &proc_dointvec}, #if IBNAL_USE_FMR - {16, "fmr_remaps", &fmr_remaps, + {17, "fmr_remaps", &fmr_remaps, sizeof(int), 0444, NULL, &proc_dointvec}, #endif {0} diff --git a/lnet/libcfs/nidstrings.c b/lnet/libcfs/nidstrings.c index 5f76612..1f0c9fd 100644 --- a/lnet/libcfs/nidstrings.c +++ b/lnet/libcfs/nidstrings.c @@ -89,7 +89,7 @@ static void libcfs_num_addr2str(__u32 addr, char *str); static int libcfs_num_str2addr(char *str, int nob, __u32 *addr); struct netstrfns { - int nf_lnd; + int nf_type; char *nf_name; char *nf_modname; void (*nf_addr2str)(__u32 addr, char *str); @@ -97,48 
+97,48 @@ struct netstrfns { }; static struct netstrfns libcfs_netstrfns[] = { - {.nf_lnd = LOLND, + {.nf_type = LOLND, .nf_name = "lo", .nf_modname = "klolnd", .nf_addr2str = libcfs_num_addr2str, .nf_str2addr = libcfs_lo_str2addr}, - {.nf_lnd = SOCKLND, + {.nf_type = SOCKLND, .nf_name = "tcp", .nf_modname = "ksocklnd", .nf_addr2str = libcfs_ip_addr2str, .nf_str2addr = libcfs_ip_str2addr}, - {.nf_lnd = OPENIBLND, + {.nf_type = OPENIBLND, .nf_name = "openib", .nf_modname = "kopeniblnd", .nf_addr2str = libcfs_ip_addr2str, .nf_str2addr = libcfs_ip_str2addr}, - {.nf_lnd = IIBLND, + {.nf_type = IIBLND, .nf_name = "iib", .nf_modname = "kiiblnd", .nf_addr2str = libcfs_ip_addr2str, .nf_str2addr = libcfs_ip_str2addr}, - {.nf_lnd = VIBLND, + {.nf_type = VIBLND, .nf_name = "vib", .nf_modname = "kviblnd", .nf_addr2str = libcfs_ip_addr2str, .nf_str2addr = libcfs_ip_str2addr}, - {.nf_lnd = RALND, + {.nf_type = RALND, .nf_name = "ra", .nf_modname = "kralnd", .nf_addr2str = libcfs_ip_addr2str, .nf_str2addr = libcfs_ip_str2addr}, - {.nf_lnd = QSWLND, + {.nf_type = QSWLND, .nf_name = "elan", .nf_modname = "kqswlnd", .nf_addr2str = libcfs_num_addr2str, .nf_str2addr = libcfs_num_str2addr}, - {.nf_lnd = GMLND, + {.nf_type = GMLND, .nf_name = "gm", .nf_modname = "kgmlnd", .nf_addr2str = libcfs_num_addr2str, .nf_str2addr = libcfs_num_str2addr}, /* placeholder for net0 alias. 
It MUST BE THE LAST ENTRY */ - {.nf_lnd = -1}, + {.nf_type = -1}, }; const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); @@ -248,7 +248,7 @@ libcfs_lnd2netstrfns(int lnd) if (lnd >= 0) for (i = 0; i < libcfs_nnetstrfns; i++) - if (lnd == libcfs_netstrfns[i].nf_lnd) + if (lnd == libcfs_netstrfns[i].nf_type) return &libcfs_netstrfns[i]; return NULL; @@ -260,7 +260,7 @@ libcfs_name2netstrfns(char *name) int i; for (i = 0; i < libcfs_nnetstrfns; i++) - if (libcfs_netstrfns[i].nf_lnd >= 0 && + if (libcfs_netstrfns[i].nf_type >= 0 && !strcmp(libcfs_netstrfns[i].nf_name, name)) return &libcfs_netstrfns[i]; @@ -301,7 +301,7 @@ libcfs_str2lnd(char *str) struct netstrfns *nf = libcfs_name2netstrfns(str); if (nf != NULL) - return nf->nf_lnd; + return nf->nf_type; return -1; } @@ -358,7 +358,7 @@ libcfs_nid2str(lnet_nid_t nid) } static struct netstrfns * -libcfs_str2net_interlnd(char *str, __u32 *net) +libcfs_str2net_internal(char *str, __u32 *net) { struct netstrfns *nf; int nob; @@ -367,8 +367,8 @@ libcfs_str2net_interlnd(char *str, __u32 *net) for (i = 0; i < libcfs_nnetstrfns; i++) { nf = &libcfs_netstrfns[i]; - - if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + if (nf->nf_type >= 0 && + !strncmp(str, nf->nf_name, strlen(nf->nf_name))) break; } @@ -380,7 +380,7 @@ libcfs_str2net_interlnd(char *str, __u32 *net) if (strlen(str) == nob) { netnum = 0; } else { - if (nf->nf_lnd == LOLND) /* net number not allowed */ + if (nf->nf_type == LOLND) /* net number not allowed */ return NULL; str += nob; @@ -390,7 +390,7 @@ libcfs_str2net_interlnd(char *str, __u32 *net) return NULL; } - *net = PTL_MKNET(nf->nf_lnd, netnum); + *net = PTL_MKNET(nf->nf_type, netnum); return nf; } @@ -399,7 +399,7 @@ libcfs_str2net(char *str) { __u32 net; - if (libcfs_str2net_interlnd(str, &net) != NULL) + if (libcfs_str2net_internal(str, &net) != NULL) return net; return PTL_NIDNET(LNET_NID_ANY); @@ -414,7 +414,7 @@ libcfs_str2nid(char *str) __u32 addr; if (sep != 
NULL) { - nf = libcfs_str2net_interlnd(sep + 1, &net); + nf = libcfs_str2net_internal(sep + 1, &net); if (nf == NULL) return LNET_NID_ANY; } else { @@ -465,14 +465,14 @@ libcfs_setnet0alias(int lnd) * this assignment here means we can parse and print its NIDs */ LASSERT (nf != NULL); - LASSERT (nf0->nf_lnd < 0); + LASSERT (nf0->nf_type < 0); nf0->nf_name = "zero";//nf->nf_name; nf0->nf_modname = nf->nf_modname; nf0->nf_addr2str = nf->nf_addr2str; nf0->nf_str2addr = nf->nf_str2addr; mb(); - nf0->nf_lnd = 0; + nf0->nf_type = 0; } EXPORT_SYMBOL(libcfs_isknown_lnd); diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c index 12ce183..ff3bf22 100644 --- a/lnet/libcfs/tracefile.c +++ b/lnet/libcfs/tracefile.c @@ -201,13 +201,7 @@ void libcfs_debug_msg(int subsys, int mask, char *file, const char *fn, debug_buf = cfs_page_address(tage->page) + tage->used + known_size; max_nob = CFS_PAGE_SIZE - tage->used - known_size; - if (max_nob <= 0) { - printk(KERN_EMERG "negative max_nob: %i\n", max_nob); - debug_buf = format; - needed = strlen(format); - mask |= D_ERROR; - goto out; - } + LASSERT(max_nob > 0); va_start(ap, format); needed = vsnprintf(debug_buf, max_nob, format, ap); va_end(ap); diff --git a/lnet/lnet/Makefile.in b/lnet/lnet/Makefile.in index 53461a1..3bc86f6 100644 --- a/lnet/lnet/Makefile.in +++ b/lnet/lnet/Makefile.in @@ -3,7 +3,7 @@ MODULES := lnet lnet-objs := api-errno.o api-ni.o config.o lnet-objs += lib-me.o lib-msg.o lib-eq.o lib-md.o lnet-objs += lib-move.o module.o lo.o -lnet-objs += router.o router_proc.o acceptor.o +lnet-objs += router.o router_proc.o acceptor.o peer.o default: all diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 87bced7..3992fab 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -415,12 +415,17 @@ lnet_init(lnet_pid_t requested_pid) CFS_INIT_LIST_HEAD (&the_lnet.ln_test_peers); CFS_INIT_LIST_HEAD (&the_lnet.ln_nis); CFS_INIT_LIST_HEAD (&the_lnet.ln_zombie_nis); + CFS_INIT_LIST_HEAD 
(&the_lnet.ln_remote_nets); the_lnet.ln_interface_cookie = lnet_create_interface_cookie(); rc = lnet_setup_handle_hash (); if (rc != 0) goto out; + + rc = lnet_create_peer_table(); + if (rc != 0) + goto out; the_lnet.ln_nportals = MAX_PORTALS; PORTAL_ALLOC(the_lnet.ln_portals, @@ -436,8 +441,9 @@ lnet_init(lnet_pid_t requested_pid) out: if (rc != 0) { - lnet_cleanup_handle_hash (); - lnet_descriptor_cleanup (); + lnet_destroy_peer_table(); + lnet_cleanup_handle_hash(); + lnet_descriptor_cleanup(); } RETURN (rc); @@ -494,6 +500,7 @@ lnet_fini (void) lnet_msg_t, msg_activelist); CERROR ("Active msg %p on exit\n", msg); + LASSERT (msg->msg_onactivelist); list_del (&msg->msg_activelist); lnet_msg_free (msg); } @@ -501,8 +508,9 @@ lnet_fini (void) PORTAL_FREE(the_lnet.ln_portals, the_lnet.ln_nportals * sizeof(*the_lnet.ln_portals)); - lnet_cleanup_handle_hash (); - lnet_descriptor_cleanup (); + lnet_destroy_peer_table(); + lnet_cleanup_handle_hash(); + lnet_descriptor_cleanup(); #ifndef __KERNEL__ pthread_mutex_destroy(&the_lnet.ln_mutex); @@ -513,13 +521,11 @@ lnet_fini (void) } lnet_ni_t * -lnet_net2ni (__u32 net) +lnet_net2ni_locked (__u32 net) { struct list_head *tmp; lnet_ni_t *ni; - unsigned long flags; - LNET_LOCK(flags); list_for_each (tmp, &the_lnet.ln_nis) { ni = list_entry(tmp, lnet_ni_t, ni_list); @@ -528,12 +534,10 @@ lnet_net2ni (__u32 net) net == 0 && PTL_NETTYP(PTL_NIDNET(ni->ni_nid)) != LOLND)) { lnet_ni_addref_locked(ni); - LNET_UNLOCK(flags); return ni; } } - LNET_UNLOCK(flags); return NULL; } @@ -545,11 +549,10 @@ lnet_count_acceptor_nis (lnet_ni_t **first_ni) * binary compatibility. 
*/ int count = 0; #ifdef __KERNEL__ - unsigned long flags; struct list_head *tmp; lnet_ni_t *ni; - LNET_LOCK(flags); + LNET_LOCK(); list_for_each (tmp, &the_lnet.ln_nis) { ni = list_entry(tmp, lnet_ni_t, ni_list); @@ -563,7 +566,7 @@ lnet_count_acceptor_nis (lnet_ni_t **first_ni) } } - LNET_UNLOCK(flags); + LNET_UNLOCK(); #endif return count; } @@ -573,10 +576,9 @@ lnet_islocalnid (lnet_nid_t nid) { struct list_head *tmp; lnet_ni_t *ni; - unsigned long flags; int islocal = 0; - LNET_LOCK(flags); + LNET_LOCK(); list_for_each (tmp, &the_lnet.ln_nis) { ni = list_entry(tmp, lnet_ni_t, ni_list); @@ -587,7 +589,7 @@ lnet_islocalnid (lnet_nid_t nid) } } - LNET_UNLOCK(flags); + LNET_UNLOCK(); return islocal; } @@ -596,11 +598,10 @@ lnet_islocalnet (__u32 net, int *orderp) { struct list_head *tmp; lnet_ni_t *ni; - unsigned long flags; int order = 0; int islocal = 0; - LNET_LOCK(flags); + LNET_LOCK(); list_for_each (tmp, &the_lnet.ln_nis) { ni = list_entry(tmp, lnet_ni_t, ni_list); @@ -614,7 +615,7 @@ lnet_islocalnet (__u32 net, int *orderp) order++; } - LNET_UNLOCK(flags); + LNET_UNLOCK(); return islocal; } @@ -624,28 +625,27 @@ lnet_shutdown_lndnis (void) int i; int islo; lnet_ni_t *ni; - unsigned long flags; /* NB called holding the global mutex */ /* All quiet on the API front */ + LASSERT (!the_lnet.ln_shutdown); LASSERT (the_lnet.ln_refcount == 0); LASSERT (list_empty(&the_lnet.ln_zombie_nis)); LASSERT (the_lnet.ln_nzombie_nis == 0); + LASSERT (list_empty(&the_lnet.ln_remote_nets)); - /* First unlink the NIs from the global list and drop its ref. 
When - * the last ref goes, the NI is queued on apini_zombie_nis....*/ + LNET_LOCK(); + the_lnet.ln_shutdown = 1; /* flag shutdown */ - LNET_LOCK(flags); + /* Unlink NIs from the global table */ while (!list_empty(&the_lnet.ln_nis)) { ni = list_entry(the_lnet.ln_nis.next, lnet_ni_t, ni_list); list_del (&ni->ni_list); - ni->ni_shutdown = 1; the_lnet.ln_nzombie_nis++; - - lnet_ni_decref_locked(ni); /* drop apini's ref (shutdown on last ref) */ + lnet_ni_decref_locked(ni); /* drop apini's ref */ } /* Drop the cached loopback NI. */ @@ -653,20 +653,26 @@ lnet_shutdown_lndnis (void) lnet_ni_decref_locked(lnet_loni); lnet_loni = NULL; } + LNET_UNLOCK(); + /* Clear the peer table and wait for all peers to go (they hold refs on + * their NIs) */ + lnet_clear_peer_table(); + + LNET_LOCK(); /* Now wait for the NI's I just nuked to show up on apini_zombie_nis * and shut them down in guaranteed thread context */ i = 2; while (the_lnet.ln_nzombie_nis != 0) { while (list_empty(&the_lnet.ln_zombie_nis)) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); ++i; if ((i & (-i)) == i) CDEBUG(D_WARNING,"Waiting for %d zombie NIs\n", the_lnet.ln_nzombie_nis); cfs_pause(cfs_time_seconds(1)); - LNET_LOCK(flags); + LNET_LOCK(); } ni = list_entry(the_lnet.ln_zombie_nis.next, @@ -674,7 +680,7 @@ lnet_shutdown_lndnis (void) list_del(&ni->ni_list); ni->ni_lnd->lnd_refcount--; - LNET_UNLOCK(flags); + LNET_UNLOCK(); islo = ni->ni_lnd->lnd_type == LOLND; @@ -690,10 +696,12 @@ lnet_shutdown_lndnis (void) PORTAL_FREE(ni, sizeof(*ni)); - LNET_LOCK(flags); + LNET_LOCK(); the_lnet.ln_nzombie_nis--; } - LNET_UNLOCK(flags); + + the_lnet.ln_shutdown = 0; + LNET_UNLOCK(); if (the_lnet.ln_network_tokens != NULL) { PORTAL_FREE(the_lnet.ln_network_tokens, @@ -709,7 +717,6 @@ lnet_startup_lndnis (void) lnet_ni_t *ni; struct list_head nilist; int rc = 0; - unsigned long flags; int lnd_type; int retry; @@ -751,9 +758,9 @@ lnet_startup_lndnis (void) ni->ni_refcount = 1; - LNET_LOCK(flags); + LNET_LOCK(); 
lnd->lnd_refcount++; - LNET_UNLOCK(flags); + LNET_UNLOCK(); ni->ni_lnd = lnd; @@ -764,15 +771,28 @@ lnet_startup_lndnis (void) if (rc != 0) { CERROR("Error %d starting up NI %s\n", rc, libcfs_lnd2str(lnd->lnd_type)); - LNET_LOCK(flags); + LNET_LOCK(); lnd->lnd_refcount--; - LNET_UNLOCK(flags); + LNET_UNLOCK(); goto failed; } if (lnd->lnd_type != LOLND) { - LCONSOLE(0, "Added NI %s\n", - libcfs_nid2str(ni->ni_nid)); + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR("NI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? + "" : "per-peer "); + goto failed; + } + + ni->ni_txcredits = + ni->ni_mintxcredits = ni->ni_maxtxcredits; + + LCONSOLE(0, "Added NI %s [%d/%d]\n", + libcfs_nid2str(ni->ni_nid), + ni->ni_peertxcredits, ni->ni_txcredits); /* Handle nidstrings for network 0 just like this one */ if (the_lnet.ln_ptlcompat > 0) @@ -781,9 +801,9 @@ lnet_startup_lndnis (void) list_del(&ni->ni_list); - LNET_LOCK(flags); + LNET_LOCK(); list_add_tail(&ni->ni_list, &the_lnet.ln_nis); - LNET_UNLOCK(flags); + LNET_UNLOCK(); } lnet_loni = lnet_net2ni(PTL_MKNET(LOLND, 0)); @@ -871,8 +891,20 @@ LNetFini(void) LASSERT (list_empty(&the_lnet.ln_lnds)); the_lnet.ln_init = 0; + + } +#ifndef __KERNEL__ +void lnet_proc_init(void) +{ +} + +void lnet_proc_fini(void) +{ +} +#endif + int LNetNIInit(lnet_pid_t requested_pid) { @@ -892,27 +924,29 @@ LNetNIInit(lnet_pid_t requested_pid) if (rc != 0) goto out; - rc = kpr_initialise(); + rc = lnet_startup_lndnis(); if (rc != 0) { lnet_fini(); goto out; } - rc = lnet_startup_lndnis(); + rc = lnet_router_init(); if (rc != 0) { - kpr_finalise(); + lnet_shutdown_lndnis(); lnet_fini(); goto out; } - + rc = lnet_acceptor_start(); if (rc != 0) { + lnet_router_fini(); lnet_shutdown_lndnis(); - kpr_finalise(); lnet_fini(); goto out; } + lnet_proc_init(); + the_lnet.ln_refcount = 1; out: @@ -930,9 +964,10 @@ LNetNIFini() the_lnet.ln_refcount--; if (the_lnet.ln_refcount == 0) { + 
lnet_proc_fini(); lnet_acceptor_stop(); + lnet_router_fini(); lnet_shutdown_lndnis(); - kpr_finalise(); lnet_fini(); } @@ -961,10 +996,18 @@ LNetCtl(unsigned int cmd, void *arg) return lnet_fail_nid(data->ioc_nid, data->ioc_count); case IOC_PORTAL_ADD_ROUTE: + return lnet_add_route(data->ioc_net, data->ioc_count, + data->ioc_nid); case IOC_PORTAL_DEL_ROUTE: + return lnet_del_route(data->ioc_net, data->ioc_nid); + case IOC_PORTAL_GET_ROUTE: + return lnet_get_route(data->ioc_count, + &data->ioc_net, &data->ioc_count, + &data->ioc_nid, &data->ioc_flags); case IOC_PORTAL_NOTIFY_ROUTER: - return kpr_ctl(cmd, arg); + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + (time_t)data->ioc_u64[0]); case IOC_PORTAL_PORTALS_COMPATIBILITY: return the_lnet.ln_ptlcompat; @@ -989,14 +1032,13 @@ int LNetGetId(unsigned int index, lnet_process_id_t *id) { lnet_ni_t *ni; - unsigned long flags; struct list_head *tmp; int rc = -ENOENT; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - LNET_LOCK(flags); + LNET_LOCK(); list_for_each(tmp, &the_lnet.ln_nis) { if (index-- != 0) @@ -1010,7 +1052,7 @@ LNetGetId(unsigned int index, lnet_process_id_t *id) break; } - LNET_UNLOCK(flags); + LNET_UNLOCK(); return rc; } diff --git a/lnet/lnet/autoMakefile.am b/lnet/lnet/autoMakefile.am index 858d5fe..dc09747 100644 --- a/lnet/lnet/autoMakefile.am +++ b/lnet/lnet/autoMakefile.am @@ -1,7 +1,7 @@ my_sources = api-errno.c api-ni.c config.c \ lib-me.c lib-msg.c lib-eq.c \ lib-md.c lib-move.c lo.c \ - router.c acceptor.c + router.c acceptor.c peer.c if LIBLUSTRE diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c index 58ef458..2487187 100644 --- a/lnet/lnet/config.c +++ b/lnet/lnet/config.c @@ -143,6 +143,7 @@ lnet_new_ni(__u32 net, struct list_head *nilist) /* NAL will fill in the address part of the NID */ ni->ni_nid = PTL_MKNID(net, 0); + CFS_INIT_LIST_HEAD(&ni->ni_txq); list_add_tail(&ni->ni_list, nilist); return ni; @@ -649,7 +650,7 @@ lnet_parse_route (char *str) nid = 
libcfs_str2nid(ptb->ptb_text); LASSERT (nid != LNET_NID_ANY); - rc = kpr_add_route (net, hops, nid); + rc = lnet_add_route (net, hops, nid); if (rc != 0) { CERROR("Can't create route " "to %s via %s\n", diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c index 99eb32a..9e3f480 100644 --- a/lnet/lnet/lib-eq.c +++ b/lnet/lnet/lib-eq.c @@ -30,7 +30,6 @@ LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, lnet_handle_eq_t *handle) { lnet_eq_t *eq; - unsigned long flags; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); @@ -56,9 +55,9 @@ LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, PORTAL_ALLOC(eq->eq_events, count * sizeof(lnet_event_t)); if (eq->eq_events == NULL) { - LNET_LOCK(flags); + LNET_LOCK(); lnet_eq_free (eq); - LNET_UNLOCK(flags); + LNET_UNLOCK(); } /* NB this resets all event sequence numbers to 0, to be earlier @@ -71,12 +70,12 @@ LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, eq->eq_refcount = 0; eq->eq_callback = callback; - LNET_LOCK(flags); + LNET_LOCK(); lnet_initialise_handle (&eq->eq_lh, LNET_COOKIE_TYPE_EQ); list_add (&eq->eq_list, &the_lnet.ln_active_eqs); - LNET_UNLOCK(flags); + LNET_UNLOCK(); lnet_eq2handle(handle, eq); return (0); @@ -88,21 +87,20 @@ LNetEQFree(lnet_handle_eq_t eqh) lnet_eq_t *eq; int size; lnet_event_t *events; - unsigned long flags; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - LNET_LOCK(flags); + LNET_LOCK(); eq = lnet_handle2eq(&eqh); if (eq == NULL) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (-ENOENT); } if (eq->eq_refcount != 0) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (-EBUSY); } @@ -114,7 +112,7 @@ LNetEQFree(lnet_handle_eq_t eqh) list_del (&eq->eq_list); lnet_eq_free (eq); - LNET_UNLOCK(flags); + LNET_UNLOCK(); PORTAL_FREE(events, size * sizeof (lnet_event_t)); @@ -175,7 +173,6 @@ int LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, lnet_event_t *event, int *which) { - unsigned long flags; int i; int rc; #ifdef __KERNEL__ 
@@ -194,7 +191,7 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, if (neq < 1) RETURN(-ENOENT); - LNET_LOCK(flags); + LNET_LOCK(); for (;;) { for (i = 0; i < neq; i++) { @@ -202,14 +199,14 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, rc = lib_get_event (eq, event); if (rc != 0) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); *which = i; RETURN(rc); } } if (timeout_ms == 0) { - LNET_UNLOCK (flags); + LNET_UNLOCK (); RETURN (0); } @@ -221,7 +218,7 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, set_current_state(TASK_INTERRUPTIBLE); cfs_waitq_add(&the_lnet.ln_waitq, &wl); - LNET_UNLOCK(flags); + LNET_UNLOCK(); if (timeout_ms < 0) { cfs_waitq_wait (&wl); @@ -236,7 +233,7 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, timeout_ms = 0; } - LNET_LOCK(flags); + LNET_LOCK(); cfs_waitq_del(&the_lnet.ln_waitq, &wl); #else if (timeout_ms < 0) { diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index abfd9a0..45bb122 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -199,7 +199,6 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, { lnet_me_t *me; lnet_libmd_t *md; - unsigned long flags; int rc; LASSERT (the_lnet.ln_init); @@ -213,7 +212,7 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, if (md == NULL) return -ENOMEM; - LNET_LOCK(flags); + LNET_LOCK(); me = lnet_handle2me(&meh); if (me == NULL) { @@ -228,14 +227,14 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, lnet_md2handle(handle, md); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (0); } } lnet_md_free (md); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (rc); } @@ -243,7 +242,6 @@ int LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) { lnet_libmd_t *md; - unsigned long flags; int rc; LASSERT (the_lnet.ln_init); @@ -257,20 +255,20 @@ LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) if (md == NULL) return -ENOMEM; - LNET_LOCK(flags); + LNET_LOCK(); rc = lib_md_build(md, &umd, 
unlink); if (rc == 0) { lnet_md2handle(handle, md); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (0); } lnet_md_free (md); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (rc); } @@ -279,16 +277,15 @@ LNetMDUnlink (lnet_handle_md_t mdh) { lnet_event_t ev; lnet_libmd_t *md; - unsigned long flags; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - LNET_LOCK(flags); + LNET_LOCK(); md = lnet_handle2md(&mdh); if (md == NULL) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); return -ENOENT; } @@ -306,12 +303,12 @@ LNetMDUnlink (lnet_handle_md_t mdh) lnet_md_deconstruct(md, &ev.md); lnet_md2handle(&ev.md_handle, md); - lnet_enq_event_locked(NULL, md->md_eq, &ev); + lnet_enq_event_locked(md->md_eq, &ev); } lnet_md_unlink(md); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return 0; } diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c index 3d49280..f39b31a 100644 --- a/lnet/lnet/lib-me.c +++ b/lnet/lnet/lib-me.c @@ -34,7 +34,6 @@ LNetMEAttach(unsigned int portal, lnet_handle_me_t *handle) { lnet_me_t *me; - unsigned long flags; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); @@ -46,7 +45,7 @@ LNetMEAttach(unsigned int portal, if (me == NULL) return -ENOMEM; - LNET_LOCK(flags); + LNET_LOCK(); me->me_match_id = match_id; me->me_match_bits = match_bits; @@ -63,7 +62,7 @@ LNetMEAttach(unsigned int portal, lnet_me2handle(handle, me); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return 0; } @@ -77,7 +76,6 @@ LNetMEInsert(lnet_handle_me_t current_meh, { lnet_me_t *current_me; lnet_me_t *new_me; - unsigned long flags; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); @@ -86,13 +84,13 @@ LNetMEInsert(lnet_handle_me_t current_meh, if (new_me == NULL) return -ENOMEM; - LNET_LOCK(flags); + LNET_LOCK(); current_me = lnet_handle2me(¤t_meh); if (current_me == NULL) { lnet_me_free (new_me); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return -ENOENT; } @@ -111,7 +109,7 @@ LNetMEInsert(lnet_handle_me_t current_meh, lnet_me2handle(handle, new_me); - LNET_UNLOCK(flags); + 
LNET_UNLOCK(); return 0; } @@ -119,14 +117,13 @@ LNetMEInsert(lnet_handle_me_t current_meh, int LNetMEUnlink(lnet_handle_me_t meh) { - unsigned long flags; lnet_me_t *me; int rc; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - LNET_LOCK(flags); + LNET_LOCK(); me = lnet_handle2me(&meh); if (me == NULL) { @@ -136,7 +133,7 @@ LNetMEUnlink(lnet_handle_me_t meh) rc = 0; } - LNET_UNLOCK(flags); + LNET_UNLOCK(); return (rc); } diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index c5d3768..9c255df 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -26,16 +26,6 @@ #include -#if 1 -/* Enforce the rule that the target NID must be that of the receiving NI */ -const int allow_destination_aliases = 0; -#else -/* Allow NID aliasing experiments */ -static int allow_destination_aliases = 0; -CFS_MODULE_PARM(allow_destination_aliases, "i", int, 0644, - "Boolean: don't require strict destination NIDs"); -#endif - static int implicit_loopback = 1; CFS_MODULE_PARM(implicit_loopback, "i", int, 0644, "Boolean: substitute 0@lo when sending to any local NID"); @@ -131,7 +121,7 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, lnet_commit_md(md, msg); md->md_offset = offset + mlength; - /* NB Caller sets ev.type and ev.hdr_data */ + /* NB Caller will set ev.type and ev.hdr_data */ msg->msg_ev.initiator = src; msg->msg_ev.pt_index = index; msg->msg_ev.match_bits = match_bits; @@ -167,7 +157,6 @@ int lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) { lnet_test_peer_t *tp; - unsigned long flags; struct list_head *el; struct list_head *next; struct list_head cull; @@ -183,16 +172,16 @@ lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) tp->tp_nid = nid; tp->tp_threshold = threshold; - LNET_LOCK(flags); + LNET_LOCK(); list_add_tail (&tp->tp_list, &the_lnet.ln_test_peers); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return 0; } /* removing entries */ CFS_INIT_LIST_HEAD (&cull); - LNET_LOCK(flags); + LNET_LOCK(); list_for_each_safe (el, 
next, &the_lnet.ln_test_peers) { tp = list_entry (el, lnet_test_peer_t, tp_list); @@ -206,7 +195,7 @@ lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) } } - LNET_UNLOCK(flags); + LNET_UNLOCK(); while (!list_empty (&cull)) { tp = list_entry (cull.next, lnet_test_peer_t, tp_list); @@ -223,13 +212,12 @@ fail_peer (lnet_nid_t nid, int outgoing) lnet_test_peer_t *tp; struct list_head *el; struct list_head *next; - unsigned long flags; struct list_head cull; int fail = 0; CFS_INIT_LIST_HEAD (&cull); - LNET_LOCK(flags); + LNET_LOCK(); list_for_each_safe (el, next, &the_lnet.ln_test_peers) { tp = list_entry (el, lnet_test_peer_t, tp_list); @@ -263,7 +251,7 @@ fail_peer (lnet_nid_t nid, int outgoing) } } - LNET_UNLOCK (flags); + LNET_UNLOCK (); while (!list_empty (&cull)) { tp = list_entry (cull.next, lnet_test_peer_t, tp_list); @@ -704,112 +692,627 @@ lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, #endif void -lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, - unsigned int offset, unsigned int mlen, unsigned int rlen) +lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen) { unsigned int niov = 0; struct iovec *iov = NULL; lnet_kiov_t *kiov = NULL; int rc; - if (mlen != 0) { - lnet_libmd_t *md = msg->msg_md; + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(!msg->msg_recvaftersend); + LASSERT(msg->msg_receiving); + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; - niov = md->md_niov; - if ((md->md_options & LNET_MD_KIOV) != 0) - kiov = md->md_iov.kiov; - else - iov = md->md_iov.iov; + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } } rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, niov, iov, kiov, offset, mlen, rlen); if (rc != 0) - lnet_finalize(ni, private, msg, rc); + lnet_finalize(ni, msg, rc); } int -lnet_send(lnet_ni_t *ni, void *private, 
lnet_msg_t *msg, - int type, lnet_process_id_t target, - lnet_libmd_t *md, unsigned int offset, unsigned int len) +lnet_compare_routers(lnet_peer_t *p1, lnet_peer_t *p2) { - unsigned long flags; - lnet_nid_t gw_nid; - lnet_nid_t src_nid; - int rc; - - /* CAVEAT EMPTOR! ni != NULL == interface pre-determined (ACK) */ + /* Go for the one with more available credits. + * Otherwise go for the minimum queue depth */ + if (p1->lp_txcredits > 0) { + + if (p1->lp_txcredits > p2->lp_txcredits) + return 1; + + if (p1->lp_txcredits < p2->lp_txcredits) + return 0; - gw_nid = lnet_lookup (&ni, target.nid, sizeof(lnet_hdr_t) + len); - if (gw_nid == LNET_NID_ANY) { - CERROR("No route to %s\n", libcfs_id2str(target)); - LCONSOLE_ERROR("Cannot send to %s: %s is not a local network " - "and I can't route to it. Is lustre configured " - "correctly?\n", libcfs_nid2str(target.nid), - libcfs_net2str(PTL_NIDNET(target.nid))); - - return -EIO; + } else if (p2->lp_txcredits > 0) { + return 0; } + + return (p1->lp_txqnob > p2->lp_txqnob); +} - /* set the completion event's initiator.nid now we know it */ - if (type == LNET_MSG_PUT || type == LNET_MSG_GET) - msg->msg_ev.initiator.nid = ni->ni_nid; - src_nid = lnet_ptlcompat_srcnid(ni->ni_nid, target.nid); +void +lnet_setpayloadbuffer(lnet_msg_t *msg) +{ + lnet_libmd_t *md = msg->msg_md; + + LASSERT (msg->msg_len > 0); + LASSERT (!msg->msg_routing); + LASSERT (md != NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} - msg->msg_type = type; - msg->msg_target = target; - msg->msg_target_is_router = 0; - msg->msg_routing = 0; - +void +lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = 
offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); msg->msg_hdr.type = cpu_to_le32(type); msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); - msg->msg_hdr.src_pid = cpu_to_le64(the_lnet.ln_pid); + /* src not set yet */ msg->msg_hdr.payload_length = cpu_to_le32(len); +} - if (PTL_NETTYP(PTL_NIDNET(ni->ni_nid)) != LOLND) { - if (!lnet_ptlcompat_matchnid(ni->ni_nid, gw_nid)) { - /* it's not for me: will the gateway have to forward? */ - if (gw_nid != target.nid && - the_lnet.ln_ptlcompat == 0) { - msg->msg_target_is_router = 1; - msg->msg_target.pid = LUSTRE_SRV_PTL_PID; - msg->msg_target.nid = gw_nid; - } - } else if (implicit_loopback) { /* its for me: force lonal? */ - LNET_LOCK(flags); - lnet_ni_decref_locked(ni); - ni = lnet_loni; - if (ni != NULL) - lnet_ni_addref_locked(ni); - LNET_UNLOCK(flags); +void +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *priv = msg->msg_private; + int recv = msg->msg_recvaftersend; + int delayed = msg->msg_delayed; + int rc; + + /* On GET, call lnet_ni_recv() right after the send. The recv gets + * delayed until after the send to ensure the LND still has any RDMA + * descriptors associated with the incoming GET when lnd_send() calls + * in with the REPLY. Note that if we actually had to pass 'msg' in to + * lnet_ni_recv() here, we'd be forking it (i.e. 
it would have 2 separate + * existances and we'd have to refcount it) */ + + LASSERT (!recv == !msg->msg_receiving); + msg->msg_recvaftersend = 0; + msg->msg_receiving = 0; + + LASSERT (PTL_NETTYP(PTL_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + if (rc != 0) + lnet_finalize(ni, msg, rc); + + if (recv) + lnet_ni_recv(ni, priv, NULL, delayed, 0, 0, 0); +} + +int +lnet_eager_recv_locked(lnet_msg_t *msg) +{ + lnet_peer_t *peer; + lnet_ni_t *ni; + int rc = 0; + + LASSERT (!msg->msg_delayed); + msg->msg_delayed = 1; + + /* I might have to do an eager receive since I'm blocking */ + if (!msg->msg_receiving) + return 0; + + if (msg->msg_routing) { + peer = msg->msg_rxpeer; + LASSERT (!msg->msg_sending); + } else { + peer = msg->msg_txpeer; + LASSERT (msg->msg_recvaftersend); + LASSERT (msg->msg_type == LNET_MSG_REPLY); + } + + ni = peer->lp_ni; + if (ni->ni_lnd->lnd_eager_recv != NULL) { + LNET_UNLOCK(); - if (ni == NULL) /* shutdown in progress */ - return -ENETDOWN; + rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " + "eager_recv failed %d\n", + libcfs_nid2str(peer->lp_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT (rc < 0); /* required by my callers */ } + + LNET_LOCK(); } - msg->msg_len = len; - msg->msg_offset = offset; - msg->msg_niov = 0; - msg->msg_iov = NULL; - msg->msg_kiov = NULL; + return rc; +} + +int +lnet_post_send_locked (lnet_msg_t *msg, int do_send) +{ + /* lnet_send is going to LNET_UNLOCK immediately after this, so it sets + * do_send FALSE and I don't do the unlock/send/lock bit. 
I return + * EAGAIN if msg blocked and 0 if sent or OK to send */ + lnet_peer_t *lp = msg->msg_txpeer; + lnet_ni_t *ni = lp->lp_ni; + + if (!msg->msg_peertxcredit) { + LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq)); + + msg->msg_peertxcredit = 1; + lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lp_txcredits--; + if (lp->lp_txcredits < lp->lp_mintxcredits) + lp->lp_mintxcredits = lp->lp_txcredits; + + if (lp->lp_txcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT (msg->msg_delayed); + list_add_tail (&msg->msg_list, &lp->lp_txq); + return EAGAIN; + } + } - if (len > 0) { - msg->msg_niov = md->md_niov; + if (!msg->msg_txcredit) { + LASSERT ((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); - if (((md->md_options) & LNET_MD_KIOV) != 0) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; + msg->msg_txcredit = 1; + ni->ni_txcredits--; + + if (ni->ni_txcredits < ni->ni_mintxcredits) + ni->ni_mintxcredits = ni->ni_txcredits; + + if (ni->ni_txcredits < 0) { + /* must have checkd eager_recv before here */ + LASSERT (msg->msg_delayed); + list_add_tail (&msg->msg_list, &ni->ni_txq); + return EAGAIN; + } } + + if (do_send) { + LNET_UNLOCK(); + /* non-lnet_send() callers always send delayed */ + LASSERT (msg->msg_delayed); + lnet_ni_send(ni, msg); + LNET_LOCK(); + } + return 0; +} + +#ifdef __KERNEL__ +static void +lnet_commit_routedmsg (lnet_msg_t *msg) +{ + /* ALWAYS called holding the LNET_LOCK */ + LASSERT (msg->msg_routing); - rc = (ni->ni_lnd->lnd_send)(ni, private, msg); + the_lnet.ln_counters.msgs_alloc++; + if (the_lnet.ln_counters.msgs_alloc > + the_lnet.ln_counters.msgs_max) + the_lnet.ln_counters.msgs_max = + the_lnet.ln_counters.msgs_alloc; - lnet_ni_decref(ni); /* lose ref from lnet_lookup */ - return rc; + the_lnet.ln_counters.route_count++; + the_lnet.ln_counters.route_length += msg->msg_len; + + LASSERT (!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add 
(&msg->msg_activelist, &the_lnet.ln_active_msgs); +} + +lnet_rtrbufpool_t * +lnet_msg2bufpool(lnet_msg_t *msg) +{ + lnet_rtrbufpool_t *rbp = &the_lnet.ln_rtrpools[0]; + + LASSERT (msg->msg_len <= PTL_MTU); + while (msg->msg_len > rbp->rbp_npages * PAGE_SIZE) { + rbp++; + LASSERT (rbp < &the_lnet.ln_rtrpools[LNET_NRBPOOLS]); + } + + return rbp; +} + +int +lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) +{ + /* lnet_route is going to LNET_UNLOCK immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. I + * return EAGAIN if msg blocked and 0 if sent or OK to send */ + lnet_peer_t *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; + + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + + if (!msg->msg_peerrtrcredit) { + LASSERT ((lp->lp_rtrcredits < 0) == !list_empty(&lp->lp_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lp_rtrcredits--; + if (lp->lp_rtrcredits < lp->lp_minrtrcredits) + lp->lp_minrtrcredits = lp->lp_rtrcredits; + + if (lp->lp_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT (msg->msg_delayed); + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + return EAGAIN; + } + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + LASSERT ((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs)); + + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT (msg->msg_delayed); + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return EAGAIN; + } + } + + LASSERT (!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + LNET_UNLOCK(); + /* 
non-lnet_route() callers always send delayed */ + LASSERT (msg->msg_delayed); + lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + LNET_LOCK(); + } + return 0; +} +#endif + +void +lnet_return_credits_locked (lnet_msg_t *msg) +{ + lnet_peer_t *txpeer = msg->msg_txpeer; + lnet_peer_t *rxpeer = msg->msg_rxpeer; + lnet_msg_t *msg2; + lnet_ni_t *ni; + + if (msg->msg_txcredit) { + /* give back NI txcredits */ + msg->msg_txcredit = 0; + ni = txpeer->lp_ni; + + LASSERT((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); + + ni->ni_txcredits++; + if (ni->ni_txcredits <= 0) { + msg2 = list_entry(ni->ni_txq.next, lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + LASSERT((txpeer->lp_txcredits < 0) == !list_empty(&txpeer->lp_txq)); + + txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT (txpeer->lp_txqnob >= 0); + + txpeer->lp_txcredits++; + if (txpeer->lp_txcredits <= 0) { + msg2 = list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT (msg2->msg_txpeer == txpeer); + LASSERT (msg2->msg_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_decref_locked(txpeer); + } + +#ifdef __KERNEL__ + if (msg->msg_rtrcredit) { + /* give back global router credits */ + lnet_rtrbuf_t *rb; + lnet_rtrbufpool_t *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT (msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); + rbp = rb->rb_pool; + LASSERT (rbp == lnet_msg2bufpool(msg)); + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT((rbp->rbp_credits < 0) == 
!list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == !list_empty(&rbp->rbp_bufs)); + + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) { + msg2 = list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT (msg2->msg_delayed); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (msg->msg_peerrtrcredit) { + /* give pack peer router credits */ + msg->msg_peerrtrcredit = 0; + + LASSERT((rxpeer->lp_rtrcredits < 0) == !list_empty(&rxpeer->lp_rtrq)); + + rxpeer->lp_rtrcredits++; + if (rxpeer->lp_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT (msg2->msg_delayed); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_decref_locked(rxpeer); + } +#else + LASSERT (!msg->msg_rtrcredit); + LASSERT (!msg->msg_peerrtrcredit); + LASSERT (rxpeer == NULL); +#endif +} + +int +lnet_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + lnet_ni_t *src_ni = NULL; + lnet_remotenet_t *rnet = NULL; + lnet_route_t *route; + struct list_head *tmp; + lnet_ni_t *tmp_ni; + lnet_nid_t src_nid; + lnet_peer_t *lp; + lnet_peer_t *lp2; + int rc; + + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + + msg->msg_sending = 1; + + /* NB! ni != NULL == interface pre-determined (ACK/REPLY) */ + + LNET_LOCK(); + + if (the_lnet.ln_shutdown) { + LNET_UNLOCK(); + return -ESHUTDOWN; + } + + if (msg->msg_routing) { + LASSERT (ni == NULL); + + /* msg->msg_hdr.src_nid/pid are already set up and I already + * know this message isn't for me; is it for someone local? 
*/ + list_for_each (tmp, &the_lnet.ln_nis) { + tmp_ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (PTL_NIDNET(tmp_ni->ni_nid) == + PTL_NIDNET(dst_nid)) { + src_ni = tmp_ni; + break; + } + } + } else { + /* msg->msg_hdr.src_nid/pid get set depending on where I send + * from. If (ni != NULL) this is a response to a message that + * came in on that NI, and I should be replying on it. */ + + msg->msg_hdr.src_pid = the_lnet.ln_pid; + + if (PTL_NETTYP(PTL_NIDNET(dst_nid)) == LOLND) { + src_ni = lnet_loni; + msg->msg_hdr.src_nid = cpu_to_le64(lnet_loni->ni_nid); + } else { + list_for_each (tmp, &the_lnet.ln_nis) { + tmp_ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (PTL_NIDNET(tmp_ni->ni_nid) != + PTL_NIDNET(dst_nid)) + continue; + + /* dst is on a local net */ + src_ni = tmp_ni; + src_nid = lnet_ptlcompat_srcnid(src_ni->ni_nid, + dst_nid); + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + + /* send via lo0? */ + if (implicit_loopback && + lnet_ptlcompat_matchnid(src_ni->ni_nid, dst_nid)) + src_ni = lnet_loni; + break; + } + } + + if (src_ni == lnet_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(src_ni); + LNET_UNLOCK(); + + if (ni != NULL && ni != src_ni) { + /* a different LND expects a response */ + rc = -EPROTO; + } else { + rc = 0; + lnet_ni_send(src_ni, msg); + } + lnet_ni_decref(src_ni); + return rc; + } + } + + if (src_ni != NULL) { + /* sending to a local network */ + rc = lnet_nid2peer_locked(&lp, dst_nid); + if (rc != 0) { + LNET_UNLOCK(); + CERROR("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); + /* ENOMEM or shutting down */ + return rc; + } + } else { + /* sending to a remote network */ + rnet = lnet_find_net_locked(PTL_NIDNET(dst_nid)); + if (rnet == NULL) { + LNET_UNLOCK(); + CERROR("No route to %s\n", libcfs_id2str(msg->msg_target)); + return -EHOSTUNREACH; + } + src_ni = rnet->lrn_ni; + + if (!msg->msg_routing) { + /* I'm the source; now I know which NI */ + src_nid = lnet_ptlcompat_srcnid(src_ni->ni_nid, 
dst_nid); + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } + + /* Find the best gateway I can use */ + lp = NULL; + list_for_each(tmp, &rnet->lrn_routes) { + route = list_entry(tmp, lnet_route_t, lr_list); + lp2 = route->lr_gateway; + + LASSERT (lp2->lp_ni == src_ni); + + if (!lp2->lp_alive) + continue; + + if (lp == NULL || + lnet_compare_routers(lp2, lp)) + lp = lp2; + } + + if (lp == NULL) { + LNET_UNLOCK(); + CERROR("No route to %s (all routers down)\n", + libcfs_id2str(msg->msg_target)); + return -EHOSTUNREACH; + } + + lnet_peer_addref_locked(lp); + + msg->msg_target_is_router = 1; + msg->msg_target.nid = lp->lp_nid; + msg->msg_target.pid = LUSTRE_SRV_PTL_PID; + } + + /* 'lp' is our best choice of peer */ + LASSERT (lp->lp_ni == src_ni); + + if (ni != NULL && ni != src_ni) { + /* A different LND expects a response */ + lnet_peer_decref_locked(lp); + LNET_UNLOCK(); + CERROR("NI mismatch: dest %s expected %s\n", + libcfs_nid2str(dst_nid), + libcfs_net2str(PTL_NIDNET(ni->ni_nid))); + return -EPROTO; + } + + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_peertxcredit); + LASSERT (!msg->msg_txcredit); + + msg->msg_txpeer = lp; + + if (!msg->msg_delayed && + (lp->lp_txcredits <= 0 || src_ni->ni_txcredits <= 0)) { + rc = lnet_eager_recv_locked(msg); + if (rc != 0) { + LNET_UNLOCK(); + return rc; + } + } + + rc = lnet_post_send_locked(msg, 0); + LNET_UNLOCK(); + + if (rc == 0) + lnet_ni_send(src_ni, msg); + + return 0; } static void @@ -819,6 +1322,8 @@ lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg) /* Here, we commit the MD to a network OP by marking it busy and * decrementing its threshold. Come what may, the network "owns" * the MD until a call to lnet_finalize() signals completion. 
*/ + LASSERT (!msg->msg_routing); + msg->msg_md = md; md->md_pending++; @@ -833,76 +1338,81 @@ lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg) the_lnet.ln_counters.msgs_max = the_lnet.ln_counters.msgs_alloc; + LASSERT (!msg->msg_onactivelist); + msg->msg_onactivelist = 1; list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); } void lnet_drop_message (lnet_ni_t *ni, void *private, unsigned int nob) { - unsigned long flags; - - LNET_LOCK(flags); + LNET_LOCK(); the_lnet.ln_counters.drop_count++; the_lnet.ln_counters.drop_length += nob; - LNET_UNLOCK(flags); + LNET_UNLOCK(); - lnet_recv(ni, private, NULL, 0, 0, 0, nob); + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); } static int -lnet_parse_put(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) +lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) { + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; unsigned int rlength = hdr->payload_length; unsigned int mlength = 0; unsigned int offset = 0; lnet_process_id_t src = {.nid = hdr->src_nid, .pid = hdr->src_pid}; lnet_libmd_t *md; - unsigned long flags; /* Convert put fields to host byte order */ hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); - LNET_LOCK(flags); + LNET_LOCK(); md = lnet_match_md(hdr->msg.put.ptl_index, LNET_MD_OP_PUT, src, rlength, hdr->msg.put.offset, hdr->msg.put.match_bits, msg, &mlength, &offset); if (md == NULL) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); return ENOENT; /* +ve: OK but no match */ } - msg->msg_ev.type = LNET_EVENT_PUT; - msg->msg_ev.hdr_data = hdr->msg.put.hdr_data; - - if (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && - !(md->md_options & LNET_MD_ACK_DISABLE)) { - msg->msg_ack_wmd = hdr->msg.put.ack_wmd; - } - the_lnet.ln_counters.recv_count++; the_lnet.ln_counters.recv_length += mlength; - LNET_UNLOCK(flags); + LNET_UNLOCK(); + + if (mlength != 0) + 
lnet_setpayloadbuffer(msg); + + msg->msg_ev.type = LNET_EVENT_PUT; + msg->msg_ev.hdr_data = hdr->msg.put.hdr_data; - lnet_recv(ni, private, msg, 0, offset, mlength, rlength); + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK when during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, private, msg, 0, offset, mlength, rlength); return 0; } static int -lnet_parse_get(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) +lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg) { - unsigned int mlength = 0; - unsigned int offset = 0; - lnet_process_id_t src = {.nid = hdr->src_nid, - .pid = hdr->src_pid}; - lnet_libmd_t *md; - unsigned long flags; - int rc; + lnet_hdr_t *hdr = &msg->msg_hdr; + unsigned int mlength = 0; + unsigned int offset = 0; + lnet_process_id_t src = {.nid = hdr->src_nid, + .pid = hdr->src_pid}; + lnet_handle_wire_t reply_wmd; + lnet_libmd_t *md; + int rc; /* Convert get fields to host byte order */ hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); @@ -910,55 +1420,67 @@ lnet_parse_get(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); - LNET_LOCK(flags); + LNET_LOCK(); md = lnet_match_md(hdr->msg.get.ptl_index, LNET_MD_OP_GET, src, hdr->msg.get.sink_length, hdr->msg.get.src_offset, hdr->msg.get.match_bits, msg, &mlength, &offset); if (md == NULL) { - LNET_UNLOCK(flags); + LNET_UNLOCK(); return ENOENT; /* +ve: OK but no match */ } - msg->msg_ev.type = LNET_EVENT_GET; - msg->msg_ev.hdr_data = 0; - the_lnet.ln_counters.send_count++; the_lnet.ln_counters.send_length += mlength; - LNET_UNLOCK(flags); + LNET_UNLOCK(); - memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); - msg->msg_hdr.msg.reply.dst_wmd = hdr->msg.get.return_wmd; + reply_wmd = 
hdr->msg.get.return_wmd; - /* NB call lnet_send() _BEFORE_ lnet_recv() completes the incoming - * message. Some NALs _require_ this to implement optimized GET */ + lnet_prep_send(msg, LNET_MSG_REPLY, src, offset, mlength); - rc = lnet_send(ni, private, msg, LNET_MSG_REPLY, src, - md, offset, mlength); + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + msg->msg_ev.type = LNET_EVENT_GET; + msg->msg_ev.hdr_data = 0; + + /* set msg_recvaftersend so the incoming message is consumed (by + * calling lnet_ni_recv()) in lnet_ni_send() AFTER lnd_send() has been + * called. This ensures that the LND can rely on the recv happening + * after the send so any RDMA descriptors it has stashed are still + * valid. */ + msg->msg_recvaftersend = 1; + + rc = lnet_send(ni, msg); if (rc != 0) { - /* LND won't lnet_finalize()... */ + /* didn't get as far as lnet_ni_send() */ CERROR("%s: Unable to send REPLY for GET from %s: %d\n", libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), rc); - lnet_finalize(ni, private, msg, rc); + + /* consume to release LND resources */ + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + + msg->msg_recvaftersend = 0; + msg->msg_receiving = 0; + lnet_finalize(ni, msg, rc); } - lnet_recv(ni, private, NULL, 0, 0, 0, 0); return 0; } static int -lnet_parse_reply(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) +lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) { + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; lnet_process_id_t src = {.nid = hdr->src_nid, .pid = hdr->src_pid}; lnet_libmd_t *md; int rlength; int mlength; - unsigned long flags; - LNET_LOCK(flags); + LNET_LOCK(); /* NB handles only looked up by creator (no flips) */ md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); @@ -970,7 +1492,7 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return ENOENT; /* +ve: 
OK but no match */ } @@ -986,7 +1508,7 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, mlength); - LNET_UNLOCK(flags); + LNET_UNLOCK(); return ENOENT; /* +ve: OK but no match */ } @@ -996,6 +1518,9 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) lnet_commit_md(md, msg); + if (mlength != 0) + lnet_setpayloadbuffer(msg); + msg->msg_ev.type = LNET_EVENT_REPLY; msg->msg_ev.initiator = src; msg->msg_ev.rlength = rlength; @@ -1008,25 +1533,25 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) the_lnet.ln_counters.recv_count++; the_lnet.ln_counters.recv_length += mlength; - LNET_UNLOCK(flags); + LNET_UNLOCK(); - lnet_recv(ni, private, msg, 0, 0, mlength, rlength); + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); return 0; } static int -lnet_parse_ack(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) +lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) { + lnet_hdr_t *hdr = &msg->msg_hdr; lnet_process_id_t src = {.nid = hdr->src_nid, .pid = hdr->src_pid}; lnet_libmd_t *md; - unsigned long flags; /* Convert ack fields to host byte order */ hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); - LNET_LOCK(flags); + LNET_LOCK(); /* NB handles only looked up by creator (no flips) */ md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); @@ -1039,7 +1564,7 @@ lnet_parse_ack(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); #endif - LNET_UNLOCK(flags); + LNET_UNLOCK(); return ENOENT; /* +ve! 
*/ } @@ -1059,18 +1584,16 @@ lnet_parse_ack(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private, lnet_msg_t *msg) the_lnet.ln_counters.recv_count++; - LNET_UNLOCK(flags); - - lnet_finalize(ni, private, msg, 0); + LNET_UNLOCK(); - lnet_recv(ni, private, NULL, 0, 0, 0, 0); + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, 0); return 0; } -static char * -hdr_type_string (lnet_hdr_t *hdr) +char * +lnet_msgtyp2str (int type) { - switch (hdr->type) { + switch (type) { case LNET_MSG_ACK: return ("ACK"); case LNET_MSG_PUT: @@ -1093,7 +1616,7 @@ lnet_print_hdr(lnet_hdr_t * hdr) .pid = hdr->src_pid}; lnet_process_id_t dst = {.nid = hdr->dest_nid, .pid = hdr->dest_pid}; - char *type_str = hdr_type_string (hdr); + char *type_str = lnet_msgtyp2str (hdr->type); CWARN("P3 Header at %p of type %s\n", hdr, type_str); CWARN(" From %s\n", libcfs_id2str(src)); @@ -1146,140 +1669,212 @@ lnet_print_hdr(lnet_hdr_t * hdr) int -lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, void *private) +lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, void *private) { - unsigned long flags; int rc = 0; int for_me; lnet_msg_t *msg; lnet_nid_t dest_nid; - __u32 type = le32_to_cpu(hdr->type); + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; - /* NB we return 0 if we manage to parse the header and believe - * it looks OK. Anything that goes wrong with receiving the - * message after that point is the responsibility of the LND. 
- * If we don't think the packet is for us, return 1 */ + LASSERT (!in_interrupt()); + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); dest_nid = le64_to_cpu(hdr->dest_nid); + payload_length = le32_to_cpu(hdr->payload_length); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > PTL_MTU) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, PTL_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + /* Regard a bad destination NID as a protocol error. 
Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ for_me = (PTL_NETTYP(PTL_NIDNET(ni->ni_nid)) == LOLND || lnet_ptlcompat_matchnid(ni->ni_nid, dest_nid)); if (!for_me) { if (the_lnet.ln_ptlcompat > 0) { - CERROR ("%s: Dropping message from %s: wrong nid %s\n", - libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(le64_to_cpu(hdr->src_nid)), + /* portals compatibility is single-network */ + CERROR ("%s, %s: Bad dest nid %s " + "(routing not supported)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid)); return -EPROTO; } - if (!lnet_islocalnid(dest_nid)) /* tell LND to use the router */ - return 1; /* to forward */ + if (the_lnet.ln_ptlcompat == 0 && + PTL_NIDNET(dest_nid) == PTL_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR ("%s, %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } - /* dest_nid is one of my NIs */ - - if (!allow_destination_aliases) { + if (the_lnet.ln_ptlcompat == 0 && + lnet_islocalnid(dest_nid)) { /* dest is another local NI; sender should have used * this node's NID on its own network */ - CERROR ("%s: Dropping message from %s: nid %s " - "is a local alias\n", - libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(le64_to_cpu(hdr->src_nid)), + CERROR ("%s, %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid)); return -EPROTO; } - } - - /* convert common fields to host byte order */ - hdr->type = type; - hdr->src_nid = le64_to_cpu(hdr->src_nid); - hdr->src_pid = le32_to_cpu(hdr->src_pid); - hdr->dest_nid = dest_nid; - hdr->dest_pid = le32_to_cpu(hdr->dest_pid); - hdr->payload_length = le32_to_cpu(hdr->payload_length); - switch (type) { - case LNET_MSG_ACK: - case LNET_MSG_GET: - if (hdr->payload_length > 0) { - 
CERROR("%s: Bad %s from %s: " - "payload size %d sent (0 expected)\n", - libcfs_nid2str(ni->ni_nid), - hdr_type_string(hdr), - libcfs_nid2str(hdr->src_nid), - hdr->payload_length); - return -EPROTO; - } - break; - - case LNET_MSG_PUT: - case LNET_MSG_REPLY: - if (hdr->payload_length > PTL_MTU) { - CERROR("%s: Bad %s from %s: " - "payload size %d sent (%d max expected)\n", - libcfs_nid2str(ni->ni_nid), - hdr_type_string(hdr), - libcfs_nid2str(hdr->src_nid), - hdr->payload_length, PTL_MTU); - return -EPROTO; + if (!the_lnet.ln_routing) { + CERROR ("%s, %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; } - break; - - default: - CERROR("%s: Bad message type 0x%x from %s\n", - libcfs_nid2str(ni->ni_nid), hdr->type, - libcfs_nid2str(hdr->src_nid)); - return -EPROTO; } + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ - fail_peer (hdr->src_nid, 0)) /* shall we now? */ + fail_peer (src_nid, 0)) /* shall we now? */ { - CERROR("%s: Dropping incoming %s from %s: simulated failure\n", - libcfs_nid2str(ni->ni_nid), hdr_type_string(hdr), - libcfs_nid2str(hdr->src_nid)); + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); goto drop; } msg = lnet_msg_alloc(); if (msg == NULL) { - CERROR("%s: Dropping incoming %s from %s: " - "can't allocate a lnet_msg_t\n", - libcfs_nid2str(ni->ni_nid), hdr_type_string(hdr), - libcfs_nid2str(hdr->src_nid)); + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid) + , lnet_msgtyp2str(type)); goto drop; } - switch (hdr->type) { + /* msg zeroed in lnet_msg_alloc; i.e. 
flags all clear, pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_len = payload_length; + msg->msg_hdr = *hdr; + +#ifndef __KERNEL__ + LASSERT (for_me); +#else + if (!for_me) { + msg->msg_target.pid = le32_to_cpu(hdr->dest_pid); + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + msg->msg_offset = 0; + + LNET_LOCK(); + rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid); + if (rc != 0) { + LNET_UNLOCK(); + CERROR("%s, src %s: Dropping %s " + "(error %d looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), rc); + goto free_drop; + } + + if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + rc = lnet_eager_recv_locked(msg); + if (rc != 0) { + LNET_UNLOCK(); + goto free_drop; + } + } + + lnet_commit_routedmsg(msg); + rc = lnet_post_routed_recv_locked(msg, 0); + LNET_UNLOCK(); + + if (rc == 0) + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + return 0; + } +#endif + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid); + msg->msg_hdr.payload_length = payload_length; + + switch (type) { case LNET_MSG_ACK: - rc = lnet_parse_ack(ni, hdr, private, msg); + rc = lnet_parse_ack(ni, msg); break; case LNET_MSG_PUT: - rc = lnet_parse_put(ni, hdr, private, msg); + rc = lnet_parse_put(ni, msg); break; case LNET_MSG_GET: - rc = lnet_parse_get(ni, hdr, private, msg); + rc = lnet_parse_get(ni, msg); break; case LNET_MSG_REPLY: - rc = lnet_parse_reply(ni, hdr, private, msg); + rc = lnet_parse_reply(ni, msg); break; default: LASSERT(0); - break; + goto free_drop; /* prevent an unused label if !kernel */ } if (rc == 0) return 0; LASSERT (rc == ENOENT); + + free_drop: LASSERT 
(msg->msg_md == NULL); - - LNET_LOCK(flags); + LNET_LOCK(); lnet_msg_free(msg); /* expects LNET_LOCK held */ - LNET_UNLOCK(flags); + LNET_UNLOCK(); + drop: - lnet_drop_message(ni, private, hdr->payload_length); + lnet_drop_message(ni, private, payload_length); return ENOENT; } @@ -1291,7 +1886,6 @@ LNetPut(lnet_handle_md_t mdh, lnet_ack_req_t ack, { lnet_msg_t *msg; lnet_libmd_t *md; - unsigned long flags; int rc; LASSERT (the_lnet.ln_init); @@ -1312,12 +1906,12 @@ LNetPut(lnet_handle_md_t mdh, lnet_ack_req_t ack, return -ENOMEM; } - LNET_LOCK(flags); + LNET_LOCK(); md = lnet_handle2md(&mdh); if (md == NULL || md->md_threshold == 0) { lnet_msg_free(msg); - LNET_UNLOCK(flags); + LNET_UNLOCK(); CERROR("Dropping PUT to %s: MD invalid\n", libcfs_id2str(target)); @@ -1326,7 +1920,14 @@ LNetPut(lnet_handle_md_t mdh, lnet_ack_req_t ack, CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); - memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + lnet_commit_md(md, msg); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; /* NB handles only looked up by creator (no flips) */ if (ack == LNET_ACK_REQ) { @@ -1338,13 +1939,6 @@ LNetPut(lnet_handle_md_t mdh, lnet_ack_req_t ack, msg->msg_hdr.msg.put.ack_wmd = LNET_WIRE_HANDLE_NONE; } - msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); - msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); - msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); - msg->msg_hdr.msg.put.hdr_data = hdr_data; - - lnet_commit_md(md, msg); - msg->msg_ev.type = LNET_EVENT_SEND; msg->msg_ev.initiator.nid = LNET_NID_ANY; msg->msg_ev.initiator.pid = the_lnet.ln_pid; @@ -1361,14 +1955,13 @@ LNetPut(lnet_handle_md_t mdh, lnet_ack_req_t ack, the_lnet.ln_counters.send_count++; the_lnet.ln_counters.send_length += md->md_length; - 
LNET_UNLOCK(flags); + LNET_UNLOCK(); - rc = lnet_send(NULL, NULL, msg, LNET_MSG_PUT, target, - md, 0, md->md_length); + rc = lnet_send(NULL, msg); if (rc != 0) { CERROR("Error sending PUT to %s: %d\n", libcfs_id2str(target), rc); - lnet_finalize (NULL, NULL, msg, rc); + lnet_finalize (NULL, msg, rc); } /* completion will be signalled by an event */ @@ -1388,12 +1981,11 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) lnet_msg_t *msg = lnet_msg_alloc(); lnet_libmd_t *getmd = getmsg->msg_md; lnet_nid_t peer_nid = getmsg->msg_target.nid; - unsigned long flags; LASSERT (!getmsg->msg_target_is_router); LASSERT (!getmsg->msg_routing); - LNET_LOCK(flags); + LNET_LOCK(); LASSERT (getmd->md_pending > 0); @@ -1429,7 +2021,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) the_lnet.ln_counters.recv_count++; the_lnet.ln_counters.recv_length += getmd->md_length; - LNET_UNLOCK(flags); + LNET_UNLOCK(); return msg; @@ -1439,7 +2031,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) the_lnet.ln_counters.drop_count++; the_lnet.ln_counters.drop_length += getmd->md_length; - LNET_UNLOCK (flags); + LNET_UNLOCK (); return NULL; } @@ -1451,7 +2043,6 @@ LNetGet(lnet_handle_md_t mdh, { lnet_msg_t *msg; lnet_libmd_t *md; - unsigned long flags; int rc; LASSERT (the_lnet.ln_init); @@ -1472,12 +2063,12 @@ LNetGet(lnet_handle_md_t mdh, return -ENOMEM; } - LNET_LOCK(flags); + LNET_LOCK(); md = lnet_handle2md(&mdh); if (md == NULL || md->md_threshold == 0) { lnet_msg_free(msg); - LNET_UNLOCK(flags); + LNET_UNLOCK(); CERROR("Dropping GET to %s: MD invalid\n", libcfs_id2str(target)); @@ -1486,20 +2077,20 @@ LNetGet(lnet_handle_md_t mdh, CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); - memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + lnet_commit_md(md, msg); - /* NB handles only looked up by creator (no flips) */ - msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = - the_lnet.ln_interface_cookie; - msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = 
- md->md_lh.lh_cookie; + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); - lnet_commit_md(md, msg); + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; msg->msg_ev.type = LNET_EVENT_SEND; msg->msg_ev.initiator.nid = LNET_NID_ANY; @@ -1516,14 +2107,13 @@ LNetGet(lnet_handle_md_t mdh, the_lnet.ln_counters.send_count++; - LNET_UNLOCK(flags); + LNET_UNLOCK(); - rc = lnet_send(NULL, NULL, msg, LNET_MSG_GET, target, - NULL, 0, 0); + rc = lnet_send(NULL, msg); if (rc != 0) { CERROR("error sending GET to %s: %d\n", libcfs_id2str(target), rc); - lnet_finalize (NULL, NULL, msg, rc); + lnet_finalize (NULL, msg, rc); } /* completion will be signalled by an event */ @@ -1536,5 +2126,5 @@ LNetDist (lnet_nid_t nid, int *order) LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - return kpr_distance(nid, order); + return lnet_distance(nid, order); } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 80b8c1c..6d97e19 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -27,7 +27,7 @@ #include void -lnet_enq_event_locked (void *private, lnet_eq_t *eq, lnet_event_t *ev) +lnet_enq_event_locked (lnet_eq_t *eq, lnet_event_t *ev) { lnet_event_t *eq_slot; @@ -62,18 +62,37 @@ lnet_enq_event_locked (void *private, lnet_eq_t *eq, lnet_event_t *ev) } void -lnet_finalize (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int status) +lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) { - lnet_libmd_t *md; - int unlink; - unsigned long flags; - int rc; - int send_ack; + lnet_handle_wire_t ack_wmd; + lnet_libmd_t *md; + int unlink; + int rc; if (msg == NULL) return; 
- LNET_LOCK(flags); +#if 0 + CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", + lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), + msg->msg_target_is_router ? "t" : "", + msg->msg_routing ? "X" : "", + msg->msg_ack ? "A" : "", + msg->msg_sending ? "S" : "", + msg->msg_receiving ? "R" : "", + msg->msg_recvaftersend ? "g" : "", + msg->msg_delayed ? "d" : "", + msg->msg_txcredit ? "C" : "", + msg->msg_peertxcredit ? "c" : "", + msg->msg_rtrcredit ? "F" : "", + msg->msg_peerrtrcredit ? "f" : "", + msg->msg_onactivelist ? "!" : "", + msg->msg_txpeer == NULL ? "" : libcfs_nid2str(msg->msg_txpeer->lp_nid), + msg->msg_rxpeer == NULL ? "" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); +#endif + LNET_LOCK(); + + LASSERT (msg->msg_onactivelist); md = msg->msg_md; if (md != NULL) { @@ -95,49 +114,61 @@ lnet_finalize (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int status) msg->msg_ev.unlinked = unlink; if (md->md_eq != NULL) - lnet_enq_event_locked(private, md->md_eq, &msg->msg_ev); + lnet_enq_event_locked(md->md_eq, &msg->msg_ev); if (unlink) lnet_md_unlink(md); msg->msg_md = NULL; } + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_return_credits_locked(msg); + + msg->msg_ack = 0; + LNET_UNLOCK(); - /* Only send an ACK if the PUT completed successfully */ - send_ack = (status == 0 && - !lnet_is_wire_handle_none(&msg->msg_ack_wmd)); + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); - if (!send_ack) { - list_del (&msg->msg_activelist); - the_lnet.ln_counters.msgs_alloc--; - lnet_msg_free(msg); + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); - LNET_UNLOCK(flags); - return; - } + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); - LNET_UNLOCK(flags); + rc = lnet_send(ni, msg); 
+ if (rc == 0) + return; - LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LNET_LOCK(); - memset (&msg->msg_hdr, 0, sizeof(msg->msg_hdr)); - msg->msg_hdr.msg.ack.dst_wmd = msg->msg_ack_wmd; - msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; - msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { /* not forwarded */ + + LASSERT (!msg->msg_receiving); /* called back recv already */ + + LNET_UNLOCK(); + + rc = lnet_send(NULL, msg); + if (rc == 0) + return; - msg->msg_ack_wmd = LNET_WIRE_HANDLE_NONE; + LNET_LOCK(); + } - rc = lnet_send(ni, private, msg, LNET_MSG_ACK, - msg->msg_ev.initiator, NULL, 0, 0); - if (rc != 0) { - /* send failed: there's nothing else to clean up. */ - CERROR("Error %d sending ACK to %s\n", - rc, libcfs_id2str(msg->msg_ev.initiator)); + lnet_return_credits_locked(msg); - LNET_LOCK(flags); - list_del (&msg->msg_activelist); - the_lnet.ln_counters.msgs_alloc--; - lnet_msg_free(msg); - LNET_UNLOCK(flags); - } + LASSERT (msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del (&msg->msg_activelist); + the_lnet.ln_counters.msgs_alloc--; + lnet_msg_free(msg); + + LNET_UNLOCK(); } + diff --git a/lnet/lnet/lo.c b/lnet/lnet/lo.c index 2fa1708..b5e2229 100644 --- a/lnet/lnet/lo.c +++ b/lnet/lnet/lo.c @@ -29,9 +29,9 @@ lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (!lntmsg->msg_routing); LASSERT (!lntmsg->msg_target_is_router); - rc = lnet_parse(ni, &lntmsg->msg_hdr, lntmsg); + rc = lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg); if (rc >= 0) - lnet_finalize(ni, private, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); return rc; } @@ -69,7 +69,7 @@ lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, sendmsg->msg_offset, mlen); } - lnet_finalize(ni, private, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); return 0; } diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c new file mode 100644 index 0000000..2319131 
--- /dev/null +++ b/lnet/lnet/peer.c @@ -0,0 +1,210 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-move.c + * Data movement routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_PORTALS + +#include + +int +lnet_create_peer_table(void) +{ + struct list_head *hash; + int i; + + LASSERT (the_lnet.ln_peer_hash == NULL); + PORTAL_ALLOC(hash, LNET_PEER_HASHSIZE * sizeof(struct list_head)); + + if (hash == NULL) { + CERROR("Can't allocate peer hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_PEER_HASHSIZE; i++) + CFS_INIT_LIST_HEAD(&hash[i]); + + the_lnet.ln_peer_hash = hash; + return 0; +} + +void +lnet_destroy_peer_table(void) +{ + int i; + + if (the_lnet.ln_peer_hash == NULL) + return; + + for (i = 0; i < LNET_PEER_HASHSIZE; i++) + LASSERT (list_empty(&the_lnet.ln_peer_hash[i])); + + PORTAL_FREE(the_lnet.ln_peer_hash, + LNET_PEER_HASHSIZE * sizeof (struct list_head)); + the_lnet.ln_peer_hash = NULL; +} + +void +lnet_clear_peer_table(void) +{ + int i; + + LASSERT (the_lnet.ln_shutdown); /* i.e. 
no new peers */ + + for (i = 0; i < LNET_PEER_HASHSIZE; i++) { + struct list_head *peers = &the_lnet.ln_peer_hash[i]; + + LNET_LOCK(); + while (!list_empty(peers)) { + lnet_peer_t *lp = list_entry(peers->next, + lnet_peer_t, lp_hashlist); + + list_del(&lp->lp_hashlist); + lnet_peer_decref_locked(lp); /* lose hash table's ref */ + } + LNET_UNLOCK(); + } + + LNET_LOCK(); + for (i = 3; the_lnet.ln_npeers != 0;i++) { + LNET_UNLOCK(); + + if ((i & (i-1)) == 0) + CDEBUG(D_WARNING,"Waiting for %d peers\n", + the_lnet.ln_npeers); + cfs_pause(cfs_time_seconds(1)); + + LNET_LOCK(); + } + LNET_UNLOCK(); +} + +void +lnet_destroy_peer_locked (lnet_peer_t *lp) +{ + lnet_ni_decref_locked(lp->lp_ni); + LNET_UNLOCK(); + + LASSERT (lp->lp_refcount == 0); + LASSERT (list_empty(&lp->lp_txq)); + LASSERT (lp->lp_txqnob == 0); + + PORTAL_FREE(lp, sizeof(*lp)); + + LNET_LOCK(); + + LASSERT(the_lnet.ln_npeers > 0); + the_lnet.ln_npeers--; +} + +lnet_peer_t * +lnet_find_peer_locked (lnet_nid_t nid) +{ + unsigned int idx = PTL_NIDADDR(nid) % LNET_PEER_HASHSIZE; + struct list_head *peers = &the_lnet.ln_peer_hash[idx]; + struct list_head *tmp; + lnet_peer_t *lp; + + if (the_lnet.ln_shutdown) + return NULL; + + list_for_each (tmp, peers) { + lp = list_entry(tmp, lnet_peer_t, lp_hashlist); + + if (lp->lp_nid == nid) { + lnet_peer_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +int +lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid) +{ + struct timeval now; + lnet_peer_t *lp; + lnet_peer_t *lp2; + + lp = lnet_find_peer_locked(nid); + if (lp != NULL) { + *lpp = lp; + return 0; + } + + LNET_UNLOCK(); + + PORTAL_ALLOC(lp, sizeof(*lp)); + if (lp == NULL) { + *lpp = NULL; + return -ENOMEM; + } + + do_gettimeofday (&now); + + CFS_INIT_LIST_HEAD(&lp->lp_txq); + CFS_INIT_LIST_HEAD(&lp->lp_rtrq); + + lp->lp_alive = 1; + lp->lp_timestamp = now.tv_sec; + lp->lp_nid = nid; + lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ + + LNET_LOCK(); + + lp2 = lnet_find_peer_locked(nid); + if 
(lp2 != NULL) { + LNET_UNLOCK(); + PORTAL_FREE(lp, sizeof(*lp)); + LNET_LOCK(); + + *lpp = lp2; + return 0; + } + + lp->lp_ni = lnet_net2ni_locked(PTL_NIDNET(nid)); + if (lp->lp_ni == NULL) { + LNET_UNLOCK(); + PORTAL_FREE(lp, sizeof(*lp)); + LNET_LOCK(); + + *lpp = NULL; + return the_lnet.ln_shutdown ? -ESHUTDOWN : -EHOSTUNREACH; + } + + lp->lp_txcredits = + lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; + + /* As a first approximation; allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + lp->lp_rtrcredits = lp->lp_txcredits; + + LASSERT (!the_lnet.ln_shutdown); + /* can't add peers after shutdown starts */ + + list_add_tail(&lp->lp_hashlist, lnet_nid2peerhash(nid)); + the_lnet.ln_npeers++; + the_lnet.ln_peertable_version++; + *lpp = lp; + return 0; +} diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 1231f1c..54f87f6 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -21,12 +21,10 @@ * */ -#include "router.h" +#include #ifdef __KERNEL__ -kpr_state_t kpr_state; - static int forwarding = 0; CFS_MODULE_PARM(forwarding, "i", int, 0444, "Boolean: set non-zero to forward between networks"); @@ -35,11 +33,23 @@ static char *routes = ""; CFS_MODULE_PARM(routes, "s", charp, 0444, "routes to non-local networks"); -int -lnet_forwarding () +static int tiny_router_buffers = 512; +CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444, + "# of 0 payload messages to buffer in the router"); +static int small_router_buffers = 256; +CFS_MODULE_PARM(small_router_buffers, "i", int, 0444, + "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers = 16; +CFS_MODULE_PARM(large_router_buffers, "i", int, 0444, + "# of large messages to buffer in the router"); + +typedef struct { - return forwarding; -} + work_struct_t kpru_tq; + lnet_nid_t kpru_nid; + int kpru_alive; + time_t kpru_when; +} kpr_upcall_t; void kpr_do_upcall (void *arg) @@ -87,12 +97,8 @@ kpr_upcall (lnet_nid_t gw_nid, int alive, 
time_t when) int lnet_notify (lnet_ni_t *ni, lnet_nid_t gateway_nid, int alive, time_t when) { - unsigned long flags; - int found; - kpr_gateway_entry_t *ge = NULL; + lnet_peer_t *lp = NULL; struct timeval now; - struct list_head *e; - struct list_head *n; CDEBUG (D_NET, "%s notifying %s: %s\n", (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), @@ -118,250 +124,85 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t gateway_nid, int alive, time_t when) return -EINVAL; } - /* Serialise with lookups (i.e. write lock) */ - write_lock_irqsave(&kpr_state.kpr_rwlock, flags); - - found = 0; - list_for_each_safe (e, n, &kpr_state.kpr_gateways) { - - ge = list_entry(e, kpr_gateway_entry_t, kpge_list); - if (ge->kpge_nid != gateway_nid) - continue; - - found = 1; - break; - } - - if (!found) { + LNET_LOCK(); + + lp = lnet_find_peer_locked(gateway_nid); + if (lp == NULL) { /* gateway not found */ - write_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LNET_UNLOCK(); CDEBUG (D_NET, "Gateway not found\n"); return (0); } - if (when < ge->kpge_timestamp) { + if (when < lp->lp_timestamp) { /* out of date information */ - write_unlock_irqrestore (&kpr_state.kpr_rwlock, flags); + lnet_peer_decref_locked(lp); + LNET_UNLOCK(); CDEBUG (D_NET, "Out of date\n"); return (0); } /* update timestamp */ - ge->kpge_timestamp = when; + lp->lp_timestamp = when; - if ((!ge->kpge_alive) == (!alive)) { + if ((!lp->lp_alive) == (!alive)) { /* new date for old news */ - write_unlock_irqrestore (&kpr_state.kpr_rwlock, flags); + lnet_peer_decref_locked(lp); + LNET_UNLOCK(); CDEBUG (D_NET, "Old news\n"); return (0); } - ge->kpge_alive = alive; - CDEBUG(D_NET, "set %s [%p] %d\n", - libcfs_nid2str(gateway_nid), ge, alive); - - if (alive) { - /* Reset all gateway weights so the newly-enabled gateway - * doesn't have to play catch-up */ - list_for_each_safe (e, n, &kpr_state.kpr_gateways) { - kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t, - kpge_list); - atomic_set (&ge->kpge_weight, 0); - } - 
} + lp->lp_alive = alive; - write_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LNET_UNLOCK(); + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(gateway_nid), alive); if (ni == NULL) { /* userland notified me: notify NAL? */ - ni = lnet_net2ni(PTL_NIDNET(gateway_nid)); - if (ni != NULL) { + ni = lp->lp_ni; + if (ni->ni_lnd->lnd_notify != NULL) { ni->ni_lnd->lnd_notify(ni, gateway_nid, alive); - lnet_ni_decref(ni); } } else { /* It wasn't userland that notified me... */ + LBUG(); /* LND notification disabled for now */ CWARN ("Upcall: NID %s is %s\n", libcfs_nid2str(gateway_nid), alive ? "alive" : "dead"); kpr_upcall (gateway_nid, alive, when); } - return (0); -} + LNET_LOCK(); + lnet_peer_decref_locked(lp); + LNET_UNLOCK(); -int -kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2) -{ - const int significant_bits = 0x00ffffff; - /* We use atomic_t to record/compare route weights for - * load-balancing. Here we limit ourselves to only using - * 'significant_bits' when we do an 'after' comparison */ - - int diff = (atomic_read (&ge1->kpge_weight) - - atomic_read (&ge2->kpge_weight)) & significant_bits; - int rc = (diff > (significant_bits >> 1)); - - CDEBUG(D_NET, "[%p]%s=%d %s [%p]%s=%d\n", - ge1, libcfs_nid2str(ge1->kpge_nid), - atomic_read (&ge1->kpge_weight), - rc ? ">" : "<", - ge2, libcfs_nid2str(ge2->kpge_nid), - atomic_read (&ge2->kpge_weight)); - - return (rc); + return (0); } -void -kpr_update_weight (kpr_gateway_entry_t *ge, int nob) +lnet_remotenet_t * +lnet_find_net_locked (__u32 net) { - int weight = 1 + (nob + sizeof (lnet_hdr_t)/2)/sizeof (lnet_hdr_t); - - /* We've chosen this route entry (i.e. gateway) to forward payload - * of length 'nob'; update the route's weight to make it less - * favoured. Note that the weight is 1 plus the payload size - * rounded and scaled to the portals header size, so we get better - * use of the significant bits in kpge_weight. 
*/ + lnet_remotenet_t *rnet; + struct list_head *tmp; - CDEBUG(D_NET, "gateway [%p]%s += %d\n", ge, - libcfs_nid2str(ge->kpge_nid), weight); + LASSERT (!the_lnet.ln_shutdown); - atomic_add (weight, &ge->kpge_weight); -} - -lnet_nid_t -lnet_lookup (lnet_ni_t **nip, lnet_nid_t target_nid, int nob) -{ - lnet_ni_t *ni = *nip; - lnet_nid_t gwnid; - struct list_head *e; - kpr_net_entry_t *ne = NULL; - kpr_route_entry_t *re; - int found; - unsigned long flags; - lnet_ni_t *gwni = NULL; - kpr_gateway_entry_t *ge = NULL; - __u32 target_net = PTL_NIDNET(target_nid); - __u32 gateway_net; - - /* Return the NID I must send to, to reach 'target_nid' */ - - CDEBUG (D_NET, "lookup %s from %s\n", libcfs_nid2str(target_nid), - (ni == NULL) ? "<>" : libcfs_nid2str(ni->ni_nid)); - - if (ni == NULL) { /* ni not determined yet */ - gwni = lnet_net2ni(target_net); /* is it a local network? */ - if (gwni != NULL) { - *nip = gwni; - return target_nid; - } - } else { /* ni already determined */ - if (PTL_NETTYP(PTL_NIDNET(ni->ni_nid)) == LOLND || - target_net == PTL_NIDNET(ni->ni_nid) || - (the_lnet.ln_ptlcompat > 0 && - PTL_NETTYP(target_net) == 0)) { - lnet_ni_addref(ni); /* extra ref so caller can drop blindly */ - return target_nid; - } - } - - CDEBUG(D_NET, "%s from %s\n", libcfs_nid2str(target_nid), - (ni == NULL) ? 
"" : libcfs_nid2str(ni->ni_nid)); - - read_lock_irqsave(&kpr_state.kpr_rwlock, flags); - - if (ni != NULL && ni->ni_shutdown) { - /* pre-determined ni is shutting down */ - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - return LNET_NID_ANY; - } - - /* Search for the target net */ - found = 0; - list_for_each (e, &kpr_state.kpr_nets) { - ne = list_entry (e, kpr_net_entry_t, kpne_list); + list_for_each (tmp, &the_lnet.ln_remote_nets) { + rnet = list_entry(tmp, lnet_remotenet_t, lrn_list); - found = ne->kpne_net == target_net; - if (found) - break; - } - - if (!found) { - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - return LNET_NID_ANY; + if (rnet->lrn_net == net) + return rnet; } - - /* Search routes for one that has a gateway to target_nid on the callers network */ - list_for_each (e, &ne->kpne_routes) { - re = list_entry (e, kpr_route_entry_t, kpre_list); - - if (!re->kpre_gateway->kpge_alive) /* gateway down */ - continue; - - gateway_net = PTL_NIDNET(re->kpre_gateway->kpge_nid); - - if (ni != NULL) { - /* local ni determined */ - if (gateway_net != /* gateway not on ni's net */ - PTL_NIDNET(ni->ni_nid)) - continue; - - if (ge != NULL && - kpr_ge_isbetter (ge, re->kpre_gateway)) - continue; - - } else if (!lnet_islocalnet(gateway_net, NULL)) { - continue; /* not on a local net */ - } else if (ge != NULL) { - /* already got 1 candidate gateway */ - LASSERT (gwni != NULL); - - if (kpr_ge_isbetter(ge, re->kpre_gateway)) - continue; - } else { - LASSERT (gwni == NULL); - gwni = lnet_net2ni(gateway_net); - - if (gwni == NULL) /* local nets changed */ - continue; - } - - ge = re->kpre_gateway; - } - - if (ge == NULL) { - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - LASSERT (gwni == NULL); - - return LNET_NID_ANY; - } - - kpr_update_weight(ge, nob); - gwnid = ge->kpge_nid; - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - - /* NB can't deref 're/ge' after lock released! 
*/ - CDEBUG (D_NET, "lookup %s from %s: %s\n", - libcfs_nid2str(target_nid), - (ni == NULL) ? "<>" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(gwnid)); - - LASSERT ((gwni == NULL) != (ni == NULL)); - - if (ni != NULL) - lnet_ni_addref(ni); /* extra ref so caller can drop blindly */ - else - *nip = gwni; /* already got a ref */ - - return gwnid; + return NULL; } int -kpr_distance (lnet_nid_t nid, int *orderp) +lnet_distance (lnet_nid_t nid, int *orderp) { - unsigned long flags; struct list_head *e; - kpr_net_entry_t *ne; + lnet_remotenet_t *rnet; __u32 net = PTL_NIDNET(nid); int dist = -ENETUNREACH; int order = 0; @@ -369,13 +210,13 @@ kpr_distance (lnet_nid_t nid, int *orderp) if (lnet_islocalnet(net, orderp)) return 0; - read_lock_irqsave(&kpr_state.kpr_rwlock, flags); + LNET_LOCK(); - list_for_each (e, &kpr_state.kpr_nets) { - ne = list_entry (e, kpr_net_entry_t, kpne_list); + list_for_each (e, &the_lnet.ln_remote_nets) { + rnet = list_entry(e, lnet_remotenet_t, lrn_list); - if (ne->kpne_net == net) { - dist = ne->kpne_hops; + if (rnet->lrn_net == net) { + dist = rnet->lrn_hops; if (orderp != NULL) *orderp = order; break; @@ -383,407 +224,481 @@ kpr_distance (lnet_nid_t nid, int *orderp) order++; } - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LNET_UNLOCK(); return dist; } int -kpr_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway_nid) +lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) { - unsigned long flags; struct list_head *e; - kpr_net_entry_t *ne = NULL; - kpr_route_entry_t *re = NULL; - kpr_gateway_entry_t *ge = NULL; - int dup = 0; - - CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n", - libcfs_net2str(net), hops, libcfs_nid2str(gateway_nid)); - - if (gateway_nid == LNET_NID_ANY || + lnet_remotenet_t *rnet; + lnet_remotenet_t *rnet2; + lnet_route_t *route; + lnet_route_t *route2; + lnet_peer_t *lp; + int dup; + int hops2; + __u32 net2; + int rc; + + CDEBUG(D_WARNING, "Add route: net %s hops %u gw %s\n", + 
libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || hops < 1 || hops > 255) return (-EINVAL); - /* Assume net, route, gateway all new */ - PORTAL_ALLOC(ge, sizeof(*ge)); - PORTAL_ALLOC(re, sizeof(*re)); - PORTAL_ALLOC(ne, sizeof(*ne)); - - if (ge == NULL || re == NULL || ne == NULL) { - if (ge != NULL) - PORTAL_FREE(ge, sizeof(*ge)); - if (re != NULL) - PORTAL_FREE(re, sizeof(*re)); - if (ne != NULL) - PORTAL_FREE(ne, sizeof(*ne)); + if (lnet_islocalnet(net, NULL)) /* it's a local network */ + return 0; /* ignore the route entry */ + + /* Assume net, route, all new */ + PORTAL_ALLOC(route, sizeof(*route)); + PORTAL_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + PORTAL_FREE(route, sizeof(*route)); + if (rnet != NULL) + PORTAL_FREE(rnet, sizeof(*rnet)); return -ENOMEM; } - ge->kpge_nid = gateway_nid; - ge->kpge_alive = 1; - ge->kpge_timestamp = 0; - ge->kpge_refcount = 0; - atomic_set (&ge->kpge_weight, 0); + LNET_LOCK(); - ne->kpne_net = net; - ne->kpne_hops = hops; - INIT_LIST_HEAD(&ne->kpne_routes); - - LASSERT(!in_interrupt()); - write_lock_irqsave(&kpr_state.kpr_rwlock, flags); + rc = lnet_nid2peer_locked(&lp, gateway); + if (rc != 0) { + LNET_UNLOCK(); - list_for_each (e, &kpr_state.kpr_nets) { - kpr_net_entry_t *ne2 = - list_entry(e, kpr_net_entry_t, kpne_list); - - if (ne2->kpne_net == net) { - PORTAL_FREE(ne, sizeof(*ne)); - ne = ne2; - dup = 1; - break; - } + PORTAL_FREE(route, sizeof(*route)); + PORTAL_FREE(rnet, sizeof(*rnet)); + + if (rc == -EHOSTUNREACH) /* gateway is not on a local net */ + return 0; /* ignore the route entry */ + + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + return rc; } + + LASSERT (!the_lnet.ln_shutdown); - if (!dup) { /* Adding a new network? 
*/ - list_add_tail(&ne->kpne_list, &kpr_state.kpr_nets); - } else { - if (ne->kpne_hops != hops) { - unsigned int hops2 = ne->kpne_hops; + rnet2 = lnet_find_net_locked(net); + if (rnet2 == NULL) { + /* new network */ + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + rnet->lrn_hops = hops; + rnet->lrn_ni = lp->lp_ni; - write_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + lnet_ni_addref_locked(rnet->lrn_ni); - CERROR("Inconsistent hop count %d(%d) for network %s\n", - hops, hops2, libcfs_net2str(net)); - PORTAL_FREE(re, sizeof(*re)); - PORTAL_FREE(ge, sizeof(*ge)); - return -EINVAL; - } + list_add_tail(&rnet->lrn_list, &the_lnet.ln_remote_nets); + route->lr_gateway = lp; + list_add_tail(&route->lr_list, &rnet->lrn_routes); + + the_lnet.ln_remote_nets_version++; + LNET_UNLOCK(); + return 0; + } + + hops2 = rnet2->lrn_hops; + net2 = PTL_NIDNET(rnet2->lrn_ni->ni_nid); + + if (rnet2->lrn_ni == lp->lp_ni && hops2 == hops) { + /* New route consistent with existing routes; search for + * duplicate route (NOOP if this is) */ dup = 0; - list_for_each (e, &ne->kpne_routes) { - kpr_route_entry_t *re2 = - list_entry(e, kpr_route_entry_t, kpre_list); - - if (PTL_NIDNET(re2->kpre_gateway->kpge_nid) != - PTL_NIDNET(gateway_nid)) { - /* different gateway nets is an error */ - dup = -1; - break; - } - - if (re2->kpre_gateway->kpge_nid == gateway_nid) { - /* same gateway is a noop */ + list_for_each (e, &rnet2->lrn_routes) { + route2 = list_entry(e, lnet_route_t, lr_list); + + if (route2->lr_gateway->lp_nid == gateway) { dup = 1; break; } } - if (dup != 0) { - /* Don't add duplicate/bad route entry */ - write_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - - PORTAL_FREE(re, sizeof(*re)); - PORTAL_FREE(ge, sizeof(*ge)); - - return (dup < 0) ? 
-EINVAL : 0; + if (!dup) { + /* New route */ + list_add_tail(&route->lr_list, &rnet2->lrn_routes); + the_lnet.ln_remote_nets_version++; + } else { + lnet_peer_decref_locked(lp); } - } - - list_add_tail(&re->kpre_list, &ne->kpne_routes); - list_for_each (e, &kpr_state.kpr_gateways) { - kpr_gateway_entry_t *ge2 = - list_entry(e, kpr_gateway_entry_t, kpge_list); - - if (ge2->kpge_nid == gateway_nid) { - PORTAL_FREE (ge, sizeof (*ge)); - ge = ge2; - dup = 1; - break; - } - } - - if (!dup) { - /* Adding a new gateway... */ - list_add (&ge->kpge_list, &kpr_state.kpr_gateways); + LNET_UNLOCK(); - /* ...zero all gateway weights so this one doesn't have to - * play catch-up */ + PORTAL_FREE(rnet, sizeof(*rnet)); + if (dup) + PORTAL_FREE(route, sizeof(*route)); - list_for_each (e, &kpr_state.kpr_gateways) { - kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, - kpge_list); - atomic_set (&ge2->kpge_weight, 0); - } + return 0; } - re->kpre_gateway = ge; - ge->kpge_refcount++; - kpr_state.kpr_generation++; - - write_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - return 0; + lnet_peer_decref_locked(lp); + LNET_UNLOCK(); + PORTAL_FREE(rnet, sizeof(*rnet)); + PORTAL_FREE(route, sizeof(*route)); + + if (hops != hops2) + CERROR("Hopcount not consistent on route: %s %d(%d) %s\n", + libcfs_net2str(net), hops, hops2, + libcfs_nid2str(gateway)); + else + CERROR("Router network not consistent on route: %s %d %s(%s)\n", + libcfs_net2str(net), hops, + libcfs_nid2str(gateway), libcfs_net2str(net2)); + return -EINVAL; } int -kpr_del_route (__u32 net, lnet_nid_t gw_nid) +lnet_del_route (__u32 net, lnet_nid_t gw_nid) { - unsigned long flags; - kpr_net_entry_t *ne; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge; + lnet_remotenet_t *rnet; + lnet_route_t *route; struct list_head *e1; - struct list_head *n1; struct list_head *e2; - struct list_head *n2; int rc = -ENOENT; - CDEBUG(D_NET, "Del route: net %s : gw %s\n", + CDEBUG(D_WARNING, "Del route: net %s : gw %s\n", 
libcfs_net2str(net), libcfs_nid2str(gw_nid)); - LASSERT(!in_interrupt()); /* NB Caller may specify either all routes via the given gateway * or a specific route entry actual NIDs) */ - write_lock_irqsave(&kpr_state.kpr_rwlock, flags); + again: + LNET_LOCK(); - list_for_each_safe (e1, n1, &kpr_state.kpr_nets) { - ne = list_entry(e1, kpr_net_entry_t, kpne_list); + list_for_each (e1, &the_lnet.ln_remote_nets) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); - if (!(net != PTL_NIDNET(LNET_NID_ANY) || - net == ne->kpne_net)) + if (!(net == PTL_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) continue; - list_for_each_safe (e2, n2, &ne->kpne_routes) { - re = list_entry(e2, kpr_route_entry_t, kpre_list); - ge = re->kpre_gateway; + list_for_each (e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, + lr_list); if (!(gw_nid == LNET_NID_ANY || - gw_nid == ge->kpge_nid)) + gw_nid == route->lr_gateway->lp_nid)) continue; - rc = 0; - - if (--ge->kpge_refcount == 0) { - list_del (&ge->kpge_list); - PORTAL_FREE (ge, sizeof (*ge)); - } + list_del(&route->lr_list); + the_lnet.ln_remote_nets_version++; - list_del(&re->kpre_list); - PORTAL_FREE(re, sizeof (*re)); - } + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_peer_decref_locked(route->lr_gateway); + LNET_UNLOCK(); - if (list_empty(&ne->kpne_routes)) { - list_del(&ne->kpne_list); - PORTAL_FREE(ne, sizeof(*ne)); + PORTAL_FREE(route, sizeof (*route)); + + if (rnet != NULL) { + lnet_ni_decref(rnet->lrn_ni); + PORTAL_FREE(rnet, sizeof(*rnet)); + } + + rc = 0; + goto again; } } - if (rc == 0) - kpr_state.kpr_generation++; - - write_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); - + LNET_UNLOCK(); return rc; } int -kpr_get_route (int idx, __u32 *net, __u32 *hops, - lnet_nid_t *gateway_nid, __u32 *alive, __u32 *ignored) +lnet_get_route (int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive) { struct list_head *e1; struct list_head *e2; - kpr_net_entry_t 
*ne; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge; - unsigned long flags; + lnet_remotenet_t *rnet; + lnet_route_t *route; - LASSERT (!in_interrupt()); - read_lock_irqsave(&kpr_state.kpr_rwlock, flags); + LNET_LOCK(); - list_for_each (e1, &kpr_state.kpr_nets) { - ne = list_entry(e1, kpr_net_entry_t, kpne_list); + list_for_each (e1, &the_lnet.ln_remote_nets) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); - list_for_each (e2, &ne->kpne_routes) { - re = list_entry(e2, kpr_route_entry_t, kpre_list); - ge = re->kpre_gateway; + list_for_each (e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); if (idx-- == 0) { - *net = ne->kpne_net; - *hops = ne->kpne_hops; - *gateway_nid = ge->kpge_nid; - *alive = ge->kpge_alive; - *ignored = lnet_islocalnet(ne->kpne_net, NULL) || - !lnet_islocalnet(PTL_NIDNET(ge->kpge_nid), NULL); - - read_unlock_irqrestore(&kpr_state.kpr_rwlock, - flags); + *net = rnet->lrn_net; + *hops = rnet->lrn_hops; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + LNET_UNLOCK(); return 0; } } } - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LNET_UNLOCK(); return -ENOENT; } -int -kpr_ctl(unsigned int cmd, void *arg) +void +lnet_destory_rtrbuf(lnet_rtrbuf_t *rb, int npages) { - struct portal_ioctl_data *data = arg; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); - switch(cmd) { - default: - return -EINVAL; - - case IOC_PORTAL_ADD_ROUTE: - return kpr_add_route(data->ioc_net, data->ioc_count, - data->ioc_nid); + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + PORTAL_FREE(rb, sz); +} + +lnet_rtrbuf_t * +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + struct page *page; + lnet_rtrbuf_t *rb; + int i; + + PORTAL_ALLOC(rb, sz); - case IOC_PORTAL_DEL_ROUTE: - return kpr_del_route (data->ioc_net, data->ioc_nid); + rb->rb_pool = rbp; - case IOC_PORTAL_GET_ROUTE: { - int alive; - int 
ignored; - int rc; + for (i = 0; i < npages; i++) { + page = alloc_page(GFP_KERNEL); /* HIGH? */ + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); - rc = kpr_get_route(data->ioc_count, - &data->ioc_net, &data->ioc_count, - &data->ioc_nid, &alive, &ignored); - data->ioc_flags = ( alive ? 1 : 0) | - (ignored ? 2 : 0); - return rc; - } - - case IOC_PORTAL_NOTIFY_ROUTER: - return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, - (time_t)data->ioc_u64[0]); + PORTAL_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; } + + return rb; } - void -kpr_finalise (void) +lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) { - kpr_proc_fini(); + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; - while (!list_empty (&kpr_state.kpr_nets)) { - kpr_net_entry_t *ne = list_entry(kpr_state.kpr_nets.next, - kpr_net_entry_t, kpne_list); + LASSERT (list_empty(&rbp->rbp_msgs)); + LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers); + + while (!list_empty(&rbp->rbp_bufs)) { + LASSERT (rbp->rbp_credits > 0); - while (!list_empty (&ne->kpne_routes)) { - kpr_route_entry_t *re = list_entry(ne->kpne_routes.next, - kpr_route_entry_t, - kpre_list); + rb = list_entry(rbp->rbp_bufs.next, + lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + lnet_destory_rtrbuf(rb, npages); + nbuffers++; + } - list_del(&re->kpre_list); - PORTAL_FREE(re, sizeof(*re)); - } + LASSERT (rbp->rbp_nbuffers == nbuffers); + LASSERT (rbp->rbp_credits == nbuffers); + + rbp->rbp_nbuffers = rbp->rbp_credits = 0; +} - list_del(&ne->kpne_list); - PORTAL_FREE(ne, sizeof(*ne)); +int +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs) +{ + lnet_rtrbuf_t *rb; + int i; + + for (i = 0; i < nbufs; i++) { + rb = lnet_new_rtrbuf(rbp); + + if (rb == NULL) { + CERROR("Failed to allocate %d router bufs of %d pages\n", + nbufs, rbp->rbp_npages); + return -ENOMEM; + } + + rbp->rbp_nbuffers++; + 
rbp->rbp_credits++; + list_add(&rb->rb_list, &rbp->rbp_bufs); + + /* NB if this is live there need to be code to schedule blocked + * msgs */ } - while (!list_empty (&kpr_state.kpr_gateways)) { - kpr_gateway_entry_t *ge = list_entry(kpr_state.kpr_gateways.next, - kpr_gateway_entry_t, - kpge_list); + LASSERT (rbp->rbp_credits == nbufs); + return 0; +} - list_del(&ge->kpge_list); - PORTAL_FREE(ge, sizeof (*ge)); - } +void +lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) +{ + CFS_INIT_LIST_HEAD(&rbp->rbp_msgs); + CFS_INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} - CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", - atomic_read(&libcfs_kmemory)); +void +lnet_free_rtrpools(void) +{ + lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[0]); + lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[1]); + lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[2]); } int -kpr_initialise (void) +lnet_alloc_rtrpools(void) { - int rc; - - CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", - atomic_read(&libcfs_kmemory)); + int small_pages = 1; + int large_pages = (PTL_MTU + PAGE_SIZE - 1) / PAGE_SIZE; + int rc; - memset(&kpr_state, 0, sizeof(kpr_state)); + lnet_rtrpool_init(&the_lnet.ln_rtrpools[0], 0); + lnet_rtrpool_init(&the_lnet.ln_rtrpools[1], small_pages); + lnet_rtrpool_init(&the_lnet.ln_rtrpools[2], large_pages); - INIT_LIST_HEAD(&kpr_state.kpr_nets); - INIT_LIST_HEAD(&kpr_state.kpr_gateways); - rwlock_init(&kpr_state.kpr_rwlock); - spin_lock_init(&kpr_state.kpr_stats_lock); + for (rc = 0; rc < LNET_NRBPOOLS; rc++) + CDEBUG(D_WARNING, "Pages[%d]: %d\n", rc, + the_lnet.ln_rtrpools[rc].rbp_npages); - rc = lnet_parse_routes(routes); + the_lnet.ln_routing = forwarding; + if (!forwarding) + return 0; + + if (tiny_router_buffers <= 0) { + LCONSOLE_ERROR("tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + rc = -EINVAL; + goto failed; + } + + rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[0], + 
tiny_router_buffers); + if (rc != 0) + goto failed; + + if (small_router_buffers <= 0) { + LCONSOLE_ERROR("small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + rc = -EINVAL; + goto failed; + } + + rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[1], + small_router_buffers); if (rc != 0) - kpr_finalise(); + goto failed; + + if (large_router_buffers <= 0) { + LCONSOLE_ERROR("large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + rc = -EINVAL; + goto failed; + } - if (rc == 0) - kpr_proc_init(); + rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[2], + large_router_buffers); + if (rc != 0) + goto failed; - return (rc == 0) ? 0 : -EINVAL; + return 0; + + failed: + lnet_free_rtrpools(); + return rc; } -EXPORT_SYMBOL(lnet_forwarding); -EXPORT_SYMBOL(lnet_lookup); -EXPORT_SYMBOL(lnet_notify); -#else +void +lnet_router_fini (void) +{ + lnet_del_route(PTL_NIDNET(LNET_NID_ANY), LNET_NID_ANY); + lnet_free_rtrpools(); +} -lnet_nid_t -lnet_lookup (lnet_ni_t **nip, lnet_nid_t target_nid, int nob) +int +lnet_router_init (void) { - lnet_ni_t *ni = *nip; - lnet_ni_t *gwni; - __u32 target_net = PTL_NIDNET(target_nid); - - if (ni == NULL) { /* ni not determined yet */ - gwni = lnet_net2ni(target_net); /* is it a local network? 
*/ - if (gwni != NULL) { - *nip = gwni; - return target_nid; - } - } else { /* ni already determined */ - if (target_net == PTL_NIDNET(ni->ni_nid)) { - lnet_ni_addref(ni); /* extra ref so caller can drop blindly */ - return target_nid; - } + int rc; + + rc = lnet_alloc_rtrpools(); + if (rc != 0) + return rc; + + rc = lnet_parse_routes(routes); + if (rc != 0) { + lnet_del_route(PTL_NIDNET(LNET_NID_ANY), LNET_NID_ANY); + lnet_free_rtrpools(); + return rc; } - CERROR("Nid %s is not on a local network and " - "userspace portals does not support routing\n", - libcfs_nid2str(target_nid)); + return 0; +} + +EXPORT_SYMBOL(lnet_notify); + +#else - return LNET_NID_ANY; +int +lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) +{ + return -EOPNOTSUPP; } int -kpr_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway_nid) +lnet_del_route (__u32 net, lnet_nid_t gw_nid) { return -EOPNOTSUPP; } -int -kpr_ctl(unsigned int cmd, void *arg) +int +lnet_get_route (int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive) { - return -EINVAL; + return -ENOENT; +} + +lnet_remotenet_t * +lnet_find_net_locked (__u32 net) +{ + return NULL; } int -kpr_distance(lnet_nid_t nid, int *orderp) +lnet_distance(lnet_nid_t nid, int *orderp) { - if (!lnet_net2ni(PTL_NIDNET(nid))) + if (!lnet_islocalnet(PTL_NIDNET(nid), orderp)) return -ENETUNREACH; return 0; } +int +lnet_notify (lnet_ni_t *ni, lnet_nid_t gateway_nid, int alive, time_t when) +{ + return -EOPNOTSUPP; +} + void -kpr_finalise (void) +lnet_router_fini (void) { } int -kpr_initialise (void) +lnet_router_init (void) { return 0; } diff --git a/lnet/lnet/router.h b/lnet/lnet/router.h deleted file mode 100644 index 2964854..0000000 --- a/lnet/lnet/router.h +++ /dev/null @@ -1,87 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Lustre, http://www.lustre.org - * - * Portals is free software; you can redistribute it and/or - * 
modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#ifndef _KLNETROUTER_H -#define _KLNETROUTER_H -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#define DEBUG_SUBSYSTEM S_PTLROUTER - -#include - -#ifdef __KERNEL__ - -typedef struct -{ - struct list_head kpge_list; - atomic_t kpge_weight; - time_t kpge_timestamp; - int kpge_alive; - int kpge_checked; - int kpge_refcount; - lnet_nid_t kpge_nid; -} kpr_gateway_entry_t; - -typedef struct -{ - struct list_head kpre_list; - kpr_gateway_entry_t *kpre_gateway; -} kpr_route_entry_t; - -typedef struct -{ - struct list_head kpne_list; - struct list_head kpne_routes; - __u32 kpne_net; - unsigned int kpne_hops; -} kpr_net_entry_t; - -typedef struct -{ - work_struct_t kpru_tq; - lnet_nid_t kpru_nid; - int kpru_alive; - time_t kpru_when; -} kpr_upcall_t; - -typedef struct{ - struct list_head kpr_nets; /* net -> gateways lookup */ - struct list_head kpr_gateways; /* known gateways */ - unsigned long long kpr_generation; /* validity stamp */ - rwlock_t kpr_rwlock; /* stabilize */ - - atomic_t kpr_queue_depth; /* packets being forwarded */ - - unsigned long long kpr_fwd_bytes; /* counters */ - unsigned long long kpr_fwd_packets; - unsigned long long kpr_fwd_errors; - spinlock_t kpr_stats_lock; /* serialise */ - -} kpr_state_t; - -extern kpr_state_t kpr_state; - -extern void kpr_proc_init (void); -extern void kpr_proc_fini (void); -#endif /* __KERNEL__ */ - -#endif /* _KPLROUTER_H */ diff --git 
a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index 27318db..a1ca3c7 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -21,112 +21,116 @@ * */ -#include "router.h" +#include #include #include -#define KPR_PROC_STATS "sys/lnet/router_stats" -#define KPR_PROC_ROUTES "sys/lnet/routes" +/* this is really lnet_proc.c */ + +#define LNET_PROC_STATS "sys/lnet/stats" +#define LNET_PROC_ROUTES "sys/lnet/routes" +#define LNET_PROC_PEERS "sys/lnet/peers" +#define LNET_PROC_BUFFERS "sys/lnet/buffers" +#define LNET_PROC_NIS "sys/lnet/nis" static int -kpr_proc_stats_read (char *page, char **start, off_t off, - int count, int *eof, void *data) +lnet_router_proc_stats_read (char *page, char **start, off_t off, + int count, int *eof, void *data) { - unsigned long long bytes; - unsigned long long packets; - unsigned long long errors; - unsigned int qdepth; - unsigned long flags; - + lnet_counters_t *ctrs; + int rc; + + PORTAL_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + *start = page; *eof = 1; if (off != 0) return 0; - spin_lock_irqsave(&kpr_state.kpr_stats_lock, flags); - - bytes = kpr_state.kpr_fwd_bytes; - packets = kpr_state.kpr_fwd_packets; - errors = kpr_state.kpr_fwd_errors; - qdepth = atomic_read(&kpr_state.kpr_queue_depth); - - spin_unlock_irqrestore(&kpr_state.kpr_stats_lock, flags); - - return sprintf(page, "%Ld %Ld %Ld %d\n", bytes, packets, errors, qdepth); + LNET_LOCK(); + *ctrs = the_lnet.ln_counters; + LNET_UNLOCK(); + + rc = sprintf(page, + "%u %u %u %u %u %u %u "LPU64" "LPU64" "LPU64" "LPU64"\n", + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + PORTAL_FREE(ctrs, sizeof(*ctrs)); + return rc; } static int -kpr_proc_stats_write(struct file *file, const char *ubuffer, +lnet_router_proc_stats_write(struct file *file, const char *ubuffer, unsigned long count, 
void *data) { - unsigned long flags; + LNET_LOCK(); + memset(&the_lnet.ln_counters, 0, sizeof(the_lnet.ln_counters)); + LNET_UNLOCK(); - spin_lock_irqsave(&kpr_state.kpr_stats_lock, flags); - - /* just zero the stats */ - kpr_state.kpr_fwd_bytes = 0; - kpr_state.kpr_fwd_packets = 0; - kpr_state.kpr_fwd_errors = 0; - - spin_unlock_irqrestore(&kpr_state.kpr_stats_lock, flags); return (count); } typedef struct { - unsigned long long sri_generation; - kpr_net_entry_t *sri_net; - kpr_route_entry_t *sri_route; - loff_t sri_off; -} kpr_seq_route_iterator_t; + unsigned long long lrsi_version; + lnet_remotenet_t *lrsi_net; + lnet_route_t *lrsi_route; + loff_t lrsi_off; +} lnet_route_seq_iterator_t; int -kpr_seq_routes_seek (kpr_seq_route_iterator_t *sri, loff_t off) +lnet_router_seq_seek (lnet_route_seq_iterator_t *lrsi, loff_t off) { struct list_head *n; struct list_head *r; int rc; - unsigned long flags; loff_t here; - read_lock_irqsave(&kpr_state.kpr_rwlock, flags); + LNET_LOCK(); - if (sri->sri_net != NULL && - sri->sri_generation != kpr_state.kpr_generation) { + if (lrsi->lrsi_net != NULL && + lrsi->lrsi_version != the_lnet.ln_remote_nets_version) { /* tables have changed */ rc = -ESTALE; goto out; } - if (sri->sri_net == NULL || sri->sri_off > off) { + if (lrsi->lrsi_net == NULL || lrsi->lrsi_off > off) { /* search from start */ - n = kpr_state.kpr_nets.next; + n = the_lnet.ln_remote_nets.next; r = NULL; here = 0; } else { /* continue search */ - n = &sri->sri_net->kpne_list; - r = &sri->sri_route->kpre_list; - here = sri->sri_off; + n = &lrsi->lrsi_net->lrn_list; + r = &lrsi->lrsi_route->lr_list; + here = lrsi->lrsi_off; } - sri->sri_generation = kpr_state.kpr_generation; - sri->sri_off = off; + lrsi->lrsi_version = the_lnet.ln_remote_nets_version; + lrsi->lrsi_off = off; - while (n != &kpr_state.kpr_nets) { - kpr_net_entry_t *ne = - list_entry(n, kpr_net_entry_t, kpne_list); + while (n != &the_lnet.ln_remote_nets) { + lnet_remotenet_t *rnet = + list_entry(n, 
lnet_remotenet_t, lrn_list); if (r == NULL) - r = ne->kpne_routes.next; + r = rnet->lrn_routes.next; - while (r != &ne->kpne_routes) { - kpr_route_entry_t *re = - list_entry(r, kpr_route_entry_t, - kpre_list); + while (r != &rnet->lrn_routes) { + lnet_route_t *re = + list_entry(r, lnet_route_t, + lr_list); if (here == off) { - sri->sri_net = ne; - sri->sri_route = re; + lrsi->lrsi_net = rnet; + lrsi->lrsi_route = re; rc = 0; goto out; } @@ -139,112 +143,599 @@ kpr_seq_routes_seek (kpr_seq_route_iterator_t *sri, loff_t off) n = n->next; } - sri->sri_net = NULL; - sri->sri_route = NULL; + lrsi->lrsi_net = NULL; + lrsi->lrsi_route = NULL; rc = -ENOENT; out: - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LNET_UNLOCK(); return rc; } static void * -kpr_seq_routes_start (struct seq_file *s, loff_t *pos) +lnet_router_seq_start (struct seq_file *s, loff_t *pos) { - kpr_seq_route_iterator_t *sri; - int rc; + lnet_route_seq_iterator_t *lrsi; + int rc; - PORTAL_ALLOC(sri, sizeof(*sri)); - if (sri == NULL) + PORTAL_ALLOC(lrsi, sizeof(*lrsi)); + if (lrsi == NULL) return NULL; - sri->sri_net = NULL; - rc = kpr_seq_routes_seek(sri, *pos); + lrsi->lrsi_net = NULL; + rc = lnet_router_seq_seek(lrsi, *pos); if (rc == 0) - return sri; + return lrsi; - PORTAL_FREE(sri, sizeof(*sri)); + PORTAL_FREE(lrsi, sizeof(*lrsi)); return NULL; } static void -kpr_seq_routes_stop (struct seq_file *s, void *iter) +lnet_router_seq_stop (struct seq_file *s, void *iter) { - kpr_seq_route_iterator_t *sri = iter; + lnet_route_seq_iterator_t *lrsi = iter; - if (sri != NULL) - PORTAL_FREE(sri, sizeof(*sri)); + if (lrsi != NULL) + PORTAL_FREE(lrsi, sizeof(*lrsi)); } static void * -kpr_seq_routes_next (struct seq_file *s, void *iter, loff_t *pos) +lnet_router_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_route_seq_iterator_t *lrsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = lnet_router_seq_seek(lrsi, next); + if (rc != 0) { + PORTAL_FREE(lrsi, sizeof(*lrsi)); + return 
NULL; + } + + *pos = next; + return lrsi; +} + +static int +lnet_router_seq_show (struct seq_file *s, void *iter) +{ + lnet_route_seq_iterator_t *lrsi = iter; + __u32 net; + unsigned int hops; + lnet_nid_t nid; + int alive; + + LASSERT (lrsi->lrsi_net != NULL); + LASSERT (lrsi->lrsi_route != NULL); + + LNET_LOCK(); + + if (lrsi->lrsi_version != the_lnet.ln_remote_nets_version) { + LNET_UNLOCK(); + return -ESTALE; + } + + net = lrsi->lrsi_net->lrn_net; + hops = lrsi->lrsi_net->lrn_hops; + nid = lrsi->lrsi_route->lr_gateway->lp_nid; + alive = lrsi->lrsi_route->lr_gateway->lp_alive; + + LNET_UNLOCK(); + + seq_printf(s, "%-8s %2u %7s %s\n", libcfs_net2str(net), hops, + alive ? "up" : "down", libcfs_nid2str(nid)); + return 0; +} + +static struct seq_operations lnet_routes_sops = { + .start = lnet_router_seq_start, + .stop = lnet_router_seq_stop, + .next = lnet_router_seq_next, + .show = lnet_router_seq_show, +}; + +static int +lnet_router_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_routes_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_routes_fops = { + .owner = THIS_MODULE, + .open = lnet_router_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + unsigned long long lpsi_version; + int lpsi_idx; + lnet_peer_t *lpsi_peer; + loff_t lpsi_off; +} lnet_peer_seq_iterator_t; + +int +lnet_peer_seq_seek (lnet_peer_seq_iterator_t *lpsi, loff_t off) { - kpr_seq_route_iterator_t *sri = iter; + int idx; + struct list_head *p; + loff_t here; + int rc; + + LNET_LOCK(); + + if (lpsi->lpsi_peer != NULL && + lpsi->lpsi_version != the_lnet.ln_peertable_version) { + /* tables have changed */ + rc = -ESTALE; + goto out; + } + + if (lpsi->lpsi_peer == NULL || + lpsi->lpsi_off > off) { + /* search from start */ + idx = 0; + p = NULL; + here = 
0; + } else { + /* continue search */ + idx = lpsi->lpsi_idx; + p = &lpsi->lpsi_peer->lp_hashlist; + here = lpsi->lpsi_off; + } + + lpsi->lpsi_version = the_lnet.ln_peertable_version; + lpsi->lpsi_off = off; + + while (idx < LNET_PEER_HASHSIZE) { + if (p == NULL) + p = the_lnet.ln_peer_hash[idx].next; + + while (p != &the_lnet.ln_peer_hash[idx]) { + lnet_peer_t *lp = list_entry(p, lnet_peer_t, + lp_hashlist); + + if (here == off) { + lpsi->lpsi_idx = idx; + lpsi->lpsi_peer = lp; + rc = 0; + goto out; + } + + here++; + p = lp->lp_hashlist.next; + } + + p = NULL; + idx++; + } + + lpsi->lpsi_idx = 0; + lpsi->lpsi_peer = NULL; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_peer_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_peer_seq_iterator_t *lpsi; + int rc; + + PORTAL_ALLOC(lpsi, sizeof(*lpsi)); + if (lpsi == NULL) + return NULL; + + lpsi->lpsi_idx = 0; + lpsi->lpsi_peer = NULL; + rc = lnet_peer_seq_seek(lpsi, *pos); + if (rc == 0) + return lpsi; + + PORTAL_FREE(lpsi, sizeof(*lpsi)); + return NULL; +} + +static void +lnet_peer_seq_stop (struct seq_file *s, void *iter) +{ + lnet_peer_seq_iterator_t *lpsi = iter; + + if (lpsi != NULL) + PORTAL_FREE(lpsi, sizeof(*lpsi)); +} + +static void * +lnet_peer_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_peer_seq_iterator_t *lpsi = iter; int rc; loff_t next = *pos + 1; - rc = kpr_seq_routes_seek(sri, next); + rc = lnet_peer_seq_seek(lpsi, next); if (rc != 0) { - PORTAL_FREE(sri, sizeof(*sri)); + PORTAL_FREE(lpsi, sizeof(*lpsi)); return NULL; } *pos = next; - return sri; + return lpsi; } static int -kpr_seq_routes_show (struct seq_file *s, void *iter) +lnet_peer_seq_show (struct seq_file *s, void *iter) { - kpr_seq_route_iterator_t *sri = iter; - unsigned long flags; - __u32 net; - unsigned int hops; + lnet_peer_seq_iterator_t *lpsi = iter; + lnet_peer_t *lp; lnet_nid_t nid; + int maxcr; + int mintxcr; + int txcr; + int minrtrcr; + int rtrcr; int alive; - int ignored; - 
- read_lock_irqsave(&kpr_state.kpr_rwlock, flags); + int txqnob; + int nrefs; + + LASSERT (lpsi->lpsi_peer != NULL); - LASSERT (sri->sri_net != NULL); - LASSERT (sri->sri_route != NULL); + LNET_LOCK(); - if (sri->sri_generation != kpr_state.kpr_generation) { - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + if (lpsi->lpsi_version != the_lnet.ln_peertable_version) { + LNET_UNLOCK(); return -ESTALE; } - net = sri->sri_net->kpne_net; - hops = sri->sri_net->kpne_hops; - nid = sri->sri_route->kpre_gateway->kpge_nid; - alive = sri->sri_route->kpre_gateway->kpge_alive; - ignored = lnet_islocalnet(sri->sri_net->kpne_net, NULL) || - !lnet_islocalnet(sri->sri_route->kpre_gateway->kpge_nid, NULL); + lp = lpsi->lpsi_peer; + + nid = lp->lp_nid; + maxcr = lp->lp_ni->ni_peertxcredits; + txcr = lp->lp_txcredits; + mintxcr = lp->lp_mintxcredits; + rtrcr = lp->lp_rtrcredits; + minrtrcr = lp->lp_minrtrcredits; + alive = lp->lp_alive; + txqnob = lp->lp_txqnob; + nrefs = lp->lp_refcount; + + LNET_UNLOCK(); + + seq_printf(s, "%-16s [%3d] %4s %3d rtr %3d %3d tx %3d %3d # %d\n", + libcfs_nid2str(nid), nrefs, alive ? "up" : "down", + maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); + return 0; +} - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); +static struct seq_operations lnet_peer_sops = { + .start = lnet_peer_seq_start, + .stop = lnet_peer_seq_stop, + .next = lnet_peer_seq_next, + .show = lnet_peer_seq_show, +}; - seq_printf(s, "%-8s %2u %7s %s\n", - libcfs_net2str(net), hops, - ignored ? "ignored" : - alive ? 
"up" : "down", - libcfs_nid2str(nid)); +static int +lnet_peer_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_peer_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_peer_fops = { + .owner = THIS_MODULE, + .open = lnet_peer_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + int lbsi_idx; + loff_t lbsi_off; +} lnet_buffer_seq_iterator_t; + +int +lnet_buffer_seq_seek (lnet_buffer_seq_iterator_t *lbsi, loff_t off) +{ + int idx; + loff_t here; + int rc; + + LNET_LOCK(); + + if (lbsi->lbsi_idx < 0 || + lbsi->lbsi_off > off) { + /* search from start */ + idx = 0; + here = 0; + } else { + /* continue search */ + idx = lbsi->lbsi_idx; + here = lbsi->lbsi_off; + } + + lbsi->lbsi_off = off; + + while (idx < LNET_NRBPOOLS) { + if (here == off) { + lbsi->lbsi_idx = idx; + rc = 0; + goto out; + } + here++; + idx++; + } + + lbsi->lbsi_idx = -1; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_buffer_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_buffer_seq_iterator_t *lbsi; + int rc; + + PORTAL_ALLOC(lbsi, sizeof(*lbsi)); + if (lbsi == NULL) + return NULL; + + lbsi->lbsi_idx = -1; + rc = lnet_buffer_seq_seek(lbsi, *pos); + if (rc == 0) + return lbsi; + + PORTAL_FREE(lbsi, sizeof(*lbsi)); + return NULL; +} + +static void +lnet_buffer_seq_stop (struct seq_file *s, void *iter) +{ + lnet_buffer_seq_iterator_t *lbsi = iter; + + if (lbsi != NULL) + PORTAL_FREE(lbsi, sizeof(*lbsi)); +} + +static void * +lnet_buffer_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_buffer_seq_iterator_t *lbsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = lnet_buffer_seq_seek(lbsi, next); + if (rc != 0) { + PORTAL_FREE(lbsi, sizeof(*lbsi)); + return NULL; + } + + *pos = next; + return lbsi; +} + 
+static int +lnet_buffer_seq_show (struct seq_file *s, void *iter) +{ + lnet_buffer_seq_iterator_t *lbsi = iter; + lnet_rtrbufpool_t *rbp; + int npages; + int nbuf; + int cr; + int mincr; + + LASSERT (lbsi->lbsi_idx >= 0 && lbsi->lbsi_idx < LNET_NRBPOOLS); + + LNET_LOCK(); + + rbp = &the_lnet.ln_rtrpools[lbsi->lbsi_idx]; + + npages = rbp->rbp_npages; + nbuf = rbp->rbp_nbuffers; + cr = rbp->rbp_credits; + mincr = rbp->rbp_mincredits; + + LNET_UNLOCK(); + + seq_printf(s, "[%d] %4d x %3d %5d %5d\n", lbsi->lbsi_idx, + npages, nbuf, cr, mincr); + return 0; +} + +static struct seq_operations lnet_buffer_sops = { + .start = lnet_buffer_seq_start, + .stop = lnet_buffer_seq_stop, + .next = lnet_buffer_seq_next, + .show = lnet_buffer_seq_show, +}; + +static int +lnet_buffer_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_buffer_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_buffers_fops = { + .owner = THIS_MODULE, + .open = lnet_buffer_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + lnet_ni_t *lnsi_ni; + loff_t lnsi_off; +} lnet_ni_seq_iterator_t; + +int +lnet_ni_seq_seek (lnet_ni_seq_iterator_t *lnsi, loff_t off) +{ + struct list_head *n; + loff_t here; + int rc; + + LNET_LOCK(); + + if (lnsi->lnsi_off > off) { + /* search from start */ + n = NULL; + here = 0; + } else { + /* continue search */ + n = &lnsi->lnsi_ni->ni_list; + here = lnsi->lnsi_off; + } + + lnsi->lnsi_off = off; + + if (n == NULL) + n = the_lnet.ln_nis.next; + + while (n != &the_lnet.ln_nis) { + if (here == off) { + lnsi->lnsi_ni = list_entry(n, lnet_ni_t, ni_list); + rc = 0; + goto out; + } + here++; + n = n->next; + } + + lnsi->lnsi_ni = NULL; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_ni_seq_start (struct
seq_file *s, loff_t *pos) +{ + lnet_ni_seq_iterator_t *lnsi; + int rc; + + PORTAL_ALLOC(lnsi, sizeof(*lnsi)); + if (lnsi == NULL) + return NULL; + + lnsi->lnsi_ni = NULL; + rc = lnet_ni_seq_seek(lnsi, *pos); + if (rc == 0) + return lnsi; + + PORTAL_FREE(lnsi, sizeof(*lnsi)); + return NULL; +} + +static void +lnet_ni_seq_stop (struct seq_file *s, void *iter) +{ + lnet_ni_seq_iterator_t *lnsi = iter; + + if (lnsi != NULL) + PORTAL_FREE(lnsi, sizeof(*lnsi)); +} + +static void * +lnet_ni_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_ni_seq_iterator_t *lnsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = lnet_ni_seq_seek(lnsi, next); + if (rc != 0) { + PORTAL_FREE(lnsi, sizeof(*lnsi)); + return NULL; + } + + *pos = next; + return lnsi; +} + +static int +lnet_ni_seq_show (struct seq_file *s, void *iter) +{ + lnet_ni_seq_iterator_t *lnsi = iter; + lnet_ni_t *ni; + int maxtxcr; + int txcr; + int mintxcr; + int npeertxcr; + lnet_nid_t nid; + int nref; + + LASSERT (lnsi->lnsi_ni != NULL); + + LNET_LOCK(); + + ni = lnsi->lnsi_ni; + + maxtxcr = ni->ni_maxtxcredits; + txcr = ni->ni_txcredits; + mintxcr = ni->ni_mintxcredits; + npeertxcr = ni->ni_peertxcredits; + nid = ni->ni_nid; + nref = ni->ni_refcount; + + LNET_UNLOCK(); + + seq_printf(s, "%-16s [%3d] %4d %5d %5d %5d\n", + libcfs_nid2str(nid), nref, npeertxcr, maxtxcr, txcr, mintxcr); return 0; } -static struct seq_operations kpr_routes_sops = { - .start = kpr_seq_routes_start, - .stop = kpr_seq_routes_stop, - .next = kpr_seq_routes_next, - .show = kpr_seq_routes_show, +static struct seq_operations lnet_ni_sops = { + .start = lnet_ni_seq_start, + .stop = lnet_ni_seq_stop, + .next = lnet_ni_seq_next, + .show = lnet_ni_seq_show, }; static int -kpr_seq_routes_open(struct inode *inode, struct file *file) +lnet_ni_seq_open(struct inode *inode, struct file *file) { struct proc_dir_entry *dp = PDE(inode); struct seq_file *sf; int rc; - rc = seq_open(file, &kpr_routes_sops); + rc = seq_open(file, 
&lnet_ni_sops); if (rc == 0) { sf = file->private_data; sf->private = dp->data; @@ -253,45 +744,79 @@ kpr_seq_routes_open(struct inode *inode, struct file *file) return rc; } -static struct file_operations kpr_routes_fops = { +static struct file_operations lnet_ni_fops = { .owner = THIS_MODULE, - .open = kpr_seq_routes_open, + .open = lnet_ni_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; void -kpr_proc_init(void) +lnet_proc_init(void) { struct proc_dir_entry *stats; struct proc_dir_entry *routes; + struct proc_dir_entry *peers; - /* Initialize KPR_PROC_STATS */ - stats = create_proc_entry (KPR_PROC_STATS, 0644, NULL); + /* Initialize LNET_PROC_STATS */ + stats = create_proc_entry (LNET_PROC_STATS, 0644, NULL); if (stats == NULL) { - CERROR("couldn't create proc entry %s\n", KPR_PROC_STATS); + CERROR("couldn't create proc entry %s\n", LNET_PROC_STATS); return; } stats->data = NULL; - stats->read_proc = kpr_proc_stats_read; - stats->write_proc = kpr_proc_stats_write; + stats->read_proc = lnet_router_proc_stats_read; + stats->write_proc = lnet_router_proc_stats_write; - /* Initialize KPR_PROC_ROUTES */ - routes = create_proc_entry (KPR_PROC_ROUTES, 0444, NULL); + /* Initialize LNET_PROC_ROUTES */ + routes = create_proc_entry (LNET_PROC_ROUTES, 0444, NULL); if (routes == NULL) { - CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTES); + CERROR("couldn't create proc entry %s\n", LNET_PROC_ROUTES); return; } - routes->proc_fops = &kpr_routes_fops; + routes->proc_fops = &lnet_routes_fops; routes->data = NULL; + + /* Initialize LNET_PROC_PEERS */ + peers = create_proc_entry (LNET_PROC_PEERS, 0444, NULL); + if (peers == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_PEERS); + return; + } + + peers->proc_fops = &lnet_peer_fops; + peers->data = NULL; + + /* Initialize LNET_PROC_BUFFERS */ + peers = create_proc_entry (LNET_PROC_BUFFERS, 0444, NULL); + if (peers == NULL) { + CERROR("couldn't create proc entry %s\n", 
LNET_PROC_BUFFERS); + return; + } + + peers->proc_fops = &lnet_buffers_fops; + peers->data = NULL; + + /* Initialize LNET_PROC_NIS */ + peers = create_proc_entry (LNET_PROC_NIS, 0444, NULL); + if (peers == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_NIS); + return; + } + + peers->proc_fops = &lnet_ni_fops; + peers->data = NULL; } void -kpr_proc_fini(void) +lnet_proc_fini(void) { - remove_proc_entry(KPR_PROC_STATS, 0); - remove_proc_entry(KPR_PROC_ROUTES, 0); + remove_proc_entry(LNET_PROC_STATS, 0); + remove_proc_entry(LNET_PROC_ROUTES, 0); + remove_proc_entry(LNET_PROC_PEERS, 0); + remove_proc_entry(LNET_PROC_BUFFERS, 0); + remove_proc_entry(LNET_PROC_NIS, 0); } diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c index 613dbda..c237ba3 100644 --- a/lnet/ulnds/socklnd/tcplnd.c +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -118,7 +118,7 @@ int tcpnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) if (rc == 0) { /* NB the NAL only calls lnet_finalize() if it returns 0 * from cb_send() */ - lnet_finalize(ni, private, lntmsg, 0); + lnet_finalize(ni, lntmsg, 0); } return(rc); @@ -157,7 +157,7 @@ int tcpnal_recv(lnet_ni_t *ni, finalize: /* FIXME; we always assume success here... 
*/ - lnet_finalize(ni, private, cookie, 0); + lnet_finalize(ni, cookie, 0); LASSERT(rlen >= mlen); @@ -194,7 +194,7 @@ static int from_connection(void *a, void *d) hdr.dest_nid = cpu_to_le64(b->b_ni->ni_nid); hdr.dest_pid = cpu_to_le32(the_lnet.ln_pid); - rc = lnet_parse(b->b_ni, &hdr, c); + rc = lnet_parse(b->b_ni, &hdr, c->peer_nid, c); if (rc < 0) { CERROR("Error %d from lnet_parse\n", rc); return 0; diff --git a/lnet/utils/lbstats b/lnet/utils/lbstats new file mode 100755 index 0000000..5b77ad4 --- /dev/null +++ b/lnet/utils/lbstats @@ -0,0 +1,14 @@ +#!/bin/bash + +echo "=== Router Buffers ===========================" +echo +test -e /proc/sys/lnet/buffers && cat /proc/sys/lnet/buffers +echo +echo "=== NIs ======================================" +echo +test -e /proc/sys/lnet/nis && cat /proc/sys/lnet/nis +echo +echo "=== Peers ====================================" +echo +test -e /proc/sys/lnet/peers && cat /proc/sys/lnet/peers + diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 52b8be9..4617621 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -284,7 +284,7 @@ int jt_ptl_network(int argc, char **argv) rc = l_ioctl(LNET_DEV_ID, IOC_PORTAL_UNCONFIGURE, &data); if (rc == 0) { - printf ("portals ready to unload\n"); + printf ("lnet ready to unload\n"); return 0; } @@ -1097,7 +1097,6 @@ jt_ptl_print_routes (int argc, char **argv) lnet_nid_t nid; unsigned int hops; int alive; - int ignored; for (index = 0;;index++) { @@ -1111,14 +1110,11 @@ jt_ptl_print_routes (int argc, char **argv) net = data.ioc_net; hops = data.ioc_count; nid = data.ioc_nid; - alive = data.ioc_flags & 1; - ignored = data.ioc_flags & 2; + alive = data.ioc_flags; printf ("net %18s hops %u gw %32s %s\n", libcfs_net2str(net), hops, - libcfs_nid2str(nid), - ignored ? "" : - alive ? "up" : "down"); + libcfs_nid2str(nid), alive ? 
"up" : "down"); } if (errno != ENOENT) diff --git a/lnet/utils/routerstat.c b/lnet/utils/routerstat.c index 587901b..3f6263b 100644 --- a/lnet/utils/routerstat.c +++ b/lnet/utils/routerstat.c @@ -16,20 +16,56 @@ timenow () return (tv.tv_sec + tv.tv_usec / 1000000.0); } +typedef struct { + unsigned long msgs_alloc; + unsigned long msgs_max; + unsigned long errors; + unsigned long send_count; + unsigned long recv_count; + unsigned long route_count; + unsigned long drop_count; + unsigned long long send_length; + unsigned long long recv_length; + unsigned long long route_length; + unsigned long long drop_length; +} counters_t; + +unsigned long long subull(unsigned long long a, unsigned long long b) +{ + if (a < b) + return -1ULL - b + a + 1; + + return a - b; +} + +unsigned long long subul(unsigned long a, unsigned long b) +{ + if (a < b) + return -1UL - b + a + 1; + + return a - b; +} + +double rul(unsigned long a, double secs) +{ + return (double)a/secs; +} + +double rull(unsigned long long a, double secs) +{ + return (double)a/secs; +} + void do_stat (int fd) { static char buffer[1024]; static double last = 0.0; - static unsigned long long old_bytes; - static unsigned long long old_packets; - static unsigned long long old_errors; + static counters_t old_counter; double now; double t; - unsigned long long new_bytes, bytes; - unsigned long long new_packets, packets; - unsigned long long new_errors, errors; - unsigned long long depth; + counters_t new_counter; + counters_t counter; int n; lseek (fd, 0, SEEK_SET); @@ -42,53 +78,53 @@ do_stat (int fd) } buffer[n] = 0; - n = sscanf (buffer, "%Lu %Lu %Lu %Lu", - &new_bytes, &new_packets, &new_errors, &depth); - - if (n < 3) + n = sscanf (buffer, "%lu %lu %lu %lu %lu %lu %lu %Lu %Lu %Lu %Lu", + &new_counter.msgs_alloc, &new_counter.msgs_max, + &new_counter.errors, + &new_counter.send_count, &new_counter.recv_count, + &new_counter.route_count, &new_counter.drop_count, + &new_counter.send_length, &new_counter.recv_length, 
&new_counter.route_length, &new_counter.drop_length); + if (n < 11) { fprintf (stderr, "Can't parse statfile\n"); exit (1); } - if (last == 0.0) - printf ("%llu bytes, %llu packets (sz %lld), %llu errors", - new_bytes, new_packets, - ((new_packets == 0) ? 0LL : new_bytes/new_packets), - new_errors); - else - { - t = now - last; + if (last == 0.0) { + printf ("M %lu(%lu) E %lu S %lu/%llu R %lu/%llu F %lu/%llu D %lu/%llu\n", + new_counter.msgs_alloc, new_counter.msgs_max, + new_counter.errors, + new_counter.send_count, new_counter.send_length, + new_counter.recv_count, new_counter.recv_length, + new_counter.route_count, new_counter.route_length, + new_counter.drop_count, new_counter.drop_length); + } else { + t = now - last; - if (new_bytes < old_bytes) - bytes = -1ULL - old_bytes + new_bytes + 1; - else - bytes = new_bytes - old_bytes; + counter.msgs_alloc = new_counter.msgs_alloc; + counter.msgs_max = new_counter.msgs_max; + + counter.errors = subul(new_counter.errors, old_counter.errors); + counter.send_count = subul(new_counter.send_count, old_counter.send_count); + counter.recv_count = subul(new_counter.recv_count, old_counter.recv_count); + counter.route_count = subul(new_counter.route_count, old_counter.route_count); + counter.drop_count = subul(new_counter.drop_count, old_counter.drop_count); + counter.send_length = subull(new_counter.send_length, old_counter.send_length); + counter.recv_length = subull(new_counter.recv_length, old_counter.recv_length); + counter.route_length = subull(new_counter.route_length, old_counter.route_length); + counter.drop_length = subull(new_counter.drop_length, old_counter.drop_length); - if (new_packets < old_packets) - packets = -1UL - old_packets + new_packets + 1; - else - packets = new_packets - old_packets; - - if (new_errors < old_errors) - errors = -1UL - old_errors + new_errors + 1; - else - errors = new_errors - old_errors; - - printf ("%9llu bytes (%7.2fMb/s), %7llu packets (sz %5lld, %5.0f/s), %llu errors (%0.0f/s)", 
- bytes, ((double)bytes)/((1<<20) * t), - packets, (packets == 0) ? 0LL : bytes/packets, packets/t, - errors, errors/t); + printf ("M %lu(%lu) E %0.0f S %7.2f/%6.0f R %7.2f/%6.0f F %7.2f/%6.0f D %4.2f/%0.0f\n", + counter.msgs_alloc, counter.msgs_max, + rul(counter.errors,t), + rull(counter.send_length,t*1024.0*1024.0), rul(counter.send_count, t), + rull(counter.recv_length,t*1024.0*1024.0), rul(counter.recv_count, t), + rull(counter.route_length,t*1024.0*1024.0), rul(counter.route_count, t), + rull(counter.drop_length,t*1024.0*1024.0), rul(counter.drop_count, t)); } - old_bytes = new_bytes; - old_packets = new_packets; - old_errors = new_errors; - - if (n == 4) - printf (", depth (%lld)\n", depth); - else - printf ("\n"); + old_counter = new_counter; fflush (stdout); lseek (fd, 0, SEEK_SET); @@ -103,7 +139,7 @@ int main (int argc, char **argv) if (argc > 1) interval = atoi (argv[1]); - fd = open ("/proc/sys/lnet/router_stats", O_RDONLY); + fd = open ("/proc/sys/lnet/stats", O_RDONLY); if (fd < 0) { fprintf (stderr, "Can't open stat: %s\n", strerror (errno)); -- 1.8.3.1