From: isaac Date: Tue, 17 Feb 2009 05:21:21 +0000 (+0000) Subject: b=16186,i=liangzhen,i=maxim: X-Git-Tag: v1_9_162~52 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=4588fda6626ef5f8b043c185be69211ddfae57b1 b=16186,i=liangzhen,i=maxim: - shared routing enhancements: peer health detection. --- diff --git a/lnet/ChangeLog b/lnet/ChangeLog index d0a0f50..e70bd23 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -17,6 +17,11 @@ Bugzilla : Description: Details : +Severity : normal +Bugzilla : 16186 +Description: One down Lustre FS hangs ALL mounted Lustre filesystems +Details : Shared routing enhancements - peer health detection. + Severity : enhancement Bugzilla : 14132 Description: acceptor.c cleanup diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index cdbfb89..452a325 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -538,6 +538,7 @@ lnet_net2ni (__u32 net) } int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when); +void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when); int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid); int lnet_check_routes(void); int lnet_del_route(__u32 net, lnet_nid_t gw_nid); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index f518278..9c8b498 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -358,6 +358,9 @@ typedef struct lnet_lnd /* notification of peer health */ void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time_t *when); + #if defined(__KERNEL__) || defined(HAVE_LIBPTHREAD) /* accept a new connection */ int (*lnd_accept)(struct lnet_ni *ni, cfs_socket_t *sock); @@ -381,6 +384,7 @@ typedef struct lnet_ni { int ni_txcredits; /* # tx credits free */ int ni_mintxcredits; /* lowest it's been */ int ni_peertxcredits; /* # per-peer send credits */ + int ni_peertimeout; /* seconds to consider peer dead */ lnet_nid_t ni_nid; /* interface's NID */ void *ni_data; /* instance-specific data */ lnd_t *ni_lnd; /* procedural interface */ @@ -405,6 +409,7 @@ typedef struct lnet_peer { int lp_alive_count; /* # times router went dead<->alive */ long lp_txqnob; /* bytes queued for sending */ time_t lp_timestamp; /* time of last aliveness news */ + time_t lp_last_alive; /* when I was last alive */ time_t lp_ping_timestamp; /* time of last ping attempt */ time_t lp_ping_deadline; /* != 0 if ping reply expected */ lnet_ni_t *lp_ni; /* interface peer is on */ diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 4a40e06..440587e 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -45,6 +45,7 @@ lnd_t the_kiblnd = { .lnd_startup = kiblnd_startup, .lnd_shutdown = kiblnd_shutdown, .lnd_ctl = kiblnd_ctl, + .lnd_query = kiblnd_query, .lnd_send = kiblnd_send, .lnd_recv = kiblnd_recv, }; @@ -320,7 +321,7 @@ kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) peer->ibp_ni = ni; peer->ibp_nid = nid; peer->ibp_error = 0; - peer->ibp_last_alive = cfs_time_current(); + peer->ibp_last_alive = 0; atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ @@ -1075,6 +1076,37 @@ kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) } void +kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) +{ + cfs_time_t last_alive = 0; + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_t *peer; + unsigned long flags; + + read_lock_irqsave(glock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + LASSERT (peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(glock, flags); + + if (last_alive != 0) + *when = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - last_alive); + + /* peer is not persistent in hash, trigger peer creation + * and connection establishment with a NULL tx */ + if (peer == NULL) + kiblnd_launch_tx(ni, NULL, nid); + return; +} + +void kiblnd_free_pages (kib_pages_t *p) { int npages = p->ibp_npages; @@ -1517,6 +1549,7 @@ kiblnd_startup (lnet_ni_t *ni) ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits; ni->ni_peertxcredits = *kiblnd_tunables.kib_peercredits; + ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout; spin_lock_init(&net->ibn_tx_lock); INIT_LIST_HEAD(&net->ibn_idle_txs); diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 7276c9a..57cc3fa 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -137,6 +137,7 @@ typedef struct int *kib_ntx; /* # tx descs */ int *kib_credits; /* # concurrent sends */ int *kib_peercredits; /* # concurrent sends to 1 peer */ + int *kib_peertimeout; /* seconds to consider peer dead */ char **kib_default_ipif; /* default IPoIB interface */ int *kib_retry_count; int *kib_rnr_retry_count; @@ -721,6 +722,7 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, int kiblnd_startup (lnet_ni_t *ni); void kiblnd_shutdown (lnet_ni_t *ni); int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); +void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when); int kiblnd_tunables_init(void); void kiblnd_tunables_fini(void); @@ -754,6 +756,7 @@ void kiblnd_close_conn_locked (kib_conn_t *conn, int error); int kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type, int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); +void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid); void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn); void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn); void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 8851a32..4abb588 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -1406,8 +1406,8 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ + LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ /* First time, just use a read lock since I expect to find my peer * connected */ @@ -1421,7 +1421,8 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) read_unlock_irqrestore(g_lock, flags); - kiblnd_queue_tx(tx, conn); + if (tx != NULL) + kiblnd_queue_tx(tx, conn); kiblnd_conn_decref(conn); /* ...to here */ return; } @@ -1436,15 +1437,17 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) /* found a peer, but it's still connecting... */ LASSERT (peer->ibp_connecting != 0 || peer->ibp_accepting != 0); - list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); write_unlock_irqrestore(g_lock, flags); } else { conn = kiblnd_get_conn_locked(peer); kiblnd_conn_addref(conn); /* 1 ref for me... */ write_unlock_irqrestore(g_lock, flags); - - kiblnd_queue_tx(tx, conn); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); kiblnd_conn_decref(conn); /* ...to here */ } return; @@ -1456,9 +1459,11 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) rc = kiblnd_create_peer(ni, &peer, nid); if (rc != 0) { CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kiblnd_tx_done(ni, tx); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(ni, tx); + } return; } @@ -1470,15 +1475,17 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) /* found a peer, but it's still connecting... */ LASSERT (peer2->ibp_connecting != 0 || peer2->ibp_accepting != 0); - list_add_tail (&tx->tx_list, &peer2->ibp_tx_queue); + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer2->ibp_tx_queue); write_unlock_irqrestore(g_lock, flags); } else { conn = kiblnd_get_conn_locked(peer2); kiblnd_conn_addref(conn); /* 1 ref for me... */ write_unlock_irqrestore(g_lock, flags); - - kiblnd_queue_tx(tx, conn); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); kiblnd_conn_decref(conn); /* ...to here */ } @@ -1493,7 +1500,8 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) /* always called with a ref on ni, which prevents ni being shutdown */ LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); - list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); kiblnd_peer_addref(peer); list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 6af4e85..11faf4e 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -64,6 +64,10 @@ static int peer_credits = 8; CFS_MODULE_PARM(peer_credits, "i", int, 0444, "# concurrent sends to 1 peer"); +static int peer_timeout = 0; +CFS_MODULE_PARM(peer_timeout, "i", int, 0444, + "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + static char *ipif_name = "ib0"; CFS_MODULE_PARM(ipif_name, "s", charp, 0444, "IPoIB interface name"); @@ -114,6 +118,7 @@ kib_tunables_t kiblnd_tunables = { .kib_ntx = &ntx, .kib_credits = &credits, .kib_peercredits = &peer_credits, + .kib_peertimeout = &peer_timeout, .kib_default_ipif = &ipif_name, .kib_retry_count = &retry_count, .kib_rnr_retry_count = &rnr_retry_count, @@ -139,6 +144,7 @@ enum { O2IBLND_NTX, O2IBLND_CREDITS, O2IBLND_PEER_CREDITS, + O2IBLND_PEER_TIMEOUT, O2IBLND_IPIF_BASENAME, O2IBLND_RETRY_COUNT, O2IBLND_RNR_RETRY_COUNT, @@ -157,6 +163,7 @@ enum { #define O2IBLND_NTX CTL_UNNUMBERED #define O2IBLND_CREDITS CTL_UNNUMBERED #define O2IBLND_PEER_CREDITS CTL_UNNUMBERED +#define O2IBLND_PEER_TIMEOUT CTL_UNNUMBERED #define O2IBLND_IPIF_BASENAME CTL_UNNUMBERED #define O2IBLND_RETRY_COUNT CTL_UNNUMBERED #define O2IBLND_RNR_RETRY_COUNT CTL_UNNUMBERED @@ -219,6 +226,14 @@ static cfs_sysctl_table_t kiblnd_ctl_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = O2IBLND_PEER_TIMEOUT, + .procname = "peer_timeout", + .data = &peer_timeout, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { .ctl_name = O2IBLND_IPIF_BASENAME, .procname = "ipif_name", .data = ipif_basename_space, diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c index ffa0a2e..760c8ba 100755 --- a/lnet/klnds/ptllnd/ptllnd.c +++ b/lnet/klnds/ptllnd/ptllnd.c @@ -45,6 +45,7 @@ lnd_t kptllnd_lnd = { .lnd_startup = kptllnd_startup, .lnd_shutdown = kptllnd_shutdown, .lnd_ctl = kptllnd_ctl, + .lnd_query = kptllnd_query, .lnd_send = kptllnd_send, .lnd_recv = kptllnd_recv, .lnd_eager_recv = kptllnd_eager_recv, @@ -476,6 +477,27 @@ kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) return rc; } +void +kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) +{ + kptl_peer_t *peer = NULL; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + unsigned long flags; + + /* NB: kptllnd_find_target connects to peer if necessary */ + if (kptllnd_find_target(&peer, id) != 0) + return; + + spin_lock_irqsave(&peer->peer_lock, flags); + if (peer->peer_last_alive != 0) + *when = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->peer_last_alive); + spin_unlock_irqrestore(&peer->peer_lock, flags); + kptllnd_peer_decref(peer); + return; +} + int kptllnd_startup (lnet_ni_t *ni) { diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index 18e5b2a..8b25d0f 100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -339,6 +339,7 @@ kptllnd_lnet2ptlnid(lnet_nid_t lnet_nid) int kptllnd_startup(lnet_ni_t *ni); void kptllnd_shutdown(lnet_ni_t *ni); int kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +void kptllnd_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when); int kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); int kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, unsigned int niov, diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index 4ad2661..1a5c383 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -187,7 +187,7 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid) peer->peer_state = PEER_STATE_ALLOCATED; peer->peer_error = 0; - peer->peer_last_alive = cfs_time_current(); + peer->peer_last_alive = 0; peer->peer_id = lpid; peer->peer_ptlid = ppid; peer->peer_credits = 1; /* enough for HELLO */ diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 1e8128c..d68d919 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -121,6 +121,7 @@ ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) peer->ksnp_closing = 0; peer->ksnp_accepting = 0; peer->ksnp_proto = NULL; + peer->ksnp_last_alive = 0; peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; CFS_INIT_LIST_HEAD (&peer->ksnp_conns); @@ -1788,6 +1789,62 @@ ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) } void +ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) +{ + int connect = 1; + cfs_time_t last_alive = 0; + ksock_peer_t *peer = NULL; + rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + + read_lock(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + struct list_head *tmp; + ksock_conn_t *conn; + int bufnob; + + list_for_each (tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + bufnob = libcfs_sock_wmem_queued(conn->ksnc_sock); + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + } + } + + last_alive = peer->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer) == NULL) + connect = 0; + } + + read_unlock(glock); + + if (last_alive != 0) + *when = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - last_alive); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + ksocknal_launch_all_connections_locked(peer); + + write_unlock_bh(glock); + return; +} + +void ksocknal_push_peer (ksock_peer_t *peer) { int index; @@ -2545,6 +2602,7 @@ ksocknal_startup (lnet_ni_t *ni) ni->ni_data = net; ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peercredits; + ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; if (ni->ni_interfaces[0] == NULL) { rc = ksocknal_enumerate_interfaces(net); @@ -2620,6 +2678,7 @@ ksocknal_module_init (void) the_ksocklnd.lnd_send = ksocknal_send; the_ksocklnd.lnd_recv = ksocknal_recv; the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; the_ksocklnd.lnd_accept = ksocknal_accept; rc = ksocknal_tunables_init(); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index a18d47d..3ad8f75 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -116,6 +116,7 @@ typedef struct int *ksnd_keepalive_intvl; /* time between probes */ int *ksnd_credits; /* # concurrent sends */ int *ksnd_peercredits; /* # concurrent sends to 1 peer */ + int *ksnd_peertimeout; /* seconds to consider peer dead */ int *ksnd_enable_csum; /* enable check sum */ int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ @@ -329,6 +330,7 @@ typedef struct ksock_route typedef struct ksock_peer { struct list_head ksnp_list; /* stash on global peer list */ + cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ lnet_process_id_t ksnp_id; /* who's on the other end(s) */ cfs_atomic_t ksnp_refcount; /* # users */ int ksnp_sharecount; /* lconf usage counter */ @@ -343,7 +345,6 @@ typedef struct ksock_peer struct list_head ksnp_tx_queue; /* waiting packets */ cfs_spinlock_t ksnp_lock; /* serialize, NOT safe in g_lock */ struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */ - cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ cfs_time_t ksnp_send_keepalive; /* time to send keepalive */ lnet_ni_t *ksnp_ni; /* which network */ int ksnp_n_passive_ips; /* # of... */ @@ -546,8 +547,11 @@ extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error); extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when); extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); extern void ksocknal_thread_fini (void); +extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer); extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); extern int ksocknal_scheduler (void *arg); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index d73314c..e12800b 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -602,6 +602,22 @@ ksocknal_launch_connection_locked (ksock_route_t *route) cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); } +void +ksocknal_launch_all_connections_locked (ksock_peer_t *peer) +{ + ksock_route_t *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + ksock_conn_t * ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk) { @@ -718,6 +734,8 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) /* First packet starts the timeout */ conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); conn->ksnc_tx_bufnob = 0; cfs_mb(); /* order with adding to tx_queue */ } @@ -813,7 +831,6 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) { ksock_peer_t *peer; ksock_conn_t *conn; - ksock_route_t *route; cfs_rwlock_t *g_lock; int retry; int rc; @@ -871,14 +888,7 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) } } - for (;;) { - /* launch any/all connections that need it */ - route = ksocknal_find_connectable_route_locked (peer); - if (route == NULL) - break; - - ksocknal_launch_connection_locked (route); - } + ksocknal_launch_all_connections_locked(peer); conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); if (conn != NULL) { diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 5b97271..7ad0db5 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -44,6 +44,7 @@ enum { SOCKLND_TIMEOUT = 1, SOCKLND_CREDITS, SOCKLND_PEER_CREDITS, + SOCKLND_PEER_TIMEOUT, SOCKLND_NCONNDS, SOCKLND_RECONNECTS_MIN, SOCKLND_RECONNECTS_MAX, @@ -71,6 +72,7 @@ enum { #define SOCKLND_TIMEOUT CTL_UNNUMBERED #define SOCKLND_CREDITS CTL_UNNUMBERED #define SOCKLND_PEER_CREDITS CTL_UNNUMBERED +#define SOCKLND_PEER_TIMEOUT CTL_UNNUMBERED #define SOCKLND_NCONNDS CTL_UNNUMBERED #define SOCKLND_RECONNECTS_MIN CTL_UNNUMBERED #define SOCKLND_RECONNECTS_MAX CTL_UNNUMBERED @@ -123,6 +125,15 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = { .strategy = &sysctl_intvec, }, { + .ctl_name = SOCKLND_PEER_TIMEOUT, + .procname = "peer_timeout", + .data = &ksocknal_tunables.ksnd_peertimeout, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec + .strategy = &sysctl_intvec, + }, + { .ctl_name = SOCKLND_NCONNDS, .procname = "nconnds", .data = &ksocknal_tunables.ksnd_nconnds, diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c index d508509..3089808 100644 --- a/lnet/klnds/socklnd/socklnd_modparams.c +++ b/lnet/klnds/socklnd/socklnd_modparams.c @@ -33,6 +33,10 @@ static int peer_credits = 8; CFS_MODULE_PARM(peer_credits, "i", int, 0444, "# concurrent sends to 1 peer"); +static int peer_timeout = 0; +CFS_MODULE_PARM(peer_timeout, "i", int, 0444, + "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + static int nconnds = 4; CFS_MODULE_PARM(nconnds, "i", int, 0444, "# connection daemons"); @@ -172,6 +176,7 @@ int ksocknal_tunables_init(void) ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; ksocknal_tunables.ksnd_credits = &credits; ksocknal_tunables.ksnd_peercredits = &peer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; ksocknal_tunables.ksnd_enable_csum = &enable_csum; ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 61f881f..58839c7 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1040,6 +1040,8 @@ lnet_startup_lndnis (void) goto failed; } + LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + list_del(&ni->ni_list); LNET_LOCK(); @@ -1079,9 +1081,10 @@ lnet_startup_lndnis (void) ni->ni_txcredits = ni->ni_mintxcredits = ni->ni_maxtxcredits; - CDEBUG(D_LNI, "Added LNI %s [%d/%d]\n", + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d]\n", libcfs_nid2str(ni->ni_nid), - ni->ni_peertxcredits, ni->ni_txcredits); + ni->ni_peertxcredits, ni->ni_txcredits, + ni->ni_peertimeout); nicount++; } diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 0d443a7..faea561 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -899,12 +899,101 @@ lnet_eager_recv_locked(lnet_msg_t *msg) return rc; } +/* NB: caller shall hold a ref on 'lp' as I'd drop LNET_LOCK */ +void +lnet_ni_peer_alive(lnet_peer_t *lp) +{ + time_t last_alive = 0; + lnet_ni_t *ni = lp->lp_ni; + + LASSERT (ni != NULL); + LASSERT (ni->ni_peertimeout > 0); + LASSERT (ni->ni_lnd->lnd_query != NULL); + + LNET_UNLOCK(); + (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); + LNET_LOCK(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lp_last_alive = last_alive; + return; +} + +/* NB: always called with LNET_LOCK held */ +static inline int +lnet_peer_is_alive (lnet_peer_t *lp, time_t now) +{ + lnet_ni_t *ni = lp->lp_ni; + time_t deadline; + int alive; + + LASSERT (ni != NULL); + LASSERT (ni->ni_peertimeout > 0); + + if (!lp->lp_alive && lp->lp_alive_count > 0 && + cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) + return 0; + + deadline = cfs_time_add(lp->lp_last_alive, ni->ni_peertimeout); + alive = cfs_time_after(deadline, now); + if (alive && !lp->lp_alive) /* update obsolete lp_alive */ + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); + + return alive; +} + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the LNET_LOCK */ +int +lnet_peer_alive_locked (lnet_peer_t *lp) +{ + lnet_ni_t *ni = lp->lp_ni; + time_t now = cfs_time_current_sec(); + + LASSERT (ni != NULL); + + if (ni->ni_peertimeout <= 0) /* disabled */ + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* peer appears dead, query LND for latest aliveness news */ + lnet_ni_peer_alive(lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); + return 0; +} + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the LNET_LOCK */ +int +lnet_nid_alive_locked (lnet_nid_t nid) +{ + int rc; + lnet_peer_t *lp; + + rc = lnet_nid2peer_locked(&lp, nid); + if (rc != 0) { + CERROR("Error %d looking up %s\n", rc, libcfs_nid2str(nid)); + return -ENOENT; + } + + rc = lnet_peer_alive_locked(lp); + lnet_peer_decref_locked(lp); + return rc; +} + int lnet_post_send_locked (lnet_msg_t *msg, int do_send) { /* lnet_send is going to LNET_UNLOCK immediately after this, so it sets * do_send FALSE and I don't do the unlock/send/lock bit. I return - * EAGAIN if msg blocked and 0 if sent or OK to send */ + * EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer appears dead, and + * 0 if sent or OK to send */ lnet_peer_t *lp = msg->msg_txpeer; lnet_ni_t *ni = lp->lp_ni; @@ -912,6 +1001,20 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) LASSERT (!do_send || msg->msg_delayed); LASSERT (!msg->msg_receiving); + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(lp) == 0) { + LNET_UNLOCK(); + + CDEBUG(D_NETERROR, "Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -EHOSTUNREACH); + + LNET_LOCK(); + return EHOSTUNREACH; + } + if (!msg->msg_peertxcredit) { LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq)); @@ -993,7 +1096,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) { /* lnet_parse is going to LNET_UNLOCK immediately after this, so it * sets do_recv FALSE and I don't do the unlock/send/lock bit. I - * return EAGAIN if msg blocked and 0 if sent or OK to send */ + * return EAGAIN if msg blocked and 0 if received or OK to receive */ lnet_peer_t *lp = msg->msg_rxpeer; lnet_rtrbufpool_t *rbp; lnet_rtrbuf_t *rb; @@ -1327,6 +1430,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) rc = lnet_post_send_locked(msg, 0); LNET_UNLOCK(); + if (rc == EHOSTUNREACH) + return -EHOSTUNREACH; + if (rc == 0) lnet_ni_send(src_ni, msg); @@ -1983,6 +2089,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, int rc = 0; int for_me; lnet_msg_t *msg; + lnet_pid_t dest_pid; lnet_nid_t dest_nid; lnet_nid_t src_nid; __u32 payload_length; @@ -1993,6 +2100,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, type = le32_to_cpu(hdr->type); src_nid = le64_to_cpu(hdr->src_nid); dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); payload_length = le32_to_cpu(hdr->payload_length); for_me = (ni->ni_nid == dest_nid); @@ -2120,7 +2228,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, LASSERT (for_me); #else if (!for_me) { - msg->msg_target.pid = le32_to_cpu(hdr->dest_pid); + msg->msg_target.pid = dest_pid; msg->msg_target.nid = dest_nid; msg->msg_routing = 1; msg->msg_offset = 0; @@ -2134,7 +2242,6 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, goto free_drop; } } - lnet_commit_routedmsg(msg); rc = lnet_post_routed_recv_locked(msg, 0); LNET_UNLOCK(); @@ -2150,7 +2257,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_hdr.src_nid = src_nid; msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); msg->msg_hdr.dest_nid = dest_nid; - msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid); + msg->msg_hdr.dest_pid = dest_pid; msg->msg_hdr.payload_length = payload_length; msg->msg_ev.sender = from_nid; diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 9e033a0..7d68e6d4 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -180,12 +180,13 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid) CFS_INIT_LIST_HEAD(&lp->lp_txq); CFS_INIT_LIST_HEAD(&lp->lp_rtrq); - lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ lp->lp_notify = 0; lp->lp_notifylnd = 0; lp->lp_notifying = 0; lp->lp_alive_count = 0; lp->lp_timestamp = 0; + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ + lp->lp_last_alive = cfs_time_current_sec(); /* assumes alive */ lp->lp_ping_timestamp = 0; lp->lp_nid = nid; lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ @@ -239,11 +240,12 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid) void lnet_debug_peer(lnet_nid_t nid) { + char *aliveness = "NA"; int rc; lnet_peer_t *lp; LNET_LOCK(); - + rc = lnet_nid2peer_locked(&lp, nid); if (rc != 0) { LNET_UNLOCK(); @@ -251,11 +253,13 @@ lnet_debug_peer(lnet_nid_t nid) return; } + if (lnet_isrouter(lp) || lp->lp_ni->ni_peertimeout > 0) + aliveness = lp->lp_alive ? "up" : "down"; + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", - libcfs_nid2str(lp->lp_nid), lp->lp_refcount, - !lnet_isrouter(lp) ? "~rtr" : (lp->lp_alive ? "up" : "down"), - lp->lp_ni->ni_peertxcredits, - lp->lp_rtrcredits, lp->lp_minrtrcredits, + libcfs_nid2str(lp->lp_nid), lp->lp_refcount, + aliveness, lp->lp_ni->ni_peertxcredits, + lp->lp_rtrcredits, lp->lp_minrtrcredits, lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); lnet_peer_decref_locked(lp); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 7530fca..3a211fa 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -207,6 +207,12 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) return -EOPNOTSUPP; } +void +lnet_notify_locked (lnet_peer_t *lp, int notifylnd, int alive, time_t when) +{ + return; +} + #endif static void diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index 3e51250..77571e5 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -433,7 +433,7 @@ lnet_router_seq_show (struct seq_file *s, void *iter) LNET_UNLOCK(); - seq_printf(s, + seq_printf(s, "%-4d %7d %9d %6s %12lu %s\n", nrefs, nrtrrefs, alive_cnt, alive ? "up" : "down", last_ping, libcfs_nid2str(nid)); @@ -603,6 +603,7 @@ static int lnet_peer_seq_show (struct seq_file *s, void *iter) { lnet_peer_seq_iterator_t *lpsi = iter; + char *aliveness = "NA"; lnet_peer_t *lp; lnet_nid_t nid; int maxcr; @@ -610,8 +611,6 @@ lnet_peer_seq_show (struct seq_file *s, void *iter) int txcr; int minrtrcr; int rtrcr; - int alive; - int rtr; int txqnob; int nrefs; @@ -639,16 +638,16 @@ lnet_peer_seq_show (struct seq_file *s, void *iter) mintxcr = lp->lp_mintxcredits; rtrcr = lp->lp_rtrcredits; minrtrcr = lp->lp_minrtrcredits; - rtr = lnet_isrouter(lp); - alive = lp->lp_alive; txqnob = lp->lp_txqnob; nrefs = lp->lp_refcount; + if (lnet_isrouter(lp) || lp->lp_ni->ni_peertimeout > 0) + aliveness = lp->lp_alive ? "up" : "down"; + LNET_UNLOCK(); seq_printf(s, "%-24s %4d %5s %5d %5d %5d %5d %5d %d\n", - libcfs_nid2str(nid), nrefs, - !rtr ? "~rtr" : (alive ? "up" : "down"), + libcfs_nid2str(nid), nrefs, aliveness, maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); return 0; }