LND notifies when a peer is up or down. If the LND notifies
LNet that the peer is up and sets the "reset" flag to true
then this indicates to LNet that the LND knows about the health
of the peer and is telling LNet that the peer is fully healthy.
LNet will set the health value of the peer to maximum, otherwise
it will increment the health by one.
If the LND notifies LNet that the peer is down, LNet will
decrement the health of the peer by the configured sensitivity value.
LNet then turns around and rechecks the peer's aliveness, and if it's
dead it'll notify the LND. This code is only used by the socklnd
because it needs to tear down connections. This is in keeping with
the original functionality.
Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ifa614405fb0c2cd4f6bcb1a2a97e856320eb6cbe
Reviewed-on: https://review.whamcloud.com/33453
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Tested-by: Jenkins
void lnet_mt_event_handler(struct lnet_event *event);
void lnet_mt_event_handler(struct lnet_event *event);
-int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive,
+int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, bool alive, bool reset,
time64_t when);
void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
time64_t when);
time64_t when);
void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
time64_t when);
+lnet_set_healthv(atomic_t *healthv, int value)
+{
+ atomic_set(healthv, value);
+}
+
+static inline void
lnet_inc_healthv(atomic_t *healthv)
{
atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE);
lnet_inc_healthv(atomic_t *healthv)
{
atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE);
int (*lnd_eager_recv)(struct lnet_ni *ni, void *private,
struct lnet_msg *msg, void **new_privatep);
int (*lnd_eager_recv)(struct lnet_ni *ni, void *private,
struct lnet_msg *msg, void **new_privatep);
- /* notification of peer health */
- void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
+ /* notification of peer down */
+ void (*lnd_notify_peer_down)(lnet_nid_t peer);
/* query of peer aliveness */
void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time64_t *when);
/* query of peer aliveness */
void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time64_t *when);
ktime_get_seconds() - peer->gnp_last_alive);
lnet_notify(net->gnn_ni, peer_nid, alive,
ktime_get_seconds() - peer->gnp_last_alive);
lnet_notify(net->gnn_ni, peer_nid, alive,
+ (alive) ? true : false,
peer->gnp_last_alive);
kgnilnd_net_decref(net);
peer->gnp_last_alive);
kgnilnd_net_decref(net);
/* Notify LNET that we now have a working connection to this peer.
* This is a Cray extension to the "standard" LND behavior.
*/
/* Notify LNET that we now have a working connection to this peer.
* This is a Cray extension to the "standard" LND behavior.
*/
- lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid, 1,
+ lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid, true, true,
ktime_get_seconds());
/* drop our 'hold' ref */
ktime_get_seconds());
/* drop our 'hold' ref */
static void
kiblnd_peer_notify(struct kib_peer_ni *peer_ni)
{
static void
kiblnd_peer_notify(struct kib_peer_ni *peer_ni)
{
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) {
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) {
- error = peer_ni->ibp_error;
- peer_ni->ibp_error = 0;
+ error = peer_ni->ibp_error;
+ peer_ni->ibp_error = 0;
- last_alive = peer_ni->ibp_last_alive;
- }
+ last_alive = peer_ni->ibp_last_alive;
+ }
read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- if (error != 0)
- lnet_notify(peer_ni->ibp_ni,
- peer_ni->ibp_nid, 0, last_alive);
+ if (error != 0)
+ lnet_notify(peer_ni->ibp_ni,
+ peer_ni->ibp_nid, false, false, last_alive);
read_unlock(&ksocknal_data.ksnd_global_lock);
if (notify)
read_unlock(&ksocknal_data.ksnd_global_lock);
if (notify)
- lnet_notify(peer_ni->ksnp_ni, peer_ni->ksnp_id.nid, 0,
- last_alive);
+ lnet_notify(peer_ni->ksnp_ni, peer_ni->ksnp_id.nid,
+ false, false, last_alive);
-ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive)
+ksocknal_notify_gw_down(lnet_nid_t gw_nid)
{
/* The router is telling me she's been notified of a change in
* gateway state....
{
/* The router is telling me she's been notified of a change in
* gateway state....
- CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
- alive ? "up" : "down");
+ CDEBUG(D_NET, "gw %s down\n", libcfs_nid2str(gw_nid));
- if (!alive) {
- /* If the gateway crashed, close all open connections... */
- ksocknal_close_matching_conns (id, 0);
- return;
- }
+ /* If the gateway crashed, close all open connections... */
+ ksocknal_close_matching_conns(id, 0);
+ return;
- /* ...otherwise do nothing. We can only establish new connections
- * if we have autroutes, and these connect on demand. */
+ /* We can only establish new connections
+ * if we have autoroutes, and these connect on demand. */
the_ksocklnd.lnd_ctl = ksocknal_ctl;
the_ksocklnd.lnd_send = ksocknal_send;
the_ksocklnd.lnd_recv = ksocknal_recv;
the_ksocklnd.lnd_ctl = ksocknal_ctl;
the_ksocklnd.lnd_send = ksocknal_send;
the_ksocklnd.lnd_recv = ksocknal_recv;
- the_ksocklnd.lnd_notify = ksocknal_notify;
+ the_ksocklnd.lnd_notify_peer_down = ksocknal_notify_gw_down;
the_ksocklnd.lnd_query = ksocknal_query;
the_ksocklnd.lnd_accept = ksocknal_accept;
the_ksocklnd.lnd_query = ksocknal_query;
the_ksocklnd.lnd_accept = ksocknal_accept;
extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn);
extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist,
int error);
extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn);
extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist,
int error);
-extern void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive);
+/* gateway-down handler, installed as the_ksocklnd.lnd_notify_peer_down */
+extern void ksocknal_notify_gw_down(lnet_nid_t gw_nid);
extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when);
extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
extern void ksocknal_thread_fini(void);
extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when);
extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
extern void ksocknal_thread_fini(void);
* that deadline to the wall clock.
*/
deadline += ktime_get_seconds();
* that deadline to the wall clock.
*/
deadline += ktime_get_seconds();
- return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+ return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, false,
+/*
+ * Tell the LND behind @ni that peer @nid is down.
+ *
+ * ni: local NI whose LND is notified; dereferenced unconditionally,
+ * so the caller must pass a non-NULL ni
+ * nid: NID of the peer being reported down
+ *
+ * The lnd_notify_peer_down callback is optional: nothing happens when
+ * the LND left it NULL.
+ */
+static inline void
+lnet_notify_peer_down(struct lnet_ni *ni, lnet_nid_t nid)
+{
+ if (ni->ni_net->net_lnd->lnd_notify_peer_down != NULL)
+ (ni->ni_net->net_lnd->lnd_notify_peer_down)(nid);
+}
+
+/*
+ * ni: local NI used to communicate with the peer
+ * nid: peer NID
+ * alive: true if peer is alive, false otherwise
+ * reset: reset health value. This is requested by the LND.
+ * when: notification time.
+ */
-lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when)
+lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
+ time64_t when)
- struct lnet_peer_ni *lp = NULL;
+ struct lnet_peer_ni *lpni = NULL;
time64_t now = ktime_get_seconds();
time64_t now = ktime_get_seconds();
- int cpt = lnet_cpt_of_nid(nid, ni);
LASSERT (!in_interrupt ());
LASSERT (!in_interrupt ());
+ /* must lock 0 since this is used for synchronization */
+ lnet_net_lock(0);
if (the_lnet.ln_state != LNET_STATE_RUNNING) {
if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- lp = lnet_find_peer_ni_locked(nid);
- if (lp == NULL) {
+ lpni = lnet_find_peer_ni_locked(nid);
+ if (lpni == NULL) {
CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
return 0;
}
CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
return 0;
}
- /*
- * It is possible for this function to be called for the same peer
- * but with different NIs. We want to synchronize the notification
- * between the different calls. So we will use the lpni_cpt to
- * grab the net lock.
- */
- if (lp->lpni_cpt != cpt) {
- lnet_net_unlock(cpt);
- cpt = lp->lpni_cpt;
- lnet_net_lock(cpt);
+ if (alive) {
+ if (reset)
+ lnet_set_healthv(&lpni->lpni_healthv,
+ LNET_MAX_HEALTH_VALUE);
+ else
+ lnet_inc_healthv(&lpni->lpni_healthv);
+ } else {
+ lnet_handle_remote_failure_locked(lpni);
- lnet_peer_ni_decref_locked(lp);
+ /* recalculate aliveness */
+ alive = lnet_is_peer_ni_alive(lpni);
+ lnet_net_unlock(0);
+ if (ni != NULL && !alive)
+ lnet_notify_peer_down(ni, lpni->lpni_nid);
+
+ cpt = lpni->lpni_cpt;
+ lnet_net_lock(cpt);
+ lnet_peer_ni_decref_locked(lpni);
return 0;
}
EXPORT_SYMBOL(lnet_notify);
return 0;
}
EXPORT_SYMBOL(lnet_notify);