A router can bring up/down its interfaces if it hasn't received any
messages on that interface for a configurable period
(alive_router_ping_timeout). When this event occurs the router can now
push its status change to the peers it's talking to in order to inform
them of the change in its status. This will allow the router users to
handle asym router failures quicker.
Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I9530ed7d9bc0a86edc43e3f610cc943f1732dcfd
Reviewed-on: https://review.whamcloud.com/33651
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
void *private, int rdma_req)
{
lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
void *private, int rdma_req)
{
- int rc = 0;
- int cpt;
- int for_me;
- struct lnet_msg *msg;
- lnet_pid_t dest_pid;
- lnet_nid_t dest_nid;
- lnet_nid_t src_nid;
struct lnet_peer_ni *lpni;
struct lnet_peer_ni *lpni;
- __u32 payload_length;
- __u32 type;
+ struct lnet_msg *msg;
+ __u32 payload_length;
+ lnet_pid_t dest_pid;
+ lnet_nid_t dest_nid;
+ lnet_nid_t src_nid;
+ bool push = false;
+ int for_me;
+ __u32 type;
+ int rc = 0;
+ int cpt;
LASSERT (!in_interrupt ());
LASSERT (!in_interrupt ());
lnet_ni_lock(ni);
ni->ni_last_alive = ktime_get_real_seconds();
if (ni->ni_status != NULL &&
lnet_ni_lock(ni);
ni->ni_last_alive = ktime_get_real_seconds();
if (ni->ni_status != NULL &&
- ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+ ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) {
ni->ni_status->ns_status = LNET_NI_STATUS_UP;
ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+ if (push)
+ lnet_push_update_to_peers(1);
+
/* Regard a bad destination NID as a protocol error. Senders should
* know what they're doing; if they don't they're misconfigured, buggy
* or malicious so we chop them off at the knees :) */
/* Regard a bad destination NID as a protocol error. Senders should
* know what they're doing; if they don't they're misconfigured, buggy
* or malicious so we chop them off at the knees :) */
lnet_update_ni_status_locked(void)
{
struct lnet_ni *ni = NULL;
lnet_update_ni_status_locked(void)
{
struct lnet_ni *ni = NULL;
time64_t now;
time64_t timeout;
time64_t now;
time64_t timeout;
/* NB: so far, this is the only place to set
* NI status to "down" */
ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
/* NB: so far, this is the only place to set
* NI status to "down" */
ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
}
void lnet_wait_router_start(void)
}
void lnet_wait_router_start(void)
struct lnet_peer_ni *lpni;
struct list_head *entry;
struct lnet_peer *rtr;
struct lnet_peer_ni *lpni;
struct list_head *entry;
struct lnet_peer *rtr;
__u64 version;
time64_t now;
int cpt;
__u64 version;
time64_t now;
int cpt;
}
if (the_lnet.ln_routing)
}
if (the_lnet.ln_routing)
- lnet_update_ni_status_locked();
+ push = lnet_update_ni_status_locked();
+
+ /* if the status of the ni changed update the peers */
+ if (push)
+ lnet_push_update_to_peers(1);