From 0fa02a7d81e77ad482022d5543cf433af1bf34c6 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 13 Nov 2018 18:14:36 -0800 Subject: [PATCH] LU-11664 lnet: push router interface updates A router can bring up/down its interfaces if it hasn't received any messages on that interface for a configurable period (alive_router_ping_timeout). When this event occurs the router can now push its status change to the peers it's talking to in order to inform them of the change in its status. This will allow the router users to handle asym router failures quicker. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I9530ed7d9bc0a86edc43e3f610cc943f1732dcfd Reviewed-on: https://review.whamcloud.com/33651 Reviewed-by: Sebastien Buisson Reviewed-by: Alexey Lyashkov Reviewed-by: Olaf Weber Tested-by: Jenkins --- lnet/lnet/lib-move.c | 26 ++++++++++++++++---------- lnet/lnet/router.c | 13 +++++++++++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 7ceaa1f..b2d3c6a 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -4109,16 +4109,17 @@ int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, void *private, int rdma_req) { - int rc = 0; - int cpt; - int for_me; - struct lnet_msg *msg; - lnet_pid_t dest_pid; - lnet_nid_t dest_nid; - lnet_nid_t src_nid; struct lnet_peer_ni *lpni; - __u32 payload_length; - __u32 type; + struct lnet_msg *msg; + __u32 payload_length; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + bool push = false; + int for_me; + __u32 type; + int rc = 0; + int cpt; LASSERT (!in_interrupt ()); @@ -4178,11 +4179,16 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, lnet_ni_lock(ni); ni->ni_last_alive = ktime_get_real_seconds(); if (ni->ni_status != NULL && - ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) { ni->ni_status->ns_status = LNET_NI_STATUS_UP; + 
push = true; + } lnet_ni_unlock(ni); } + if (push) + lnet_push_update_to_peers(1); + /* Regard a bad destination NID as a protocol error. Senders should * know what they're doing; if they don't they're misconfigured, buggy * or malicious so we chop them off at the knees :) */ diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index fb7f905..c8deecb 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -777,10 +777,11 @@ lnet_wait_known_routerstate(void) } } -static void +static bool lnet_update_ni_status_locked(void) { struct lnet_ni *ni = NULL; + bool push = false; time64_t now; time64_t timeout; @@ -811,9 +812,12 @@ lnet_update_ni_status_locked(void) /* NB: so far, this is the only place to set * NI status to "down" */ ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + push = true; } lnet_ni_unlock(ni); } + + return push; } void lnet_wait_router_start(void) @@ -848,6 +852,7 @@ lnet_check_routers(void) struct lnet_peer_ni *lpni; struct list_head *entry; struct lnet_peer *rtr; + bool push = false; __u64 version; time64_t now; int cpt; @@ -918,9 +923,13 @@ rescan: } if (the_lnet.ln_routing) - lnet_update_ni_status_locked(); + push = lnet_update_ni_status_locked(); lnet_net_unlock(cpt); + + /* if the status of the ni changed update the peers */ + if (push) + lnet_push_update_to_peers(1); } void -- 1.8.3.1