From da9998767a9093c088d28119179ee591f42910dc Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 4 Oct 2018 17:18:20 -0700 Subject: [PATCH] LU-11478 lnet: misleading discovery seqno. There is a sequence number used when sending discovery messages. This sequence number is intended to detect stale messages. However it could be misleading if the peer reboots. In this case the peer's sequence number will reset. The node will think that all information being sent to it is stale, while in reality the peer might've changed configuration. There is no reliable way to know whether a peer rebooted, so we'll always assume that the messages we're receiving are valid. So we'll operate on a first-come, first-served basis. Lustre-change: https://review.whamcloud.com/33304 Lustre-commit: 42d999ed8f6113724b1ac103b832d5b74b878d55 Signed-off-by: Amir Shehata Change-Id: I421a00e47bc93ee60fa37c648d6d9a726d9def9c Reviewed-by: Olaf Weber Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/36041 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/lnet/peer.c | 44 ++++++-------------------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index db012f3..1ad2fc2 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1928,38 +1928,9 @@ void lnet_peer_push_event(struct lnet_event *ev) goto out; } - /* - * Check whether the Put data is stale. Stale data can just be - * dropped. - */ - if (pbuf->pb_info.pi_nnis > 1 && - lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid && - LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) { - CDEBUG(D_NET, "Stale Push from %s: got %u have %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf), - lp->lp_peer_seqno); - goto out; - } - - /* - * Check whether the Put data is new, in which case we clear - * the UPTODATE flag and prepare to process it. 
- * - * If the Put data is current, and the peer is UPTODATE then - * we assome everything is all right and drop the data as - * stale. - */ - if (LNET_PING_BUFFER_SEQNO(pbuf) > lp->lp_peer_seqno) { - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - } else if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { - CDEBUG(D_NET, "Stale Push from %s: got %u have %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf), - lp->lp_peer_seqno); - goto out; - } + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; /* * If there is data present that hasn't been processed yet, @@ -2244,16 +2215,13 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && pbuf->pb_info.pi_nnis > 1 && lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid) { - if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) { - CDEBUG(D_NET, "Stale Reply from %s: got %u have %u\n", + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", libcfs_nid2str(lp->lp_primary_nid), LNET_PING_BUFFER_SEQNO(pbuf), lp->lp_peer_seqno); - goto out; - } - if (LNET_PING_BUFFER_SEQNO(pbuf) > lp->lp_peer_seqno) - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); } /* We're happy with the state of the data in the buffer. */ -- 1.8.3.1