From: Amir Shehata Date: Thu, 4 Oct 2018 20:00:49 +0000 (-0700) Subject: LU-11472 lnet: Decrement health on timeout X-Git-Tag: 2.12.0-RC1~127 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=139d69141b73d427490f39d3096b2187e979eaea LU-11472 lnet: Decrement health on timeout When a response times out, we want to decrement the health of the immediate next-hop peer NI so that we don't use that interface if others are available. When sending a message, if there is a response tracker associated with the MD, store the next-hop NID there. If the response times out, we can then look up the peer NI using the cached NID and decrement its health value. Signed-off-by: Amir Shehata Change-Id: I6c2f49a695f078ee50378c0a468c7ee058f7e712 Reviewed-on: https://review.whamcloud.com/33308 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Sonia Sharma Reviewed-by: Doug Oucharek Reviewed-by: Olaf Weber Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 4441009..1c60b4d 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -695,6 +695,7 @@ void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); void lnet_finalize(struct lnet_msg *msg, int rc); bool lnet_send_error_simulation(struct lnet_msg *msg, enum lnet_msg_hstatus *hstatus); +void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, __u32 msg_type); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index b4ab6ac..7433239 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -84,6 +84,8 @@ struct lnet_rsp_tracker { struct list_head rspt_on_list; /* cpt to lock */ int rspt_cpt; + /* nid of next hop */ + lnet_nid_t rspt_next_hop_nid; /* deadline of the REPLY/ACK */ ktime_t rspt_deadline; /* parent MD */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 
5a304af..3f5b112 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1609,6 +1609,7 @@ lnet_handle_send(struct lnet_send_data *sd) __u32 send_case = sd->sd_send_case; int rc; __u32 routing = send_case & REMOTE_DST; + struct lnet_rsp_tracker *rspt; /* * Increment sequence number of the selected peer so that we @@ -1701,6 +1702,19 @@ lnet_handle_send(struct lnet_send_data *sd) msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); } + /* + * if we have response tracker block update it with the next hop + * nid + */ + if (msg->msg_md) { + rspt = msg->msg_md->md_rspt_ptr; + if (rspt) { + rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid; + CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", + libcfs_nid2str(rspt->rspt_next_hop_nid)); + } + } + rc = lnet_post_send_locked(msg, 0); if (!rc) @@ -2733,6 +2747,9 @@ lnet_finalize_expired_responses(bool force) if (ktime_compare(ktime_get(), rspt->rspt_deadline) >= 0 || force) { + struct lnet_peer_ni *lpni; + lnet_nid_t nid; + md = lnet_handle2md(&rspt->rspt_mdh); if (!md) { LNetInvalidateMDHandle(&rspt->rspt_mdh); @@ -2751,9 +2768,25 @@ lnet_finalize_expired_responses(bool force) list_del_init(&rspt->rspt_on_list); - CNETERR("Response timed out: md = %p\n", md); + nid = rspt->rspt_next_hop_nid; + + CNETERR("Response timed out: md = %p: nid = %s\n", + md, libcfs_nid2str(nid)); LNetMDUnlink(rspt->rspt_mdh); lnet_rspt_free(rspt, i); + + /* + * If there is a timeout on the response + * from the next hop decrement its health + * value so that we don't use it + */ + lnet_net_lock(0); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lnet_handle_remote_failure_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(0); } else { lnet_res_unlock(i); break; diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 2f3b689..8ad9185 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -513,18 +513,13 @@ lnet_handle_local_failure(struct lnet_msg *msg) lnet_net_unlock(0); } -static void 
-lnet_handle_remote_failure(struct lnet_msg *msg) +void +lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) { - struct lnet_peer_ni *lpni; - - lpni = msg->msg_txpeer; - /* lpni could be NULL if we're in the LOLND case */ if (!lpni) return; - lnet_net_lock(0); lnet_dec_healthv_locked(&lpni->lpni_healthv); /* * add the peer NI to the recovery queue if it's not already there @@ -534,6 +529,17 @@ lnet_handle_remote_failure(struct lnet_msg *msg) * invoke recovery */ lnet_peer_ni_add_to_recoveryq_locked(lpni); +} + +static void +lnet_handle_remote_failure(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_net_lock(0); + lnet_handle_remote_failure_locked(lpni); lnet_net_unlock(0); } @@ -680,13 +686,13 @@ lnet_health_check(struct lnet_msg *msg) * attempt a resend safely. */ case LNET_MSG_STATUS_REMOTE_DROPPED: - lnet_handle_remote_failure(msg); + lnet_handle_remote_failure(msg->msg_txpeer); goto resend; case LNET_MSG_STATUS_REMOTE_ERROR: case LNET_MSG_STATUS_REMOTE_TIMEOUT: case LNET_MSG_STATUS_NETWORK_TIMEOUT: - lnet_handle_remote_failure(msg); + lnet_handle_remote_failure(msg->msg_txpeer); return -1; default: LBUG();