Make sure that ping buffer updates requested by o2iblnd and
socklnd are performed by the LNet monitor thread.
Having the LNDs perform these updates directly via an LNet API caused
a lock-up due to spinlock acquisition while in interrupt context
in a CentOS 7.9 environment.
To avoid LNet trying to update the ping buffer for an LNI which is
still initializing, check that the o2iblnd net is fully initialized
(IBLND_INIT_ALL) before requesting the ping buffer update.
Fixes: da230373bd ("LU-16563 lnet: use discovered ni status")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I87ff8791937f5a0ead6096ff33e8c0a8087f8ddd
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51635
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
return notifier_from_errno(err) | NOTIFY_STOP_MASK;
}
+void lnet_mark_ping_buffer_for_update(void);
#endif
atomic_t ln_late_msg_count;
/* Total amount of time past their deadline for all late ^ messages */
atomic64_t ln_late_msg_nsecs;
+
+ /* for LNDs to signal that ping buffer needs updating */
+ atomic_t ln_update_ping_buf;
};
struct genl_filter_list {
if (!update_ping_buf &&
(ni->ni_state == LNET_NI_STATE_ACTIVE) &&
- (val != ni_state_before))
+ (val != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
update_ping_buf = true;
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
}
static void
ni_done:
if (!update_ping_buf &&
(ni->ni_state == LNET_NI_STATE_ACTIVE) &&
- (atomic_read(&ni->ni_fatal_error_on) != ni_state_before))
+ (atomic_read(&ni->ni_fatal_error_on) != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
update_ping_buf = true;
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
ni_state_before = lnet_set_link_fatal_state(ni, link_down);
if (!update_ping_buf &&
(ni->ni_state == LNET_NI_STATE_ACTIVE) &&
- ((event == NETDEV_DOWN) != ni_state_before))
+ ((event == NETDEV_DOWN) != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
update_ping_buf = true;
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
return rc;
}
+void lnet_mark_ping_buffer_for_update(void)
+{
+ if (the_lnet.ln_routing)
+ return;
+
+ atomic_set(&the_lnet.ln_update_ping_buf, 1);
+ complete(&the_lnet.ln_mt_wait_complete);
+}
+EXPORT_SYMBOL(lnet_mark_ping_buffer_for_update);
+
void lnet_update_ping_buffer(void)
{
struct lnet_ping_buffer *pbuf;
struct lnet_handle_md ping_mdh;
- if (the_lnet.ln_routing)
+ if (atomic_dec_if_positive(&the_lnet.ln_update_ping_buf) < 0)
return;
mutex_lock(&the_lnet.ln_api_mutex);
mutex_unlock(&the_lnet.ln_api_mutex);
}
-EXPORT_SYMBOL(lnet_update_ping_buffer);
void lnet_incr_dlc_seq(void)
{
* 1. Checks the aliveness of routers
* 2. Checks if there are messages on the resend queue to resend
* them.
- * 3. Check if there are any NIs on the local recovery queue and
+ * 3. Checks if there are any NIs on the local recovery queue and
* pings them
* 4. Checks if there are any NIs on the remote recovery queue
* and pings them.
+ * 5. Updates the ping buffer if requested by LNDs upon interface
+ * state change
*/
while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
now = ktime_get_real_seconds();
nlpnis = lnet_recover_peer_nis(peer_nids, LNET_MAX_NNIDS);
lnet_health_update_console(local_nids, nnis, peer_nids, nlpnis,
now);
+ lnet_update_ping_buffer();
/*
* TODO do we need to check if we should sleep without