Make sure that ping buffer updates requested by o2iblnd and
socklnd are performed by the LNet monitor thread.
Having the LNDs perform these updates directly via an LNet API caused a
lock-up due to spinlock acquisition in interrupt context
in a CentOS 7.9 environment.
To avoid LNet trying to update the ping buffer for an LNI which is
still initializing, check that the o2iblnd net is fully initialized
(IBLND_INIT_ALL) before requesting the ping buffer update.
Lustre-change: https://review.whamcloud.com/51635/
Lustre-commit: 7ac399c5aec01186ad4c9a7153aea400777c897f
Fixes: da230373bd ("LU-16563 lnet: use discovered ni status")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I87ff8791937f5a0ead6096ff33e8c0a8087f8ddd
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51704
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
alive ? "up" : "down");
}
void lnet_update_ping_buffer(void);
+
+void lnet_mark_ping_buffer_for_update(void);
#endif
* work loops
*/
struct completion ln_started;
+
+ /* for LNDs to signal that ping buffer needs updating */
+ atomic_t ln_update_ping_buf;
};
#endif
if (!update_ping_buf &&
(ni->ni_state == LNET_NI_STATE_ACTIVE) &&
- (val != ni_state_before))
+ (val != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
update_ping_buf = true;
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
}
void
ni_done:
if (!update_ping_buf &&
(ni->ni_state == LNET_NI_STATE_ACTIVE) &&
- (atomic_read(&ni->ni_fatal_error_on) != ni_state_before))
+ (atomic_read(&ni->ni_fatal_error_on) != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
update_ping_buf = true;
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
ni_state_before = lnet_set_link_fatal_state(ni, link_down);
if (!update_ping_buf &&
(ni->ni_state == LNET_NI_STATE_ACTIVE) &&
- ((event == NETDEV_DOWN) != ni_state_before))
+ ((event == NETDEV_DOWN) != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
update_ping_buf = true;
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
}
if (update_ping_buf)
- lnet_update_ping_buffer();
+ lnet_mark_ping_buffer_for_update();
out:
return 0;
}
return rc;
}
+void lnet_mark_ping_buffer_for_update(void)
+{
+ if (the_lnet.ln_routing)
+ return;
+ /* Only record the request here: set the flag, then signal ln_mt_wait_complete (presumably waking the LNet monitor thread, which consumes the flag -- confirm against the monitor loop). */
+ atomic_set(&the_lnet.ln_update_ping_buf, 1);
+ complete(&the_lnet.ln_mt_wait_complete);
+}
+EXPORT_SYMBOL(lnet_mark_ping_buffer_for_update);
+
void lnet_update_ping_buffer(void)
{
struct lnet_ping_buffer *pbuf;
struct lnet_handle_md ping_mdh;
- if (the_lnet.ln_routing)
+ if (atomic_dec_if_positive(&the_lnet.ln_update_ping_buf) < 0)
return;
mutex_lock(&the_lnet.ln_api_mutex);
mutex_unlock(&the_lnet.ln_api_mutex);
}
-EXPORT_SYMBOL(lnet_update_ping_buffer);
void lnet_incr_dlc_seq(void)
{
* 1. Checks the aliveness of routers
* 2. Checks if there are messages on the resend queue to resend
* them.
- * 3. Check if there are any NIs on the local recovery queue and
+ * 3. Checks if there are any NIs on the local recovery queue and
* pings them
* 4. Checks if there are any NIs on the remote recovery queue
* and pings them.
+ * 5. Updates the ping buffer if requested by LNDs upon interface
+ * state change
*/
while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
now = ktime_get_real_seconds();
lnet_recover_peer_nis();
recovery_timeout = now + lnet_recovery_interval;
}
+ lnet_update_ping_buffer();
/*
* TODO do we need to check if we should sleep without