Introduce workqueue for handling lnd-initiated ping buffer
update requests.
This is done to avoid the possibility of monitor thread
lock up waiting for the "old" ping buffer refcount to get
decremented during the update, while the message which
triggers the decrement is on the monitor thread's own queue
waiting to be processed.
Test-Parameters: trivial
Test-Parameters: testlist=sanity-lnet env=ONLY="207 500",ONLY_REPEAT=50
Fixes:
7ac399c5 ("LU-16949 lnet: get monitor thread to update ping buffer")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I5176581703e52f4adbfff417040bebcc2489b79e
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52522
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
old ? "up" : "down",
alive ? "up" : "down");
}
-void lnet_update_ping_buffer(void);
extern struct blocking_notifier_head lnet_ioctl_list;
static inline int notifier_from_ioctl_errno(int err)
}
void lnet_mark_ping_buffer_for_update(void);
+void lnet_queue_ping_buffer_update(void);
#endif
/* for LNDs to signal that ping buffer needs updating */
atomic_t ln_update_ping_buf;
+
+ /* workqueue for serving lnd ping buffer update requests */
+ struct workqueue_struct *ln_pb_update_wq;
+ struct work_struct ln_pb_update_work;
+
+ atomic_t ln_pb_update_ready;
};
struct genl_filter_list {
}
EXPORT_SYMBOL(lnet_mark_ping_buffer_for_update);
-void lnet_update_ping_buffer(void)
+void lnet_update_ping_buffer(struct work_struct *work)
{
struct lnet_ping_buffer *pbuf;
struct lnet_handle_md ping_mdh;
- if (atomic_dec_if_positive(&the_lnet.ln_update_ping_buf) < 0)
- return;
-
mutex_lock(&the_lnet.ln_api_mutex);
- if (!lnet_ping_target_setup(&pbuf, &ping_mdh,
+ atomic_set(&the_lnet.ln_pb_update_ready, 1);
+
+ if ((the_lnet.ln_state == LNET_STATE_RUNNING) &&
+ !lnet_ping_target_setup(&pbuf, &ping_mdh,
LNET_PING_INFO_HDR_SIZE +
lnet_get_ni_bytes(),
false))
lnet_ping_target_update(pbuf, ping_mdh);
+
mutex_unlock(&the_lnet.ln_api_mutex);
}
+
+void lnet_queue_ping_buffer_update(void)
+{
+ /* don't queue pb update if it is not needed */
+ if (atomic_dec_if_positive(&the_lnet.ln_update_ping_buf) < 0)
+ return;
+
+ /* don't queue pb update if already queued and not processed */
+ if (atomic_dec_if_positive(&the_lnet.ln_pb_update_ready) < 0)
+ return;
+
+ INIT_WORK(&the_lnet.ln_pb_update_work, lnet_update_ping_buffer);
+ queue_work(the_lnet.ln_pb_update_wq, &the_lnet.ln_pb_update_work);
+}
+
void lnet_incr_dlc_seq(void)
{
atomic_inc(&lnet_dlc_seq_no);
nlpnis = lnet_recover_peer_nis(peer_nids, LNET_MAX_NNIDS);
lnet_health_update_console(local_nids, nnis, peer_nids, nlpnis,
now);
- lnet_update_ping_buffer();
+
+ lnet_queue_ping_buffer_update();
/*
* TODO do we need to check if we should sleep without
if (rc)
goto clean_queues;
+ the_lnet.ln_pb_update_wq = alloc_workqueue("lnetpb_wq",
+ WQ_UNBOUND,
+ 1);
+ if (!the_lnet.ln_pb_update_wq) {
+ rc = -ENOMEM;
+ CERROR("Failed to allocate LNet ping buffer workqueue\n");
+ goto clean_queues;
+ }
+ atomic_set(&the_lnet.ln_pb_update_ready, 1);
+
sema_init(&the_lnet.ln_mt_signal, 0);
lnet_net_lock(LNET_LOCK_EX);
return;
LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING);
+
+ /* clean up the ping buffer update workqueue before telling
+ * the monitor thread to shut down to avoid getting stuck
+ * on pending messages
+ */
+ mutex_unlock(&the_lnet.ln_api_mutex);
+ flush_workqueue(the_lnet.ln_pb_update_wq);
+ destroy_workqueue(the_lnet.ln_pb_update_wq);
+ atomic_set(&the_lnet.ln_pb_update_ready, 0);
+ the_lnet.ln_pb_update_wq = NULL;
+ mutex_lock(&the_lnet.ln_api_mutex);
+
lnet_net_lock(LNET_LOCK_EX);
the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING;
lnet_net_unlock(LNET_LOCK_EX);
int lncpt;
int cpt;
+ if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
+ return;
+
lnet_net_lock(LNET_LOCK_EX);
if (lnet_peer_discovery_disabled)
force = 0;
*/
mutex_lock(&the_lnet.ln_api_mutex);
if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ lnet_ping_buffer_decref(pbuf);
rc = -ESHUTDOWN;
goto out;
}
}
run_test 402 "Destination net rule should not panic"
+test_500() {
+ reinit_dlc || return $?
+
+ cleanup_netns || error "Failed to cleanup netns before test execution"
+ setup_fakeif || error "Failed to add fake IF"
+ have_interface "$FAKE_IF" ||
+ error "Expect $FAKE_IF configured but not found"
+
+ add_net "tcp" "${INTERFACES[0]}"
+ add_net "tcp" "${FAKE_IF}"
+
+ do_lnetctl discover $($LCTL list_nids | head -n 1) ||
+ error "Failed to discover self"
+
+ $LCTL net_delay_add -s *@tcp -d *@tcp -r 1 -l 1 -m PUT ||
+ error "Failed to add delay rule"
+
+ $LCTL net_drop_add -s *@tcp -d $($LCTL list_nids | head -n 1) -m PUT -e local_timeout -r 1
+ $LCTL net_drop_add -s *@tcp -d $($LCTL list_nids | tail -n 1) -m PUT -e local_timeout -r 1
+
+ ip link set $FAKE_IF down ||
+ error "Failed to set link down"
+ ip link set $FAKE_IF up ||
+ error "Failed to set link up"
+
+ unload_modules
+}
+run_test 500 "Check deadlock on ping target update"
+
complete_test $SECONDS
cleanup_testsuite
exit_status