From: Bruno Faccini Date: Tue, 24 Jan 2017 15:19:31 +0000 (+0100) Subject: LU-9038 obdclass: handle early requests vs CT registering X-Git-Tag: 2.9.54~27 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=5571c9a73b7de3fd082fb62ab3738afc4325cbd9;ds=sidebyside LU-9038 obdclass: handle early requests vs CT registering This patch addresses cases where CDT may start to send requests before CT has fully registered with all MDTs, and thus while the kernel side of the KUC pipe has not yet been initialized in lmv_hsm_ct_register(). This will avoid Oopses due to kkuc_groups[KUC_GRP_HSM] being uninitialized/zeroed, and we rely on CDT to retry later. sanity-hsm/test_402b has been added to verify. Signed-off-by: Bruno Faccini Change-Id: Ibccf2627aebe8da52128da5d90d24751394bf61d Reviewed-on: https://review.whamcloud.com/25050 Reviewed-by: Quentin Bouget Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Henri Doreau Reviewed-by: Oleg Drokin --- diff --git a/lustre/obdclass/kernelcomm.c b/lustre/obdclass/kernelcomm.c index 0787f46..cb52660 100644 --- a/lustre/obdclass/kernelcomm.c +++ b/lustre/obdclass/kernelcomm.c @@ -194,6 +194,14 @@ int libcfs_kkuc_group_put(int group, void *payload) ENTRY; down_write(&kg_sem); + + if (unlikely(kkuc_groups[group].next == NULL) || + unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_CT_REGISTER_NET))) { + /* no agent have fully registered, CDT will retry */ + up_write(&kg_sem); + RETURN(-EAGAIN); + } + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { if (reg->kr_fp != NULL) { rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index dd700c3..785da68 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -4681,7 +4681,7 @@ mdc_change_state() # facet, MDT_pattern, activate|deactivate done } -test_402() { +test_402a() { # make sure there is no running copytool copytool_cleanup @@ -4698,7 +4698,34 @@ test_402() { # reactivate MDCs 
mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate" } -run_test 402 "Copytool start fails if all MDTs are inactive" +run_test 402a "Copytool start fails if all MDTs are inactive" + +test_402b() { + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + touch $f || error "touch $f failed" + local fid=$(path2fid $f) + +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d + do_facet $SINGLEAGT lctl set_param fail_loc=0x14d + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + + # give time for CDT to send request and to keep it for retry + wait_for_loop_period + + wait_request_state $fid ARCHIVE WAITING + + do_facet $SINGLEAGT lctl set_param fail_loc=0 + + # request should succeed now + wait_request_state $fid ARCHIVE SUCCEED + + copytool_cleanup +} +run_test 402b "CDT must retry request upon slow start of CT" test_403() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return