Whamcloud - gitweb
LU-9038 obdclass: handle early requests vs CT registering 50/25050/9
author Bruno Faccini <bruno.faccini@intel.com>
Tue, 24 Jan 2017 15:19:31 +0000 (16:19 +0100)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 1 Mar 2017 05:11:17 +0000 (05:11 +0000)
This patch addresses cases where the CDT may start sending requests
before the CT has fully registered with all MDTs, i.e. while the
kernel side of the KUC pipe has not yet been initialized in
lmv_hsm_ct_register().
This avoids Oopses caused by kkuc_groups[KUC_GRP_HSM] being
uninitialized/zeroed; we rely on the CDT to retry later.
sanity-hsm/test_402b has been added to verify this.

Signed-off-by: Bruno Faccini <bruno.faccini@intel.com>
Change-Id: Ibccf2627aebe8da52128da5d90d24751394bf61d
Reviewed-on: https://review.whamcloud.com/25050
Reviewed-by: Quentin Bouget <quentin.bouget@cea.fr>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Henri Doreau <henri.doreau@cea.fr>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/obdclass/kernelcomm.c
lustre/tests/sanity-hsm.sh

index 0787f46..cb52660 100644 (file)
@@ -194,6 +194,14 @@ int libcfs_kkuc_group_put(int group, void *payload)
        ENTRY;
 
        down_write(&kg_sem);
        ENTRY;
 
        down_write(&kg_sem);
+
+       if (unlikely(kkuc_groups[group].next == NULL) ||
+           unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_CT_REGISTER_NET))) {
+               /* no agent have fully registered, CDT will retry */
+               up_write(&kg_sem);
+               RETURN(-EAGAIN);
+       }
+
        list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
                if (reg->kr_fp != NULL) {
                        rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
        list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
                if (reg->kr_fp != NULL) {
                        rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
index dd700c3..785da68 100755 (executable)
@@ -4681,7 +4681,7 @@ mdc_change_state() # facet, MDT_pattern, activate|deactivate
        done
 }
 
        done
 }
 
-test_402() {
+test_402a() {
        # make sure there is no running copytool
        copytool_cleanup
 
        # make sure there is no running copytool
        copytool_cleanup
 
@@ -4698,7 +4698,34 @@ test_402() {
        # reactivate MDCs
        mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate"
 }
        # reactivate MDCs
        mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate"
 }
-run_test 402 "Copytool start fails if all MDTs are inactive"
+run_test 402a "Copytool start fails if all MDTs are inactive"
+
+test_402b() {
+       copytool_setup
+
+       mkdir -p $DIR/$tdir
+
+       local f=$DIR/$tdir/$tfile
+       touch $f || error "touch $f failed"
+       local fid=$(path2fid $f)
+
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET       0x14d
+       do_facet $SINGLEAGT lctl set_param fail_loc=0x14d
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f
+
+       # give time for CDT to send request and to keep it for retry
+       wait_for_loop_period
+
+       wait_request_state $fid ARCHIVE WAITING
+
+       do_facet $SINGLEAGT lctl set_param fail_loc=0
+
+       # request should succeed now
+       wait_request_state $fid ARCHIVE SUCCEED
+
+       copytool_cleanup
+}
+run_test 402b "CDT must retry request upon slow start of CT"
 
 test_403() {
        [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
 
 test_403() {
        [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return