Whamcloud - gitweb
LU-13116 mgc: do not lose sptlrpc config lock 44/37344/9
authorSebastien Buisson <sbuisson@ddn.com>
Tue, 28 Jan 2020 12:00:49 +0000 (21:00 +0900)
committerOleg Drokin <green@whamcloud.com>
Tue, 17 Mar 2020 03:40:58 +0000 (03:40 +0000)
Multiple targets on the same node can make use of the same lock on
sptlrpc config log. So if a target is being stopped, sptlrpc
config log should not be directly marked as stopping.

Fixes: 0ad54d5977 ("LU-11185 mgc: config lock leak")
Signed-off-by: Sebastien Buisson <sbuisson@ddn.com>
Change-Id: I09c2b0c276daadb66721df88c33f734ebba86114
Reviewed-on: https://review.whamcloud.com/37344
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexander Boyko <c17825@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/mgc/mgc_request.c
lustre/tests/recovery-small.sh
lustre/tests/test-framework.sh

index b042c7f..f827fba 100644 (file)
@@ -156,8 +156,10 @@ static void config_log_put(struct config_llog_data *cld)
                config_log_put(cld->cld_params);
                config_log_put(cld->cld_nodemap);
                config_log_put(cld->cld_sptlrpc);
-               if (cld_is_sptlrpc(cld))
+               if (cld_is_sptlrpc(cld)) {
+                       cld->cld_stopping = 1;
                        sptlrpc_conf_log_stop(cld->cld_logname);
+               }
 
                class_export_put(cld->cld_mgcexp);
                OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
@@ -519,16 +521,17 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg)
        mutex_unlock(&cld->cld_lock);
 
        config_mark_cld_stop(cld_recover);
+       config_log_put(cld_recover);
        config_mark_cld_stop(cld_params);
-       config_mark_cld_stop(cld_barrier);
-       config_mark_cld_stop(cld_sptlrpc);
-
        config_log_put(cld_params);
-       config_log_put(cld_recover);
-       /* don't set cld_stopping on nm lock as other targets may be active */
-       config_log_put(cld_nodemap);
+       config_mark_cld_stop(cld_barrier);
        config_log_put(cld_barrier);
+       /* don't explicitly set cld_stopping on sptlrpc lock here, as other
+        * targets may be active, it will be done in config_log_put if necessary
+        */
        config_log_put(cld_sptlrpc);
+       /* don't set cld_stopping on nm lock as other targets may be active */
+       config_log_put(cld_nodemap);
 
        /* drop the ref from the find */
        config_log_put(cld);
index 0b9285f..972441c 100755 (executable)
@@ -2978,6 +2978,29 @@ test_140b() {
 }
 run_test 140b "local mount is excluded from recovery"
 
+test_141() {
+       local oldc
+       local newc
+
+       [ $PARALLEL == "yes" ] && skip "skip parallel run"
+       combined_mgs_mds || skip "needs combined MGS/MDT"
+       ( local_mode || from_build_tree ) &&
+               skip "cannot run in local mode or from build tree"
+
+       # some get_param have a bug to handle dot in param name
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) cancel_lru_locks MGC
+       oldc=$(do_facet $SINGLEMDS $LCTL get_param -n \
+               'ldlm.namespaces.MGC*.lock_count')
+       fail $SINGLEMDS
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) cancel_lru_locks MGC
+       newc=$(do_facet $SINGLEMDS $LCTL get_param -n \
+               'ldlm.namespaces.MGC*.lock_count')
+
+       [ $oldc -eq $newc ] || error "mgc lost locks ($oldc != $newc)"
+       return 0
+}
+run_test 141 "do not lose locks on MGS restart"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
index 0ff2a81..ba23673 100755 (executable)
@@ -972,6 +972,22 @@ add_sk_mntflag() {
        echo -n $mt_opts
 }
 
+from_build_tree() {
+       local from_tree
+
+       case $LUSTRE in
+       /usr/lib/lustre/* | /usr/lib64/lustre/* | /usr/lib/lustre | \
+       /usr/lib64/lustre )
+               from_tree=false
+               ;;
+       *)
+               from_tree=true
+               ;;
+       esac
+
+       [ $from_tree = true ]
+}
+
 init_gss() {
        if $SHARED_KEY; then
                GSS=true
@@ -982,16 +998,6 @@ init_gss() {
                return
        fi
 
-       case $LUSTRE in
-       /usr/lib/lustre/* | /usr/lib64/lustre/* | /usr/lib/lustre | \
-       /usr/lib64/lustre )
-               from_build_tree=false
-               ;;
-       *)
-               from_build_tree=true
-               ;;
-       esac
-
        if ! module_loaded ptlrpc_gss; then
                load_module ptlrpc/gss/ptlrpc_gss
                module_loaded ptlrpc_gss ||
@@ -1010,7 +1016,7 @@ init_gss() {
                SK_NO_KEY=false
                local lgssc_conf_file="/etc/request-key.d/lgssc.conf"
 
-               if $from_build_tree; then
+               if from_build_tree; then
                        mkdir -p $SK_OM_PATH
                        if grep -q request-key /proc/mounts > /dev/null; then
                                echo "SSK: Request key already mounted."
@@ -1029,7 +1035,7 @@ init_gss() {
                cat $lgssc_conf_file
 
                if ! local_mode; then
-                       if $from_build_tree; then
+                       if from_build_tree; then
                                do_nodes $(comma_list $(all_nodes)) "mkdir -p \
                                        $SK_OM_PATH"
                                do_nodes $(comma_list $(all_nodes)) "mount \
@@ -1115,7 +1121,7 @@ init_gss() {
        fi
 
        if [ -n "$LGSS_KEYRING_DEBUG" ] && \
-              ( local_mode || $from_build_tree ); then
+              ( local_mode || from_build_tree ); then
                lctl set_param -n \
                     sptlrpc.gss.lgss_keyring.debug_level=$LGSS_KEYRING_DEBUG
        elif [ -n "$LGSS_KEYRING_DEBUG" ]; then
@@ -1134,16 +1140,6 @@ cleanup_gss() {
 
 cleanup_sk() {
        if $GSS_SK; then
-               case $LUSTRE in
-               /usr/lib/lustre/* | /usr/lib64/lustre/* | /usr/lib/lustre | \
-               /usr/lib64/lustre )
-                       from_build_tree=false
-                       ;;
-               *)
-                       from_build_tree=true
-                       ;;
-               esac
-
                if $SK_S2S; then
                        do_node $(mgs_node) "$LCTL nodemap_del $SK_S2SNM"
                        do_node $(mgs_node) "$LCTL nodemap_del $SK_S2SNMCLI"
@@ -1156,7 +1152,7 @@ cleanup_sk() {
                        $SK_PATH/$FSNAME*.key $SK_PATH/nodemap/$FSNAME*.key"
                do_nodes $(comma_list $(all_nodes)) "keyctl show | \
                  awk '/lustre/ { print \\\$1 }' | xargs -IX keyctl unlink X"
-               if $from_build_tree; then
+               if from_build_tree; then
                        # Remove the mount and clean up the files we added to
                        # SK_PATH
                        do_nodes $(comma_list $(all_nodes)) "while grep -q \