Whamcloud - gitweb
LU-8626 hsm: expose the number of active hsm requests per type 36/30336/8
authorQuentin Bouget <quentin.bouget@cea.fr>
Fri, 1 Dec 2017 11:41:44 +0000 (11:41 +0000)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 4 Jan 2018 02:48:20 +0000 (02:48 +0000)
This patch creates 3 new proc files under the hsm directory:
 - archive_count
 - restore_count
 - remove_count

These should help monitor the coordinator's health and allow
policy engines to adapt their request flow.

Test-Parameters: trivial testlist=sanity-hsm
Signed-off-by: Quentin Bouget <quentin.bouget@cea.fr>
Change-Id: I30c9fb658e8c14a181b094b51408c92df609c3ca
Reviewed-on: https://review.whamcloud.com/30336
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Henri Doreau <henri.doreau@cea.fr>
Reviewed-by: Stephan Thiell <sthiell@stanford.edu>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/mdt/mdt_coordinator.c
lustre/tests/sanity-hsm.sh

index 8ec7e57..08a0ed0 100644 (file)
@@ -2332,6 +2332,41 @@ LPROC_SEQ_FOPS(mdt_hsm_group_request_mask);
 LPROC_SEQ_FOPS(mdt_hsm_other_request_mask);
 LPROC_SEQ_FOPS(mdt_hsm_cdt_raolu);
 
+/* Read-only proc files for request counters */
+/*
+ * seq_file show handler for the "archive_count" proc entry.
+ *
+ * Reads the coordinator's cdt_archive_count atomic counter from the
+ * mdt_device stored in m->private and prints it as a decimal number
+ * followed by a newline. The "data" argument is not used.
+ *
+ * Exits through RETURN(0).
+ */
+static int mdt_hsm_cdt_archive_count_seq_show(struct seq_file *m, void *data)
+{
+       struct mdt_device *hsm_mdt = m->private;
+       int active;
+       ENTRY;
+
+       active = atomic_read(&hsm_mdt->mdt_coordinator.cdt_archive_count);
+       seq_printf(m, "%d\n", active);
+       RETURN(0);
+}
+
+/*
+ * seq_file show handler for the "restore_count" proc entry.
+ *
+ * Reads the coordinator's cdt_restore_count atomic counter from the
+ * mdt_device stored in m->private and prints it as a decimal number
+ * followed by a newline. The "data" argument is not used.
+ *
+ * Exits through RETURN(0).
+ */
+static int mdt_hsm_cdt_restore_count_seq_show(struct seq_file *m, void *data)
+{
+       struct mdt_device *hsm_mdt = m->private;
+       int active;
+       ENTRY;
+
+       active = atomic_read(&hsm_mdt->mdt_coordinator.cdt_restore_count);
+       seq_printf(m, "%d\n", active);
+       RETURN(0);
+}
+
+/*
+ * seq_file show handler for the "remove_count" proc entry.
+ *
+ * Reads the coordinator's cdt_remove_count atomic counter from the
+ * mdt_device stored in m->private and prints it as a decimal number
+ * followed by a newline. The "data" argument is not used.
+ *
+ * Exits through RETURN(0).
+ */
+static int mdt_hsm_cdt_remove_count_seq_show(struct seq_file *m, void *data)
+{
+       struct mdt_device *hsm_mdt = m->private;
+       int active;
+       ENTRY;
+
+       active = atomic_read(&hsm_mdt->mdt_coordinator.cdt_remove_count);
+       seq_printf(m, "%d\n", active);
+       RETURN(0);
+}
+
+/*
+ * Instantiate the <name>_fops objects that lprocfs_mdt_hsm_vars[] points at
+ * for the three counter files.
+ * NOTE(review): LPROC_SEQ_FOPS_RO is defined elsewhere; presumably it builds
+ * read-only file operations around the matching <name>_seq_show handler --
+ * confirm against the macro definition.
+ */
+LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_archive_count);
+LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_restore_count);
+LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_remove_count);
+
 static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = {
        { .name =       "agents",
          .fops =       &mdt_hsm_agent_fops                     },
@@ -2360,5 +2395,11 @@ static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = {
          .fops =       &mdt_hsm_other_request_mask_fops,       },
        { .name =       "remove_archive_on_last_unlink",
          .fops =       &mdt_hsm_cdt_raolu_fops,                },
+       { .name =       "archive_count",
+         .fops =       &mdt_hsm_cdt_archive_count_fops,        },
+       { .name =       "restore_count",
+         .fops =       &mdt_hsm_cdt_restore_count_fops,        },
+       { .name =       "remove_count",
+         .fops =       &mdt_hsm_cdt_remove_count_fops,         },
        { 0 }
 };
index 0d06a86..0f8c44f 100755 (executable)
@@ -4789,6 +4789,96 @@ test_253() {
 }
 run_test 253 "Check for wrong file size after release"
 
+# Check that the archive/restore/remove request counters all read 0.
+# Skipped when the MDS version is older than 2.10.56.
+test_254a()
+{
+       [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.56) ] &&
+               skip "need MDS version at least 2.10.56" && return
+
+       # Check that the counters are initialized to 0
+       local count
+       # Keep the loop variable out of the script's global scope
+       local request_type
+       for request_type in archive restore remove; do
+               # Assign separately from "local" so that the exit status of
+               # get_hsm_param is what "||" tests
+               count="$(get_hsm_param ${request_type}_count)" ||
+                       error "Reading ${request_type}_count failed with $?"
+
+               [ "$count" -eq 0 ] ||
+                       error "Expected ${request_type}_count to be " \
+                             "0 != '$count'"
+       done
+}
+run_test 254a "Request counters are initialized to zero"
+
+# Check that each per-type request counter equals the number of started
+# requests while the copytool is suspended, and reads 0 again once the
+# copytool has processed them.
+# Skipped when the MDS version is older than 2.10.56.
+test_254b()
+{
+       [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.56) ] &&
+               skip "need MDS version at least 2.10.56" && return
+
+       # The number of requests to launch (at least 32)
+       local request_count=$((RANDOM % 32 + 32))
+       printf "Will launch %i requests of each type\n" "$request_count"
+
+       # Launch a copytool to process requests
+       copytool_setup
+
+       # Set hsm.max_requests to allow starting all requests at the same time
+       # (the current value is captured now and restored on exit)
+       stack_trap \
+               "set_hsm_param max_requests $(get_hsm_param max_requests)" EXIT
+       set_hsm_param max_requests "$request_count"
+
+       local timeout
+       local count
+       # Keep the loop variables out of the script's global scope
+       local request_type
+       local i
+       for request_type in archive restore remove; do
+               printf "Checking %s requests\n" "${request_type}"
+               # Suspend the copytool to give us time to read the proc files
+               copytool_suspend
+
+               for ((i = 0; i < $request_count; i++)); do
+                       # Per-type preparation; "remove" has none and reuses
+                       # the files handled by the previous iterations
+                       case $request_type in
+                       archive)
+                               create_empty_file "$DIR/$tdir/$tfile-$i" \
+                                       >/dev/null 2>&1
+                               ;;
+                       restore)
+                               # Use $LFS, like the request below, so both
+                               # commands run the same lfs binary
+                               $LFS hsm_release "$DIR/$tdir/$tfile-$i"
+                               ;;
+                       esac
+                       $LFS hsm_${request_type} "$DIR/$tdir/$tfile-$i"
+               done
+
+               # Give the coordinator 10 seconds to start every request
+               timeout=10
+               while get_hsm_param actions | grep -q WAITING; do
+                       sleep 1
+                       let timeout-=1
+                       [ $timeout -gt 0 ] ||
+                               error "${request_type^} requests took too " \
+                                     "long to start"
+               done
+
+               count="$(get_hsm_param ${request_type}_count)"
+               [ "$count" -eq "$request_count" ] ||
+                       error "Expected '$request_count' (!= '$count') " \
+                             "active $request_type requests"
+
+               # Let the copytool process the requests
+               copytool_continue
+               # Give it 10 seconds maximum
+               timeout=10
+               while get_hsm_param actions | grep -q STARTED; do
+                       sleep 1
+                       let timeout-=1
+                       [ $timeout -gt 0 ] ||
+                               error "${request_type^} requests took too " \
+                                     "long to complete"
+               done
+
+               count="$(get_hsm_param ${request_type}_count)"
+               [ "$count" -eq 0 ] ||
+                       error "Expected 0 (!= '$count') " \
+                             "active $request_type requests"
+       done
+}
+run_test 254b "Request counters are correctly incremented and decremented"
+
 test_300() {
        # the only way to test ondisk conf is to restart MDS ...
        echo "Stop coordinator and remove coordinator state at mount"