From 42e40555f250b83730d233dc5e22fd1f9396ccfe Mon Sep 17 00:00:00 2001 From: Quentin Bouget Date: Fri, 1 Dec 2017 11:41:44 +0000 Subject: [PATCH] LU-8626 hsm: expose the number of active hsm requests per type This patch creates 3 new proc files under the hsm directory: - archive_count - restore_count - remove_count These should help monitor the coordinator's health and allow policy engines to adapt their request flow. Test-Parameters: trivial testlist=sanity-hsm Signed-off-by: Quentin Bouget Change-Id: I30c9fb658e8c14a181b094b51408c92df609c3ca Reviewed-on: https://review.whamcloud.com/30336 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Henri Doreau Reviewed-by: Stephan Thiell Reviewed-by: John L. Hammond Reviewed-by: Oleg Drokin --- lustre/mdt/mdt_coordinator.c | 41 ++++++++++++++++++++ lustre/tests/sanity-hsm.sh | 90 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index 8ec7e57..08a0ed0 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -2332,6 +2332,41 @@ LPROC_SEQ_FOPS(mdt_hsm_group_request_mask); LPROC_SEQ_FOPS(mdt_hsm_other_request_mask); LPROC_SEQ_FOPS(mdt_hsm_cdt_raolu); +/* Read-only proc files for request counters */ +static int mdt_hsm_cdt_archive_count_seq_show(struct seq_file *m, void *data) +{ + struct mdt_device *mdt = m->private; + struct coordinator *cdt = &mdt->mdt_coordinator; + ENTRY; + + seq_printf(m, "%d\n", atomic_read(&cdt->cdt_archive_count)); + RETURN(0); +} + +static int mdt_hsm_cdt_restore_count_seq_show(struct seq_file *m, void *data) +{ + struct mdt_device *mdt = m->private; + struct coordinator *cdt = &mdt->mdt_coordinator; + ENTRY; + + seq_printf(m, "%d\n", atomic_read(&cdt->cdt_restore_count)); + RETURN(0); +} + +static int mdt_hsm_cdt_remove_count_seq_show(struct seq_file *m, void *data) +{ + struct mdt_device *mdt = m->private; + struct coordinator *cdt = &mdt->mdt_coordinator; + ENTRY; + + 
seq_printf(m, "%d\n", atomic_read(&cdt->cdt_remove_count)); + RETURN(0); +} + +LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_archive_count); +LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_restore_count); +LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_remove_count); + static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = { { .name = "agents", .fops = &mdt_hsm_agent_fops }, @@ -2360,5 +2395,11 @@ static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = { .fops = &mdt_hsm_other_request_mask_fops, }, { .name = "remove_archive_on_last_unlink", .fops = &mdt_hsm_cdt_raolu_fops, }, + { .name = "archive_count", + .fops = &mdt_hsm_cdt_archive_count_fops, }, + { .name = "restore_count", + .fops = &mdt_hsm_cdt_restore_count_fops, }, + { .name = "remove_count", + .fops = &mdt_hsm_cdt_remove_count_fops, }, { 0 } }; diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 0d06a86..0f8c44f 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -4789,6 +4789,96 @@ test_253() { } run_test 253 "Check for wrong file size after release" +test_254a() +{ + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.56) ] && + skip "need MDS version at least 2.10.56" && return + + # Check that the counters are initialized to 0 + local count + for request_type in archive restore remove; do + count="$(get_hsm_param ${request_type}_count)" || + error "Reading ${request_type}_count failed with $?" 
+ + [ "$count" -eq 0 ] || + error "Expected ${request_type}_count to be " \ + "0 != '$count'" + done +} +run_test 254a "Request counters are initialized to zero" + +test_254b() +{ + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.56) ] && + skip "need MDS version at least 2.10.56" && return + + # The number of requests to launch (at least 32) + local request_count=$((RANDOM % 32 + 32)) + printf "Will launch %i requests of each type\n" "$request_count" + + # Launch a copytool to process requests + copytool_setup + + # Set hsm.max_requests to allow starting all requests at the same time + stack_trap \ + "set_hsm_param max_requests $(get_hsm_param max_requests)" EXIT + set_hsm_param max_requests "$request_count" + + local timeout + local count + for request_type in archive restore remove; do + printf "Checking %s requests\n" "${request_type}" + # Suspend the copytool to give us time to read the proc files + copytool_suspend + + for ((i = 0; i < $request_count; i++)); do + case $request_type in + archive) + create_empty_file "$DIR/$tdir/$tfile-$i" \ + >/dev/null 2>&1 + ;; + restore) + lfs hsm_release "$DIR/$tdir/$tfile-$i" + ;; + esac + $LFS hsm_${request_type} "$DIR/$tdir/$tfile-$i" + done + + # Give the coordinator 10 seconds to start every request + timeout=10 + while get_hsm_param actions | grep -q WAITING; do + sleep 1 + let timeout-=1 + [ $timeout -gt 0 ] || + error "${request_type^} requests took too " \ + "long to start" + done + + count="$(get_hsm_param ${request_type}_count)" + [ "$count" -eq "$request_count" ] || + error "Expected '$request_count' (!= '$count') " \ + "active $request_type requests" + + # Let the copytool process the requests + copytool_continue + # Give it 10 seconds maximum + timeout=10 + while get_hsm_param actions | grep -q STARTED; do + sleep 1 + let timeout-=1 + [ $timeout -gt 0 ] || + error "${request_type^} requests took too " \ + "long to complete" + done + + count="$(get_hsm_param ${request_type}_count)" + [ "$count" -eq 
0 ] || + error "Expected 0 (!= '$count') " \ + "active $request_type requests" + done +} +run_test 254b "Request counters are correctly incremented and decremented" + test_300() { # the only way to test ondisk conf is to restart MDS ... echo "Stop coordinator and remove coordinator state at mount" -- 1.8.3.1