From 111451ea3ae28f18335b0bf1539488a913886026 Mon Sep 17 00:00:00 2001
From: Patrick Farrell <pfarrell@whamcloud.com>
Date: Mon, 18 Sep 2023 17:12:46 -0400
Subject: [PATCH] EX-7795 scripts: add whole-file mode to compression scan

Add a whole-file mode (-w) to the compression scan script, in
which it compresses the entire file rather than a fixed number
of samples.  In theory this should match the compression
results from CSDC exactly, allowing a test to calculate the
exact space usage reduction expected from using CSDC.

This is intended to be used mostly for testing.

Change the help text slightly to make clear that the script
also accepts a path to a single file.

Test-Parameters: trivial
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I606a33d686d87dd631bf5b33dc85ee8c24fe9f67
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52406
Tested-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
---
 lustre/scripts/ll_compression_scan | 60 +++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/lustre/scripts/ll_compression_scan b/lustre/scripts/ll_compression_scan
index 5271f39..ffd0bf9 100755
--- a/lustre/scripts/ll_compression_scan
+++ b/lustre/scripts/ll_compression_scan
@@ -30,16 +30,17 @@ chunk_size=65536
 block_size=4096
 sample_count=20
 min_files=100
-default_directory="$(pwd)"
+default_path="$(pwd)"
 percentage=1
 compression_type="gzip"
 compression_level=6
+whole_file="false"
 quiet=0
 
 # Display description of script behavior
 description()
 {
-	echo "Recursively scan DIRECTORY "
+	echo "Recursively scan PATH "
 	echo "sampling data from the first MIN_FILES "
 	echo "then sampling data from PERCENTAGE% of remaining files "
 	echo "to estimate the average compression ratio using "
@@ -58,7 +59,7 @@ description()
 
 runtime_description()
 {
-	echo "Recursively scan '$directory',"
+	echo "Recursively scan '$path',"
 	if (( percentage < 100 )); then
 		(( min_files > 1 )) && echo "sampling data from the first $min_files files "
 		echo "then sampling data from $percentage% of remaining files "
@@ -89,8 +90,9 @@ runtime_description()
 usage() {
 cat <<- USAGE
 Usage: $(basename $0) [-n MIN_FILES] [-p PERCENTAGE] [-s SAMPLE_COUNT]
-	[-c CHUNK_SIZE] [-z COMPRESSION_TYPE] [-l COMPRESSION_LEVEL] [-h][-q]
-	[DIRECTORY ...]
+	[-c CHUNK_SIZE] [-z COMPRESSION_TYPE] [-l COMPRESSION_LEVEL]
+	[-h][-w][-q]
+	[PATH ...]
 
 Description:
 $(description | fmt)
@@ -102,13 +104,14 @@ Arguments:
     -c CHUNK_SIZE: Size of data chunk in kibibytes (64-4096). Default: $((chunk_size / 1024))KiB.
     -z COMPRESSION_TYPE: One of gzip, lz4, lz4fast, lzo. Default: $compression_type.
     -l COMPRESSION_LEVEL: Compression level to use (1-9). Default: $compression_level.
+    -w Sample whole file (override -s). With '-p 100' for a full but slow estimate.
     -q Skip printing of usage header.  -qq to also skip runtime status update.
     -h Print this help message.
 USAGE
 }
 
 # Parse command-line options
-while getopts "c:s:n:p:z:Z:l:qh" opt; do
+while getopts "c:s:n:p:z:Z:l:wqh" opt; do
 	case $opt in
 	c)
 		if (( OPTARG & (OPTARG - 1) )); then
@@ -158,6 +161,9 @@ while getopts "c:s:n:p:z:Z:l:qh" opt; do
 	l)
 		compression_level=$OPTARG
 		;;
+	w)
+		whole_file="true"
+		;;
 	h)
 		usage
 		exit 0
@@ -178,14 +184,14 @@ if [[ $compression_level -gt 9 && $compression_type != "lz4" ]]; then
 	exit 2
 fi
 
-directory_provided=false
+path_provided=false
 compress="$compression_type -q -$compression_level"
 shift $((OPTIND - 1))
 if [[ -z "$@" ]]; then
-	directory=$default_directory
+	path=$default_path
 else
-	directory="$@"
-	directory_provided=true
+	path="$@"
+	path_provided=true
 	shift
 fi
 
@@ -216,17 +222,23 @@ process_file() {
 		return
 	fi
 
-
-	# Calculate the segment size for the file
-	local segment_size=$((file_size / sample_count))
-
-	# Limit sample_count for small file size, but have at least one chunk
-	if ((sample_count * chunk_size > file_size)); then
+	local segment_size
+	if [[ $whole_file == "true" ]]; then
+		segment_size=$chunk_size
 		sample_count=$((file_size / chunk_size))
-		if ((sample_count == 0)); then
-			sample_count=1
+	else
+		# Calculate the segment size for the file
+		segment_size=$((file_size / sample_count))
+
+		# Limit sample_count for small file size, but have at least
+		# one chunk
+		if ((sample_count * chunk_size > file_size)); then
+			sample_count=$((file_size / chunk_size))
 		fi
 	fi
+	if ((sample_count == 0)); then
+		sample_count=1
+	fi
 
 	# Round up the file_size to the next block (actual space usage)
 	file_size=$(round_to_block_size file_size)
@@ -326,10 +338,10 @@ total_file_count=0
 last=$SECONDS
 
 echo ""
-if [ "$directory_provided" = true ]; then
-	echo "Scanning $directory."
+if [ "$path_provided" = true ]; then
+	echo "Scanning $path."
 else
-	echo "Scanning current directory, $directory."
+	echo "Scanning current directory, $path."
 fi
 echo ""
 echo ""
@@ -357,13 +369,13 @@ while read FILE; do
 		fi
 		last=$SECONDS
 	fi
-done < <(find $directory -type f -print)
+done < <(find $path -type f -print)
 
 (( total_file_count == 0 )) &&
-	echo "error: no files found in '$directory' to compress" 1>&2 &&
+	echo "error: no files found in '$path' to compress" 1>&2 &&
 	exit 10
 (( total_uncompressed_size == 0 )) &&
-	echo "error: only zero-length files found in '$directory'" 1>&2 &&
+	echo "error: only zero-length files found in '$path'" 1>&2 &&
 	exit 11
 
 echo ""
-- 
1.8.3.1