From: Andreas Dilger
Date: Sat, 27 Jan 2024 20:08:33 +0000 (-0800)
Subject: EX-8362 scripts: improve ll_compression_scan estimate
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=21c73312a345673e108beedbad1fa2a1d7a8dbff;p=fs%2Flustre-release.git

EX-8362 scripts: improve ll_compression_scan estimate

Improve ll_compression_scan script to give a better estimate of
actual compression ratios.
- add a '-d' debug option for verbose output during testing
- log and report incompressible small files < 4096
- log and report incompressible file count and size
- include small/incompressible/large files in compression estimate
- add a correction factor to calculations for safety margin

Change-Id: If561b0273e38e4821de228c81291859c7bb1a0d2
Test-Parameters: trivial testlist=sanity-compr env=ONLY=1007,ONLY_REPEAT=10
Signed-off-by: Andreas Dilger
Signed-off-by: Jian Yu
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53824
Tested-by: jenkins
Tested-by: Maloo
---

diff --git a/lustre/scripts/ll_compression_scan b/lustre/scripts/ll_compression_scan
index af832a1..57833f1 100755
--- a/lustre/scripts/ll_compression_scan
+++ b/lustre/scripts/ll_compression_scan
@@ -35,7 +35,9 @@ default_path="$(pwd)"
 percentage=1
 compression_type="gzip -"
 compression_level=6
+compression_margin=5
 whole_file="false"
+debug=0
 quiet=0
 
 # Display description of script behavior
@@ -112,7 +114,7 @@ USAGE
 }
 
 # Parse command-line options
-while getopts "c:s:n:p:z:Z:l:wqh" opt; do
+while getopts "c:ds:n:p:z:Z:l:m:wqh" opt; do
 	case $opt in
 	c)
 		if (( OPTARG & (OPTARG - 1) )); then
@@ -125,6 +127,9 @@ while getopts "c:ds:n:p:z:Z:l:wqh" opt; do
 		fi
 		chunk_size=$((OPTARG *= 1024))
 		;;
+	d)
+		((debug += 1))
+		;;
 	s)
 		sample_count=$OPTARG
 		;;
@@ -163,6 +168,13 @@ while getopts "c:ds:n:p:z:Z:l:wqh" opt; do
 	l)
 		compression_level=$OPTARG
 		;;
+	m)
+		if (( OPTARG < 0 || OPTARG > 100 )); then
+			echo "Compression margin must be between 0 and 100" 1>&2
+			exit 1
+		fi
+		compression_margin=$OPTARG
+		;;
 	w)
 		whole_file="true"
 		;;
@@ -199,19 +211,27 @@ fi
 export total_file_size=0
 export total_uncompressed_size=0
 export total_compressed_size=0
-export total_files_scanned=0
-export total_empty_files=0
-export total_uncompressed_size_estimated=0
+export total_files_sampled=0
+export total_small_files=0
+export total_incompressible_files=0
+export total_incompressible_size=0
+export total_uncompressed_size_sampled=0
 export total_compressed_size_estimated=0
 
 round_to_block_size() {
-	local size=$1
+	local size=$*
 
 	echo $(( ((size - 1) | (block_size - 1)) + 1 ))
 }
 
-export format="--format=%s"
-[[ $(uname) != "Darwin" ]] || format="-f %z"
+round_to_chunk_size() {
+	local size=$*
+
+	echo $(( ((size - 1) | (chunk_size - 1)) + 1 ))
+}
+
+export format="--format=%b*%B"
+[[ $(uname) != "Darwin" ]] || format="-f %b*512"
 # Function to process a file
 process_file() {
 	local file="$1"
@@ -219,96 +239,121 @@ process_file() {
 	local sum_uncompressed_chunk=0
 	local sum_compressed_chunk=0
 
-	total_files_scanned=$((total_files_scanned + 1))
-
-	if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
-		total_empty_files=$((total_empty_files + 1))
-		return
-	fi
-
-	local segment_size
-	if [[ $whole_file == "true" ]]; then
-		segment_size=$chunk_size
-		sample_count=$((file_size / chunk_size))
+	# Round up the file_size to the next block (actual space usage)
+	file_size=$(round_to_block_size $file_size)
+	# Accumulate total size of files scanned (in block_size multiples)
+	total_file_size=$((total_file_size + file_size))
+	((total_file_count+= 1))
+
+	# always count incompressible files, in case this is a large fraction
+	if [[ -z "$file_size" ]] || (( file_size <= block_size )); then
+		((total_small_files+= 1))
+		((total_files_sampled+= 1))
+		sum_uncompressed_chunk=$file_size
+		sum_compressed_chunk=$file_size
+		estimated_compressed_file_size=$file_size
 	else
-		# Calculate the segment size for the file
-		segment_size=$((file_size / sample_count))
+		# randomly select $percentage of files after sampling min_files,
+		# unless file is larger than average of files checked so far
+		local average=$((total_file_size / ${total_files_sampled/#0/1}))
+		if (( total_files_sampled > min_files &&
+		      file_size < 2 * average )); then
+			(( RANDOM % 100 < percentage )) || return
+		elif (( total_files_sampled > min_files && debug > 0 )); then
+			echo -n "***"
+		fi
+
+		((total_files_sampled+= 1))
 
-		# Limit sample_count for small file size, but have at least
-		# one chunk
-		if ((sample_count * chunk_size > file_size)); then
-			sample_count=$((file_size / chunk_size))
+		local segment_size
+		if [[ $whole_file == "true" ]] ||
+		   (( file_size < chunk_size * sample_count )); then
+			segment_size=$chunk_size
+			segment_count=$(($(round_to_chunk_size file_size) /
+					 chunk_size))
+		else
+			# Calculate the segment size for the file
+			segment_size=$((file_size / sample_count))
+			segment_count=$sample_count
 		fi
-	fi
-	if ((sample_count == 0)); then
-		sample_count=1
-	fi
-	# Round up the file_size to the next block (actual space usage)
-	file_size=$(round_to_block_size file_size)
-	# Accumulate the total size of files scanned (in block_size units)
-	total_file_size=$((total_file_size + file_size))
 
+		(( debug < 1 )) ||
+			echo -n "$(basename $file): size: $file_size "
+		(( debug < 2 )) ||
+			echo -n "segs: $segment_count segsz: $segment_size "
 
-	# Read and process each segment
-	for ((i = 0; i < sample_count; i++)); do
-		offset=$((i * segment_size / chunk_size))
-		compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+		# Read and process each segment
+		for ((i = 0; i < segment_count; i++)); do
+			offset=$((i * segment_size / chunk_size))
+			compressed_size=$(dd if="$file" bs=$chunk_size count=1 \
+				skip=$offset 2>/dev/null | $compress | wc -c)
 
-		# if the compressed size is zero, something must have failed
-		(( compressed_size > 0 )) || continue
+			# if compressed size is zero, something must have failed
+			(( compressed_size > 0 )) || continue
 
-		# Round up compressed size to full block size
-		compressed_size=$(round_to_block_size compressed_size)
+			# Round up compressed size to full block size
+			compressed_size=$(round_to_block_size compressed_size)
 
-		# Incompressible chunks will not be compressed
-		(( compressed_size < chunk_size )) || compressed_size=$chunk_size
+			# Incompressible chunks will not be compressed
+			(( compressed_size <= chunk_size )) ||
+				compressed_size=$chunk_size
 
-		# Accumulate sampled chunk byte counts, but don't inflate size
-		sum_uncompressed_chunk=$((sum_uncompressed_chunk +
-					  (chunk_size < file_size ?
-					   chunk_size : file_size) ))
-		sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+			# Add sampled chunk bytes, but don't inflate last chunk
+			last_chunk=$((file_size - offset * chunk_size ))
+			(( last_chunk > chunk_size )) && last_chunk=$chunk_size
 
-	done
+			((sum_uncompressed_chunk+= last_chunk ))
+			((sum_compressed_chunk+= compressed_size))
+		done
 
-	# Get current ratio for this file
-	current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
-	# Assume ratio will be the same for the entire file
-	estimated_compressed_file_size=$(( file_size * 100 / current_ratio))
+		# Get current ratio for this file
+		current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
+		# Assume compression ratio will be the same for the entire file
+		estimated_compressed_file_size=$((file_size * 100 / current_ratio))
+
+		(( debug < 1 )) ||
+			echo "uncompr: $sum_uncompressed_chunk compr: $sum_compressed_chunk est: $estimated_compressed_file_size avg: $average"
+	fi
+
+	if ((sum_compressed_chunk >= sum_uncompressed_chunk)); then
+		((total_incompressible_files+= 1))
+		((total_incompressible_size+= file_size))
+	fi
 
 	# Accumulate the total uncompressed and compressed byte counts
-	total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
-	total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+	((total_uncompressed_size+= sum_uncompressed_chunk))
+	((total_compressed_size+= sum_compressed_chunk))
 	# Accumulate the estimated uncompressed and compressed byte counts
-	total_uncompressed_size_estimated=$((total_uncompressed_size_estimated +
-					     file_size))
-	total_compressed_size_estimated=$((total_compressed_size_estimated +
-					   estimated_compressed_file_size))
+	((total_uncompressed_size_sampled+= file_size))
+	((total_compressed_size_estimated+= estimated_compressed_file_size))
 }
 
-# Calculate compression ratio from estimated compressed file (value > 1)
-calculate_estimated_ratio() {
-	local ratio=$((total_uncompressed_size_estimated * 100 /
-		       total_compressed_size_estimated))
+# Calculate compression ratio of real compressed chunks vs original (value >= 1)
+calculate_ratio() {
+	local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
 
 	printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
 }
 
-# Calculate percentage of compressed size compared to original size (1-100%)
-calculate_pct() {
-	local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+# add correction factor for estimate safety margin with low sample percentage
+(( compression_margin == 0 )) && correction=100 ||
+	correction=$((100 + compression_margin + 10 * (100 - percentage) / 100))
 
-	printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+# Calculate compression ratio from estimated compressed file size (value >= 1)
+calculate_estimated_ratio() {
+	local ratio=$((total_uncompressed_size_sampled * 100 * 100 /
+		       (total_compressed_size_estimated * correction)))
+
+	printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
 }
 
 # Calculate estimated compressed size of all files using the ratio from our
 # sample data
 calculate_estimated_total_compressed_size() {
-	local ratio=$1
-
-	printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio_estimated" | bc)
+	echo $((total_file_size * total_compressed_size_estimated * correction /
+		(total_uncompressed_size_sampled * 100)))
 }
 
 print_size() {
@@ -316,16 +361,16 @@ print_size() {
 	local frac
 	local unit
 
-	if (( size > 9 * 2**50 )); then
+	if (( size > 4 * 2**50 )); then
 		frac=$((size / 2**40))
 		unit="PiB"
-	elif (( size > 9 * 2**40 )); then
+	elif (( size > 4 * 2**40 )); then
 		frac=$((size / 2**30))
 		unit="TiB"
-	elif (( size > 9 * 2**30 )); then
+	elif (( size > 4 * 2**30 )); then
 		frac=$((size / 2**20))
 		unit="GiB"
-	elif (( size > 9 * 2**20 )); then
+	elif (( size > 4 * 2**20 )); then
 		frac=$((size / 2**10))
 		unit="MiB"
 	else
@@ -333,7 +378,7 @@ print_size() {
 		unit="KiB"
 	fi
 
-	printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+	printf "%u.%03u $unit" $((frac / 1024)) $((frac % 1024))
 }
 
 (( quiet == 0 )) && runtime_description | fmt
@@ -362,25 +407,16 @@ echo ""
 echo ""
 
 while read FILE; do
-	total_file_count=$((total_file_count + 1))
-	# randomly select $percentage of files after sampling min_files
-	if (( total_files_scanned > min_files )); then
-		(( RANDOM % 100 < percentage )) || continue
-	fi
-
-	### NOPE, you're not summing file size correctly imo
-	# You need to check the size of all the files not just the ones you're sampling
-	# oops
 	process_file "$FILE"
 
 	if (( quiet < 2 &&
 	      ((min_files > 1 && total_files_scanned == min_files) ||
-	       total_files_scanned % lines == 0 ||
+	       total_files_sampled % lines == 0 ||
 	       last + interval < SECONDS) )); then
-		if ((total_files_scanned != total_file_count)); then
-			echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+		if ((total_files_sampled != total_file_count)); then
+			echo -ne "${cr}Sampled $total_files_sampled/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
 		else
-			echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+			echo -ne "${cr}Sampled $total_files_sampled files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
 		fi
 		last=$SECONDS
 	fi
@@ -404,19 +440,17 @@ if (( quiet == 0 )); then
 	echo "Results"
 	echo "---------------------"
 fi
-echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
-#echo "Number of zero-length files: $total_empty_files"
-echo "Total size of files sampled: $(print_size $total_file_size)"
+
+echo "Compression type: ${compression_type/ -*/} Level: $compression_level"
+echo "Chunk size: $chunk_size"
+echo "Number of files sampled: $total_files_sampled ($((total_files_sampled * 100 / total_file_count))% of $total_file_count total files)"
+echo "Number of files under $block_size bytes (incompressible): $total_small_files"
+echo "Total number of incompressible files: $total_incompressible_files"
+echo "Total size of incompressible files: $(print_size $total_incompressible_size)"
+echo "Total size of files scanned: $(print_size $total_file_size)"
 echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
 echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
-echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
-compression_ratio_estimated=$(calculate_estimated_ratio)
-echo "Estimated compression ratio of sampled files: ${compression_ratio_estimated}x"
-if (( total_files_scanned < total_file_count )); then
-	size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
-	echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
-else
-	size_of_all_files=$total_file_size
-fi
-estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $ratio)
+echo "Compression ratio of sampled data: $(calculate_ratio)"
+echo "Estimated compression ratio of sampled files: $(calculate_estimated_ratio)"
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size)
 echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"

diff --git a/lustre/tests/sanity-compr.sh b/lustre/tests/sanity-compr.sh
index ce06081..7055637 100644
--- a/lustre/tests/sanity-compr.sh
+++ b/lustre/tests/sanity-compr.sh
@@ -982,12 +982,14 @@ test_1007() {
 	# Sync to disk and drop cache
 	sync; echo 3 > /proc/sys/vm/drop_caches
 
-	local scan_cmd="ll_compression_scan -w -q -z $compr_type"
+	local scan_cmd="ll_compression_scan -m 0 -w -q -z $compr_type"
 	[[ -z $has_level ]] || scan_cmd+=" -l $compr_level"
 	scan_cmd+=" -c $chunksize"
 
 	local estimated_size=$($scan_cmd $source |
-		awk '/Estimated compressed size/{print $7}')
+		awk '/Estimated compressed size/ { print $7 }')
+	[[ -n "$estimated_size" ]] || error "no compression estimate"
+	estimated_size=$(bc -l <<< "$estimated_size * 1024")
 	local csdc_size=$(du -sk $tf | awk '{print $1}')
 	local margin=5