EX-7681 scripts: Compression estimate script
author    Patrick Farrell <pfarrell@whamcloud.com>
          Thu, 15 Jun 2023 18:49:56 +0000 (14:49 -0400)
committer Andreas Dilger <adilger@whamcloud.com>
          Mon, 18 Sep 2023 06:27:03 +0000 (06:27 +0000)
ll_compression_scan is a simple tool which can be run on any
Linux system to estimate the space usage reduction from the
Lustre Client Side Data Compression (CSDC) feature with
particular compression settings (algorithm, chunk size,
and compression level).

When run on one or more directories, it will recursively
examine a percentage of files under that directory, sampling
data in those files to estimate how the files will compress.

This tool samples data throughout each file, so it should
avoid poor estimates for files with headers that differ
from the bulk data in the file.

However, if the directory tree is particularly imbalanced,
with a few large incompressible files in one directory and
many small files in other directories, then scanning a small
percentage of files may give a misleading compression estimate.
Sampling a larger percentage of files will improve the estimate.

This tool requires the lz4, lzop, and gzip utilities to
be installed in order to test those compression types.
(lzop is the command line utility for lzo compression)
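
As a rough illustration (option names are those defined in the
script's usage output; the path is a placeholder), a scan that
samples 5% of files using lz4 with 128KiB chunks might be
invoked as:

    ll_compression_scan -p 5 -z lz4 -c 128 /path/to/dataset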

Test-Parameters: trivial
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I092f9608553eba10bacfcc3c4a3fafc9a454c287
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51333
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Colin Faber <cfaber@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre.spec.in
lustre/scripts/Makefile.am
lustre/scripts/ll_compression_scan [new file with mode: 0755]

index 16ea674..99d2faf 100644 (file)
@@ -938,6 +938,7 @@ fi
 %{_bindir}/lustre_req_history
 %{_bindir}/remove_changelog
 %{_bindir}/remove_updatelog
+%{_bindir}/ll_compression_scan
 %endif
 
 %{_bindir}/llobdstat
index bfb6ce9..cc516c9 100644 (file)
@@ -62,6 +62,7 @@ bin_SCRIPTS   = lfs_migrate
 if SERVER
 sbin_SCRIPTS += $(genscripts) lc_mon lhbadm lc_servip
 bin_SCRIPTS  += lustre_req_history remove_changelog remove_updatelog
+bin_SCRIPTS  += ll_compression_scan
 
 hadir = $(sysconfdir)/ha.d/resource.d
 ha_SCRIPTS = Lustre.ha_v2
@@ -90,7 +91,7 @@ EXTRA_DIST = license-status lustre_rmmod ldev lc_mon lhbadm \
             lustre lsvcgss lc_common haconfig Lustre.ha_v2 dkms.mkconf \
             ko2iblnd-probe ksocklnd-config statechange-lustre.sh \
             bash-completion/lustre bash-completion/lctl bash-completion/lfs \
-            umount.lustre remove_changelog remove_updatelog
+            umount.lustre remove_changelog remove_updatelog ll_compression_scan
 
 CLEANFILES = $(genscripts)
 
diff --git a/lustre/scripts/ll_compression_scan b/lustre/scripts/ll_compression_scan
new file mode 100755 (executable)
index 0000000..5271f39
--- /dev/null
@@ -0,0 +1,395 @@
+#!/bin/bash
+
+# Copyright (c) 2023 DataDirect Networks, Inc.
+# Authors: Patrick Farrell, Andreas Dilger
+#
+# This is a simple tool which can be run on any Linux
+# system to estimate the space usage reduction from the
+# Lustre Client Side Data Compression (CSDC) feature with
+# particular compression settings (algorithm, chunk size,
+# and compression level).
+#
+# When run in a directory, this tool will recursively
+# examine files under that directory, sampling the data in
+# those files to estimate how much the files will compress.
+#
+# This tool will sample all files up to a configured number
+# (defaulting to 100 files) and after that, it samples a
+# configurable % of remaining files.
+#
+# This tool samples data throughout the file, so it should
+# avoid problems with poor estimates for files with headers
+# which differ from the bulk data in the file.
+#
+# This tool requires the lz4, lzop, and gzip utilities to
+# be installed in order to test those compression types.
+# (lzop is the command line utility for lzo compression)
+
+# Default values
+chunk_size=65536
+block_size=4096
+sample_count=20
+min_files=100
+default_directory="$(pwd)"
+percentage=1
+compression_type="gzip"
+compression_level=6
+quiet=0
+
+# Display description of script behavior
+description()
+{
+       echo "Recursively scan DIRECTORY "
+       echo "sampling data from the first MIN_FILES "
+       echo "then sampling data from PERCENTAGE% of remaining files "
+       echo "to estimate the average compression ratio using "
+       echo "COMPRESSION_TYPE level COMPRESSION_LEVEL"
+       echo "and a chunk size of CHUNK_SIZE bytes."
+       echo ""
+       echo "This tool takes SAMPLE_COUNT samples of CHUNK_SIZE bytes from"
+       echo "each sampled file and compresses that data with the selected"
+       echo "parameters to generate an estimate of the compression ratio for"
+       echo "the full dataset."
+       echo ""
+       echo "You can trade-off estimation accuracy and scan speed by adjusting"
+       echo "the per file sample count and percentage of files to sample."
+       echo ""
+}
+
+runtime_description()
+{
+       echo "Recursively scan '$directory',"
+       if (( percentage < 100 )); then
+               (( min_files > 1 )) && echo "sampling data from the first $min_files files "
+               echo "then sampling data from $percentage% of remaining files "
+       fi
+       echo "to estimate the average compression ratio using "
+       echo "$compression_type level $compression_level "
+       echo "and a chunk size of $chunk_size bytes."
+       echo ""
+       echo "Run with -h to see options for these parameters."
+       echo ""
+       echo "This tool takes $sample_count samples of $chunk_size bytes from "
+       echo "each sampled file and compresses that data with the selected "
+       echo "parameters to generate an estimate of the compression ratio for "
+       echo "the full dataset."
+       echo ""
+       echo "You can trade-off estimation accuracy and scan speed by adjusting"
+       echo "the per file sample count and percentage of files to sample."
+       echo ""
+       echo "This tool assumes a relatively uniform distribution of file "
+       echo "sizes and contents across the directory tree, and is only "
+       echo "intended to provide an approximate estimate of the compression "
+       echo "potential of a specific dataset, and does not guarantee a "
+       echo "particular compression level."
+       echo ""
+}
+
+# Function to display script usage
+usage() {
+cat <<- USAGE
+Usage: $(basename $0) [-n MIN_FILES] [-p PERCENTAGE] [-s SAMPLE_COUNT]
+       [-c CHUNK_SIZE] [-z COMPRESSION_TYPE] [-l COMPRESSION_LEVEL] [-h][-q]
+       [DIRECTORY ...]
+
+Description:
+$(description | fmt)
+
+Arguments:
+    -n MIN_FILES: Minimum number of files to scan. Default: $min_files.
+    -p PERCENTAGE: Fraction of scanned files to process. Default: ${percentage}%.
+    -s SAMPLE_COUNT: Maximum number of chunks to sample per file. Default: $sample_count.
+    -c CHUNK_SIZE: Size of data chunk in kibibytes (64-4096). Default: $((chunk_size / 1024))KiB.
+    -z COMPRESSION_TYPE: One of gzip, lz4, lz4fast, lzo. Default: $compression_type.
+    -l COMPRESSION_LEVEL: Compression level to use (1-9). Default: $compression_level.
+    -q Skip printing of usage header.  -qq to also skip runtime status update.
+    -h Print this help message.
+USAGE
+}
+
+# Parse command-line options
+while getopts "c:s:n:p:z:Z:l:qh" opt; do
+       case $opt in
+       c)
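+               # a power of two has exactly one bit set, so (n & (n-1)) == 0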
+               if (( OPTARG & (OPTARG - 1) )); then
+                       echo "Chunk size must be a power-of-two value" 1>&2
+                       exit 1
+               fi
+               if (( OPTARG < 64 || OPTARG > 4096)); then
+                       echo "Chunk size must be between 64 and 4096" 1>&2
+                       exit 1
+               fi
+               chunk_size=$((OPTARG * 1024))
+               ;;
+       s)
+               sample_count=$OPTARG
+               ;;
+       n)
+               min_files=$OPTARG
+               ;;
+       p)
+               if (( OPTARG < 1 || OPTARG > 100 )); then
+                       echo "Scan percentage must be between 1 and 100" 1>&2
+                       exit 1
+               fi
+               percentage=$OPTARG
+               ;;
+       q)
+               ((quiet += 1))
+               ;;
+       z|Z)
+               case $OPTARG in
+               lzo)
+                       compression_type=lzop
+                       ;;
+               lz4fast)
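+                       # lz4's --fast switch selects its ultra-fast
+                       # (acceleration) mode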
+                       compression_type="lz4 --fast"
+                       ;;
+               gzip|lz4)
+                       compression_type=$OPTARG
+                       ;;
+               *)
+                       echo "Unknown compression type: $compression_type" 1>&2
+                       usage 1>&2
+                       exit 1
+                       ;;
+               esac
+               ;;
+       l)
+               compression_level=$OPTARG
+               ;;
+       h)
+               usage
+               exit 0
+               ;;
+       *)
+               usage 1>&2
+               exit 1
+               ;;
+       esac
+done
+
+if (( compression_level < 1 || compression_level > 12 )); then
+       echo "Compression level must be between 1 and 12" 1>&2
+       exit 1
+fi
+if [[ $compression_level -gt 9 && $compression_type != "lz4" ]]; then
+       echo "Compression level must be between 1 and 9 (levels 10-12 are lz4 only)" 1>&2
+       exit 2
+fi
+
+directory_provided=false
+compress="$compression_type -q -$compression_level"
+shift $((OPTIND - 1))
+if [[ -z "$@" ]]; then
+       directory=$default_directory
+else
+       directory="$@"
+       directory_provided=true
+       shift
+fi
+
+# Variables to track overall compression efficiency and additional statistics
+export total_file_size=0
+export total_uncompressed_size=0
+export total_compressed_size=0
+export total_files_scanned=0
+export total_empty_files=0
+
+round_to_block_size() {
+       local size=$1
+
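+	# round up to the next multiple of block_size using a bitmask;
+	# e.g. size=5000, block_size=4096: ((4999 | 4095) + 1) = 8192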
+       echo $(( ((size - 1) | (block_size - 1)) + 1 ))
+}
+
+# Function to process a file
+process_file() {
+       local file="$1"
+       local file_size=$(stat --format=%s "$file")
+       local sum_uncompressed_chunk=0
+       local sum_compressed_chunk=0
+
+       total_files_scanned=$((total_files_scanned + 1))
+
+       if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
+               total_empty_files=$((total_empty_files + 1))
+               return
+       fi
+
+	# Shadow the global sample_count so limiting it for small files
+	# below does not carry over to subsequent files
+	local sample_count=$sample_count
+
+	# Limit sample_count for small file size, but have at least one chunk
+	if ((sample_count * chunk_size > file_size)); then
+		sample_count=$((file_size / chunk_size))
+		if ((sample_count == 0)); then
+			sample_count=1
+		fi
+	fi
+
+	# Calculate the segment size for the file using the limited count,
+	# so the sampled chunks are spread across the whole file
+	local segment_size=$((file_size / sample_count))
+
+	# Round up the file_size to the next block (actual space usage)
+	file_size=$(round_to_block_size $file_size)
+
+       # Accumulate the total size of files scanned (in block_size units)
+       total_file_size=$((total_file_size + file_size))
+
+       # Read and process each segment
+       for ((i = 0; i < sample_count; i++)); do
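+		# dd's skip= is counted in bs-sized blocks, so offset is in
+		# units of chunk_size and the samples are spread evenly
+		# across the file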
+               offset=$((i * segment_size / chunk_size))
+               compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+
+               # if the compressed size is zero, something must have failed
+               (( compressed_size > 0 )) || continue
+
+               # Round up compressed size to full block size
+               compressed_size=$(round_to_block_size compressed_size)
+
+               # Incompressible chunks will not be compressed
+               (( compressed_size < chunk_size )) || compressed_size=$chunk_size
+
+               # Accumulate sampled chunk byte counts, but don't inflate size
+               sum_uncompressed_chunk=$((sum_uncompressed_chunk +
+                                         (chunk_size < file_size ?
+                                          chunk_size : file_size) ))
+               sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+
+       done
+
+       # Accumulate the total uncompressed and compressed byte counts
+       total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
+       total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+}
+
+# Calculate compression ratio from compressed chunks (value > 1)
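+# (integer fixed-point, scaled by 100: e.g. a ratio value of 250 prints "2.50")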
+calculate_ratio() {
+       local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
+
+       printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
+}
+
+# Calculate percentage of compressed size compared to original size (1-100%)
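+# (also fixed-point, scaled by 100: e.g. a pct value of 4025 prints "40.25%")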
+calculate_pct() {
+       local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+
+       printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+}
+
+# Calculate estimated compressed size of all files using the ratio from our
+# sample data
+calculate_estimated_total_compressed_size()
+{
+       local ratio=$1
+
+       printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio" | bc)
+}
+
+print_size() {
+       local size=$1
+       local frac
+       local unit
+
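+	# size is in bytes; frac holds the value in 1/1024ths of the chosen
+	# unit so the final printf can show two decimal places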
+       if (( size > 9 * 2**50 )); then
+               frac=$((size / 2**40))
+               unit="PiB"
+       elif (( size > 9 * 2**40 )); then
+               frac=$((size / 2**30))
+               unit="TiB"
+       elif (( size > 9 * 2**30 )); then
+               frac=$((size / 2**20))
+               unit="GiB"
+       elif (( size > 9 * 2**20 )); then
+               frac=$((size / 2**10))
+               unit="MiB"
+       else
+               frac=$size
+               unit="KiB"
+       fi
+
+       printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+}
+
+(( quiet == 0 )) && runtime_description | fmt
+
+# if stdout is a tty then make output more interactive
+if [[ -t 1 ]]; then
+       cr="\r"
+       lines=100
+       interval=30
+else
+       lf="\n"
+       lines=1000
+       interval=300
+fi
+
+total_file_count=0
+last=$SECONDS
+
+echo ""
+if [ "$directory_provided" = true ]; then
+       echo "Scanning $directory."
+else
+       echo "Scanning current directory, $directory."
+fi
+echo ""
+echo ""
+
+while IFS= read -r FILE; do
+       total_file_count=$((total_file_count + 1))
+       # randomly select $percentage of files after sampling min_files
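+	# ($RANDOM is roughly uniform over 0-32767, so RANDOM % 100 gives an
+	#  approximately uniform value in 0-99)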
+       if (( total_files_scanned > min_files )); then
+               (( RANDOM % 100 < percentage )) || continue
+       fi
+
+	# Only sampled files contribute to total_file_size; the final report
+	# extrapolates the size of all files from the sampled fraction
+	process_file "$FILE"
+
+       if (( quiet < 2 &&
+             ((min_files > 1 && total_files_scanned == min_files) ||
+              total_files_scanned % lines == 0 ||
+              last + interval < SECONDS) )); then
+               if ((total_files_scanned != total_file_count)); then
+                       echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_ratio)x...${lf}"
+               else
+                       echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_ratio)x...${lf}"
+               fi
+               last=$SECONDS
+       fi
+done < <(find $directory -type f -print)
+
+(( total_file_count == 0 )) &&
+       echo "error: no files found in '$directory' to compress" 1>&2 &&
+       exit 10
+(( total_uncompressed_size == 0 )) &&
+       echo "error: only zero-length files found in '$directory'" 1>&2 &&
+       exit 11
+
+echo ""
+# Report the additional statistics
+if (( quiet == 0 )); then
+       echo ""
+       echo "Finished sampling."
+       echo ""
+       echo ""
+       echo "---------------------"
+       echo "Results"
+       echo "---------------------"
+fi
+echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
+#echo "Number of zero-length files: $total_empty_files"
+echo "Total size of files sampled: $(print_size $total_file_size)"
+echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
+echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
+echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
+compression_ratio=$(calculate_ratio)
+echo "Compression ratio of sampled data: ${compression_ratio}x"
+if (( total_files_scanned < total_file_count )); then
+       size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
+       echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
+else
+       size_of_all_files=$total_file_size
+fi
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $compression_ratio)
+echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"