--- /dev/null
+#!/bin/bash
+
+# Copyright (c) 2023 DataDirect Networks, Inc.
+# Authors: Patrick Farrell, Andreas Dilger
+#
+# This is a simple tool which can be run on any Linux
+# system to estimate the space usage reduction from the
+# Lustre Client Side Data Compression (CSDC) feature with
+# particular compression settings (algorithm, chunk size,
+# and compression level).
+#
+# When run in a directory, this tool will recursively
+# examine files under that directory, sampling the data in
+# those files to estimate how much the files will compress.
+#
+# This tool will sample all files up to a configured number
+# (defaulting to 100 files); after that, it samples a
+# configurable percentage of the remaining files.
+#
+# This tool samples throughout each file, so it should
+# avoid poor estimates for files with headers that differ
+# from the bulk of the data in the file.
+#
+# This tool requires the lz4, lzop, and gzip utilities to
+# be installed in order to test those compression types.
+# (lzop is the command line utility for lzo compression)
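+#
+# Example invocation (illustrative script name and path):
+#   ./csdc_estimate.sh -z lz4 -l 3 -c 128 -p 5 /scratch/dataset
+# samples 5% of the files under /scratch/dataset (after the
+# first 100 files) using 128KiB chunks compressed with lz4 level 3.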
+
+# Default values
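+# (chunk_size and block_size are in bytes; block_size models the
+# filesystem allocation unit used when rounding sizes up)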
+chunk_size=65536
+block_size=4096
+sample_count=20
+min_files=100
+default_directory="$(pwd)"
+percentage=1
+compression_type="gzip"
+compression_level=6
+quiet=0
+
+# Display description of script behavior
+description()
+{
+ echo "Recursively scan DIRECTORY "
+ echo "sampling data from the first MIN_FILES "
+ echo "then sampling data from PERCENTAGE% of remaining files "
+ echo "to estimate the average compression ratio using "
+ echo "COMPRESSION_TYPE level COMPRESSION_LEVEL"
+ echo "and a chunk size of CHUNK_SIZE bytes."
+ echo ""
+ echo "This tool takes SAMPLE_COUNT samples of CHUNK_SIZE bytes from"
+ echo "each sampled file and compresses that data with the selected"
+ echo "parameters to generate an estimate of the compression ratio for"
+ echo "the full dataset."
+ echo ""
+ echo "You can trade-off estimation accuracy and scan speed by adjusting"
+ echo "the per file sample count and percentage of files to sample."
+ echo ""
+}
+
+runtime_description()
+{
+ echo "Recursively scan '$directory',"
+ if (( percentage < 100 )); then
+ (( min_files > 1 )) && echo "sampling data from the first $min_files files "
+ echo "then sampling data from $percentage% of remaining files "
+ fi
+ echo "to estimate the average compression ratio using "
+ echo "$compression_type level $compression_level "
+ echo "and a chunk size of $chunk_size bytes."
+ echo ""
+ echo "Run with -h to see options for these parameters."
+ echo ""
+ echo "This tool takes $sample_count samples of $chunk_size bytes from "
+ echo "each sampled file and compresses that data with the selected "
+ echo "parameters to generate an estimate of the compression ratio for "
+ echo "the full dataset."
+ echo ""
+ echo "You can trade-off estimation accuracy and scan speed by adjusting"
+ echo "the per file sample count and percentage of files to sample."
+ echo ""
+ echo "This tool assumes a relatively uniform distribution of file "
+ echo "sizes and contents across the directory tree, and is only "
+ echo "intended to provide an approximate estimate of the compression "
+ echo "potential of a specific dataset, and does not guarantee a "
+ echo "particular compression level."
+ echo ""
+}
+
+# Function to display script usage
+usage() {
+cat <<- USAGE
+Usage: $(basename "$0") [-n MIN_FILES] [-p PERCENTAGE] [-s SAMPLE_COUNT]
+ [-c CHUNK_SIZE] [-z COMPRESSION_TYPE] [-l COMPRESSION_LEVEL] [-h][-q]
+ [DIRECTORY ...]
+
+Description:
+$(description | fmt)
+
+Arguments:
+ -n MIN_FILES: Number of files to sample before percentage sampling starts. Default: $min_files.
+ -p PERCENTAGE: Percentage of remaining files to sample (1-100). Default: ${percentage}%.
+ -s SAMPLE_COUNT: Maximum number of chunks to sample per file. Default: $sample_count.
+ -c CHUNK_SIZE: Size of data chunk in kibibytes (64-4096). Default: $((chunk_size / 1024))KiB.
+ -z COMPRESSION_TYPE: One of gzip, lz4, lz4fast, lzo. Default: $compression_type.
+ -l COMPRESSION_LEVEL: Compression level to use (1-9). Default: $compression_level.
+ -q Skip printing of usage header. -qq to also skip runtime status update.
+ -h Print this help message.
+USAGE
+}
+
+# Parse command-line options
+while getopts "c:s:n:p:z:Z:l:qh" opt; do
+ case $opt in
+ c)
+ if (( OPTARG & (OPTARG - 1) )); then
+ echo "Chunk size must be a power-of-two value" 1>&2
+ exit 1
+ fi
+ if (( OPTARG < 64 || OPTARG > 4096 )); then
+ echo "Chunk size must be between 64 and 4096" 1>&2
+ exit 1
+ fi
+ chunk_size=$((OPTARG * 1024))
+ ;;
+ s)
+ sample_count=$OPTARG
+ ;;
+ n)
+ min_files=$OPTARG
+ ;;
+ p)
+ if (( OPTARG < 1 || OPTARG > 100 )); then
+ echo "Scan percentage must be between 1 and 100" 1>&2
+ exit 1
+ fi
+ percentage=$OPTARG
+ ;;
+ q)
+ ((quiet += 1))
+ ;;
+ z|Z)
+ case $OPTARG in
+ lzo)
+ compression_type=lzop
+ ;;
+ lz4fast)
+ compression_type="lz4 --fast"
+ ;;
+ gzip|lz4)
+ compression_type=$OPTARG
+ ;;
+ *)
+ echo "Unknown compression type: $compression_type" 1>&2
+ usage 1>&2
+ exit 1
+ ;;
+ esac
+ ;;
+ l)
+ compression_level=$OPTARG
+ ;;
+ h)
+ usage
+ exit 0
+ ;;
+ *)
+ usage 1>&2
+ exit 1
+ ;;
+ esac
+done
+
+if (( compression_level < 1 || compression_level > 12 )); then
+ echo "Compression level must be between 1 and 12" 1>&2
+ exit 1
+fi
+if [[ $compression_level -gt 9 && $compression_type != lz4* ]]; then
+ echo "Compression level must be between 1 and 9 (levels 10-12 are lz4 only)" 1>&2
+ exit 2
+fi
+
+directory_provided=false
+compress="$compression_type -q -$compression_level"
+shift $((OPTIND - 1))
+if (( $# == 0 )); then
+ directory=$default_directory
+else
+ directory="$*"
+ directory_provided=true
+fi
+
+# Variables to track overall compression efficiency and additional statistics
+export total_file_size=0
+export total_uncompressed_size=0
+export total_compressed_size=0
+export total_files_scanned=0
+export total_empty_files=0
+
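+# Round SIZE up to the next multiple of block_size (a power of two):
+# ORing with (block_size - 1) sets all the low bits, so adding 1
+# lands on the next block boundary, e.g. 4097 -> 8192 for 4096-byte
+# blocks, while 4096 stays 4096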
+round_to_block_size() {
+ local size=$1
+
+ echo $(( ((size - 1) | (block_size - 1)) + 1 ))
+}
+
+# Function to process a file
+process_file() {
+ local file="$1"
+ local file_size=$(stat --format=%s "$file")
+ local sum_uncompressed_chunk=0
+ local sum_compressed_chunk=0
+
+ total_files_scanned=$((total_files_scanned + 1))
+
+ if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
+ total_empty_files=$((total_empty_files + 1))
+ return
+ fi
+
+ # Limit the sample count for small files, but take at least one chunk.
+ # Use a local copy so a small file does not shrink the global
+ # sample_count for every file scanned after it.
+ local samples=$sample_count
+ if (( samples * chunk_size > file_size )); then
+ samples=$((file_size / chunk_size))
+ (( samples > 0 )) || samples=1
+ fi
+
+ # Calculate the spacing between samples so they span the whole file
+ local segment_size=$((file_size / samples))
+
+ # Round up the file_size to the next block (actual space usage)
+ file_size=$(round_to_block_size $file_size)
+
+ # Accumulate the total size of files scanned (in block_size units)
+ total_file_size=$((total_file_size + file_size))
+
+ # Read and process each segment
+ for ((i = 0; i < samples; i++)); do
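+ # dd's skip= is counted in blocks of bs= bytes, so express the
+ # sample offset in whole chunks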
+ offset=$((i * segment_size / chunk_size))
+ compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+
+ # if the compressed size is zero, something must have failed
+ (( compressed_size > 0 )) || continue
+
+ # Round up compressed size to full block size
+ compressed_size=$(round_to_block_size compressed_size)
+
+ # Incompressible chunks will not be compressed
+ (( compressed_size < chunk_size )) || compressed_size=$chunk_size
+
+ # Accumulate sampled chunk byte counts, but don't inflate size
+ sum_uncompressed_chunk=$((sum_uncompressed_chunk +
+ (chunk_size < file_size ?
+ chunk_size : file_size) ))
+ sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+
+ done
+
+ # Accumulate the total uncompressed and compressed byte counts
+ total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
+ total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+}
+
+# Calculate compression ratio from compressed chunks (value > 1)
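+# e.g. 300 bytes sampled compressing to 100 bytes prints "3.00"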
+calculate_ratio() {
+ local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
+
+ printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
+}
+
+# Calculate percentage of compressed size compared to original size (1-100%)
+calculate_pct() {
+ local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+
+ printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+}
+
+# Calculate estimated compressed size of all files using the ratio from our
+# sample data
+calculate_estimated_total_compressed_size()
+{
+ local ratio=$1
+
+ printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio" | bc)
+}
+
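+# Pretty-print a byte count using binary units, e.g. 11274289152 -> "10.50 GiB"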
+print_size() {
+ local size=$1
+ local frac
+ local unit
+
+ if (( size > 9 * 2**50 )); then
+ frac=$((size / 2**40))
+ unit="PiB"
+ elif (( size > 9 * 2**40 )); then
+ frac=$((size / 2**30))
+ unit="TiB"
+ elif (( size > 9 * 2**30 )); then
+ frac=$((size / 2**20))
+ unit="GiB"
+ elif (( size > 9 * 2**20 )); then
+ frac=$((size / 2**10))
+ unit="MiB"
+ else
+ frac=$size
+ unit="KiB"
+ fi
+
+ printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+}
+
+(( quiet == 0 )) && runtime_description | fmt
+
+# if stdout is a tty then make output more interactive
+if [[ -t 1 ]]; then
+ cr="\r"
+ lines=100
+ interval=30
+else
+ lf="\n"
+ lines=1000
+ interval=300
+fi
+
+total_file_count=0
+last=$SECONDS
+
+echo ""
+if [ "$directory_provided" = true ]; then
+ echo "Scanning $directory."
+else
+ echo "Scanning current directory, $directory."
+fi
+echo ""
+echo ""
+
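+# Main scan loop: walk every regular file under $directory, sample a
+# subset of them, and print periodic progress updates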
+while IFS= read -r FILE; do
+ total_file_count=$((total_file_count + 1))
+ # randomly select $percentage of files after sampling min_files
+ if (( total_files_scanned > min_files )); then
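+ # bash RANDOM is 0-32767, so RANDOM % 100 is only approximately
+ # uniform; close enough for sampling purposes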
+ (( RANDOM % 100 < percentage )) || continue
+ fi
+
+ # Note: total_file_size only accumulates the sizes of sampled files;
+ # the size of unsampled files is extrapolated from the sampled
+ # fraction when the results are reported below
+ process_file "$FILE"
+
+ if (( quiet < 2 &&
+ ((min_files > 1 && total_files_scanned == min_files) ||
+ total_files_scanned % lines == 0 ||
+ last + interval < SECONDS) )); then
+ if ((total_files_scanned != total_file_count)); then
+ echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_ratio)x...${lf}"
+ else
+ echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_ratio)x...${lf}"
+ fi
+ last=$SECONDS
+ fi
+done < <(find $directory -type f -print)
+
+(( total_file_count == 0 )) &&
+ echo "error: no files found in '$directory' to compress" 1>&2 &&
+ exit 10
+(( total_uncompressed_size == 0 )) &&
+ echo "error: only zero-length files found in '$directory'" 1>&2 &&
+ exit 11
+
+echo ""
+# Report the additional statistics
+if (( quiet == 0 )); then
+ echo ""
+ echo "Finished sampling."
+ echo ""
+ echo ""
+ echo "---------------------"
+ echo "Results"
+ echo "---------------------"
+fi
+echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
+#echo "Number of zero-length files: $total_empty_files"
+echo "Total size of files sampled: $(print_size $total_file_size)"
+echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
+echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
+echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
+compression_ratio=$(calculate_ratio)
+echo "Compression ratio of sampled data: ${compression_ratio}x"
+if (( total_files_scanned < total_file_count )); then
+ size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
+ echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
+else
+ size_of_all_files=$total_file_size
+fi
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $compression_ratio)
+echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"