--- /dev/null
+#!/bin/bash
+
+# Copyright (c) 2023 DataDirect Networks, Inc.
+# Authors: Patrick Farrell, Andreas Dilger
+#
+# This is a simple tool which can be run on any Linux
+# system to estimate the space usage reduction from the
+# Lustre Client Side Data Compression (CSDC) feature with
+# particular compression settings (algorithm, chunk size,
+# and compression level).
+#
+# When run in a directory, this tool will recursively
+# examine files under that directory, sampling the data in
+# those files to estimate how much the files will compress.
+#
+# This tool will sample all files up to a configured number
+# (defaulting to 100 files); after that, it samples a
+# configurable percentage of the remaining files.
+#
+# This tool samples throughout each file, so it should
+# avoid poor estimates for files with headers that differ
+# from the bulk of the data in the file.
+#
+# This tool requires the lz4, lzop, and gzip utilities to
+# be installed in order to test those compression types.
+# (lzop is the command line utility for lzo compression)
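+#
+# Example invocation (illustrative script name and path):
+#   ./csdc_estimate.sh -z lz4 -l 3 -c 128 -p 5 /scratch/dataset
+# samples 5% of the files under /scratch/dataset (after the
+# first 100 files) using 128KiB chunks compressed with lz4 level 3.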
+
+# Default values
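+# (chunk_size and block_size are in bytes; block_size models the
+# filesystem allocation unit used when rounding sizes up)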
+chunk_size=65536
+block_size=4096
+sample_count=20
+min_files=100
+default_directory="$(pwd)"
+percentage=1
+compression_type="gzip"
+compression_level=6
+quiet=0
+
+# Display description of script behavior
+description()
+{
+ echo "Recursively scan DIRECTORY "
+ echo "sampling data from the first MIN_FILES "
+ echo "then sampling data from PERCENTAGE% of remaining files "
+ echo "to estimate the average compression ratio using "
+ echo "COMPRESSION_TYPE level COMPRESSION_LEVEL"
+ echo "and a chunk size of CHUNK_SIZE bytes."
+ echo ""
+ echo "This tool takes SAMPLE_COUNT samples of CHUNK_SIZE bytes from"
+ echo "each sampled file and compresses that data with the selected"
+ echo "parameters to generate an estimate of the compression ratio for"
+ echo "the full dataset."
+ echo ""
+ echo "You can trade-off estimation accuracy and scan speed by adjusting"
+ echo "the per file sample count and percentage of files to sample."
+ echo ""
+}
+
+runtime_description()
+{
+ echo "Recursively scan '$directory',"
+ if (( percentage < 100 )); then
+ (( min_files > 1 )) && echo "sampling data from the first $min_files files "
+ echo "then sampling data from $percentage% of remaining files "
+ fi
+ echo "to estimate the average compression ratio using "
+ echo "$compression_type level $compression_level "
+ echo "and a chunk size of $chunk_size bytes."
+ echo ""
+ echo "Run with -h to see options for these parameters."
+ echo ""
+ echo "This tool takes $sample_count samples of $chunk_size bytes from "
+ echo "each sampled file and compresses that data with the selected "
+ echo "parameters to generate an estimate of the compression ratio for "
+ echo "the full dataset."
+ echo ""
+ echo "You can trade-off estimation accuracy and scan speed by adjusting"
+ echo "the per file sample count and percentage of files to sample."
+ echo ""
+ echo "This tool assumes a relatively uniform distribution of file "
+ echo "sizes and contents across the directory tree, and is only "
+ echo "intended to provide an approximate estimate of the compression "
+ echo "potential of a specific dataset, and does not guarantee a "
+ echo "particular compression level."
+ echo ""
+}
+
+# Function to display script usage
+usage() {
+cat <<- USAGE
+Usage: $(basename "$0") [-n MIN_FILES] [-p PERCENTAGE] [-s SAMPLE_COUNT]
+ [-c CHUNK_SIZE] [-z COMPRESSION_TYPE] [-l COMPRESSION_LEVEL] [-h][-q]
+ [DIRECTORY ...]
+
+Description:
+$(description | fmt)
+
+Arguments:
+ -n MIN_FILES: Number of files to sample before percentage sampling starts. Default: $min_files.
+ -p PERCENTAGE: Percentage of remaining files to sample (1-100). Default: ${percentage}%.
+ -s SAMPLE_COUNT: Maximum number of chunks to sample per file. Default: $sample_count.
+ -c CHUNK_SIZE: Size of data chunk in kibibytes (64-4096). Default: $((chunk_size / 1024))KiB.
+ -z COMPRESSION_TYPE: One of gzip, lz4, lz4fast, lzo. Default: $compression_type.
+ -l COMPRESSION_LEVEL: Compression level to use (1-9). Default: $compression_level.
+ -q Skip printing of usage header. -qq to also skip runtime status update.
+ -h Print this help message.
+USAGE
+}
+
+# Parse command-line options
+while getopts "c:s:n:p:z:Z:l:qh" opt; do
+ case $opt in
+ c)
+ if (( OPTARG & (OPTARG - 1) )); then
+ echo "Chunk size must be a power-of-two value" 1>&2
+ exit 1
+ fi
+ if (( OPTARG < 64 || OPTARG > 4096 )); then
+ echo "Chunk size must be between 64 and 4096" 1>&2
+ exit 1
+ fi
+ chunk_size=$((OPTARG * 1024))
+ ;;
+ s)
+ sample_count=$OPTARG
+ ;;
+ n)
+ min_files=$OPTARG
+ ;;
+ p)
+ if (( OPTARG < 1 || OPTARG > 100 )); then
+ echo "Scan percentage must be between 1 and 100" 1>&2
+ exit 1
+ fi
+ percentage=$OPTARG
+ ;;
+ q)
+ ((quiet += 1))
+ ;;
+ z|Z)
+ case $OPTARG in
+ lzo)
+ compression_type=lzop
+ ;;
+ lz4fast)
+ compression_type="lz4 --fast"
+ ;;
+ gzip|lz4)
+ compression_type=$OPTARG
+ ;;
+ *)
+ echo "Unknown compression type: $compression_type" 1>&2
+ usage 1>&2
+ exit 1
+ ;;
+ esac
+ ;;
+ l)
+ compression_level=$OPTARG
+ ;;
+ h)
+ usage
+ exit 0
+ ;;
+ *)
+ usage 1>&2
+ exit 1
+ ;;
+ esac
+done
+
+if (( compression_level < 1 || compression_level > 12 )); then
+ echo "Compression level must be between 1 and 12" 1>&2
+ exit 1
+fi
+if [[ $compression_level -gt 9 && $compression_type != lz4* ]]; then
+ echo "Compression level must be between 1 and 9 (levels 10-12 are lz4 only)" 1>&2
+ exit 2
+fi
+
+directory_provided=false
+compress="$compression_type -q -$compression_level"
+shift $((OPTIND - 1))
+if (( $# == 0 )); then
+ directory=$default_directory
+else
+ directory="$*"
+ directory_provided=true
+fi
+
+# Variables to track overall compression efficiency and additional statistics
+export total_file_size=0
+export total_uncompressed_size=0
+export total_compressed_size=0
+export total_files_scanned=0
+export total_empty_files=0
+
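+# Round SIZE up to the next multiple of block_size (a power of two):
+# ORing with (block_size - 1) sets all the low bits, so adding 1
+# lands on the next block boundary, e.g. 4097 -> 8192 for 4096-byte
+# blocks, while 4096 stays 4096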
+round_to_block_size() {
+ local size=$1
+
+ echo $(( ((size - 1) | (block_size - 1)) + 1 ))
+}
+
+# Function to process a file
+process_file() {
+ local file="$1"
+ local file_size=$(stat --format=%s "$file")
+ local sum_uncompressed_chunk=0
+ local sum_compressed_chunk=0
+
+ total_files_scanned=$((total_files_scanned + 1))
+
+ if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
+ total_empty_files=$((total_empty_files + 1))
+ return
+ fi
+
+ # Limit the sample count for small files, but take at least one chunk.
+ # Use a local copy so a small file does not shrink the global
+ # sample_count for every file scanned after it.
+ local samples=$sample_count
+ if (( samples * chunk_size > file_size )); then
+ samples=$((file_size / chunk_size))
+ (( samples > 0 )) || samples=1
+ fi
+
+ # Calculate the spacing between samples so they span the whole file
+ local segment_size=$((file_size / samples))
+
+ # Round up the file_size to the next block (actual space usage)
+ file_size=$(round_to_block_size $file_size)
+
+ # Accumulate the total size of files scanned (in block_size units)
+ total_file_size=$((total_file_size + file_size))
+
+ # Read and process each segment
+ for ((i = 0; i < samples; i++)); do
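+ # dd's skip= is counted in blocks of bs= bytes, so express the
+ # sample offset in whole chunks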
+ offset=$((i * segment_size / chunk_size))
+ compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+
+ # if the compressed size is zero, something must have failed
+ (( compressed_size > 0 )) || continue
+
+ # Round up compressed size to full block size
+ compressed_size=$(round_to_block_size compressed_size)
+
+ # Incompressible chunks will not be compressed
+ (( compressed_size < chunk_size )) || compressed_size=$chunk_size
+
+ # Accumulate sampled chunk byte counts, but don't inflate size
+ sum_uncompressed_chunk=$((sum_uncompressed_chunk +
+ (chunk_size < file_size ?
+ chunk_size : file_size) ))
+ sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+
+ done
+
+ # Accumulate the total uncompressed and compressed byte counts
+ total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
+ total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+}
+
+# Calculate compression ratio from compressed chunks (value > 1)
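+# e.g. 300 bytes sampled compressing to 100 bytes prints "3.00"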
+calculate_ratio() {
+ local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
+
+ printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
+}
+
+# Calculate percentage of compressed size compared to original size (1-100%)
+calculate_pct() {
+ local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+
+ printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+}
+
+# Calculate estimated compressed size of all files using the ratio from our
+# sample data
+calculate_estimated_total_compressed_size()
+{
+ local ratio=$1
+
+ printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio" | bc)
+}
+
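+# Pretty-print a byte count using binary units, e.g. 11274289152 -> "10.50 GiB"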
+print_size() {
+ local size=$1
+ local frac
+ local unit
+
+ if (( size > 9 * 2**50 )); then
+ frac=$((size / 2**40))
+ unit="PiB"
+ elif (( size > 9 * 2**40 )); then
+ frac=$((size / 2**30))
+ unit="TiB"
+ elif (( size > 9 * 2**30 )); then
+ frac=$((size / 2**20))
+ unit="GiB"
+ elif (( size > 9 * 2**20 )); then
+ frac=$((size / 2**10))
+ unit="MiB"
+ else
+ frac=$size
+ unit="KiB"
+ fi
+
+ printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+}
+
+(( quiet == 0 )) && runtime_description | fmt
+
+# if stdout is a tty then make output more interactive
+if [[ -t 1 ]]; then
+ cr="\r"
+ lines=100
+ interval=30
+else
+ lf="\n"
+ lines=1000
+ interval=300
+fi
+
+total_file_count=0
+last=$SECONDS
+
+echo ""
+if [ "$directory_provided" = true ]; then
+ echo "Scanning $directory."
+else
+ echo "Scanning current directory, $directory."
+fi
+echo ""
+echo ""
+
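+# Main scan loop: walk every regular file under $directory, sample a
+# subset of them, and print periodic progress updates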
+while IFS= read -r FILE; do
+ total_file_count=$((total_file_count + 1))
+ # randomly select $percentage of files after sampling min_files
+ if (( total_files_scanned > min_files )); then
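+ # bash RANDOM is 0-32767, so RANDOM % 100 is only approximately
+ # uniform; close enough for sampling purposes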
+ (( RANDOM % 100 < percentage )) || continue
+ fi
+
+ # Note: total_file_size only accumulates the sizes of sampled files;
+ # the size of unsampled files is extrapolated from the sampled
+ # fraction when the results are reported below
+ process_file "$FILE"
+
+ if (( quiet < 2 &&
+ ((min_files > 1 && total_files_scanned == min_files) ||
+ total_files_scanned % lines == 0 ||
+ last + interval < SECONDS) )); then
+ if ((total_files_scanned != total_file_count)); then
+ echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_ratio)x...${lf}"
+ else
+ echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_ratio)x...${lf}"
+ fi
+ last=$SECONDS
+ fi
+done < <(find $directory -type f -print)
+
+(( total_file_count == 0 )) &&
+ echo "error: no files found in '$directory' to compress" 1>&2 &&
+ exit 10
+(( total_uncompressed_size == 0 )) &&
+ echo "error: only zero-length files found in '$directory'" 1>&2 &&
+ exit 11
+
+echo ""
+# Report the additional statistics
+if (( quiet == 0 )); then
+ echo ""
+ echo "Finished sampling."
+ echo ""
+ echo ""
+ echo "---------------------"
+ echo "Results"
+ echo "---------------------"
+fi
+echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
+#echo "Number of zero-length files: $total_empty_files"
+echo "Total size of files sampled: $(print_size $total_file_size)"
+echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
+echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
+echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
+compression_ratio=$(calculate_ratio)
+echo "Compression ratio of sampled data: ${compression_ratio}x"
+if (( total_files_scanned < total_file_count )); then
+ size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
+ echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
+else
+ size_of_all_files=$total_file_size
+fi
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $compression_ratio)
+echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"