percentage=1
compression_type="gzip -"
compression_level=6
+compression_margin=5
whole_file="false"
+debug=0
quiet=0
# Display description of script behavior
}
# Parse command-line options
-while getopts "c:s:n:p:z:Z:l:wqh" opt; do
+while getopts "c:ds:n:p:z:Z:l:m:wqh" opt; do
case $opt in
c)
if (( OPTARG & (OPTARG - 1) )); then
fi
chunk_size=$((OPTARG *= 1024))
;;
+ d)
+ ((debug += 1))
+ ;;
s)
sample_count=$OPTARG
;;
l)
compression_level=$OPTARG
;;
+ m)
+ if (( OPTARG < 0 || OPTARG > 100 )); then
+ echo "Compression margin must be between 0 and 100" 1>&2
+ exit 1
+ fi
+ compression_margin=$OPTARG
+ ;;
w)
whole_file="true"
;;
export total_file_size=0
export total_uncompressed_size=0
export total_compressed_size=0
-export total_files_scanned=0
-export total_empty_files=0
-export total_uncompressed_size_estimated=0
+export total_files_sampled=0
+export total_small_files=0
+export total_incompressible_files=0
+export total_incompressible_size=0
+export total_uncompressed_size_sampled=0
export total_compressed_size_estimated=0
round_to_block_size() {
- local size=$1
+ local size=$*
echo $(( ((size - 1) | (block_size - 1)) + 1 ))
}
-export format="--format=%s"
-[[ $(uname) != "Darwin" ]] || format="-f %z"
+round_to_chunk_size() {
+ local size=$*
+
+ echo $(( ((size - 1) | (chunk_size - 1)) + 1 ))
+}
+
+export format="--format=%b*%B"
+[[ $(uname) != "Darwin" ]] || format="-f %b*512"
# Function to process a file
process_file() {
local file="$1"
local sum_uncompressed_chunk=0
local sum_compressed_chunk=0
- total_files_scanned=$((total_files_scanned + 1))
-
- if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
- total_empty_files=$((total_empty_files + 1))
- return
- fi
-
- local segment_size
- if [[ $whole_file == "true" ]]; then
- segment_size=$chunk_size
- sample_count=$((file_size / chunk_size))
+ # Round up the file_size to the next block (actual space usage)
+ file_size=$(round_to_block_size $file_size)
+ # Accumulate total size of files scanned (in block_size multiples)
+ total_file_size=$((total_file_size + file_size))
+ ((total_file_count+= 1))
+
+ # always count incompressible files, in case this is a large fraction
+ if [[ -z "$file_size" ]] || (( file_size <= block_size )); then
+ ((total_small_files+= 1))
+ ((total_files_sampled+= 1))
+ sum_uncompressed_chunk=$file_size
+ sum_compressed_chunk=$file_size
+ estimated_compressed_file_size=$file_size
else
- # Calculate the segment size for the file
- segment_size=$((file_size / sample_count))
+ # randomly select $percentage of files after sampling min_files,
+ # unless file is larger than average of files checked so far
+ local average=$((total_file_size / ${total_files_sampled/#0/1}))
+ if (( total_files_sampled > min_files &&
+ file_size < 2 * average )); then
+ (( RANDOM % 100 < percentage )) || return
+ elif (( total_files_sampled > min_files && debug > 0 )); then
+ echo -n "***"
+ fi
+
+ ((total_files_sampled+= 1))
- # Limit sample_count for small file size, but have at least
- # one chunk
- if ((sample_count * chunk_size > file_size)); then
- sample_count=$((file_size / chunk_size))
+ local segment_size
+ if [[ $whole_file == "true" ]] ||
+ (( file_size < chunk_size * sample_count )); then
+ segment_size=$chunk_size
+ segment_count=$(($(round_to_chunk_size file_size) /
+ chunk_size))
+ else
+ # Calculate the segment size for the file
+ segment_size=$((file_size / sample_count))
+ segment_count=$sample_count
fi
- fi
- if ((sample_count == 0)); then
- sample_count=1
- fi
- # Round up the file_size to the next block (actual space usage)
- file_size=$(round_to_block_size file_size)
- # Accumulate the total size of files scanned (in block_size units)
- total_file_size=$((total_file_size + file_size))
+ (( debug < 1 )) ||
+ echo -n "$(basename $file): size: $file_size "
+ (( debug < 2 )) ||
+ echo -n "segs: $segment_count segsz: $segment_size "
- # Read and process each segment
- for ((i = 0; i < sample_count; i++)); do
- offset=$((i * segment_size / chunk_size))
- compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+ # Read and process each segment
+ for ((i = 0; i < segment_count; i++)); do
+ offset=$((i * segment_size / chunk_size))
+ compressed_size=$(dd if="$file" bs=$chunk_size count=1 \
+ skip=$offset 2>/dev/null | $compress | wc -c)
- # if the compressed size is zero, something must have failed
- (( compressed_size > 0 )) || continue
+ # if compressed size is zero, something must have failed
+ (( compressed_size > 0 )) || continue
- # Round up compressed size to full block size
- compressed_size=$(round_to_block_size compressed_size)
+ # Round up compressed size to full block size
+ compressed_size=$(round_to_block_size compressed_size)
- # Incompressible chunks will not be compressed
- (( compressed_size < chunk_size )) || compressed_size=$chunk_size
+ # Incompressible chunks will not be compressed
+ (( compressed_size <= chunk_size )) ||
+ compressed_size=$chunk_size
- # Accumulate sampled chunk byte counts, but don't inflate size
- sum_uncompressed_chunk=$((sum_uncompressed_chunk +
- (chunk_size < file_size ?
- chunk_size : file_size) ))
- sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+ # Add sampled chunk bytes, but don't inflate last chunk
+ last_chunk=$((file_size - offset * chunk_size ))
+ (( last_chunk > chunk_size )) && last_chunk=$chunk_size
- done
+ ((sum_uncompressed_chunk+= last_chunk ))
+ ((sum_compressed_chunk+= compressed_size))
+ done
- # Get current ratio for this file
- current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
- # Assume ratio will be the same for the entire file
- estimated_compressed_file_size=$(( file_size * 100 / current_ratio))
+ # Get current ratio for this file
+ current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
+ # Assume compression ratio will be the same for the entire file
+ estimated_compressed_file_size=$((file_size * 100 / current_ratio))
+
+ (( debug < 1 )) ||
+ echo "uncompr: $sum_uncompressed_chunk compr: $sum_compressed_chunk est: $estimated_compressed_file_size avg: $average"
+ fi
+
+ if ((sum_compressed_chunk >= sum_uncompressed_chunk)); then
+ ((total_incompressible_files+= 1))
+ ((total_incompressible_size+= file_size))
+ fi
# Accumulate the total uncompressed and compressed byte counts
- total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
- total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+ ((total_uncompressed_size+= sum_uncompressed_chunk))
+ ((total_compressed_size+= sum_compressed_chunk))
# Accumulate the estimated uncompressed and compressed byte counts
- total_uncompressed_size_estimated=$((total_uncompressed_size_estimated +
- file_size))
- total_compressed_size_estimated=$((total_compressed_size_estimated +
- estimated_compressed_file_size))
+ ((total_uncompressed_size_sampled+= file_size))
+ ((total_compressed_size_estimated+= estimated_compressed_file_size))
}
-# Calculate compression ratio from estimated compressed file (value > 1)
-calculate_estimated_ratio() {
- local ratio=$((total_uncompressed_size_estimated * 100 /
- total_compressed_size_estimated))
+# Calculate compression ratio of real compressed chunks vs original (value >= 1)
+calculate_ratio() {
+ local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
}
-# Calculate percentage of compressed size compared to original size (1-100%)
-calculate_pct() {
- local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+# add correction factor for estimate safety margin with low sample percentage
+(( compression_margin == 0 )) && correction=100 ||
+ correction=$((100 + compression_margin + 10 * (100 - percentage) / 100))
- printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+# Calculate compression ratio from estimated compressed file size (value >= 1)
+calculate_estimated_ratio() {
+ local ratio=$((total_uncompressed_size_sampled * 100 * 100 /
+ (total_compressed_size_estimated * correction)))
+
+ printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
}
# Calculate estimated compressed size of all files using the ratio from our
# sample data
calculate_estimated_total_compressed_size()
{
- local ratio=$1
-
- printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio_estimated" | bc)
+ echo $((total_file_size * total_compressed_size_estimated * correction /
+ (total_uncompressed_size_sampled * 100)))
}
print_size() {
local frac
local unit
- if (( size > 9 * 2**50 )); then
+ if (( size > 4 * 2**50 )); then
frac=$((size / 2**40))
unit="PiB"
- elif (( size > 9 * 2**40 )); then
+ elif (( size > 4 * 2**40 )); then
frac=$((size / 2**30))
unit="TiB"
- elif (( size > 9 * 2**30 )); then
+ elif (( size > 4 * 2**30 )); then
frac=$((size / 2**20))
unit="GiB"
- elif (( size > 9 * 2**20 )); then
+ elif (( size > 4 * 2**20 )); then
frac=$((size / 2**10))
unit="MiB"
else
unit="KiB"
fi
- printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+ printf "%u.%03u $unit" $((frac / 1024)) $((frac % 1024))
}
(( quiet == 0 )) && runtime_description | fmt
echo ""
while read FILE; do
- total_file_count=$((total_file_count + 1))
- # randomly select $percentage of files after sampling min_files
- if (( total_files_scanned > min_files )); then
- (( RANDOM % 100 < percentage )) || continue
- fi
-
- ### NOPE, you're not summing file size correctly imo
- # You need to check the size of all the files not just the ones you're sampling
- # oops
process_file "$FILE"
if (( quiet < 2 &&
((min_files > 1 && total_files_scanned == min_files) ||
- total_files_scanned % lines == 0 ||
+ total_files_sampled % lines == 0 ||
last + interval < SECONDS) )); then
- if ((total_files_scanned != total_file_count)); then
- echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+ if ((total_files_sampled != total_file_count)); then
+ echo -ne "${cr}Sampled $total_files_sampled/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
else
- echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+ echo -ne "${cr}Sampled $total_files_sampled files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
fi
last=$SECONDS
fi
echo "Results"
echo "---------------------"
fi
-echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
-#echo "Number of zero-length files: $total_empty_files"
-echo "Total size of files sampled: $(print_size $total_file_size)"
+
+echo "Compression type: ${compression_type/ -*/} Level: $compression_level"
+echo "Chunk size: $chunk_size"
+echo "Number of files sampled: $total_files_sampled ($((total_files_sampled * 100 / total_file_count))% of $total_file_count total files)"
+echo "Number of files under $block_size bytes (incompressible): $total_small_files"
+echo "Total number of incompressible files: $total_incompressible_files"
+echo "Total size of incompressible files: $(print_size $total_incompressible_size)"
+echo "Total size of files scanned: $(print_size $total_file_size)"
echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
-echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
-compression_ratio_estimated=$(calculate_estimated_ratio)
-echo "Estimated compression ratio of sampled files: ${compression_ratio_estimated}x"
-if (( total_files_scanned < total_file_count )); then
- size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
- echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
-else
- size_of_all_files=$total_file_size
-fi
-estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $ratio)
+echo "Compression ratio of sampled data: $(calculate_ratio)"
+echo "Estimated compression ratio of sampled files: $(calculate_estimated_ratio)"
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size)
echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"