From: Andreas Dilger
Date: Sat, 27 Jan 2024 20:08:33 +0000 (-0800)
Subject: EX-8362 scripts: improve ll_compression_scan estimate
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=21c73312a345673e108beedbad1fa2a1d7a8dbff;p=fs%2Flustre-release.git

EX-8362 scripts: improve ll_compression_scan estimate

Improve ll_compression_scan script to give a better estimate of
actual compression ratios.
- add a '-d' debug option for verbose output during testing
- log and report incompressible small files < 4096
- log and report incompressible file count and size
- include small/incompressible/large files in compression estimate
- add a correction factor to calculations for safety margin

Change-Id: If561b0273e38e4821de228c81291859c7bb1a0d2
Test-Parameters: trivial testlist=sanity-compr env=ONLY=1007,ONLY_REPEAT=10
Signed-off-by: Andreas Dilger
Signed-off-by: Jian Yu
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53824
Tested-by: jenkins
Tested-by: Maloo
---

diff --git a/lustre/scripts/ll_compression_scan b/lustre/scripts/ll_compression_scan
index af832a1..57833f1 100755
--- a/lustre/scripts/ll_compression_scan
+++ b/lustre/scripts/ll_compression_scan
@@ -35,7 +35,9 @@ default_path="$(pwd)"
 percentage=1
 compression_type="gzip -"
 compression_level=6
+compression_margin=5
 whole_file="false"
+debug=0
 quiet=0
 
 # Display description of script behavior
@@ -112,7 +114,7 @@ USAGE
 }
 
 # Parse command-line options
-while getopts "c:s:n:p:z:Z:l:wqh" opt; do
+while getopts "c:ds:n:p:z:Z:l:m:wqh" opt; do
 	case $opt in
 	c)
 		if (( OPTARG & (OPTARG - 1) )); then
@@ -125,6 +127,9 @@ while getopts "c:ds:n:p:z:Z:l:wqh" opt; do
 		fi
 		chunk_size=$((OPTARG *= 1024))
 		;;
+	d)
+		((debug += 1))
+		;;
 	s)
 		sample_count=$OPTARG
 		;;
@@ -163,6 +168,13 @@ while getopts "c:ds:n:p:z:Z:l:wqh" opt; do
 	l)
 		compression_level=$OPTARG
 		;;
+	m)
+		if (( OPTARG < 0 || OPTARG > 100 )); then
+			echo "Compression margin must be between 0 and 100" 1>&2
+			exit 1
+		fi
+		compression_margin=$OPTARG
+		;;
 	w)
 		whole_file="true"
 		;;
@@ -199,19 +211,27 @@ fi
 export total_file_size=0
 export total_uncompressed_size=0
 export total_compressed_size=0
-export total_files_scanned=0
-export total_empty_files=0
-export total_uncompressed_size_estimated=0
+export total_files_sampled=0
+export total_small_files=0
+export total_incompressible_files=0
+export total_incompressible_size=0
+export total_uncompressed_size_sampled=0
 export total_compressed_size_estimated=0
 
 round_to_block_size() {
-	local size=$1
+	local size=$*
 
 	echo $(( ((size - 1) | (block_size - 1)) + 1 ))
 }
 
-export format="--format=%s"
-[[ $(uname) != "Darwin" ]] || format="-f %z"
+round_to_chunk_size() {
+	local size=$*
+
+	echo $(( ((size - 1) | (chunk_size - 1)) + 1 ))
+}
+
+export format="--format=%b*%B"
+[[ $(uname) != "Darwin" ]] || format="-f %b*512"
 # Function to process a file
 process_file() {
 	local file="$1"
@@ -219,96 +239,121 @@ process_file() {
 	local sum_uncompressed_chunk=0
 	local sum_compressed_chunk=0
 
-	total_files_scanned=$((total_files_scanned + 1))
-
-	if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
-		total_empty_files=$((total_empty_files + 1))
-		return
-	fi
-
-	local segment_size
-	if [[ $whole_file == "true" ]]; then
-		segment_size=$chunk_size
-		sample_count=$((file_size / chunk_size))
+	# Round up the file_size to the next block (actual space usage)
+	file_size=$(round_to_block_size $file_size)
+	# Accumulate total size of files scanned (in block_size multiples)
+	total_file_size=$((total_file_size + file_size))
+	((total_file_count+= 1))
+
+	# always count incompressible files, in case this is a large fraction
+	if [[ -z "$file_size" ]] || (( file_size <= block_size )); then
+		((total_small_files+= 1))
+		((total_files_sampled+= 1))
+		sum_uncompressed_chunk=$file_size
+		sum_compressed_chunk=$file_size
+		estimated_compressed_file_size=$file_size
 	else
-		# Calculate the segment size for the file
-		segment_size=$((file_size / sample_count))
+		# randomly select $percentage of files after sampling min_files,
+		# unless file is larger than average of files checked so far
+		local average=$((total_file_size / ${total_files_sampled/#0/1}))
+		if (( total_files_sampled > min_files &&
+		      file_size < 2 * average )); then
+			(( RANDOM % 100 < percentage )) || return
+		elif (( total_files_sampled > min_files && debug > 0 )); then
+			echo -n "***"
+		fi
+
+		((total_files_sampled+= 1))
 
-		# Limit sample_count for small file size, but have at least
-		# one chunk
-		if ((sample_count * chunk_size > file_size)); then
-			sample_count=$((file_size / chunk_size))
+		local segment_size
+		if [[ $whole_file == "true" ]] ||
+		   (( file_size < chunk_size * sample_count )); then
+			segment_size=$chunk_size
+			segment_count=$(($(round_to_chunk_size file_size) /
+					 chunk_size))
+		else
+			# Calculate the segment size for the file
+			segment_size=$((file_size / sample_count))
+			segment_count=$sample_count
 		fi
-	fi
-	if ((sample_count == 0)); then
-		sample_count=1
-	fi
-	# Round up the file_size to the next block (actual space usage)
-	file_size=$(round_to_block_size file_size)
-	# Accumulate the total size of files scanned (in block_size units)
-	total_file_size=$((total_file_size + file_size))
 
+		(( debug < 1 )) ||
+			echo -n "$(basename $file): size: $file_size "
+		(( debug < 2 )) ||
+			echo -n "segs: $segment_count segsz: $segment_size "
 
-	# Read and process each segment
-	for ((i = 0; i < sample_count; i++)); do
-		offset=$((i * segment_size / chunk_size))
-		compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+		# Read and process each segment
+		for ((i = 0; i < segment_count; i++)); do
+			offset=$((i * segment_size / chunk_size))
+			compressed_size=$(dd if="$file" bs=$chunk_size count=1 \
+				skip=$offset 2>/dev/null | $compress | wc -c)
 
-		# if the compressed size is zero, something must have failed
-		(( compressed_size > 0 )) || continue
+			# if compressed size is zero, something must have failed
+			(( compressed_size > 0 )) || continue
 
-		# Round up compressed size to full block size
-		compressed_size=$(round_to_block_size compressed_size)
+			# Round up compressed size to full block size
+			compressed_size=$(round_to_block_size compressed_size)
 
-		# Incompressible chunks will not be compressed
-		(( compressed_size < chunk_size )) || compressed_size=$chunk_size
+			# Incompressible chunks will not be compressed
+			(( compressed_size <= chunk_size )) ||
+				compressed_size=$chunk_size
 
-		# Accumulate sampled chunk byte counts, but don't inflate size
-		sum_uncompressed_chunk=$((sum_uncompressed_chunk +
-					  (chunk_size < file_size ?
-					   chunk_size : file_size) ))
-		sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+			# Add sampled chunk bytes, but don't inflate last chunk
+			last_chunk=$((file_size - offset * chunk_size ))
+			(( last_chunk > chunk_size )) && last_chunk=$chunk_size
 
-	done
+			((sum_uncompressed_chunk+= last_chunk ))
+			((sum_compressed_chunk+= compressed_size))
+		done
 
-	# Get current ratio for this file
-	current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
-	# Assume ratio will be the same for the entire file
-	estimated_compressed_file_size=$(( file_size * 100 / current_ratio))
+		# Get current ratio for this file
+		current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
+		# Assume compression ratio will be the same for the entire file
+		estimated_compressed_file_size=$((file_size * 100 / current_ratio))
+
+		(( debug < 1 )) ||
+			echo "uncompr: $sum_uncompressed_chunk compr: $sum_compressed_chunk est: $estimated_compressed_file_size avg: $average"
+	fi
+
+	if ((sum_compressed_chunk >= sum_uncompressed_chunk)); then
+		((total_incompressible_files+= 1))
+		((total_incompressible_size+= file_size))
+	fi
 
 	# Accumulate the total uncompressed and compressed byte counts
-	total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
-	total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+	((total_uncompressed_size+= sum_uncompressed_chunk))
+	((total_compressed_size+= sum_compressed_chunk))
 	# Accumulate the estimated uncompressed and compressed byte counts
-	total_uncompressed_size_estimated=$((total_uncompressed_size_estimated +
-					     file_size))
-	total_compressed_size_estimated=$((total_compressed_size_estimated +
-					   estimated_compressed_file_size))
+	((total_uncompressed_size_sampled+= file_size))
+	((total_compressed_size_estimated+= estimated_compressed_file_size))
 }
 
-# Calculate compression ratio from estimated compressed file (value > 1)
-calculate_estimated_ratio() {
-	local ratio=$((total_uncompressed_size_estimated * 100 /
-		       total_compressed_size_estimated))
+# Calculate compression ratio of real compressed chunks vs original (value >= 1)
+calculate_ratio() {
+	local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
 
 	printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
 }
 
-# Calculate percentage of compressed size compared to original size (1-100%)
-calculate_pct() {
-	local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+# add correction factor for estimate safety margin with low sample percentage
+(( compression_margin == 0 )) && correction=100 ||
+	correction=$((100 + compression_margin + 10 * (100 - percentage) / 100))
 
-	printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+# Calculate compression ratio from estimated compressed file size (value >= 1)
+calculate_estimated_ratio() {
+	local ratio=$((total_uncompressed_size_sampled * 100 * 100 /
+		       (total_compressed_size_estimated * correction)))
+
+	printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
 }
 
 # Calculate estimated compressed size of all files using the ratio from our
 # sample data
 calculate_estimated_total_compressed_size() {
-	local ratio=$1
-
-	printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio_estimated" | bc)
+	echo $((total_file_size * total_compressed_size_estimated * correction /
+		(total_uncompressed_size_sampled * 100)))
 }
 
 print_size() {
@@ -316,16 +361,16 @@ print_size() {
 	local frac
 	local unit
 
-	if (( size > 9 * 2**50 )); then
+	if (( size > 4 * 2**50 )); then
 		frac=$((size / 2**40))
 		unit="PiB"
-	elif (( size > 9 * 2**40 )); then
+	elif (( size > 4 * 2**40 )); then
 		frac=$((size / 2**30))
 		unit="TiB"
-	elif (( size > 9 * 2**30 )); then
+	elif (( size > 4 * 2**30 )); then
 		frac=$((size / 2**20))
 		unit="GiB"
-	elif (( size > 9 * 2**20 )); then
+	elif (( size > 4 * 2**20 )); then
 		frac=$((size / 2**10))
 		unit="MiB"
 	else
@@ -333,7 +378,7 @@ print_size() {
 		unit="KiB"
 	fi
 
-	printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+	printf "%u.%03u $unit" $((frac / 1024)) $((frac % 1024))
 }
 
 (( quiet == 0 )) && runtime_description | fmt
@@ -362,25 +407,16 @@ echo ""
 echo ""
 
 while read FILE; do
-	total_file_count=$((total_file_count + 1))
-	# randomly select $percentage of files after sampling min_files
-	if (( total_files_scanned > min_files )); then
-		(( RANDOM % 100 < percentage )) || continue
-	fi
-
-	### NOPE, you're not summing file size correctly imo
-	# You need to check the size of all the files not just the ones you're sampling
-	# oops
 	process_file "$FILE"
 
 	if (( quiet < 2 &&
 	      ((min_files > 1 && total_files_scanned == min_files) ||
-	       total_files_scanned % lines == 0 ||
+	       total_files_sampled % lines == 0 ||
 	       last + interval < SECONDS) )); then
-		if ((total_files_scanned != total_file_count)); then
-			echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+		if ((total_files_sampled != total_file_count)); then
+			echo -ne "${cr}Sampled $total_files_sampled/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
 		else
-			echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+			echo -ne "${cr}Sampled $total_files_sampled files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
 		fi
 		last=$SECONDS
 	fi
@@ -404,19 +440,17 @@ if (( quiet == 0 )); then
 	echo "Results"
 	echo "---------------------"
 fi
-echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
-#echo "Number of zero-length files: $total_empty_files"
-echo "Total size of files sampled: $(print_size $total_file_size)"
+
+echo "Compression type: ${compression_type/ -*/} Level: $compression_level"
+echo "Chunk size: $chunk_size"
+echo "Number of files sampled: $total_files_sampled ($((total_files_sampled * 100 / total_file_count))% of $total_file_count total files)"
+echo "Number of files under $block_size bytes (incompressible): $total_small_files"
+echo "Total number of incompressible files: $total_incompressible_files"
+echo "Total size of incompressible files: $(print_size $total_incompressible_size)"
+echo "Total size of files scanned: $(print_size $total_file_size)"
 echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
 echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
-echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
-compression_ratio_estimated=$(calculate_estimated_ratio)
-echo "Estimated compression ratio of sampled files: ${compression_ratio_estimated}x"
-if (( total_files_scanned < total_file_count )); then
-	size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
-	echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
-else
-	size_of_all_files=$total_file_size
-fi
-estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $ratio)
+echo "Compression ratio of sampled data: $(calculate_ratio)"
+echo "Estimated compression ratio of sampled files: $(calculate_estimated_ratio)"
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size)
 echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"

diff --git a/lustre/tests/sanity-compr.sh b/lustre/tests/sanity-compr.sh
index ce06081..7055637 100644
--- a/lustre/tests/sanity-compr.sh
+++ b/lustre/tests/sanity-compr.sh
@@ -982,12 +982,14 @@ test_1007() {
 	# Sync to disk and drop cache
 	sync; echo 3 > /proc/sys/vm/drop_caches
 
-	local scan_cmd="ll_compression_scan -w -q -z $compr_type"
+	local scan_cmd="ll_compression_scan -m 0 -w -q -z $compr_type"
 	[[ -z $has_level ]] || scan_cmd+=" -l $compr_level"
 	scan_cmd+=" -c $chunksize"
 
 	local estimated_size=$($scan_cmd $source |
-		awk '/Estimated compressed size/{print $7}')
+		awk '/Estimated compressed size/ { print $7 }')
+	[[ -n "$estimated_size" ]] || error "no compression estimate"
+	estimated_size=$(bc -l <<< "$estimated_size * 1024")
 	local csdc_size=$(du -sk $tf | awk '{print $1}')
 	local margin=5