EX-8362 scripts: improve ll_compression_scan estimate
author Andreas Dilger <adilger@whamcloud.com>
Sat, 27 Jan 2024 20:08:33 +0000 (12:08 -0800)
committer Andreas Dilger <adilger@whamcloud.com>
Mon, 29 Jan 2024 08:54:39 +0000 (08:54 +0000)
Improve the ll_compression_scan script to give a better estimate of
actual compression ratios:
- add a '-d' debug option for verbose output during testing
- log and report incompressible small files < 4096 bytes
- log and report incompressible file count and size
- include small/incompressible/large files in compression estimate
- add a correction factor to the calculations as a safety margin
  (worked example below)
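
As a worked example (a sketch of the arithmetic, not script output):
with the defaults of compression_margin=5 (the new '-m' option) and
percentage=1 (the existing '-p' option), the correction factor is:

    compression_margin=5
    percentage=1
    # 100 (no change) + margin + extra padding for sparse sampling
    correction=$((100 + compression_margin + 10 * (100 - percentage) / 100))
    echo $correction    # 114, i.e. estimates are padded by about 14%

The padding grows as the sampled percentage shrinks, so sparse scans err
on the safe side; passing '-m 0' disables the padding entirely, as the
updated sanity-compr.sh test below does.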

Change-Id: If561b0273e38e4821de228c81291859c7bb1a0d2
Test-Parameters: trivial testlist=sanity-compr env=ONLY=1007,ONLY_REPEAT=10
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Jian Yu <yujian@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53824
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/scripts/ll_compression_scan
lustre/tests/sanity-compr.sh

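Before the diff, a note on the rounding helpers it touches: both
round_to_block_size and the new round_to_chunk_size round a size up to
the next multiple of a power-of-two boundary with a bit trick, ORing in
the low mask bits and adding one to carry into the next multiple. A
minimal stand-alone sketch (block_size=4096 chosen for illustration):

    block_size=4096
    # (size - 1) | (block_size - 1) sets every bit below block_size;
    # the final +1 then carries into the next multiple.
    for size in 1 4096 4097; do
            echo $(( ((size - 1) | (block_size - 1)) + 1 ))  # 4096 4096 8192
    done
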
diff --git a/lustre/scripts/ll_compression_scan b/lustre/scripts/ll_compression_scan
index af832a1..57833f1 100755
--- a/lustre/scripts/ll_compression_scan
+++ b/lustre/scripts/ll_compression_scan
@@ -35,7 +35,9 @@ default_path="$(pwd)"
 percentage=1
 compression_type="gzip -"
 compression_level=6
+compression_margin=5
 whole_file="false"
+debug=0
 quiet=0
 
 # Display description of script behavior
@@ -112,7 +114,7 @@ USAGE
 }
 
 # Parse command-line options
-while getopts "c:s:n:p:z:Z:l:wqh" opt; do
+while getopts "c:ds:n:p:z:Z:l:m:wqh" opt; do
        case $opt in
        c)
                if (( OPTARG & (OPTARG - 1) )); then
@@ -125,6 +127,9 @@ while getopts "c:s:n:p:z:Z:l:wqh" opt; do
                fi
                chunk_size=$((OPTARG *= 1024))
                ;;
+       d)
+               ((debug += 1))
+               ;;
        s)
                sample_count=$OPTARG
                ;;
@@ -163,6 +168,13 @@ while getopts "c:s:n:p:z:Z:l:wqh" opt; do
        l)
                compression_level=$OPTARG
                ;;
+       m)
+               if (( OPTARG < 0 || OPTARG > 100 )); then
+                       echo "Compression margin must be between 0 and 100" 1>&2
+                       exit 1
+               fi
+               compression_margin=$OPTARG
+               ;;
        w)
                whole_file="true"
                ;;
@@ -199,19 +211,27 @@ fi
 export total_file_size=0
 export total_uncompressed_size=0
 export total_compressed_size=0
-export total_files_scanned=0
-export total_empty_files=0
-export total_uncompressed_size_estimated=0
+export total_files_sampled=0
+export total_small_files=0
+export total_incompressible_files=0
+export total_incompressible_size=0
+export total_uncompressed_size_sampled=0
 export total_compressed_size_estimated=0
 
 round_to_block_size() {
-       local size=$1
+       local size=$*
 
        echo $(( ((size - 1) | (block_size - 1)) + 1 ))
 }
 
-export format="--format=%s"
-[[ $(uname) != "Darwin" ]] || format="-f %z"
+round_to_chunk_size() {
+       local size=$*
+
+       echo $(( ((size - 1) | (chunk_size - 1)) + 1 ))
+}
+
+export format="--format=%b*%B"
+[[ $(uname) != "Darwin" ]] || format="-f %b*512"
 # Function to process a file
 process_file() {
        local file="$1"
@@ -219,96 +239,121 @@ process_file() {
        local sum_uncompressed_chunk=0
        local sum_compressed_chunk=0
 
-       total_files_scanned=$((total_files_scanned + 1))
-
-       if [[ -z "$file_size" ]] || (( $file_size == 0 )); then
-               total_empty_files=$((total_empty_files + 1))
-               return
-       fi
-
-       local segment_size
-       if [[ $whole_file == "true" ]]; then
-               segment_size=$chunk_size
-               sample_count=$((file_size / chunk_size))
+       # Round up the file_size to the next block (actual space usage)
+       file_size=$(round_to_block_size $file_size)
+       # Accumulate total size of files scanned (in block_size multiples)
+       total_file_size=$((total_file_size + file_size))
+       ((total_file_count+= 1))
+
+       # always count incompressible files, in case this is a large fraction
+       if [[ -z "$file_size" ]] || (( file_size <= block_size )); then
+               ((total_small_files+= 1))
+               ((total_files_sampled+= 1))
+               sum_uncompressed_chunk=$file_size
+               sum_compressed_chunk=$file_size
+               estimated_compressed_file_size=$file_size
        else
-               # Calculate the segment size for the file
-               segment_size=$((file_size / sample_count))
+               # randomly select $percentage of files after sampling min_files,
+               # unless file is larger than average of files checked so far
+               local average=$((total_file_size / ${total_files_sampled/#0/1}))
+               if (( total_files_sampled > min_files &&
+                     file_size < 2 * average )); then
+                       (( RANDOM % 100 < percentage )) || return
+               elif (( total_files_sampled > min_files && debug > 0 )); then
+                       echo -n "***"
+               fi
+
+               ((total_files_sampled+= 1))
 
-               # Limit sample_count for small file size, but have at least
-               # one chunk
-               if ((sample_count * chunk_size > file_size)); then
-                       sample_count=$((file_size / chunk_size))
+               local segment_size
+               if [[ $whole_file == "true" ]] ||
+                  (( file_size < chunk_size * sample_count )); then
+                       segment_size=$chunk_size
+                       segment_count=$(($(round_to_chunk_size file_size) /
+                                        chunk_size))
+               else
+                       # Calculate the segment size for the file
+                       segment_size=$((file_size / sample_count))
+                       segment_count=$sample_count
                fi
-       fi
-       if ((sample_count == 0)); then
-               sample_count=1
-       fi
 
-       # Round up the file_size to the next block (actual space usage)
-       file_size=$(round_to_block_size file_size)
-       # Accumulate the total size of files scanned (in block_size units)
-       total_file_size=$((total_file_size + file_size))
+               (( debug < 1 )) ||
+                       echo -n "$(basename $file): size: $file_size "
+               (( debug < 2 )) ||
+                       echo -n "segs: $segment_count segsz: $segment_size "
 
-       # Read and process each segment
-       for ((i = 0; i < sample_count; i++)); do
-               offset=$((i * segment_size / chunk_size))
-               compressed_size=$(dd if="$file" bs=$chunk_size count=1 skip=$offset 2>/dev/null | $compress | wc -c)
+               # Read and process each segment
+               for ((i = 0; i < segment_count; i++)); do
+                       offset=$((i * segment_size / chunk_size))
+                       compressed_size=$(dd if="$file" bs=$chunk_size count=1 \
+                               skip=$offset 2>/dev/null | $compress | wc -c)
 
-               # if the compressed size is zero, something must have failed
-               (( compressed_size > 0 )) || continue
+                       # if compressed size is zero, something must have failed
+                       (( compressed_size > 0 )) || continue
 
-               # Round up compressed size to full block size
-               compressed_size=$(round_to_block_size compressed_size)
+                       # Round up compressed size to full block size
+                       compressed_size=$(round_to_block_size compressed_size)
 
-               # Incompressible chunks will not be compressed
-               (( compressed_size < chunk_size )) || compressed_size=$chunk_size
+                       # Incompressible chunks will not be compressed
+                       (( compressed_size <= chunk_size )) ||
+                               compressed_size=$chunk_size
 
-               # Accumulate sampled chunk byte counts, but don't inflate size
-               sum_uncompressed_chunk=$((sum_uncompressed_chunk +
-                                         (chunk_size < file_size ?
-                                          chunk_size : file_size) ))
-               sum_compressed_chunk=$((sum_compressed_chunk + compressed_size))
+                       # Add sampled chunk bytes, but don't inflate last chunk
+                       last_chunk=$((file_size - offset * chunk_size ))
+                       (( last_chunk > chunk_size )) && last_chunk=$chunk_size
 
-       done
+                       ((sum_uncompressed_chunk+= last_chunk ))
+                       ((sum_compressed_chunk+= compressed_size))
+               done
 
-       # Get current ratio for this file
-       current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
-       # Assume ratio will be the same for the entire file
-       estimated_compressed_file_size=$(( file_size * 100 / current_ratio))
+               # Get current ratio for this file
+               current_ratio=$((sum_uncompressed_chunk * 100 / sum_compressed_chunk))
+               # Assume compression ratio will be the same for the entire file
+               estimated_compressed_file_size=$((file_size * 100 / current_ratio))
+
+               (( debug < 1 )) ||
+                       echo "uncompr: $sum_uncompressed_chunk compr: $sum_compressed_chunk est: $estimated_compressed_file_size avg: $average"
+       fi
+
+       if ((sum_compressed_chunk >= sum_uncompressed_chunk)); then
+               ((total_incompressible_files+= 1))
+               ((total_incompressible_size+= file_size))
+       fi
 
        # Accumulate the total uncompressed and compressed byte counts
-       total_uncompressed_size=$((total_uncompressed_size + sum_uncompressed_chunk))
-       total_compressed_size=$((total_compressed_size + sum_compressed_chunk))
+       ((total_uncompressed_size+= sum_uncompressed_chunk))
+       ((total_compressed_size+= sum_compressed_chunk))
 
        # Accumulate the estimated uncompressed and compressed byte counts
-       total_uncompressed_size_estimated=$((total_uncompressed_size_estimated +
-                                             file_size))
-       total_compressed_size_estimated=$((total_compressed_size_estimated +
-                                           estimated_compressed_file_size))
+       ((total_uncompressed_size_sampled+= file_size))
+       ((total_compressed_size_estimated+= estimated_compressed_file_size))
 }
 
-# Calculate compression ratio from estimated compressed file (value > 1)
-calculate_estimated_ratio() {
-       local ratio=$((total_uncompressed_size_estimated * 100 /
-                       total_compressed_size_estimated))
+# Calculate compression ratio of real compressed chunks vs original (value >= 1)
+calculate_ratio() {
+       local ratio=$((total_uncompressed_size * 100 / total_compressed_size))
 
        printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
 }
 
-# Calculate percentage of compressed size compared to original size (1-100%)
-calculate_pct() {
-       local pct=$((total_compressed_size * 10000 / total_uncompressed_size))
+# add correction factor for estimate safety margin with low sample percentage
+(( compression_margin == 0 )) && correction=100 ||
+       correction=$((100 + compression_margin + 10 * (100 - percentage) / 100))
 
-       printf "%u.%02u%%" $((pct / 100)) $((pct % 100))
+# Calculate compression ratio from estimated compressed file size (value >= 1)
+calculate_estimated_ratio() {
+       local ratio=$((total_uncompressed_size_sampled * 100 * 100 /
+                       (total_compressed_size_estimated * correction)))
+
+       printf "%u.%02u" $((ratio / 100)) $((ratio % 100))
 }
 
 # Calculate estimated compressed size of all files using the ratio from our
 # sample data
 calculate_estimated_total_compressed_size()
 {
-       local ratio=$1
-
-       printf "%d" $(echo "scale=0; $size_of_all_files / $compression_ratio_estimated" | bc)
+       echo $((total_file_size * total_compressed_size_estimated * correction /
+               (total_uncompressed_size_sampled * 100)))
 }
 
 print_size() {
@@ -316,16 +361,16 @@ print_size() {
        local frac
        local unit
 
-       if (( size > 9 * 2**50 )); then
+       if (( size > 4 * 2**50 )); then
                frac=$((size / 2**40))
                unit="PiB"
-       elif (( size > 9 * 2**40 )); then
+       elif (( size > 4 * 2**40 )); then
                frac=$((size / 2**30))
                unit="TiB"
-       elif (( size > 9 * 2**30 )); then
+       elif (( size > 4 * 2**30 )); then
                frac=$((size / 2**20))
                unit="GiB"
-       elif (( size > 9 * 2**20 )); then
+       elif (( size > 4 * 2**20 )); then
                frac=$((size / 2**10))
                unit="MiB"
        else
@@ -333,7 +378,7 @@ print_size() {
                unit="KiB"
        fi
 
-       printf "%u.%02u $unit" $((frac / 1024)) $((frac % 1000))
+       printf "%u.%03u $unit" $((frac / 1024)) $((frac % 1024))
 }
 
 (( quiet == 0 )) && runtime_description | fmt
@@ -362,25 +407,16 @@ echo ""
 echo ""
 
 while read FILE; do
-       total_file_count=$((total_file_count + 1))
-       # randomly select $percentage of files after sampling min_files
-       if (( total_files_scanned > min_files )); then
-               (( RANDOM % 100 < percentage )) || continue
-       fi
-
-       ### NOPE, you're not summing file size correctly imo
-       # You need to check the size of all the files not just the ones you're sampling
-       # oops
        process_file "$FILE"
 
        if (( quiet < 2 &&
              ((min_files > 1 && total_files_scanned == min_files) ||
-              total_files_scanned % lines == 0 ||
+              total_files_sampled % lines == 0 ||
               last + interval < SECONDS) )); then
-               if ((total_files_scanned != total_file_count)); then
-                       echo -ne "${cr}Sampled $total_files_scanned/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+               if ((total_files_sampled != total_file_count)); then
+                       echo -ne "${cr}Sampled $total_files_sampled/$total_file_count files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
                else
-                       echo -ne "${cr}Sampled $total_files_scanned files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
+                       echo -ne "${cr}Sampled $total_files_sampled files so far, estimated compression ratio $(calculate_estimated_ratio)x...${lf}"
                fi
                last=$SECONDS
        fi
@@ -404,19 +440,17 @@ if (( quiet == 0 )); then
        echo "Results"
        echo "---------------------"
 fi
-echo "Number of files sampled: $total_files_scanned ($((total_files_scanned * 100 / total_file_count))% of $total_file_count files)"
-#echo "Number of zero-length files: $total_empty_files"
-echo "Total size of files sampled: $(print_size $total_file_size)"
+
+echo "Compression type: ${compression_type/ -*/} Level: $compression_level"
+echo "Chunk size: $chunk_size"
+echo "Number of files sampled: $total_files_sampled ($((total_files_sampled * 100 / total_file_count))% of $total_file_count total files)"
+echo "Number of files under $block_size bytes (incompressible): $total_small_files"
+echo "Total number of incompressible files: $total_incompressible_files"
+echo "Total size of incompressible files: $(print_size $total_incompressible_size)"
+echo "Total size of files scanned: $(print_size $total_file_size)"
 echo "Total uncompressed size of sampled data: $(print_size $total_uncompressed_size)"
 echo "Total compressed size of sampled data: $(print_size $total_compressed_size)"
-echo "Compressed size as percentage of uncompressed size: $(calculate_pct)"
-compression_ratio_estimated=$(calculate_estimated_ratio)
-echo "Estimated compression ratio of sampled files: ${compression_ratio_estimated}x"
-if (( total_files_scanned < total_file_count )); then
-       size_of_all_files=$((total_file_size * total_file_count / total_files_scanned))
-       echo "Estimated size of all $total_file_count files: $(print_size $size_of_all_files)"
-else
-       size_of_all_files=$total_file_size
-fi
-estimated_total_compressed_size=$(calculate_estimated_total_compressed_size $ratio)
+echo "Compression ratio of sampled data: $(calculate_ratio)"
+echo "Estimated compression ratio of sampled files: $(calculate_estimated_ratio)"
+estimated_total_compressed_size=$(calculate_estimated_total_compressed_size)
 echo "Estimated compressed size of all files: $(print_size $estimated_total_compressed_size)"
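
To make the final estimate concrete, here is a stand-alone sketch of the
arithmetic in calculate_estimated_total_compressed_size above; the sizes
are invented for illustration, and the variable names mirror the script:

    total_file_size=$((100 * 1024 * 1024))                 # all files scanned
    total_uncompressed_size_sampled=$((10 * 1024 * 1024))  # bytes sampled
    total_compressed_size_estimated=$((4 * 1024 * 1024))   # after compression
    correction=114                                         # margin=5, percentage=1

    # Scale the sampled 10:4 ratio up to all files, padded by the correction:
    echo $((total_file_size * total_compressed_size_estimated * correction /
            (total_uncompressed_size_sampled * 100)))      # 47815065 ~ 45.6 MiB
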
diff --git a/lustre/tests/sanity-compr.sh b/lustre/tests/sanity-compr.sh
index ce06081..7055637 100644
--- a/lustre/tests/sanity-compr.sh
+++ b/lustre/tests/sanity-compr.sh
@@ -982,12 +982,14 @@ test_1007() {
                # Sync to disk and drop cache
                sync; echo 3 > /proc/sys/vm/drop_caches
 
-               local scan_cmd="ll_compression_scan -w -q -z $compr_type"
+               local scan_cmd="ll_compression_scan -m 0 -w -q -z $compr_type"
                [[ -z $has_level ]] || scan_cmd+=" -l $compr_level"
                scan_cmd+=" -c $chunksize"
 
                local estimated_size=$($scan_cmd $source |
-                       awk '/Estimated compressed size/{print $7}')
+                       awk '/Estimated compressed size/ { print $7 }')
+               [[ -n "$estimated_size" ]] || error "no compression estimate"
+               estimated_size=$(bc -l <<< "$estimated_size * 1024")
                local csdc_size=$(du -sk $tf | awk '{print $1}')
 
                local margin=5
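
For reference, a hypothetical invocation exercising the new options
(the path is a placeholder; '-m' and '-d' are the options added by this
patch):

    # 64 KiB chunks, gzip level 6, 10% safety margin, debug output
    ll_compression_scan -c 64 -z gzip -l 6 -m 10 -d /scratch/dataset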