From 483f2fa2f6768d32709d2fc56cece4e9f03b9e18 Mon Sep 17 00:00:00 2001 From: Bobi Jam Date: Fri, 12 May 2023 11:24:35 +0800 Subject: [PATCH] EX-6510 csdc: prefer uncompressed mirror for write When writing to mirrored files with both compressed and uncompressed mirrors, prefer the uncompressed components for the write, since that is better for performance, more compatible with older clients, and better fits the model of compressing files after the initial write. Signed-off-by: Bobi Jam Change-Id: I62a117d5cc3d34e2c0c96d1a9ade8eef0a2d1291 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/50974 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger --- lustre/lod/lod_internal.h | 3 +- lustre/lod/lod_lov.c | 12 ++++- lustre/lod/lod_object.c | 89 +++++++++++++++++++++++++++----------- lustre/tests/fsx.c | 4 +- lustre/tests/sanity-flr.sh | 40 ++++++++++++++++- lustre/utils/lfs.c | 7 +-- lustre/utils/liblustreapi_layout.c | 27 +++++++----- 7 files changed, 138 insertions(+), 44 deletions(-) diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 418abb2..89d26e4 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -234,7 +234,8 @@ enum layout_verify_flags { struct lod_mirror_entry { __u16 lme_stale:1, lme_prefer:1, - lme_hsm:1; + lme_hsm:1, + lme_compressed:1; /* mirror id */ __u16 lme_id; /* preference */ diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 4b223d3..5580ae7 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -617,6 +617,7 @@ int lod_fill_mirrors(struct lod_object *lo) bool init = (lod_comp->llc_stripe != NULL) && !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) && !(lod_comp->llc_pattern & LOV_PATTERN_MDT); + bool compressed = lod_comp->llc_flags & LCME_FL_COMPRESS; int j; pref = 0; @@ -646,6 +647,9 @@ int lod_fill_mirrors(struct lod_object *lo) if (ltd->ltd_statfs.os_state & OS_STATFS_NONROT) pref++; + /* uncompressed mirror adds extra preference value 
*/ + if (!compressed) + pref++; } if (mirror_id_of(lod_comp->llc_id) == mirror_id) { @@ -654,12 +658,14 @@ int lod_fill_mirrors(struct lod_object *lo) RETURN(-EINVAL); lo->ldo_mirrors[mirror_idx].lme_stale |= stale; lo->ldo_mirrors[mirror_idx].lme_prefer |= preferred; + lo->ldo_mirrors[mirror_idx].lme_compressed |= + compressed; lo->ldo_mirrors[mirror_idx].lme_preference += pref; lo->ldo_mirrors[mirror_idx].lme_end = i; continue; } - if (mirror_idx >= 0 && preferred && + if (mirror_idx >= 0 && lo->ldo_mirrors[mirror_idx].lme_prefer && !lo->ldo_mirrors[mirror_idx].lme_stale) found_preferred = true; @@ -678,12 +684,16 @@ int lod_fill_mirrors(struct lod_object *lo) lo->ldo_mirrors[mirror_idx].lme_stale = stale; lo->ldo_mirrors[mirror_idx].lme_prefer = preferred; lo->ldo_mirrors[mirror_idx].lme_hsm = mirror_hsm; + lo->ldo_mirrors[mirror_idx].lme_compressed = compressed; lo->ldo_mirrors[mirror_idx].lme_preference = pref; lo->ldo_mirrors[mirror_idx].lme_start = i; lo->ldo_mirrors[mirror_idx].lme_end = i; } if (mirror_idx != lo->ldo_mirror_count - 1) RETURN(-EINVAL); + else if (mirror_idx > 0 && lo->ldo_mirrors[mirror_idx].lme_prefer && + !lo->ldo_mirrors[mirror_idx].lme_stale) + found_preferred = true; if (!found_preferred && mirror_idx > 0) { int best = -1; diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 7c78643..0667ed3 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -7303,7 +7303,8 @@ static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo, unsigned int seq = 0; struct lod_layout_component *lod_comp; int i, j, rc; - int picked = -1, second_pick = -1, third_pick = -1; + int picked = -1, t2_pick = -1, t3_pick = -1; + int p1 = -1, p2 = -1, p3 = -1; /* first tier pick priority */ ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) { @@ -7333,21 +7334,6 @@ static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo, continue; } - /* 2nd pick is for the primary mirror containing unavail 
OST */ - if (lo->ldo_mirrors[index].lme_prefer && second_pick < 0) - second_pick = index; - - /* 3rd pick is for non-primary mirror containing unavail OST */ - if (second_pick < 0 && third_pick < 0) - third_pick = index; - - /** - * we found a non-primary 1st pick, we'd like to find a - * potential pirmary mirror. - */ - if (picked >= 0 && !lo->ldo_mirrors[index].lme_prefer) - continue; - /* check the availability of OSTs */ lod_foreach_mirror_comp(lod_comp, lo, index) { if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe) @@ -7372,25 +7358,78 @@ static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo, * the OSTs where allocated objects locates in the components * of the mirror are available. */ - if (!ost_avail) + if (!ost_avail) { + /** + * 2nd pick is for the preferred mirror containing + * unavail OST + */ + if (lo->ldo_mirrors[index].lme_prefer) { + if (t2_pick < 0 || + lo->ldo_mirrors[index].lme_preference > + lo->ldo_mirrors[t2_pick].lme_preference) + t2_pick = index; + } else { + /** + * 3rd pick is for non-preferred mirror + * containing unavail OST + */ + if (t3_pick < 0 || + lo->ldo_mirrors[index].lme_preference > + lo->ldo_mirrors[t3_pick].lme_preference) + t3_pick = index; + } + continue; + } /* this mirror has all OSTs available */ - picked = index; /** - * primary with all OSTs are available, this is the perfect - * 1st pick. + * this mirror has all OSTs available. 
+ * + * We'd pick a mirror for write with following priority: + * + * preferred compressed priority/pick + * A 1 0 pick + * B 1 1 p1 + * C 0 0 p2 + * D 0 1 p3 + * */ - if (lo->ldo_mirrors[index].lme_prefer) - break; + if (lo->ldo_mirrors[index].lme_prefer) { + /* A */ + if (!lo->ldo_mirrors[index].lme_compressed) { + picked = index; + break; + } + + /* B */ + if (p1 < 0 || + lo->ldo_mirrors[index].lme_preference > + lo->ldo_mirrors[p1].lme_preference) + p1 = index; + } else if (!lo->ldo_mirrors[index].lme_compressed) { + /* C */ + if (p2 < 0 || + lo->ldo_mirrors[index].lme_preference > + lo->ldo_mirrors[p2].lme_preference) + p2 = index; + } else { + /* D */ + if (p3 < 0 || + lo->ldo_mirrors[index].lme_preference > + lo->ldo_mirrors[p3].lme_preference) + p3 = index; + } } /* for all mirrors */ /* failed to pick a sound mirror, lower our expectation */ if (picked < 0) - picked = second_pick; - if (picked < 0) - picked = third_pick; + picked = (p1 >= 0) ? p1 : + (p2 >= 0) ? p2 : + (p3 >= 0) ? p3 : + (t2_pick >= 0) ? t2_pick : + (t3_pick >= 0) ? 
t3_pick : -1; if (picked < 0) RETURN(-ENODATA); diff --git a/lustre/tests/fsx.c b/lustre/tests/fsx.c index 0459e4b..931d20d 100644 --- a/lustre/tests/fsx.c +++ b/lustre/tests/fsx.c @@ -1466,12 +1466,12 @@ do_mirror_ops(int op) rc = system(cmd); if (rc < 0) { - prt("%s: %d\n", cmd, errno); + prt("mirror op %d: %s: %d\n", op, cmd, errno); report_failure(184); } else if (WIFEXITED(rc)) { rc = WEXITSTATUS(rc); if (rc > 0) { - prt("%s: %d\n", cmd, rc); + prt("mirror op %d: %s: %d\n", op, cmd, rc); snprintf(cmd, sizeof(cmd), "lfs mirror verify -v %s", tf->path); rc = system(cmd); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 917d879..e1944f4 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -2021,6 +2021,7 @@ test_43b() { } run_test 43b "allow writing to multiple preferred mirror file" +export LFS_SETSTRIPE_COMPR_OK="yes" test_43c() { (( $MDS1_VERSION >= $(version_code 2.14.0.88) )) || skip "Need MDS >= 2.14.0.88 for compression support" @@ -2045,6 +2046,43 @@ test_43c() { } run_test 43c "read prefer uncompressed mirror" +test_43d() { + (( $MDS1_VERSION >= $(version_code 2.14.0.88) )) || + skip "Need MDS >= 2.14.0.88 for compression support" + + local tf=$DIR/$tdir/$tfile + local flags + local p="$TMP/$TESTSUITE-$TESTNAME.parameters" + + save_lustre_params client "llite.*.enable_compression" > $p + stack_trap "rm -rf $DIR/$tdir; restore_lustre_params < $p" EXIT + $LCTL set_param llite.*.enable_compression=1 + + test_mkdir $DIR/$tdir + ## mirror 0 compressed + ## mirror 1 uncompressed + $LFS mirror create -N -Eeof -Z gzip -N -Eeof $tf || + error "create 2 mirrors file $tf failed" + + dd if=/dev/zero of=$tf bs=1M count=1 || error "failed to write $tf" + echo " **verify components" + verify_comp_attr lcme_flags $tf 0x10001 init,stale,compress + verify_comp_attr lcme_flags $tf 0x20002 init + + rm -f $tf + ## mirror 0 uncompressed + ## mirror 1 compressed + $LFS mirror create -N -Eeof -N -Eeof -Z gzip $tf || + error 
"create 2 mirrors file $tf failed" + + dd if=/dev/zero of=$tf bs=1M count=1 || error "failed to write $tf" + echo " **verify components" + verify_comp_attr lcme_flags $tf 0x10001 init + verify_comp_attr lcme_flags $tf 0x20002 init,stale,compress +} +run_test 43d "prefer write on uncompressed mirror" +export LFS_SETSTRIPE_COMPR_OK="" + test_44a() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return rm -rf $DIR/$tdir @@ -3217,7 +3255,7 @@ test_70a() { do_facet $SINGLEMDS $LCTL set_param mdt.*MDT*.enable_strict_som=0 $LFS setstripe -N -E1M -Eeof $tf || error "setstripe $tf failed" - $FSX -p 5 -N 1000 -S 0 -M $tf || error "fsx FLR file $tf failed" + $FSX -p 1 -N 1000 -S 0 -M $tf || error "fsx FLR file $tf failed" } run_test 70a "flr mode fsx test" diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index f6a9f35..1dc7351 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -12156,10 +12156,11 @@ int lfs_mirror_verify_chunk(int fd, size_t file_size, rc = -EINVAL; fprintf(stderr, - "%s: chunk "DEXT" has different checksum value on mirror %u and mirror %u.\n", + "%s: chunk "DEXT" has different checksum value on mirror %u:%lx and mirror %u:%lx.\n", progname, PEXT(&chunk->chunk), - chunk->mirror_id[0], - chunk->mirror_id[i]); + chunk->mirror_id[0], crc_array[0], + chunk->mirror_id[i], crc_array[i]); + print_checksums(chunk, crc_array, pos, buflen); } } diff --git a/lustre/utils/liblustreapi_layout.c b/lustre/utils/liblustreapi_layout.c index 1920110..df5b987 100644 --- a/lustre/utils/liblustreapi_layout.c +++ b/lustre/utils/liblustreapi_layout.c @@ -3262,6 +3262,7 @@ int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, struct llapi_resync_comp *comp_array, int comp_size, uint64_t start, uint64_t end) { + struct stat stbuf; size_t page_size = sysconf(_SC_PAGESIZE); const size_t buflen = 4 << 20; /* 4M */ void *buf; @@ -3276,6 +3277,9 @@ int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, if (rc) return -rc; + if (fstat(fd, &stbuf) < 0) 
+ return -errno; + while (pos < end) { uint64_t mirror_end; ssize_t bytes_read; @@ -3336,23 +3340,24 @@ int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, else to_punch = data_off - cur_pos; - if (comp_array[i].lrc_end == OBD_OBJECT_EOF) { + if (comp_array[i].lrc_end == OBD_OBJECT_EOF) /* the last component can be truncated * safely */ rc = llapi_mirror_truncate(fd, mid, cur_pos); - /* hole at the end of file, so just - * truncate up to set size. - */ - if (!rc && data_off == data_end) - rc = llapi_mirror_truncate(fd, + else + rc = llapi_mirror_punch(fd, mid, + cur_pos, to_punch); + /** + * hole at the end of file, so just + * truncate up to set size. + */ + if (!rc && data_off == data_end && + data_end == stbuf.st_size) + rc = llapi_mirror_truncate(fd, mid, data_end); - } else { - rc = llapi_mirror_punch(fd, - comp_array[i].lrc_mirror_id, - cur_pos, to_punch); - } + /* if failed then read failed hole range */ if (rc < 0) { rc = 0; -- 1.8.3.1