From: Alexander Boyko Date: Fri, 16 May 2025 12:38:12 +0000 (+0200) Subject: LU-19015 llog: logic for skipping a zeroed record X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=919c5d25fc45121466ae0ea803558039a2162538;p=fs%2Flustre-release.git LU-19015 llog: logic for skipping a zeroed record Due to ENOSPC errors during dt_write() and thread races, the changelog could become a sparse file with zeros inside. The current processing logic skips the rest of the records up to the next chunk. This patch adds the ability to skip only the zeros in the buffer and resume from the next valid record. The fix also changes llog_test 8 so that it uses non-zero bytes for corruption. Fixes: cb1290768df9 ("LU-18218 mdd: changelog specific write function") Signed-off-by: Alexander Boyko Change-Id: I7263764ba6a89f226995b8967631eaa6d5bdd4dd Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59267 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Andriy Skulysh Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 91d6710..863fe99 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -263,6 +263,7 @@ extern bool obd_enable_fname_encoding; #define OBD_FAIL_MDS_CHANGELOG_ENOSPC 0x18c #define OBD_FAIL_MDS_BATCH_NET 0x18d #define OBD_FAIL_MDS_HSM_DATA_VERSION_NET 0x18e +#define OBD_FAIL_MDS_CHANGELOG_FAIL_WRITE 0x18f /* OI scrub */ #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 diff --git a/lustre/kunit/llog_test.c b/lustre/kunit/llog_test.c index 0ad3e1b..2abc94d 100644 --- a/lustre/kunit/llog_test.c +++ b/lustre/kunit/llog_test.c @@ -1174,8 +1174,8 @@ static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, return 0; } -static int llog_zeroes(const struct lu_env *env, struct dt_object *o, - __u64 start, __u64 end) +static int llog_fill_bytes(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end, char byte) { struct lu_attr la; struct thandle *th; @@ -1191,6 +1191,8 @@ 
static int llog_zeroes(const struct lu_env *env, struct dt_object *o, if (!buf) RETURN(-ENOMEM); + memset(buf, byte, end - start); + LASSERT(o); d = lu2dt_dev(o->do_lu.lo_dev); LASSERT(d); @@ -1334,8 +1336,8 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) /* must lost all 20 records */ CWARN("8b: clean first llog record in catalog\n"); - llog_zeroes(env, llh->lgh_obj, 8192 + plain_pos, - 8192 + plain_pos + sizeof(struct llog_logid_rec)); + llog_fill_bytes(env, llh->lgh_obj, 8192 + plain_pos, + 8192 + plain_pos + sizeof(struct llog_logid_rec), 0x5a); rc2 = llog_cat_close(env, llh); if (rc2) { @@ -1347,10 +1349,10 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) /* lost 28 records, from 5 to 32 in block */ CWARN("8c: corrupt first chunk in the middle\n"); - llog_zeroes(env, obj, 8192 + reclen * 4, 8192 + reclen * 10); + llog_fill_bytes(env, obj, 8192 + reclen * 4, 8192 + reclen * 10, 0xff); /* lost whole chunk - 32 records */ CWARN("8c: corrupt second chunk at start\n"); - llog_zeroes(env, obj, 16384, 16384 + reclen); + llog_fill_bytes(env, obj, 16384, 16384 + reclen, 0x01); CWARN("8d: count survived records\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index 694731b..475a3c5 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -839,7 +839,11 @@ int mdd_changelog_write_rec(const struct lu_env *env, lgi_buf.lb_len = rec->cr_hdr.lrh_len; lgi_buf.lb_buf = rec; - rc = dt_record_write(env, o, &lgi_buf, &offset, th); + if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CHANGELOG_FAIL_WRITE) && + (rec->cr.cr_index % (cfs_fail_val + 1)) == 0) + rc = -EIO; + else + rc = dt_record_write(env, o, &lgi_buf, &offset, th); if (rc) { CERROR("%s: failed to write changelog record file "DFID" rec idx %u off %llu chnlg idx %llu: rc = %d\n", diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 0a1119f..b78c9b0 100644 --- 
a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -493,6 +493,24 @@ static inline bool llog_is_index_skipable(int idx, struct llog_log_hdr *llh, return !test_bit_le(idx, LLOG_HDR_BITMAP(llh)); } +static inline int llog_skip_gap(struct llog_rec_hdr *start, char *end) +{ + struct llog_rec_hdr *rec = start; + + /* skipping zero gap */ + while ((rec->lrh_index == 0 || rec->lrh_len == 0) && + (char *)rec < (char *)end) + rec = (typeof(rec))(((char *)rec) + 4); + + if ((char *)rec > end || + !((rec->lrh_type & LLOG_OP_MASK) == LLOG_OP_MAGIC || + ((rec->lrh_type & __swab32(LLOG_OP_MASK)) == + __swab32(LLOG_OP_MAGIC)))) + return -ENOENT; + + return (int)((char *)rec - (char *)start); +} + static int llog_process_thread(void *arg) { struct llog_process_info *lpi = arg; @@ -692,18 +710,29 @@ repeat: rc = llog_verify_record(loghandle, rec); if (rc) { + int gap_size; + CDEBUG(D_OTHER, "invalid record at index %d\n", index); /* * for fixed-sized llogs we can skip one record * by using llh_size from llog header. - * Otherwise skip the next llog chunk. */ rc = 0; if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { rec->lrh_len = llh->llh_size; goto next_rec; } + /* + * for zero gap we can find a next record. + * Otherwise skip the next llog chunk. 
+ */ + gap_size = llog_skip_gap(rec, buf + chunk_size - + LLOG_MIN_REC_SIZE); + if (gap_size > 0) { + rec->lrh_len = gap_size; + goto next_rec; + } /* make sure that is always next block */ cur_offset = chunk_offset + chunk_size; /* no goal to find, just next block to read */ diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index e222513..5c7d2cf 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3831,6 +3831,41 @@ test_162() { } run_test 162 "File attributes should be persisted after MDS failover" +test_163() { + remote_mds_nodsh && skip "remote MDS with nodsh" + (( "$MDS1_VERSION" >= $(version_code 2.16.54) )) || + skip "Need MDS version at least 2.16.54 to skip llog holes" + + local mdtidx + local mdtsvc + + changelog_register || error "changelog_register failed" + stack_trap changelog_deregister EXIT + test_mkdir -c 0 $DIR/$tdir || error "mkdir $tdir failed" + mdtidx=$(($($LFS getdirstripe -i $DIR/$tdir) + 1)) + mdtsvc=$(facet_svc mds$mdtidx) + echo mds$mdtidx $mdtsvc + + cl_mask=$(do_facet mds$mdtidx $LCTL get_param mdd.$mdtsvc.changelog_mask -n) + changelog_chmask "ALL" + stack_trap "do_facet mds$mdtidx \ + $LCTL set_param mdd.$mdtsvc.changelog_mask=\'$cl_mask\' -n" EXIT + + #define OBD_FAIL_MDS_CHANGELOG_FAIL_WRITE 0x18f + do_facet mds$mdtidx $LCTL set_param fail_loc=0x18f fail_val=30 + + # generate some changelog records to create a gap every 31 index + for (( i = 0; i < 10; i++)); do + createmany -m $DIR/$tdir/$tfile_$i 40 & + done + + # Check changelog gap processing without a jump to a next chunk + changelog_dump | awk -F'[ .]' '{if(prev != "" && $2 - prev > 2) \ + {print"Errot between "prev" and "$2; exit 1}prev=$2}' || + error "Found a gap" +} +run_test 163 "changelog check for fail write and processing records" + complete_test $SECONDS check_and_cleanup_lustre exit_status