Whamcloud - gitweb
LU-19015 llog: logic for skipping a zeroed record 67/59267/6
authorAlexander Boyko <alexander.boyko@hpe.com>
Fri, 16 May 2025 12:38:12 +0000 (14:38 +0200)
committerOleg Drokin <green@whamcloud.com>
Thu, 12 Jun 2025 06:36:04 +0000 (06:36 +0000)
After ENOSPC errors during dt_write(), or when writer threads race, the
changelog can become a sparse file with zero-filled gaps inside. The current
processing logic skips the rest of the chunk and resumes at the next chunk.
This patch adds the ability to skip only the zeros in the buffer and to
resume processing from the next valid record.
The fix also changes llog_test 8 so that it uses non-zero bytes for
corruption.

Fixes: cb1290768df9 ("LU-18218 mdd: changelog specific write function")
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I7263764ba6a89f226995b8967631eaa6d5bdd4dd
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59267
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/kunit/llog_test.c
lustre/mdd/mdd_dir.c
lustre/obdclass/llog.c
lustre/tests/recovery-small.sh

index 91d6710..863fe99 100644 (file)
@@ -263,6 +263,7 @@ extern bool obd_enable_fname_encoding;
 #define OBD_FAIL_MDS_CHANGELOG_ENOSPC          0x18c
 #define OBD_FAIL_MDS_BATCH_NET                 0x18d
 #define OBD_FAIL_MDS_HSM_DATA_VERSION_NET      0x18e
+#define OBD_FAIL_MDS_CHANGELOG_FAIL_WRITE      0x18f
 
 /* OI scrub */
 #define OBD_FAIL_OSD_SCRUB_DELAY               0x190
index 0ad3e1b..2abc94d 100644 (file)
@@ -1174,8 +1174,8 @@ static int test_8_cb(const struct lu_env *env, struct llog_handle *llh,
        return 0;
 }
 
-static int llog_zeroes(const struct lu_env *env, struct dt_object *o,
-                     __u64 start, __u64 end)
+static int llog_fill_bytes(const struct lu_env *env, struct dt_object *o,
+                          __u64 start, __u64 end, char byte)
 {
        struct lu_attr la;
        struct thandle *th;
@@ -1191,6 +1191,8 @@ static int llog_zeroes(const struct lu_env *env, struct dt_object *o,
        if (!buf)
                RETURN(-ENOMEM);
 
+       memset(buf, byte, end - start);
+
        LASSERT(o);
        d = lu2dt_dev(o->do_lu.lo_dev);
        LASSERT(d);
@@ -1334,8 +1336,8 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd)
 
        /* must lost all 20 records */
        CWARN("8b: clean first llog record in catalog\n");
-       llog_zeroes(env, llh->lgh_obj, 8192 + plain_pos,
-                   8192 + plain_pos + sizeof(struct llog_logid_rec));
+       llog_fill_bytes(env, llh->lgh_obj, 8192 + plain_pos,
+                       8192 + plain_pos + sizeof(struct llog_logid_rec), 0x5a);
 
        rc2 = llog_cat_close(env, llh);
        if (rc2) {
@@ -1347,10 +1349,10 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd)
 
        /* lost 28 records, from 5 to 32 in block */
        CWARN("8c: corrupt first chunk in the middle\n");
-       llog_zeroes(env, obj, 8192 + reclen * 4, 8192 + reclen * 10);
+       llog_fill_bytes(env, obj, 8192 + reclen * 4, 8192 + reclen * 10, 0xff);
        /* lost whole chunk - 32 records */
        CWARN("8c: corrupt second chunk at start\n");
-       llog_zeroes(env, obj, 16384, 16384 + reclen);
+       llog_fill_bytes(env, obj, 16384, 16384 + reclen, 0x01);
 
        CWARN("8d: count survived records\n");
        rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
index 694731b..475a3c5 100644 (file)
@@ -839,7 +839,11 @@ int mdd_changelog_write_rec(const struct lu_env *env,
                lgi_buf.lb_len = rec->cr_hdr.lrh_len;
                lgi_buf.lb_buf = rec;
 
-               rc = dt_record_write(env, o, &lgi_buf, &offset, th);
+               if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CHANGELOG_FAIL_WRITE) &&
+                   (rec->cr.cr_index % (cfs_fail_val + 1)) == 0)
+                       rc = -EIO;
+               else
+                       rc = dt_record_write(env, o, &lgi_buf, &offset, th);
 
                if (rc) {
                        CERROR("%s: failed to write changelog record file "DFID" rec idx %u off %llu chnlg idx %llu: rc = %d\n",
index 0a1119f..b78c9b0 100644 (file)
@@ -493,6 +493,24 @@ static inline bool llog_is_index_skipable(int idx, struct llog_log_hdr *llh,
        return !test_bit_le(idx, LLOG_HDR_BITMAP(llh));
 }
 
+/*
+ * Skip a zero-filled gap in an llog buffer.
+ *
+ * Starting at @start, step forward 4 bytes at a time while the header at
+ * the current position has a zero lrh_index or a zero lrh_len and the
+ * position is still before @end. @end is the last position at which a
+ * record header is probed.
+ *
+ * Returns the number of bytes between @start and the position where the
+ * scan stopped when the lrh_type found there carries LLOG_OP_MAGIC in
+ * either byte order (0 when @start itself is such a position). Returns
+ * -ENOENT when the scan went past @end or stopped on a header without the
+ * llog magic.
+ */
+static inline int llog_skip_gap(struct llog_rec_hdr *start, char *end)
+{
+       struct llog_rec_hdr *rec = start;
+
+       /*
+        * skipping zero gap; test the bound first so that the header is
+        * never read at a position beyond @end
+        */
+       while ((char *)rec < end &&
+              (rec->lrh_index == 0 || rec->lrh_len == 0))
+               rec = (typeof(rec))(((char *)rec) + 4);
+
+       /* the position check short-circuits the lrh_type read as well */
+       if ((char *)rec > end ||
+           !((rec->lrh_type & LLOG_OP_MASK) == LLOG_OP_MAGIC ||
+             ((rec->lrh_type & __swab32(LLOG_OP_MASK)) ==
+              __swab32(LLOG_OP_MAGIC))))
+               return -ENOENT;
+
+       return (int)((char *)rec - (char *)start);
+}
+
 static int llog_process_thread(void *arg)
 {
        struct llog_process_info *lpi = arg;
@@ -692,18 +710,29 @@ repeat:
 
                        rc = llog_verify_record(loghandle, rec);
                        if (rc) {
+                               int gap_size;
+
                                CDEBUG(D_OTHER, "invalid record at index %d\n",
                                       index);
                                /*
                                 * for fixed-sized llogs we can skip one record
                                 * by using llh_size from llog header.
-                                * Otherwise skip the next llog chunk.
                                 */
                                rc = 0;
                                if (llh->llh_flags & LLOG_F_IS_FIXSIZE) {
                                        rec->lrh_len = llh->llh_size;
                                        goto next_rec;
                                }
+                               /*
+                                * for zero gap we can find a next record.
+                                * Otherwise skip the next llog chunk.
+                                */
+                               gap_size = llog_skip_gap(rec, buf + chunk_size -
+                                                        LLOG_MIN_REC_SIZE);
+                               if (gap_size > 0) {
+                                       rec->lrh_len = gap_size;
+                                       goto next_rec;
+                               }
                                /* make sure that is always next block */
                                cur_offset = chunk_offset + chunk_size;
                                /* no goal to find, just next block to read */
index e222513..5c7d2cf 100755 (executable)
@@ -3831,6 +3831,41 @@ test_162() {
 }
 run_test 162 "File attributes should be persisted after MDS failover"
 
+test_163() {
+       remote_mds_nodsh && skip "remote MDS with nodsh"
+       (( "$MDS1_VERSION" >= $(version_code 2.16.54) )) ||
+               skip "Need MDS version at least 2.16.54 to skip llog holes"
+
+       local mdtidx
+       local mdtsvc
+
+       changelog_register || error "changelog_register failed"
+       stack_trap changelog_deregister EXIT
+       test_mkdir -c 0 $DIR/$tdir || error "mkdir $tdir failed"
+       mdtidx=$(($($LFS getdirstripe -i $DIR/$tdir) + 1))
+       mdtsvc=$(facet_svc mds$mdtidx)
+       echo mds$mdtidx $mdtsvc
+
+       cl_mask=$(do_facet mds$mdtidx $LCTL get_param mdd.$mdtsvc.changelog_mask -n)
+       changelog_chmask "ALL"
+       stack_trap "do_facet mds$mdtidx \
+               $LCTL set_param mdd.$mdtsvc.changelog_mask=\'$cl_mask\' -n" EXIT
+
+       #define OBD_FAIL_MDS_CHANGELOG_FAIL_WRITE                       0x18f
+       do_facet mds$mdtidx $LCTL set_param fail_loc=0x18f fail_val=30
+
+       # generate some changelog records to create a gap every 31 index
+       for (( i = 0; i < 10; i++)); do
+               createmany -m $DIR/$tdir/$tfile_$i 40 &
+       done
+
+       # Check changelog gap processing without a jump to a next chunk
+       changelog_dump | awk -F'[ .]' '{if(prev != "" && $2 - prev > 2) \
+                       {print"Errot between "prev" and "$2; exit 1}prev=$2}' ||
+                       error "Found a gap"
+}
+run_test 163 "changelog check for fail write and processing records"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 exit_status