From: Alex Zhuravlev Date: Wed, 16 Mar 2022 09:10:38 +0000 (+0300) Subject: LU-15645 obdclass: llog to handle gaps X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=9138996071bd17bfdfb98c1b2c544e6d75614f49;p=fs%2Flustre-release.git LU-15645 obdclass: llog to handle gaps Due to old errors, an update llog can contain gaps in its index. This shouldn't block llog processing and recovery. Actual gaps in the transaction sequence should be caught by VBR. Lustre-change: https://review.whamcloud.com/46837 Lustre-commit: TBD (from b3de0d57bd0f7cd2e918aa9d3f08be1c69697b80) Signed-off-by: Alex Zhuravlev Change-Id: I11ec817e356f9658118c34706ef3a533e7faba83 Reviewed-on: https://review.whamcloud.com/46884 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index bc431e6..0495bd6 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -501,6 +501,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 #define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 #define OBD_FAIL_TGT_NO_GRANT 0x725 +#define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 @@ -577,6 +578,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_CATALOG_FULL_CHECK 0x131a #define OBD_FAIL_CATLIST 0x131b #define OBD_FAIL_LLOG_PAUSE_AFTER_PAD 0x131c +#define OBD_FAIL_LLOG_ADD_GAP 0x131d #define OBD_FAIL_LLITE 0x1400 #define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 240f76b..f8b77b9 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -671,23 +671,15 @@ repeat: continue; } - if (rec->lrh_index != index) { - /* - * the last time we couldn't parse the block due - * to corruption, thus has no idea about the - * next index, take it from the block, once. 
- */ - if (refresh_idx) { - refresh_idx = false; - index = rec->lrh_index; - } else { - CERROR("%s: "DFID" Invalid record: index" - " %u but expected %u\n", - loghandle2name(loghandle), - PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - GOTO(out, rc = -ERANGE); - } + if (rec->lrh_index > index) { + /* the record itself looks good, but we met a + * gap which can be result of old bugs, just + * keep going */ + CERROR("%s: "DFID" index %u, expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + index = rec->lrh_index; } CDEBUG(D_OTHER, diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c index 11d1ff9..3116ae8 100644 --- a/lustre/obdclass/llog_osd.c +++ b/lustre/obdclass/llog_osd.c @@ -604,6 +604,8 @@ static int llog_osd_write_rec(const struct lu_env *env, down_write(&loghandle->lgh_last_sem); /* increment the last_idx along with llh_tail index, they should * be equal for a llog lifetime */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLOG_ADD_GAP) && --cfs_fail_val == 0) + loghandle->lgh_last_idx++; loghandle->lgh_last_idx++; index = loghandle->lgh_last_idx; LLOG_HDR_TAIL(llh)->lrt_index = index; diff --git a/lustre/target/update_trans.c b/lustre/target/update_trans.c index 23fd09a..5180663 100644 --- a/lustre/target/update_trans.c +++ b/lustre/target/update_trans.c @@ -1272,6 +1272,9 @@ static int distribute_txn_cancel_records(const struct lu_env *env, struct sub_thandle *st; ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_TXN_NO_CANCEL)) + RETURN(0); + top_multiple_thandle_dump(tmt, D_INFO); /* Cancel update logs on other MDTs */ list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 41b3a8a..8f6f6cc 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -1068,6 +1068,39 @@ test_29() { } run_test 29 "replay vs update with the same xid" +test_32() { + (( $MDSCOUNT < 2 )) && skip_env 
"needs >= 2 MDTs" + + # inject a gap with 10th transaction +#define OBD_FAIL_LLOG_ADD_GAP 0x131d + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10 + for ((i=0; i < 20; i++)); do + $LFS setdirstripe -i1 $DIR/$tdir-$i || + error "can't mkdir $DIR/$tdir-$i" + done + + # prevent update llog cancellation, so next boot MDS has + # process the update llog with gap injected +#define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726 + $LCTL set_param fail_loc=0x726 + + stop mds2 + stop mds1 + + $LCTL set_param fail_loc=0 + + mount_facet mds1 + mount_facet mds2 + + $LFS df $DIR + + local testid=$(echo $TESTNAME | tr '_' ' ') + dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" && + error "client got evicted due to aborted recovery" + return 0 +} +run_test 32 "gap in update llog shouldn't break recovery" + complete $SECONDS SLEEP=$((SECONDS - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP