From: Alex Zhuravlev Date: Wed, 16 Mar 2022 09:10:38 +0000 (+0300) Subject: LU-15645 obdclass: llog to handle gaps X-Git-Tag: 2.15.0-RC4~10 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=903f2f663956fef380b9f383e73a05b7beb0baa5;p=fs%2Flustre-release.git LU-15645 obdclass: llog to handle gaps due to old errors an update llog can contain gaps in index. this shouldn't block llog processing and recovery. actual gaps in transaction sequence should be caught by VBR. Signed-off-by: Alex Zhuravlev Change-Id: I11ec817e356f9658118c34706ef3a533e7faba83 Reviewed-on: https://review.whamcloud.com/46837 Tested-by: jenkins Reviewed-by: Andreas Dilger Reviewed-by: Alexander Boyko Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 4385468..9748c5a 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -504,6 +504,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 #define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 #define OBD_FAIL_TGT_NO_GRANT 0x725 +#define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 @@ -579,6 +580,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_CATALOG_FULL_CHECK 0x131a #define OBD_FAIL_CATLIST 0x131b #define OBD_FAIL_LLOG_PAUSE_AFTER_PAD 0x131c +#define OBD_FAIL_LLOG_ADD_GAP 0x131d #define OBD_FAIL_LLITE 0x1400 #define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 12e179e..e021ee4 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -670,23 +670,15 @@ repeat: continue; } - if (rec->lrh_index != index) { - /* - * the last time we couldn't parse the block due - * to corruption, thus has no idea about the - * next index, take it from the block, once. 
- */ - if (refresh_idx) { - refresh_idx = false; - index = rec->lrh_index; - } else { - CERROR("%s: "DFID" Invalid record: index" - " %u but expected %u\n", - loghandle2name(loghandle), - PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - GOTO(out, rc = -ERANGE); - } + if (rec->lrh_index > index) { + /* the record itself looks good, but we met a + * gap which can be result of old bugs, just + * keep going */ + CERROR("%s: "DFID" index %u, expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + index = rec->lrh_index; } CDEBUG(D_OTHER, diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c index 5aed12e..8354dd5 100644 --- a/lustre/obdclass/llog_osd.c +++ b/lustre/obdclass/llog_osd.c @@ -603,6 +603,8 @@ static int llog_osd_write_rec(const struct lu_env *env, down_write(&loghandle->lgh_last_sem); /* increment the last_idx along with llh_tail index, they should * be equal for a llog lifetime */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLOG_ADD_GAP) && --cfs_fail_val == 0) + loghandle->lgh_last_idx++; loghandle->lgh_last_idx++; index = loghandle->lgh_last_idx; LLOG_HDR_TAIL(llh)->lrt_index = index; diff --git a/lustre/osp/osp_trans.c b/lustre/osp/osp_trans.c index ea42715..95777ed 100644 --- a/lustre/osp/osp_trans.c +++ b/lustre/osp/osp_trans.c @@ -1115,6 +1115,7 @@ static int osp_send_update_req(const struct lu_env *env, ENTRY; LASSERT(oth != NULL); + LASSERT(osp->opd_obd); if (ou && ou->ou_generation != our->our_generation) { const struct lnet_processid *peer; diff --git a/lustre/target/update_trans.c b/lustre/target/update_trans.c index 361e314..b7ab00c 100644 --- a/lustre/target/update_trans.c +++ b/lustre/target/update_trans.c @@ -1283,6 +1283,9 @@ static int distribute_txn_cancel_records(const struct lu_env *env, struct sub_thandle *st; ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_TXN_NO_CANCEL)) + RETURN(0); + top_multiple_thandle_dump(tmt, D_INFO); /* Cancel update logs on other MDTs */ 
list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 93e5dd3..6547a39 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -1144,6 +1144,39 @@ test_31() { } run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots" +test_32() { + (( $MDSCOUNT < 2 )) && skip_env "needs >= 2 MDTs" + + # inject a gap with 10th transaction +#define OBD_FAIL_LLOG_ADD_GAP 0x131d + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10 + for ((i=0; i < 20; i++)); do + $LFS setdirstripe -i1 $DIR/$tdir-$i || + error "can't mkdir $DIR/$tdir-$i" + done + + # prevent update llog cancellation, so next boot MDS has + # process the update llog with gap injected +#define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726 + $LCTL set_param fail_loc=0x726 + + stop mds2 + stop mds1 + + $LCTL set_param fail_loc=0 + + mount_facet mds1 + mount_facet mds2 + + $LFS df $DIR + + local testid=$(echo $TESTNAME | tr '_' ' ') + dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" && + error "client got evicted due to aborted recovery" + return 0 +} +run_test 32 "gap in update llog shouldn't break recovery" + complete $SECONDS SLEEP=$((SECONDS - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP