From: Alexander Boyko
Date: Tue, 24 Nov 2020 09:05:36 +0000 (-0500)
Subject: LU-13974 tests: update log corruption
X-Git-Tag: 2.14.51~65
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=562837124ec7bffeba7edb4b4b899bc271833374

LU-13974 tests: update log corruption

The test case reproduces a missing object for a sub-transaction during a
set xattr operation. The first setxattr gets -ENOENT (-2), while the second
has already started but has not yet called llog_add. In this case the llog
OSP object is stale after top_trans_start, so the declaration phase cannot
refresh the llogs. Later, in llog_osd_write_rec, the OSP object changes its
state from stale to valid (via dt_attr_get), but the llog handle and the
llog header are still invalid, so a new record is added to the update log
with the wrong index. Processing of the update log then fails with:

  fs1-MDT0001-osp-MDT0003: [0x2:0x400024d0:0x2] Invalid record: index 112926 but expected 112925
  lod_sub_recovery_thread()) fs1-MDT0001-osp-MDT0003 get update log failed: rc = -34

Recovery is aborted and clients are evicted.

HPE-bug-id: LUS-9030
Test-Parameters: testlist=sanity envdefinitions=ONLY="427"
Signed-off-by: Alexander Boyko
Change-Id: I6a47fed1bc01f4be62216d1d0787adc413df0cf5
Reviewed-on: https://review.whamcloud.com/40743
Reviewed-by: Andreas Dilger
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexander Zarochentsev
Reviewed-by: Oleg Drokin
---

Notes (free-form text, ignored when the patch is applied):

All hunks below are keyed on one new fail point, OBD_FAIL_OUT_OBJECT_MISS
(0x1708), which sanity.sh test_427 arms on every MDT node with
fail_loc=0x80001708.

  * obd_support.h: defines the fail point id.
  * osp_md_object.c: the error path of osp_write_interpreter() calls
    OBD_RACE() on the fail point before it takes opo_lock and marks the
    object stale.
  * out_lib.c: out_tx_xattr_set_exec() takes the -ENOENT branch when the
    fail point pre-check is true, exactly as if the object did not exist.
  * update_trans.c, "Step 3": while the fail point pre-check is true,
    cfs_fail_val is incremented on every pass. Only on the pass that finds
    cfs_fail_val == 1 does the thread call OBD_RACE(), sleep
    uninterruptibly for cfs_time_seconds(1) / 10 and then clear
    cfs_fail_loc.
  * update_trans.c, distribute_txn_commit_thread(): while the fail point
    pre-check is true, each loop iteration sleeps uninterruptibly for
    cfs_time_seconds(5).
  * sanity.sh: test_427 runs two background setfattr calls on striped
    directories, fails over mds2, waits for recovery and reports an error
    if "get update log failed" shows up in the mds1 dmesg output.

Review fix: the Step 3 hunk used to call
schedule_timeout(schedule_timeout(timeout)). The inner call already
performs the sleep; the outer call then ran with the task back in
TASK_RUNNING and only the leftover jiffies as its argument, so it could not
sleep. It is now a single schedule_timeout(timeout). Hunk line counts are
unchanged. The post-image blob id on the update_trans.c "index" line still
names the original content.

Line breaks, blank context lines and tab indentation in the hunks were
restored from the hunk headers; verify them against the target tree before
applying.

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index b212042..64c4979 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -656,6 +656,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OUT_ENOSPC	0x1704
 #define OBD_FAIL_INVALIDATE_UPDATE		0x1705
 #define OBD_FAIL_OUT_UPDATE_DROP		0x1707
+#define OBD_FAIL_OUT_OBJECT_MISS		0x1708
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES		0x1801
diff --git a/lustre/osp/osp_md_object.c b/lustre/osp/osp_md_object.c
index 2abaf95..a90d5f5 100644
--- a/lustre/osp/osp_md_object.c
+++ b/lustre/osp/osp_md_object.c
@@ -1137,6 +1137,7 @@ static int osp_write_interpreter(const struct lu_env *env,
 	if (rc) {
 		CDEBUG(D_HA, "error "DFID": rc = %d\n",
 		       PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc);
+		OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
 		spin_lock(&obj->opo_lock);
 		obj->opo_attr.la_valid = 0;
 		obj->opo_stale = 1;
diff --git a/lustre/target/out_lib.c b/lustre/target/out_lib.c
index 56d16fa..ee6eeb4 100644
--- a/lustre/target/out_lib.c
+++ b/lustre/target/out_lib.c
@@ -754,7 +754,8 @@ static int out_tx_xattr_set_exec(const struct lu_env *env,
 	       dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf,
 	       arg->u.xattr_set.name, arg->u.xattr_set.flags);
 
-	if (!lu_object_exists(&dt_obj->do_lu)) {
+	if (!lu_object_exists(&dt_obj->do_lu) ||
+	    OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
 		rc = -ENOENT;
 	} else {
 		struct linkea_data ldata = { 0 };
diff --git a/lustre/target/update_trans.c b/lustre/target/update_trans.c
index 23fd09a..361e314 100644
--- a/lustre/target/update_trans.c
+++ b/lustre/target/update_trans.c
@@ -1026,6 +1026,17 @@ stop_master_trans:
 	/* Step 3: write updates to other MDTs */
 	if (write_updates) {
 		struct llog_update_record *lur;
+		if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+			if (cfs_fail_val == 1) {
+				long timeout = cfs_time_seconds(1) / 10;
+
+				OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(timeout);
+				cfs_fail_loc = 0;
+			}
+			cfs_fail_val++;
+		}
 
 		/* Stop callback of master will add more updates and also update
 		 * master transno, so merge the parameters and updates into one
@@ -1596,6 +1607,11 @@ static int distribute_txn_commit_thread(void *_arg)
 
 			if (current->state)
 				schedule();
+
+			if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(cfs_time_seconds(5));
+			}
 		}
 
 		while (({set_current_state(TASK_IDLE);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 9ae4778..f23237a 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -24088,6 +24088,38 @@ test_426() {
 }
 run_test 426 "splice test on Lustre"
 
+test_427() {
+	[ $MDSCOUNT -ge 2 ] || skip "needs >= 2 MDTs"
+	(( $MDS1_VERSION >= $(version_code 2.12.4) )) ||
+		skip "Need MDS version at least 2.12.4"
+	local log
+
+	mkdir $DIR/$tdir
+	mkdir $DIR/$tdir/1
+	mkdir $DIR/$tdir/2
+	test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/1/dir
+	test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/2/dir2
+
+	$LFS getdirstripe $DIR/$tdir/1/dir
+
+	#first setfattr for creating updatelog
+	setfattr -n user.attr0 -v "some text" $DIR/$tdir/1/dir
+
+#define OBD_FAIL_OUT_OBJECT_MISS 0x1708
+	do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0x80001708
+	setfattr -n user.attr1 -v "some text" $DIR/$tdir/1/dir &
+	setfattr -n user.attr2 -v "another attr" $DIR/$tdir/2/dir2 &
+
+	sleep 2
+	fail mds2
+	wait_recovery_complete mds2 $((2*TIMEOUT))
+
+	log=$(do_facet mds1 dmesg | tac | sed "/${TESTNAME//_/ }/,$ d")
+	echo $log | grep "get update log failed" &&
+		error "update log corruption is detected" || true
+}
+run_test 427 "Failed DNE2 update request shouldn't corrupt updatelog"
+
 lseek_test_430() {
 	local offset
 	local file=$1