Add a test case that reproduces a missing object for a sub-transaction
during a set xattr operation.
The first setattr gets -2 (-ENOENT) while a second one has already started
but has not called llog_add yet. In this case the llog OSP object is stale
after top_trans_start, so the declaration phase cannot refresh the llogs.
Then, in llog_osd_write_rec, the OSP object changes its state from stale to
valid (dt_attr_get), but the llog handle and llog header are still invalid.
As a result, a new record is added to the update log with the wrong index.
In that case, processing of the update log fails with:
fs1-MDT0001-osp-MDT0003: [0x2:0x400024d0:0x2] Invalid record: index
112926 but expected 112925
lod_sub_recovery_thread()) fs1-MDT0001-osp-MDT0003 get update log
failed: rc = -34
Recovery is aborted and the clients are evicted.
Lustre-change: https://review.whamcloud.com/40743
Lustre-commit:
562837124ec7bffeba7edb4b4b899bc271833374
HPE-bug-id: LUS-9030
Test-Parameters: testlist=sanity envdefinitions=ONLY="427"
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I6a47fed1bc01f4be62216d1d0787adc413df0cf5
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
#define OBD_FAIL_LARGE_STRIPE 0x1703
#define OBD_FAIL_OUT_ENOSPC 0x1704
#define OBD_FAIL_INVALIDATE_UPDATE 0x1705
+#define OBD_FAIL_OUT_UPDATE_DROP 0x1707
+#define OBD_FAIL_OUT_OBJECT_MISS 0x1708
/* MIGRATE */
#define OBD_FAIL_MIGRATE_ENTRIES 0x1801
if (rc) {
CDEBUG(D_HA, "error "DFID": rc = %d\n",
PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc);
+ OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
spin_lock(&obj->opo_lock);
obj->opo_attr.la_valid = 0;
obj->opo_stale = 1;
dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf,
arg->u.xattr_set.name, arg->u.xattr_set.flags);
- if (!lu_object_exists(&dt_obj->do_lu)) {
+ if (!lu_object_exists(&dt_obj->do_lu) ||
+ OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
rc = -ENOENT;
} else {
struct linkea_data ldata = { 0 };
/* Step 3: write updates to other MDTs */
if (write_updates) {
struct llog_update_record *lur;
+ /*
+ * Fault injection for OBD_FAIL_OUT_OBJECT_MISS: cfs_fail_val counts
+ * the passes through this point while the fail_loc is set. On the
+ * pass that sees cfs_fail_val == 1 the thread synchronizes on
+ * OBD_RACE(), sleeps for a tenth of a second and then clears
+ * cfs_fail_loc so that later passes are not affected.
+ * NOTE(review): assumes cfs_fail_val starts at 0 when the test sets
+ * fail_loc, so this is the second update writer - confirm.
+ */
+ if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+ if (cfs_fail_val == 1) {
+ long timeout = cfs_time_seconds(1) / 10;
+
+ OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
+ /*
+ * Sleep exactly once. schedule_timeout() returns the
+ * remaining jiffies, so feeding its result into a
+ * second schedule_timeout() call only adds a stray
+ * extra scheduling call.
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(timeout);
+ cfs_fail_loc = 0;
+ }
+ cfs_fail_val++;
+ }
/* Stop callback of master will add more updates and also update
* master transno, so merge the parameters and updates into one
!distribute_txn_commit_thread_running(lut) ||
committed < tdtd->tdtd_committed_batchid ||
tdtd_ready_for_cancel_log(tdtd), &lwi);
- };
+
+ if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(5));
+ }
+ }
l_wait_event(tdtd->tdtd_commit_thread_waitq,
atomic_read(&tdtd->tdtd_refcount) == 0, &lwi);
}
run_test 423 "statfs should return a right data"
+# Regression test: a failed DNE2 update request must not corrupt the update
+# log. Two setfattr calls run concurrently on striped directories while
+# OBD_FAIL_OUT_OBJECT_MISS is armed on the MDT nodes; afterwards mds2 is
+# failed and the mds1 kernel log is checked for "get update log failed".
+test_427() {
+ [ $MDSCOUNT -ge 2 ] || skip "needs >= 2 MDTs"
+ (( $MDS1_VERSION >= $(version_code 2.12.8) )) ||
+ skip "Need MDS version at least 2.12.8"
+ local log
+
+ mkdir $DIR/$tdir
+ mkdir $DIR/$tdir/1
+ mkdir $DIR/$tdir/2
+ # One test directory under each parent, both created with the same
+ # "-c $MDSCOUNT -i 1" options.
+ test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/1/dir
+ test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/2/dir2
+
+ $LFS getdirstripe $DIR/$tdir/1/dir
+
+ # The first setfattr runs before the fail_loc is set, so that the
+ # update log already exists.
+ setfattr -n user.attr0 -v "some text" $DIR/$tdir/1/dir
+
+#define OBD_FAIL_OUT_OBJECT_MISS 0x1708
+ do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0x80001708
+ # Both setfattr calls are started in the background so that they run
+ # concurrently while the fail_loc is set.
+ setfattr -n user.attr1 -v "some text" $DIR/$tdir/1/dir &
+ setfattr -n user.attr2 -v "another attr" $DIR/$tdir/2/dir2 &
+
+ sleep 2
+ # NOTE(review): presumably "fail mds2" restarts mds2 so that recovery
+ # has to process the update logs - confirm against the test framework.
+ fail mds2
+ wait_recovery_complete mds2 $((2*TIMEOUT))
+
+ # Reverse dmesg with tac and delete everything from the first line
+ # matching the test name (underscores replaced by spaces) to the end,
+ # which keeps only the messages newer than that line.
+ log=$(do_facet mds1 dmesg | tac | sed "/${TESTNAME//_/ }/,$ d")
+ # Report an error if the message is present; "|| true" keeps the
+ # function's exit status zero when grep finds nothing.
+ echo $log | grep "get update log failed" &&
+ error "update log corruption is detected" || true
+}
+run_test 427 "Failed DNE2 update request shouldn't corrupt updatelog"
+
prep_801() {
[[ $(lustre_version_code mds1) -lt $(version_code 2.9.55) ]] ||
[[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&