Whamcloud - gitweb
LU-13974 tests: update log corruption 43/40743/4
authorAlexander Boyko <c17825@cray.com>
Tue, 24 Nov 2020 09:05:36 +0000 (04:05 -0500)
committerOleg Drokin <green@whamcloud.com>
Wed, 10 Mar 2021 08:02:00 +0000 (08:02 +0000)
The test case reproduces a missing object for a sub-transaction during
a set xattr operation.
The first setattr got -2 (-ENOENT); the second had already started, but
had not called llog_add yet. In this case the llog osp object is stale
after top_trans_start, so the declaration phase cannot refresh the llogs.
Then, at llog_osd_write_rec, the osp object changes its state from stale
to valid (dt_attr_get), but the llog handle and llog header are still
invalid. A new record would be added to the update log with the wrong index.
In that case, processing of the update log fails with:

fs1-MDT0001-osp-MDT0003: [0x2:0x400024d0:0x2] Invalid record: index
112926 but expected 112925
lod_sub_recovery_thread()) fs1-MDT0001-osp-MDT0003 get update log
failed: rc = -34
Recovery is aborted, and clients are evicted.

HPE-bug-id: LUS-9030
Test-Parameters: testlist=sanity  envdefinitions=ONLY="427"
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I6a47fed1bc01f4be62216d1d0787adc413df0cf5
Reviewed-on: https://review.whamcloud.com/40743
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_md_object.c
lustre/target/out_lib.c
lustre/target/update_trans.c
lustre/tests/sanity.sh

index b212042..64c4979 100644 (file)
@@ -656,6 +656,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OUT_ENOSPC             0x1704
 #define OBD_FAIL_INVALIDATE_UPDATE     0x1705
 #define OBD_FAIL_OUT_UPDATE_DROP        0x1707
+#define OBD_FAIL_OUT_OBJECT_MISS       0x1708
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
index 2abaf95..a90d5f5 100644 (file)
@@ -1137,6 +1137,7 @@ static int osp_write_interpreter(const struct lu_env *env,
        if (rc) {
                CDEBUG(D_HA, "error "DFID": rc = %d\n",
                       PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc);
+               OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
                spin_lock(&obj->opo_lock);
                obj->opo_attr.la_valid = 0;
                obj->opo_stale = 1;
index 56d16fa..ee6eeb4 100644 (file)
@@ -754,7 +754,8 @@ static int out_tx_xattr_set_exec(const struct lu_env *env,
               dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf,
               arg->u.xattr_set.name, arg->u.xattr_set.flags);
 
-       if (!lu_object_exists(&dt_obj->do_lu)) {
+       if (!lu_object_exists(&dt_obj->do_lu) ||
+           OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
                rc = -ENOENT;
        } else {
                struct linkea_data ldata = { 0 };
index 23fd09a..361e314 100644 (file)
@@ -1026,6 +1026,17 @@ stop_master_trans:
        /* Step 3: write updates to other MDTs */
        if (write_updates) {
                struct llog_update_record *lur;
+               if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+                       if (cfs_fail_val == 1) {
+                               long timeout = cfs_time_seconds(1) / 10;
+
+                               OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
+                               set_current_state(TASK_UNINTERRUPTIBLE);
+                               schedule_timeout(schedule_timeout(timeout));
+                               cfs_fail_loc = 0;
+                       }
+                       cfs_fail_val++;
+               }
 
                /* Stop callback of master will add more updates and also update
                 * master transno, so merge the parameters and updates into one
@@ -1596,6 +1607,11 @@ static int distribute_txn_commit_thread(void *_arg)
 
                if (current->state)
                        schedule();
+
+               if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_timeout(cfs_time_seconds(5));
+               }
        }
 
        while (({set_current_state(TASK_IDLE);
index 9ae4778..f23237a 100755 (executable)
@@ -24088,6 +24088,38 @@ test_426() {
 }
 run_test 426 "splice test on Lustre"
 
+test_427() { # LU-13974: a failed DNE2 update must not corrupt the update log
+       [ $MDSCOUNT -ge 2 ] || skip "needs >= 2 MDTs"
+       (( $MDS1_VERSION >= $(version_code 2.12.4) )) ||
+               skip "Need MDS version at least 2.12.4"
+       local log
+       # two separate trees: the racing setfattr calls below target one dir in each
+       mkdir $DIR/$tdir
+       mkdir $DIR/$tdir/1
+       mkdir $DIR/$tdir/2
+       test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/1/dir
+       test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/2/dir2
+
+       $LFS getdirstripe $DIR/$tdir/1/dir # output only; the result is not checked
+
+       # first setfattr, issued before fail_loc is armed, creates the update log
+       setfattr -n user.attr0 -v "some text" $DIR/$tdir/1/dir
+       # arm the fail_loc on every MDT node, then start two setfattr in background
+#define OBD_FAIL_OUT_OBJECT_MISS        0x1708
+       do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0x80001708
+       setfattr -n user.attr1 -v "some text" $DIR/$tdir/1/dir &
+       setfattr -n user.attr2 -v "another attr"  $DIR/$tdir/2/dir2 &
+       # give the background setfattr calls time to run, then fail mds2 and wait
+       sleep 2
+       fail mds2
+       wait_recovery_complete mds2 $((2*TIMEOUT))
+       # keep only mds1 dmesg lines logged after the last mention of this test
+       log=$(do_facet mds1 dmesg | tac | sed "/${TESTNAME//_/ }/,$ d")
+       echo $log | grep "get update log failed" &&
+               error "update log corruption is detected" || true
+}
+run_test 427 "Failed DNE2 update request shouldn't corrupt updatelog"
+
 lseek_test_430() {
        local offset
        local file=$1