Whamcloud - gitweb
LU-13974 tests: update log corruption 64/46864/1
authorAlexander Boyko <c17825@cray.com>
Tue, 24 Nov 2020 09:05:36 +0000 (04:05 -0500)
committerMikhail Pershin <mpershin@whamcloud.com>
Fri, 18 Mar 2022 06:47:40 +0000 (09:47 +0300)
The test case reproduces a missing object for a sub-transaction during
a set xattr operation.
The first setattr got -2 (-ENOENT); the second had already started but had
not called llog_add yet. In this case the llog osp object is stale after
top_trans_start, so the declaration phase cannot refresh the llogs. Then,
in llog_osd_write_rec, the osp object changes from stale to valid
(dt_attr_get), but the llog handle and llog header are still invalid, so
a new record is added to the update log with the wrong index.
In that case processing of the update log fails with:

fs1-MDT0001-osp-MDT0003: [0x2:0x400024d0:0x2] Invalid record: index
112926 but expected 112925
lod_sub_recovery_thread()) fs1-MDT0001-osp-MDT0003 get update log
failed: rc = -34
Recovery aborted, and clients are evicted.

Lustre-change: https://review.whamcloud.com/40743
Lustre-commit: 562837124ec7bffeba7edb4b4b899bc271833374

HPE-bug-id: LUS-9030
Test-Parameters: testlist=sanity  envdefinitions=ONLY="427"
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I6a47fed1bc01f4be62216d1d0787adc413df0cf5
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_md_object.c
lustre/target/out_lib.c
lustre/target/update_trans.c
lustre/tests/sanity.sh

index b3c7a2b..514fafb 100644 (file)
@@ -644,6 +644,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LARGE_STRIPE          0x1703
 #define OBD_FAIL_OUT_ENOSPC             0x1704
 #define OBD_FAIL_INVALIDATE_UPDATE     0x1705
+#define OBD_FAIL_OUT_UPDATE_DROP        0x1707
+#define OBD_FAIL_OUT_OBJECT_MISS       0x1708
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
index ae8ebc0..312c0dd 100644 (file)
@@ -1137,6 +1137,7 @@ static int osp_write_interpreter(const struct lu_env *env,
        if (rc) {
                CDEBUG(D_HA, "error "DFID": rc = %d\n",
                       PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc);
+               OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
                spin_lock(&obj->opo_lock);
                obj->opo_attr.la_valid = 0;
                obj->opo_stale = 1;
index e7f9b62..2c2ebdd 100644 (file)
@@ -755,7 +755,8 @@ static int out_tx_xattr_set_exec(const struct lu_env *env,
               dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf,
               arg->u.xattr_set.name, arg->u.xattr_set.flags);
 
-       if (!lu_object_exists(&dt_obj->do_lu)) {
+       if (!lu_object_exists(&dt_obj->do_lu) ||
+           OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
                rc = -ENOENT;
        } else {
                struct linkea_data ldata = { 0 };
index b8150fa..40def43 100644 (file)
@@ -1039,6 +1039,17 @@ stop_master_trans:
        /* Step 3: write updates to other MDTs */
        if (write_updates) {
                struct llog_update_record *lur;
+               /*
+                * Fault injection used by sanity test_427
+                * (OBD_FAIL_OUT_OBJECT_MISS). Every thread that gets here while
+                * the fail_loc is armed bumps cfs_fail_val. The one that sees
+                * cfs_fail_val == 1 meets the OBD_RACE() placed in the error
+                * path of osp_write_interpreter(), sleeps for a further 1/10 of
+                * a second and then disarms the fail_loc.
+                * NOTE(review): assumes cfs_fail_val starts at 0 for the test,
+                * i.e. the second thread through is the one that races.
+                */
+               if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+                       if (cfs_fail_val == 1) {
+                               long timeout = cfs_time_seconds(1) / 10;
+
+                               OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
+                               /*
+                                * Sleep exactly once. schedule_timeout() returns
+                                * the jiffies left over, so passing its result
+                                * to a second schedule_timeout() call does not
+                                * extend the delay; it merely yields the CPU.
+                                */
+                               set_current_state(TASK_UNINTERRUPTIBLE);
+                               schedule_timeout(timeout);
+                               cfs_fail_loc = 0;
+                       }
+                       cfs_fail_val++;
+               }
 
                /* Stop callback of master will add more updates and also update
                 * master transno, so merge the parameters and updates into one
@@ -1649,7 +1660,12 @@ static int distribute_txn_commit_thread(void *_arg)
                             !distribute_txn_commit_thread_running(lut) ||
                             committed < tdtd->tdtd_committed_batchid ||
                             tdtd_ready_for_cancel_log(tdtd), &lwi);
-       };
+
+               if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) { /* test hook:
+                        * while this fail_loc is armed, put the commit thread
+                        * to sleep for cfs_time_seconds(5) on every pass of the
+                        * loop. NOTE(review): presumably this delays update-log
+                        * cancellation until sanity test_427 has failed over
+                        * the MDS -- confirm. */
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_timeout(cfs_time_seconds(5));
+               }
+       }
 
        l_wait_event(tdtd->tdtd_commit_thread_waitq,
                     atomic_read(&tdtd->tdtd_refcount) == 0, &lwi);
index 3f00b05..7528e64 100755 (executable)
@@ -21265,6 +21265,38 @@ test_423() {
 }
 run_test 423 "statfs should return a right data"
 
+test_427() { # a failed DNE2 update request must not corrupt the update log
+       [ $MDSCOUNT -ge 2 ] || skip "needs >= 2 MDTs"
+       (( $MDS1_VERSION >= $(version_code 2.12.8) )) ||
+               skip "Need MDS version at least 2.12.8"
+       local log
+
+       mkdir $DIR/$tdir
+       mkdir $DIR/$tdir/1
+       mkdir $DIR/$tdir/2
+       test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/1/dir # -c/-i: presumably stripe count and MDT index
+       test_mkdir -c $MDSCOUNT -i 1 $DIR/$tdir/2/dir2
+
+       $LFS getdirstripe $DIR/$tdir/1/dir
+
+       # the first setfattr runs without fault injection so that the update log exists
+       setfattr -n user.attr0 -v "some text" $DIR/$tdir/1/dir
+
+#define OBD_FAIL_OUT_OBJECT_MISS        0x1708
+       do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0x80001708 # arm on every MDS node
+       setfattr -n user.attr1 -v "some text" $DIR/$tdir/1/dir & # both run in the background so they can race
+       setfattr -n user.attr2 -v "another attr"  $DIR/$tdir/2/dir2 &
+
+       sleep 2 # give the two background setfattr calls time to reach the injected race
+       fail mds2 # fail over mds2; recovery then has to process the update log
+       wait_recovery_complete mds2 $((2*TIMEOUT))
+
+       log=$(do_facet mds1 dmesg | tac | sed "/${TESTNAME//_/ }/,$ d") # mds1 dmesg lines newer than the last line naming this test
+       echo $log | grep "get update log failed" &&
+               error "update log corruption is detected" || true
+}
+run_test 427 "Failed DNE2 update request shouldn't corrupt updatelog"
+
 prep_801() {
        [[ $(lustre_version_code mds1) -lt $(version_code 2.9.55) ]] ||
        [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&