Whamcloud - gitweb
LU-15300 mdt: refresh LOVEA with LL granted 13/46413/70
authorAlex Zhuravlev <bzzz@whamcloud.com>
Tue, 1 Feb 2022 20:50:02 +0000 (23:50 +0300)
committerOleg Drokin <green@whamcloud.com>
Sat, 22 Apr 2023 17:27:37 +0000 (17:27 +0000)
This change tries to fix three problems:
1) mdt_reint_open() fetches LOVEA before layout lock is taken.
   this may race with another process changing the layout and
   may result in a stale layout returned with a granted layout
   lock - re-fetch LOVEA once layout lock is granted

2) lov_layout_change() should not apply old layouts which
   can get through when MDS doesn't take layout lock

3) LFSCK shouldn't ignore layout version stored on MDS to avoid
   a situation when version degrades compared to client's copy.

This patch misses an optimization and can result in a number of
useless calls to OSD to fetch LOVEA. To be fixed in a followup
patch.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: Idee1101d152ab09947faf6d75574a8761a7690a5
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/46413
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Zhenyu Xu <bobijam@hotmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/lfsck/lfsck_layout.c
lustre/llite/vvp_page.c
lustre/lov/lov_object.c
lustre/mdt/mdt_open.c
lustre/tests/sanity-flr.sh

index 49bd706..0cdde3b 100644 (file)
@@ -258,6 +258,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_LL_BLOCK           0x172
 #define OBD_FAIL_MDS_LOD_CREATE_PAUSE   0x173
 #define OBD_FAIL_MDS_CONNECT_VS_EVICT   0x174
+#define OBD_FAIL_MDS_DELAY_OPEN                 0x175
 
 /* CMD */
 #define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
@@ -623,6 +624,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LLITE_XATTR_PAUSE                 0x1420
 #define OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE       0x1421
 #define OBD_FAIL_LLITE_READPAGE_PAUSE              0x1422
+#define OBD_FAIL_LLITE_PANIC_ON_ESTALE             0x1423
 
 #define OBD_FAIL_FID_INDIR     0x1501
 #define OBD_FAIL_FID_INLMA     0x1502
index dacd615..8875f5e 100644 (file)
@@ -1894,7 +1894,13 @@ static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
                                                  rec->lor_range);
                lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
        } else {
-               lcm->lcm_layout_gen = cpu_to_le32(1);
+               /*
+                * if OST doesn't provide layout version, then try
+                * to inherit one from MDS's layout, but increment
+                * it so the client notices and applies modified
+                * layout
+                */
+               le32_add_cpu(&lcm->lcm_layout_gen, 1);
                lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
        }
        lcm->lcm_entry_count = cpu_to_le16(1);
index ccd228e..5cdcda3 100644 (file)
@@ -114,10 +114,13 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage,
                obj->vob_discard_page_warned = 0;
        } else {
                SetPageError(vmpage);
-               if (ioret == -ENOSPC)
+               if (ioret == -ENOSPC) {
                        set_bit(AS_ENOSPC, &inode->i_mapping->flags);
-               else
+               } else {
+                       if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PANIC_ON_ESTALE))
+                               LBUG();
                        set_bit(AS_EIO, &inode->i_mapping->flags);
+               }
 
                if ((ioret == -ESHUTDOWN || ioret == -EINTR ||
                     ioret == -EIO) && obj->vob_discard_page_warned == 0) {
index 57ce475..f8e7886 100644 (file)
@@ -1390,6 +1390,23 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
 
        LASSERT(conf->coc_opc == OBJECT_CONF_SET);
 
+       /*
+        * don't apply old layouts which can be brought
+        * if returned w/o ldlm lock.
+        * Both generations are compared with the LU_LAYOUT_RESYNC bit
+        * masked out of local copies only: the cached lo_lsm must not be
+        * modified here, later checks still read its lsm_layout_gen.
+        * XXX: can we rollback in case of recovery?
+        */
+       if (lsm && lov->lo_lsm) {
+               u32 oldgen = lov->lo_lsm->lsm_layout_gen & ~LU_LAYOUT_RESYNC;
+               u32 newgen = lsm->lsm_layout_gen & ~LU_LAYOUT_RESYNC;
+
+               if (newgen < oldgen) {
+                       /* incoming layout is older than the cached one */
+                       CDEBUG(D_HA, "skip old for "DFID": %d < %d\n",
+                              PFID(lu_object_fid(lov2lu(lov))),
+                              (int)newgen, (int)oldgen);
+                       GOTO(out, result = 0);
+               }
+       }
+
        if ((lsm == NULL && lov->lo_lsm == NULL) ||
            ((lsm != NULL && lov->lo_lsm != NULL) &&
             (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) &&
index ee1cc2a..ff8a078 100644 (file)
@@ -1048,6 +1048,35 @@ static bool mdt_hsm_release_allow(const struct md_attr *ma)
        return true;
 }
 
+/*
+ * Read the LOV EA of \a o again into the reply buffer.
+ *
+ * Meant for the open paths once a layout lock may have been granted, since
+ * the LOV EA was originally fetched without that lock.
+ *
+ * Nothing is done (and 0 is returned) unless \a ibits includes
+ * MDS_INODELOCK_LAYOUT, \a o is a regular file and \a ma already holds a
+ * LOV EA (MA_LOV set in ma_valid).  Otherwise MA_LOV is cleared, ma_lmm and
+ * ma_lmm_size are pointed back at the RMF_MDT_MD reply field, the EA is
+ * fetched with __mdt_stripe_get() and the resulting ma_lmm_size is stored
+ * in the reply body's mbo_eadatasize.
+ *
+ * \retval 0 on success or when no refetch is needed
+ * \retval the non-zero value returned by __mdt_stripe_get() otherwise
+ */
+static int mdt_refetch_lovea(struct mdt_thread_info *info,
+                            struct mdt_object *o, struct md_attr *ma,
+                            u64 ibits)
+{
+       struct mdt_body *body;
+       int err;
+
+       /* checks are evaluated left to right, first failing one wins */
+       if (!(ibits & MDS_INODELOCK_LAYOUT) ||
+           !S_ISREG(lu_object_attr(&o->mot_obj)) ||
+           !(ma->ma_valid & MA_LOV))
+               return 0;
+
+       /* drop the copy taken w/o the lock and reuse the reply buffer */
+       ma->ma_valid &= ~MA_LOV;
+       info->mti_big_lmm_used = 0;
+       ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD);
+       ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD,
+                                              RCL_SERVER);
+
+       err = __mdt_stripe_get(info, o, ma, XATTR_NAME_LOV);
+       if (err != 0)
+               return err;
+
+       /* report the size of the refreshed EA to the client */
+       body = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       body->mbo_eadatasize = ma->ma_lmm_size;
+
+       return err;
+}
+
 static int mdt_open_by_fid_lock(struct mdt_thread_info *info,
                                struct ldlm_reply *rep,
                                struct mdt_lock_handle *lhc)
@@ -1145,13 +1174,21 @@ static int mdt_open_by_fid_lock(struct mdt_thread_info *info,
 
        tgt_open_obj_set(info->mti_env, mdt_obj2dt(o));
        rc = mdt_finish_open(info, parent, o, open_flags, rep);
-       if (!rc) {
-               mdt_set_disposition(info, rep, DISP_LOOKUP_POS);
-               if (open_flags & MDS_OPEN_LOCK)
-                       mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
-               if (open_flags & MDS_OPEN_LEASE)
-                       mdt_set_disposition(info, rep, DISP_OPEN_LEASE);
-       }
+       if (rc)
+               GOTO(out_unlock, rc);
+
+       mdt_set_disposition(info, rep, DISP_LOOKUP_POS);
+       if (open_flags & MDS_OPEN_LOCK)
+               mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
+       if (open_flags & MDS_OPEN_LEASE)
+               mdt_set_disposition(info, rep, DISP_OPEN_LEASE);
+
+       /*
+        * if layout lock is granted, then we should re-fetch LOVEA
+        * which was originally taken w/o the lock
+        */
+       rc = mdt_refetch_lovea(info, o, ma, ibits);
+
        GOTO(out_unlock, rc);
 
 out_unlock:
@@ -1619,6 +1656,7 @@ again_pw:
        } else {
                /* get openlock if this isn't replay and client requested it */
                if (!req_is_replay(req)) {
+                       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_DELAY_OPEN, cfs_fail_val);
                        rc = mdt_object_open_lock(info, child, lhc, &ibits);
                        object_locked = 1;
                        if (rc != 0)
@@ -1657,12 +1695,20 @@ again_pw:
                                       PFID(mdt_object_fid(child)), rc);
                        mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
                }
+               GOTO(out_child_unlock, result);
        }
 
+       /*
+        * if layout lock is granted, then we should re-fetch LOVEA
+        * which was originally taken w/o the lock.
+        */
+       result = mdt_refetch_lovea(info, child, ma, ibits);
+
        mdt_counter_incr(req, LPROC_MDT_OPEN,
                         ktime_us_delta(ktime_get(), kstart));
 
-       EXIT;
+       GOTO(out_child_unlock, result);
+
 out_child_unlock:
        if (object_locked)
                mdt_object_open_unlock(info, child, lhc, ibits, result);
index 5264cdc..e0c4730 100644 (file)
@@ -3275,9 +3275,6 @@ test_100() {
 }
 run_test 100 "flr mode fsx test"
 
-ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
-lock_file=$(mktemp /var/lock/FLR.XXXXXX)
-
 write_file_200() {
        local tf=$1
 
@@ -3343,11 +3340,16 @@ resync_file_200() {
        done
 }
 
-test_200() {
+# this was test_200 before adding "b" and "c" subtests
+test_200a() {
        local tf=$DIR/$tfile
        local tf2=$DIR2/$tfile
        local tf3=$DIR3/$tfile
 
+       ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
+       lock_file=$(mktemp /var/lock/FLR.XXXXXX)
+       stack_trap "rm -f $ctrl_file $lock_file $tf $tf-2 $tf-3"
+
        $LFS setstripe -E 1M -S 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf
        $LFS setstripe -E 2M -S 1M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2
        $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3
@@ -3401,8 +3403,6 @@ test_200() {
        umount_client $MOUNT2
        umount_client $MOUNT3
 
-       rm -f $lock_file
-
        # resync and verify mirrors
        $LFS mirror resync $tf || error "final resync failed"
        get_mirror_ids $tf
@@ -3416,7 +3416,133 @@ test_200() {
 
        true
 }
-run_test 200 "stress test"
+run_test 200a "stress test"
+
+test_200b() {
+       local tf=$DIR/$tfile
+       local tf2=$DIR2/$tfile
+       local tf3=$DIR3/$tfile
+
+       ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
+       lock_file=$(mktemp /var/lock/FLR.XXXXXX)
+       stack_trap "rm -f $ctrl_file $lock_file $tf $tf-2 $tf-3"
+
+       $LFS setstripe -E 1M -S 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf
+       $LFS setstripe -E 2M -S 1M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2
+       $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3
+
+       $LFS mirror extend -N -f $tf-2 $tf ||
+               error "merging $tf-2 into $tf failed"
+       $LFS mirror extend -N -f $tf-3 $tf ||
+               error "merging $tf-3 into $tf failed"
+
+       mkdir -p $MOUNT2 && mount_client $MOUNT2
+
+       mkdir -p $MOUNT3 && mount_client $MOUNT3
+
+       verify_flr_state $tf3 "ro"
+
+#define OBD_FAIL_LLITE_PANIC_ON_ESTALE             0x1423
+       $LCTL set_param fail_loc=0x1423
+
+       local -a pids
+
+       write_file_200 $tf &
+       pids+=($!)
+
+       read_file_200 $tf &
+       pids+=($!)
+
+       write_file_200 $tf2 &
+       pids+=($!)
+
+       read_file_200 $tf2 &
+       pids+=($!)
+
+       resync_file_200 $tf3 &
+       pids+=($!)
+
+       local sleep_time=60
+       [ "$SLOW" = "yes" ] && sleep_time=400
+       sleep $sleep_time
+       rm -f $ctrl_file
+
+       echo "Waiting ${pids[@]}"
+       wait ${pids[@]}
+
+       umount_client $MOUNT2
+       umount_client $MOUNT3
+
+       # resync and verify mirrors
+       $LFS mirror resync $tf || {
+               ps ax
+               error "final resync failed"
+       }
+       get_mirror_ids $tf
+
+       local csum=$($LFS mirror read -N ${mirror_array[0]} $tf | md5sum)
+       for id in ${mirror_array[@]:1}; do
+               [ "$($LFS mirror read -N $id $tf | md5sum)" = "$csum" ] ||
+                       error "checksum error for mirror $id"
+       done
+
+       true
+}
+run_test 200b "racing IO, mirror extend and resync"
+
+test_200c() {
+       local tf=$DIR/$tfile
+       local tf2=$DIR2/$tfile
+
+       mkdir -p $MOUNT2 && mount_client $MOUNT2
+       stack_trap "umount_client $MOUNT2"
+       stack_trap "rm -f $tf"
+
+       $LFS df
+
+       dd if=/dev/urandom of=$tf bs=1M count=2 || error "can't write"
+       local mdt_idx
+       mdt_idx=$($LFS getstripe -m $tf)
+
+       cancel_lru_locks mdc
+       cancel_lru_locks osc
+
+       # start a process modifying file, block it just
+       # before layout lock acquisition
+#define OBD_FAIL_MDS_DELAY_OPEN                 0x175
+       do_facet mds$((mdt_idx+1)) $LCTL set_param fail_loc=0x80000175 fail_val=10
+       #log "dd to stale replica"
+       dd if=/dev/urandom of=$tf bs=1M count=2 oflag=direct conv=notrunc &
+       local PID=$!
+       sleep 0.5
+
+       # make a replica
+       log "mirror extend"
+       $LFS mirror extend -N -c -1 $tf2 || {
+               ps ax
+               error "can't mirror"
+       }
+       log "mirror extend done"
+       do_facet mds$((mdt_idx+1)) $LCTL set_param fail_loc=0 fail_val=0
+
+       # wait for blocking dd to complete and modify file
+       wait $PID || error "2nd dd failed"
+       log "dd completed"
+
+       verify_mirror_count $tf 2
+
+       $LFS getstripe $tf | grep -q lcme_flags.*stale || {
+               $LFS getstripe $tf
+               $LFS getstripe $tf2
+               error "both replicas are still in sync"
+       }
+
+       $LFS mirror verify -vvv $tf || {
+               $LFS getstripe $tf
+               error "corrupted in-sync file"
+       }
+}
+run_test 200c "layout change racing with open: LOVEA changes"
 
 cleanup_test_201() {
        do_facet $SINGLEMDS $LCTL --device $MDT0 changelog_deregister $CL_USER