From: Alex Zhuravlev Date: Tue, 12 Nov 2024 19:23:08 +0000 (-0800) Subject: LU-15300 mdt: refresh LOVEA with LL granted X-Git-Tag: 2.15.7-RC1~39 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F64%2F55464%2F8;p=fs%2Flustre-release.git LU-15300 mdt: refresh LOVEA with LL granted this change tries to fix three problems: 1) mdt_reint_open() fetches LOVEA before layout lock is taken. this may race with another process changing the layout and may result in a stale layout returned with a granted layout lock - re-fetch LOVEA once layout lock is granted 2) lov_layout_change() should not apply old layouts which can get through when MDS doesn't take layout lock 3) LFSCK shouldn't ignore layout version stored on MDS to avoid a situation when version degrades compared to client's copy. This patch misses an optimization and can result in a number of useless calls to OSD to fetch LOVEA. To be fixed in a followup patch. Lustre-change: https://review.whamcloud.com/46413 Lustre-commit: 13557aa86904376e48a5e43256d5c1ab32c1c2d6 LU-14869 test: improve sanity-flr/200a Make sure "flock -x" successfully returned before running mirror resync so that it won't get into running read holding shared flock. 
Lustre-change: https://review.whamcloud.com/54345 Lustre-commit: 2bf51212680b3d4117925965c368d53587bf37d4 Signed-off-by: Alex Zhuravlev Signed-off-by: Bobi Jam Change-Id: Idee1101d152ab09947faf6d75574a8761a7690a5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55464 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Zhenyu Xu Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 9327c80..13da93c 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -255,6 +255,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_NO_LL_OPEN 0x171 #define OBD_FAIL_MDS_LL_BLOCK 0x172 #define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 +#define OBD_FAIL_MDS_DELAY_OPEN 0x175 /* CMD */ #define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 @@ -611,6 +612,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_PAGE_ALLOC 0x1418 #define OBD_FAIL_LLITE_OPEN_DELAY 0x1419 #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420 +#define OBD_FAIL_LLITE_PANIC_ON_ESTALE 0x1423 #define OBD_FAIL_LLITE_READPAGE_PAUSE2 0x1424 #define OBD_FAIL_FID_INDIR 0x1501 diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index fe956ea..4807025 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -1895,7 +1895,13 @@ static int lfsck_layout_new_comp_lovea(const struct lu_env *env, rec->lor_range); lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); } else { - lcm->lcm_layout_gen = cpu_to_le32(1); + /* + * if OST doesn't provide layout version, then try + * to inherit one from MDS's layout, but increment + * it so the client notices and applies modified + * layout + */ + le32_add_cpu(&lcm->lcm_layout_gen, 1); lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); } lcm->lcm_entry_count = cpu_to_le16(1); diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index 3b4a934..06033d8 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -252,6 +252,10 @@ static void vvp_vmpage_error(struct inode *inode, struct 
page *vmpage, obj->vob_discard_page_warned = 0; } else { SetPageError(vmpage); + if (ioret != -ENOSPC && + OBD_FAIL_CHECK(OBD_FAIL_LLITE_PANIC_ON_ESTALE)) + LBUG(); + mapping_set_error(inode->i_mapping, ioret); if ((ioret == -ESHUTDOWN || ioret == -EINTR || diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c index a70e8a0..88107f8 100644 --- a/lustre/lov/lov_object.c +++ b/lustre/lov/lov_object.c @@ -1407,6 +1407,23 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, LASSERT(conf->coc_opc == OBJECT_CONF_SET); + /* + * don't apply old layouts which can be brought + * if returned w/o ldlm lock. + * XXX: can we rollback in case of recovery? + */ + if (lsm && lov->lo_lsm) { + u32 oldgen = lov->lo_lsm->lsm_layout_gen & ~LU_LAYOUT_RESYNC; + u32 newgen = lsm->lsm_layout_gen & ~LU_LAYOUT_RESYNC; + + if (newgen < oldgen) { + CDEBUG(D_HA, "skip old for "DFID": %d < %d\n", + PFID(lu_object_fid(lov2lu(lov))), + (int)newgen, (int)oldgen); + GOTO(out, result = 0); + } + } + if ((lsm == NULL && lov->lo_lsm == NULL) || ((lsm != NULL && lov->lo_lsm != NULL) && (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index 165caaf..6046b1d 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -1034,6 +1034,35 @@ static bool mdt_hsm_release_allow(const struct md_attr *ma) return true; } +static int mdt_refetch_lovea(struct mdt_thread_info *info, + struct mdt_object *o, struct md_attr *ma, + u64 ibits) +{ + struct mdt_body *repbody; + int rc; + + if ((ibits & MDS_INODELOCK_LAYOUT) == 0) + return 0; + if (!S_ISREG(lu_object_attr(&o->mot_obj))) + return 0; + + if ((ma->ma_valid & MA_LOV) == 0) + return 0; + + ma->ma_valid &= ~MA_LOV; + info->mti_big_lmm_used = 0; + ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD); + ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD, + RCL_SERVER); + rc = __mdt_stripe_get(info, o, ma, XATTR_NAME_LOV); + if (rc) + 
return rc; + + repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); + repbody->mbo_eadatasize = ma->ma_lmm_size; + return 0; +} + static int mdt_open_by_fid_lock(struct mdt_thread_info *info, struct ldlm_reply *rep, struct mdt_lock_handle *lhc) @@ -1131,13 +1160,21 @@ static int mdt_open_by_fid_lock(struct mdt_thread_info *info, tgt_open_obj_set(info->mti_env, mdt_obj2dt(o)); rc = mdt_finish_open(info, parent, o, open_flags, rep); - if (!rc) { - mdt_set_disposition(info, rep, DISP_LOOKUP_POS); - if (open_flags & MDS_OPEN_LOCK) - mdt_set_disposition(info, rep, DISP_OPEN_LOCK); - if (open_flags & MDS_OPEN_LEASE) - mdt_set_disposition(info, rep, DISP_OPEN_LEASE); - } + if (rc) + GOTO(out_unlock, rc); + + mdt_set_disposition(info, rep, DISP_LOOKUP_POS); + if (open_flags & MDS_OPEN_LOCK) + mdt_set_disposition(info, rep, DISP_OPEN_LOCK); + if (open_flags & MDS_OPEN_LEASE) + mdt_set_disposition(info, rep, DISP_OPEN_LEASE); + + /* + * if layout lock is granted, then we should re-fetch LOVEA + * which was originally taken w/o the lock + */ + rc = mdt_refetch_lovea(info, o, ma, ibits); + GOTO(out_unlock, rc); out_unlock: @@ -1595,6 +1632,7 @@ again_pw: } else { /* get openlock if this isn't replay and client requested it */ if (!req_is_replay(req)) { + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_DELAY_OPEN, cfs_fail_val); rc = mdt_object_open_lock(info, child, lhc, &ibits); object_locked = 1; if (rc != 0) @@ -1633,12 +1671,20 @@ again_pw: PFID(mdt_object_fid(child)), rc); mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE); } + GOTO(out_child_unlock, result); } + /* + * if layout lock is granted, then we should re-fetch LOVEA + * which was originally taken w/o the lock. 
+ */ + result = mdt_refetch_lovea(info, child, ma, ibits); + mdt_counter_incr(req, LPROC_MDT_OPEN, ktime_us_delta(ktime_get(), kstart)); - EXIT; + GOTO(out_child_unlock, result); + out_child_unlock: if (object_locked) mdt_object_open_unlock(info, child, lhc, ibits, result); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index d8dabf7..501e1d6 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -3060,9 +3060,6 @@ test_70() { } run_test 70 "mirror create and split race" -ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) -lock_file=$(mktemp /var/lock/FLR.XXXXXX) - write_file_200() { local tf=$1 @@ -3117,7 +3114,7 @@ resync_file_200() { echo -n "resync file $tf with '$cmd' .." if [[ $lock_taken = "true" ]]; then - flock -x 200 + flock -x 200 && $cmd $tf &> /dev/null && echo "done" || echo "failed" flock -u 200 else @@ -3128,11 +3125,16 @@ resync_file_200() { done } -test_200() { +# this was test_200 before adding "b" and "c" subtests +test_200a() { local tf=$DIR/$tfile local tf2=$DIR2/$tfile local tf3=$DIR3/$tfile + ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) + lock_file=$(mktemp /var/lock/FLR.XXXXXX) + stack_trap "rm -f $ctrl_file $lock_file $tf $tf-2 $tf-3" + $LFS setstripe -E 1M -S 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf $LFS setstripe -E 2M -S 1M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2 $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3 @@ -3186,8 +3188,6 @@ test_200() { umount_client $MOUNT2 umount_client $MOUNT3 - rm -f $lock_file - # resync and verify mirrors $LFS mirror resync $tf || error "final resync failed" get_mirror_ids $tf @@ -3201,7 +3201,133 @@ test_200() { true } -run_test 200 "stress test" +run_test 200a "stress test" + +test_200b() { + local tf=$DIR/$tfile + local tf2=$DIR2/$tfile + local tf3=$DIR3/$tfile + + ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) + lock_file=$(mktemp /var/lock/FLR.XXXXXX) + stack_trap "rm -f $ctrl_file $lock_file $tf $tf-2 $tf-3" + + $LFS setstripe -E 1M -S 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf + $LFS setstripe 
-E 2M -S 1M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2 + $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3 + + $LFS mirror extend -N -f $tf-2 $tf || + error "merging $tf-2 into $tf failed" + $LFS mirror extend -N -f $tf-3 $tf || + error "merging $tf-3 into $tf failed" + + mkdir -p $MOUNT2 && mount_client $MOUNT2 + + mkdir -p $MOUNT3 && mount_client $MOUNT3 + + verify_flr_state $tf3 "ro" + +#define OBD_FAIL_LLITE_PANIC_ON_ESTALE 0x1423 + $LCTL set_param fail_loc=0x1423 + + local -a pids + + write_file_200 $tf & + pids+=($!) + + read_file_200 $tf & + pids+=($!) + + write_file_200 $tf2 & + pids+=($!) + + read_file_200 $tf2 & + pids+=($!) + + resync_file_200 $tf3 & + pids+=($!) + + local sleep_time=60 + [ "$SLOW" = "yes" ] && sleep_time=400 + sleep $sleep_time + rm -f $ctrl_file + + echo "Waiting ${pids[@]}" + wait ${pids[@]} + + umount_client $MOUNT2 + umount_client $MOUNT3 + + # resync and verify mirrors + $LFS mirror resync $tf || { + ps ax + error "final resync failed" + } + get_mirror_ids $tf + + local csum=$($LFS mirror read -N ${mirror_array[0]} $tf | md5sum) + for id in ${mirror_array[@]:1}; do + [ "$($LFS mirror read -N $id $tf | md5sum)" = "$csum" ] || + error "checksum error for mirror $id" + done + + true +} +run_test 200b "racing IO, mirror extend and resync" + +test_200c() { + local tf=$DIR/$tfile + local tf2=$DIR2/$tfile + + mkdir -p $MOUNT2 && mount_client $MOUNT2 + stack_trap "umount_client $MOUNT2" + stack_trap "rm -f $tf" + + $LFS df + + dd if=/dev/urandom of=$tf bs=1M count=2 || error "can't write" + local mdt_idx + mdt_idx=$($LFS getstripe -m $tf) + + cancel_lru_locks mdc + cancel_lru_locks osc + + # start a process modifying file, block it just + # before layout lock acquisition +#define OBD_FAIL_MDS_DELAY_OPEN 0x175 + do_facet mds$((mdt_idx+1)) $LCTL set_param fail_loc=0x80000175 fail_val=10 + #log "dd to stale replica" + dd if=/dev/urandom of=$tf bs=1M count=2 oflag=direct conv=notrunc & + local PID=$! 
+ sleep 0.5 + + # make a replica + log "mirror extend" + $LFS mirror extend -N -c -1 $tf2 || { + ps ax + error "can't mirror" + } + log "mirror extend done" + do_facet mds$((mdt_idx+1)) $LCTL set_param fail_loc=0 fail_val=0 + + # wait for blocking dd to complete and modify file + wait $PID || error "2nd dd failed" + log "dd completed" + + verify_mirror_count $tf 2 + + $LFS getstripe $tf | grep -q "lcme_flags.*stale" || { + $LFS getstripe $tf + $LFS getstripe $tf2 + error "both replicas are still in sync" + } + + $LFS mirror verify -vvv $tf || { + $LFS getstripe $tf + error "corrupted in-sync file" + } +} +run_test 200c "layout change racing with open: LOVEA changes" cleanup_test_201() { trap 0