From efbe0f63eff8a9a7b192607382f6859e3b0088b8 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 1 Feb 2022 23:50:02 +0300 Subject: [PATCH] LU-15300 mdt: refresh LOVEA with LL granted this change tries to fix three problems: 1) mdt_reint_open() fetches LOVEA before layout lock is taken. this may race with another process changing the layout and may result in a stale layout returned with a granted layout lock - re-fetch LOVEA once layout lock is granted 2) lov_layout_change() should not apply old layouts which can get through when MDS doesn't take layout lock 3) LFSCK shouldn't ignore layout version stored on MDS to avoid a situation when version degrades compared to client's copy. This patch misses an optimization and can result in a number of useless calls to OSD to fetch LOVEA. To be fixed in a followup patch. Signed-off-by: Alex Zhuravlev Change-Id: Idee1101d152ab09947faf6d75574a8761a7690a5 --- lustre/include/obd_support.h | 2 + lustre/lfsck/lfsck_layout.c | 8 ++- lustre/llite/vvp_page.c | 7 ++- lustre/lov/lov_object.c | 17 ++++++ lustre/mdt/mdt_open.c | 62 ++++++++++++++++--- lustre/tests/sanity-flr.sh | 140 ++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 218 insertions(+), 18 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 6343086..3dcf7b4 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -258,6 +258,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_LL_BLOCK 0x172 #define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 #define OBD_FAIL_MDS_CONNECT_VS_EVICT 0x174 +#define OBD_FAIL_MDS_DELAY_OPEN 0x175 /* CMD */ #define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 @@ -624,6 +625,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420 #define OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE 0x1421 #define OBD_FAIL_LLITE_READPAGE_PAUSE 0x1422 +#define OBD_FAIL_LLITE_PANIC_ON_ESTALE 0x1423 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 diff --git 
a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index dacd615..8875f5e 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -1894,7 +1894,13 @@ static int lfsck_layout_new_comp_lovea(const struct lu_env *env, rec->lor_range); lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); } else { - lcm->lcm_layout_gen = cpu_to_le32(1); + /* + * if OST doesn't provide layout version, then try + * to inherit one from MDS's layout, but increment + * it so the client notices and applies modified + * layout + */ + le32_add_cpu(&lcm->lcm_layout_gen, 1); lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); } lcm->lcm_entry_count = cpu_to_le16(1); diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index ccd228e..5cdcda3 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -114,10 +114,13 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, obj->vob_discard_page_warned = 0; } else { SetPageError(vmpage); - if (ioret == -ENOSPC) + if (ioret == -ENOSPC) { set_bit(AS_ENOSPC, &inode->i_mapping->flags); - else + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PANIC_ON_ESTALE)) + LBUG(); set_bit(AS_EIO, &inode->i_mapping->flags); + } if ((ioret == -ESHUTDOWN || ioret == -EINTR || ioret == -EIO) && obj->vob_discard_page_warned == 0) { diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c index 57ce475..f8e7886 100644 --- a/lustre/lov/lov_object.c +++ b/lustre/lov/lov_object.c @@ -1390,6 +1390,23 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, LASSERT(conf->coc_opc == OBJECT_CONF_SET); + /* + * don't apply old layouts which can be brought + * if returned w/o ldlm lock. + * XXX: can we rollback in case of recovery? 
+ */ + if (lsm && lov->lo_lsm) { + u32 oldgen = lov->lo_lsm->lsm_layout_gen & ~LU_LAYOUT_RESYNC; + u32 newgen = lsm->lsm_layout_gen & ~LU_LAYOUT_RESYNC; + + if (newgen < oldgen) { + CDEBUG(D_HA, "skip old for "DFID": %d < %d\n", + PFID(lu_object_fid(lov2lu(lov))), + (int)newgen, (int)oldgen); + GOTO(out, result = 0); + } + } + if ((lsm == NULL && lov->lo_lsm == NULL) || ((lsm != NULL && lov->lo_lsm != NULL) && (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index 8a68273..783ce86 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -1052,6 +1052,35 @@ static bool mdt_hsm_release_allow(const struct md_attr *ma) return true; } +static int mdt_refetch_lovea(struct mdt_thread_info *info, + struct mdt_object *o, struct md_attr *ma, + u64 ibits) +{ + struct mdt_body *repbody; + int rc; + + if ((ibits & MDS_INODELOCK_LAYOUT) == 0) + return 0; + if (!S_ISREG(lu_object_attr(&o->mot_obj))) + return 0; + + if ((ma->ma_valid & MA_LOV) == 0) + return 0; + + ma->ma_valid &= ~MA_LOV; + info->mti_big_lmm_used = 0; + ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD); + ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD, + RCL_SERVER); + rc = __mdt_stripe_get(info, o, ma, XATTR_NAME_LOV); + if (rc) + return rc; + + repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); + repbody->mbo_eadatasize = ma->ma_lmm_size; + return 0; +} + static int mdt_open_by_fid_lock(struct mdt_thread_info *info, struct ldlm_reply *rep, struct mdt_lock_handle *lhc) @@ -1149,13 +1178,21 @@ static int mdt_open_by_fid_lock(struct mdt_thread_info *info, tgt_open_obj_set(info->mti_env, mdt_obj2dt(o)); rc = mdt_finish_open(info, parent, o, open_flags, rep); - if (!rc) { - mdt_set_disposition(info, rep, DISP_LOOKUP_POS); - if (open_flags & MDS_OPEN_LOCK) - mdt_set_disposition(info, rep, DISP_OPEN_LOCK); - if (open_flags & MDS_OPEN_LEASE) - mdt_set_disposition(info, rep, DISP_OPEN_LEASE); - } 
+ if (rc) + GOTO(out_unlock, rc); + + mdt_set_disposition(info, rep, DISP_LOOKUP_POS); + if (open_flags & MDS_OPEN_LOCK) + mdt_set_disposition(info, rep, DISP_OPEN_LOCK); + if (open_flags & MDS_OPEN_LEASE) + mdt_set_disposition(info, rep, DISP_OPEN_LEASE); + + /* + * if layout lock is granted, then we should re-fetch LOVEA + * which was originally taken w/o the lock + */ + rc = mdt_refetch_lovea(info, o, ma, ibits); + GOTO(out_unlock, rc); out_unlock: @@ -1626,6 +1663,7 @@ again_pw: } else { /* get openlock if this isn't replay and client requested it */ if (!req_is_replay(req)) { + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_DELAY_OPEN, cfs_fail_val); rc = mdt_object_open_lock(info, child, lhc, &ibits); object_locked = 1; if (rc != 0) @@ -1664,12 +1702,20 @@ again_pw: PFID(mdt_object_fid(child)), rc); mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE); } + GOTO(out_child_unlock, result); } + /* + * if layout lock is granted, then we should re-fetch LOVEA + * which was originally taken w/o the lock. 
+ */ + result = mdt_refetch_lovea(info, child, ma, ibits); + mdt_counter_incr(req, LPROC_MDT_OPEN, ktime_us_delta(ktime_get(), kstart)); - EXIT; + GOTO(out_child_unlock, result); + out_child_unlock: if (object_locked) mdt_object_open_unlock(info, child, lhc, ibits, result); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 5264cdc..e0c4730 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -3275,9 +3275,6 @@ test_100() { } run_test 100 "flr mode fsx test" -ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) -lock_file=$(mktemp /var/lock/FLR.XXXXXX) - write_file_200() { local tf=$1 @@ -3343,11 +3340,16 @@ resync_file_200() { done } -test_200() { +# this was test_200 before adding "b" and "c" subtests +test_200a() { local tf=$DIR/$tfile local tf2=$DIR2/$tfile local tf3=$DIR3/$tfile + ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) + lock_file=$(mktemp /var/lock/FLR.XXXXXX) + stack_trap "rm -f $ctrl_file $lock_file $tf $tf-2 $tf-3" + $LFS setstripe -E 1M -S 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf $LFS setstripe -E 2M -S 1M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2 $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3 @@ -3401,8 +3403,6 @@ test_200() { umount_client $MOUNT2 umount_client $MOUNT3 - rm -f $lock_file - # resync and verify mirrors $LFS mirror resync $tf || error "final resync failed" get_mirror_ids $tf @@ -3416,7 +3416,133 @@ test_200() { true } -run_test 200 "stress test" +run_test 200a "stress test" + +test_200b() { + local tf=$DIR/$tfile + local tf2=$DIR2/$tfile + local tf3=$DIR3/$tfile + + ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) + lock_file=$(mktemp /var/lock/FLR.XXXXXX) + stack_trap "rm -f $ctrl_file $lock_file $tf $tf-2 $tf-3" + + $LFS setstripe -E 1M -S 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf + $LFS setstripe -E 2M -S 1M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2 + $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3 + + $LFS mirror extend -N -f $tf-2 $tf || + error "merging $tf-2 into $tf failed" + $LFS mirror extend -N -f $tf-3 $tf || + error 
"merging $tf-3 into $tf failed" + + mkdir -p $MOUNT2 && mount_client $MOUNT2 + + mkdir -p $MOUNT3 && mount_client $MOUNT3 + + verify_flr_state $tf3 "ro" + +#define OBD_FAIL_LLITE_PANIC_ON_ESTALE 0x1423 + $LCTL set_param fail_loc=0x1423 + + local -a pids + + write_file_200 $tf & + pids+=($!) + + read_file_200 $tf & + pids+=($!) + + write_file_200 $tf2 & + pids+=($!) + + read_file_200 $tf2 & + pids+=($!) + + resync_file_200 $tf3 & + pids+=($!) + + local sleep_time=60 + [ "$SLOW" = "yes" ] && sleep_time=400 + sleep $sleep_time + rm -f $ctrl_file + + echo "Waiting ${pids[@]}" + wait ${pids[@]} + + umount_client $MOUNT2 + umount_client $MOUNT3 + + # resync and verify mirrors + $LFS mirror resync $tf || { + ps ax + error "final resync failed" + } + get_mirror_ids $tf + + local csum=$($LFS mirror read -N ${mirror_array[0]} $tf | md5sum) + for id in ${mirror_array[@]:1}; do + [ "$($LFS mirror read -N $id $tf | md5sum)" = "$csum" ] || + error "checksum error for mirror $id" + done + + true +} +run_test 200b "racing IO, mirror extend and resync" + +test_200c() { + local tf=$DIR/$tfile + local tf2=$DIR2/$tfile + + mkdir -p $MOUNT2 && mount_client $MOUNT2 + stack_trap "umount_client $MOUNT2" + stack_trap "rm -f $tf" + + $LFS df + + dd if=/dev/urandom of=$tf bs=1M count=2 || error "can't write" + local mdt_idx + mdt_idx=$($LFS getstripe -m $tf) + + cancel_lru_locks mdc + cancel_lru_locks osc + + # start a process modifying file, block it just + # before layout lock acquisition +#define OBD_FAIL_MDS_DELAY_OPEN 0x175 + do_facet mds$((mdt_idx+1)) $LCTL set_param fail_loc=0x80000175 fail_val=10 + #log "dd to stale replica" + dd if=/dev/urandom of=$tf bs=1M count=2 oflag=direct conv=notrunc & + local PID=$! 
+ sleep 0.5 + + # make a replica + log "mirror extend" + $LFS mirror extend -N -c -1 $tf2 || { + ps ax + error "can't mirror" + } + log "mirror extend done" + do_facet mds$((mdt_idx+1)) $LCTL set_param fail_loc=0 fail_val=0 + + # wait for blocking dd to complete and modify file + wait $PID || error "2nd dd failed" + log "dd completed" + + verify_mirror_count $tf 2 + + $LFS getstripe $tf | grep -q lcme_flags.*stale || { + $LFS getstripe $tf + $LFS getstripe $tf2 + error "both replicas are still in sync" + } + + $LFS mirror verify -vvv $tf || { + $LFS getstripe $tf + error "corrupted in-sync file" + } +} +run_test 200c "layout change racing with open: LOVEA changes" cleanup_test_201() { do_facet $SINGLEMDS $LCTL --device $MDT0 changelog_deregister $CL_USER -- 1.8.3.1