From 2ce0d5b0640e3e440822080e407eee1ce1cafd75 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Fri, 14 Oct 2016 20:48:50 +0300 Subject: [PATCH] LU-8367 osp: do not block orphan cleanup Do not block orphan cleanup until all reserved objects are consumed; otherwise we risk a livelock in which one blocked thread holding a reservation blocks recovery. Instead, ask the OST to recreate any possibly missing objects in our precreate window. Change-Id: I066b0783cce54d7ecd25a08da5c76f211b7244f5 Signed-off-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/23168 Reviewed-by: Sergey Cheremencev Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Niu Yawei Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/osp/osp_internal.h | 1 + lustre/osp/osp_precreate.c | 53 ++++++++++++++++++++++++++++-------------- lustre/tests/recovery-small.sh | 47 +++++++++++++++++++++++++++++++++++++ lustre/tests/sanity-lfsck.sh | 8 +++++++ 5 files changed, 93 insertions(+), 17 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index d9d01de..e477312 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -249,6 +249,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_XATTR_REP 0x161 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 #define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 +#define OBD_FAIL_MDS_OSP_PRECREATE_WAIT 0x164 /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index d059f2d..f61b2f5 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -88,6 +88,7 @@ struct osp_precreate { int osp_pre_create_slow; /* cleaning up orphans or recreating missing objects */ int osp_pre_recovering; + int osp_pre_delorphan_sent; }; struct osp_update_request_sub { diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 3d84f6d..6910860 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -573,6 
+573,7 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d) RETURN(rc); } + LASSERT(d->opd_pre->osp_pre_delorphan_sent != 0); spin_lock(&d->opd_pre_lock); if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2) d->opd_pre_create_count = d->opd_pre_max_create_count / 2; @@ -778,18 +779,20 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, int update_status = 0; int rc; int diff; + struct lu_fid fid; ENTRY; /* - * wait for local recovery to finish, so we can cleanup orphans - * orphans are all objects since "last used" (assigned), but - * there might be objects reserved and in some cases they won't - * be used. we can't cleanup them till we're sure they won't be - * used. also can't we allow new reservations because they may - * end up getting orphans being cleaned up below. so we block - * new reservations and wait till all reserved objects either - * user or released. + * wait for local recovery to finish, so we can cleanup orphans. + * orphans are all objects since "last used" (assigned). + * consider reserved objects as created otherwise we can get into + * a livelock when one blocked thread holding a reservation can + * block recovery. see LU-8367 for the details. in some cases this + * can result in gaps (i.e. leaked objects), but we've got LFSCK... + * + * do not allow new reservations because they may end up getting + * orphans being cleaned up below. so we block new reservations. */ spin_lock(&d->opd_pre_lock); d->opd_pre_recovering = 1; @@ -799,16 +802,12 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, * catch all osp_precreate_reserve() calls who find * "!opd_pre_recovering". 
*/ - l_wait_event(d->opd_pre_waitq, - (!d->opd_pre_reserved && d->opd_recovery_completed) || + l_wait_event(d->opd_pre_waitq, d->opd_recovery_completed || !osp_precreate_running(d) || d->opd_got_disconnected, &lwi); if (!osp_precreate_running(d) || d->opd_got_disconnected) GOTO(out, rc = -EAGAIN); - CDEBUG(D_HA, "%s: going to cleanup orphans since "DFID"\n", - d->opd_obd->obd_name, PFID(&d->opd_last_used_fid)); - *last_fid = d->opd_last_used_fid; /* The OSP should already get the valid seq now */ LASSERT(!fid_is_zero(last_fid)); @@ -840,7 +839,19 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, body->oa.o_flags = OBD_FL_DELORPHAN; body->oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP; - fid_to_ostid(&d->opd_last_used_fid, &body->oa.o_oi); + /* unless this is the very first DELORPHAN (when we really + * can destroy some orphans), just tell OST to recreate + * missing objects in our precreate pool */ + spin_lock(&d->opd_pre_lock); + if (d->opd_pre->osp_pre_delorphan_sent) + fid = d->opd_pre_last_created_fid; + else + fid = d->opd_last_used_fid; + spin_unlock(&d->opd_pre_lock); + fid_to_ostid(&fid, &body->oa.o_oi); + + CDEBUG(D_HA, "%s: going to cleanup orphans since "DFID"\n", + d->opd_obd->obd_name, PFID(&fid)); ptlrpc_request_set_replen(req); @@ -863,10 +874,10 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, ostid_to_fid(last_fid, &body->oa.o_oi, d->opd_index); spin_lock(&d->opd_pre_lock); - diff = osp_fid_diff(&d->opd_last_used_fid, last_fid); + diff = osp_fid_diff(&fid, last_fid); if (diff > 0) { d->opd_pre_create_count = OST_MIN_PRECREATE + diff; - d->opd_pre_last_created_fid = d->opd_last_used_fid; + d->opd_pre_last_created_fid = *last_fid; } else { d->opd_pre_create_count = OST_MIN_PRECREATE; d->opd_pre_last_created_fid = *last_fid; @@ -877,9 +888,11 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, */ LASSERT(fid_oid(&d->opd_pre_last_created_fid) <= LUSTRE_DATA_SEQ_MAX_WIDTH); - d->opd_pre_used_fid = 
d->opd_pre_last_created_fid; + if (d->opd_pre->osp_pre_delorphan_sent == 0) + d->opd_pre_used_fid = d->opd_pre_last_created_fid; d->opd_pre_create_slow = 0; spin_unlock(&d->opd_pre_lock); + d->opd_pre->osp_pre_delorphan_sent = 1; CDEBUG(D_HA, "%s: Got last_id "DFID" from OST, last_created "DFID "last_used is "DFID"\n", d->opd_obd->obd_name, PFID(last_fid), @@ -1355,6 +1368,12 @@ int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d) if (d->opd_pre_max_create_count == 0) RETURN(-ENOBUFS); + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_OSP_PRECREATE_WAIT)) { + if (d->opd_index == cfs_fail_val) + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_OSP_PRECREATE_WAIT, + obd_timeout); + } + /* * wait till: * - preallocation is done diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index fb07a4a..1c4759c 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2587,6 +2587,53 @@ test_133() { } run_test 133 "don't fail on flock resend" +test_134() { + local file1 + local pid1 + local pid2 + local i + + [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return 0 + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.8.59) ]] && + skip "Need MDS version at least 2.8.59" && return + + test_mkdir -p $DIR/$tdir + file1="$DIR/$tdir/file1" + file2="$DIR/$tdir/file2" + +#define OBD_FAIL_MDS_OSP_PRECREATE_WAIT 0x164 + # reserve stripe on ost1, block on ost2 + do_facet $SINGLEMDS \ + "lctl set_param fail_loc=0x80000164 fail_val=1" + $SETSTRIPE -c 2 -o 0,1 $file1 & + pid1=$! + sleep 1 + + # initiate recovery with orphan cleanup on ost1 + facet_failover ost1 + + # when OST1 recovery is over, the first setstripe should still + # have the object reserved, but that should not block new creates + # on OST1 + $SETSTRIPE -c 1 -o 0 $file2 & + pid2=$! + for ((i=0;i<$((TIMEOUT/2));i++)); do + if ! stat /proc/$pid2 >&/dev/null; then + echo "DONE!" + break + fi + echo "WAITING ..." 
+ sleep 1 + done + if let "i >= (TIMEOUT/2)"; then + error "create seem to get blocked by recovery" + fi + wait $pid1 + wait $pid2 + return 0 +} +run_test 134 "MDT<>OST recovery don't block multistripe file creation" + complete $SECONDS check_and_cleanup_lustre exit_status diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index 1396a6e..0d3f136 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -1335,6 +1335,14 @@ test_11b() { umount_client $MOUNT stop ost1 || error "(1) Fail to stop ost1" + # stop MDS to forget last precreated object + echo "stop $SINGLEMDS" + stop $SINGLEMDS > /dev/null || error "(11) Fail to stop MDS!" + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0 + echo "start $SINGLEMDS" + start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null || + error "(12) Fail to start MDS!" + #define OBD_FAIL_OST_ENOSPC 0x215 do_facet ost1 $LCTL set_param fail_loc=0x215 -- 1.8.3.1