From b525fe28b234ead0eddd2b8e68060e6528e3fba5 Mon Sep 17 00:00:00 2001 From: jxiong Date: Tue, 10 Feb 2009 02:05:08 +0000 Subject: [PATCH] b=17025 r=adilger,nikita If a client creates and opens some files and the MDS then crashes, there is a chance that the same objids on the OSTs will be used for multiple files. The root cause of this issue is that we don't have an appropriate mechanism to handle the case where replaying an open request fails. This is a hot fix for this problem. By skipping the orphan objects, the client will no longer see multiple files using the same objid. --- lustre/include/lustre/lustre_idl.h | 5 +++-- lustre/mds/mds_lov.c | 2 +- lustre/obdclass/lprocfs_status.c | 2 ++ lustre/obdfilter/filter.c | 40 ++++++++++++++++++++++++++++---------- lustre/osc/osc_create.c | 13 +++++++++++++ lustre/tests/recovery-small.sh | 25 ++++++++++++++++++++++++ lustre/utils/wirecheck.c | 1 + 7 files changed, 75 insertions(+), 13 deletions(-) diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 9f5190a..5ea31cf 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -764,7 +764,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos */ #define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ #define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ -#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ /* also update obd_connect_names[] for lprocfs_rd_connect_flags() * and lustre/utils/wirecheck.c */ @@ -794,7 +795,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_CHANGE_QS | \ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_RMT_CLIENT | \ OBD_CONNECT_RMT_CLIENT_FORCE | \ - OBD_CONNECT_MDS) + 
OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT) diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 9c6142f..51b2fbc 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -643,7 +643,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FID | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CKSUM | OBD_CONNECT_CHANGE_QS | OBD_CONNECT_AT | - OBD_CONNECT_MDS; + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN; #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; #endif diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index a6f0311..043ba88 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -808,6 +808,8 @@ static const char *obd_connect_names[] = { "fid_is_enabled", "version_recovery", "pools", + "", /* reserved for simplified interop */ + "skip_orphan", NULL }; diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 191b3e5..d6782e2 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2670,6 +2670,11 @@ static int filter_connect_internal(struct obd_export *exp, exp->exp_connect_flags = data->ocd_connect_flags; data->ocd_version = LUSTRE_VERSION_CODE; + /* Kindly make sure the SKIP_ORPHAN flag is from MDS. 
*/ + if (!ergo(data->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN, + data->ocd_connect_flags & OBD_CONNECT_MDS)) + RETURN(-EPROTO); + if (exp->exp_connect_flags & OBD_CONNECT_GRANT) { obd_size left, want; @@ -3491,11 +3496,12 @@ static int filter_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, /* caller must hold fo_create_locks[oa->o_gr] */ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, - struct filter_obd *filter) + struct filter_obd *filter) { struct obdo doa; /* XXX obdo on stack */ obd_id last, id; - int rc; + int rc = 0; + int skip_orphan; ENTRY; LASSERT(oa); @@ -3517,8 +3523,11 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, last = filter_last_id(filter, doa.o_gr); - CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"\n", - exp->exp_obd->obd_name, oa->o_id + 1, last); + skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN); + + CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", + exp->exp_obd->obd_name, oa->o_id + 1, last, + skip_orphan ? ", orphan objids won't be reused any more." : "."); for (id = last; id > oa->o_id; id--) { doa.o_id = id; @@ -3526,17 +3535,26 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, if (rc && rc != -ENOENT) /* this is pretty fatal... */ CEMERG("error destroying precreate objid "LPU64": %d\n", id, rc); - filter_set_last_id(filter, id - 1, doa.o_gr); + /* update last_id on disk periodically so that if we restart * we don't need to re-scan all of the just-deleted objects. 
*/ - if ((id & 511) == 0) + if ((id & 511) == 0 && !skip_orphan) { + filter_set_last_id(filter, id - 1, doa.o_gr); filter_update_last_objid(exp->exp_obd, doa.o_gr, 0); + } } CDEBUG(D_HA, "%s: after destroy: set last_objids["LPU64"] = "LPU64"\n", exp->exp_obd->obd_name, doa.o_gr, oa->o_id); - rc = filter_update_last_objid(exp->exp_obd, doa.o_gr, 1); + if (!skip_orphan) { + filter_set_last_id(filter, id, doa.o_gr); + rc = filter_update_last_objid(exp->exp_obd, doa.o_gr, 1); + } else { + /* don't reuse orphan object, return last used objid */ + oa->o_id = last; + rc = 0; + } clear_bit(doa.o_gr, &filter->fo_destroys_in_progress); RETURN(rc); @@ -3555,6 +3573,8 @@ static int filter_handle_precreate(struct obd_export *exp, struct obdo *oa, /* delete orphans request */ if ((oa->o_valid & OBD_MD_FLFLAGS) && (oa->o_flags & OBD_FL_DELORPHAN)){ + obd_id last = filter_last_id(filter, group); + if (oti->oti_conn_cnt < exp->exp_conn_cnt) { CERROR("%s: dropping old orphan cleanup request\n", obd->obd_name); @@ -3569,14 +3589,14 @@ static int filter_handle_precreate(struct obd_export *exp, struct obdo *oa, up(&filter->fo_create_locks[group]); RETURN(0); } - diff = oa->o_id - filter_last_id(filter, group); + diff = oa->o_id - last; CDEBUG(D_HA, "filter_last_id() = "LPU64" -> diff = %d\n", - filter_last_id(filter, group), diff); + last, diff); if (-diff > OST_MAX_PRECREATE) { CERROR("%s: ignoring bogus orphan destroy request: " "obdid "LPU64" last_id "LPU64"\n", obd->obd_name, - oa->o_id, filter_last_id(filter, group)); + oa->o_id, last); /* FIXME: should reset precreate_next_id on MDS */ GOTO(out, rc = -EINVAL); } diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index e432aa4..8b71f60 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -309,6 +309,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md **ea, struct obd_trans_info *oti) { struct osc_creator *oscc = &exp->exp_obd->u.cli.cl_oscc; + struct obd_import *imp = 
exp->exp_obd->u.cli.cl_import; struct lov_stripe_md *lsm; int try_again = 1, rc = 0; ENTRY; @@ -355,10 +356,22 @@ int osc_create(struct obd_export *exp, struct obdo *oa, spin_lock(&oscc->oscc_lock); oscc->oscc_flags &= ~OSCC_FLAG_SYNC_IN_PROGRESS; if (rc == 0 || rc == -ENOSPC) { + struct obd_connect_data *ocd; + if (rc == -ENOSPC) oscc->oscc_flags |= OSCC_FLAG_NOSPC; oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING; + oscc->oscc_last_id = oa->o_id; + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) { + CWARN("Skip orphan set, reset the last objid\n"); + oscc->oscc_next_id = oa->o_id + 1; + } + + /* sanity check for next objid. see bug 17025 */ + LASSERT(oscc->oscc_next_id == oa->o_id + 1); + CDEBUG(D_HA, "%s: oscc recovery finished, last_id: " LPU64", rc: %d\n", oscc->oscc_obd->obd_name, oscc->oscc_last_id, rc); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 41087cf..943b41d 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -1035,6 +1035,31 @@ of $NUM_FILES" } run_test 60 "Add Changelog entries during MDS failover" +test_61() +{ + local cflags='osc.*-OST0000-osc-MDT*.connect_flags' + do_facet $SINGLEMDS "lctl get_param -n $cflags" |grep -q skip_orphan + [ $? 
-ne 0 ] && skip "don't have skip orphan feature" && return + + mkdir -p $DIR/$tdir || error "mkdir dir $DIR/$tdir failed" + # Set the default stripe of $DIR/$tdir to put the files to ost1 + $LFS setstripe -c 1 --index 0 $DIR/$tdir + + replay_barrier $SINGLEMDS + createmany -o $DIR/$tdir/$tfile-%d 10 + local oid=`do_facet ost1 "lctl get_param -n obdfilter.*OST0000.last_id"` + + fail_abort $SINGLEMDS + + touch $DIR/$tdir/$tfile + local id=`$LFS getstripe $DIR/$tdir/$tfile |awk '$2 ~ /^[1-9]+/ {print $2}'` + [ $id -le $oid ] && error "the orphan objid was reused, failed" + + # Cleanup + rm -rf $DIR/$tdir +} +run_test 61 "Verify to not reuse orphan objects - bug 17025" + equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 956af80..490c6a9 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -217,6 +217,7 @@ static void check_obd_connect_data(void) CHECK_CDEFINE(OBD_CONNECT_AT); CHECK_CDEFINE(OBD_CONNECT_CANCELSET); CHECK_CDEFINE(OBD_CONNECT_LRU_RESIZE); + CHECK_CDEFINE(OBD_CONNECT_SKIP_ORPHAN); } static void -- 1.8.3.1