From d69538b3c370460101e5e52d3164a54542dc31e0 Mon Sep 17 00:00:00 2001 From: Bob Glossman Date: Mon, 4 Nov 2013 07:18:27 -0800 Subject: [PATCH 1/1] LU-14 utils: allow formatting OST without VIRGIN flag When reformatting a new target to replace an existing target, the "mkfs.lustre --replace" option allows formatting an OST (or MDT?) with the same index as a previously used OST without setting the LDD_F_VIRGIN flag. This is required in the case where an OST was lost due to massive corruption or critical hardware failure. Otherwise, the newly formatted target will try to register with the MGS as a new target, but will be refused by the MGS due to having an already-used index. The OFD code skips precreating all of the objects in the filesystem if the MDS requests an object ID too much larger than the current LAST_ID. In this case (which can happen if the OST is replaced or restored from an older backup) only the most recent objects are precreated, and the deletion of orphans from a too-large precreate is left to lfsck. 
Lustre-commit: db6613f5bed1606cc8f97b46d1b298746af03a75 Lustre-change: http://review.whamcloud.com/7443 Test-Parameters: testlist=conf-sanity envdefinitions=SLOW=yes Signed-off-by: Bob Glossman Signed-off-by: Andreas Dilger Reviewed-by: Sebastien Buisson Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin Change-Id: I5b773bd7116df4d7875ff31bdd0c873377f83116 Reviewed-on: http://review.whamcloud.com/8159 Tested-by: Jenkins Tested-by: Maloo --- lustre/doc/mkfs.lustre.8 | 9 ++++- lustre/ofd/ofd_obd.c | 16 ++++++++ lustre/tests/conf-sanity.sh | 40 +++++++++++++++++++- lustre/tests/sanity.sh | 4 +- lustre/utils/mkfs_lustre.c | 89 ++++++++++++++++++++++++--------------------- 5 files changed, 112 insertions(+), 46 deletions(-) diff --git a/lustre/doc/mkfs.lustre.8 b/lustre/doc/mkfs.lustre.8 index a189849..865448b 100644 --- a/lustre/doc/mkfs.lustre.8 +++ b/lustre/doc/mkfs.lustre.8 @@ -111,7 +111,14 @@ Return errors instead of waiting for recovery Print less information. .TP .BI \--reformat -Reformat an existing Lustre disk +Reformat an existing Lustre disk as a new target +.TP +.BI \--replace +Used to initialize a target with the same +.I --index +as a previously used target if the old target was permanently lost for +some reason (e.g. multiple disk failure or massive corruption). This +avoids having the target try to register as a new target with the MGS. .TP .BI \--stripe-count-hint= stripes Used for optizing MDT inode size diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index ed80a3c..0116067 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -1297,6 +1297,22 @@ int ofd_create(const struct lu_env *env, struct obd_export *exp, } } + /* This can happen if a new OST is formatted and installed + * in place of an old one at the same index. Instead of + * precreating potentially millions of deleted old objects + * (possibly filling the OST), only precreate the last batch. + * LFSCK will eventually clean up any orphans. 
LU-14 */ + if (diff > 5 * OST_MAX_PRECREATE) { + diff = OST_MAX_PRECREATE / 2; + LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %u " + "larger than the LAST_ID "DOSTID", only " + "precreating the last %u objects.\n", + ofd_name(ofd), POSTID(&oa->o_oi), + 5 * OST_MAX_PRECREATE, + POSTID(&oseq->os_oi), diff); + ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff); + } + while (diff > 0) { next_id = ofd_seq_last_oid(oseq) + 1; count = ofd_precreate_batch(ofd, diff); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 2b1501f..426b9ba 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -86,7 +86,7 @@ init_logging require_dsh_mds || exit 0 require_dsh_ost || exit 0 # -[ "$SLOW" = "no" ] && EXCEPT_SLOW="30a 31 45" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="30a 31 45 69" assert_DIR @@ -3775,6 +3775,44 @@ test_67() { #LU-2950 } run_test 67 "test routes conversion and configuration" +test_69() { + setup + + # use OST0000 since it probably has the most creations + local OSTNAME=$(ostname_from_index 0) + local mdtosc_proc1=$(get_mdtosc_proc_path mds1 $OSTNAME) + local last_id=$(do_facet mds1 lctl get_param -n \ + osc.$mdtosc_proc1.prealloc_last_id) + + # Want to have OST LAST_ID over 5 * OST_MAX_PRECREATE to + # verify that the LAST_ID recovery is working properly. 
If + # not, then the OST will refuse to allow the MDS connect + # because the LAST_ID value is too different from the MDS + #define OST_MAX_PRECREATE=20000 + local num_create=$((20000 * 5 + 100)) + + mkdir -p $DIR/$tdir + $LFS setstripe -i 0 $DIR/$tdir + createmany -o $DIR/$tdir/$tfile- $num_create + # delete all of the files with objects on OST0 so the + # filesystem is not inconsistent later on + $LFS find $MOUNT --ost 0 | xargs rm + + stop_ost || error "OST0 stop failure" + add ost1 $(mkfs_opts ost1 $(ostdevname 1)) --reformat --replace \ + $(ostdevname 1) $(ostvdevname 1) || + error "reformat and replace $ostdev failed" + start_ost || error "OST0 restart failure" + wait_osc_import_state mds ost FULL + + touch $DIR/$tdir/$tfile-last || error "create file after reformat" + local idx=$($LFS getstripe -i $DIR/$tdir/$tfile-last) + [ $idx -ne 0 ] && error "$DIR/$tdir/$tfile-last on $idx not 0" || true + + cleanup +} +run_test 69 "replace an OST with the same index" + test_70a() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return local MDTIDX=1 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 32067d1..9ba8d32 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1383,9 +1383,9 @@ exhaust_precreations() { # on the mdt's osc local mdtosc_proc1=$(get_mdtosc_proc_path mds${MDSIDX} $OST) local last_id=$(do_facet mds${MDSIDX} lctl get_param -n \ - osc.$mdtosc_proc1.prealloc_last_id) + osc.$mdtosc_proc1.prealloc_last_id) local next_id=$(do_facet mds${MDSIDX} lctl get_param -n \ - osc.$mdtosc_proc1.prealloc_next_id) + osc.$mdtosc_proc1.prealloc_next_id) local mdtosc_proc2=$(get_mdtosc_proc_path mds${MDSIDX}) do_facet mds${MDSIDX} lctl get_param osc.$mdtosc_proc2.prealloc* diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index fa36098..dd4a0b9 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -77,7 +77,6 @@ char *progname; int verbose = 1; static int print_only = 0; -static int upgrade_to_18 = 
0; #ifdef HAVE_LDISKFS_OSD #define FSLIST_LDISKFS "ldiskfs" @@ -152,6 +151,7 @@ void usage(FILE *out) "\t\t--device-size=#N(KB): device size for loop devices\n" "\t\t--mkfsoptions=: format options\n" "\t\t--reformat: overwrite an existing disk\n" + "\t\t--replace: replace an old target with the same index\n" "\t\t--stripe-count-hint=#N: for optimizing MDT inode size\n" #else "\t\t--erase-params: erase all old parameter settings\n" @@ -273,44 +273,45 @@ static char *convert_hostnames(char *s1) int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, char **mountopts) { - static struct option long_opt[] = { - {"backfstype", 1, 0, 'b'}, - {"stripe-count-hint", 1, 0, 'c'}, - {"comment", 1, 0, 'u'}, - {"configdev", 1, 0, 'C'}, - {"device-size", 1, 0, 'd'}, - {"dryrun", 0, 0, 'n'}, - {"erase-params", 0, 0, 'e'}, - {"failnode", 1, 0, 'f'}, - {"failover", 1, 0, 'f'}, - {"mgs", 0, 0, 'G'}, - {"help", 0, 0, 'h'}, - {"index", 1, 0, 'i'}, - {"mkfsoptions", 1, 0, 'k'}, - {"mgsnode", 1, 0, 'm'}, - {"mgsnid", 1, 0, 'm'}, - {"mdt", 0, 0, 'M'}, - {"fsname",1, 0, 'L'}, - {"noformat", 0, 0, 'n'}, - {"nomgs", 0, 0, 'N'}, - {"mountfsoptions", 1, 0, 'o'}, - {"ost", 0, 0, 'O'}, - {"param", 1, 0, 'p'}, - {"print", 0, 0, 'n'}, - {"quiet", 0, 0, 'q'}, - {"reformat", 0, 0, 'r'}, - {"servicenode", 1, 0, 's'}, - {"verbose", 0, 0, 'v'}, - {"writeconf", 0, 0, 'w'}, - {"upgrade_to_18", 0, 0, 'U'}, - {"network", 1, 0, 't'}, - {"quota", 0, 0, 'Q'}, - {0, 0, 0, 0} - }; - char *optstring = "b:c:C:d:ef:Ghi:k:L:m:MnNo:Op:Pqrs:t:Uu:vw"; - int opt; - int rc, longidx; - int failnode_set = 0, servicenode_set = 0; + static struct option long_opt[] = { + { "backfstype", required_argument, NULL, 'b' }, + { "stripe-count-hint", required_argument, NULL, 'c' }, + { "comment", required_argument, NULL, 'u' }, + { "configdev", required_argument, NULL, 'C' }, + { "device-size", required_argument, NULL, 'd' }, + { "dryrun", no_argument, NULL, 'n' }, + { "erase-params", no_argument, NULL, 'e' }, + { 
"failnode", required_argument, NULL, 'f' }, + { "failover", required_argument, NULL, 'f' }, + { "mgs", no_argument, NULL, 'G' }, + { "help", no_argument, NULL, 'h' }, + { "index", required_argument, NULL, 'i' }, + { "mkfsoptions", required_argument, NULL, 'k' }, + { "mgsnode", required_argument, NULL, 'm' }, + { "mgsnid", required_argument, NULL, 'm' }, + { "mdt", no_argument, NULL, 'M' }, + { "fsname", required_argument, NULL, 'L' }, + { "noformat", no_argument, NULL, 'n' }, + { "nomgs", no_argument, NULL, 'N' }, + { "mountfsoptions", required_argument, NULL, 'o' }, + { "ost", no_argument, NULL, 'O' }, + { "param", required_argument, NULL, 'p' }, + { "print", no_argument, NULL, 'n' }, + { "quiet", no_argument, NULL, 'q' }, + { "quota", no_argument, NULL, 'Q' }, + { "reformat", no_argument, NULL, 'r' }, + { "replace", no_argument, NULL, 'R' }, + { "servicenode", required_argument, NULL, 's' }, + { "network", required_argument, NULL, 't' }, + { "verbose", no_argument, NULL, 'v' }, + { "writeconf", no_argument, NULL, 'w' }, + { 0, 0, NULL, 0 } + }; + char *optstring = "b:c:C:d:ef:Ghi:k:L:m:MnNo:Op:PqrRs:t:Uu:vw"; + int opt; + int rc, longidx; + int failnode_set = 0, servicenode_set = 0; + int replace = 0; while ((opt = getopt_long(argc, argv, optstring, long_opt, &longidx)) != EOF) { @@ -474,6 +475,9 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, case 'r': mop->mo_flags |= MO_FORCEFORMAT; break; + case 'R': + replace = 1; + break; case 't': if (!IS_MDT(&mop->mo_ldd) && !IS_OST(&mop->mo_ldd)) { badopt(long_opt[longidx].name, "MDT,OST"); @@ -500,9 +504,6 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, case 'w': mop->mo_ldd.ldd_flags |= LDD_F_WRITECONF; break; - case 'U': - upgrade_to_18 = 1; - break; case 'Q': mop->mo_flags |= MO_QUOTA; break; @@ -515,6 +516,10 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, } }//while + /* Need to clear this flag after parsing 'L' and 'i' options. 
*/ + if (replace) + mop->mo_ldd.ldd_flags &= ~LDD_F_VIRGIN; + if (optind == argc) { /* The user didn't specify device name */ fatal(); -- 1.8.3.1