From: Andreas Dilger Date: Fri, 9 Aug 2013 21:03:05 +0000 (-0600) Subject: LU-14 utils: allow formatting OST without VIRGIN flag X-Git-Tag: 2.5.0~40 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=db6613f5bed1606cc8f97b46d1b298746af03a75;hp=c54e61cef76eb0e00c9e69729cf4ac8578c90f47 LU-14 utils: allow formatting OST without VIRGIN flag When reformatting a new target to replace an existing target, the "mkfs.lustre --replace" option allows formatting an OST (or MDT?) with the same index as a previously used OST without setting the LDD_F_VIRGIN flag. This is required in the case where an OST was lost due to massive corruption or critical hardware failure. Otherwise, the newly formatted target will try to register with the MGS as a new target, but will be refused by the MGS due to having an already-used index. The OFD code skips precreating all of the objects in the filesystem if the MDS requests an object ID too much larger than the current LAST_ID. In this case (which can happen if the OST is replaced or restored from an older backup) only the most recent objects are precreated, and the deletion of orphans from a too-large precreate is left to lfsck. Signed-off-by: Andreas Dilger Change-Id: Iea57167346627eeb85ac40c17f3ea4596b3ebbe5 Reviewed-on: http://review.whamcloud.com/7443 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Sebastien Buisson Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- diff --git a/lustre/doc/mkfs.lustre.8 b/lustre/doc/mkfs.lustre.8 index a189849..865448b 100644 --- a/lustre/doc/mkfs.lustre.8 +++ b/lustre/doc/mkfs.lustre.8 @@ -111,7 +111,14 @@ Return errors instead of waiting for recovery Print less information. .TP .BI \--reformat -Reformat an existing Lustre disk +Reformat an existing Lustre disk as a new target +.TP +.BI \--replace +Used to initialize a target with the same +.I --index +as a previously used target if the old target was permanently lost for +some reason (e.g. 
multiple disk failure or massive corruption). This +avoids having the target try to register as a new target with the MGS. .TP .BI \--stripe-count-hint= stripes Used for optizing MDT inode size diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 55ba8d0..c0a6fcd 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -1298,6 +1298,22 @@ int ofd_create(const struct lu_env *env, struct obd_export *exp, } } + /* This can happen if a new OST is formatted and installed + * in place of an old one at the same index. Instead of + * precreating potentially millions of deleted old objects + * (possibly filling the OST), only precreate the last batch. + * LFSCK will eventually clean up any orphans. LU-14 */ + if (diff > 5 * OST_MAX_PRECREATE) { + diff = OST_MAX_PRECREATE / 2; + LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %u " + "larger than the LAST_ID "DOSTID", only " + "precreating the last %u objects.\n", + ofd_name(ofd), POSTID(&oa->o_oi), + 5 * OST_MAX_PRECREATE, + POSTID(&oseq->os_oi), diff); + ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff); + } + while (diff > 0) { next_id = ofd_seq_last_oid(oseq) + 1; count = ofd_precreate_batch(ofd, diff); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 1f22874..4f6dcda 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -3862,6 +3862,40 @@ test_68() { } run_test 68 "be able to reserve specific sequences in FLDB" +test_69() { + setup + + # use OST0000 since it probably has the most creations + local OSTNAME=$(ostname_from_index 0) + local mdtosc_proc1=$(get_mdtosc_proc_path mds1 $OSTNAME) + local last_id=$(do_facet mds1 lctl get_param -n \ + osc.$mdtosc_proc1.prealloc_last_id) + + # Want to have OST LAST_ID over 1.5 * OST_MAX_PRECREATE to + # verify that the LAST_ID recovery is working properly. 
If + # not, then the OST will refuse to allow the MDS connect + # because the LAST_ID value is too different from the MDS + #define OST_MAX_PRECREATE=20000 + local num_create=$((20000 * 3/2 - $last_id + 100)) + + mkdir $DIR/$tdir + $LFS setstripe -i 0 $DIR/$tdir + createmany $DIR/$tdir/$tfile- $num_create + # delete all of the files with objects on OST0 so the + # filesystem is not inconsistent later on + $LFS find $MOUNT --index 0 -print0 | xargs -0 unlink + + stop_ost || error "OST0 stop failure" + add ost1 $(mkfs_opts ost1 $ostdev) --reformat --replace $ostdev || + error "reformat and replace $ostdev failed" + start_ost || error "OST0 restart failure" + + touch $DIR/$tdir/$tfile-last || error "create file after reformat" + local idx=$($LFS getstripe -c $DIR/$tdir/$tfile-last) + [ $idx -ne 0 ] && error "$DIR/$tdir/$tfile-last on $idx not 0" || true +} +run_test 69 "replace an OST with the same index" + test_70a() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return local MDTIDX=1 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 20050c2..9280dbd 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1358,9 +1358,9 @@ exhaust_precreations() { # on the mdt's osc local mdtosc_proc1=$(get_mdtosc_proc_path mds${MDSIDX} $OST) local last_id=$(do_facet mds${MDSIDX} lctl get_param -n \ - osc.$mdtosc_proc1.prealloc_last_id) + osc.$mdtosc_proc1.prealloc_last_id) local next_id=$(do_facet mds${MDSIDX} lctl get_param -n \ - osc.$mdtosc_proc1.prealloc_next_id) + osc.$mdtosc_proc1.prealloc_next_id) local mdtosc_proc2=$(get_mdtosc_proc_path mds${MDSIDX}) do_facet mds${MDSIDX} lctl get_param osc.$mdtosc_proc2.prealloc* diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index 646a767..ec85e08 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -77,7 +77,6 @@ char *progname; int verbose = 1; static int print_only = 0; -static int upgrade_to_18 = 0; #ifdef HAVE_LDISKFS_OSD #define FSLIST_LDISKFS "ldiskfs" 
@@ -152,6 +151,7 @@ void usage(FILE *out) "\t\t--device-size=#N(KB): device size for loop devices\n" "\t\t--mkfsoptions=: format options\n" "\t\t--reformat: overwrite an existing disk\n" + "\t\t--replace: replace an old target with the same index\n" "\t\t--stripe-count-hint=#N: for optimizing MDT inode size\n" #else "\t\t--erase-params: erase all old parameter settings\n" @@ -273,44 +273,45 @@ static char *convert_hostnames(char *s1) int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, char **mountopts) { - static struct option long_opt[] = { - {"backfstype", 1, 0, 'b'}, - {"stripe-count-hint", 1, 0, 'c'}, - {"comment", 1, 0, 'u'}, - {"configdev", 1, 0, 'C'}, - {"device-size", 1, 0, 'd'}, - {"dryrun", 0, 0, 'n'}, - {"erase-params", 0, 0, 'e'}, - {"failnode", 1, 0, 'f'}, - {"failover", 1, 0, 'f'}, - {"mgs", 0, 0, 'G'}, - {"help", 0, 0, 'h'}, - {"index", 1, 0, 'i'}, - {"mkfsoptions", 1, 0, 'k'}, - {"mgsnode", 1, 0, 'm'}, - {"mgsnid", 1, 0, 'm'}, - {"mdt", 0, 0, 'M'}, - {"fsname",1, 0, 'L'}, - {"noformat", 0, 0, 'n'}, - {"nomgs", 0, 0, 'N'}, - {"mountfsoptions", 1, 0, 'o'}, - {"ost", 0, 0, 'O'}, - {"param", 1, 0, 'p'}, - {"print", 0, 0, 'n'}, - {"quiet", 0, 0, 'q'}, - {"reformat", 0, 0, 'r'}, - {"servicenode", 1, 0, 's'}, - {"verbose", 0, 0, 'v'}, - {"writeconf", 0, 0, 'w'}, - {"upgrade_to_18", 0, 0, 'U'}, - {"network", 1, 0, 't'}, - {"quota", 0, 0, 'Q'}, - {0, 0, 0, 0} - }; - char *optstring = "b:c:C:d:ef:Ghi:k:L:m:MnNo:Op:Pqrs:t:Uu:vw"; - int opt; - int rc, longidx; - int failnode_set = 0, servicenode_set = 0; + static struct option long_opt[] = { + { "backfstype", required_argument, NULL, 'b' }, + { "stripe-count-hint", required_argument, NULL, 'c' }, + { "comment", required_argument, NULL, 'u' }, + { "configdev", required_argument, NULL, 'C' }, + { "device-size", required_argument, NULL, 'd' }, + { "dryrun", no_argument, NULL, 'n' }, + { "erase-params", no_argument, NULL, 'e' }, + { "failnode", required_argument, NULL, 'f' }, + { "failover", 
required_argument, NULL, 'f' }, + { "mgs", no_argument, NULL, 'G' }, + { "help", no_argument, NULL, 'h' }, + { "index", required_argument, NULL, 'i' }, + { "mkfsoptions", required_argument, NULL, 'k' }, + { "mgsnode", required_argument, NULL, 'm' }, + { "mgsnid", required_argument, NULL, 'm' }, + { "mdt", no_argument, NULL, 'M' }, + { "fsname", required_argument, NULL, 'L' }, + { "noformat", no_argument, NULL, 'n' }, + { "nomgs", no_argument, NULL, 'N' }, + { "mountfsoptions", required_argument, NULL, 'o' }, + { "ost", no_argument, NULL, 'O' }, + { "param", required_argument, NULL, 'p' }, + { "print", no_argument, NULL, 'n' }, + { "quiet", no_argument, NULL, 'q' }, + { "quota", no_argument, NULL, 'Q' }, + { "reformat", no_argument, NULL, 'r' }, + { "replace", no_argument, NULL, 'R' }, + { "servicenode", required_argument, NULL, 's' }, + { "network", required_argument, NULL, 't' }, + { "verbose", no_argument, NULL, 'v' }, + { "writeconf", no_argument, NULL, 'w' }, + { 0, 0, NULL, 0 } + }; + char *optstring = "b:c:C:d:ef:Ghi:k:L:m:MnNo:Op:PqrRs:t:Uu:vw"; + int opt; + int rc, longidx; + int failnode_set = 0, servicenode_set = 0; + int replace = 0; while ((opt = getopt_long(argc, argv, optstring, long_opt, &longidx)) != EOF) { @@ -474,6 +475,9 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, case 'r': mop->mo_flags |= MO_FORCEFORMAT; break; + case 'R': + replace = 1; + break; case 't': if (!IS_MDT(&mop->mo_ldd) && !IS_OST(&mop->mo_ldd)) { badopt(long_opt[longidx].name, "MDT,OST"); @@ -500,9 +504,6 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, case 'w': mop->mo_ldd.ldd_flags |= LDD_F_WRITECONF; break; - case 'U': - upgrade_to_18 = 1; - break; case 'Q': mop->mo_flags |= MO_QUOTA; break; @@ -515,6 +516,10 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, } }//while + /* Need to clear this flag after parsing 'L' and 'i' options. 
*/ + if (replace) + mop->mo_ldd.ldd_flags &= ~LDD_F_VIRGIN; + if (optind == argc) { /* The user didn't specify device name */ fatal();