From 716cca7142fd5be67fe55850e297d06132c23a99 Mon Sep 17 00:00:00 2001 From: rcorreia Date: Fri, 13 Mar 2009 05:51:09 +0000 Subject: [PATCH] Branch b_hd_kdmu b=17623 Initial mkfs.lustre implementation for DMU/ZFS backend. Also implements new label format. --- lustre/include/lustre_disk.h | 14 +++- lustre/mgs/mgs_llog.c | 6 ++ lustre/obdclass/obd_mount.c | 21 ++++-- lustre/utils/mkfs_lustre.c | 174 ++++++++++++++++++++++++++++++++++--------- lustre/utils/mount_utils.c | 29 +++++++- lustre/utils/mount_utils.h | 1 + 6 files changed, 198 insertions(+), 47 deletions(-) diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 34dfc67..3086083 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -143,10 +143,16 @@ static inline int server_make_name(__u32 flags, __u16 index, char *fs, char *name) { if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { - if (!(flags & LDD_F_SV_ALL)) - sprintf(name, "%.8s-%s%04x", fs, - (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST", - index); + if (!(flags & LDD_F_SV_ALL)) { + sprintf(name, "%.8s%c%s", fs, + (flags & LDD_F_VIRGIN) ? ':' : '-', + (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST"); + + if (flags & LDD_F_NEED_INDEX) + sprintf(&name[strlen(name)], "u%03x", index); + else + sprintf(&name[strlen(name)], "%04x", index); + } } else if (flags & LDD_F_SV_TYPE_MGS) { sprintf(name, "MGS"); } else { diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c index 11aea49..316196f 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -532,6 +532,7 @@ int mgs_set_index(struct obd_device *obd, struct mgs_target_info *mti) if (rc == -1) RETURN(-ERANGE); mti->mti_stripe_index = rc; + mti->mti_flags &= ~LDD_F_NEED_INDEX; } if (mti->mti_stripe_index >= INDEX_MAP_SIZE * 8) { @@ -2600,6 +2601,11 @@ int mgs_write_log_target(struct obd_device *obd, if (mti->mti_flags & (LDD_F_VIRGIN | LDD_F_UPGRADE14 | LDD_F_WRITECONF)) { + /* Update target name from fsname:XXXyyyy -> fsname-XXXyyyy */ + mti->mti_flags &= ~LDD_F_VIRGIN; + server_make_name(mti->mti_flags, mti->mti_stripe_index, + mti->mti_fsname, mti->mti_svname); + /* Generate a log from scratch */ if (mti->mti_flags & LDD_F_SV_TYPE_MDT) { rc = mgs_write_log_mdt(obd, fsdb, mti); diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 84e41f6..ac5e892 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -1916,14 +1916,17 @@ int server_name2index(char *svname, __u32 *idx, char **endptr) unsigned long index; int rc; char *dash = strrchr(svname, '-'); - if (!dash) - return(-EINVAL); + if (!dash) { + dash = strrchr(svname, ':'); + if (!dash) + return(-EINVAL); + } /* intepret -MDTXXXXX-mdc as mdt, the better way is to pass * in the fsname, then determine the server index */ if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) { dash--; - for (; dash > svname && *dash != '-'; dash--); + for (; dash > svname && *dash != '-' && *dash != ':'; dash--); if (dash == svname) return(-EINVAL); } @@ -1934,10 +1937,18 @@ int server_name2index(char *svname, __u32 *idx, char **endptr) rc = LDD_F_SV_TYPE_OST; else return(-EINVAL); - if (strcmp(dash + 4, "all") == 0) + + dash += 4; + + if (strcmp(dash, "all") == 0) return rc | LDD_F_SV_ALL; - index = simple_strtoul(dash + 4, endptr, 16); + if (*dash == 'u') { + rc |= LDD_F_NEED_INDEX; + dash++; + } + + index = simple_strtoul(dash, endptr, 16); *idx = index; return rc; } diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index 880206c..c611009 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -82,16 +82,17 @@ #define MAX_LOOP_DEVICES 16 #define L_BLOCK_SIZE 4096 -#define INDEX_UNASSIGNED 0xFFFF #define MO_IS_LOOP 0x01 #define MO_FORCEFORMAT 0x02 /* used to describe the options to format the lustre disk, not persistent */ struct mkfs_opts { struct lustre_disk_data mo_ldd; /* to be written in MOUNT_DATA_FILE */ - char mo_device[128]; /* disk device name */ + char mo_device[128]; /* disk device name or ZFS pool name */ + char **mo_pool_vdevs; /* list of pool vdevs */ char mo_mkfsopts[128]; /* options to the backing-store mkfs */ char mo_loopdev[128]; /* in case a loop dev is needed */ + char mo_osname[256]; /* ZFS objset name */ __u64 mo_device_sz; /* in KB */ int mo_stripe_count; int mo_flags; @@ -103,13 +104,24 @@ int verbose = 1; static int print_only = 0; static int failover = 0; static int upgrade_to_18 = 0; +static int force_zpool = 0; void usage(FILE *out) { fprintf(out, "%s v"LUSTRE_VERSION_STRING"\n", progname); - fprintf(out, "usage: %s [options] \n", progname); + fprintf(out, "usage: %s [--backfstype=zfs] [options] " + " [[] [ ...] " + "[[vdev type>] ...]]\n", progname); + fprintf(out, "usage: %s --backfstype=ext3|ldiskfs " + "[options] \n", progname); fprintf(out, "\t:block device or file (e.g /dev/sda or /tmp/ost1)\n" + "\t: name of the new ZFS pool. It can also be an " + "existing pool, in which case you should not provide a list " + "of devices.\n" + "\t: type of vdev (mirror, raidz, raidz2, spare, " + "cache, log)\n" + "\n" "\ttarget types:\n" "\t\t--ost: object storage, mutually exclusive with mdt,mgs\n" "\t\t--mdt: metadata storage, mutually exclusive with ost\n" @@ -130,10 +142,11 @@ void usage(FILE *out) "\t\t--comment=: arbitrary user string (%d bytes)\n" "\t\t--mountfsoptions= : permanent mount options\n" #ifndef TUNEFS - "\t\t--backfstype= : backing fs type (ext3, ldiskfs)\n" + "\t\t--backfstype= : backing fs type (zfs, ext3, ldiskfs)\n" "\t\t--device-size=#N(KB) : device size for loop devices\n" "\t\t--mkfsoptions= : format options\n" "\t\t--reformat: overwrite an existing disk\n" + "\t\t--force-create : force the creation of a ZFS pool\n" "\t\t--stripe-count-hint=#N : used for optimizing MDT inode size\n" "\t\t--iam-dir: make use of IAM directory format on backfs, incompatible with ext3.\n" #else @@ -402,6 +415,7 @@ static void disp_old_e2fsprogs_msg(const char *feature, int make_backfs) } /* Check whether the file exists in the device */ +#ifndef TUNEFS static int file_in_dev(char *file_name, char *dev_name) { FILE *fp; @@ -445,6 +459,11 @@ static int is_lustre_target(struct mkfs_opts *mop) vprint("checking for existing Lustre data: "); + if (mop->mo_ldd.ldd_mount_type == LDD_MT_ZFS) { + vprint("WARNING: this functionality is currently disabled.\n"); + return 0; + } + if ((rc = file_in_dev(MOUNT_DATA_FILE, mop->mo_device))) { vprint("found %s\n", (rc == 1) ? MOUNT_DATA_FILE : "extents"); @@ -461,6 +480,7 @@ static int is_lustre_target(struct mkfs_opts *mop) vprint("not found\n"); return 0; /* The device is not a lustre target. */ } +#endif /* Check if a certain feature is supported by e2fsprogs. * Firstly we try to use "debugfs supported_features" command to check if @@ -675,7 +695,69 @@ int make_lustre_backfs(struct mkfs_opts *mop) } snprintf(mkfs_cmd, sizeof(mkfs_cmd), "mkreiserfs -ff "); } else if (mop->mo_ldd.ldd_mount_type == LDD_MT_ZFS) { - snprintf(mkfs_cmd, sizeof(mkfs_cmd), "echo "); + if (mop->mo_pool_vdevs != NULL) { + /* We are creating a new ZFS pool */ + snprintf(mkfs_cmd, sizeof(mkfs_cmd), + "zpool create %s%s", force_zpool ? "-f " : "", + mop->mo_device); + + /* Add the vdevs to the cmd line */ + while (*mop->mo_pool_vdevs != NULL) { + strscat(mkfs_cmd, " ", sizeof(mkfs_cmd)); + strscat(mkfs_cmd, *mop->mo_pool_vdevs, + sizeof(mkfs_cmd)); + mop->mo_pool_vdevs++; /* point to next vdev */ + } + + vprint("\ncreating ZFS pool '%s'...\n", mop->mo_device); + vprint("zpool_cmd = '%s'\n", mkfs_cmd); + + ret = run_command(mkfs_cmd, sizeof(mkfs_cmd)); + if (ret) { + fatal(); + fprintf(stderr, "Unable to create pool '%s' " + "(%d)\n", mop->mo_device, ret); + return ret; + } + } + +retry: + snprintf(mop->mo_osname, sizeof(mop->mo_osname), "%s/%s", + mop->mo_device, mop->mo_ldd.ldd_svname); + + /* Create the ZFS filesystem */ + snprintf(mkfs_cmd, sizeof(mkfs_cmd), "zfs create%s%s %s", + mop->mo_mkfsopts[0] ? " -o " : "", mop->mo_mkfsopts, + mop->mo_osname); + + vprint("\ncreating ZFS filesystem \"%s\"...\n", mop->mo_osname); + vprint("zfs_cmd = \"%s\"\n", mkfs_cmd); + + ret = run_command_err(mkfs_cmd, sizeof(mkfs_cmd), + mop->mo_ldd.ldd_flags & LDD_F_NEED_INDEX ? + "dataset already exists" : NULL); + + if (ret == -2 && mop->mo_ldd.ldd_flags & LDD_F_NEED_INDEX) { + vprint("Dataset \"%s\" already exists, retrying with " + "higher index.\n", mop->mo_osname); + /* Dataset already exists. + Increase svindex and retry */ + mop->mo_ldd.ldd_svindex++; + server_make_name(mop->mo_ldd.ldd_flags, + mop->mo_ldd.ldd_svindex, + mop->mo_ldd.ldd_fsname, + mop->mo_ldd.ldd_svname); + goto retry; + } + + if (ret) { + fatal(); + fprintf(stderr, "Unable to create filesystem %s (%d)\n", + mop->mo_osname, ret); + return ret; + } + + goto skip_format; } else { fprintf(stderr,"%s: unsupported fs type: %d (%s)\n", progname, mop->mo_ldd.ldd_mount_type, @@ -709,6 +791,8 @@ int make_lustre_backfs(struct mkfs_opts *mop) fatal(); fprintf(stderr, "Unable to build fs %s (%d)\n", dev, ret); } + +skip_format: return ret; } @@ -718,7 +802,7 @@ void print_ldd(char *str, struct lustre_disk_data *ldd) { printf("\n %s:\n", str); printf("Target: %s\n", ldd->ldd_svname); - if (ldd->ldd_svindex == INDEX_UNASSIGNED) + if (ldd->ldd_flags & LDD_F_NEED_INDEX) printf("Index: unassigned\n"); else printf("Index: %d\n", ldd->ldd_svindex); @@ -814,11 +898,6 @@ int write_local_files(struct mkfs_opts *mop) FILE *filep; int ret = 0; - if (mop->mo_ldd.ldd_mount_type == LDD_MT_ZFS) { - /* XXX: no LDD on ZFS yet */ - return 0; - } - /* Mount this device temporarily in order to write these files */ if (!mkdtemp(mntpt)) { fprintf(stderr, "%s: Can't create temp mount point %s: %s\n", @@ -1088,10 +1167,11 @@ void set_defaults(struct mkfs_opts *mop) if (get_os_version() == 24) mop->mo_ldd.ldd_mount_type = LDD_MT_EXT3; else - mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS; + mop->mo_ldd.ldd_mount_type = LDD_MT_ZFS; - mop->mo_ldd.ldd_svindex = INDEX_UNASSIGNED; + mop->mo_ldd.ldd_svindex = 0; mop->mo_stripe_count = 1; + mop->mo_pool_vdevs = NULL; } static inline void badopt(const char *opt, char *type) @@ -1179,6 +1259,7 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, {"erase-params", 0, 0, 'e'}, {"failnode", 1, 0, 'f'}, {"failover", 1, 0, 'f'}, + {"force-create", 0, 0, 'F'}, {"mgs", 0, 0, 'G'}, {"help", 0, 0, 'h'}, {"index", 1, 0, 'i'}, @@ -1263,6 +1344,9 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, failover = 1; break; } + case 'F': + force_zpool = 1; + break; case 'G': mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_MGS; break; @@ -1375,8 +1459,23 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, } }//while - /* Last arg is device */ - if (optind != argc - 1) { + /* optind points to device or pool name */ + strscpy(mop->mo_device, argv[optind], sizeof(mop->mo_device)); + + if (mop->mo_ldd.ldd_mount_type == LDD_MT_ZFS) { + /* Common mistake: user gave device name instead of pool name */ + if (mop->mo_device[0] == '/') { + fatal(); + fprintf(stderr, "Pool name cannot start with '/': %s\n" + "Please run '%s --help' for syntax help.\n", + mop->mo_device, progname); + return EINVAL; + } + /* next index (if existent) points to vdevs */ + if (optind < argc - 1) + mop->mo_pool_vdevs = (char **) &argv[optind + 1]; + } else if (optind != argc - 1) { + /* For non-ZFS backfs, last arg must be the device */ fatal(); fprintf(stderr, "Bad argument: %s\n", argv[optind]); return EINVAL; @@ -1407,17 +1506,12 @@ int main(int argc, char *const argv[]) memset(&mop, 0, sizeof(mop)); set_defaults(&mop); - /* device is last arg */ - strscpy(mop.mo_device, argv[argc - 1], sizeof(mop.mo_device)); - - /* Are we using a loop device? */ - ret = is_block(mop.mo_device); - if (ret < 0) - goto out; - if (ret == 0) - mop.mo_flags |= MO_IS_LOOP; - #ifdef TUNEFS + fatal(); + fprintf(stderr, "%s is non-functional in this release.\n", progname); + ret = EINVAL; + goto out; +#if 0 /* For tunefs, we must read in the old values before parsing any new ones. */ @@ -1444,6 +1538,7 @@ int main(int argc, char *const argv[]) if (verbose > 0) print_ldd("Read previous values", &(mop.mo_ldd)); #endif +#endif ret = parse_opts(argc, argv, &mop, &mountopts); if (ret) @@ -1499,6 +1594,13 @@ int main(int argc, char *const argv[]) case LDD_MT_EXT3: case LDD_MT_LDISKFS: case LDD_MT_LDISKFS2: { + /* Are we using a loop device? */ + ret = is_block(mop.mo_device); + if (ret < 0) + goto out; + if (ret == 0) + mop.mo_flags |= MO_IS_LOOP; + sprintf(always_mountopts, "errors=remount-ro"); if (IS_MDT(ldd) || IS_MGS(ldd)) strscat(always_mountopts, ",iopen_nopriv,user_xattr", @@ -1523,9 +1625,8 @@ int main(int argc, char *const argv[]) mop.mo_device); break; } - case LDD_MT_ZFS: { + case LDD_MT_ZFS: break; - } default: { fatal(); fprintf(stderr, "unknown fs type %d '%s'\n", @@ -1563,11 +1664,12 @@ int main(int argc, char *const argv[]) goto out; } - if (check_mtab_entry(mop.mo_device)) + if (ldd->ldd_mount_type != LDD_MT_ZFS && + check_mtab_entry(mop.mo_device)) return(EEXIST); /* Create the loopback file */ - if (mop.mo_flags & MO_IS_LOOP) { + if (ldd->ldd_mount_type != LDD_MT_ZFS && mop.mo_flags & MO_IS_LOOP) { ret = access(mop.mo_device, F_OK); if (ret) ret = errno; @@ -1609,12 +1711,14 @@ int main(int argc, char *const argv[]) } #endif - /* Write our config files */ - ret = write_local_files(&mop); - if (ret != 0) { - fatal(); - fprintf(stderr, "failed to write local files\n"); - goto out; + if (ldd->ldd_mount_type != LDD_MT_ZFS) { + /* Write our config files */ + ret = write_local_files(&mop); + if (ret != 0) { + fatal(); + fprintf(stderr, "failed to write local files\n"); + goto out; + } } out: diff --git a/lustre/utils/mount_utils.c b/lustre/utils/mount_utils.c index 02835cd..41272ef 100644 --- a/lustre/utils/mount_utils.c +++ b/lustre/utils/mount_utils.c @@ -59,7 +59,7 @@ void fatal(void) fprintf(stderr, "\n%s FATAL: ", progname); } -int run_command(char *cmd, int cmdsz) +int run_command_err(char *cmd, int cmdsz, char *error_msg) { char log[] = "/tmp/run_command_logXXXXXX"; int fd = -1, rc; @@ -85,7 +85,24 @@ int run_command(char *cmd, int cmdsz) /* Can't use popen because we need the rv of the command */ rc = system(cmd); if (rc && (fd >= 0)) { - char buf[128]; + char buf[256]; + + if (error_msg != NULL) { + if (snprintf(buf, sizeof(buf), "grep -q \"%s\" %s", + error_msg, log) >= sizeof(buf)) { + fatal(); + buf[sizeof(buf) - 1] = '\0'; + fprintf(stderr, "grep command buf overflow: " + "'%s'\n", buf); + return ENOMEM; + } + if (system(buf) == 0) { + /* The command had the expected error */ + rc = -2; + goto out; + } + } + FILE *fp; fp = fopen(log, "r"); if (fp) { @@ -95,11 +112,17 @@ int run_command(char *cmd, int cmdsz) fclose(fp); } } +out: if (fd >= 0) remove(log); return rc; } +int run_command(char *cmd, int cmdsz) +{ + return run_command_err(cmd, cmdsz, NULL); +} + int get_mountdata(char *dev, struct lustre_disk_data *mo_ldd) { @@ -191,7 +214,7 @@ static int stclient(char *type, char *arch) return 0; } - i = fread(cmd, 1, sizeof(cmd), fp); + i = fread(cmd, 1, sizeof(cmd) - 1, fp); if (i) { cmd[i] = 0; if (strcmp(cmd, "Record not found\n") != 0) { diff --git a/lustre/utils/mount_utils.h b/lustre/utils/mount_utils.h index a4a3898..7bdc9c2 100644 --- a/lustre/utils/mount_utils.h +++ b/lustre/utils/mount_utils.h @@ -41,6 +41,7 @@ void fatal(void); int run_command(char *, int); +int run_command_err(char *, int, char *); int get_mountdata(char *, struct lustre_disk_data *); void register_service_tags(char *, char *, char *); -- 1.8.3.1