From 01138321c7ce393c189a7ed11559c0938ce9f17e Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Fri, 30 Dec 2011 14:24:10 +0800 Subject: [PATCH 1/1] LU-80 lov: large stripe count support Currently a file can be striped across OSTs up to a limit of 160 stripes. This patch raises that limit to 2000 and makes it possible to go to even larger stripe counts. Signed-off-by: Alexander Boyko Change-Id: I42e1aad35dd056faac23a0d5b025e0a23fc4ec2f Reviewed-on: http://review.whamcloud.com/1111 Reviewed-by: Yu Jian Tested-by: Hudson Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/include/dt_object.h | 1 + lustre/include/lustre/lustre_idl.h | 15 +++++++-- lustre/include/lustre_net.h | 10 +++--- lustre/include/obd.h | 5 +++ lustre/include/obd_lov.h | 32 +++++++++++++++++++- lustre/lov/lov_internal.h | 2 +- lustre/lov/lov_pack.c | 62 +++++++++++++++++++++++--------------- lustre/lov/lov_qos.c | 6 ++-- lustre/lov/lov_request.c | 4 +-- lustre/mdd/mdd_lov.c | 4 +++ lustre/mds/mds_log.c | 3 ++ lustre/mds/mds_lov.c | 16 ++++++---- lustre/osd-ldiskfs/osd_handler.c | 8 +++++ 13 files changed, 124 insertions(+), 44 deletions(-) diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index 39fde4b..db232da 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -81,6 +81,7 @@ struct dt_device_param { unsigned ddp_max_nlink; unsigned ddp_block_shift; mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; }; /** diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 732846c..f030e2b 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1141,7 +1141,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \ OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \ - OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES) + OBD_CONNECT_64BITHASH | 
OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV) @@ -2130,7 +2131,17 @@ enum seq_op { #define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ #define LOV_MIN_STRIPE_SIZE (1<lsm_magic; - - /* If we are just sizing the EA, limit the stripe count - * to the actual number of OSTs in this filesystem. */ - if (!lmmp) { - stripe_count = lov_get_stripecnt(lov, - lsm->lsm_stripe_count); - lsm->lsm_stripe_count = stripe_count; - } else { - stripe_count = lsm->lsm_stripe_count; - } } else { - /* No needs to allocated more than LOV_MAX_STRIPE_COUNT. - * Anyway, this is pretty inaccurate since ld_tgt_count now - * represents max index and we should rely on the actual number - * of OSTs instead */ - stripe_count = min((__u32)LOV_MAX_STRIPE_COUNT, - lov->desc.ld_tgt_count); - if (lmmp && *lmmp) lmm_magic = le32_to_cpu((*lmmp)->lmm_magic); else @@ -181,6 +164,27 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, } + if (lsm) { + /* If we are just sizing the EA, limit the stripe count + * to the actual number of OSTs in this filesystem. */ + if (!lmmp) { + stripe_count = lov_get_stripecnt(lov, lmm_magic, + lsm->lsm_stripe_count); + lsm->lsm_stripe_count = stripe_count; + } else { + stripe_count = lsm->lsm_stripe_count; + } + } else { + /* No need to allocate more than maximum supported stripes. 
+ * Anyway, this is pretty inaccurate since ld_tgt_count now + * represents max index and we should rely on the actual number + * of OSTs instead */ + stripe_count = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize, + lmm_magic); + if (stripe_count > lov->desc.ld_tgt_count) + stripe_count = lov->desc.ld_tgt_count; + } + /* XXX LOV STACKING call into osc for sizes */ lmm_size = lov_mds_md_size(stripe_count, lmm_magic); @@ -245,19 +249,26 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, } /* Find the max stripecount we should use */ -int lov_get_stripecnt(struct lov_obd *lov, __u32 stripe_count) +__u32 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u32 stripe_count) { + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + if (!stripe_count) stripe_count = lov->desc.ld_default_stripe_count; if (stripe_count > lov->desc.ld_active_tgt_count) stripe_count = lov->desc.ld_active_tgt_count; if (!stripe_count) stripe_count = 1; - /* for now, we limit the stripe count directly, when bug 4424 is - * fixed this needs to be somewhat dynamic based on whether ext3 - * can handle larger EA sizes. 
*/ - if (stripe_count > LOV_MAX_STRIPE_COUNT) - stripe_count = LOV_MAX_STRIPE_COUNT; + + /* stripe count is based on whether ldiskfs can handle + * larger EA sizes */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize, + magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; return stripe_count; } @@ -349,8 +360,8 @@ int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, RETURN(rc); magic = le32_to_cpu(lmm->lmm_magic); } else { - stripe_count = lov_get_stripecnt(lov, 0); magic = LOV_MAGIC; + stripe_count = lov_get_stripecnt(lov, magic, 0); } /* If we aren't passed an lsmp struct, we just want the size */ @@ -450,7 +461,8 @@ static int __lov_setstripe(struct obd_export *exp, int max_lmm_size, lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count); RETURN(-EINVAL); } - stripe_count = lov_get_stripecnt(lov, lumv1->lmm_stripe_count); + stripe_count = lov_get_stripecnt(lov, lmm_magic, + lumv1->lmm_stripe_count); if (max_lmm_size) { int max_stripes = (max_lmm_size - diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index f454a59..d114aa3 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -1005,7 +1005,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) LASSERT(src_oa->o_valid & OBD_MD_FLGROUP); if (set->set_oi->oi_md == NULL) { - int stripes_def = lov_get_stripecnt(lov, 0); + __u32 stripes_def = lov_get_stripecnt(lov, LOV_MAGIC, 0); /* If the MDS file was truncated up to some size, stripe over * enough OSTs to allow the file to be created at that size. 
@@ -1029,8 +1029,8 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) if (stripes < stripes_def) stripes = stripes_def; } else { - flag = LOV_USES_DEFAULT_STRIPE; - stripes = stripes_def; + flag = LOV_USES_DEFAULT_STRIPE; + stripes = stripes_def; } rc = lov_alloc_memmd(&set->set_oi->oi_md, stripes, diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index 3e29f43..d8c1fea 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -1495,8 +1495,8 @@ int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success) ENTRY; if (success) { - __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, 0); - + __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, + LOV_MAGIC, 0); if (osfs->os_files != LOV_U64_MAX) do_div(osfs->os_files, expected_stripes); if (osfs->os_ffree != LOV_U64_MAX) diff --git a/lustre/mdd/mdd_lov.c b/lustre/mdd/mdd_lov.c index 46d1580..69df68e 100644 --- a/lustre/mdd/mdd_lov.c +++ b/lustre/mdd/mdd_lov.c @@ -152,6 +152,9 @@ int mdd_init_obd(const struct lu_env *env, struct mdd_device *mdd, obd->obd_recovering = 1; cfs_spin_unlock(&obd->obd_dev_lock); obd->u.mds.mds_id = mds_id; + obd->u.obt.obt_osd_properties.osd_max_ea_size = + mdd->mdd_dt_conf.ddp_max_ea_size; + rc = class_setup(obd, lcfg); if (rc) GOTO(class_detach, rc); @@ -163,6 +166,7 @@ int mdd_init_obd(const struct lu_env *env, struct mdd_device *mdd, obd->obd_upcall.onu_upcall = mdd_notify; obd->obd_upcall.onu_owner = mdd; mdd->mdd_obd_dev = obd; + EXIT; class_detach: if (rc) diff --git a/lustre/mds/mds_log.c b/lustre/mds/mds_log.c index 91ea64c..f256053 100644 --- a/lustre/mds/mds_log.c +++ b/lustre/mds/mds_log.c @@ -297,6 +297,9 @@ static int mds_llog_add_unlink(struct obd_device *obd, struct llog_ctxt *ctxt; int rc; + if (cookies < lsm->lsm_stripe_count) + RETURN(rc = -EFBIG); + /* first prepare unlink log record */ OBD_ALLOC_PTR(lur); if (!lur) diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index ce9ce98..df26eda 
100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -201,12 +201,14 @@ static int mds_lov_update_max_ost(struct mds_obd *mds, obd_id index) /* workaround - New target not in objids file; increase mdsize */ /* ld_tgt_count is used as the max index everywhere, despite its name. */ if (data[off] == 0) { + __u32 max_easize; __u32 stripes; + max_easize = mds->mds_obt.obt_osd_properties.osd_max_ea_size; data[off] = 1; mds->mds_lov_objid_count++; - stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT, - mds->mds_lov_objid_count); + stripes = min(lov_mds_md_stripecnt(max_easize, LOV_MAGIC_V3), + mds->mds_lov_objid_count); mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3); mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie); @@ -368,8 +370,8 @@ EXPORT_SYMBOL(mds_lov_update_objids); static int mds_lov_update_from_read(struct mds_obd *mds, obd_id *data, __u32 count) { - __u32 i; - __u32 stripes; + __u32 max_easize = mds->mds_obt.obt_osd_properties.osd_max_ea_size; + __u32 i, stripes; for (i = 0; i < count; i++) { if (data[i] == 0) @@ -378,7 +380,7 @@ static int mds_lov_update_from_read(struct mds_obd *mds, obd_id *data, mds->mds_lov_objid_count++; } - stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT, + stripes = min(lov_mds_md_stripecnt(max_easize, LOV_MAGIC_V3), mds->mds_lov_objid_count); mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3); @@ -717,12 +719,14 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FULL20 | OBD_CONNECT_CHANGE_QS | OBD_CONNECT_AT | OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | - OBD_CONNECT_SOM; + OBD_CONNECT_SOM | OBD_CONNECT_MAX_EASIZE; #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; #endif data->ocd_version = LUSTRE_VERSION_CODE; data->ocd_group = mdt_to_obd_objseq(mds->mds_id); + data->ocd_max_easize = mds->mds_obt.obt_osd_properties.osd_max_ea_size; + /* send max bytes per rpc */ data->ocd_brw_size = PTLRPC_MAX_BRW_PAGES << 
CFS_PAGE_SHIFT; /* send the list of supported checksum types */ diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index c461662..bffa028 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -939,6 +939,14 @@ static void osd_conf_get(const struct lu_env *env, param->ddp_mntopts |= MNTOPT_USERXATTR; if (test_opt(sb, POSIX_ACL)) param->ddp_mntopts |= MNTOPT_ACL; + +#if defined(LDISKFS_FEATURE_INCOMPAT_EA_INODE) + if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, LDISKFS_FEATURE_INCOMPAT_EA_INODE)) + param->ddp_max_ea_size = LDISKFS_XATTR_MAX_LARGE_EA_SIZE; + else +#endif + param->ddp_max_ea_size = sb->s_blocksize; + } /** -- 1.8.3.1