From: Andreas Dilger Date: Wed, 8 Mar 2023 23:40:21 +0000 (-0700) Subject: LU-16623 lod: handle object allocation consistently X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=c398e8e51e0576dabbc8d735390a6f021ae8512b;p=fs%2Flustre-release.git LU-16623 lod: handle object allocation consistently Consistently handle the various OS_STATFS_* flags that indicate an OST or MDT is full or otherwise marked ineligible for use. Fix lod_statfs_check() so it skips MDTs with OS_STATFS_ENOINO for allocating dir stripes instead of only checking OST targets. In the LOD code, ltd_active=0 indicates that the device is not usable for new object allocations for a variety of reasons. That includes out of space or inodes, read-only, max_create_count=0, or disconnected export, not *only* that the OSP is disconnected from the OST as with imp_deactive. Targets marked ltd_active=0 will not be counted in ld_active_tgt_count, so these OSTs will not count toward stripe_count for stripe_count=-1 files. Set flags = LOD_USES_DEFAULT_STRIPE in lod_qos_prep_create() for stripe_count = -1 layouts and pass it to lod_stripe_count_min() to avoid use of *all* OSTs when free space is imbalanced or OSTs are not available, and be happy with allocations on 3/4 of OSTs. It looks like this functionality was missed when object allocations transitioned from the LOV to LOD module. Put the LOV_USES_* into an enum and rename to LOD_USES_* for consistency with current code. Apply the lod.*.max_stripe_count limits to PFL components as well as plain file layouts in lod_comp_entry_stripe_count(). Rename ltd_connecting to ltd_discon, since there is no guarantee that this target is actually *connecting*, only that it is currently disconnected. Use ltd_discon in places that checked ltd_active to decide if the OSP was disconnected from the OST, which shouldn't be skipped just because the OST is full or has creates disabled. 
Lustre-change: https://review.whamcloud.com/50250 Lustre-commit: ced540165ef573570b8a8cba6e43f79e5fc6539f LU-16981 lod: update llc_stripe_count after ost inactive If an OST gets deactivated while lod_ost_alloc_qos() is trying to allocate stripes for a file create, then normally this is caught and EAGAIN is returned which causes the lod_comp->llc_stripe_count to get updated to accurately reflect the stripe count. But there is a race condition and if the OST is deactivated after the call to ltd_qos_is_usable() but before the stripes are allocated, then the stripe count is never updated. This causes an LBUG later in lod_striped_create() because fewer stripes are allocated than the number in llc_stripe_count so it finds a stripe that is NULL. The solution is to properly update lod_comp->llc_stripe_count when the number of stripes created is less than expected. Lustre-change: https://review.whamcloud.com/51759 Lustre-commit: 78336aa166f4a7a0128a5891c747eecf26ff9565 Test-Parameters: testlist=sanity env=ONLY=27V,ONLY_REPEAT=100 Signed-off-by: Thomas Bertschinger Fixes: 7b124fef76 ("LU-4277 lod: handle os_state as a flag, check READONLY") Fixes: 5b147e47de ("LU-11115 lod: skip max_create_count=0 OST in QoS and RR algorithms") Fixes: c7f2e70a27 ("LU-1303 lod: QoS allocation policy") Fixes: c1d0a355a6 ("LU-12624 lod: alloc dir stripes by QoS") Fixes: 3c9580931d ("LU-9162 lod: option to set max stripe count per filesystem") Signed-off-by: Andreas Dilger Signed-off-by: Sergey Cheremencev Change-Id: Ifb9443fe6c80b4d7f82b442060db7ac8423ebbe5 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52729 Tested-by: jenkins Tested-by: Maloo --- diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index 129a157..a8c72d2 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -1585,11 +1585,11 @@ struct lu_tgt_desc { struct lu_tgt_qos ltd_qos; /* qos info per target */ struct obd_statfs ltd_statfs; time64_t ltd_statfs_age; - unsigned
long ltd_active:1,/* is this target up for requests */ - ltd_activate:1,/* should target be activated */ + unsigned long ltd_active:1,/* is target available for requests */ + ltd_activate:1,/* should LOV target be connected */ ltd_reap:1, /* should this target be deleted */ ltd_got_update_log:1, /* Already got update log */ - ltd_connecting:1; /* target is connecting */ + ltd_discon:1; /* LOD target disconnected from OST */ }; static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 7ecd2da..c0f69a3 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -1105,8 +1105,7 @@ static int lod_process_config(const struct lu_env *env, rc = lod_sub_prep_llog(env, lod, sub_tgt->ltd_tgt, sub_tgt->ltd_index); - if (rc == 0) - sub_tgt->ltd_active = 1; + sub_tgt->ltd_active = !rc; } else { lod_sub_fini_llog(env, sub_tgt->ltd_tgt, NULL); @@ -1866,7 +1865,7 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) lod_getref(&lod->lod_ost_descs); lod_foreach_ost(lod, tgt) { - if (!tgt->ltd_active) + if (tgt->ltd_discon) continue; rc = dt_sync(env, tgt->ltd_tgt); if (rc) { @@ -1886,7 +1885,7 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) lod_getref(&lod->lod_mdt_descs); lod_foreach_mdt(lod, tgt) { - if (!tgt->ltd_active) + if (tgt->ltd_discon) continue; rc = dt_sync(env, tgt->ltd_tgt); if (rc) { @@ -2486,7 +2485,7 @@ static int lod_obd_set_info_async(const struct lu_env *env, d = lu2lod_dev(obd->obd_lu_dev); lod_getref(&d->lod_ost_descs); lod_foreach_ost(d, tgt) { - if (!tgt->ltd_active) + if (tgt->ltd_discon) continue; rc2 = obd_set_info_async(env, tgt->ltd_exp, keylen, key, @@ -2498,8 +2497,9 @@ static int lod_obd_set_info_async(const struct lu_env *env, lod_getref(&d->lod_mdt_descs); lod_foreach_mdt(d, tgt) { - if (!tgt->ltd_active) + if (tgt->ltd_discon) continue; + rc2 = obd_set_info_async(env, tgt->ltd_exp, keylen, key, vallen, val, set); if (rc2 != 0 && rc == 
0) diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 4412738..227771c 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -43,8 +43,10 @@ #include #include -#define LOV_USES_ASSIGNED_STRIPE 0 -#define LOV_USES_DEFAULT_STRIPE 1 +enum lod_uses_hint { + LOD_USES_ASSIGNED_STRIPE = 0, + LOD_USES_DEFAULT_STRIPE, +}; /* Special values to remove LOV EA from disk */ #define LOVEA_DELETE_VALUES(size, count, offset, pool) \ @@ -766,8 +768,12 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, int comp_idx, __u64 reserve); __u16 lod_comp_entry_stripe_count(struct lod_object *lo, int comp_idx, bool is_dir); +__u16 lod_get_stripe_count_plain(struct lod_device *lod, struct lod_object *lo, + __u16 stripe_count, bool overstriping, + enum lod_uses_hint *flags); __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, - int comp_idx, __u16 stripe_count, bool overstriping); + int comp_idx, __u16 stripe_count, bool overstriping, + enum lod_uses_hint *flags); void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod, struct lu_tgt_descs *ltd); diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 426ebf5..16597b7 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -2643,11 +2643,12 @@ static int lod_replace_parent_fid(const struct lu_env *env, RETURN(rc); } -__u16 lod_comp_entry_stripe_count(struct lod_object *lo, - int comp_idx, bool is_dir) +__u16 lod_comp_entry_stripe_count(struct lod_object *lo, int comp_idx, + bool is_dir) { struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev); struct lod_layout_component *entry; + enum lod_uses_hint flags = LOD_USES_ASSIGNED_STRIPE; if (is_dir) return 0; @@ -2655,11 +2656,16 @@ __u16 lod_comp_entry_stripe_count(struct lod_object *lo, entry = &lo->ldo_comp_entries[comp_idx]; if (lod_comp_inited(entry)) return entry->llc_stripe_count; - else if ((__u16)-1 == entry->llc_stripe_count) - return 
lod->lod_ost_count; - else - return lod_get_stripe_count(lod, lo, comp_idx, - entry->llc_stripe_count, false); + if (entry->llc_stripe_count == (__u16)-1) + return lod_get_stripe_count_plain(lod, lo, + entry->llc_stripe_count, + entry->llc_pattern & + LOV_PATTERN_OVERSTRIPING, + &flags); + + return lod_get_stripe_count(lod, lo, comp_idx, entry->llc_stripe_count, + entry->llc_pattern & LOV_PATTERN_OVERSTRIPING, + &flags); } static int lod_comp_md_size(struct lod_object *lo, bool is_dir) @@ -7302,12 +7308,9 @@ static inline int lod_check_ost_avail(const struct lu_env *env, } ost = OST_TGT(lod, idx); - if (ost->ltd_statfs.os_state & - (OS_STATFS_READONLY | OS_STATFS_ENOSPC | OS_STATFS_ENOINO | - OS_STATFS_NOPRECREATE) || - ost->ltd_active == 0) { - CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n", - PFID(lod_object_fid(lo)), index, idx, rc); + if (ost->ltd_active == 0) { + CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail\n", + PFID(lod_object_fid(lo)), index, idx); return 0; } diff --git a/lustre/lod/lod_pool.c b/lustre/lod/lod_pool.c index fe16450..fe99956 100644 --- a/lustre/lod/lod_pool.c +++ b/lustre/lod/lod_pool.c @@ -757,7 +757,7 @@ void lod_spill_target_refresh(const struct lu_env *env, struct lod_device *lod, if (!test_bit(idx, lod->lod_ost_bitmap)) continue; tgt = OST_TGT(lod, idx); - if (tgt->ltd_active == 0) + if (!tgt->ltd_active) continue; sfs = &tgt->ltd_statfs; diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index fb16a5b..b9f8e7f 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -60,22 +60,24 @@ #define TGT_BAVAIL(i) (OST_TGT(lod,i)->ltd_statfs.os_bavail * \ OST_TGT(lod,i)->ltd_statfs.os_bsize) +/* check whether a target is available for new object allocation */ static inline int lod_statfs_check(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) { struct obd_statfs *sfs = &tgt->ltd_statfs; - if (((sfs->os_state & OS_STATFS_ENOSPC) || - (!ltd->ltd_is_mdt && sfs->os_state & OS_STATFS_ENOINO && - sfs->os_fprecreated == 
0))) + if (sfs->os_state & OS_STATFS_ENOSPC || + (sfs->os_state & OS_STATFS_ENOINO && + /* OST allocation allowed while precreated objects available */ + (ltd->ltd_is_mdt || sfs->os_fprecreated == 0))) return -ENOSPC; /* If the OST is readonly then we can't allocate objects there */ if (sfs->os_state & OS_STATFS_READONLY) return -EROFS; - /* object precreation is skipped on the OST with max_create_count=0 */ - if (!ltd->ltd_is_mdt && sfs->os_state & OS_STATFS_NOPRECREATE) + /* object precreation is skipped on targets with max_create_count=0 */ + if (sfs->os_state & OS_STATFS_NOPRECREATE) return -ENOBUFS; return 0; @@ -113,27 +115,25 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, info.os_enable_pre = 1; rc = dt_statfs_info(env, tgt->ltd_tgt, &tgt->ltd_statfs, &info); if (rc && rc != -ENOTCONN) - CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc); + CERROR("%s: statfs error: rc = %d\n", lod2obd(d)->obd_name, rc); - if (!rc) { + if (!rc) rc = lod_statfs_check(ltd, tgt); - if (rc == -ENOSPC) - return rc; - } + /* reserving space shouldn't be enough to mark an OST inactive */ if (reserve && (reserve + (info.os_reserved_mb_low << 20) > tgt->ltd_statfs.os_bavail * tgt->ltd_statfs.os_bsize)) return -ENOSPC; /* check whether device has changed state (active, inactive) */ - if (rc != 0 && tgt->ltd_active) { + if (rc && tgt->ltd_active) { /* turned inactive? */ spin_lock(&d->lod_lock); if (tgt->ltd_active) { tgt->ltd_active = 0; if (rc == -ENOTCONN) - tgt->ltd_connecting = 1; + tgt->ltd_discon = 1; LASSERT(desc->ld_active_tgt_count > 0); desc->ld_active_tgt_count--; @@ -142,15 +142,15 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, tgt->ltd_exp->exp_obd->obd_name); } spin_unlock(&d->lod_lock); - } else if (rc == 0 && tgt->ltd_active == 0) { + } else if (rc == 0 && !tgt->ltd_active) { /* turned active? 
*/ + spin_lock(&d->lod_lock); LASSERTF(desc->ld_active_tgt_count < desc->ld_tgt_count, "active tgt count %d, tgt nr %d\n", desc->ld_active_tgt_count, desc->ld_tgt_count); - spin_lock(&d->lod_lock); - if (tgt->ltd_active == 0) { + if (!tgt->ltd_active) { tgt->ltd_active = 1; - tgt->ltd_connecting = 0; + tgt->ltd_discon = 0; desc->ld_active_tgt_count++; set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags); CDEBUG(D_CONFIG, "%s: turns active\n", @@ -169,20 +169,6 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, RETURN(rc); } -static int lod_is_tgt_usable(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) -{ - int rc; - - rc = lod_statfs_check(ltd, tgt); - if (rc) - return rc; - - if (!tgt->ltd_active) - return -ENOTCONN; - - return 0; -} - /** * Maintain per-target statfs data. * @@ -425,17 +411,18 @@ out: /** * Calculate a minimum acceptable stripe count. * - * Return an acceptable stripe count depending on flag LOV_USES_DEFAULT_STRIPE: - * all stripes or 3/4 of stripes. + * Return an acceptable stripe count depending on flag LOD_USES_DEFAULT_STRIPE: + * all stripes or 3/4 of stripes. The code is written this way to avoid + * returning 0 for stripe_count < 4, like "stripe_count * 3 / 4" would do. * * \param[in] stripe_count number of stripes requested - * \param[in] flags 0 or LOV_USES_DEFAULT_STRIPE + * \param[in] flags 0 or LOD_USES_DEFAULT_STRIPE * * \retval acceptable stripecount */ -static int min_stripe_count(__u32 stripe_count, int flags) +static int lod_stripe_count_min(__u32 stripe_count, enum lod_uses_hint flags) { - return (flags & LOV_USES_DEFAULT_STRIPE ? + return (flags & LOD_USES_DEFAULT_STRIPE ? stripe_count - (stripe_count / 4) : stripe_count); } @@ -717,7 +704,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * all the internal structures (statfs cache, array of available OSTs sorted * with regard to OSS, etc). 
The number of stripes required is taken from the * object (must be prepared by the caller), but can change if the flag - * LOV_USES_DEFAULT_STRIPE is supplied. The caller should ensure nobody else + * LOD_USES_DEFAULT_STRIPE is supplied. The caller should ensure nobody else * is trying to create a striping on the object in parallel. All the internal * structures (like pools, etc) are protected and no additional locking is * required. The function succeeds even if a single stripe is allocated. To save @@ -728,7 +715,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * \param[in] lo LOD object * \param[out] stripe striping created * \param[out] ost_indices ost indices of striping created - * \param[in] flags allocation flags (0 or LOV_USES_DEFAULT_STRIPE) + * \param[in] flags allocation flags (0 or LOD_USES_DEFAULT_STRIPE) * \param[in] th transaction handle * \param[in] comp_idx index of ldo_comp_entries * @@ -738,8 +725,8 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, */ static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo, struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx, - __u64 reserve) + enum lod_uses_hint flags, struct thandle *th, + int comp_idx, __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); @@ -757,7 +744,7 @@ static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo, LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL); lod_comp = &lo->ldo_comp_entries[comp_idx]; stripe_count = lod_comp->llc_stripe_count; - stripe_count_min = min_stripe_count(stripe_count, flags); + stripe_count_min = lod_stripe_count_min(stripe_count, flags); if (lod_comp->llc_pool != NULL) pool = lod_find_pool(m, lod_comp->llc_pool); @@ -842,7 +829,7 @@ repeat_find: ost_indices, th, &overstriped, reserve); - if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting) + if (rc != 0 && 
OST_TGT(m, ost_idx)->ltd_discon) ost_connecting = 1; } if ((speed < 2) && (stripe_idx < stripe_count_min)) { @@ -1019,13 +1006,14 @@ repeat_find: if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx)) continue; - rc = lod_is_tgt_usable(ltd, mdt); - if (rc) { - if (mdt->ltd_connecting) - tgt_connecting = 1; + if (mdt->ltd_discon) { + tgt_connecting = 1; continue; } + if (lod_statfs_check(ltd, mdt)) + continue; + /* try to use another OSP if this one is degraded */ if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED && !use_degraded) { @@ -1047,7 +1035,7 @@ repeat_find: QOS_DEBUG("can't alloc stripe on #%u: %d\n", mdt->ltd_index, (int) PTR_ERR(dto)); - if (mdt->ltd_connecting) + if (mdt->ltd_discon) tgt_connecting = 1; continue; } @@ -1216,8 +1204,8 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, static int lod_ost_alloc_specific(const struct lu_env *env, struct lod_object *lo, struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx, - __u64 reserve) + enum lod_uses_hint flags, struct thandle *th, + int comp_idx, __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); @@ -1496,7 +1484,7 @@ out: * configuration (# of stripes, offset, pool) is taken from the object and * is prepared by the caller. * - * If LOV_USES_DEFAULT_STRIPE is not passed and prepared configuration can't + * If LOD_USES_DEFAULT_STRIPE is not passed and prepared configuration can't * be met due to too few OSTs, then allocation fails. If the flag is passed * fewer than 3/4 of the requested number of stripes can be allocated, then * allocation fails. 
@@ -1513,7 +1501,7 @@ out: * \param[in] lo LOD object * \param[out] stripe striping created * \param[out] ost_indices ost indices of striping created - * \param[in] flags 0 or LOV_USES_DEFAULT_STRIPE + * \param[in] flags 0 or LOD_USES_DEFAULT_STRIPE * \param[in] th transaction handle * \param[in] comp_idx index of ldo_comp_entries * @@ -1524,8 +1512,8 @@ out: */ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx, - __u64 reserve) + enum lod_uses_hint flags, struct thandle *th, + int comp_idx, __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); @@ -1550,7 +1538,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL); lod_comp = &lo->ldo_comp_entries[comp_idx]; stripe_count = lod_comp->llc_stripe_count; - stripe_count_min = min_stripe_count(stripe_count, flags); + stripe_count_min = lod_stripe_count_min(stripe_count, flags); if (stripe_count_min < 1) RETURN(-EINVAL); @@ -1694,11 +1682,13 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, QOS_DEBUG("stripe=%d to idx=%d\n", nfound, idx); /* - * do not put >1 objects on a single OST, except for - * overstriping + * In case of QOS it makes sense to check components + * only for FLR and if current component doesn't support + * overstriping. 
*/ - if ((lod_comp_is_ost_used(env, lo, idx)) && - !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) + if (lo->ldo_mirror_count > 1 && + !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) + && lod_comp_is_ost_used(env, lo, idx)) continue; if (lod_qos_is_tgt_used(env, idx, nfound)) { @@ -1739,7 +1729,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, } } - if (unlikely(nfound != stripe_count)) { + if (unlikely(nfound < stripe_count_min)) { /* * when the decision to use weighted algorithm was made * we had enough appropriate OSPs, but this state can @@ -1759,6 +1749,8 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, set_bit(LQ_DIRTY, &lod->lod_ost_descs.ltd_qos.lq_flags); clear_bit(LQ_SAME_SPACE, &lod->lod_ost_descs.ltd_qos.lq_flags); rc = -EAGAIN; + } else if (nfound < lod_comp->llc_stripe_count) { + lod_comp->llc_stripe_count = nfound; } /* If there are enough OSTs, a component with overstriping requessted @@ -1879,8 +1871,7 @@ int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo, mdt = LTD_TGT(ltd, pool->op_array[i]); mdt->ltd_qos.ltq_usable = 0; - rc = lod_is_tgt_usable(ltd, mdt); - if (rc) + if (mdt->ltd_discon || lod_statfs_check(ltd, mdt)) continue; if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED) @@ -2013,28 +2004,45 @@ unlock: * * \retval the maximum usable stripe count */ +__u16 lod_get_stripe_count_plain(struct lod_device *lod, struct lod_object *lo, + __u16 stripe_count, bool overstriping, + enum lod_uses_hint *flags) +{ + struct lov_desc *lov_desc = &lod->lod_ost_descs.ltd_lov_desc; + + if (!stripe_count) + stripe_count = lov_desc->ld_default_stripe_count; + + /* Overstriping allows more stripes than targets */ + if (stripe_count > lov_desc->ld_active_tgt_count && !overstriping) { + *flags |= LOD_USES_DEFAULT_STRIPE; + if (stripe_count == LOV_ALL_STRIPES && lod->lod_max_stripecount) + stripe_count = lod->lod_max_stripecount; + else + stripe_count = 
lov_desc->ld_active_tgt_count; + } + if (!stripe_count) + stripe_count = 1; + + if (overstriping && stripe_count > LOV_MAX_STRIPE_COUNT) + stripe_count = LOV_MAX_STRIPE_COUNT; + + return stripe_count; +} + __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, - int comp_idx, __u16 stripe_count, bool overstriping) + int comp_idx, __u16 stripe_count, bool overstriping, + enum lod_uses_hint *flags) { __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; /* max stripe count is based on OSD ea size */ unsigned int easize = lod->lod_osd_max_easize; int i; + ENTRY; - if (stripe_count == (__u16)(-1) && lod->lod_max_stripecount) - stripe_count = lod->lod_max_stripecount; - if (!stripe_count) - stripe_count = - lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_count; - if (!stripe_count) - stripe_count = 1; - /* Overstriping allows more stripes than targets */ - if (stripe_count > - lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count && - !overstriping) - stripe_count = - lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count; + stripe_count = lod_get_stripe_count_plain(lod, lo, stripe_count, + overstriping, flags); if (lo->ldo_is_composite) { struct lod_layout_component *lod_comp; @@ -2711,13 +2719,13 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, int comp_idx, __u64 reserve) { struct lod_layout_component *lod_comp; - struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); - int stripe_len; - int flag = LOV_USES_ASSIGNED_STRIPE; - int i, rc = 0; + struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid; struct dt_object **stripe = NULL; __u32 *ost_indices = NULL; + enum lod_uses_hint flags = LOD_USES_ASSIGNED_STRIPE; + int stripe_len; + int i, rc = 0; ENTRY; LASSERT(lo); @@ -2755,7 +2763,8 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, stripe_len = lod_get_stripe_count(d, lo, comp_idx, lod_comp->llc_stripe_count, lod_comp->llc_pattern & - 
LOV_PATTERN_OVERSTRIPING); + LOV_PATTERN_OVERSTRIPING, + &flags); if (stripe_len == 0) GOTO(out, rc = -ERANGE); @@ -2791,14 +2800,14 @@ repeat: lod_collect_avoidance(lo, lag, comp_idx); rc = lod_ost_alloc_qos(env, lo, stripe, ost_indices, - flag, th, comp_idx, reserve); + flags, th, comp_idx, reserve); if (rc == -EAGAIN) rc = lod_ost_alloc_rr(env, lo, stripe, - ost_indices, flag, th, + ost_indices, flags, th, comp_idx, reserve); } else { rc = lod_ost_alloc_specific(env, lo, stripe, - ost_indices, flag, th, + ost_indices, flags, th, comp_idx, reserve); } put_ldts: diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index d3e26db..838545f 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -801,12 +801,12 @@ static int osp_statfs(const struct lu_env *env, struct dt_device *dev, info->os_reserved_mb_high = d->opd_reserved_mb_high; } - CDEBUG(D_OTHER, "%s: %llu blocks, %llu free, %llu avail, " - "%u bsize, %u reserved mb low, %u reserved mb high, " - "%llu files, %llu free files\n", d->opd_obd->obd_name, + CDEBUG(D_OTHER, + "%s: blocks=%llu, bfree=%llu, bavail=%llu, bsize=%u, reserved_mb_low=%u, reserved_mb_high=%u, files=%llu, ffree=%llu, state=%x\n", + d->opd_obd->obd_name, sfs->os_blocks, sfs->os_bfree, sfs->os_bavail, sfs->os_bsize, d->opd_reserved_mb_low, d->opd_reserved_mb_high, - sfs->os_files, sfs->os_ffree); + sfs->os_files, sfs->os_ffree, sfs->os_state); if (d->opd_pre == NULL || (info && !info->os_enable_pre)) RETURN(0); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 197e98d..98f4afe 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -3659,10 +3659,10 @@ test_202() { ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' ')) verify_comp_attr stripe-count $tf ${ids[0]} 1 - $LFS setstripe --component-add -E 2M -c -1 $tf + $LFS setstripe --component-add -E 2M -c $OSTCOUNT $tf ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' ')) verify_comp_attr stripe-count $tf 
${ids[0]} 1 - verify_comp_attr stripe-count $tf ${ids[1]} -1 + verify_comp_attr stripe-count $tf ${ids[1]} $OSTCOUNT dd if=/dev/zero of=$tf bs=1M count=2 ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' ')) diff --git a/lustre/tests/sanity-pfl.sh b/lustre/tests/sanity-pfl.sh index 169f46d..04c175d 100644 --- a/lustre/tests/sanity-pfl.sh +++ b/lustre/tests/sanity-pfl.sh @@ -867,8 +867,8 @@ test_15() { $LFS setstripe -E 1M -S 1M -E 10M -E eof $parent/f1 || error "create f1" $LFS setstripe -E 4M -E 20M -E eof $parent/f2 || error "create f2" test_mkdir $parent/subdir - $LFS setstripe -E 6M -S 1M -c1 -E 30M -c4 -E eof -c -1 $parent/subdir || - error "setstripe to subdir" + $LFS setstripe -E 6M -S 1M -c1 -E 30M -c4 -E eof -c $OSTCOUNT \ + $parent/subdir || error "setstripe to subdir" $LFS setstripe -E 8M -E eof $parent/subdir/f3 || error "create f3" $LFS setstripe -c 1 $parent/subdir/f4 || error "create f4" diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7882aa9..cb1d18e 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3271,6 +3271,45 @@ test_27R() { } run_test 27R "test max_stripecount limitation when stripe count is set to -1" +test_27V() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" + (( $OSTCOUNT >= 4 )) || skip_env "needs >= 4 OSTs" + + local dir=$DIR/$tdir + local osp_param=osp.$FSNAME-OST0000-osc-MDT0000.max_create_count + local lod_param=lod.$FSNAME-MDT0000-mdtlov.qos_threshold_rr + local saved_max=$(do_facet mds1 $LCTL get_param -n $osp_param) + local saved_qos=$(do_facet mds1 $LCTL get_param -n $lod_param) + local pid + + stack_trap "do_facet mds1 $LCTL set_param $osp_param=$saved_max" + + do_facet mds1 $LCTL set_param $lod_param=0 + stack_trap "do_facet mds1 $LCTL set_param $lod_param=$saved_qos" + + $LFS setdirstripe --mdt-count=1 --mdt-index=0 $dir + stack_trap "rm -rf $dir" + + # exercise race in LU-16981 with deactivating OST while creating a file + ( + while true; do + do_facet mds1 $LCTL 
set_param $osp_param=0 > /dev/null + sleep 0.1 + do_facet mds1 \ + $LCTL set_param $osp_param=$saved_max > /dev/null + done + ) & + + pid=$! + stack_trap "kill -9 $pid" + + # errors here are OK so ignore them (just don't want to crash) + $LFS setstripe -c -1 $dir/f.{1..200} 2> /dev/null + + return 0 +} +run_test 27V "creating widely striped file races with deactivating OST" + # createtest also checks that device nodes are created and # then visible correctly (#2091) test_28() { # bug 2091