X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flod%2Flod_lov.c;h=3dd8696659b5fbf6e8876e91ffc6c7ed356ed074;hp=2197e415d81b541b4e76e212c86e2ce2b0aebfe1;hb=6a08df2d0effc7aa9d2a4428ff38c3b5df73d118;hpb=6a20bdcc608bc2b933774b9f34ec25395e920a54 diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 2197e41..3dd8696 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -76,31 +76,20 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) ltd->ltd_refcount--; if (ltd->ltd_refcount == 0 && ltd->ltd_death_row) { struct lod_tgt_desc *tgt_desc, *tmp; - struct list_head kill; - unsigned int idx; + LIST_HEAD(kill); CDEBUG(D_CONFIG, "destroying %d ltd desc\n", ltd->ltd_death_row); - INIT_LIST_HEAD(&kill); - - cfs_foreach_bit(ltd->ltd_tgt_bitmap, idx) { - tgt_desc = LTD_TGT(ltd, idx); + ltd_foreach_tgt_safe(ltd, tgt_desc, tmp) { LASSERT(tgt_desc); - if (!tgt_desc->ltd_reap) continue; list_add(&tgt_desc->ltd_kill, &kill); - LTD_TGT(ltd, idx) = NULL; - /*FIXME: only support ost pool for now */ - if (ltd == &lod->lod_ost_descs) { - lod_ost_pool_remove(&lod->lod_pool_info, idx); - if (tgt_desc->ltd_active) - lod->lod_desc.ld_active_tgt_count--; - } - ltd->ltd_tgtnr--; - cfs_bitmap_clear(ltd->ltd_tgt_bitmap, idx); + lu_tgt_pool_remove(&ltd->ltd_tgt_pool, + tgt_desc->ltd_index); + ltd_del_tgt(ltd, tgt_desc); ltd->ltd_death_row--; } mutex_unlock(&ltd->ltd_mutex); @@ -108,17 +97,8 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) list_for_each_entry_safe(tgt_desc, tmp, &kill, ltd_kill) { int rc; + list_del(&tgt_desc->ltd_kill); - if (ltd == &lod->lod_ost_descs) { - /* remove from QoS structures */ - rc = qos_del_tgt(lod, tgt_desc); - if (rc) - CERROR("%s: qos_del_tgt(%s) failed:" - "rc = %d\n", - lod2obd(lod)->obd_name, - obd_uuid2str(&tgt_desc->ltd_uuid), - rc); - } rc = obd_disconnect(tgt_desc->ltd_exp); if (rc) CERROR("%s: failed to disconnect %s: rc = %d\n", @@ -133,60 +113,6 @@ void
lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) } /** - * Expand size of target table. - * - * When the target table is full, we have to extend the table. To do so, - * we allocate new memory with some reserve, move data from the old table - * to the new one and release memory consumed by the old table. - * Notice we take ltd_rw_sem exclusively to ensure atomic switch. - * - * \param[in] ltd target table - * \param[in] newsize new size of the table - * - * \retval 0 on success - * \retval -ENOMEM if reallocation failed - */ -static int ltd_bitmap_resize(struct lod_tgt_descs *ltd, __u32 newsize) -{ - struct cfs_bitmap *new_bitmap, *old_bitmap = NULL; - int rc = 0; - ENTRY; - - /* grab write reference on the lod. Relocating the array requires - * exclusive access */ - - down_write(&ltd->ltd_rw_sem); - if (newsize <= ltd->ltd_tgts_size) - /* someone else has already resize the array */ - GOTO(out, rc = 0); - - /* allocate new bitmap */ - new_bitmap = CFS_ALLOCATE_BITMAP(newsize); - if (!new_bitmap) - GOTO(out, rc = -ENOMEM); - - if (ltd->ltd_tgts_size > 0) { - /* the bitmap already exists, we need - * to copy data from old one */ - cfs_bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap); - old_bitmap = ltd->ltd_tgt_bitmap; - } - - ltd->ltd_tgts_size = newsize; - ltd->ltd_tgt_bitmap = new_bitmap; - - if (old_bitmap) - CFS_FREE_BITMAP(old_bitmap); - - CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size); - - EXIT; -out: - up_write(&ltd->ltd_rw_sem); - return rc; -} - -/** * Connect LOD to a new OSP and add it to the target table.
* * Connect to the OSP device passed, initialize all the internal @@ -219,7 +145,6 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, struct lustre_cfg *lcfg; struct obd_uuid obd_uuid; bool for_ost; - bool lock = false; bool connected = false; ENTRY; @@ -317,73 +242,27 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, tgt_desc->ltd_index = index; tgt_desc->ltd_active = active; - lod_getref(ltd); - if (index >= ltd->ltd_tgts_size) { - /* we have to increase the size of the lod_osts array */ - __u32 newsize; - - newsize = max(ltd->ltd_tgts_size, (__u32)2); - while (newsize < index + 1) - newsize = newsize << 1; - - /* lod_bitmap_resize() needs lod_rw_sem - * which we hold with th reference */ - lod_putref(lod, ltd); - - rc = ltd_bitmap_resize(ltd, newsize); - if (rc) - GOTO(out_desc, rc); - - lod_getref(ltd); - } - + down_write(&ltd->ltd_rw_sem); mutex_lock(&ltd->ltd_mutex); - lock = true; - if (cfs_bitmap_check(ltd->ltd_tgt_bitmap, index)) { - CERROR("%s: device %d is registered already\n", obd->obd_name, - index); - GOTO(out_mutex, rc = -EEXIST); - } - - if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { - OBD_ALLOC_PTR(ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK]); - if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { - CERROR("can't allocate index to add %s\n", - obd->obd_name); - GOTO(out_mutex, rc = -ENOMEM); - } - } - - if (for_ost) { - /* pool and qos are not supported for MDS stack yet */ - rc = lod_ost_pool_add(&lod->lod_pool_info, index, - lod->lod_osts_size); - if (rc) { - CERROR("%s: can't set up pool, failed with %d\n", - obd->obd_name, rc); - GOTO(out_mutex, rc); - } + rc = ltd_add_tgt(ltd, tgt_desc); + if (rc) + GOTO(out_mutex, rc); - rc = qos_add_tgt(lod, tgt_desc); - if (rc) { - CERROR("%s: qos_add_tgt failed with %d\n", - obd->obd_name, rc); - GOTO(out_pool, rc); - } + rc = lu_qos_add_tgt(&ltd->ltd_qos, tgt_desc); + if (rc) + GOTO(out_del_tgt, rc); - /* The new OST is now a full citizen */ - if
(index >= lod->lod_desc.ld_tgt_count) - lod->lod_desc.ld_tgt_count = index + 1; - if (active) - lod->lod_desc.ld_active_tgt_count++; + rc = lu_tgt_pool_add(&ltd->ltd_tgt_pool, index, + ltd->ltd_lov_desc.ld_tgt_count); + if (rc) { + CERROR("%s: can't set up pool, failed with %d\n", + obd->obd_name, rc); + GOTO(out_del_tgt, rc); } - LTD_TGT(ltd, index) = tgt_desc; - cfs_bitmap_set(ltd->ltd_tgt_bitmap, index); - ltd->ltd_tgtnr++; mutex_unlock(&ltd->ltd_mutex); - lod_putref(lod, ltd); - lock = false; + up_write(&ltd->ltd_rw_sem); + if (lod->lod_recovery_completed) lu_dev->ld_ops->ldo_recovery_complete(env, lu_dev); @@ -405,28 +284,16 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, RETURN(rc); out_fini_llog: lod_sub_fini_llog(env, tgt_desc->ltd_tgt, - tgt_desc->ltd_recovery_thread); + &tgt_desc->ltd_recovery_task); out_ltd: - lod_getref(ltd); + down_write(&ltd->ltd_rw_sem); mutex_lock(&ltd->ltd_mutex); - lock = true; - if (!for_ost && LTD_TGT(ltd, index)->ltd_recovery_thread != NULL) { - struct ptlrpc_thread *thread; - - thread = LTD_TGT(ltd, index)->ltd_recovery_thread; - OBD_FREE_PTR(thread); - } - ltd->ltd_tgtnr--; - cfs_bitmap_clear(ltd->ltd_tgt_bitmap, index); - LTD_TGT(ltd, index) = NULL; -out_pool: - lod_ost_pool_remove(&lod->lod_pool_info, index); + lu_tgt_pool_remove(&ltd->ltd_tgt_pool, index); +out_del_tgt: + ltd_del_tgt(ltd, tgt_desc); out_mutex: - if (lock) { - mutex_unlock(&ltd->ltd_mutex); - lod_putref(lod, ltd); - } -out_desc: + mutex_unlock(&ltd->ltd_mutex); + up_write(&ltd->ltd_rw_sem); OBD_FREE_PTR(tgt_desc); out_cleanup: /* XXX OSP needs us to send down LCFG_CLEANUP because it uses @@ -453,27 +320,16 @@ out_cleanup: * \param[in] env execution environment for this thread * \param[in] lod LOD device the target table belongs to * \param[in] ltd target table - * \param[in] idx index of the target - * \param[in] for_ost type of the target: 0 - MDT, 1 - OST + * \param[in] tgt target */ static void __lod_del_device(const struct lu_env *env, struct
lod_device *lod, - struct lod_tgt_descs *ltd, unsigned idx, - bool for_ost) + struct lod_tgt_descs *ltd, struct lu_tgt_desc *tgt) { - LASSERT(LTD_TGT(ltd, idx)); - - lfsck_del_target(env, lod->lod_child, LTD_TGT(ltd, idx)->ltd_tgt, - idx, for_ost); - - if (!for_ost && LTD_TGT(ltd, idx)->ltd_recovery_thread != NULL) { - struct ptlrpc_thread *thread; - - thread = LTD_TGT(ltd, idx)->ltd_recovery_thread; - OBD_FREE_PTR(thread); - } + lfsck_del_target(env, lod->lod_child, tgt->ltd_tgt, tgt->ltd_index, + !ltd->ltd_is_mdt); - if (LTD_TGT(ltd, idx)->ltd_reap == 0) { - LTD_TGT(ltd, idx)->ltd_reap = 1; + if (!tgt->ltd_reap) { + tgt->ltd_reap = 1; ltd->ltd_death_row++; } } @@ -486,29 +342,26 @@ static void __lod_del_device(const struct lu_env *env, struct lod_device *lod, * \param[in] env execution environment for this thread * \param[in] lod LOD device the target table belongs to * \param[in] ltd target table - * \param[in] for_ost type of the target: MDT or OST * * \retval 0 always */ int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, bool for_ost) + struct lod_tgt_descs *ltd) { - unsigned int idx; + struct lu_tgt_desc *tgt; if (ltd->ltd_tgts_size <= 0) return 0; + lod_getref(ltd); mutex_lock(&ltd->ltd_mutex); - cfs_foreach_bit(ltd->ltd_tgt_bitmap, idx) - __lod_del_device(env, lod, ltd, idx, for_ost); + ltd_foreach_tgt(ltd, tgt) + __lod_del_device(env, lod, ltd, tgt); mutex_unlock(&ltd->ltd_mutex); lod_putref(lod, ltd); - CFS_FREE_BITMAP(ltd->ltd_tgt_bitmap); - for (idx = 0; idx < TGT_PTRS; idx++) { - if (ltd->ltd_tgt_idx[idx]) - OBD_FREE_PTR(ltd->ltd_tgt_idx[idx]); - } - ltd->ltd_tgts_size = 0; + + lu_tgt_descs_fini(ltd); + return 0; } @@ -524,18 +377,19 @@ int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, * \param[in] osp name of OSP device to be removed * \param[in] idx index of the target * \param[in] gen generation number, not used currently - * \param[in] for_ost type of the target: 0 - MDT, 1 - OST * * \retval
0 if the device was scheduled for removal * \retval -EINVAL if no device was found */ int lod_del_device(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, char *osp, unsigned idx, - unsigned gen, bool for_ost) + struct lod_tgt_descs *ltd, char *osp, unsigned int idx, + unsigned int gen) { struct obd_device *obd; - int rc = 0; - struct obd_uuid uuid; + struct lu_tgt_desc *tgt; + struct obd_uuid uuid; + int rc = 0; + ENTRY; CDEBUG(D_CONFIG, "osp:%s idx:%d gen:%d\n", osp, idx, gen); @@ -559,22 +413,21 @@ int lod_del_device(const struct lu_env *env, struct lod_device *lod, lod_getref(ltd); mutex_lock(&ltd->ltd_mutex); + tgt = LTD_TGT(ltd, idx); /* check that the index is allocated in the bitmap */ - if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx) || - !LTD_TGT(ltd, idx)) { + if (!test_bit(idx, ltd->ltd_tgt_bitmap) || !tgt) { CERROR("%s: device %d is not set up\n", obd->obd_name, idx); GOTO(out, rc = -EINVAL); } /* check that the UUID matches */ - if (!obd_uuid_equals(&uuid, &LTD_TGT(ltd, idx)->ltd_uuid)) { + if (!obd_uuid_equals(&uuid, &tgt->ltd_uuid)) { CERROR("%s: LOD target UUID %s at index %d does not match %s\n", - obd->obd_name, obd_uuid2str(&LTD_TGT(ltd,idx)->ltd_uuid), - idx, osp); + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), idx, osp); GOTO(out, rc = -EINVAL); } - __lod_del_device(env, lod, ltd, idx, for_ost); + __lod_del_device(env, lod, ltd, tgt); EXIT; out: mutex_unlock(&ltd->ltd_mutex); @@ -600,8 +453,6 @@ int lod_ea_store_resize(struct lod_thread_info *info, size_t size) { __u32 round = size_roundup_power2(size); - LASSERT(round <= - lov_mds_md_size(LOV_MAX_STRIPE_COUNT, LOV_MAGIC_V3)); if (info->lti_ea_store) { LASSERT(info->lti_ea_store_size); LASSERT(info->lti_ea_store_size < round); @@ -698,8 +549,7 @@ int lod_def_striping_comp_resize(struct lod_default_striping *lds, __u16 count) void lod_free_comp_entries(struct lod_object *lo) { if (lo->ldo_mirrors) { - OBD_FREE(lo->ldo_mirrors, - sizeof(*lo->ldo_mirrors) *
lo->ldo_mirror_count); + OBD_FREE_PTR_ARRAY(lo->ldo_mirrors, lo->ldo_mirror_count); lo->ldo_mirrors = NULL; lo->ldo_mirror_count = 0; } @@ -718,8 +568,7 @@ int lod_alloc_comp_entries(struct lod_object *lo, LASSERT(lo->ldo_comp_cnt == 0 && lo->ldo_comp_entries == NULL); if (mirror_count > 0) { - OBD_ALLOC(lo->ldo_mirrors, - sizeof(*lo->ldo_mirrors) * mirror_count); + OBD_ALLOC_PTR_ARRAY(lo->ldo_mirrors, mirror_count); if (!lo->ldo_mirrors) return -ENOMEM; @@ -729,8 +578,7 @@ int lod_alloc_comp_entries(struct lod_object *lo, OBD_ALLOC_LARGE(lo->ldo_comp_entries, sizeof(*lo->ldo_comp_entries) * comp_count); if (lo->ldo_comp_entries == NULL) { - OBD_FREE(lo->ldo_mirrors, - sizeof(*lo->ldo_mirrors) * mirror_count); + OBD_FREE_PTR_ARRAY(lo->ldo_mirrors, mirror_count); lo->ldo_mirror_count = 0; return -ENOMEM; } @@ -741,10 +589,12 @@ int lod_alloc_comp_entries(struct lod_object *lo, int lod_fill_mirrors(struct lod_object *lo) { + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); struct lod_layout_component *lod_comp; + bool found_preferred = false; int mirror_idx = -1; __u16 mirror_id = 0xffff; - int i; + int i, pref; ENTRY; LASSERT(equi(!lo->ldo_is_composite, lo->ldo_mirror_count == 0)); @@ -753,17 +603,34 @@ int lod_fill_mirrors(struct lod_object *lo) RETURN(0); lod_comp = &lo->ldo_comp_entries[0]; + for (i = 0; i < lo->ldo_comp_cnt; i++, lod_comp++) { int stale = !!(lod_comp->llc_flags & LCME_FL_STALE); int preferred = !!(lod_comp->llc_flags & LCME_FL_PREF_WR); + int j; + + pref = 0; + /* calculate component preference over all used OSTs */ + for (j = 0; j < lod_comp->llc_stripes_allocated; j++) { + int idx = lod_comp->llc_ost_indices[j]; + struct obd_statfs *osfs = &OST_TGT(lod,idx)->ltd_statfs; + + if (osfs->os_state & OS_STATFS_NONROT) + pref++; + } if (mirror_id_of(lod_comp->llc_id) == mirror_id) { lo->ldo_mirrors[mirror_idx].lme_stale |= stale; - lo->ldo_mirrors[mirror_idx].lme_primary |= preferred; + lo->ldo_mirrors[mirror_idx].lme_prefer |= 
preferred; + lo->ldo_mirrors[mirror_idx].lme_preference += pref; lo->ldo_mirrors[mirror_idx].lme_end = i; continue; } + if (mirror_idx >= 0 && preferred && + !lo->ldo_mirrors[mirror_idx].lme_stale) + found_preferred = true; + /* new mirror */ ++mirror_idx; if (mirror_idx >= lo->ldo_mirror_count) @@ -773,13 +640,35 @@ int lod_fill_mirrors(struct lod_object *lo) lo->ldo_mirrors[mirror_idx].lme_id = mirror_id; lo->ldo_mirrors[mirror_idx].lme_stale = stale; - lo->ldo_mirrors[mirror_idx].lme_primary = preferred; + lo->ldo_mirrors[mirror_idx].lme_prefer = preferred; + lo->ldo_mirrors[mirror_idx].lme_preference = pref; lo->ldo_mirrors[mirror_idx].lme_start = i; lo->ldo_mirrors[mirror_idx].lme_end = i; } if (mirror_idx != lo->ldo_mirror_count - 1) RETURN(-EINVAL); + if (!found_preferred && mirror_idx > 0) { + int best = -1; + + /* + * if no explicited preferred found, then find a mirror + * with higher number of non-rotational OSTs + * */ + pref = -1; + for (i = 0; i <= mirror_idx; i++) { + if (lo->ldo_mirrors[i].lme_stale) + continue; + if (lo->ldo_mirrors[i].lme_preference > pref) { + pref = lo->ldo_mirrors[i].lme_preference; + best = i; + } + } + + LASSERT(best >= 0); + lo->ldo_mirrors[best].lme_prefer = 1; + } + RETURN(0); } @@ -856,11 +745,18 @@ static int lod_gen_component_ea(const struct lu_env *env, RETURN(-E2BIG); objs = &v3->lmm_objects[0]; } - stripe_count = lod_comp_entry_stripe_count(lo, lod_comp, is_dir); + lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + stripe_count = lod_comp_entry_stripe_count(lo, comp_idx, is_dir); if (stripe_count == 0 && !is_dir && !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) && - !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) + !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) { + /* Try again if all active targets are disconnected. + * It is possible when MDS does failover. 
*/ + if (!lod->lod_ost_active_count && + lod->lod_ost_count) + RETURN(-EAGAIN); RETURN(-E2BIG); + } if (!is_dir && lo->ldo_is_composite) lod_comp_shrink_stripe_count(lod_comp, &stripe_count); @@ -869,7 +765,6 @@ static int lod_gen_component_ea(const struct lu_env *env, GOTO(done, rc = 0); /* generate ost_idx of this component stripe */ - lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); for (i = 0; i < stripe_count; i++) { struct dt_object *object; __u32 ost_idx = (__u32)-1UL; @@ -1015,6 +910,9 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo, if (lod_comp->llc_flags & LCME_FL_NOSYNC) lcme->lcme_timestamp = cpu_to_le64(lod_comp->llc_timestamp); + if (lod_comp->llc_flags & LCME_FL_EXTENSION && !is_dir) + lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_SEL); + lcme->lcme_extent.e_start = cpu_to_le64(lod_comp->llc_extent.e_start); lcme->lcme_extent.e_end = @@ -1107,10 +1005,10 @@ repeat: * \retval 0 if the index is present * \retval -EINVAL if not */ -static int validate_lod_and_idx(struct lod_device *md, __u32 idx) +int validate_lod_and_idx(struct lod_device *md, __u32 idx) { if (unlikely(idx >= md->lod_ost_descs.ltd_tgts_size || - !cfs_bitmap_check(md->lod_ost_bitmap, idx))) { + !test_bit(idx, md->lod_ost_bitmap))) { CERROR("%s: bad idx: %d of %d\n", lod2obd(md)->obd_name, idx, md->lod_ost_descs.ltd_tgts_size); return -EINVAL; @@ -1122,7 +1020,7 @@ static int validate_lod_and_idx(struct lod_device *md, __u32 idx) return -EINVAL; } - if (unlikely(OST_TGT(md, idx)->ltd_ost == NULL)) { + if (unlikely(OST_TGT(md, idx)->ltd_tgt == NULL)) { CERROR("%s: invalid lod device, for idx: %d\n", lod2obd(md)->obd_name , idx); return -EINVAL; @@ -1174,10 +1072,10 @@ int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo, LASSERT(lod_comp->llc_stripe_size > 0); stripe_len = lod_comp->llc_stripe_count; - OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len); + OBD_ALLOC_PTR_ARRAY(stripe, stripe_len); if (stripe == NULL) RETURN(-ENOMEM); - 
OBD_ALLOC(ost_indices, sizeof(*ost_indices) * stripe_len); + OBD_ALLOC_PTR_ARRAY(ost_indices, stripe_len); if (!ost_indices) GOTO(out, rc = -ENOMEM); @@ -1200,7 +1098,7 @@ int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo, GOTO(out, rc); } - nd = &OST_TGT(md,idx)->ltd_ost->dd_lu_dev; + nd = &OST_TGT(md, idx)->ltd_tgt->dd_lu_dev; lod_putref(md, &md->lod_ost_descs); /* In the function below, .hs_keycmp resolves to @@ -1223,11 +1121,10 @@ out: if (stripe[i] != NULL) dt_object_put(env, stripe[i]); - OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len); + OBD_FREE_PTR_ARRAY(stripe, stripe_len); lod_comp->llc_stripe_count = 0; if (ost_indices) - OBD_FREE(ost_indices, - sizeof(*ost_indices) * stripe_len); + OBD_FREE_PTR_ARRAY(ost_indices, stripe_len); } else { lod_comp->llc_stripe = stripe; lod_comp->llc_ost_indices = ost_indices; @@ -1253,14 +1150,14 @@ out: int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, const struct lu_buf *buf) { - struct lov_mds_md_v1 *lmm; - struct lov_comp_md_v1 *comp_v1 = NULL; - struct lov_foreign_md *foreign = NULL; - struct lov_ost_data_v1 *objs; - __u32 magic, pattern; - int i, j, rc = 0; - __u16 comp_cnt; - __u16 mirror_cnt = 0; + struct lov_mds_md_v1 *lmm; + struct lov_comp_md_v1 *comp_v1 = NULL; + struct lov_foreign_md *foreign = NULL; + struct lov_ost_data_v1 *objs; + __u32 magic, pattern; + __u16 mirror_cnt = 0; + __u16 comp_cnt; + int i, rc; ENTRY; LASSERT(buf); @@ -1272,24 +1169,25 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, magic = le32_to_cpu(lmm->lmm_magic); if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 && - magic != LOV_MAGIC_COMP_V1 && magic != LOV_MAGIC_FOREIGN) + magic != LOV_MAGIC_COMP_V1 && magic != LOV_MAGIC_FOREIGN && + magic != LOV_MAGIC_SEL) GOTO(out, rc = -EINVAL); - if (lo->ldo_is_foreign) - lod_free_foreign_lov(lo); - else - lod_free_comp_entries(lo); + lod_striping_free_nolock(env, lo); - if (magic == LOV_MAGIC_COMP_V1) { + if (magic == 
LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) { comp_v1 = (struct lov_comp_md_v1 *)lmm; comp_cnt = le16_to_cpu(comp_v1->lcm_entry_count); if (comp_cnt == 0) GOTO(out, rc = -EINVAL); lo->ldo_layout_gen = le32_to_cpu(comp_v1->lcm_layout_gen); lo->ldo_is_composite = 1; - lo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) & - LCM_FL_FLR_MASK; mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1; + if (mirror_cnt > 1) + lo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) & + LCM_FL_FLR_MASK; + else + lo->ldo_flr_state = LCM_FL_NONE; } else if (magic == LOV_MAGIC_FOREIGN) { size_t length; @@ -1320,20 +1218,38 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, GOTO(out, rc); for (i = 0; i < comp_cnt; i++) { - struct lod_layout_component *lod_comp; - struct lu_extent *ext; - __u32 offs; + struct lod_layout_component *lod_comp; + struct lu_extent *ext; + __u32 offs; lod_comp = &lo->ldo_comp_entries[i]; if (lo->ldo_is_composite) { offs = le32_to_cpu(comp_v1->lcm_entries[i].lcme_offset); lmm = (struct lov_mds_md_v1 *)((char *)comp_v1 + offs); - magic = le32_to_cpu(lmm->lmm_magic); ext = &comp_v1->lcm_entries[i].lcme_extent; lod_comp->llc_extent.e_start = le64_to_cpu(ext->e_start); + if (lod_comp->llc_extent.e_start & + (LOV_MIN_STRIPE_SIZE - 1)) { + CDEBUG(D_LAYOUT, + "extent start %llu is not a multiple of min size %u\n", + lod_comp->llc_extent.e_start, + LOV_MIN_STRIPE_SIZE); + GOTO(out, rc = -EINVAL); + } + lod_comp->llc_extent.e_end = le64_to_cpu(ext->e_end); + if (lod_comp->llc_extent.e_end != LUSTRE_EOF && + lod_comp->llc_extent.e_end & + (LOV_MIN_STRIPE_SIZE - 1)) { + CDEBUG(D_LAYOUT, + "extent end %llu is not a multiple of min size %u\n", + lod_comp->llc_extent.e_end, + LOV_MIN_STRIPE_SIZE); + GOTO(out, rc = -EINVAL); + } + lod_comp->llc_flags = le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags); if (lod_comp->llc_flags & LCME_FL_NOSYNC) @@ -1343,13 +1259,24 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, 
le32_to_cpu(comp_v1->lcm_entries[i].lcme_id); if (lod_comp->llc_id == LCME_ID_INVAL) GOTO(out, rc = -EINVAL); + + if ((lod_comp->llc_flags & LCME_FL_EXTENSION) && + comp_v1->lcm_magic != cpu_to_le32(LOV_MAGIC_SEL)) { + struct lod_device *d = + lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + + CWARN("%s: EXTENSION flags=%x set on component[%u]=%x of non-SEL file "DFID" with magic=%#08x\n", + lod2obd(d)->obd_name, + lod_comp->llc_flags, lod_comp->llc_id, i, + PFID(lod_object_fid(lo)), + le32_to_cpu(comp_v1->lcm_magic)); + } } else { lod_comp_set_init(lod_comp); } pattern = le32_to_cpu(lmm->lmm_pattern); - if (lov_pattern(pattern) != LOV_PATTERN_RAID0 && - lov_pattern(pattern) != LOV_PATTERN_MDT) + if (!lov_pattern_supported(lov_pattern(pattern))) GOTO(out, rc = -EINVAL); lod_comp->llc_pattern = pattern; @@ -1357,8 +1284,9 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, lod_comp->llc_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); lod_comp->llc_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); - if (magic == LOV_MAGIC_V3) { + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { struct lov_mds_md_v3 *v3 = (struct lov_mds_md_v3 *)lmm; + lod_set_pool(&lod_comp->llc_pool, v3->lmm_pool_name); objs = &v3->lmm_objects[0]; } else { @@ -1374,8 +1302,10 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, __u16 stripe_count; if (objs[0].l_ost_idx != (__u32)-1UL) { + int j; + stripe_count = lod_comp_entry_stripe_count( - lo, lod_comp, false); + lo, i, false); if (stripe_count == 0 && !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) && !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) @@ -1513,7 +1443,23 @@ int lod_striping_load(const struct lu_env *env, struct lod_object *lo) lo->ldo_comp_cached = 1; } else if (S_ISDIR(lod2lu_obj(lo)->lo_header->loh_attr)) { rc = lod_get_lmv_ea(env, lo); - if (rc < (typeof(rc))sizeof(struct lmv_mds_md_v1)) { + if (rc > sizeof(struct lmv_foreign_md)) { + struct lmv_foreign_md *lfm = info->lti_ea_store; + + if 
(le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN) { + lo->ldo_foreign_lmv = info->lti_ea_store; + lo->ldo_foreign_lmv_size = + info->lti_ea_store_size; + info->lti_ea_store = NULL; + info->lti_ea_store_size = 0; + + lo->ldo_dir_stripe_loaded = 1; + lo->ldo_dir_is_foreign = 1; + GOTO(unlock, rc = 0); + } + } + + if (rc < (int)sizeof(struct lmv_mds_md_v1)) { /* Let's set stripe_loaded to avoid further * stripe loading especially for non-stripe directory, * which can hurt performance. (See LU-9840) @@ -1560,7 +1506,6 @@ int lod_striping_reload(const struct lu_env *env, struct lod_object *lo, ENTRY; mutex_lock(&lo->ldo_layout_mutex); - lod_striping_free_nolock(env, lo); rc = lod_parse_striping(env, lo, buf); mutex_unlock(&lo->ldo_layout_mutex); @@ -1643,14 +1588,14 @@ static int lod_verify_v1v3(struct lod_device *d, const struct lu_buf *buf, if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT && lov_pattern(le32_to_cpu(lum->lmm_pattern)) != LOV_PATTERN_MDT) { /* if offset is not within valid range [0, osts_size) */ - if (stripe_offset >= d->lod_osts_size) { + if (stripe_offset >= d->lod_ost_descs.ltd_tgts_size) { CDEBUG(D_LAYOUT, "stripe offset %u >= bitmap size %u\n", - stripe_offset, d->lod_osts_size); + stripe_offset, d->lod_ost_descs.ltd_tgts_size); GOTO(out, rc = -EINVAL); } /* if lmm_stripe_offset is *not* in bitmap */ - if (!cfs_bitmap_check(d->lod_ost_bitmap, stripe_offset)) { + if (!test_bit(stripe_offset, d->lod_ost_bitmap)) { CDEBUG(D_LAYOUT, "stripe offset %u not in bitmap\n", stripe_offset); GOTO(out, rc = -EINVAL); @@ -1708,7 +1653,9 @@ static inline struct lov_comp_md_entry_v1 *comp_entry_v1(struct lov_comp_md_v1 *comp, int i) { LASSERTF((le32_to_cpu(comp->lcm_magic) & ~LOV_MAGIC_DEFINED) == - LOV_USER_MAGIC_COMP_V1, "Wrong magic %x\n", + LOV_USER_MAGIC_COMP_V1 || + (le32_to_cpu(comp->lcm_magic) & ~LOV_MAGIC_DEFINED) == + LOV_USER_MAGIC_SEL, "Wrong magic %x\n", le32_to_cpu(comp->lcm_magic)); LASSERTF(i >= 0 && i < 
le16_to_cpu(comp->lcm_entry_count), "bad index %d, max = %d\n", @@ -1723,70 +1670,161 @@ struct lov_comp_md_entry_v1 *comp_entry_v1(struct lov_comp_md_v1 *comp, int i) le16_to_cpu(comp->lcm_entry_count) - 1); \ entry++) -int lod_erase_dom_stripe(struct lov_comp_md_v1 *comp_v1) +int lod_erase_dom_stripe(struct lov_comp_md_v1 *comp_v1, + struct lov_comp_md_entry_v1 *dom_ent) { - struct lov_comp_md_entry_v1 *ent, *dom_ent; + struct lov_comp_md_entry_v1 *ent; __u16 entries; - __u32 dom_off, dom_size, comp_size; - void *blob_src, *blob_dst; - unsigned int blob_size, blob_shift; + __u32 dom_off, dom_size, comp_size, off; + void *src, *dst; + unsigned int size, shift; entries = le16_to_cpu(comp_v1->lcm_entry_count) - 1; - /* if file has only DoM stripe return just error */ - if (entries == 0) - return -EFBIG; + LASSERT(entries > 0); + comp_v1->lcm_entry_count = cpu_to_le16(entries); comp_size = le32_to_cpu(comp_v1->lcm_size); - dom_ent = &comp_v1->lcm_entries[0]; dom_off = le32_to_cpu(dom_ent->lcme_offset); dom_size = le32_to_cpu(dom_ent->lcme_size); - /* shift entries array first */ - comp_v1->lcm_entry_count = cpu_to_le16(entries); - memmove(dom_ent, dom_ent + 1, - entries * sizeof(struct lov_comp_md_entry_v1)); + /* all entries offsets are shifted by entry size at least */ + shift = sizeof(*dom_ent); + for_each_comp_entry_v1(comp_v1, ent) { + off = le32_to_cpu(ent->lcme_offset); + if (off == dom_off) { + /* Entry deletion creates two holes in layout data: + * - hole in entries array + * - hole in layout data at dom_off with dom_size + * + * First memmove is one entry shift from next entry + * start with size up to dom_off in blob + */ + dst = (void *)ent; + src = (void *)(ent + 1); + size = (unsigned long)((void *)comp_v1 + dom_off - src); + memmove(dst, src, size); + /* take 'off' from just moved entry */ + off = le32_to_cpu(ent->lcme_offset); + /* second memmove is blob tail after 'off' up to + * component end + */ + dst = (void *)comp_v1 + dom_off - sizeof(*ent); + 
src = (void *)comp_v1 + off; + size = (unsigned long)(comp_size - off); + memmove(dst, src, size); + /* all entries offsets after DoM entry are shifted by + * dom_size additionally + */ + shift += dom_size; + } + ent->lcme_offset = cpu_to_le32(off - shift); + } + comp_v1->lcm_size = cpu_to_le32(comp_size - shift); - /* now move blob of layouts */ - blob_dst = (void *)comp_v1 + dom_off - sizeof(*dom_ent); - blob_src = (void *)comp_v1 + dom_off + dom_size; - blob_size = (unsigned long)((void *)comp_v1 + comp_size - blob_src); - blob_shift = sizeof(*dom_ent) + dom_size; + /* notify a caller to re-check entry */ + return -ERESTART; +} - memmove(blob_dst, blob_src, blob_size); +void lod_dom_stripesize_recalc(struct lod_device *d) +{ + __u64 threshold_mb = d->lod_dom_threshold_free_mb; + __u32 max_size = d->lod_dom_stripesize_max_kb; + __u32 def_size = d->lod_dom_stripesize_cur_kb; + + /* use maximum allowed value if free space is above threshold */ + if (d->lod_lsfs_free_mb >= threshold_mb) { + def_size = max_size; + } else if (!d->lod_lsfs_free_mb || max_size <= LOD_DOM_MIN_SIZE_KB) { + def_size = 0; + } else { + /* recalc threshold like it would be with def_size as max */ + threshold_mb = mult_frac(threshold_mb, def_size, max_size); + if (d->lod_lsfs_free_mb < threshold_mb) + def_size = rounddown(def_size / 2, LOD_DOM_MIN_SIZE_KB); + else if (d->lod_lsfs_free_mb > threshold_mb * 2) + def_size = max_t(unsigned int, def_size * 2, + LOD_DOM_MIN_SIZE_KB); + } - for_each_comp_entry_v1(comp_v1, ent) { - __u32 off; + if (d->lod_dom_stripesize_cur_kb != def_size) { + CDEBUG(D_LAYOUT, "Change default DOM stripe size %d->%d\n", + d->lod_dom_stripesize_cur_kb, def_size); + d->lod_dom_stripesize_cur_kb = def_size; + } +} - off = le32_to_cpu(ent->lcme_offset); - ent->lcme_offset = cpu_to_le32(off - blob_shift); +static __u32 lod_dom_stripesize_limit(const struct lu_env *env, + struct lod_device *d) +{ + int rc; + + /* set bfree as fraction of total space */ + if 
(OBD_FAIL_CHECK(OBD_FAIL_MDS_STATFS_SPOOF)) { + spin_lock(&d->lod_lsfs_lock); + d->lod_lsfs_free_mb = mult_frac(d->lod_lsfs_total_mb, + min_t(int, cfs_fail_val, 100), 100); + GOTO(recalc, rc = 0); } - comp_v1->lcm_size = cpu_to_le32(comp_size - blob_shift); + if (d->lod_lsfs_age < ktime_get_seconds() - LOD_DOM_SFS_MAX_AGE) { + struct obd_statfs sfs; - /* notify a caller to re-check entry */ - return -ERESTART; + spin_lock(&d->lod_lsfs_lock); + if (d->lod_lsfs_age > ktime_get_seconds() - LOD_DOM_SFS_MAX_AGE) + GOTO(unlock, rc = 0); + + d->lod_lsfs_age = ktime_get_seconds(); + spin_unlock(&d->lod_lsfs_lock); + rc = dt_statfs(env, d->lod_child, &sfs); + if (rc) { + CDEBUG(D_LAYOUT, + "%s: failed to get OSD statfs: rc = %d\n", + lod2obd(d)->obd_name, rc); + GOTO(out, rc); + } + /* udpate local OSD cached statfs data */ + spin_lock(&d->lod_lsfs_lock); + d->lod_lsfs_total_mb = (sfs.os_blocks * sfs.os_bsize) >> 20; + d->lod_lsfs_free_mb = (sfs.os_bfree * sfs.os_bsize) >> 20; +recalc: + lod_dom_stripesize_recalc(d); +unlock: + spin_unlock(&d->lod_lsfs_lock); + } +out: + return d->lod_dom_stripesize_cur_kb << 10; } -int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1) +int lod_dom_stripesize_choose(const struct lu_env *env, struct lod_device *d, + struct lov_comp_md_v1 *comp_v1, + struct lov_comp_md_entry_v1 *dom_ent, + __u32 stripe_size) { - struct lov_comp_md_entry_v1 *ent, *dom_ent; + struct lov_comp_md_entry_v1 *ent; struct lu_extent *dom_ext, *ext; struct lov_user_md_v1 *lum; - __u32 stripe_size; + __u32 max_stripe_size; __u16 mid, dom_mid; int rc = 0; + bool dom_next_entry = false; - dom_ent = &comp_v1->lcm_entries[0]; dom_ext = &dom_ent->lcme_extent; dom_mid = mirror_id_of(le32_to_cpu(dom_ent->lcme_id)); - stripe_size = d->lod_dom_max_stripesize; + max_stripe_size = lod_dom_stripesize_limit(env, d); + + /* Check stripe size againts current per-MDT limit */ + if (stripe_size <= max_stripe_size) + return 0; lum = (void *)comp_v1 + 
le32_to_cpu(dom_ent->lcme_offset); - CDEBUG(D_LAYOUT, "DoM component size %u was bigger than MDT limit %u, " - "new size is %u\n", le32_to_cpu(lum->lmm_stripe_size), - d->lod_dom_max_stripesize, stripe_size); - lum->lmm_stripe_size = cpu_to_le32(stripe_size); + CDEBUG(D_LAYOUT, "overwrite DoM component size %u with MDT limit %u\n", + stripe_size, max_stripe_size); + lum->lmm_stripe_size = cpu_to_le32(max_stripe_size); + /* In common case the DoM stripe is first entry in a mirror and + * can be deleted only if it is not single entry in layout or + * mirror, otherwise error should be returned. + */ for_each_comp_entry_v1(comp_v1, ent) { if (ent == dom_ent) continue; @@ -1806,17 +1844,24 @@ int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1) * DoM component in a file, all replicas are located on OSTs * always and don't need adjustment since use own layouts. */ - ext->e_start = cpu_to_le64(stripe_size); + ext->e_start = cpu_to_le64(max_stripe_size); + dom_next_entry = true; break; } - if (stripe_size == 0) { - /* DoM component size is zero due to server setting, - * remove it from the layout */ - rc = lod_erase_dom_stripe(comp_v1); + if (max_stripe_size == 0) { + /* DoM component size is zero due to server setting, remove + * it from the layout but only if next component exists in + * the same mirror. That must be checked prior calling the + * lod_erase_dom_stripe(). 
+ */ + if (!dom_next_entry) + return -EFBIG; + + rc = lod_erase_dom_stripe(comp_v1, dom_ent); } else { /* Update DoM extent end finally */ - dom_ext->e_end = cpu_to_le64(stripe_size); + dom_ext->e_end = cpu_to_le64(max_stripe_size); } return rc; @@ -1834,10 +1879,10 @@ int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1) * \retval 0 if the striping is valid * \retval -EINVAL if striping is invalid */ -int lod_verify_striping(struct lod_device *d, struct lod_object *lo, - const struct lu_buf *buf, bool is_from_disk) +int lod_verify_striping(const struct lu_env *env, struct lod_device *d, + struct lod_object *lo, const struct lu_buf *buf, + bool is_from_disk) { - struct lov_desc *desc = &d->lod_desc; struct lov_user_md_v1 *lum; struct lov_comp_md_v1 *comp_v1; struct lov_comp_md_entry_v1 *ent; @@ -1892,18 +1937,22 @@ int lod_verify_striping(struct lod_device *d, struct lod_object *lo, RETURN(-EINVAL); } - if (magic != LOV_USER_MAGIC_V1 && - magic != LOV_USER_MAGIC_V3 && - magic != LOV_USER_MAGIC_SPECIFIC && - magic != LOV_USER_MAGIC_COMP_V1) { + switch (magic) { + case LOV_USER_MAGIC_FOREIGN: + RETURN(0); + case LOV_USER_MAGIC_V1: + case LOV_USER_MAGIC_V3: + case LOV_USER_MAGIC_SPECIFIC: + RETURN(lod_verify_v1v3(d, buf, is_from_disk)); + case LOV_USER_MAGIC_COMP_V1: + case LOV_USER_MAGIC_SEL: + break; + default: CDEBUG(D_LAYOUT, "bad userland LOV MAGIC: %#x\n", le32_to_cpu(lum->lmm_magic)); RETURN(-EINVAL); } - if (magic != LOV_USER_MAGIC_COMP_V1) - RETURN(lod_verify_v1v3(d, buf, is_from_disk)); - /* magic == LOV_USER_MAGIC_COMP_V1 */ comp_v1 = buf->lb_buf; if (buf->lb_len < le32_to_cpu(comp_v1->lcm_size)) { @@ -1933,7 +1982,10 @@ recheck: for_each_comp_entry_v1(comp_v1, ent) { ext = &ent->lcme_extent; - if (le64_to_cpu(ext->e_start) >= le64_to_cpu(ext->e_end)) { + if (le64_to_cpu(ext->e_start) > le64_to_cpu(ext->e_end) || + le64_to_cpu(ext->e_start) & (LOV_MIN_STRIPE_SIZE - 1) || + (le64_to_cpu(ext->e_end) != LUSTRE_EOF && + 
le64_to_cpu(ext->e_end) & (LOV_MIN_STRIPE_SIZE - 1))) { CDEBUG(D_LAYOUT, "invalid extent "DEXT"\n", le64_to_cpu(ext->e_start), le64_to_cpu(ext->e_end)); @@ -1986,7 +2038,7 @@ recheck: lum = tmp.lb_buf; if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) == LOV_PATTERN_MDT) { - /* DoM component can be only the first stripe */ + /* DoM component must be the first in a mirror */ if (le64_to_cpu(ext->e_start) > 0) { CDEBUG(D_LAYOUT, "invalid DoM component " "with %llu extent start\n", @@ -2003,19 +2055,33 @@ recheck: stripe_size, prev_end); RETURN(-EINVAL); } - /* Check stripe size againts per-MDT limit */ - if (stripe_size > d->lod_dom_max_stripesize) { - CDEBUG(D_LAYOUT, "DoM component size " - "%u is bigger than MDT limit %u, check " - "dom_max_stripesize parameter\n", - stripe_size, d->lod_dom_max_stripesize); - rc = lod_fix_dom_stripe(d, comp_v1); - if (rc == -ERESTART) { - /* DoM entry was removed, re-check - * new layout from start */ - goto recheck; - } else if (rc) { - RETURN(rc); + /* Check and adjust stripe size by per-MDT limit */ + rc = lod_dom_stripesize_choose(env, d, comp_v1, ent, + stripe_size); + /* DoM entry was removed, re-check layout from start */ + if (rc == -ERESTART) + goto recheck; + else if (rc) + RETURN(rc); + + if (le16_to_cpu(lum->lmm_stripe_count) == 1) + lum->lmm_stripe_count = 0; + /* Any stripe count is forbidden on DoM component */ + if (lum->lmm_stripe_count > 0) { + CDEBUG(D_LAYOUT, + "invalid DoM layout stripe count %u, must be 0\n", + le16_to_cpu(lum->lmm_stripe_count)); + RETURN(-EINVAL); + } + + /* Any pool is forbidden on DoM component */ + if (lum->lmm_magic == LOV_USER_MAGIC_V3) { + struct lov_user_md_v3 *v3 = (void *)lum; + + if (v3->lmm_pool_name[0] != '\0') { + CDEBUG(D_LAYOUT, + "DoM component cannot have pool assigned\n"); + RETURN(-EINVAL); } } } @@ -2026,14 +2092,12 @@ recheck: if (rc) RETURN(rc); - if (prev_end == LUSTRE_EOF) + if (prev_end == LUSTRE_EOF || ext->e_start == prev_end) continue; /* extent end must be aligned 
with the stripe_size */ stripe_size = le32_to_cpu(lum->lmm_stripe_size); - if (stripe_size == 0) - stripe_size = desc->ld_default_stripe_size; - if (stripe_size == 0 || (prev_end & (stripe_size - 1))) { + if (stripe_size && prev_end % stripe_size) { CDEBUG(D_LAYOUT, "stripe size isn't aligned, " "stripe_sz: %u, [%llu, %llu)\n", stripe_size, ext->e_start, prev_end); @@ -2100,9 +2164,16 @@ void lod_fix_desc_stripe_count(__u32 *val) void lod_fix_desc_pattern(__u32 *val) { /* from lov_setstripe */ - if ((*val != 0) && (*val != LOV_PATTERN_RAID0) && - (*val != LOV_PATTERN_MDT)) { - LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + if ((*val != 0) && !lov_pattern_supported_normal_comp(*val)) { + LCONSOLE_WARN("lod: Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lod_fix_lmv_desc_pattern(__u32 *val) +{ + if ((*val) && !lmv_is_known_hash_type(*val)) { + LCONSOLE_WARN("lod: Unknown md stripe pattern: %#x\n", *val); *val = 0; } } @@ -2127,6 +2198,14 @@ void lod_fix_desc(struct lov_desc *desc) lod_fix_desc_qos_maxage(&desc->ld_qos_maxage); } +static void lod_fix_lmv_desc(struct lmv_desc *desc) +{ + desc->ld_active_tgt_count = 0; + lod_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lod_fix_lmv_desc_pattern(&desc->ld_pattern); + lod_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + /** * Initialize the structures used to store pools and default striping. * @@ -2175,48 +2254,47 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) lod_fix_desc(desc); desc->ld_active_tgt_count = 0; - lod->lod_desc = *desc; + lod->lod_ost_descs.ltd_lov_desc = *desc; - lod->lod_sp_me = LUSTRE_SP_CLI; + /* NB: config doesn't contain lmv_desc, alter it via sysfs. 
*/ + lod_fix_lmv_desc(&lod->lod_mdt_descs.ltd_lmv_desc); - /* Set up allocation policy (QoS and RR) */ - INIT_LIST_HEAD(&lod->lod_qos.lq_oss_list); - init_rwsem(&lod->lod_qos.lq_rw_sem); - lod->lod_qos.lq_dirty = 1; - lod->lod_qos.lq_rr.lqr_dirty = 1; - lod->lod_qos.lq_reset = 1; - /* Default priority is toward free space balance */ - lod->lod_qos.lq_prio_free = 232; - /* Default threshold for rr (roughly 17%) */ - lod->lod_qos.lq_threshold_rr = 43; + lod->lod_sp_me = LUSTRE_SP_CLI; /* Set up OST pool environment */ - lod->lod_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, - HASH_POOLS_MAX_BITS, - HASH_POOLS_BKT_BITS, 0, - CFS_HASH_MIN_THETA, - CFS_HASH_MAX_THETA, - &pool_hash_operations, - CFS_HASH_DEFAULT); - if (lod->lod_pools_hash_body == NULL) + lod->lod_pool_count = 0; + rc = lod_pool_hash_init(&lod->lod_pools_hash_body); + if (rc) RETURN(-ENOMEM); INIT_LIST_HEAD(&lod->lod_pool_list); lod->lod_pool_count = 0; - rc = lod_ost_pool_init(&lod->lod_pool_info, 0); + rc = lu_tgt_pool_init(&lod->lod_mdt_descs.ltd_tgt_pool, 0); if (rc) GOTO(out_hash, rc); - lod_qos_rr_init(&lod->lod_qos.lq_rr); - rc = lod_ost_pool_init(&lod->lod_qos.lq_rr.lqr_pool, 0); + + rc = lu_tgt_pool_init(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool, 0); + if (rc) + GOTO(out_mdt_pool, rc); + + rc = lu_tgt_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0); + if (rc) + GOTO(out_mdt_rr_pool, rc); + + rc = lu_tgt_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0); if (rc) - GOTO(out_pool_info, rc); + GOTO(out_ost_pool, rc); RETURN(0); -out_pool_info: - lod_ost_pool_free(&lod->lod_pool_info); +out_ost_pool: + lu_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); +out_mdt_rr_pool: + lu_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool); +out_mdt_pool: + lu_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool); out_hash: - cfs_hash_putref(lod->lod_pools_hash_body); + lod_pool_hash_destroy(&lod->lod_pools_hash_body); return rc; } @@ -2243,9 +2321,11 @@ int lod_pools_fini(struct 
lod_device *lod) lod_pool_del(obd, pool->pool_name); } - cfs_hash_putref(lod->lod_pools_hash_body); - lod_ost_pool_free(&(lod->lod_qos.lq_rr.lqr_pool)); - lod_ost_pool_free(&lod->lod_pool_info); + lod_pool_hash_destroy(&lod->lod_pools_hash_body); + lu_tgt_pool_free(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool); + lu_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); + lu_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool); + lu_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool); RETURN(0); }