X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flod%2Flod_lov.c;h=e449db60f973d51e8ae383f2b77d44de7f2182af;hp=9e25dc4ed9c5f1ea14c8f09428bbe732cd556ef1;hb=c1d0a355a6;hpb=4e4751d5bf7af565f9fc41f5001dae81f67be891 diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 9e25dc4..e449db6 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -77,30 +77,21 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) if (ltd->ltd_refcount == 0 && ltd->ltd_death_row) { struct lod_tgt_desc *tgt_desc, *tmp; struct list_head kill; - unsigned int idx; CDEBUG(D_CONFIG, "destroying %d ltd desc\n", ltd->ltd_death_row); INIT_LIST_HEAD(&kill); - cfs_foreach_bit(ltd->ltd_tgt_bitmap, idx) { - tgt_desc = LTD_TGT(ltd, idx); + ltd_foreach_tgt_safe(ltd, tgt_desc, tmp) { LASSERT(tgt_desc); - if (!tgt_desc->ltd_reap) continue; list_add(&tgt_desc->ltd_kill, &kill); - LTD_TGT(ltd, idx) = NULL; - /*FIXME: only support ost pool for now */ - if (ltd == &lod->lod_ost_descs) { - lod_ost_pool_remove(&lod->lod_pool_info, idx); - if (tgt_desc->ltd_active) - lod->lod_desc.ld_active_tgt_count--; - } - ltd->ltd_tgtnr--; - cfs_bitmap_clear(ltd->ltd_tgt_bitmap, idx); + lod_tgt_pool_remove(&ltd->ltd_tgt_pool, + tgt_desc->ltd_index); + ltd_del_tgt(ltd, tgt_desc); ltd->ltd_death_row--; } mutex_unlock(&ltd->ltd_mutex); @@ -108,17 +99,8 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) list_for_each_entry_safe(tgt_desc, tmp, &kill, ltd_kill) { int rc; + list_del(&tgt_desc->ltd_kill); - if (ltd == &lod->lod_ost_descs) { - /* remove from QoS structures */ - rc = qos_del_tgt(lod, tgt_desc); - if (rc) - CERROR("%s: qos_del_tgt(%s) failed:" - "rc = %d\n", - lod2obd(lod)->obd_name, - obd_uuid2str(&tgt_desc->ltd_uuid), - rc); - } rc = obd_disconnect(tgt_desc->ltd_exp); if (rc) CERROR("%s: failed to disconnect %s: rc = %d\n", @@ -133,60 +115,6 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) } /** - * Expand size 
of target table. - * - * When the target table is full, we have to extend the table. To do so, - * we allocate new memory with some reserve, move data from the old table - * to the new one and release memory consumed by the old table. - * Notice we take ltd_rw_sem exclusively to ensure atomic switch. - * - * \param[in] ltd target table - * \param[in] newsize new size of the table - * - * \retval 0 on success - * \retval -ENOMEM if reallocation failed - */ -static int ltd_bitmap_resize(struct lod_tgt_descs *ltd, __u32 newsize) -{ - struct cfs_bitmap *new_bitmap, *old_bitmap = NULL; - int rc = 0; - ENTRY; - - /* grab write reference on the lod. Relocating the array requires - * exclusive access */ - - down_write(&ltd->ltd_rw_sem); - if (newsize <= ltd->ltd_tgts_size) - /* someone else has already resize the array */ - GOTO(out, rc = 0); - - /* allocate new bitmap */ - new_bitmap = CFS_ALLOCATE_BITMAP(newsize); - if (!new_bitmap) - GOTO(out, rc = -ENOMEM); - - if (ltd->ltd_tgts_size > 0) { - /* the bitmap already exists, we need - * to copy data from old one */ - cfs_bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap); - old_bitmap = ltd->ltd_tgt_bitmap; - } - - ltd->ltd_tgts_size = newsize; - ltd->ltd_tgt_bitmap = new_bitmap; - - if (old_bitmap) - CFS_FREE_BITMAP(old_bitmap); - - CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size); - - EXIT; -out: - up_write(&ltd->ltd_rw_sem); - return rc; -} - -/** * Connect LOD to a new OSP and add it to the target table. 
* * Connect to the OSP device passed, initialize all the internal @@ -219,7 +147,7 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, struct lustre_cfg *lcfg; struct obd_uuid obd_uuid; bool for_ost; - bool lock = false; + bool connected = false; ENTRY; CDEBUG(D_CONFIG, "osp:%s idx:%d gen:%d\n", osp, index, gen); @@ -302,11 +230,12 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, obd->obd_name, osp, rc); GOTO(out_cleanup, rc); } + connected = true; /* Allocate ost descriptor and fill it */ OBD_ALLOC_PTR(tgt_desc); if (!tgt_desc) - GOTO(out_conn, rc = -ENOMEM); + GOTO(out_cleanup, rc = -ENOMEM); tgt_desc->ltd_tgt = dt_dev; tgt_desc->ltd_exp = exp; @@ -315,73 +244,27 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, tgt_desc->ltd_index = index; tgt_desc->ltd_active = active; - lod_getref(ltd); - if (index >= ltd->ltd_tgts_size) { - /* we have to increase the size of the lod_osts array */ - __u32 newsize; - - newsize = max(ltd->ltd_tgts_size, (__u32)2); - while (newsize < index + 1) - newsize = newsize << 1; - - /* lod_bitmap_resize() needs lod_rw_sem - * which we hold with th reference */ - lod_putref(lod, ltd); - - rc = ltd_bitmap_resize(ltd, newsize); - if (rc) - GOTO(out_desc, rc); - - lod_getref(ltd); - } - + down_write(&ltd->ltd_rw_sem); mutex_lock(&ltd->ltd_mutex); - lock = true; - if (cfs_bitmap_check(ltd->ltd_tgt_bitmap, index)) { - CERROR("%s: device %d is registered already\n", obd->obd_name, - index); - GOTO(out_mutex, rc = -EEXIST); - } - - if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { - OBD_ALLOC_PTR(ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK]); - if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { - CERROR("can't allocate index to add %s\n", - obd->obd_name); - GOTO(out_mutex, rc = -ENOMEM); - } - } - - if (for_ost) { - /* pool and qos are not supported for MDS stack yet */ - rc = lod_ost_pool_add(&lod->lod_pool_info, index, - lod->lod_osts_size); - if (rc) { - 
CERROR("%s: can't set up pool, failed with %d\n", - obd->obd_name, rc); - GOTO(out_mutex, rc); - } + rc = ltd_add_tgt(ltd, tgt_desc); + if (rc) + GOTO(out_mutex, rc); - rc = qos_add_tgt(lod, tgt_desc); - if (rc) { - CERROR("%s: qos_add_tgt failed with %d\n", - obd->obd_name, rc); - GOTO(out_pool, rc); - } + rc = lu_qos_add_tgt(&ltd->ltd_qos, tgt_desc); + if (rc) + GOTO(out_del_tgt, rc); - /* The new OST is now a full citizen */ - if (index >= lod->lod_desc.ld_tgt_count) - lod->lod_desc.ld_tgt_count = index + 1; - if (active) - lod->lod_desc.ld_active_tgt_count++; + rc = lod_tgt_pool_add(&ltd->ltd_tgt_pool, index, + ltd->ltd_lov_desc.ld_tgt_count); + if (rc) { + CERROR("%s: can't set up pool, failed with %d\n", + obd->obd_name, rc); + GOTO(out_del_tgt, rc); } - LTD_TGT(ltd, index) = tgt_desc; - cfs_bitmap_set(ltd->ltd_tgt_bitmap, index); - ltd->ltd_tgtnr++; mutex_unlock(&ltd->ltd_mutex); - lod_putref(lod, ltd); - lock = false; + up_write(&ltd->ltd_rw_sem); + if (lod->lod_recovery_completed) lu_dev->ld_ops->ldo_recovery_complete(env, lu_dev); @@ -405,29 +288,21 @@ out_fini_llog: lod_sub_fini_llog(env, tgt_desc->ltd_tgt, tgt_desc->ltd_recovery_thread); out_ltd: - lod_getref(ltd); + down_write(&ltd->ltd_rw_sem); mutex_lock(&ltd->ltd_mutex); - lock = true; if (!for_ost && LTD_TGT(ltd, index)->ltd_recovery_thread != NULL) { struct ptlrpc_thread *thread; thread = LTD_TGT(ltd, index)->ltd_recovery_thread; OBD_FREE_PTR(thread); } - ltd->ltd_tgtnr--; - cfs_bitmap_clear(ltd->ltd_tgt_bitmap, index); - LTD_TGT(ltd, index) = NULL; -out_pool: - lod_ost_pool_remove(&lod->lod_pool_info, index); + lod_tgt_pool_remove(&ltd->ltd_tgt_pool, index); +out_del_tgt: + ltd_del_tgt(ltd, tgt_desc); out_mutex: - if (lock) { - mutex_unlock(&ltd->ltd_mutex); - lod_putref(lod, ltd); - } -out_desc: + mutex_unlock(&ltd->ltd_mutex); + up_write(&ltd->ltd_rw_sem); OBD_FREE_PTR(tgt_desc); -out_conn: - obd_disconnect(exp); out_cleanup: /* XXX OSP needs us to send down LCFG_CLEANUP because it uses * objects from the MDT 
stack. See LU-7184. */ @@ -437,6 +312,9 @@ out_cleanup: lcfg->lcfg_command = LCFG_CLEANUP; lu_dev->ld_ops->ldo_process_config(env, lu_dev, lcfg); + if (connected) + obd_disconnect(exp); + return rc; } @@ -450,27 +328,19 @@ out_cleanup: * \param[in] env execution environment for this thread * \param[in] lod LOD device the target table belongs to * \param[in] ltd target table - * \param[in] idx index of the target - * \param[in] for_ost type of the target: 0 - MDT, 1 - OST + * \param[in] tgt target */ static void __lod_del_device(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, unsigned idx, - bool for_ost) + struct lod_tgt_descs *ltd, struct lu_tgt_desc *tgt) { - LASSERT(LTD_TGT(ltd, idx)); + lfsck_del_target(env, lod->lod_child, tgt->ltd_tgt, tgt->ltd_index, + !ltd->ltd_is_mdt); - lfsck_del_target(env, lod->lod_child, LTD_TGT(ltd, idx)->ltd_tgt, - idx, for_ost); + if (ltd->ltd_is_mdt && tgt->ltd_recovery_thread) + OBD_FREE_PTR(tgt->ltd_recovery_thread); - if (!for_ost && LTD_TGT(ltd, idx)->ltd_recovery_thread != NULL) { - struct ptlrpc_thread *thread; - - thread = LTD_TGT(ltd, idx)->ltd_recovery_thread; - OBD_FREE_PTR(thread); - } - - if (LTD_TGT(ltd, idx)->ltd_reap == 0) { - LTD_TGT(ltd, idx)->ltd_reap = 1; + if (!tgt->ltd_reap) { + tgt->ltd_reap = 1; ltd->ltd_death_row++; } } @@ -483,29 +353,26 @@ static void __lod_del_device(const struct lu_env *env, struct lod_device *lod, * \param[in] env execution environment for this thread * \param[in] lod LOD device the target table belongs to * \param[in] ltd target table - * \param[in] for_ost type of the target: MDT or OST * * \retval 0 always */ int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, bool for_ost) + struct lod_tgt_descs *ltd) { - unsigned int idx; + struct lu_tgt_desc *tgt; if (ltd->ltd_tgts_size <= 0) return 0; + lod_getref(ltd); mutex_lock(&ltd->ltd_mutex); - cfs_foreach_bit(ltd->ltd_tgt_bitmap, idx) - __lod_del_device(env, lod, ltd, 
idx, for_ost); + ltd_foreach_tgt(ltd, tgt) + __lod_del_device(env, lod, ltd, tgt); mutex_unlock(&ltd->ltd_mutex); lod_putref(lod, ltd); - CFS_FREE_BITMAP(ltd->ltd_tgt_bitmap); - for (idx = 0; idx < TGT_PTRS; idx++) { - if (ltd->ltd_tgt_idx[idx]) - OBD_FREE_PTR(ltd->ltd_tgt_idx[idx]); - } - ltd->ltd_tgts_size = 0; + + lu_tgt_descs_fini(ltd); + return 0; } @@ -521,18 +388,19 @@ int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, * \param[in] osp name of OSP device to be removed * \param[in] idx index of the target * \param[in] gen generation number, not used currently - * \param[in] for_ost type of the target: 0 - MDT, 1 - OST * * \retval 0 if the device was scheduled for removal * \retval -EINVAL if no device was found */ int lod_del_device(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, char *osp, unsigned idx, - unsigned gen, bool for_ost) + struct lod_tgt_descs *ltd, char *osp, unsigned int idx, + unsigned int gen) { struct obd_device *obd; - int rc = 0; - struct obd_uuid uuid; + struct lu_tgt_desc *tgt; + struct obd_uuid uuid; + int rc = 0; + ENTRY; CDEBUG(D_CONFIG, "osp:%s idx:%d gen:%d\n", osp, idx, gen); @@ -556,22 +424,21 @@ int lod_del_device(const struct lu_env *env, struct lod_device *lod, lod_getref(ltd); mutex_lock(&ltd->ltd_mutex); + tgt = LTD_TGT(ltd, idx); /* check that the index is allocated in the bitmap */ - if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx) || - !LTD_TGT(ltd, idx)) { + if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx) || !tgt) { CERROR("%s: device %d is not set up\n", obd->obd_name, idx); GOTO(out, rc = -EINVAL); } /* check that the UUID matches */ - if (!obd_uuid_equals(&uuid, &LTD_TGT(ltd, idx)->ltd_uuid)) { + if (!obd_uuid_equals(&uuid, &tgt->ltd_uuid)) { CERROR("%s: LOD target UUID %s at index %d does not match %s\n", - obd->obd_name, obd_uuid2str(&LTD_TGT(ltd,idx)->ltd_uuid), - idx, osp); + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), idx, osp); GOTO(out, rc = -EINVAL); } - 
__lod_del_device(env, lod, ltd, idx, for_ost); + __lod_del_device(env, lod, ltd, tgt); EXIT; out: mutex_unlock(&ltd->ltd_mutex); @@ -597,8 +464,6 @@ int lod_ea_store_resize(struct lod_thread_info *info, size_t size) { __u32 round = size_roundup_power2(size); - LASSERT(round <= - lov_mds_md_size(LOV_MAX_STRIPE_COUNT, LOV_MAGIC_V3)); if (info->lti_ea_store) { LASSERT(info->lti_ea_store_size); LASSERT(info->lti_ea_store_size < round); @@ -854,6 +719,11 @@ static int lod_gen_component_ea(const struct lu_env *env, objs = &v3->lmm_objects[0]; } stripe_count = lod_comp_entry_stripe_count(lo, lod_comp, is_dir); + if (stripe_count == 0 && !is_dir && + !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) && + !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) + RETURN(-E2BIG); + if (!is_dir && lo->ldo_is_composite) lod_comp_shrink_stripe_count(lod_comp, &stripe_count); @@ -937,7 +807,7 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo, struct lov_comp_md_v1 *lcm; struct lod_layout_component *comp_entries; __u16 comp_cnt, mirror_cnt; - bool is_composite; + bool is_composite, is_foreign = false; int i, rc = 0, offset; ENTRY; @@ -952,9 +822,27 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo, mirror_cnt = lo->ldo_mirror_count; comp_entries = lo->ldo_comp_entries; is_composite = lo->ldo_is_composite; + is_foreign = lo->ldo_is_foreign; } LASSERT(lmm_size != NULL); + + if (is_foreign) { + struct lov_foreign_md *lfm; + + lfm = (struct lov_foreign_md *)lmm; + memcpy(lfm, lo->ldo_foreign_lov, lo->ldo_foreign_lov_size); + /* need to store little-endian */ + if (cpu_to_le32(LOV_MAGIC_FOREIGN) != LOV_MAGIC_FOREIGN) { + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + } + *lmm_size = lo->ldo_foreign_lov_size; + RETURN(0); + } + LASSERT(comp_cnt != 0 && comp_entries != NULL); if (!is_composite) { @@ -986,6 +874,12 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object 
*lo, /* component could be un-inistantiated */ lcme->lcme_flags = cpu_to_le32(lod_comp->llc_flags); + if (lod_comp->llc_flags & LCME_FL_NOSYNC) + lcme->lcme_timestamp = + cpu_to_le64(lod_comp->llc_timestamp); + if (lod_comp->llc_flags & LCME_FL_EXTENSION && !is_dir) + lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_SEL); + lcme->lcme_extent.e_start = cpu_to_le64(lod_comp->llc_extent.e_start); lcme->lcme_extent.e_end = @@ -1093,7 +987,7 @@ static int validate_lod_and_idx(struct lod_device *md, __u32 idx) return -EINVAL; } - if (unlikely(OST_TGT(md, idx)->ltd_ost == NULL)) { + if (unlikely(OST_TGT(md, idx)->ltd_tgt == NULL)) { CERROR("%s: invalid lod device, for idx: %d\n", lod2obd(md)->obd_name , idx); return -EINVAL; @@ -1171,7 +1065,7 @@ int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo, GOTO(out, rc); } - nd = &OST_TGT(md,idx)->ltd_ost->dd_lu_dev; + nd = &OST_TGT(md, idx)->ltd_tgt->dd_lu_dev; lod_putref(md, &md->lod_ost_descs); /* In the function below, .hs_keycmp resolves to @@ -1224,13 +1118,14 @@ out: int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, const struct lu_buf *buf) { - struct lov_mds_md_v1 *lmm; - struct lov_comp_md_v1 *comp_v1 = NULL; - struct lov_ost_data_v1 *objs; - __u32 magic, pattern; - int i, j, rc = 0; - __u16 comp_cnt; - __u16 mirror_cnt = 0; + struct lov_mds_md_v1 *lmm; + struct lov_comp_md_v1 *comp_v1 = NULL; + struct lov_foreign_md *foreign = NULL; + struct lov_ost_data_v1 *objs; + __u32 magic, pattern; + __u16 mirror_cnt = 0; + __u16 comp_cnt; + int i, rc; ENTRY; LASSERT(buf); @@ -1242,12 +1137,16 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, magic = le32_to_cpu(lmm->lmm_magic); if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 && - magic != LOV_MAGIC_COMP_V1) + magic != LOV_MAGIC_COMP_V1 && magic != LOV_MAGIC_FOREIGN && + magic != LOV_MAGIC_SEL) GOTO(out, rc = -EINVAL); - lod_free_comp_entries(lo); + if (lo->ldo_is_foreign) + lod_free_foreign_lov(lo); + else + 
lod_free_comp_entries(lo); - if (magic == LOV_MAGIC_COMP_V1) { + if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) { comp_v1 = (struct lov_comp_md_v1 *)lmm; comp_cnt = le16_to_cpu(comp_v1->lcm_entry_count); if (comp_cnt == 0) @@ -1257,6 +1156,25 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, lo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) & LCM_FL_FLR_MASK; mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1; + } else if (magic == LOV_MAGIC_FOREIGN) { + size_t length; + + foreign = (struct lov_foreign_md *)buf->lb_buf; + length = offsetof(typeof(*foreign), lfm_value); + if (buf->lb_len < length || + buf->lb_len < (length + le32_to_cpu(foreign->lfm_length))) { + CDEBUG(D_LAYOUT, + "buf len %zu too small for lov_foreign_md\n", + buf->lb_len); + GOTO(out, rc = -EINVAL); + } + + /* just cache foreign LOV EA raw */ + rc = lod_alloc_foreign_lov(lo, length); + if (rc) + GOTO(out, rc); + memcpy(lo->ldo_foreign_lov, buf->lb_buf, length); + GOTO(out, rc); } else { comp_cnt = 1; lo->ldo_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); @@ -1268,15 +1186,14 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, GOTO(out, rc); for (i = 0; i < comp_cnt; i++) { - struct lod_layout_component *lod_comp; - struct lu_extent *ext; - __u32 offs; + struct lod_layout_component *lod_comp; + struct lu_extent *ext; + __u32 offs; lod_comp = &lo->ldo_comp_entries[i]; if (lo->ldo_is_composite) { offs = le32_to_cpu(comp_v1->lcm_entries[i].lcme_offset); lmm = (struct lov_mds_md_v1 *)((char *)comp_v1 + offs); - magic = le32_to_cpu(lmm->lmm_magic); ext = &comp_v1->lcm_entries[i].lcme_extent; lod_comp->llc_extent.e_start = @@ -1284,17 +1201,31 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, lod_comp->llc_extent.e_end = le64_to_cpu(ext->e_end); lod_comp->llc_flags = le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags); + if (lod_comp->llc_flags & LCME_FL_NOSYNC) + lod_comp->llc_timestamp = le64_to_cpu( + 
comp_v1->lcm_entries[i].lcme_timestamp); lod_comp->llc_id = le32_to_cpu(comp_v1->lcm_entries[i].lcme_id); if (lod_comp->llc_id == LCME_ID_INVAL) GOTO(out, rc = -EINVAL); + + if ((lod_comp->llc_flags & LCME_FL_EXTENSION) && + comp_v1->lcm_magic != cpu_to_le32(LOV_MAGIC_SEL)) { + struct lod_device *d = + lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + + CWARN("%s: EXTENSION flags=%x set on component[%u]=%x of non-SEL file "DFID" with magic=%#08x\n", + lod2obd(d)->obd_name, + lod_comp->llc_flags, lod_comp->llc_id, i, + PFID(lod_object_fid(lo)), + le32_to_cpu(comp_v1->lcm_magic)); + } } else { lod_comp_set_init(lod_comp); } pattern = le32_to_cpu(lmm->lmm_pattern); - if (lov_pattern(pattern) != LOV_PATTERN_RAID0 && - lov_pattern(pattern) != LOV_PATTERN_MDT) + if (!lov_pattern_supported(lov_pattern(pattern))) GOTO(out, rc = -EINVAL); lod_comp->llc_pattern = pattern; @@ -1302,8 +1233,9 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, lod_comp->llc_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); lod_comp->llc_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); - if (magic == LOV_MAGIC_V3) { + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { struct lov_mds_md_v3 *v3 = (struct lov_mds_md_v3 *)lmm; + lod_set_pool(&lod_comp->llc_pool, v3->lmm_pool_name); objs = &v3->lmm_objects[0]; } else { @@ -1319,8 +1251,14 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo, __u16 stripe_count; if (objs[0].l_ost_idx != (__u32)-1UL) { + int j; + stripe_count = lod_comp_entry_stripe_count( lo, lod_comp, false); + if (stripe_count == 0 && + !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) && + !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) + GOTO(out, rc = -E2BIG); /** * load the user specified ost list, when this * component is instantiated later, it will be @@ -1454,7 +1392,23 @@ int lod_striping_load(const struct lu_env *env, struct lod_object *lo) lo->ldo_comp_cached = 1; } else if (S_ISDIR(lod2lu_obj(lo)->lo_header->loh_attr)) { rc = 
lod_get_lmv_ea(env, lo); - if (rc < (typeof(rc))sizeof(struct lmv_mds_md_v1)) { + if (rc > sizeof(struct lmv_foreign_md)) { + struct lmv_foreign_md *lfm = info->lti_ea_store; + + if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN) { + lo->ldo_foreign_lmv = info->lti_ea_store; + lo->ldo_foreign_lmv_size = + info->lti_ea_store_size; + info->lti_ea_store = NULL; + info->lti_ea_store_size = 0; + + lo->ldo_dir_stripe_loaded = 1; + lo->ldo_dir_is_foreign = 1; + GOTO(unlock, rc = 0); + } + } + + if (rc < (int)sizeof(struct lmv_mds_md_v1)) { /* Let's set stripe_loaded to avoid further * stripe loading especially for non-stripe directory, * which can hurt performance. (See LU-9840) @@ -1584,9 +1538,9 @@ static int lod_verify_v1v3(struct lod_device *d, const struct lu_buf *buf, if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT && lov_pattern(le32_to_cpu(lum->lmm_pattern)) != LOV_PATTERN_MDT) { /* if offset is not within valid range [0, osts_size) */ - if (stripe_offset >= d->lod_osts_size) { + if (stripe_offset >= d->lod_ost_descs.ltd_tgts_size) { CDEBUG(D_LAYOUT, "stripe offset %u >= bitmap size %u\n", - stripe_offset, d->lod_osts_size); + stripe_offset, d->lod_ost_descs.ltd_tgts_size); GOTO(out, rc = -EINVAL); } @@ -1664,9 +1618,10 @@ struct lov_comp_md_entry_v1 *comp_entry_v1(struct lov_comp_md_v1 *comp, int i) le16_to_cpu(comp->lcm_entry_count) - 1); \ entry++) -int lod_erase_dom_stripe(struct lov_comp_md_v1 *comp_v1) +int lod_erase_dom_stripe(struct lov_comp_md_v1 *comp_v1, + struct lov_comp_md_entry_v1 *dom_ent) { - struct lov_comp_md_entry_v1 *ent, *dom_ent; + struct lov_comp_md_entry_v1 *ent; __u16 entries; __u32 dom_off, dom_size, comp_size; void *blob_src, *blob_dst; @@ -1678,7 +1633,6 @@ int lod_erase_dom_stripe(struct lov_comp_md_v1 *comp_v1) return -EFBIG; comp_size = le32_to_cpu(comp_v1->lcm_size); - dom_ent = &comp_v1->lcm_entries[0]; dom_off = le32_to_cpu(dom_ent->lcme_offset); dom_size = le32_to_cpu(dom_ent->lcme_size); @@ -1708,16 +1662,16 @@ int 
lod_erase_dom_stripe(struct lov_comp_md_v1 *comp_v1) return -ERESTART; } -int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1) +int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1, + struct lov_comp_md_entry_v1 *dom_ent) { - struct lov_comp_md_entry_v1 *ent, *dom_ent; + struct lov_comp_md_entry_v1 *ent; struct lu_extent *dom_ext, *ext; struct lov_user_md_v1 *lum; __u32 stripe_size; __u16 mid, dom_mid; int rc = 0; - dom_ent = &comp_v1->lcm_entries[0]; dom_ext = &dom_ent->lcme_extent; dom_mid = mirror_id_of(le32_to_cpu(dom_ent->lcme_id)); stripe_size = d->lod_dom_max_stripesize; @@ -1754,7 +1708,7 @@ int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1) if (stripe_size == 0) { /* DoM component size is zero due to server setting, * remove it from the layout */ - rc = lod_erase_dom_stripe(comp_v1); + rc = lod_erase_dom_stripe(comp_v1, dom_ent); } else { /* Update DoM extent end finally */ dom_ext->e_end = cpu_to_le64(stripe_size); @@ -1778,7 +1732,7 @@ int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1) int lod_verify_striping(struct lod_device *d, struct lod_object *lo, const struct lu_buf *buf, bool is_from_disk) { - struct lov_desc *desc = &d->lod_desc; + struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc; struct lov_user_md_v1 *lum; struct lov_comp_md_v1 *comp_v1; struct lov_comp_md_entry_v1 *ent; @@ -1792,15 +1746,47 @@ int lod_verify_striping(struct lod_device *d, struct lod_object *lo, int rc = 0; ENTRY; + if (buf->lb_len < sizeof(lum->lmm_magic)) { + CDEBUG(D_LAYOUT, "invalid buf len %zu\n", buf->lb_len); + RETURN(-EINVAL); + } + lum = buf->lb_buf; + magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED; + /* treat foreign LOV EA/object case first + * XXX is it expected to try setting again a foreign? + * XXX should we care about different current vs new layouts ? 
+ */ + if (unlikely(magic == LOV_USER_MAGIC_FOREIGN)) { + struct lov_foreign_md *lfm = buf->lb_buf; + + if (buf->lb_len < offsetof(typeof(*lfm), lfm_value)) { + CDEBUG(D_LAYOUT, + "buf len %zu < min lov_foreign_md size (%zu)\n", + buf->lb_len, offsetof(typeof(*lfm), + lfm_value)); + RETURN(-EINVAL); + } + + if (foreign_size_le(lfm) > buf->lb_len) { + CDEBUG(D_LAYOUT, + "buf len %zu < this lov_foreign_md size (%zu)\n", + buf->lb_len, foreign_size_le(lfm)); + RETURN(-EINVAL); + } + /* Don't do anything with foreign layouts */ + RETURN(0); + } + + /* normal LOV/layout cases */ + if (buf->lb_len < sizeof(*lum)) { CDEBUG(D_LAYOUT, "buf len %zu too small for lov_user_md\n", buf->lb_len); RETURN(-EINVAL); } - magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED; if (magic != LOV_USER_MAGIC_V1 && magic != LOV_USER_MAGIC_V3 && magic != LOV_USER_MAGIC_SPECIFIC && @@ -1842,7 +1828,7 @@ recheck: for_each_comp_entry_v1(comp_v1, ent) { ext = &ent->lcme_extent; - if (le64_to_cpu(ext->e_start) >= le64_to_cpu(ext->e_end)) { + if (le64_to_cpu(ext->e_start) > le64_to_cpu(ext->e_end)) { CDEBUG(D_LAYOUT, "invalid extent "DEXT"\n", le64_to_cpu(ext->e_start), le64_to_cpu(ext->e_end)); @@ -1895,7 +1881,7 @@ recheck: lum = tmp.lb_buf; if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) == LOV_PATTERN_MDT) { - /* DoM component can be only the first stripe */ + /* DoM component must be the first in a mirror */ if (le64_to_cpu(ext->e_start) > 0) { CDEBUG(D_LAYOUT, "invalid DoM component " "with %llu extent start\n", @@ -1918,7 +1904,7 @@ recheck: "%u is bigger than MDT limit %u, check " "dom_max_stripesize parameter\n", stripe_size, d->lod_dom_max_stripesize); - rc = lod_fix_dom_stripe(d, comp_v1); + rc = lod_fix_dom_stripe(d, comp_v1, ent); if (rc == -ERESTART) { /* DoM entry was removed, re-check * new layout from start */ @@ -1942,7 +1928,7 @@ recheck: stripe_size = le32_to_cpu(lum->lmm_stripe_size); if (stripe_size == 0) stripe_size = desc->ld_default_stripe_size; - if (stripe_size == 0 || 
(prev_end & (stripe_size - 1))) { + if (prev_end % stripe_size) { CDEBUG(D_LAYOUT, "stripe size isn't aligned, " "stripe_sz: %u, [%llu, %llu)\n", stripe_size, ext->e_start, prev_end); @@ -2009,9 +1995,16 @@ void lod_fix_desc_stripe_count(__u32 *val) void lod_fix_desc_pattern(__u32 *val) { /* from lov_setstripe */ - if ((*val != 0) && (*val != LOV_PATTERN_RAID0) && - (*val != LOV_PATTERN_MDT)) { - LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + if ((*val != 0) && !lov_pattern_supported_normal_comp(*val)) { + LCONSOLE_WARN("lod: Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lod_fix_lmv_desc_pattern(__u32 *val) +{ + if ((*val) && !lmv_is_known_hash_type(*val)) { + LCONSOLE_WARN("lod: Unknown md stripe pattern: %#x\n", *val); *val = 0; } } @@ -2036,6 +2029,14 @@ void lod_fix_desc(struct lov_desc *desc) lod_fix_desc_qos_maxage(&desc->ld_qos_maxage); } +static void lod_fix_lmv_desc(struct lmv_desc *desc) +{ + desc->ld_active_tgt_count = 0; + lod_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lod_fix_lmv_desc_pattern(&desc->ld_pattern); + lod_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + /** * Initialize the structures used to store pools and default striping. * @@ -2084,20 +2085,12 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) lod_fix_desc(desc); desc->ld_active_tgt_count = 0; - lod->lod_desc = *desc; + lod->lod_ost_descs.ltd_lov_desc = *desc; - lod->lod_sp_me = LUSTRE_SP_CLI; + /* NB: config doesn't contain lmv_desc, alter it via sysfs. 
*/ + lod_fix_lmv_desc(&lod->lod_mdt_descs.ltd_lmv_desc); - /* Set up allocation policy (QoS and RR) */ - INIT_LIST_HEAD(&lod->lod_qos.lq_oss_list); - init_rwsem(&lod->lod_qos.lq_rw_sem); - lod->lod_qos.lq_dirty = 1; - lod->lod_qos.lq_rr.lqr_dirty = 1; - lod->lod_qos.lq_reset = 1; - /* Default priority is toward free space balance */ - lod->lod_qos.lq_prio_free = 232; - /* Default threshold for rr (roughly 17%) */ - lod->lod_qos.lq_threshold_rr = 43; + lod->lod_sp_me = LUSTRE_SP_CLI; /* Set up OST pool environment */ lod->lod_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, @@ -2112,18 +2105,30 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) INIT_LIST_HEAD(&lod->lod_pool_list); lod->lod_pool_count = 0; - rc = lod_ost_pool_init(&lod->lod_pool_info, 0); + rc = lod_tgt_pool_init(&lod->lod_mdt_descs.ltd_tgt_pool, 0); if (rc) GOTO(out_hash, rc); - lod_qos_rr_init(&lod->lod_qos.lq_rr); - rc = lod_ost_pool_init(&lod->lod_qos.lq_rr.lqr_pool, 0); + + rc = lod_tgt_pool_init(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool, 0); + if (rc) + GOTO(out_mdt_pool, rc); + + rc = lod_tgt_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0); + if (rc) + GOTO(out_mdt_rr_pool, rc); + + rc = lod_tgt_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0); if (rc) - GOTO(out_pool_info, rc); + GOTO(out_ost_pool, rc); RETURN(0); -out_pool_info: - lod_ost_pool_free(&lod->lod_pool_info); +out_ost_pool: + lod_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); +out_mdt_rr_pool: + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool); +out_mdt_pool: + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool); out_hash: cfs_hash_putref(lod->lod_pools_hash_body); @@ -2153,8 +2158,10 @@ int lod_pools_fini(struct lod_device *lod) } cfs_hash_putref(lod->lod_pools_hash_body); - lod_ost_pool_free(&(lod->lod_qos.lq_rr.lqr_pool)); - lod_ost_pool_free(&lod->lod_pool_info); + lod_tgt_pool_free(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool); + 
lod_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool); + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool); RETURN(0); }