4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2017, Intel Corporation.
29 * lustre/lod/lod_object.c
31 * This file contains implementations of methods for the OSD API
32 * for the Logical Object Device (LOD) layer, which provides a virtual
33 * local OSD object interface to the MDD layer, and abstracts the
34 * addressing of local (OSD) and remote (OSP) objects. The API is
35 * described in the file lustre/include/dt_object.h and in
36 * Documentation/osd-api.txt.
38 * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
41 #define DEBUG_SUBSYSTEM S_MDS
43 #include <linux/random.h>
46 #include <obd_class.h>
47 #include <obd_support.h>
49 #include <lustre_fid.h>
50 #include <lustre_linkea.h>
51 #include <lustre_lmv.h>
52 #include <uapi/linux/lustre/lustre_param.h>
53 #include <lustre_swab.h>
54 #include <uapi/linux/lustre/lustre_ver.h>
55 #include <lprocfs_status.h>
56 #include <md_object.h>
58 #include "lod_internal.h"
60 static const char dot[] = ".";
61 static const char dotdot[] = "..";
64 * Implementation of dt_index_operations::dio_lookup
66 * Used with regular (non-striped) objects.
68 * \see dt_index_operations::dio_lookup() in the API description for details.
/*
 * Look up \a key on behalf of a non-striped object: obtain the object
 * returned by dt_object_child(dt) and return the result of that object's
 * dio_lookup(), which fills \a rec.
 */
70 static int lod_lookup(const struct lu_env *env, struct dt_object *dt,
71 struct dt_rec *rec, const struct dt_key *key)
73 struct dt_object *next = dt_object_child(dt);
74 return next->do_index_ops->dio_lookup(env, next, rec, key);
78 * Implementation of dt_index_operations::dio_declare_insert.
80 * Used with regular (non-striped) objects.
82 * \see dt_index_operations::dio_declare_insert() in the API description
/*
 * Declare an index insert within transaction \a th. The call is forwarded
 * unchanged to lod_sub_declare_insert() on dt_object_child(dt) and its
 * return value is passed back to the caller.
 */
85 static int lod_declare_insert(const struct lu_env *env, struct dt_object *dt,
86 const struct dt_rec *rec,
87 const struct dt_key *key, struct thandle *th)
89 return lod_sub_declare_insert(env, dt_object_child(dt), rec, key, th);
93 * Implementation of dt_index_operations::dio_insert.
95 * Used with regular (non-striped) objects
97 * \see dt_index_operations::dio_insert() in the API description for details.
/*
 * Insert the (\a key, \a rec) pair by forwarding to lod_sub_insert() on
 * dt_object_child(dt); the sub-call's return value is returned as is.
 * NOTE(review): the parameter line declaring `th` is not visible in this
 * listing -- confirm its type against the full source.
 */
99 static int lod_insert(const struct lu_env *env, struct dt_object *dt,
100 const struct dt_rec *rec, const struct dt_key *key,
103 return lod_sub_insert(env, dt_object_child(dt), rec, key, th);
107 * Implementation of dt_index_operations::dio_declare_delete.
109 * Used with regular (non-striped) objects.
111 * \see dt_index_operations::dio_declare_delete() in the API description
/*
 * Declare removal of \a key within transaction \a th by forwarding to
 * lod_sub_declare_delete() on dt_object_child(dt).
 */
114 static int lod_declare_delete(const struct lu_env *env, struct dt_object *dt,
115 const struct dt_key *key, struct thandle *th)
117 return lod_sub_declare_delete(env, dt_object_child(dt), key, th);
121 * Implementation of dt_index_operations::dio_delete.
123 * Used with regular (non-striped) objects.
125 * \see dt_index_operations::dio_delete() in the API description for details.
/*
 * Remove \a key within transaction \a th by forwarding to lod_sub_delete()
 * on dt_object_child(dt); the sub-call's return value is returned as is.
 */
127 static int lod_delete(const struct lu_env *env, struct dt_object *dt,
128 const struct dt_key *key, struct thandle *th)
130 return lod_sub_delete(env, dt_object_child(dt), key, th);
134 * Implementation of dt_it_ops::init.
136 * Used with regular (non-striped) objects.
138 * \see dt_it_ops::init() in the API description for details.
/*
 * Create an iterator for a non-striped object. The underlying iterator comes
 * from dio_it.init() of dt_object_child(dt) and is stored in the lod_it slot
 * embedded in lod_env_info(env) (lti_it); a pointer to that slot, cast to
 * struct dt_it *, is what the caller receives. The LASSERT requires the slot
 * to be unused (lit_obj == NULL) on entry.
 * NOTE(review): the handling of a failed dio_it.init() and the assignment of
 * it->lit_obj are not visible in this listing -- confirm in the full source.
 */
140 static struct dt_it *lod_it_init(const struct lu_env *env,
141 struct dt_object *dt, __u32 attr)
143 struct dt_object *next = dt_object_child(dt);
144 struct lod_it *it = &lod_env_info(env)->lti_it;
145 struct dt_it *it_next;
147 it_next = next->do_index_ops->dio_it.init(env, next, attr);
151 /* currently we do not use more than one iterator per thread
152 * so we store it in thread info. if at some point we need
153 * more active iterators in a single thread, we can allocate
155 LASSERT(it->lit_obj == NULL);
157 it->lit_it = it_next;
160 return (struct dt_it *)it;
/*
 * Sanity check used by the non-striped iterator methods: asserts that both
 * lit_obj and lit_it of \a it are non-NULL. \a env is accepted but is not
 * referenced by the lines visible here.
 */
163 #define LOD_CHECK_IT(env, it) \
165 LASSERT((it)->lit_obj != NULL); \
166 LASSERT((it)->lit_it != NULL); \
170 * Implementation of dt_index_operations::dio_it.fini.
172 * Used with regular (non-striped) objects.
174 * \see dt_index_operations::dio_it.fini() in the API description for details.
/*
 * Finish a non-striped iterator: after LOD_CHECK_IT(), pass the underlying
 * iterator (it->lit_it) to dio_it.fini() of the object recorded in
 * it->lit_obj.
 * NOTE(review): the statements that mark the lod_it slot as free again are
 * not visible in this listing.
 */
176 static void lod_it_fini(const struct lu_env *env, struct dt_it *di)
178 struct lod_it *it = (struct lod_it *)di;
180 LOD_CHECK_IT(env, it);
181 it->lit_obj->do_index_ops->dio_it.fini(env, it->lit_it);
183 /* the iterator not in use any more */
189 * Implementation of dt_it_ops::get.
191 * Used with regular (non-striped) objects.
193 * \see dt_it_ops::get() in the API description for details.
/*
 * Position the iterator at \a key: after LOD_CHECK_IT(), forward to
 * dio_it.get() of it->lit_obj with the underlying iterator it->lit_it and
 * return its result.
 */
195 static int lod_it_get(const struct lu_env *env, struct dt_it *di,
196 const struct dt_key *key)
198 const struct lod_it *it = (const struct lod_it *)di;
200 LOD_CHECK_IT(env, it);
201 return it->lit_obj->do_index_ops->dio_it.get(env, it->lit_it, key);
205 * Implementation of dt_it_ops::put.
207 * Used with regular (non-striped) objects.
209 * \see dt_it_ops::put() in the API description for details.
/*
 * Release the iterator position: after LOD_CHECK_IT(), forward to
 * dio_it.put() of it->lit_obj with the underlying iterator it->lit_it.
 * NOTE(review): `return expr;` inside a void function is not strict ISO C
 * (C11 6.8.6.4) even if expr has type void; presumably accepted by the
 * project's compilers -- confirm.
 */
211 static void lod_it_put(const struct lu_env *env, struct dt_it *di)
213 struct lod_it *it = (struct lod_it *)di;
215 LOD_CHECK_IT(env, it);
216 return it->lit_obj->do_index_ops->dio_it.put(env, it->lit_it);
220 * Implementation of dt_it_ops::next.
222 * Used with regular (non-striped) objects
224 * \see dt_it_ops::next() in the API description for details.
/*
 * Advance the iterator: after LOD_CHECK_IT(), forward to dio_it.next() of
 * it->lit_obj with the underlying iterator and return its result.
 */
226 static int lod_it_next(const struct lu_env *env, struct dt_it *di)
228 struct lod_it *it = (struct lod_it *)di;
230 LOD_CHECK_IT(env, it);
231 return it->lit_obj->do_index_ops->dio_it.next(env, it->lit_it);
235 * Implementation of dt_it_ops::key.
237 * Used with regular (non-striped) objects.
239 * \see dt_it_ops::key() in the API description for details.
/*
 * Return the key at the current position: after LOD_CHECK_IT(), forward to
 * dio_it.key() of it->lit_obj with the underlying iterator.
 */
241 static struct dt_key *lod_it_key(const struct lu_env *env,
242 const struct dt_it *di)
244 const struct lod_it *it = (const struct lod_it *)di;
246 LOD_CHECK_IT(env, it);
247 return it->lit_obj->do_index_ops->dio_it.key(env, it->lit_it);
251 * Implementation of dt_it_ops::key_size.
253 * Used with regular (non-striped) objects.
255 * \see dt_it_ops::key_size() in the API description for details.
/*
 * Return the size of the current key: after LOD_CHECK_IT(), forward to
 * dio_it.key_size() of it->lit_obj with the underlying iterator.
 */
257 static int lod_it_key_size(const struct lu_env *env, const struct dt_it *di)
259 struct lod_it *it = (struct lod_it *)di;
261 LOD_CHECK_IT(env, it);
262 return it->lit_obj->do_index_ops->dio_it.key_size(env, it->lit_it);
266 * Implementation of dt_it_ops::rec.
268 * Used with regular (non-striped) objects.
270 * \see dt_it_ops::rec() in the API description for details.
/*
 * Fetch the record at the current position into \a rec: after
 * LOD_CHECK_IT(), forward to dio_it.rec() of it->lit_obj with the
 * underlying iterator.
 * NOTE(review): the final argument line of the call is not visible in this
 * listing; presumably it passes \a attr.
 */
272 static int lod_it_rec(const struct lu_env *env, const struct dt_it *di,
273 struct dt_rec *rec, __u32 attr)
275 const struct lod_it *it = (const struct lod_it *)di;
277 LOD_CHECK_IT(env, it);
278 return it->lit_obj->do_index_ops->dio_it.rec(env, it->lit_it, rec,
283 * Implementation of dt_it_ops::rec_size.
285 * Used with regular (non-striped) objects.
287 * \see dt_it_ops::rec_size() in the API description for details.
/*
 * Return the size of the current record: after LOD_CHECK_IT(), forward to
 * dio_it.rec_size() of it->lit_obj with the underlying iterator.
 * NOTE(review): the last parameter line and the last argument line of the
 * call are not visible in this listing.
 */
289 static int lod_it_rec_size(const struct lu_env *env, const struct dt_it *di,
292 const struct lod_it *it = (const struct lod_it *)di;
294 LOD_CHECK_IT(env, it);
295 return it->lit_obj->do_index_ops->dio_it.rec_size(env, it->lit_it,
300 * Implementation of dt_it_ops::store.
302 * Used with regular (non-striped) objects.
304 * \see dt_it_ops::store() in the API description for details.
/*
 * Return the 64-bit value produced by dio_it.store() of it->lit_obj for the
 * underlying iterator, after LOD_CHECK_IT().
 */
306 static __u64 lod_it_store(const struct lu_env *env, const struct dt_it *di)
308 const struct lod_it *it = (const struct lod_it *)di;
310 LOD_CHECK_IT(env, it);
311 return it->lit_obj->do_index_ops->dio_it.store(env, it->lit_it);
315 * Implementation of dt_it_ops::load.
317 * Used with regular (non-striped) objects.
319 * \see dt_it_ops::load() in the API description for details.
/*
 * Reposition the iterator from \a hash: after LOD_CHECK_IT(), forward to
 * dio_it.load() of it->lit_obj with the underlying iterator.
 * NOTE(review): the parameter line declaring `hash` is not visible in this
 * listing.
 */
321 static int lod_it_load(const struct lu_env *env, const struct dt_it *di,
324 const struct lod_it *it = (const struct lod_it *)di;
326 LOD_CHECK_IT(env, it);
327 return it->lit_obj->do_index_ops->dio_it.load(env, it->lit_it, hash);
/*
 * Index operation table built from the non-striped lod_* methods in this
 * file; lod_index_try() assigns it to dt->do_index_ops.
 * NOTE(review): several iterator entries of this initializer are not visible
 * in this listing.
 */
330 static const struct dt_index_operations lod_index_ops = {
331 .dio_lookup = lod_lookup,
332 .dio_declare_insert = lod_declare_insert,
333 .dio_insert = lod_insert,
334 .dio_declare_delete = lod_declare_delete,
335 .dio_delete = lod_delete,
343 .key_size = lod_it_key_size,
345 .rec_size = lod_it_rec_size,
346 .store = lod_it_store,
352 * Implementation of dt_index_operations::dio_lookup
354 * Used with striped directories.
356 * \see dt_index_operations::dio_lookup() in the API description for details.
/*
 * Look up a name in a striped directory (asserts ldo_dir_stripe_count > 0).
 *  - "."  : the directory's own FID (lod_object_fid(lo)) is copied into
 *           \a rec.
 *  - ".." : the lookup target becomes dt_object_child(dt).
 *  - other: __lmv_name_to_stripe_index() maps the name to a stripe index
 *           and lo->ldo_stripe[index] becomes the lookup target.
 * The final lookup is the target's dio_lookup().
 * NOTE(review): the declaration of `index`, the returns/jumps between the
 * steps above, and the action taken when the stripe is NULL or does not
 * exist are not visible in this listing.
 */
358 static int lod_striped_lookup(const struct lu_env *env, struct dt_object *dt,
359 struct dt_rec *rec, const struct dt_key *key)
361 struct lod_object *lo = lod_dt_obj(dt);
362 struct dt_object *next;
363 const char *name = (const char *)key;
365 LASSERT(lo->ldo_dir_stripe_count > 0);
367 if (strcmp(name, dot) == 0) {
368 struct lu_fid *fid = (struct lu_fid *)rec;
370 *fid = *lod_object_fid(lo);
374 if (strcmp(name, dotdot) == 0) {
375 next = dt_object_child(dt);
379 index = __lmv_name_to_stripe_index(lo->ldo_dir_hash_type,
380 lo->ldo_dir_stripe_count,
381 lo->ldo_dir_migrate_hash,
382 lo->ldo_dir_migrate_offset,
383 name, strlen(name), true);
387 next = lo->ldo_stripe[index];
388 if (!next || !dt_object_exists(next))
392 return next->do_index_ops->dio_lookup(env, next, rec, key);
396 * Implementation of dt_it_ops::init.
398 * Used with striped objects. Internally just initializes the iterator
399 * on the first stripe.
401 * \see dt_it_ops::init() in the API description for details.
/*
 * Create an iterator for a striped directory. Stripes are scanned by
 * increasing index until one is non-NULL and exists; if the scan ends
 * without such a stripe, ERR_PTR(-ENODEV) is returned. Otherwise the
 * stripe's dio_it.init() creates the underlying iterator, which is recorded
 * together with the stripe index in the lod_it slot of lod_env_info(env);
 * that slot (cast to struct dt_it *) is returned. The slot must be unused
 * (lit_obj == NULL) on entry.
 * NOTE(review): the declaration/initial value of `index`, the loop exit
 * statement, the check of dio_it.init()'s result and the assignment of
 * it->lit_obj are not visible in this listing.
 */
403 static struct dt_it *lod_striped_it_init(const struct lu_env *env,
404 struct dt_object *dt, __u32 attr)
406 struct lod_object *lo = lod_dt_obj(dt);
407 struct dt_object *next;
408 struct lod_it *it = &lod_env_info(env)->lti_it;
409 struct dt_it *it_next;
412 LASSERT(lo->ldo_dir_stripe_count > 0);
415 next = lo->ldo_stripe[index];
416 if (next && dt_object_exists(next))
418 } while (++index < lo->ldo_dir_stripe_count);
420 /* no valid stripe */
421 if (!next || !dt_object_exists(next))
422 return ERR_PTR(-ENODEV);
424 LASSERT(next->do_index_ops != NULL);
426 it_next = next->do_index_ops->dio_it.init(env, next, attr);
430 /* currently we do not use more than one iterator per thread
431 * so we store it in thread info. if at some point we need
432 * more active iterators in a single thread, we can allocate
434 LASSERT(it->lit_obj == NULL);
436 it->lit_stripe_index = index;
438 it->lit_it = it_next;
441 return (struct dt_it *)it;
/*
 * Sanity check used by the striped iterator methods: asserts that lit_obj
 * and lit_it of \a it are set, that \a lo has at least one directory stripe,
 * and that it->lit_stripe_index is below lo->ldo_dir_stripe_count. \a env is
 * not referenced by the lines visible here.
 */
444 #define LOD_CHECK_STRIPED_IT(env, it, lo) \
446 LASSERT((it)->lit_obj != NULL); \
447 LASSERT((it)->lit_it != NULL); \
448 LASSERT((lo)->ldo_dir_stripe_count > 0); \
449 LASSERT((it)->lit_stripe_index < (lo)->ldo_dir_stripe_count); \
453 * Implementation of dt_it_ops::fini.
455 * Used with striped objects.
457 * \see dt_it_ops::fini() in the API description for details.
/*
 * Finish a striped iterator. Only when an underlying iterator is still
 * attached (lit_it != NULL) is it validated and handed to dio_it.fini() of
 * the stripe at lit_stripe_index. The stripe index is reset to 0 afterwards.
 * NOTE(review): the remaining statements that reset the lod_it slot are not
 * visible in this listing.
 */
459 static void lod_striped_it_fini(const struct lu_env *env, struct dt_it *di)
461 struct lod_it *it = (struct lod_it *)di;
462 struct lod_object *lo = lod_dt_obj(it->lit_obj);
463 struct dt_object *next;
465 /* If lit_it == NULL, then it means the sub_it has been finished,
466 * which only happens in failure cases, see lod_striped_it_next() */
467 if (it->lit_it != NULL) {
468 LOD_CHECK_STRIPED_IT(env, it, lo);
470 next = lo->ldo_stripe[it->lit_stripe_index];
472 LASSERT(next->do_index_ops != NULL);
473 next->do_index_ops->dio_it.fini(env, it->lit_it);
477 /* the iterator not in use any more */
480 it->lit_stripe_index = 0;
484 * Implementation of dt_it_ops::get.
486 * Right now it's not used widely, only to reset the iterator to the
487 * initial position. It should be possible to implement a full version
488 * which chooses a correct stripe to be able to position with any key.
490 * \see dt_it_ops::get() in the API description for details.
/*
 * Position the striped iterator at \a key within the current stripe only:
 * after LOD_CHECK_STRIPED_IT(), the stripe at it->lit_stripe_index is
 * asserted to be non-NULL, existing and index-capable, and the call is
 * forwarded to its dio_it.get().
 */
492 static int lod_striped_it_get(const struct lu_env *env, struct dt_it *di,
493 const struct dt_key *key)
495 const struct lod_it *it = (const struct lod_it *)di;
496 struct lod_object *lo = lod_dt_obj(it->lit_obj);
497 struct dt_object *next;
499 LOD_CHECK_STRIPED_IT(env, it, lo);
501 next = lo->ldo_stripe[it->lit_stripe_index];
502 LASSERT(next != NULL);
503 LASSERT(dt_object_exists(next));
504 LASSERT(next->do_index_ops != NULL);
506 return next->do_index_ops->dio_it.get(env, it->lit_it, key);
510 * Implementation of dt_it_ops::put.
512 * Used with striped objects.
514 * \see dt_it_ops::put() in the API description for details.
/*
 * Release the position of a striped iterator by forwarding to dio_it.put()
 * of the stripe at it->lit_stripe_index, after LOD_CHECK_STRIPED_IT().
 * NOTE(review): the guard for the lit_it == NULL case described by the
 * in-body comment is not visible in this listing. Also, `return expr;` in a
 * void function is not strict ISO C (C11 6.8.6.4) -- presumably accepted by
 * the project's compilers.
 */
516 static void lod_striped_it_put(const struct lu_env *env, struct dt_it *di)
518 struct lod_it *it = (struct lod_it *)di;
519 struct lod_object *lo = lod_dt_obj(it->lit_obj);
520 struct dt_object *next;
523 * If lit_it == NULL, then it means the sub_it has been finished,
524 * which only happens in failure cases, see lod_striped_it_next()
529 LOD_CHECK_STRIPED_IT(env, it, lo);
531 next = lo->ldo_stripe[it->lit_stripe_index];
532 LASSERT(next != NULL);
533 LASSERT(next->do_index_ops != NULL);
535 return next->do_index_ops->dio_it.put(env, it->lit_it);
539 * Implementation of dt_it_ops::next.
541 * Used with striped objects. When the end of the current stripe is
542 * reached, the method takes the next stripe's iterator.
544 * \see dt_it_ops::next() in the API description for details.
/*
 * Advance a striped iterator, moving on to later stripes when needed.
 *
 * Visible steps:
 *  1. dio_it.next() is called on the current stripe's iterator.
 *  2. When that returns 0 on a stripe with index > 0, the current record is
 *     read into the lti_key buffer of lod_env_info(env) and its name is
 *     compared against "." and ".." (these are skipped for slave stripes,
 *     per the in-body comment).
 *  3. The current stripe's iterator is released with dio_it.put() and
 *     dio_it.fini().
 *  4. Later stripes are visited by increasing index: do_index_try() is
 *     called with dt_directory_features, a new iterator is created with
 *     dio_it.init() and positioned with dio_it.get() on the empty key. A
 *     failing init() is reported through PTR_ERR(); a get() result of 0 is
 *     reported as -EIO. On success the new iterator and stripe index are
 *     stored in the lod_it.
 *
 * NOTE(review): the declarations of `rc` and `index`, and most of the
 * branching between these steps (returns, jumps, loop continuation, the
 * conditions guarding both RETURN()s) are not visible in this listing --
 * confirm the exact control flow against the full source.
 */
546 static int lod_striped_it_next(const struct lu_env *env, struct dt_it *di)
548 struct lod_it *it = (struct lod_it *)di;
549 struct lod_object *lo = lod_dt_obj(it->lit_obj);
550 struct dt_object *next;
551 struct dt_it *it_next;
557 LOD_CHECK_STRIPED_IT(env, it, lo);
559 next = lo->ldo_stripe[it->lit_stripe_index];
560 LASSERT(next != NULL);
561 LASSERT(dt_object_exists(next));
562 LASSERT(next->do_index_ops != NULL);
564 rc = next->do_index_ops->dio_it.next(env, it->lit_it);
568 if (rc == 0 && it->lit_stripe_index == 0)
571 if (rc == 0 && it->lit_stripe_index > 0) {
572 struct lu_dirent *ent;
574 ent = (struct lu_dirent *)lod_env_info(env)->lti_key;
576 rc = next->do_index_ops->dio_it.rec(env, it->lit_it,
577 (struct dt_rec *)ent,
582 /* skip . and .. for slave stripe */
583 if ((strncmp(ent->lde_name, ".",
584 le16_to_cpu(ent->lde_namelen)) == 0 &&
585 le16_to_cpu(ent->lde_namelen) == 1) ||
586 (strncmp(ent->lde_name, "..",
587 le16_to_cpu(ent->lde_namelen)) == 0 &&
588 le16_to_cpu(ent->lde_namelen) == 2))
594 next->do_index_ops->dio_it.put(env, it->lit_it);
595 next->do_index_ops->dio_it.fini(env, it->lit_it);
598 /* go to next stripe */
599 index = it->lit_stripe_index;
600 while (++index < lo->ldo_dir_stripe_count) {
601 next = lo->ldo_stripe[index];
605 if (!dt_object_exists(next))
608 rc = next->do_ops->do_index_try(env, next,
609 &dt_directory_features);
613 LASSERT(next->do_index_ops != NULL);
615 it_next = next->do_index_ops->dio_it.init(env, next,
618 RETURN(PTR_ERR(it_next));
620 rc = next->do_index_ops->dio_it.get(env, it_next,
621 (const struct dt_key *)"");
623 RETURN(rc == 0 ? -EIO : rc);
625 it->lit_it = it_next;
626 it->lit_stripe_index = index;
635 * Implementation of dt_it_ops::key.
637 * Used with striped objects.
639 * \see dt_it_ops::key() in the API description for details.
/*
 * Return the key at the current position of a striped iterator: after
 * LOD_CHECK_STRIPED_IT(), forward to dio_it.key() of the stripe at
 * it->lit_stripe_index (asserted non-NULL and index-capable).
 */
641 static struct dt_key *lod_striped_it_key(const struct lu_env *env,
642 const struct dt_it *di)
644 const struct lod_it *it = (const struct lod_it *)di;
645 struct lod_object *lo = lod_dt_obj(it->lit_obj);
646 struct dt_object *next;
648 LOD_CHECK_STRIPED_IT(env, it, lo);
650 next = lo->ldo_stripe[it->lit_stripe_index];
651 LASSERT(next != NULL);
652 LASSERT(next->do_index_ops != NULL);
654 return next->do_index_ops->dio_it.key(env, it->lit_it);
658 * Implementation of dt_it_ops::key_size.
660 * Used with striped objects.
662 * \see dt_it_ops::key_size() in the API description for details.
/*
 * Return the size of the current key of a striped iterator: after
 * LOD_CHECK_STRIPED_IT(), forward to dio_it.key_size() of the stripe at
 * it->lit_stripe_index (asserted non-NULL and index-capable).
 */
664 static int lod_striped_it_key_size(const struct lu_env *env,
665 const struct dt_it *di)
667 struct lod_it *it = (struct lod_it *)di;
668 struct lod_object *lo = lod_dt_obj(it->lit_obj);
669 struct dt_object *next;
671 LOD_CHECK_STRIPED_IT(env, it, lo);
673 next = lo->ldo_stripe[it->lit_stripe_index];
674 LASSERT(next != NULL);
675 LASSERT(next->do_index_ops != NULL);
677 return next->do_index_ops->dio_it.key_size(env, it->lit_it);
681 * Implementation of dt_it_ops::rec.
683 * Used with striped objects.
685 * \see dt_it_ops::rec() in the API description for details.
/*
 * Fetch the current record of a striped iterator into \a rec: after
 * LOD_CHECK_STRIPED_IT(), forward to dio_it.rec() of the stripe at
 * it->lit_stripe_index, passing \a attr through unchanged.
 */
687 static int lod_striped_it_rec(const struct lu_env *env, const struct dt_it *di,
688 struct dt_rec *rec, __u32 attr)
690 const struct lod_it *it = (const struct lod_it *)di;
691 struct lod_object *lo = lod_dt_obj(it->lit_obj);
692 struct dt_object *next;
694 LOD_CHECK_STRIPED_IT(env, it, lo);
696 next = lo->ldo_stripe[it->lit_stripe_index];
697 LASSERT(next != NULL);
698 LASSERT(next->do_index_ops != NULL);
700 return next->do_index_ops->dio_it.rec(env, it->lit_it, rec, attr);
704 * Implementation of dt_it_ops::rec_size.
706 * Used with striped objects.
708 * \see dt_it_ops::rec_size() in the API description for details.
/*
 * Return the size of the current record of a striped iterator: after
 * LOD_CHECK_STRIPED_IT(), forward to dio_it.rec_size() of the stripe at
 * it->lit_stripe_index, passing \a attr through unchanged.
 */
710 static int lod_striped_it_rec_size(const struct lu_env *env,
711 const struct dt_it *di, __u32 attr)
713 struct lod_it *it = (struct lod_it *)di;
714 struct lod_object *lo = lod_dt_obj(it->lit_obj);
715 struct dt_object *next;
717 LOD_CHECK_STRIPED_IT(env, it, lo);
719 next = lo->ldo_stripe[it->lit_stripe_index];
720 LASSERT(next != NULL);
721 LASSERT(next->do_index_ops != NULL);
723 return next->do_index_ops->dio_it.rec_size(env, it->lit_it, attr);
727 * Implementation of dt_it_ops::store.
729 * Used with striped objects.
731 * \see dt_it_ops::store() in the API description for details.
/*
 * Return the 64-bit value produced by dio_it.store() of the stripe at
 * it->lit_stripe_index for the underlying iterator, after
 * LOD_CHECK_STRIPED_IT().
 */
733 static __u64 lod_striped_it_store(const struct lu_env *env,
734 const struct dt_it *di)
736 const struct lod_it *it = (const struct lod_it *)di;
737 struct lod_object *lo = lod_dt_obj(it->lit_obj);
738 struct dt_object *next;
740 LOD_CHECK_STRIPED_IT(env, it, lo);
742 next = lo->ldo_stripe[it->lit_stripe_index];
743 LASSERT(next != NULL);
744 LASSERT(next->do_index_ops != NULL);
746 return next->do_index_ops->dio_it.store(env, it->lit_it);
750 * Implementation of dt_it_ops::load.
752 * Used with striped objects.
754 * \see dt_it_ops::load() in the API description for details.
/*
 * Reposition a striped iterator from \a hash within the current stripe:
 * after LOD_CHECK_STRIPED_IT(), forward to dio_it.load() of the stripe at
 * it->lit_stripe_index.
 */
756 static int lod_striped_it_load(const struct lu_env *env,
757 const struct dt_it *di, __u64 hash)
759 const struct lod_it *it = (const struct lod_it *)di;
760 struct lod_object *lo = lod_dt_obj(it->lit_obj);
761 struct dt_object *next;
763 LOD_CHECK_STRIPED_IT(env, it, lo);
765 next = lo->ldo_stripe[it->lit_stripe_index];
766 LASSERT(next != NULL);
767 LASSERT(next->do_index_ops != NULL);
769 return next->do_index_ops->dio_it.load(env, it->lit_it, hash);
/*
 * Index operation table for striped directories; lod_index_try() assigns it
 * to dt->do_index_ops when ldo_dir_stripe_count > 0. Lookup and every
 * iterator method use the lod_striped_* variants, while insert/delete (and
 * their declarations) reuse the non-striped lod_* functions.
 */
772 static const struct dt_index_operations lod_striped_index_ops = {
773 .dio_lookup = lod_striped_lookup,
774 .dio_declare_insert = lod_declare_insert,
775 .dio_insert = lod_insert,
776 .dio_declare_delete = lod_declare_delete,
777 .dio_delete = lod_delete,
779 .init = lod_striped_it_init,
780 .fini = lod_striped_it_fini,
781 .get = lod_striped_it_get,
782 .put = lod_striped_it_put,
783 .next = lod_striped_it_next,
784 .key = lod_striped_it_key,
785 .key_size = lod_striped_it_key_size,
786 .rec = lod_striped_it_rec,
787 .rec_size = lod_striped_it_rec_size,
788 .store = lod_striped_it_store,
789 .load = lod_striped_it_load,
794 * Append the FID for each shard of the striped directory after the
795 * given LMV EA header.
797 * To simplify striped directory and the consistency verification,
798 * we only store the LMV EA header on disk, for both master object
799 * and slave objects. When someone wants to know the whole LMV EA,
800 * such as client readdir(), we can build the entire LMV EA on the
801 * MDT side (in RAM) via iterating the sub-directory entries that
802 * are contained in the master object of the stripe directory.
804 * For the master object of the striped directory, the valid name
805 * for each shard is composed of the ${shard_FID}:${shard_idx}.
807 * There may be holes in the LMV EA if some shards' name entries
808 * are corrupted or lost.
810 * \param[in] env pointer to the thread context
811 * \param[in] lo pointer to the master object of the striped directory
812 * \param[in] buf pointer to the lu_buf which will hold the LMV EA
813 * \param[in] resize whether to re-allocate the buffer if it is not big enough
815 * \retval positive size of the LMV EA
816 * \retval 0 for nothing to be loaded
817 * \retval negative error number on failure
/*
 * Outline of the visible implementation:
 *  1. The LMV header in buf->lb_buf is read; its magic is compared with
 *     LMV_MAGIC_V1 and the stripe count is taken from lmv_stripe_count.
 *  2. The full EA size comes from lmv_mds_md_size(); when buf is smaller,
 *     a larger buffer is allocated with lu_buf_alloc() and the previous
 *     contents are copied over from tbuf.
 *  3. All lmv_stripe_fids[] slots are zeroed, then the child object is
 *     iterated as a directory (LUDA_64BITHASH).
 *  4. For every entry: "." and ".." are recognised specially; other names
 *     must start with "<FID>:" followed by decimal digits giving the shard
 *     index. Malformed names are logged and, depending on
 *     lod->lod_lmv_failout, either fail the load or are skipped (per the
 *     "failout"/"skip" log text).
 *  5. An index >= stripes, or a slot that is already non-zero (unless the
 *     OBD_FAIL_LFSCK_BAD_SLAVE_NAME fail check fires), is logged with
 *     CERROR as a failure; otherwise the entry's FID is stored, still in
 *     little-endian form, into lmv_stripe_fids[index].
 *  6. The result is lmv_mds_md_size(stripes, magic) when the final rc is
 *     positive, otherwise rc itself.
 *
 * NOTE(review): the expected "<FID>:" prefix is formatted from
 * ent->lde_fid, which fid_le_to_cpu() just above treats as little-endian,
 * rather than from the converted `fid` -- confirm this is intended on
 * big-endian hosts.
 * NOTE(review): many short lines (local declarations, return/goto
 * statements, loop headers, the `resize` handling) are not visible in this
 * listing; the outline covers only what is shown.
 */
819 int lod_load_lmv_shards(const struct lu_env *env, struct lod_object *lo,
820 struct lu_buf *buf, bool resize)
822 struct lu_dirent *ent =
823 (struct lu_dirent *)lod_env_info(env)->lti_key;
824 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
825 struct dt_object *obj = dt_object_child(&lo->ldo_obj);
826 struct lmv_mds_md_v1 *lmv1 = buf->lb_buf;
828 const struct dt_it_ops *iops;
830 __u32 magic = le32_to_cpu(lmv1->lmv_magic);
835 if (magic != LMV_MAGIC_V1)
838 stripes = le32_to_cpu(lmv1->lmv_stripe_count);
842 rc = lmv_mds_md_size(stripes, magic);
846 if (buf->lb_len < lmv1_size) {
855 lu_buf_alloc(buf, lmv1_size);
860 memcpy(buf->lb_buf, tbuf.lb_buf, tbuf.lb_len);
863 if (unlikely(!dt_try_as_dir(env, obj, true)))
866 memset(&lmv1->lmv_stripe_fids[0], 0, stripes * sizeof(struct lu_fid));
867 iops = &obj->do_index_ops->dio_it;
868 it = iops->init(env, obj, LUDA_64BITHASH);
872 rc = iops->load(env, it, 0);
874 rc = iops->next(env, it);
879 char name[FID_LEN + 2] = "";
884 rc = iops->rec(env, it, (struct dt_rec *)ent, LUDA_64BITHASH);
890 fid_le_to_cpu(&fid, &ent->lde_fid);
891 ent->lde_namelen = le16_to_cpu(ent->lde_namelen);
892 if (ent->lde_name[0] == '.') {
893 if (ent->lde_namelen == 1)
896 if (ent->lde_namelen == 2 && ent->lde_name[1] == '.')
900 len = scnprintf(name, sizeof(name),
901 DFID":", PFID(&ent->lde_fid));
902 /* The ent->lde_name is composed of ${FID}:${index} */
903 if (ent->lde_namelen < len + 1 ||
904 memcmp(ent->lde_name, name, len) != 0) {
905 CDEBUG_LIMIT(lod->lod_lmv_failout ? D_ERROR : D_INFO,
906 "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
907 lod2obd(lod)->obd_name, ent->lde_namelen,
908 ent->lde_name, PFID(&fid),
909 PFID(lu_object_fid(&obj->do_lu)),
910 lod->lod_lmv_failout ? "failout" : "skip");
912 if (lod->lod_lmv_failout)
920 if (ent->lde_name[len] < '0' ||
921 ent->lde_name[len] > '9') {
922 CDEBUG_LIMIT(lod->lod_lmv_failout ?
924 "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
925 lod2obd(lod)->obd_name,
927 ent->lde_name, PFID(&fid),
928 PFID(lu_object_fid(&obj->do_lu)),
929 lod->lod_lmv_failout ?
932 if (lod->lod_lmv_failout)
938 index = index * 10 + ent->lde_name[len++] - '0';
939 } while (len < ent->lde_namelen);
941 if (len == ent->lde_namelen) {
942 /* Out of LMV EA range. */
943 if (index >= stripes) {
944 CERROR("%s: the shard %.*s for the striped "
945 "directory "DFID" is out of the known "
946 "LMV EA range [0 - %u], failout\n",
947 lod2obd(lod)->obd_name, ent->lde_namelen,
949 PFID(lu_object_fid(&obj->do_lu)),
955 /* The slot has been occupied. */
956 if (!fid_is_zero(&lmv1->lmv_stripe_fids[index]) &&
957 !CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME)) {
961 &lmv1->lmv_stripe_fids[index]);
962 CERROR("%s: both the shard "DFID" and "DFID
963 " for the striped directory "DFID
964 " claim the same LMV EA slot at the "
965 "index %d, failout\n",
966 lod2obd(lod)->obd_name,
967 PFID(&fid0), PFID(&fid),
968 PFID(lu_object_fid(&obj->do_lu)), index);
973 /* stored as LE mode */
974 lmv1->lmv_stripe_fids[index] = ent->lde_fid;
977 rc = iops->next(env, it);
984 RETURN(rc > 0 ? lmv_mds_md_size(stripes, magic) : rc);
988 * Implementation of dt_object_operations::do_index_try.
990 * \see dt_object_operations::do_index_try() in the API description for details.
/*
 * Probe whether \a dt supports the index features \a feat and install the
 * matching index operation table.
 *
 * Visible steps: the striping is loaded with lod_striping_load(), the
 * child object's do_index_try() is called, and -- when the object has
 * directory stripes (ldo_dir_stripe_count > 0) -- do_index_try() is also
 * called on each stripe, with checks for NULL and non-existent stripes.
 * dt->do_index_ops is set to lod_striped_index_ops in the striped case;
 * lod_index_ops is assigned as well, presumably in the non-striped branch.
 * NOTE(review): the declarations of `rc` and `i`, the error checks after
 * each call, the statements following the NULL/non-existence checks and the
 * `else` separating the two assignments are not visible in this listing.
 */
992 static int lod_index_try(const struct lu_env *env, struct dt_object *dt,
993 const struct dt_index_features *feat)
995 struct lod_object *lo = lod_dt_obj(dt);
996 struct dt_object *next = dt_object_child(dt);
1000 LASSERT(next->do_ops);
1001 LASSERT(next->do_ops->do_index_try);
1003 rc = lod_striping_load(env, lo);
1007 rc = next->do_ops->do_index_try(env, next, feat);
1011 if (lo->ldo_dir_stripe_count > 0) {
1014 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1015 if (!lo->ldo_stripe[i])
1017 if (!dt_object_exists(lo->ldo_stripe[i]))
1019 rc = lo->ldo_stripe[i]->do_ops->do_index_try(env,
1020 lo->ldo_stripe[i], feat);
1024 dt->do_index_ops = &lod_striped_index_ops;
1026 dt->do_index_ops = &lod_index_ops;
1033 * Implementation of dt_object_operations::do_read_lock.
1035 * \see dt_object_operations::do_read_lock() in the API description for details.
/*
 * Take the read lock by forwarding to dt_read_lock() on
 * dt_object_child(dt) with the caller's `role`.
 * NOTE(review): the parameter line declaring `role` is not visible in this
 * listing.
 */
1037 static void lod_read_lock(const struct lu_env *env, struct dt_object *dt,
1040 dt_read_lock(env, dt_object_child(dt), role);
1044 * Implementation of dt_object_operations::do_write_lock.
1046 * \see dt_object_operations::do_write_lock() in the API description for
/*
 * Take the write lock by forwarding to dt_write_lock() on
 * dt_object_child(dt) with the caller's `role`.
 * NOTE(review): the parameter line declaring `role` is not visible in this
 * listing.
 */
1049 static void lod_write_lock(const struct lu_env *env, struct dt_object *dt,
1052 dt_write_lock(env, dt_object_child(dt), role);
1056 * Implementation of dt_object_operations::do_read_unlock.
1058 * \see dt_object_operations::do_read_unlock() in the API description for
/*
 * Drop the read lock by forwarding to dt_read_unlock() on
 * dt_object_child(dt).
 */
1061 static void lod_read_unlock(const struct lu_env *env, struct dt_object *dt)
1063 dt_read_unlock(env, dt_object_child(dt));
1067 * Implementation of dt_object_operations::do_write_unlock.
1069 * \see dt_object_operations::do_write_unlock() in the API description for
/*
 * Drop the write lock by forwarding to dt_write_unlock() on
 * dt_object_child(dt).
 */
1072 static void lod_write_unlock(const struct lu_env *env, struct dt_object *dt)
1074 dt_write_unlock(env, dt_object_child(dt));
1078 * Implementation of dt_object_operations::do_write_locked.
1080 * \see dt_object_operations::do_write_locked() in the API description for
/*
 * Return the result of dt_write_locked() for dt_object_child(dt).
 */
1083 static int lod_write_locked(const struct lu_env *env, struct dt_object *dt)
1085 return dt_write_locked(env, dt_object_child(dt));
1089 * Implementation of dt_object_operations::do_attr_get.
1091 * \see dt_object_operations::do_attr_get() in the API description for details.
/*
 * Read the attributes of \a dt into \a attr by forwarding to dt_attr_get()
 * on dt_object_child(dt). No per-stripe merging is done here; the in-body
 * note gives the rationale for striped directories.
 */
1093 static int lod_attr_get(const struct lu_env *env,
1094 struct dt_object *dt,
1095 struct lu_attr *attr)
1097 /* Note: for striped directory, client will merge attributes
1098 * from all of the sub-stripes see lmv_merge_attr(), and there
1099 * no MDD logic depend on directory nlink/size/time, so we can
1100 * always use master inode nlink and size for now. */
1101 return dt_attr_get(env, dt_object_child(dt), attr);
/*
 * Choose or correct comp->llc_stripe_size so that it divides the component
 * end (comp->llc_extent.e_end).
 *
 * \param[in,out] comp		layout component whose stripe size is adjusted
 * \param[in] def_stripe_size	default stripe size used when none is set
 *
 * When no stripe size is set, \a def_stripe_size is used if the component
 * ends at LUSTRE_EOF or the end is a multiple of it; the alternative is
 * `comp_end & ~(comp_end - 1)`, i.e. the lowest set bit of the component
 * end. When a stripe size is already set, a component end that is not
 * LOV_MIN_STRIPE_SIZE aligned is warned about and rounded up in the local
 * copy only (e_end itself is not written), and a stripe size that does not
 * divide the end is warned about and replaced by the lowest set bit of the
 * end.
 * NOTE(review): `comp_end % def_stripe_size` is evaluated whenever the
 * stripe size is unset and the end is not LUSTRE_EOF; callers presumably
 * never pass def_stripe_size == 0 -- confirm.
 * NOTE(review): the `else` keywords and closing braces that delimit the
 * branches are not visible in this listing.
 */
1104 void lod_adjust_stripe_size(struct lod_layout_component *comp,
1105 __u32 def_stripe_size)
1107 __u64 comp_end = comp->llc_extent.e_end;
1109 /* Choose stripe size if not set. Note that default stripe size can't
1110 * be used as is, because it must be multiplier of given component end.
1111 * - first check if default stripe size can be used
1112 * - if not than select the lowest set bit from component end and use
1113 * that value as stripe size
1115 if (!comp->llc_stripe_size) {
1116 if (comp_end == LUSTRE_EOF || !(comp_end % def_stripe_size))
1117 comp->llc_stripe_size = def_stripe_size;
1119 comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1121 if (comp_end != LUSTRE_EOF &&
1122 comp_end & (LOV_MIN_STRIPE_SIZE - 1)) {
1123 CWARN("Component end %llu is not a multiple of min size %u\n",
1124 comp_end, LOV_MIN_STRIPE_SIZE);
1125 comp_end = round_up(comp_end, LOV_MIN_STRIPE_SIZE);
1127 /* check stripe size is multiplier of comp_end */
1128 if (comp_end != LUSTRE_EOF &&
1129 comp_end != comp->llc_extent.e_start &&
1130 comp_end % comp->llc_stripe_size) {
1131 /* fix that even for defined stripe size but warn
1132 * about the problem, that must not happen
1134 CWARN("Component end %llu is not aligned by the stripe size %u\n",
1135 comp_end, comp->llc_stripe_size);
1136 comp->llc_stripe_size = comp_end & ~(comp_end - 1);
/*
 * Fill in the stripe count and stripe size of \a comp from defaults.
 *
 * For components whose pattern lacks LOV_PATTERN_MDT, a non-zero
 * `append_stripes` overrides the stripe count; otherwise an unset count
 * falls back to desc->ld_default_stripe_count. lod_adjust_stripe_size() is
 * then called with desc->ld_default_stripe_size.
 * NOTE(review): the parameter line declaring `append_stripes` and the
 * closing braces are not visible in this listing, so whether the stripe
 * size adjustment is inside or outside the pattern check cannot be
 * confirmed from here.
 */
1141 static inline void lod_adjust_stripe_info(struct lod_layout_component *comp,
1142 struct lov_desc *desc,
1145 if (!(comp->llc_pattern & LOV_PATTERN_MDT)) {
1146 if (append_stripes) {
1147 comp->llc_stripe_count = append_stripes;
1148 } else if (!comp->llc_stripe_count) {
1149 comp->llc_stripe_count =
1150 desc->ld_default_stripe_count;
1154 lod_adjust_stripe_size(comp, desc->ld_default_stripe_size);
/*
 * Walk the layout components of \a lo and invoke the callbacks supplied in
 * \a data, holding lo->ldo_layout_mutex across the walk.
 *
 * Per component, the visible code tests for LOV_MAGIC_FOREIGN, for a NULL
 * llc_stripe array, for a component that is not yet inited while
 * data->locd_declare is false, and for an optional locd_comp_skip_cb. It
 * then calls the optional locd_comp_cb and, if locd_stripe_cb is set, calls
 * it for each of the component's llc_stripe_count stripes with the
 * component index `i` and stripe index `j`.
 * NOTE(review): the parameter line declaring `th`, the declarations of
 * `rc`, `i` and `j`, and the statements following each test (presumably
 * `continue`, plus error exits after the callbacks) are not visible in this
 * listing.
 */
1157 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
1159 struct lod_obj_stripe_cb_data *data)
1161 struct lod_layout_component *lod_comp;
1165 mutex_lock(&lo->ldo_layout_mutex);
1166 for (i = 0; i < lo->ldo_comp_cnt; i++) {
1167 lod_comp = &lo->ldo_comp_entries[i];
1169 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN)
1172 if (lod_comp->llc_stripe == NULL)
1175 /* has stripe but not inited yet, this component has been
1176 * declared to be created, but hasn't created yet.
1178 if (!lod_comp_inited(lod_comp) && !data->locd_declare)
1181 if (data->locd_comp_skip_cb &&
1182 data->locd_comp_skip_cb(env, lo, i, data))
1185 if (data->locd_comp_cb) {
1186 rc = data->locd_comp_cb(env, lo, i, data);
1191 /* could used just to do sth about component, not each
1194 if (!data->locd_stripe_cb)
1197 LASSERT(lod_comp->llc_stripe_count > 0);
1198 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
1199 struct dt_object *dt = lod_comp->llc_stripe[j];
1203 rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
1209 mutex_unlock(&lo->ldo_layout_mutex);
/*
 * Per-stripe callback that applies data->locd_attr to the stripe object
 * \a dt. In declare mode (data->locd_declare) it returns
 * lod_sub_declare_attr_set(); otherwise it logs the layout version at
 * D_LAYOUT level when LA_LAYOUT_VERSION is among the valid attributes and
 * returns lod_sub_attr_set(). \a lo and \a stripe_idx are not used by the
 * visible lines.
 * NOTE(review): the line carrying the storage class and return type is not
 * visible in this listing.
 */
1214 lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
1215 struct dt_object *dt, struct thandle *th,
1216 int comp_idx, int stripe_idx,
1217 struct lod_obj_stripe_cb_data *data)
1219 if (data->locd_declare)
1220 return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
1222 if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
1223 CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
1224 PFID(lu_object_fid(&dt->do_lu)),
1225 data->locd_attr->la_layout_version, comp_idx);
1228 return lod_sub_attr_set(env, dt, data->locd_attr, th);
1232 * Implementation of dt_object_operations::do_declare_attr_set.
1234 * If the object is striped, then apply the changes to all the stripes.
1236 * \see dt_object_operations::do_declare_attr_set() in the API description
/*
 * Declare an attribute update on \a dt and, where applicable, on its
 * stripes.
 *
 * Visible steps:
 *  1. The update is declared on dt_object_child(dt) with
 *     lod_sub_declare_attr_set().
 *  2. For non-directories there are tests on LA_REMOTE_ATTR_SET and on the
 *     OBD_FAIL_LFSCK_BAD_OWNER fail check, then a test on a mask of
 *     attribute bits (UID/GID/PROJID/MODE/ATIME/MTIME/CTIME and more).
 *  3. The striping is loaded with lod_striping_load() and
 *     lod_obj_is_striped() is tested.
 *  4. For directories, lod_sub_declare_attr_set() is called on each stripe,
 *     with checks for NULL and non-existent stripes; for other objects
 *     lod_obj_for_each_stripe() runs lod_obj_stripe_attr_set_cb in declare
 *     mode.
 *  5. For an existing, local object with S_ISREG(attr->la_mode), LFSCK fail
 *     checks may additionally declare deletion (OBD_FAIL_LFSCK_LOST_STRIPE)
 *     or replacement (OBD_FAIL_LFSCK_CHANGE_STRIPE /
 *     OBD_FAIL_LFSCK_BAD_PFL_RANGE) of the XATTR_NAME_LOV xattr, the latter
 *     using the thread's lti_ea_store buffer.
 *
 * NOTE(review): the `th` parameter line, the declarations of `rc` and `i`,
 * the rest of the attribute mask and every return/continue statement
 * following the tests above are not visible in this listing -- confirm the
 * exact early-exit behaviour against the full source.
 */
1239 static int lod_declare_attr_set(const struct lu_env *env,
1240 struct dt_object *dt,
1241 const struct lu_attr *attr,
1244 struct dt_object *next = dt_object_child(dt);
1245 struct lod_object *lo = lod_dt_obj(dt);
1250 * declare setattr on the local object
1252 rc = lod_sub_declare_attr_set(env, next, attr, th);
1256 /* osp_declare_attr_set() ignores all attributes other than
1257 * UID, GID, PROJID, and size, and osp_attr_set() ignores all
1258 * but UID, GID and PROJID. Declaration of size attr setting
1259 * happens through lod_declare_init_size(), and not through
1260 * this function. Therefore we need not load striping unless
1261 * ownership is changing. This should save memory and (we hope)
1262 * speed up rename().
1264 if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1265 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1268 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1271 if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID | LA_MODE |
1272 LA_ATIME | LA_MTIME | LA_CTIME |
1277 * load striping information, notice we don't do this when object
1278 * is being initialized as we don't need this information till
1279 * few specific cases like destroy, chown
1281 rc = lod_striping_load(env, lo);
1285 if (!lod_obj_is_striped(dt))
1289 * if object is striped declare changes on the stripes
1291 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1292 LASSERT(lo->ldo_stripe);
1293 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1294 if (lo->ldo_stripe[i] == NULL)
1296 if (!dt_object_exists(lo->ldo_stripe[i]))
1298 rc = lod_sub_declare_attr_set(env, lo->ldo_stripe[i],
1304 struct lod_obj_stripe_cb_data data = { { 0 } };
1306 data.locd_attr = attr;
1307 data.locd_declare = true;
1308 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1309 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1315 if (!dt_object_exists(next) || dt_object_remote(next) ||
1316 !S_ISREG(attr->la_mode))
1319 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1320 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
1324 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) ||
1325 CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1326 struct lod_thread_info *info = lod_env_info(env);
1327 struct lu_buf *buf = &info->lti_buf;
1329 buf->lb_buf = info->lti_ea_store;
1330 buf->lb_len = info->lti_ea_store_size;
1331 rc = lod_sub_declare_xattr_set(env, next, buf, XATTR_NAME_LOV,
1332 LU_XATTR_REPLACE, th);
1339 * Implementation of dt_object_operations::do_attr_set.
1341 * If the object is striped, then apply the changes to all or subset of
1342 * the stripes depending on the object type and specific attributes.
1344 * \see dt_object_operations::do_attr_set() in the API description for details.
/*
 * Apply an attribute update to \a dt and propagate it to its stripes.
 *
 * The structure parallels lod_declare_attr_set(): lod_sub_attr_set() on the
 * child object first, then, after lod_striping_load(), either a per-stripe
 * lod_sub_attr_set() for directories or lod_obj_for_each_stripe() with
 * lod_obj_stripe_attr_set_cb() and locd_declare = false for other objects.
 * The remainder of the function is LFSCK fault injection that deletes or
 * rewrites the LOV EA of the child object.
 *
 * NOTE(review): the statements under several tests (early returns, "continue")
 * are not visible in this view; verify them against the complete file.
 */
1346 static int lod_attr_set(const struct lu_env *env,
1347 struct dt_object *dt,
1348 const struct lu_attr *attr,
1351 struct dt_object *next = dt_object_child(dt);
1352 struct lod_object *lo = lod_dt_obj(dt);
1357 * apply changes to the local object
1359 rc = lod_sub_attr_set(env, next, attr, th);
1363 if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1364 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1367 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1370 if (!(attr->la_valid & (LA_UID | LA_GID | LA_MODE | LA_PROJID |
1371 LA_ATIME | LA_MTIME | LA_CTIME |
1376 /* FIXME: a tricky case in the code path of mdd_layout_change():
1377 * the in-memory striping information has been freed in lod_xattr_set()
1378 * due to layout change. It has to load stripe here again. It only
1379 * changes flags of layout so declare_attr_set() is still accurate */
1380 rc = lod_striping_load(env, lo);
1384 if (!lod_obj_is_striped(dt))
1388 * if object is striped, apply changes to all the stripes
1390 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1391 LASSERT(lo->ldo_stripe);
1392 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1393 if (unlikely(lo->ldo_stripe[i] == NULL))
1396 if ((dt_object_exists(lo->ldo_stripe[i]) == 0))
1399 rc = lod_sub_attr_set(env, lo->ldo_stripe[i], attr, th);
1404 struct lod_obj_stripe_cb_data data = { { 0 } };
1406 data.locd_attr = attr;
1407 data.locd_declare = false;
1408 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1409 rc = lod_obj_for_each_stripe(env, lo, th, &data);
/* Everything below only concerns local, existing, regular-file objects
 * and is driven by LFSCK fail-injection points. */
1415 if (!dt_object_exists(next) || dt_object_remote(next) ||
1416 !S_ISREG(attr->la_mode))
1419 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1420 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
/* Fault injection: locate the first OST object entry of the stored LOV EA
 * (inside the first component when the layout magic is COMP_V1 or SEL),
 * round-trip its id through a FID and write the EA back with
 * LU_XATTR_REPLACE.
 * NOTE(review): the statement that alters the FID between ostid_to_fid()
 * and fid_to_ostid() is not visible in this view. */
1424 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE)) {
1425 struct lod_thread_info *info = lod_env_info(env);
1426 struct lu_buf *buf = &info->lti_buf;
1427 struct ost_id *oi = &info->lti_ostid;
1428 struct lu_fid *fid = &info->lti_fid;
1429 struct lov_mds_md_v1 *lmm;
1430 struct lov_ost_data_v1 *objs;
1433 rc = lod_get_lov_ea(env, lo);
1437 buf->lb_buf = info->lti_ea_store;
1438 buf->lb_len = info->lti_ea_store_size;
1439 lmm = info->lti_ea_store;
1440 magic = le32_to_cpu(lmm->lmm_magic);
1441 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
1442 struct lov_comp_md_v1 *lcm = buf->lb_buf;
1443 struct lov_comp_md_entry_v1 *lcme =
1444 &lcm->lcm_entries[0];
1446 lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
1447 magic = le32_to_cpu(lmm->lmm_magic);
1450 if (magic == LOV_MAGIC_V1)
1451 objs = &(lmm->lmm_objects[0]);
1453 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1454 ostid_le_to_cpu(&objs->l_ost_oi, oi);
1455 ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
1457 fid_to_ostid(fid, oi);
1458 ostid_cpu_to_le(oi, &objs->l_ost_oi);
1460 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1461 LU_XATTR_REPLACE, th);
/* Fault injection: bump lcm_layout_gen by one and shrink the first
 * component's extent by one at each end, then write the EA back. The magic
 * test below restricts this to COMP_V1/SEL layouts. */
1462 } else if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1463 struct lod_thread_info *info = lod_env_info(env);
1464 struct lu_buf *buf = &info->lti_buf;
1465 struct lov_comp_md_v1 *lcm;
1466 struct lov_comp_md_entry_v1 *lcme;
1468 rc = lod_get_lov_ea(env, lo);
1472 buf->lb_buf = info->lti_ea_store;
1473 buf->lb_len = info->lti_ea_store_size;
1475 if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1 &&
1476 le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_SEL)
1479 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1480 lcme = &lcm->lcm_entries[0];
1481 le64_add_cpu(&lcme->lcme_extent.e_start, 1);
1482 le64_add_cpu(&lcme->lcme_extent.e_end, -1);
1484 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1485 LU_XATTR_REPLACE, th);
1492 * Implementation of dt_object_operations::do_xattr_get.
1494 * If LOV EA is requested from the root object and it's not
1495 * found, then return default striping for the filesystem.
1497 * \see dt_object_operations::do_xattr_get() in the API description for details.
/*
 * Read extended attribute \a name of \a dt from the child object, with
 * special handling for the LMV and LOV attributes.
 *
 * - XATTR_NAME_LMV: the stored EA is validated by size and magic; for an
 *   LMV_MAGIC_V1 header the reported size is recomputed with
 *   lmv_mds_md_size() from the stripe count, and lod_load_lmv_shards() is
 *   called to complete the caller's buffer.
 * - XATTR_NAME_LOV with a positive result and a caller buffer: a
 *   LOV_MAGIC_SEL magic is rewritten in place to LOV_MAGIC_COMP_V1.
 * - When the object's FID equals the one returned by dt_root_get() and
 *   \a name is XATTR_NAME_LOV, a default lov_user_md is filled in from
 *   dev->lod_ost_descs.ltd_lov_desc if \a buf is at least sizeof(*lum).
 *
 * NOTE(review): the statements under several tests (returns, the result
 * code set for the root default) are not visible in this view; in
 * particular the root fallback appears to be reached only when the child
 * returned -ENODATA for a directory - confirm against the complete file.
 */
1499 static int lod_xattr_get(const struct lu_env *env, struct dt_object *dt,
1500 struct lu_buf *buf, const char *name)
1502 struct lod_thread_info *info = lod_env_info(env);
1503 struct lod_device *dev = lu2lod_dev(dt->do_lu.lo_dev);
1508 rc = dt_xattr_get(env, dt_object_child(dt), buf, name);
1509 if (strcmp(name, XATTR_NAME_LMV) == 0) {
1510 struct lmv_mds_md_v1 *lmv1;
1511 struct lmv_foreign_md *lfm;
1514 if (rc > (typeof(rc))sizeof(*lmv1))
1517 /* short (<= sizeof(struct lmv_mds_md_v1)) foreign LMV case */
1518 /* XXX empty foreign LMV is not allowed */
1519 if (rc <= offsetof(typeof(*lfm), lfm_value))
1520 RETURN(rc = rc > 0 ? -EINVAL : rc);
/* No caller buffer (size probe): re-read the EA into the per-thread
 * lti_key scratch area so that its magic can be inspected. */
1522 if (buf->lb_buf == NULL || buf->lb_len == 0) {
1523 BUILD_BUG_ON(sizeof(*lmv1) > sizeof(info->lti_key));
1525 /* lti_buf is large enough for *lmv1 or a short
1526 * (<= sizeof(struct lmv_mds_md_v1)) foreign LMV
1528 info->lti_buf.lb_buf = info->lti_key;
1529 info->lti_buf.lb_len = sizeof(*lmv1);
1530 rc = dt_xattr_get(env, dt_object_child(dt),
1531 &info->lti_buf, name);
1532 if (unlikely(rc <= offsetof(typeof(*lfm),
1534 RETURN(rc = rc > 0 ? -EINVAL : rc);
1536 lfm = info->lti_buf.lb_buf;
1537 if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN)
1540 if (unlikely(rc != sizeof(*lmv1)))
1541 RETURN(rc = rc > 0 ? -EINVAL : rc);
1543 lmv1 = info->lti_buf.lb_buf;
1544 /* The on-disk LMV EA only contains header, but the
1545 * returned LMV EA size should contain the space for
1546 * the FIDs of all shards of the striped directory. */
1547 if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_V1)
1548 rc = lmv_mds_md_size(
1549 le32_to_cpu(lmv1->lmv_stripe_count),
1550 le32_to_cpu(lmv1->lmv_magic));
1553 if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
1556 if (rc != sizeof(*lmv1))
1557 RETURN(rc = rc > 0 ? -EINVAL : rc);
1559 rc1 = lod_load_lmv_shards(env, lod_dt_obj(dt),
1563 RETURN(rc = rc1 != 0 ? rc1 : rc);
/* A SEL layout is handed back with the COMP_V1 magic: the magic in the
 * caller's buffer is overwritten in place. */
1566 if ((rc > 0) && buf->lb_buf && strcmp(name, XATTR_NAME_LOV) == 0) {
1567 struct lov_comp_md_v1 *lcm = buf->lb_buf;
1569 if (lcm->lcm_magic == cpu_to_le32(LOV_MAGIC_SEL))
1570 lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1573 if (rc != -ENODATA || !S_ISDIR(dt->do_lu.lo_header->loh_attr & S_IFMT))
1577 * XXX: Only used by lfsck
1579 * lod returns default striping on the real root of the device
1580 * this is like the root stores default striping for the whole
1581 * filesystem. historically we've been using a different approach
1582 * and store it in the config.
1584 dt_root_get(env, dev->lod_child, &info->lti_fid);
1585 is_root = lu_fid_eq(&info->lti_fid, lu_object_fid(&dt->do_lu));
1587 if (is_root && strcmp(XATTR_NAME_LOV, name) == 0) {
1588 struct lov_user_md *lum = buf->lb_buf;
1589 struct lov_desc *desc = &dev->lod_ost_descs.ltd_lov_desc;
1591 if (buf->lb_buf == NULL) {
1593 } else if (buf->lb_len >= sizeof(*lum)) {
1594 lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
1595 lmm_oi_set_seq(&lum->lmm_oi, FID_SEQ_LOV_DEFAULT);
1596 lmm_oi_set_id(&lum->lmm_oi, 0);
1597 lmm_oi_cpu_to_le(&lum->lmm_oi, &lum->lmm_oi);
1598 lum->lmm_pattern = cpu_to_le32(desc->ld_pattern);
1599 lum->lmm_stripe_size = cpu_to_le32(
1600 desc->ld_default_stripe_size);
1601 lum->lmm_stripe_count = cpu_to_le16(
1602 desc->ld_default_stripe_count);
1603 lum->lmm_stripe_offset = cpu_to_le16(
1604 desc->ld_default_stripe_offset);
1617 * Checks that the magic of the stripe is sane.
1619 * \param[in] lod lod device
1620 * \param[in] lum a buffer storing LMV EA to verify
1622 * \retval 0 if the EA is sane
1623 * \retval negative otherwise
/*
 * Validate the magic of an LMV user descriptor.
 *
 * Only lum_magic is examined: anything other than LMV_USER_MAGIC is logged
 * through CERROR together with the stripe offset and stripe count.
 * NOTE(review): the return statements are not visible in this view; the log
 * message reports -EINVAL, which is presumably the value returned.
 */
1625 static int lod_verify_md_striping(struct lod_device *lod,
1626 const struct lmv_user_md_v1 *lum)
1628 if (unlikely(le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC)) {
1629 CERROR("%s: invalid lmv_user_md: magic = %x, "
1630 "stripe_offset = %d, stripe_count = %u: rc = %d\n",
1631 lod2obd(lod)->obd_name, le32_to_cpu(lum->lum_magic),
1632 (int)le32_to_cpu(lum->lum_stripe_offset),
1633 le32_to_cpu(lum->lum_stripe_count), -EINVAL);
1641 * Initialize LMV EA for a slave.
1643 * Initialize slave's LMV EA from the master's LMV EA.
1645 * \param[in] master_lmv a buffer containing master's EA
1646 * \param[out] slave_lmv a buffer where slave's EA will be stored
/*
 * Build a slave (stripe) LMV header: copy the whole master header by value
 * into \a slave_lmv, then overwrite only the magic with LMV_MAGIC_STRIPE
 * (stored little-endian).
 */
1649 static void lod_prep_slave_lmv_md(struct lmv_mds_md_v1 *slave_lmv,
1650 const struct lmv_mds_md_v1 *master_lmv)
1652 *slave_lmv = *master_lmv;
1653 slave_lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
1659 * Generate LMV EA from the object passed as \a dt. The object must have
1660 * the stripes created and initialized.
1662 * \param[in] env execution environment
1663 * \param[in] dt object
1664 * \param[out] lmv_buf buffer storing generated LMV EA
1666 * \retval 0 on success
1667 * \retval negative if failed
/*
 * Build the on-disk LMV header for striped directory \a dt.
 *
 * The header is assembled in the per-thread EA scratch buffer
 * (info->lti_ea_store, grown with lod_ea_store_resize() if it is smaller
 * than the header) from the in-memory fields of the lod_object: stripe
 * count, hash type, layout version and, while the layout is changing,
 * the migrate hash and offset. The master MDT index comes from
 * lod_fld_lookup() on the directory's own FID. On return \a lmv_buf points
 * at the scratch buffer with a length of sizeof(struct lmv_mds_md_v1).
 *
 * NOTE(review): \a lmv_buf aliases info->lti_ea_store, so its content is
 * presumably only valid until that scratch buffer is reused - confirm with
 * callers.
 */
1669 static int lod_prep_lmv_md(const struct lu_env *env, struct dt_object *dt,
1670 struct lu_buf *lmv_buf)
1672 struct lod_thread_info *info = lod_env_info(env);
1673 struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
1674 struct lod_object *lo = lod_dt_obj(dt);
1675 struct lmv_mds_md_v1 *lmm1;
1677 int type = LU_SEQ_RANGE_ANY;
1682 LASSERT(lo->ldo_dir_striped != 0);
1683 LASSERT(lo->ldo_dir_stripe_count > 0);
1684 stripe_count = lo->ldo_dir_stripe_count;
1685 /* Only store the LMV EA header on the disk. */
1686 if (info->lti_ea_store_size < sizeof(*lmm1)) {
1687 rc = lod_ea_store_resize(info, sizeof(*lmm1));
1691 memset(info->lti_ea_store, 0, sizeof(*lmm1));
/* The header is cleared here as well, which covers the case where the
 * scratch buffer was already large enough and the branch above was not
 * taken. */
1694 lmm1 = (struct lmv_mds_md_v1 *)info->lti_ea_store;
1695 memset(lmm1, 0, sizeof(*lmm1));
1696 lmm1->lmv_magic = cpu_to_le32(LMV_MAGIC);
1697 lmm1->lmv_stripe_count = cpu_to_le32(stripe_count);
1698 lmm1->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type);
1699 lmm1->lmv_layout_version = cpu_to_le32(lo->ldo_dir_layout_version);
1700 if (lod_is_layout_changing(lo)) {
1701 lmm1->lmv_migrate_hash = cpu_to_le32(lo->ldo_dir_migrate_hash);
1702 lmm1->lmv_migrate_offset =
1703 cpu_to_le32(lo->ldo_dir_migrate_offset);
1705 rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu),
1710 lmm1->lmv_master_mdt_index = cpu_to_le32(mdtidx);
1711 lmv_buf->lb_buf = info->lti_ea_store;
1712 lmv_buf->lb_len = sizeof(*lmm1);
1718 * Create in-core representation for a striped directory.
1720 * Parse the buffer containing LMV EA and instantiate LU objects
1721 * representing the stripe objects. The pointers to the objects are
1722 * stored in ldo_stripe field of \a lo. This function is used when
1723 * we need to access an already created object (i.e. load from a disk).
1725 * \param[in] env execution environment
1726 * \param[in] lo lod object
1727 * \param[in] buf buffer containing LMV EA
1729 * \retval 0 on success
1730 * \retval negative if failed
/*
 * Turn an LMV EA in \a buf into the in-memory stripe array of \a lo.
 *
 * The caller must hold lo->ldo_layout_mutex (asserted below). A header with
 * LMV_MAGIC_STRIPE only marks the object as a slave stripe
 * (ldo_dir_slave_stripe = 1); a header that fails lmv_is_sane() is
 * rejected. Otherwise an array of lmv_stripe_count object pointers is
 * allocated and, for every stripe FID, the owning MDT is resolved with
 * lod_fld_lookup() - the local child device when the index equals this
 * node's ss_node_id, the matching target descriptor otherwise - and the
 * object is instantiated with dt_locate_at(). Finally the stripe array and
 * the header fields (counts, layout version, migrate offset/hash, hash
 * type) are stored in \a lo.
 *
 * NOTE(review): return statements, the handling of insane FIDs and the
 * label in front of lod_striping_free_nolock() are not visible in this
 * view; that call is presumably the error-path cleanup.
 */
1732 int lod_parse_dir_striping(const struct lu_env *env, struct lod_object *lo,
1733 const struct lu_buf *buf)
1735 struct lod_thread_info *info = lod_env_info(env);
1736 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1737 struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
1738 struct dt_object **stripe;
1739 union lmv_mds_md *lmm = buf->lb_buf;
1740 struct lmv_mds_md_v1 *lmv1 = &lmm->lmv_md_v1;
1741 struct lu_fid *fid = &info->lti_fid;
1746 LASSERT(mutex_is_locked(&lo->ldo_layout_mutex));
1748 /* XXX may be useless as not called for foreign LMV ?? */
1749 if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_FOREIGN)
1752 if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_STRIPE) {
1753 lo->ldo_dir_slave_stripe = 1;
1757 if (!lmv_is_sane(lmv1))
1760 LASSERT(lo->ldo_stripe == NULL);
1761 OBD_ALLOC_PTR_ARRAY(stripe, le32_to_cpu(lmv1->lmv_stripe_count));
1765 for (i = 0; i < le32_to_cpu(lmv1->lmv_stripe_count); i++) {
1766 struct dt_device *tgt_dt;
1767 struct dt_object *dto;
1768 int type = LU_SEQ_RANGE_ANY;
1771 fid_le_to_cpu(fid, &lmv1->lmv_stripe_fids[i]);
1772 if (!fid_is_sane(fid)) {
1777 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
/* A stripe whose MDT index is this node's own id lives on the local child
 * device; any other index is resolved through the MDT target table. */
1781 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
1782 tgt_dt = lod->lod_child;
1784 struct lod_tgt_desc *tgt;
1786 tgt = LTD_TGT(ltd, idx);
1788 GOTO(out, rc = -ESTALE);
1789 tgt_dt = tgt->ltd_tgt;
1792 dto = dt_locate_at(env, tgt_dt, fid,
1793 lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1796 GOTO(out, rc = PTR_ERR(dto));
1801 lo->ldo_stripe = stripe;
1802 lo->ldo_is_foreign = 0;
1803 lo->ldo_dir_stripe_count = le32_to_cpu(lmv1->lmv_stripe_count);
1804 lo->ldo_dir_stripes_allocated = le32_to_cpu(lmv1->lmv_stripe_count);
1805 lo->ldo_dir_layout_version = le32_to_cpu(lmv1->lmv_layout_version);
1806 lo->ldo_dir_migrate_offset = le32_to_cpu(lmv1->lmv_migrate_offset);
1807 lo->ldo_dir_migrate_hash = le32_to_cpu(lmv1->lmv_migrate_hash);
1808 lo->ldo_dir_hash_type = le32_to_cpu(lmv1->lmv_hash_type);
1810 lod_striping_free_nolock(env, lo);
1816 * Declare the creation of a striped directory.
1818 * Declare creating a striped directory with a given stripe pattern on the
1819 * specified MDTs. A striped directory is represented as a regular directory
1820 * - an index listing all the stripes. The stripes point back to the master
1821 * object with ".." and LinkEA. The master object gets LMV EA which
1822 * identifies it as a striped directory. The function allocates FIDs
1825 * \param[in] env execution environment
1826 * \param[in] dt object
1827 * \param[in] attr attributes to initialize the objects with
1828 * \param[in] dof type of objects to be created
1829 * \param[in] th transaction handle
1831 * \retval 0 on success
1832 * \retval negative if failed
/*
 * Declare every update needed to create the stripes of directory \a dt.
 *
 * The master LMV header is built with lod_prep_lmv_md() and a heap copy of
 * it is converted to a slave header with lod_prep_slave_lmv_md(). For each
 * entry of lo->ldo_stripe[] the function then declares, in order:
 *   - creation of the stripe (skipped for stripes below
 *     lo->ldo_dir_split_offset while the directory is splitting);
 *   - a reference add on the stripe and its "." and ".." entries
 *     ("." names the stripe itself, ".." names the master object);
 *   - the linkEA xattr naming the master as parent;
 *   - the name entry for the stripe in the master, plus a reference add
 *     on the master;
 *   - the slave LMV xattr (subject to LFSCK fail-injection points).
 * The master's own LMV xattr is declared last. slave_lmm is released
 * before returning.
 *
 * NOTE(review): the "if (rc != 0) GOTO(out, ...)" style checks, the name
 * format strings and some call arguments are not visible in this view.
 */
1834 static int lod_dir_declare_create_stripes(const struct lu_env *env,
1835 struct dt_object *dt,
1836 struct lu_attr *attr,
1837 struct dt_object_format *dof,
1840 struct lod_thread_info *info = lod_env_info(env);
1841 struct lu_buf lmv_buf;
1842 struct lu_buf slave_lmv_buf;
1843 struct lmv_mds_md_v1 *lmm;
1844 struct lmv_mds_md_v1 *slave_lmm = NULL;
1845 struct dt_insert_rec *rec = &info->lti_dt_rec;
1846 struct lod_object *lo = lod_dt_obj(dt);
1851 rc = lod_prep_lmv_md(env, dt, &lmv_buf);
1854 lmm = lmv_buf.lb_buf;
1856 OBD_ALLOC_PTR(slave_lmm);
1857 if (slave_lmm == NULL)
1858 GOTO(out, rc = -ENOMEM);
1860 lod_prep_slave_lmv_md(slave_lmm, lmm);
1861 slave_lmv_buf.lb_buf = slave_lmm;
1862 slave_lmv_buf.lb_len = sizeof(*slave_lmm);
1864 if (!dt_try_as_dir(env, dt_object_child(dt), false))
1865 GOTO(out, rc = -EINVAL);
1867 rec->rec_type = S_IFDIR;
1868 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1869 struct dt_object *dto = lo->ldo_stripe[i];
1870 char *stripe_name = info->lti_key;
1871 struct lu_name *sname;
1872 struct linkea_data ldata = { NULL };
1873 struct lu_buf linkea_buf;
1875 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
1879 /* directory split skip create for existing stripes */
1880 if (!(lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
1881 rc = lod_sub_declare_create(env, dto, attr, NULL, dof,
1886 if (!dt_try_as_dir(env, dto, false))
1887 GOTO(out, rc = -EINVAL);
1889 rc = lod_sub_declare_ref_add(env, dto, th);
/* "." entry of the stripe: the record carries the stripe's own FID. */
1893 rec->rec_fid = lu_object_fid(&dto->do_lu);
1894 rc = lod_sub_declare_insert(env, dto,
1895 (const struct dt_rec *)rec,
1896 (const struct dt_key *)dot,
1901 /* master stripe FID will be put to .. */
1902 rec->rec_fid = lu_object_fid(&dt->do_lu);
1903 rc = lod_sub_declare_insert(env, dto,
1904 (const struct dt_rec *)rec,
1905 (const struct dt_key *)dotdot,
/* The stripe's name is formatted into the per-thread lti_key buffer; it is
 * used both for the linkEA and as the key inserted into the master. */
1910 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
1912 snprintf(stripe_name, sizeof(info->lti_key),
1914 PFID(lu_object_fid(&dto->do_lu)),
1917 snprintf(stripe_name, sizeof(info->lti_key),
1919 PFID(lu_object_fid(&dto->do_lu)), i);
1921 sname = lod_name_get(env, stripe_name,
1922 strlen(stripe_name));
1923 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
1924 sname, lu_object_fid(&dt->do_lu));
1928 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
1929 linkea_buf.lb_len = ldata.ld_leh->leh_len;
1930 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
1931 XATTR_NAME_LINK, 0, th);
1935 rec->rec_fid = lu_object_fid(&dto->do_lu);
1936 rc = lod_sub_declare_insert(env, dt_object_child(dt),
1937 (const struct dt_rec *)rec,
1938 (const struct dt_key *)stripe_name, th);
1942 rc = lod_sub_declare_ref_add(env, dt_object_child(dt),
/* The slave LMV declaration is skipped for the stripe selected by
 * cfs_fail_val when OBD_FAIL_LFSCK_LOST_SLAVE_LMV is armed. */
1948 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
1949 cfs_fail_val != i) {
1950 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
1952 slave_lmm->lmv_master_mdt_index =
1955 slave_lmm->lmv_master_mdt_index =
1957 rc = lod_sub_declare_xattr_set(env, dto, &slave_lmv_buf,
1958 XATTR_NAME_LMV, 0, th);
1964 rc = lod_sub_declare_xattr_set(env, dt_object_child(dt),
1965 &lmv_buf, XATTR_NAME_LMV, 0, th);
1969 if (slave_lmm != NULL)
1970 OBD_FREE_PTR(slave_lmm);
1976 * Allocate a striping on a predefined set of MDTs.
1978 * Allocates new striping using the MDT index range provided by the data from
1979 * the lum_objects contained in the lmv_user_md passed to this method if
1980 * \a is_specific is true; or allocates new layout starting from MDT index in
1981 * lo->ldo_dir_stripe_offset. The exact order of MDTs is not important and
1982 * varies depending on MDT status. The number of stripes needed and stripe
1983 * offset are taken from the object. If that number cannot be met, then the
1984 * function returns an error and then it's the caller's responsibility to
1985 * release the stripes allocated. All the internal structures are protected,
1986 * but no concurrent allocation is allowed on the same objects.
1988 * \param[in] env execution environment for this thread
1989 * \param[in] lo LOD object
1990 * \param[out] stripes striping created
1991 * \param[out] mdt_indices MDT indices of striping created
1992 * \param[in] is_specific true if the MDTs are provided by lum; false if
1993 * only the starting MDT index is provided
1995 * \retval positive stripes allocated, including the first stripe allocated
1997 * \retval negative errno on failure
/*
 * Allocate directory stripes on MDTs chosen from \a mdt_indices.
 *
 * For each stripe the search starts at mdt_indices[stripe_idx] and walks
 * the index space modulo (lod_remote_mdt_count + 1), trying at most
 * lod_remote_mdt_count candidates. When \a is_specific is false (and
 * neither the LARGE_STRIPE fail point nor LMV_HASH_FLAG_OVERSTRIPED
 * applies) an index already present in mdt_indices[0..stripe_idx-1] is
 * rejected. An index that is neither set in the target bitmap nor the
 * master index is rejected too. The FID is allocated with dt_fid_alloc()
 * on the local child device for the master index, or on the remote target
 * after checking ltd_active and OS_STATFS_NOCREATE. The chosen index is
 * written back to mdt_indices[stripe_idx] and the object obtained from
 * dt_locate_at() is stored in stripes[stripe_idx].
 *
 * NOTE(review): the declaration and initial value of stripe_idx, the
 * "continue"/"break" statements and the return statements are not visible
 * in this view.
 */
1999 static int lod_mdt_alloc_specific(const struct lu_env *env,
2000 struct lod_object *lo,
2001 struct dt_object **stripes,
2002 __u32 *mdt_indices, bool is_specific)
2004 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
2005 struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
2006 struct lu_tgt_desc *tgt = NULL;
2007 struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2008 struct dt_device *tgt_dt = NULL;
2009 struct lu_fid fid = { 0 };
2010 struct dt_object *dto;
2012 u32 stripe_count = lo->ldo_dir_stripe_count;
2018 master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2019 if (!is_specific && stripe_count > 1)
2020 /* Set the start index for the 2nd stripe allocation */
2021 mdt_indices[1] = (mdt_indices[0] + 1) %
2022 (lod->lod_remote_mdt_count + 1);
2024 for (; stripe_idx < stripe_count; stripe_idx++) {
2025 /* Try to find next available target */
2026 idx = mdt_indices[stripe_idx];
2027 for (j = 0; j < lod->lod_remote_mdt_count;
2028 j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) {
2029 bool already_allocated = false;
2033 "try idx %d, mdt cnt %u, allocated %u, specific %d count %hu offset %d hash %#X\n",
2034 idx, lod->lod_remote_mdt_count + 1, stripe_idx,
2035 is_specific, lo->ldo_dir_stripe_count,
2036 (int)lo->ldo_dir_stripe_offset,
2037 lo->ldo_dir_hash_type);
2039 if (likely(!is_specific &&
2040 !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE) &&
2041 !(lo->ldo_dir_hash_type &
2042 LMV_HASH_FLAG_OVERSTRIPED))) {
2043 /* check whether the idx already exists
2044 * in current allocated array */
2045 for (k = 0; k < stripe_idx; k++) {
2046 if (mdt_indices[k] == idx) {
2047 already_allocated = true;
2052 if (already_allocated)
2056 /* Sigh, this index is not in the bitmap, let's check
2057 * next available target */
2058 if (!test_bit(idx, ltd->ltd_tgt_bitmap) &&
2059 idx != master_index)
2062 if (idx == master_index) {
2063 /* Allocate the FID locally */
2064 tgt_dt = lod->lod_child;
2065 rc = dt_fid_alloc(env, tgt_dt, &fid, NULL,
2072 /* check the status of the OSP */
2073 tgt = LTD_TGT(ltd, idx);
2077 tgt_dt = tgt->ltd_tgt;
2078 if (!tgt->ltd_active)
2079 /* this OSP doesn't feel well */
2082 if (tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE)
2085 rc = dt_fid_alloc(env, tgt_dt, &fid, NULL, NULL);
2092 /* Can not allocate more stripes */
2093 if (j == lod->lod_remote_mdt_count) {
2094 CDEBUG(D_INFO, "%s: require stripes %u only get %d\n",
2095 lod2obd(lod)->obd_name, stripe_count,
2100 CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n",
2101 idx, stripe_idx, PFID(&fid));
2102 mdt_indices[stripe_idx] = idx;
2103 /* Set the start index for next stripe allocation */
2104 if (!is_specific && stripe_idx < stripe_count - 1) {
2106 * for large dir test, put all other slaves on one
2107 * remote MDT, otherwise we may save too many local
2108 * slave locks which will exceed RS_MAX_LOCKS.
2110 if (unlikely(CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)))
2112 mdt_indices[stripe_idx + 1] = (idx + 1) %
2113 (lod->lod_remote_mdt_count + 1);
2115 /* tgt_dt and fid must be ready after search available OSP
2116 * in the above loop */
2117 LASSERT(tgt_dt != NULL);
2118 LASSERT(fid_is_sane(&fid));
2120 /* fail a remote stripe FID allocation */
2121 if (stripe_idx && CFS_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID))
2124 dto = dt_locate_at(env, tgt_dt, &fid,
2125 lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
2132 stripes[stripe_idx] = dto;
/* Cleanup (presumably the error path): release the references taken on
 * stripes 1..stripe_idx-1. stripes[0] is not touched by this loop. */
2138 for (j = 1; j < stripe_idx; j++) {
2139 LASSERT(stripes[j] != NULL);
2140 dt_object_put(env, stripes[j]);
/*
 * Choose MDTs for a new striped directory and declare its creation.
 *
 * The stripe count is taken from lo->ldo_dir_stripe_count and adjusted
 * according to LMV_HASH_FLAG_OVERSTRIPED and the number of MDTs
 * (lod_remote_mdt_count + 1). Stripe 0 is always allocated on the local
 * child device. The remaining stripes come from lod_mdt_alloc_qos() /
 * lod_mdt_alloc_rr() when the stripe offset is LMV_OFFSET_DEFAULT, and
 * from lod_mdt_alloc_specific() otherwise; for LMV_USER_MAGIC_SPECIFIC the
 * per-MDT stripe count of the user's list is first checked against
 * lod_max_stripes_per_mdt and against the OVERSTRIPED flag. On success the
 * stripe array is attached to \a lo and
 * lod_dir_declare_create_stripes() is called.
 *
 * NOTE(review): many controlling statements (returns, GOTO checks, else
 * branches, some conditions) are not visible in this view; confirm the
 * details against the complete file.
 */
2146 static int lod_prep_md_striped_create(const struct lu_env *env,
2147 struct dt_object *dt,
2148 struct lu_attr *attr,
2149 const struct lmv_user_md_v1 *lum,
2150 struct dt_object_format *dof,
2153 struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
2154 struct lod_object *lo = lod_dt_obj(dt);
2155 struct dt_object **stripes;
2156 struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2157 struct lu_fid fid = { 0 };
2158 int mdt_count = lod->lod_remote_mdt_count + 1;
2165 /* The lum has been verified in lod_verify_md_striping */
2166 LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
2167 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
2169 stripe_count = lo->ldo_dir_stripe_count;
2170 /* silently clear OVERSTRIPED flag on single MDT system */
2172 lo->ldo_dir_hash_type &= ~LMV_HASH_FLAG_OVERSTRIPED;
2173 if (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) {
2174 /* silently clamp stripe count if MDTs are not specific */
2175 if (stripe_count > mdt_count * lod->lod_max_stripes_per_mdt) {
2176 if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC)
2177 stripe_count = mdt_count *
2178 lod->lod_max_stripes_per_mdt;
2182 /* clear OVERSTRIPED if not overstriped */
2183 if (stripe_count <= mdt_count &&
2184 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC)
2185 lo->ldo_dir_hash_type &= ~LMV_HASH_FLAG_OVERSTRIPED;
2186 } else if (stripe_count > mdt_count) {
2190 OBD_ALLOC_PTR_ARRAY(stripes, stripe_count);
2194 /* Allocate the first stripe locally */
2195 rc = dt_fid_alloc(env, lod->lod_child, &fid, NULL, NULL);
2199 stripes[0] = dt_locate_at(env, lod->lod_child, &fid,
2200 dt->do_lu.lo_dev->ld_site->ls_top_dev, &conf);
2201 if (IS_ERR(stripes[0]))
2202 GOTO(out, rc = PTR_ERR(stripes[0]));
2204 if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) {
2205 lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
2206 rc = lod_mdt_alloc_qos(env, lo, stripes, 1, stripe_count);
2208 rc = lod_mdt_alloc_rr(env, lo, stripes, 1,
2212 bool is_specific = false;
2214 OBD_ALLOC_PTR_ARRAY(idx_array, stripe_count);
2216 GOTO(out, rc = -ENOMEM);
2218 if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
2219 int stripes_per_mdt;
2221 bool overstriped = false;
2225 /* Verify we do not exceed the stripes per MDT limit */
2226 for (mdt = 0; mdt < mdt_count + 1; mdt++) {
2227 stripes_per_mdt = 0;
2228 for (i = 0; i < stripe_count; i++) {
2230 le32_to_cpu(lum->lum_objects[i].lum_mds))
2233 if (stripes_per_mdt >
2234 lod->lod_max_stripes_per_mdt)
2235 GOTO(out_free, rc = -EINVAL);
2236 if (stripes_per_mdt > 1)
2240 (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED))
2241 lo->ldo_dir_hash_type &=
2242 ~LMV_HASH_FLAG_OVERSTRIPED;
2243 else if (overstriped &&
2244 !(lo->ldo_dir_hash_type &
2245 LMV_HASH_FLAG_OVERSTRIPED))
2246 GOTO(out_free, rc = -EINVAL);
2248 for (i = 0; i < stripe_count; i++)
2250 le32_to_cpu(lum->lum_objects[i].lum_mds);
2253 /* stripe 0 is local */
2255 lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2256 rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array,
2259 OBD_FREE_PTR_ARRAY(idx_array, stripe_count);
/* rc holds the number of stripes actually obtained at this point; the
 * array itself was sized for stripe_count entries, which is recorded
 * separately in ldo_dir_stripes_allocated. */
2267 lo->ldo_dir_striped = 1;
2268 lo->ldo_stripe = stripes;
2269 lo->ldo_dir_stripe_count = rc;
2270 lo->ldo_dir_stripes_allocated = stripe_count;
2272 lo->ldo_dir_stripe_loaded = 1;
2274 rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2276 lod_striping_free(env, lo);
/* Cleanup for a failed allocation: only stripes[0] may still hold a
 * reference here; every other slot is asserted to be empty before the
 * array is freed. */
2282 if (!IS_ERR_OR_NULL(stripes[0]))
2283 dt_object_put(env, stripes[0]);
2284 for (i = 1; i < stripe_count; i++)
2285 LASSERT(!stripes[i]);
2286 OBD_FREE_PTR_ARRAY(stripes, stripe_count);
2293 * Alloc cached foreign LOV
2295 * \param[in] lo object
2296 * \param[in] size size of foreign LOV
2298 * \retval 0 on success
2299 * \retval negative if failed
/*
 * Allocate \a size bytes for lo->ldo_foreign_lov with OBD_ALLOC_LARGE,
 * record the size in ldo_foreign_lov_size and mark the object as foreign.
 * NOTE(review): the return statements are not visible in this view.
 */
2301 int lod_alloc_foreign_lov(struct lod_object *lo, size_t size)
2303 OBD_ALLOC_LARGE(lo->ldo_foreign_lov, size);
2304 if (lo->ldo_foreign_lov == NULL)
2306 lo->ldo_foreign_lov_size = size;
2307 lo->ldo_is_foreign = 1;
2313 * Free cached foreign LOV
2315 * \param[in] lo object
/*
 * Release the cached foreign LOV buffer, if any, and reset the pointer,
 * the recorded size and the ldo_is_foreign flag. The three fields are
 * reset even when no buffer was allocated.
 */
2317 void lod_free_foreign_lov(struct lod_object *lo)
2319 if (lo->ldo_foreign_lov != NULL)
2320 OBD_FREE_LARGE(lo->ldo_foreign_lov, lo->ldo_foreign_lov_size);
2321 lo->ldo_foreign_lov = NULL;
2322 lo->ldo_foreign_lov_size = 0;
2323 lo->ldo_is_foreign = 0;
2328 * Alloc cached foreign LMV
2330 * \param[in] lo object
2331 * \param[in] size size of foreign LMV
2333 * \retval 0 on success
2334 * \retval negative if failed
/*
 * Allocate \a size bytes for lo->ldo_foreign_lmv with OBD_ALLOC_LARGE,
 * record the size in ldo_foreign_lmv_size and mark the object as foreign.
 * NOTE(review): the return statements are not visible in this view.
 */
2336 static int lod_alloc_foreign_lmv(struct lod_object *lo, size_t size)
2338 OBD_ALLOC_LARGE(lo->ldo_foreign_lmv, size);
2339 if (lo->ldo_foreign_lmv == NULL)
2341 lo->ldo_foreign_lmv_size = size;
2342 lo->ldo_is_foreign = 1;
/*
 * Declare creation of a striped directory from an already complete LMV EA.
 *
 * Under lo->ldo_layout_mutex the EA in \a lmv_buf is parsed with
 * lod_parse_dir_striping(), the object is marked as loaded and striped,
 * and lod_dir_declare_create_stripes() declares the stripe updates.
 * NOTE(review): the test on the parse result and the return statement are
 * not visible in this view.
 */
2347 static int lod_prep_md_replayed_create(const struct lu_env *env,
2348 struct dt_object *dt,
2349 struct lu_attr *attr,
2350 const struct lu_buf *lmv_buf,
2351 struct dt_object_format *dof,
2354 struct lod_object *lo = lod_dt_obj(dt);
2359 mutex_lock(&lo->ldo_layout_mutex);
2360 rc = lod_parse_dir_striping(env, lo, lmv_buf);
2362 lo->ldo_dir_stripe_loaded = 1;
2363 lo->ldo_dir_striped = 1;
2364 rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2366 mutex_unlock(&lo->ldo_layout_mutex);
2373 * Free cached foreign LMV
2375 * \param[in] lo object
/*
 * Release the cached foreign LMV buffer, if any, and reset the pointer,
 * the recorded size and the ldo_is_foreign flag. The three fields are
 * reset even when no buffer was allocated.
 */
2377 static void lod_free_foreign_lmv(struct lod_object *lo)
2379 if (lo->ldo_foreign_lmv != NULL)
2380 OBD_FREE_LARGE(lo->ldo_foreign_lmv, lo->ldo_foreign_lmv_size);
2381 lo->ldo_foreign_lmv = NULL;
2382 lo->ldo_foreign_lmv_size = 0;
2383 lo->ldo_is_foreign = 0;
2387 * Declare create striped md object.
2389 * The function declares intention to create a striped directory. This is a
2390 * wrapper for lod_prep_md_striped_create(). The only additional functionality
2391 * is to verify pattern \a lum_buf is good. Check that function for the details.
2393 * \param[in] env execution environment
2394 * \param[in] dt object
2395 * \param[in] attr attributes to initialize the objects with
2396 * \param[in] lum_buf a pattern specifying the number of stripes and
2398 * \param[in] dof type of objects to be created
2399 * \param[in] th transaction handle
2401 * \retval 0 on success
2402 * \retval negative if failed
/*
 * Declare the LMV xattr of a directory that is being created.
 *
 * When lo->ldo_dir_stripe_count is zero and the object is foreign, the
 * user buffer is copied verbatim into a freshly allocated
 * lo->ldo_foreign_lmv and the striping is marked as loaded. Otherwise an
 * LMV_MAGIC_V1 buffer is handled by lod_prep_md_replayed_create() and any
 * other magic by lod_prep_md_striped_create(). lod_striping_free() resets
 * the in-memory striping after a failure.
 *
 * NOTE(review): the result checks and return statements are not visible in
 * this view, including what happens for a zero stripe count on a
 * non-foreign object.
 */
2405 static int lod_declare_xattr_set_lmv(const struct lu_env *env,
2406 struct dt_object *dt,
2407 struct lu_attr *attr,
2408 const struct lu_buf *lum_buf,
2409 struct dt_object_format *dof,
2412 struct lod_object *lo = lod_dt_obj(dt);
2413 struct lmv_user_md_v1 *lum = lum_buf->lb_buf;
2417 LASSERT(lum != NULL);
2420 "lum magic=%x hash=%x count=%u offset=%d inherit=%u rr=%u\n",
2421 le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_hash_type),
2422 le32_to_cpu(lum->lum_stripe_count),
2423 (int)le32_to_cpu(lum->lum_stripe_offset),
2424 lum->lum_max_inherit, lum->lum_max_inherit_rr);
2426 if (lo->ldo_dir_stripe_count == 0) {
2427 if (lo->ldo_is_foreign) {
2428 rc = lod_alloc_foreign_lmv(lo, lum_buf->lb_len);
2431 memcpy(lo->ldo_foreign_lmv, lum, lum_buf->lb_len);
2432 lo->ldo_dir_stripe_loaded = 1;
2437 /* client replay striped directory creation with LMV, this happens when
2438 * all involved MDTs were rebooted, or MDT recovery was aborted.
2440 if (le32_to_cpu(lum->lum_magic) == LMV_MAGIC_V1)
2441 rc = lod_prep_md_replayed_create(env, dt, attr, lum_buf, dof,
2444 rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
2446 /* failed to create striping, let's reset
2447 * config so that others don't get confused */
2448 lod_striping_free(env, lo);
2454 * Set or replace striped directory layout, and LFSCK may set layout on a plain
2455 * directory, so don't check stripe count.
2457 * \param[in] env execution environment
2458 * \param[in] dt target object
2459 * \param[in] lmv_buf LMV buf which contains source stripe FIDs
2460 * \param[in] fl set or replace
2461 * \param[in] th transaction handle
2463 * \retval 0 on success
2464 * \retval negative if failed
/*
 * Write a new LMV layout to a directory and to each of its stripes.
 *
 * After lmv_is_sane2() accepts the header, the LMV xattr is written to the
 * child object with lod_sub_xattr_set(). When the directory has stripes,
 * the hash type, migrate offset, migrate hash and layout version cached in
 * \a lo are refreshed from the header, a slave header is derived with
 * lod_prep_slave_lmv_md() in a temporary allocation, and that header is
 * written to every non-NULL, existing stripe. The temporary header is
 * freed before returning.
 *
 * NOTE(review): result checks, "continue"s and return statements are not
 * visible in this view.
 */
2466 static int lod_dir_layout_set(const struct lu_env *env,
2467 struct dt_object *dt,
2468 const struct lu_buf *lmv_buf,
2472 struct dt_object *next = dt_object_child(dt);
2473 struct lod_object *lo = lod_dt_obj(dt);
2474 struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2475 struct lmv_mds_md_v1 *lmv = lmv_buf->lb_buf;
2476 struct lmv_mds_md_v1 *slave_lmv;
2477 struct lu_buf slave_buf;
2483 if (!lmv_is_sane2(lmv))
/* NOTE(review): lmv_migrate_hash is tested and updated below without
 * le32_to_cpu()/cpu_to_le32(), whereas the same header's fields are
 * converted with le32_to_cpu() further down; confirm that this is
 * endian-safe on big-endian hosts. Note also that the update writes
 * through \a lmv_buf, i.e. into the caller's buffer. */
2486 /* adjust hash for dir merge, which may not be set in user command */
2487 if (lmv_is_merging(lmv) &&
2488 !(lmv->lmv_migrate_hash & LMV_HASH_TYPE_MASK))
2489 lmv->lmv_merge_hash |=
2490 lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern &
2493 LMV_DEBUG(D_INFO, lmv, "set");
2495 rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV, fl, th);
2499 /* directory restripe may update stripe LMV directly */
2500 if (!lo->ldo_dir_stripe_count)
2503 lo->ldo_dir_hash_type = le32_to_cpu(lmv->lmv_hash_type);
2504 lo->ldo_dir_migrate_offset = le32_to_cpu(lmv->lmv_migrate_offset);
2505 lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_migrate_hash);
2506 lo->ldo_dir_layout_version = le32_to_cpu(lmv->lmv_layout_version);
2508 OBD_ALLOC_PTR(slave_lmv);
2512 lod_prep_slave_lmv_md(slave_lmv, lmv);
2513 slave_buf.lb_buf = slave_lmv;
2514 slave_buf.lb_len = sizeof(*slave_lmv);
2516 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2517 if (!lo->ldo_stripe[i])
2520 if (!dt_object_exists(lo->ldo_stripe[i]))
2523 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], &slave_buf,
2524 XATTR_NAME_LMV, fl, th);
2529 OBD_FREE_PTR(slave_lmv);
2535 * Implementation of dt_object_operations::do_declare_xattr_set.
2537 * Used with regular (non-striped) objects. Basically it
2538 * initializes the striping information and applies the
2539 * change to all the stripes.
2541 * \see dt_object_operations::do_declare_xattr_set() in the API description
/*
 * Declare an xattr update on a directory.
 *
 * A default-LMV value is checked for a non-NULL buffer of at least
 * sizeof(struct lmv_user_md_v1) and passed to lod_verify_md_striping(); a
 * LOV value is passed to lod_verify_striping().  The update is then declared
 * on the child object and, after lod_striping_load(), on every stripe that
 * exists.
 *
 * NOTE(review): some source lines are absent from the visible text (local
 * declarations, the assignment of lum, return/continue statements and the
 * tail of the last call); confirm the early-exit behaviour against the
 * complete file.
 */
2544 static int lod_dir_declare_xattr_set(const struct lu_env *env,
2545 struct dt_object *dt,
2546 const struct lu_buf *buf,
2547 const char *name, int fl,
2550 struct dt_object *next = dt_object_child(dt);
2551 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
2552 struct lod_object *lo = lod_dt_obj(dt);
2557 if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
2558 struct lmv_user_md_v1 *lum;
2560 LASSERT(buf != NULL);
2561 if (!buf->lb_buf || buf->lb_len < sizeof(*lum))
2565 rc = lod_verify_md_striping(d, lum);
2568 } else if (strcmp(name, XATTR_NAME_LOV) == 0) {
2569 rc = lod_verify_striping(env, d, lo, buf, false);
2574 rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
2578 /* Note: Do not set LinkEA on sub-stripes, otherwise
2579 * it will confuse the fid2path process(see mdt_path_current()).
2580 * The linkEA between master and sub-stripes is set in
2581 * lod_xattr_set_lmv(). */
2582 if (strcmp(name, XATTR_NAME_LINK) == 0)
2585 /* set xattr to each stripes, if needed */
2586 rc = lod_striping_load(env, lo);
2590 if (lo->ldo_dir_stripe_count == 0)
2593 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2594 if (!lo->ldo_stripe[i])
2597 if (!dt_object_exists(lo->ldo_stripe[i]))
2600 rc = lod_sub_declare_xattr_set(env, lo->ldo_stripe[i],
/*
 * Per-stripe callback that rewrites the XATTR_NAME_FID xattr (a struct
 * filter_fid kept in the thread info) of one stripe object.
 *
 * It reads the current xattr with dt_xattr_get(), takes the parent FID
 * either from data->locd_buf or from lo, fills in the stripe index and the
 * layout fields of component comp_idx, converts the structure to
 * little-endian and then declares or performs an LU_XATTR_REPLACE update
 * depending on data->locd_declare.
 *
 * NOTE(review): the return type line, local declarations, the setup of
 * buf->lb_buf and the return statements are not part of the visible text, so
 * the exact branch structure between the two parent-FID sources should be
 * confirmed against the complete file.
 */
2610 lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
2611 struct lod_object *lo,
2612 struct dt_object *dt, struct thandle *th,
2613 int comp_idx, int stripe_idx,
2614 struct lod_obj_stripe_cb_data *data)
2616 struct lod_thread_info *info = lod_env_info(env);
2617 struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
2618 struct filter_fid *ff = &info->lti_ff;
2619 struct lu_buf *buf = &info->lti_buf;
2623 buf->lb_len = sizeof(*ff);
2624 rc = dt_xattr_get(env, dt, buf, XATTR_NAME_FID);
2632 * locd_buf is set if it's called by dir migration, which doesn't check
2635 if (data->locd_buf) {
2636 memset(ff, 0, sizeof(*ff));
2637 ff->ff_parent = *(struct lu_fid *)data->locd_buf->lb_buf;
2639 filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
2641 if (lu_fid_eq(lod_object_fid(lo), &ff->ff_parent) &&
2642 ff->ff_layout.ol_comp_id == comp->llc_id)
2645 memset(ff, 0, sizeof(*ff));
2646 ff->ff_parent = *lu_object_fid(&lo->ldo_obj.do_lu);
2649 /* rewrite filter_fid */
/* the stripe index is carried in the f_ver field of the parent FID */
2650 ff->ff_parent.f_ver = stripe_idx;
2651 ff->ff_layout.ol_stripe_size = comp->llc_stripe_size;
2652 ff->ff_layout.ol_stripe_count = comp->llc_stripe_count;
2653 ff->ff_layout.ol_comp_id = comp->llc_id;
2654 ff->ff_layout.ol_comp_start = comp->llc_extent.e_start;
2655 ff->ff_layout.ol_comp_end = comp->llc_extent.e_end;
2656 filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
2658 if (data->locd_declare)
2659 rc = lod_sub_declare_xattr_set(env, dt, buf, XATTR_NAME_FID,
2660 LU_XATTR_REPLACE, th);
2662 rc = lod_sub_xattr_set(env, dt, buf, XATTR_NAME_FID,
2663 LU_XATTR_REPLACE, th);
2669 * Reset parent FID on OST object
2671 * Replace parent FID with @dt object FID, which is only called during migration
2672 * to reset the parent FID after the MDT object is migrated to the new MDT, i.e.
2673 * the FID is changed.
2675 * \param[in] env execution environment
2676 * \param[in] dt dt_object whose stripes' parent FID will be reset
2677 * \param[in] th thandle
2678 * \param[in] declare if it is declare
2680 * \retval 0 if reset succeeds
2681 * \retval negative errno if reset fails
/*
 * Run lod_obj_stripe_replace_parent_fid_cb() over every stripe of dt.
 *
 * dt is asserted to be a regular file.  The striping is loaded first, the
 * thread ea store is resized when it is smaller than a struct filter_fid,
 * and buf together with the declare flag is handed to the callback through
 * struct lod_obj_stripe_cb_data.
 *
 * NOTE(review): the rc declaration and the return statements that follow
 * each check are not part of the visible text.
 */
2683 static int lod_replace_parent_fid(const struct lu_env *env,
2684 struct dt_object *dt,
2685 const struct lu_buf *buf,
2686 struct thandle *th, bool declare)
2688 struct lod_object *lo = lod_dt_obj(dt);
2689 struct lod_thread_info *info = lod_env_info(env);
2690 struct filter_fid *ff;
2691 struct lod_obj_stripe_cb_data data = { { 0 } };
2695 LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
2697 /* set xattr to each stripes, if needed */
2698 rc = lod_striping_load(env, lo);
2702 if (!lod_obj_is_striped(dt))
/* ff is only used for sizeof() in the visible lines */
2705 if (info->lti_ea_store_size < sizeof(*ff)) {
2706 rc = lod_ea_store_resize(info, sizeof(*ff));
2711 data.locd_declare = declare;
2712 data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
2713 data.locd_buf = buf;
2714 rc = lod_obj_for_each_stripe(env, lo, th, &data);
/*
 * Return the stripe count to use for component comp_idx of lo.
 *
 * Visible paths: the ostlist op_count of the matching default-striping
 * entry; the stored llc_stripe_count of a component for which
 * lod_comp_inited() is true; the result of lod_get_stripe_count_plain() when
 * llc_stripe_count lies in [LOV_ALL_STRIPES_MIN, LOV_ALL_STRIPES_MAX]; and
 * otherwise the result of lod_get_stripe_count().  Both helpers receive the
 * LOV_PATTERN_OVERSTRIPING bit of llc_pattern.
 *
 * NOTE(review): the last parameter and the condition guarding the
 * default-striping path are not visible here; presumably an is_dir flag -
 * confirm against the complete file.
 */
2719 __u16 lod_comp_entry_stripe_count(struct lod_object *lo, int comp_idx,
2722 struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2723 struct lod_layout_component *entry;
2724 enum lod_uses_hint flags = LOD_USES_ASSIGNED_STRIPE;
2727 entry = &lo->ldo_def_striping->lds_def_comp_entries[comp_idx];
2728 return entry->llc_ostlist.op_count;
2731 entry = &lo->ldo_comp_entries[comp_idx];
2732 if (lod_comp_inited(entry))
2733 return entry->llc_stripe_count;
2734 if (entry->llc_stripe_count >= LOV_ALL_STRIPES_MIN &&
2735 entry->llc_stripe_count <= LOV_ALL_STRIPES_MAX)
2736 return lod_get_stripe_count_plain(lod, lo,
2737 entry->llc_stripe_count,
2738 entry->llc_pattern &
2739 LOV_PATTERN_OVERSTRIPING,
2742 return lod_get_stripe_count(lod, lo, comp_idx, entry->llc_stripe_count,
2743 entry->llc_pattern & LOV_PATTERN_OVERSTRIPING,
/*
 * Compute the size in bytes of the LOV metadata describing lo.
 *
 * The component list comes either from lo->ldo_def_striping or from the
 * component entries of the object itself.  A foreign layout returns
 * lo->ldo_foreign_lov_size.  Otherwise the size is a lov_comp_md_v1 header
 * plus one lov_comp_md_entry_v1 per component, plus for each component
 * either lov_foreign_md_size() or lov_user_md_size() of its stripe count and
 * magic.  The running size is asserted to stay a multiple of
 * sizeof(__u64).
 *
 * NOTE(review): the conditions selecting between these paths (is_dir,
 * is_foreign, is_composite), several declarations and the final return are
 * not part of the visible text - confirm against the complete file.
 */
2747 static int lod_comp_md_size(struct lod_object *lo, bool is_dir)
2749 int magic, size = 0, i;
2750 struct lod_layout_component *comp_entries;
2752 bool is_composite, is_foreign = false;
2755 comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
2756 comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
2758 lo->ldo_def_striping->lds_def_striping_is_composite;
2760 comp_cnt = lo->ldo_comp_cnt;
2761 comp_entries = lo->ldo_comp_entries;
2762 is_composite = lo->ldo_is_composite;
2763 is_foreign = lo->ldo_is_foreign;
2767 return lo->ldo_foreign_lov_size;
2769 LASSERT(comp_cnt != 0 && comp_entries != NULL);
2771 size = sizeof(struct lov_comp_md_v1) +
2772 sizeof(struct lov_comp_md_entry_v1) * comp_cnt;
2773 LASSERT(size % sizeof(__u64) == 0);
2776 for (i = 0; i < comp_cnt; i++) {
2779 if (comp_entries[i].llc_magic == LOV_MAGIC_FOREIGN) {
2780 size += lov_foreign_md_size(comp_entries[i].llc_length);
2782 magic = comp_entries[i].llc_pool ? LOV_MAGIC_V3 :
2784 stripe_count = lod_comp_entry_stripe_count(lo, i,
2786 if (!is_dir && is_composite)
2787 lod_comp_shrink_stripe_count(&comp_entries[i],
/* a directory default with an explicit OST list uses the SPECIFIC magic */
2789 if (is_dir && comp_entries[i].llc_ostlist.op_count)
2790 magic = LOV_MAGIC_SPECIFIC;
2792 size += lov_user_md_size(stripe_count, magic);
2794 LASSERT(size % sizeof(__u64) == 0);
2800 * Declare component add. The xattr name is XATTR_LUSTRE_LOV.add, and
2801 * the xattr value is binary lov_comp_md_v1 which contains component(s)
2804 * \param[in] env execution environment
2805 * \param[in] dt dt_object to add components on
2806 * \param[in] buf buffer contains components to be added
2807 * \param[in] th thandle
2809 * \retval 0 on success
2810 * \retval negative errno on failure
/*
 * Declare appending the components described by buf (a lov_comp_md_v1) to
 * the composite layout of dt.
 *
 * Visible steps: assert the layout is composite, test ldo_flr_state against
 * LCM_FL_NONE, verify the new striping, byte-swap the buffer when its magic
 * is the swabbed LOV_USER_MAGIC_COMP_V1, then under ldo_layout_mutex build a
 * new component array holding the old entries followed by the new ones,
 * install it, size the LOV xattr with lod_comp_md_size() and declare the
 * xattr update on the child object.  The old array and count are restored
 * when that fails; otherwise the old array is freed and the end index of the
 * single mirror is updated.  The trailing cleanup frees pool names of the
 * new entries and the new array.
 *
 * NOTE(review): labels, braces, return/goto statements and some
 * declarations are not part of the visible text, so which paths reach the
 * trailing cleanup must be confirmed against the complete file.
 */
2812 static int lod_declare_layout_add(const struct lu_env *env,
2813 struct dt_object *dt,
2814 const struct lu_buf *buf,
2817 struct lod_thread_info *info = lod_env_info(env);
2818 struct lod_layout_component *comp_array, *lod_comp, *old_array;
2819 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
2820 struct dt_object *next = dt_object_child(dt);
2821 struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
2822 struct lod_object *lo = lod_dt_obj(dt);
2823 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
2825 int i, rc, array_cnt, old_array_cnt;
2828 LASSERT(lo->ldo_is_composite);
2830 if (lo->ldo_flr_state != LCM_FL_NONE)
2833 rc = lod_verify_striping(env, d, lo, buf, false);
/* accept a buffer written in the opposite byte order by swabbing it in place */
2837 magic = comp_v1->lcm_magic;
2838 if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2839 lustre_swab_lov_comp_md_v1(comp_v1);
2840 magic = comp_v1->lcm_magic;
2843 if (magic != LOV_USER_MAGIC_COMP_V1)
2846 mutex_lock(&lo->ldo_layout_mutex);
/* new array = existing components followed by the components being added */
2848 array_cnt = lo->ldo_comp_cnt + comp_v1->lcm_entry_count;
2849 OBD_ALLOC_PTR_ARRAY(comp_array, array_cnt);
2850 if (comp_array == NULL) {
2851 mutex_unlock(&lo->ldo_layout_mutex);
2856 memcpy(comp_array, lo->ldo_comp_entries,
2857 sizeof(*comp_array) * lo->ldo_comp_cnt);
2859 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2860 struct lov_user_md_v1 *v1;
2861 struct lu_extent *ext;
2863 v1 = (struct lov_user_md *)((char *)comp_v1 +
2864 comp_v1->lcm_entries[i].lcme_offset);
2865 ext = &comp_v1->lcm_entries[i].lcme_extent;
2867 lod_comp = &comp_array[lo->ldo_comp_cnt + i];
2868 lod_comp->llc_extent.e_start = ext->e_start;
2869 lod_comp->llc_extent.e_end = ext->e_end;
2870 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
2871 lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
2873 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
2874 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
2875 lod_comp->llc_pattern = v1->lmm_pattern;
2877 * limit stripe count so that it's less than/equal to
2878 * extent_size / stripe_size.
2880 * Note: extension size reused llc_stripe_size field and
2881 * uninstantiated component could be defined with
2882 * extent_start == extent_end as extension component will
2885 if (!(lod_comp->llc_flags & LCME_FL_EXTENSION) &&
2886 (lod_comp_inited(lod_comp) ||
2887 lod_comp->llc_extent.e_start <
2888 lod_comp->llc_extent.e_end) &&
2889 !(lod_comp->llc_stripe_count >= LOV_ALL_STRIPES_MIN &&
2890 lod_comp->llc_stripe_count <= LOV_ALL_STRIPES_MAX) &&
2891 ext->e_end != OBD_OBJECT_EOF &&
2892 (__u64)(lod_comp->llc_stripe_count *
2893 lod_comp->llc_stripe_size) >
2894 (ext->e_end - ext->e_start))
2895 lod_comp->llc_stripe_count =
2896 DIV_ROUND_UP(ext->e_end - ext->e_start,
2897 lod_comp->llc_stripe_size);
2898 lod_adjust_stripe_info(lod_comp, desc, 0);
2900 if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
2901 struct lov_user_md_v3 *v3 = (typeof(*v3) *) v1;
2903 if (v3->lmm_pool_name[0] != '\0' &&
2904 !lov_pool_is_ignored(v3->lmm_pool_name)) {
2905 rc = lod_set_pool(&lod_comp->llc_pool,
2913 old_array = lo->ldo_comp_entries;
2914 old_array_cnt = lo->ldo_comp_cnt;
2916 lo->ldo_comp_entries = comp_array;
2917 lo->ldo_comp_cnt = array_cnt;
2919 /* No need to increase layout generation here, it will be increased
2920 * later when generating component ID for the new components */
2922 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
2923 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
2924 XATTR_NAME_LOV, 0, th);
2926 lo->ldo_comp_entries = old_array;
2927 lo->ldo_comp_cnt = old_array_cnt;
2931 OBD_FREE_PTR_ARRAY(old_array, old_array_cnt);
2933 LASSERT(lo->ldo_mirror_count == 1);
2934 lo->ldo_mirrors[0].lme_end = array_cnt - 1;
2936 mutex_unlock(&lo->ldo_layout_mutex);
2941 for (i = lo->ldo_comp_cnt; i < array_cnt; i++) {
2942 lod_comp = &comp_array[i];
2943 if (lod_comp->llc_pool != NULL) {
2944 OBD_FREE(lod_comp->llc_pool,
2945 strlen(lod_comp->llc_pool) + 1);
2946 lod_comp->llc_pool = NULL;
2949 OBD_FREE_PTR_ARRAY(comp_array, array_cnt);
2950 mutex_unlock(&lo->ldo_layout_mutex);
2956 * lod_last_non_stale_mirror() - Check if a mirror is the last non-stale mirror.
2957 * @mirror_id: Mirror id to be checked.
2960 * This function checks if a mirror with specified @mirror_id is the last
2961 * non-stale mirror of a LOD object @lo.
2963 * Return: true or false.
/*
 * Scan the mirrors of lo other than mirror_id that are not flagged
 * lme_stale and check each of their components for LCME_FL_STALE.
 *
 * NOTE(review): the loop index declaration, the continue/break statements
 * and both return statements are not part of the visible text; presumably a
 * mirror without any stale component means mirror_id is not the last
 * non-stale one - confirm against the complete file.
 */
2966 bool lod_last_non_stale_mirror(__u16 mirror_id, struct lod_object *lo)
2968 struct lod_layout_component *lod_comp;
2969 bool has_stale_flag;
2972 for (i = 0; i < lo->ldo_mirror_count; i++) {
2973 if (lo->ldo_mirrors[i].lme_id == mirror_id ||
2974 lo->ldo_mirrors[i].lme_stale)
2977 has_stale_flag = false;
2978 lod_foreach_mirror_comp(lod_comp, lo, i) {
2979 if (lod_comp->llc_flags & LCME_FL_STALE) {
2980 has_stale_flag = true;
2984 if (!has_stale_flag)
2992 * Declare component set. The xattr name is XATTR_LUSTRE_LOV.set.$field,
2993 * the '$field' can only be 'flags' now. The xattr value is binary
2994 * lov_comp_md_v1 which contains the component ID(s) and the value of
2995 * the field to be modified.
2996 * Please update allowed_lustre_lov macro if $field groks more values
2999 * \param[in] env execution environment
3000 * \param[in] dt dt_object to be modified
3001 * \param[in] op operation string, like "set.flags"
3002 * \param[in] buf buffer contains components to be set
3003 * \param[in] th thandle
3005 * \retval 0 on success
3006 * \retval negative errno on failure
/*
 * Declare a flag change on components of dt, driven by the lov_comp_md_v1
 * in buf.
 *
 * Only the set.flags operation is accepted by the visible check.  The buffer
 * is byte-swapped when its magic is the swabbed LOV_USER_MAGIC_COMP_V1 and
 * must contain at least one entry.  Under ldo_layout_mutex each entry is
 * matched against the components of lo by component id or by mirror id, and
 * the selected flags are cleared or set on llc_flags; setting
 * LCME_FL_NOSYNC through the mirror flags also stamps llc_timestamp.
 * Afterwards the layout generation is bumped and the LOV xattr replace is
 * declared on the child object.
 *
 * NOTE(review): declarations, braces, else branches and return statements
 * are missing from the visible text, so the precise handling of
 * LCME_FL_INIT, LCME_FL_NEG and of the last non-stale mirror check must be
 * confirmed against the complete file.
 */
3008 static int lod_declare_layout_set(const struct lu_env *env,
3009 struct dt_object *dt,
3010 char *op, const struct lu_buf *buf,
3013 struct lod_layout_component *lod_comp;
3014 struct lod_thread_info *info = lod_env_info(env);
3015 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3016 struct lod_object *lo = lod_dt_obj(dt);
3017 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3020 bool changed = false;
3023 /* Please update allowed_lustre_lov macro if op
3024 * groks more values in the future
3026 if (strcmp(op, "set.flags") != 0) {
3027 CDEBUG(D_LAYOUT, "%s: operation (%s) not supported.\n",
3028 lod2obd(d)->obd_name, op);
3032 magic = comp_v1->lcm_magic;
3033 if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
3034 lustre_swab_lov_comp_md_v1(comp_v1);
3035 magic = comp_v1->lcm_magic;
3038 if (magic != LOV_USER_MAGIC_COMP_V1)
3041 if (comp_v1->lcm_entry_count == 0) {
3042 CDEBUG(D_LAYOUT, "%s: entry count is zero.\n",
3043 lod2obd(d)->obd_name);
3047 mutex_lock(&lo->ldo_layout_mutex);
3048 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3049 __u32 id = comp_v1->lcm_entries[i].lcme_id;
3050 __u32 flags = comp_v1->lcm_entries[i].lcme_flags;
3051 __u32 mirror_flag = flags & LCME_MIRROR_FLAGS;
3052 __u16 mirror_id = mirror_id_of(id);
3053 bool neg = flags & LCME_FL_NEG;
3055 if (flags & LCME_FL_INIT) {
3057 lod_striping_free_nolock(env, lo);
3058 mutex_unlock(&lo->ldo_layout_mutex);
3062 flags &= ~(LCME_MIRROR_FLAGS | LCME_FL_NEG);
3063 for (j = 0; j < lo->ldo_comp_cnt; j++) {
3064 lod_comp = &lo->ldo_comp_entries[j];
3066 /* lfs only put one flag in each entry */
3067 if ((flags && id != lod_comp->llc_id) ||
3068 (mirror_flag && mirror_id !=
3069 mirror_id_of(lod_comp->llc_id)))
3074 lod_comp->llc_flags &= ~flags;
3076 lod_comp->llc_flags &= ~mirror_flag;
3079 if ((flags & LCME_FL_STALE) &&
3080 lod_last_non_stale_mirror(mirror_id,
3083 &lo->ldo_layout_mutex);
3086 lod_comp->llc_flags |= flags;
3089 lod_comp->llc_flags |= mirror_flag;
3090 if (mirror_flag & LCME_FL_NOSYNC)
3091 lod_comp->llc_timestamp =
3092 ktime_get_real_seconds();
3098 mutex_unlock(&lo->ldo_layout_mutex);
3101 CDEBUG(D_LAYOUT, "%s: requested component(s) not found.\n",
3102 lod2obd(d)->obd_name);
/* bump the generation, then size and declare the rewritten LOV xattr */
3106 lod_obj_inc_layout_gen(lo);
3108 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3109 rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), &info->lti_buf,
3110 XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3115 * Declare component deletion. The xattr name is XATTR_LUSTRE_LOV.del,
3116 * and the xattr value is a unique component ID or a special lcme_id.
3118 * \param[in] env execution environment
3119 * \param[in] dt dt_object to be operated on
3120 * \param[in] buf buffer contains component ID or lcme_id
3121 * \param[in] th thandle
3123 * \retval 0 on success
3124 * \retval negative errno on failure
/*
 * Declare deletion of trailing components of the composite layout of dt.
 *
 * The first entry of the lov_comp_md_v1 in buf selects components either by
 * id or by flags (optionally negated with LCME_FL_NEG); giving both, or
 * neither, is rejected with a debug message.  Under ldo_layout_mutex the
 * components are walked from last to first; a selected component is marked
 * with LCME_ID_INVAL and, when it has stripe objects, their destruction is
 * declared.  A selection that would leave a hole in the component list is
 * rejected.  Afterwards an LA_SIZE attribute update and either a LOV xattr
 * set or a LOV xattr delete are declared on the child object.
 *
 * NOTE(review): declarations, the updates of the left counter, the
 * continue/return statements and the condition choosing between xattr set
 * and xattr delete are not part of the visible text - confirm against the
 * complete file.
 */
3126 static int lod_declare_layout_del(const struct lu_env *env,
3127 struct dt_object *dt,
3128 const struct lu_buf *buf,
3131 struct lod_thread_info *info = lod_env_info(env);
3132 struct dt_object *next = dt_object_child(dt);
3133 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3134 struct lod_object *lo = lod_dt_obj(dt);
3135 struct lu_attr *attr = &lod_env_info(env)->lti_attr;
3136 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3137 __u32 magic, id, flags, neg_flags = 0;
3141 LASSERT(lo->ldo_is_composite);
3143 if (lo->ldo_flr_state != LCM_FL_NONE)
3146 magic = comp_v1->lcm_magic;
3147 if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
3148 lustre_swab_lov_comp_md_v1(comp_v1);
3149 magic = comp_v1->lcm_magic;
3152 if (magic != LOV_USER_MAGIC_COMP_V1)
/* only the first entry of the buffer is consulted */
3155 id = comp_v1->lcm_entries[0].lcme_id;
3156 flags = comp_v1->lcm_entries[0].lcme_flags;
3158 if (id > LCME_ID_MAX || (flags & ~LCME_KNOWN_FLAGS)) {
3159 CDEBUG(D_LAYOUT, "%s: invalid component id %#x, flags %#x\n",
3160 lod2obd(d)->obd_name, id, flags);
3164 if (id != LCME_ID_INVAL && flags != 0) {
3165 CDEBUG(D_LAYOUT, "%s: specified both id and flags.\n",
3166 lod2obd(d)->obd_name);
3170 if (id == LCME_ID_INVAL && !flags) {
3171 CDEBUG(D_LAYOUT, "%s: no id or flags specified.\n",
3172 lod2obd(d)->obd_name);
3176 if (flags & LCME_FL_NEG) {
3177 neg_flags = flags & ~LCME_FL_NEG;
3181 mutex_lock(&lo->ldo_layout_mutex);
3183 left = lo->ldo_comp_cnt;
3185 mutex_unlock(&lo->ldo_layout_mutex);
/* walk from the last component backwards so deletions stay contiguous */
3189 for (i = (lo->ldo_comp_cnt - 1); i >= 0; i--) {
3190 struct lod_layout_component *lod_comp;
3192 lod_comp = &lo->ldo_comp_entries[i];
3194 if (id != LCME_ID_INVAL && id != lod_comp->llc_id)
3196 else if (flags && !(flags & lod_comp->llc_flags))
3198 else if (neg_flags && (neg_flags & lod_comp->llc_flags))
3201 if (left != (i + 1)) {
3202 CDEBUG(D_LAYOUT, "%s: this deletion will create "
3203 "a hole.\n", lod2obd(d)->obd_name);
3204 mutex_unlock(&lo->ldo_layout_mutex);
3209 /* Mark the component as deleted */
3210 lod_comp->llc_id = LCME_ID_INVAL;
3212 /* Not instantiated component */
3213 if (lod_comp->llc_stripe == NULL)
3216 LASSERT(lod_comp->llc_stripe_count > 0);
3217 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
3218 struct dt_object *obj = lod_comp->llc_stripe[j];
3222 rc = lod_sub_declare_destroy(env, obj, th);
3224 mutex_unlock(&lo->ldo_layout_mutex);
3230 LASSERTF(left >= 0, "left = %d\n", left);
3231 if (left == lo->ldo_comp_cnt) {
3232 CDEBUG(D_LAYOUT, "%s: requested component id:%#x not found\n",
3233 lod2obd(d)->obd_name, id);
3234 mutex_unlock(&lo->ldo_layout_mutex);
3238 mutex_unlock(&lo->ldo_layout_mutex);
3240 memset(attr, 0, sizeof(*attr));
3241 attr->la_valid = LA_SIZE;
3242 rc = lod_sub_declare_attr_set(env, next, attr, th);
3247 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3248 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
3249 XATTR_NAME_LOV, 0, th);
3251 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
3258 * Declare layout add/set/del operations issued by special xattr names:
3260 * XATTR_LUSTRE_LOV.add add component(s) to existing file
3261 * XATTR_LUSTRE_LOV.del delete component(s) from existing file
3262 * XATTR_LUSTRE_LOV.set.$field set specified field of certain component(s)
3264 * \param[in] env execution environment
3265 * \param[in] dt object
3266 * \param[in] name name of xattr
3267 * \param[in] buf lu_buf contains xattr value
3268 * \param[in] th transaction handle
3270 * \retval 0 on success
3271 * \retval negative if failed
/*
 * Dispatch a layout-modifying xattr name to the add, del or set declare
 * helper.
 *
 * dt must exist.  The name has to be longer than XATTR_LUSTRE_LOV and carry
 * a dot right after that prefix.  After lod_striping_load() the layout must
 * be composite, otherwise -EINVAL is used; an operation other than add, del
 * or set* yields -ENOTSUPP.  lod_striping_free() is called on the visible
 * exit path.
 *
 * NOTE(review): the name parameter line, the op declaration, return
 * statements, the unlock label and any adjustment of len between the prefix
 * check and the computation of op are not part of the visible text -
 * confirm against the complete file before relying on how op is derived.
 */
3273 static int lod_declare_modify_layout(const struct lu_env *env,
3274 struct dt_object *dt,
3276 const struct lu_buf *buf,
3279 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3280 struct lod_object *lo = lod_dt_obj(dt);
3282 int rc, len = strlen(XATTR_LUSTRE_LOV);
3285 LASSERT(dt_object_exists(dt));
3287 if (strlen(name) <= len || name[len] != '.') {
3288 CDEBUG(D_LAYOUT, "%s: invalid xattr name: %s\n",
3289 lod2obd(d)->obd_name, name);
3294 rc = lod_striping_load(env, lo);
3298 /* the layout to be modified must be a composite layout */
3299 if (!lo->ldo_is_composite) {
3300 CDEBUG(D_LAYOUT, "%s: object "DFID" isn't a composite file.\n",
3301 lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3302 GOTO(unlock, rc = -EINVAL);
3305 op = (char *)name + len;
3306 if (strcmp(op, "add") == 0) {
3307 rc = lod_declare_layout_add(env, dt, buf, th);
3308 } else if (strcmp(op, "del") == 0) {
3309 rc = lod_declare_layout_del(env, dt, buf, th);
3310 } else if (strncmp(op, "set", strlen("set")) == 0) {
3311 rc = lod_declare_layout_set(env, dt, op, buf, th);
3313 CDEBUG(D_LAYOUT, "%s: unsupported xattr name:%s\n",
3314 lod2obd(d)->obd_name, name);
3315 GOTO(unlock, rc = -ENOTSUPP);
3319 lod_striping_free(env, lo);
3325 * Convert a plain file lov_mds_md to a composite layout.
3327 * \param[in,out] info the thread info::lti_ea_store buffer contains little
3328 * endian plain file layout
3330 * \retval 0 on success, <0 on failure
/*
 * Wrap the plain layout stored in info->lti_ea_store into a composite
 * layout with a single component.
 *
 * The plain lov_mds_md blob is saved in a temporary buffer, the ea store is
 * resized when it is too small for header + one entry + blob, and then the
 * lov_comp_md_v1 header and one lov_comp_md_entry_v1 (flag LCME_FL_INIT,
 * extent 0 .. OBD_OBJECT_EOF) are written in little-endian form, followed by
 * the saved blob.  The temporary buffer is freed at the end.
 *
 * NOTE(review): declarations of size, blob_size and rc, the allocation
 * failure test, labels and the return are not part of the visible text.
 */
3332 static int lod_layout_convert(struct lod_thread_info *info)
3334 struct lov_mds_md *lmm = info->lti_ea_store;
3335 struct lov_mds_md *lmm_save;
3336 struct lov_comp_md_v1 *lcm;
3337 struct lov_comp_md_entry_v1 *lcme;
3343 /* realloc buffer to a composite layout which contains one component */
3344 blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
3345 le32_to_cpu(lmm->lmm_magic));
3346 size = sizeof(*lcm) + sizeof(*lcme) + blob_size;
3348 OBD_ALLOC_LARGE(lmm_save, blob_size);
3350 GOTO(out, rc = -ENOMEM);
/* keep a copy: the resize below may replace info->lti_ea_store */
3352 memcpy(lmm_save, lmm, blob_size);
3354 if (info->lti_ea_store_size < size) {
3355 rc = lod_ea_store_resize(info, size);
3360 lcm = info->lti_ea_store;
3361 memset(lcm, 0, sizeof(*lcm) + sizeof(*lcme));
3362 lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
3363 lcm->lcm_size = cpu_to_le32(size);
3364 lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
3365 lmm_save->lmm_layout_gen));
3366 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
3367 lcm->lcm_entry_count = cpu_to_le16(1);
3369 lcme = &lcm->lcm_entries[0];
3370 lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
3371 lcme->lcme_extent.e_start = 0;
3372 lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
3373 lcme->lcme_offset = cpu_to_le32(sizeof(*lcm) + sizeof(*lcme));
3374 lcme->lcme_size = cpu_to_le32(blob_size);
/*
 * NOTE(review): lcme_offset was stored with cpu_to_le32() just above and is
 * used below as a host-order offset; the two agree only on little-endian
 * hosts - confirm whether big-endian servers matter here.
 */
3376 memcpy((char *)lcm + lcme->lcme_offset, (char *)lmm_save, blob_size);
3381 OBD_FREE_LARGE(lmm_save, blob_size);
3386 * Merge layouts to form a mirrored file.
/*
 * Declare merging the on-disk composite layout in mbuf into the layout of
 * dt as an additional mirror.
 *
 * Visible steps: check mbuf length, magic (LOV_MAGIC_COMP_V1) and mirror
 * count; load the current LOV EA with lod_get_lov_ea() and convert a plain
 * layout with lod_layout_convert(); check the mirror count against
 * LUSTRE_MIRROR_COUNT_MAX; allocate a new buffer sized for both layouts;
 * copy the current entries and blobs with updated offsets; append the merged
 * entries with ids from pflr_id(mirror_id, i + 1); fix up size, entry count,
 * mirror count and flags; reload the striping from the new buffer; bump the
 * layout generation; declare a layout version attribute update on the
 * stripes when more than one mirror exists; and declare the LOV xattr
 * replace on the child object.
 *
 * NOTE(review): declarations, case labels, return/goto statements, the
 * assignment of lcm and the cleanup path are not part of the visible text -
 * confirm against the complete file.
 */
3388 static int lod_declare_layout_merge(const struct lu_env *env,
3389 struct dt_object *dt,
3390 const struct lu_buf *mbuf,
3393 struct lod_thread_info *info = lod_env_info(env);
3394 struct lu_attr *layout_attr = &info->lti_layout_attr;
3395 struct lu_buf *buf = &info->lti_buf;
3396 struct lod_object *lo = lod_dt_obj(dt);
3397 struct lov_comp_md_v1 *lcm;
3398 struct lov_comp_md_v1 *cur_lcm;
3399 struct lov_comp_md_v1 *merge_lcm;
3400 struct lov_comp_md_entry_v1 *lcme;
3401 struct lov_mds_md_v1 *lmm;
3404 __u16 cur_entry_count;
3405 __u16 merge_entry_count;
3407 __u16 mirror_id = 0;
3414 merge_lcm = mbuf->lb_buf;
3415 if (mbuf->lb_len < sizeof(*merge_lcm))
3418 /* must be an existing layout from disk */
3419 if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
3422 merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
3424 /* do not allow to merge two mirrored files */
3425 if (le16_to_cpu(merge_lcm->lcm_mirror_count))
3428 /* verify the target buffer */
3429 rc = lod_get_lov_ea(env, lo);
3431 RETURN(rc ? : -ENODATA);
3433 cur_lcm = info->lti_ea_store;
3434 switch (le32_to_cpu(cur_lcm->lcm_magic)) {
3437 rc = lod_layout_convert(info);
3439 case LOV_MAGIC_COMP_V1:
3449 /* info->lti_ea_store could be reallocated in lod_layout_convert() */
3450 cur_lcm = info->lti_ea_store;
3451 cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
3453 /* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
3454 mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
3455 if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
3458 /* size of new layout */
3459 size = le32_to_cpu(cur_lcm->lcm_size) +
3460 le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
3462 memset(buf, 0, sizeof(*buf));
3463 lu_buf_alloc(buf, size);
3464 if (buf->lb_buf == NULL)
3468 memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
/* blobs start after the header and the combined entry table */
3470 offset = sizeof(*lcm) +
3471 sizeof(*lcme) * (cur_entry_count + merge_entry_count);
3472 for (i = 0; i < cur_entry_count; i++) {
3473 struct lov_comp_md_entry_v1 *cur_lcme;
3475 lcme = &lcm->lcm_entries[i];
3476 cur_lcme = &cur_lcm->lcm_entries[i];
3478 lcme->lcme_offset = cpu_to_le32(offset);
3479 memcpy((char *)lcm + offset,
3480 (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
3481 le32_to_cpu(lcme->lcme_size));
3483 offset += le32_to_cpu(lcme->lcme_size);
3485 if (mirror_count == 1 &&
3486 mirror_id_of(le32_to_cpu(lcme->lcme_id)) == 0) {
3487 /* Add mirror from a non-flr file, create new mirror ID.
3488 * Otherwise, keep existing mirror's component ID, used
3489 * for mirror extension.
3491 id = pflr_id(1, i + 1);
3492 lcme->lcme_id = cpu_to_le32(id);
3495 id = max(le32_to_cpu(lcme->lcme_id), id);
3498 mirror_id = mirror_id_of(id) + 1;
3500 /* check if first entry in new layout is DOM */
/*
 * NOTE(review): lcme_offset is read below without le32_to_cpu(), unlike the
 * other lcme_offset reads in this function; equivalent only on
 * little-endian hosts - confirm.
 */
3501 lmm = (struct lov_mds_md_v1 *)((char *)merge_lcm +
3502 merge_lcm->lcm_entries[0].lcme_offset);
3503 merge_has_dom = lov_pattern(le32_to_cpu(lmm->lmm_pattern)) &
3506 for (i = 0; i < merge_entry_count; i++) {
3507 struct lov_comp_md_entry_v1 *merge_lcme;
3509 merge_lcme = &merge_lcm->lcm_entries[i];
3510 lcme = &lcm->lcm_entries[cur_entry_count + i];
3512 *lcme = *merge_lcme;
3513 lcme->lcme_offset = cpu_to_le32(offset);
3514 if (merge_has_dom && i == 0)
3515 lcme->lcme_flags |= cpu_to_le32(LCME_FL_STALE);
3517 id = pflr_id(mirror_id, i + 1);
3518 lcme->lcme_id = cpu_to_le32(id);
3520 memcpy((char *)lcm + offset,
3521 (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
3522 le32_to_cpu(lcme->lcme_size));
3524 offset += le32_to_cpu(lcme->lcme_size);
3527 /* fixup layout information */
3528 lcm->lcm_size = cpu_to_le32(size);
3529 lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
3530 lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
/*
 * NOTE(review): lcm_flags is read with le16_to_cpu() but stored with
 * cpu_to_le32() on the following line - confirm the intended field width.
 */
3531 if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NONE)
3532 lcm->lcm_flags = cpu_to_le32(LCM_FL_RDONLY);
3534 rc = lod_striping_reload(env, lo, buf, 0);
3538 lod_obj_inc_layout_gen(lo);
3539 lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3541 /* transfer layout version to OST objects. */
3542 if (lo->ldo_mirror_count > 1) {
3543 struct lod_obj_stripe_cb_data data = { {0} };
3545 layout_attr->la_valid = LA_LAYOUT_VERSION;
3546 layout_attr->la_layout_version = 0;
3547 data.locd_attr = layout_attr;
3548 data.locd_declare = true;
3549 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3550 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3555 rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
3556 XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3564 * Split layouts, just set the LOVEA with the layout from mbuf.
/*
 * Declare replacing the LOV EA of dt with the layout in mbuf.
 *
 * The in-memory striping is reloaded from mbuf with LVF_ALL_STALE, the
 * layout generation is bumped and written back into mbuf in little-endian
 * form, a layout version attribute update is declared on the stripes when
 * more than one mirror remains, and finally the LOV xattr replace is
 * declared on the child object.
 *
 * NOTE(review): the th parameter line, the rc declaration and the return
 * statements after each call are not part of the visible text.
 */
3566 static int lod_declare_layout_split(const struct lu_env *env,
3567 struct dt_object *dt, const struct lu_buf *mbuf,
3570 struct lod_thread_info *info = lod_env_info(env);
3571 struct lu_attr *layout_attr = &info->lti_layout_attr;
3572 struct lod_object *lo = lod_dt_obj(dt);
3573 struct lov_comp_md_v1 *lcm = mbuf->lb_buf;
3577 rc = lod_striping_reload(env, lo, mbuf, LVF_ALL_STALE);
3581 lod_obj_inc_layout_gen(lo);
3582 /* fix on-disk layout gen */
3583 lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3585 /* transfer layout version to OST objects. */
3586 if (lo->ldo_mirror_count > 1) {
3587 struct lod_obj_stripe_cb_data data = { {0} };
3589 layout_attr->la_valid = LA_LAYOUT_VERSION;
3590 layout_attr->la_layout_version = 0;
3591 data.locd_attr = layout_attr;
3592 data.locd_declare = true;
3593 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3594 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3599 rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), mbuf,
3600 XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
/*
 * Shared worker for declaring and executing the destruction of the stripe
 * objects of the mirror described by buf (a cpu-endian lov_comp_md_v1).
 *
 * Under ldo_layout_mutex it counts the stripes of all LCME_FL_INIT
 * components, allocates a pointer array of that size, and then for every
 * stripe converts the object id to a FID, validates the OST index, looks the
 * object up on that OST device and either declares its destruction or
 * collects it in the array.  The collected objects are then destroyed with
 * lod_sub_destroy() and their references dropped; the array is freed before
 * the mutex is released.
 *
 * NOTE(review): declarations of j and idx, the tests on the declare flag,
 * error checks, labels and return statements are not part of the visible
 * text - confirm the exact declare versus purge split against the complete
 * file.
 */
3604 static int lod_layout_declare_or_purge_mirror(const struct lu_env *env,
3605 struct dt_object *dt, const struct lu_buf *buf,
3606 struct thandle *th, bool declare)
3608 struct lod_thread_info *info = lod_env_info(env);
3609 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3610 struct lod_object *lo = lod_dt_obj(dt);
3611 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3612 struct lov_comp_md_entry_v1 *entry;
3613 struct lov_mds_md_v1 *lmm;
3614 struct dt_object **sub_objs = NULL;
3615 int rc = 0, i, k, array_count = 0;
3620 * other ops (like lod_declare_destroy) could destroying sub objects
3623 mutex_lock(&lo->ldo_layout_mutex);
3626 /* prepare sub-objects array */
3627 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3628 entry = &comp_v1->lcm_entries[i];
3630 if (!(entry->lcme_flags & LCME_FL_INIT))
3633 lmm = (struct lov_mds_md_v1 *)
3634 ((char *)comp_v1 + entry->lcme_offset);
3635 array_count += lmm->lmm_stripe_count;
3637 OBD_ALLOC_PTR_ARRAY(sub_objs, array_count);
3638 if (sub_objs == NULL) {
3639 mutex_unlock(&lo->ldo_layout_mutex);
3644 k = 0; /* sub_objs index */
3645 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3646 struct lov_ost_data_v1 *objs;
3647 struct lu_object *o, *n;
3648 struct dt_object *dto;
3649 struct lu_device *nd;
3650 struct lov_mds_md_v3 *v3;
3654 entry = &comp_v1->lcm_entries[i];
3656 if (!(entry->lcme_flags & LCME_FL_INIT))
3659 lmm = (struct lov_mds_md_v1 *)
3660 ((char *)comp_v1 + entry->lcme_offset);
3661 v3 = (struct lov_mds_md_v3 *)lmm;
3662 if (lmm->lmm_magic == LOV_MAGIC_V3)
3663 objs = &v3->lmm_objects[0];
3665 objs = &lmm->lmm_objects[0];
3667 for (j = 0; j < lmm->lmm_stripe_count; j++) {
3668 idx = objs[j].l_ost_idx;
3669 rc = ostid_to_fid(&info->lti_fid, &objs[j].l_ost_oi,
3674 if (!fid_is_sane(&info->lti_fid)) {
3675 CERROR("%s: sub-object insane fid "DFID"\n",
3676 lod2obd(d)->obd_name,
3677 PFID(&info->lti_fid));
3678 GOTO(out, rc = -EINVAL);
3681 lod_getref(&d->lod_ost_descs);
3683 rc = validate_lod_and_idx(d, idx);
3685 lod_putref(d, &d->lod_ost_descs);
3689 nd = &OST_TGT(d, idx)->ltd_tgt->dd_lu_dev;
3690 lod_putref(d, &d->lod_ost_descs);
3692 o = lu_object_find_at(env, nd, &info->lti_fid, NULL);
3694 GOTO(out, rc = PTR_ERR(o));
3696 n = lu_object_locate(o->lo_header, nd->ld_type);
3698 lu_object_put(env, n);
3699 GOTO(out, rc = -ENOENT);
3702 dto = container_of(n, struct dt_object, do_lu);
3705 rc = lod_sub_declare_destroy(env, dto, th);
3706 dt_object_put(env, dto);
3711 * collect to-be-destroyed sub objects, the
3712 * reference would be released after actual
3718 } /* for each stripe */
3719 } /* for each component in the mirror */
3724 /* destroy the sub objects */
3725 for (; i < k; i++) {
3726 rc = lod_sub_destroy(env, sub_objs[i], th);
3729 dt_object_put(env, sub_objs[i]);
3733 * if a sub object destroy failed, we'd release sub objects
3734 * reference get from above sub_objs collection.
3737 dt_object_put(env, sub_objs[i]);
3739 OBD_FREE_PTR_ARRAY(sub_objs, array_count);
3741 mutex_unlock(&lo->ldo_layout_mutex);
3747 * Purge layouts, delete sub objects in the mirror stored in the vic_buf,
3748 * and set the LOVEA with the layout from mbuf.
/*
 * Declare the purge of one mirror whose little-endian layout is in buf.
 *
 * The magic must be LOV_MAGIC_COMP_V1; the buffer is converted to cpu byte
 * order in place; a non-zero lcm_mirror_count is reported as an error; and
 * the destruction of the stripe objects is declared through
 * lod_layout_declare_or_purge_mirror() with declare set to true.
 *
 * NOTE(review): the th parameter line, the rc declaration and the return
 * statements are not part of the visible text.
 */
3750 static int lod_declare_layout_purge(const struct lu_env *env,
3751 struct dt_object *dt, const struct lu_buf *buf,
3754 struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3755 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3760 if (le32_to_cpu(comp_v1->lcm_magic) != LOV_MAGIC_COMP_V1) {
3761 CERROR("%s: invalid layout magic %#x != %#x\n",
3762 lod2obd(d)->obd_name, le32_to_cpu(comp_v1->lcm_magic),
/* the comparison below is only true when host order is not little-endian */
3767 if (cpu_to_le32(LOV_MAGIC_COMP_V1) != LOV_MAGIC_COMP_V1)
3768 lustre_swab_lov_comp_md_v1(comp_v1);
3770 /* from now on, @buf contains cpu endian data */
3772 if (comp_v1->lcm_mirror_count != 0) {
3773 CERROR("%s: can only purge one mirror from "DFID"\n",
3774 lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3778 /* declare sub objects deletion in the mirror stored in @buf */
3779 rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, true);
3783 /* delete sub objects from the mirror stored in @buf */
/*
 * Execution-phase counterpart of the purge declaration: calls
 * lod_layout_declare_or_purge_mirror() with declare set to false.
 *
 * NOTE(review): the rc declaration and the return are not part of the
 * visible text.
 */
3784 static int lod_layout_purge(const struct lu_env *env, struct dt_object *dt,
3785 const struct lu_buf *buf, struct thandle *th)
3790 rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, false);
3795 * Implementation of dt_object_operations::do_declare_xattr_set.
3797 * \see dt_object_operations::do_declare_xattr_set() in the API description
3800 * the extension to the API:
3801 * - declaring LOVEA requests striping creation
3802 * - LU_XATTR_REPLACE means layout swap
/*
 * Top-level declare handler for xattr updates; routes the request by object
 * type, xattr name and flags.
 *
 * Visible routing, in order:
 *  - regular or not-yet-typed object, LOV name, none of the special flags:
 *    lod_declare_striped_create(), using attributes read from the child for
 *    an existing object or a synthetic S_IFREG attribute otherwise;
 *  - LU_XATTR_MERGE, LU_XATTR_SPLIT, LU_XATTR_PURGE: the matching layout
 *    merge, split or purge declare helper;
 *  - regular file with a name accepted by allowed_lustre_lov():
 *    lod_declare_modify_layout();
 *  - directory: lod_dir_declare_xattr_set();
 *  - XATTR_NAME_FID: lod_replace_parent_fid() in declare mode;
 *  - anything else: lod_sub_declare_xattr_set() on the child object.
 * For LOV-related names lod_save_layout_gen_intrans() is called at the end.
 *
 * NOTE(review): the th parameter line, declarations of mode and rc, one
 * line of the flag mask, else branches, return statements and the first
 * part of the final condition are not part of the visible text - confirm
 * against the complete file.
 */
3804 static int lod_declare_xattr_set(const struct lu_env *env,
3805 struct dt_object *dt,
3806 const struct lu_buf *buf,
3807 const char *name, int fl,
3810 struct lod_thread_info *info = lod_env_info(env);
3811 struct dt_object *next = dt_object_child(dt);
3812 struct lu_attr *attr = &info->lti_attr;
3813 struct lod_object *lo = lod_dt_obj(dt);
3818 mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
3819 if ((S_ISREG(mode) || mode == 0) &&
3820 !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE | LU_XATTR_SPLIT |
3822 (strcmp(name, XATTR_NAME_LOV) == 0 ||
3823 strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
3825 * this is a request to create object's striping.
3827 * allow to declare predefined striping on a new (!mode) object
3828 * which is supposed to be replay of regular file creation
3829 * (when LOV setting is declared)
3831 * LU_XATTR_REPLACE is set to indicate a layout swap
3833 if (dt_object_exists(dt)) {
3834 rc = dt_attr_get(env, next, attr);
3838 memset(attr, 0, sizeof(*attr));
3839 attr->la_valid = LA_TYPE | LA_MODE;
3840 attr->la_mode = S_IFREG;
3842 rc = lod_declare_striped_create(env, dt, attr, buf, th);
3843 } else if (fl & LU_XATTR_MERGE) {
3844 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3845 strcmp(name, XATTR_LUSTRE_LOV) == 0);
3846 rc = lod_declare_layout_merge(env, dt, buf, th);
3847 } else if (fl & LU_XATTR_SPLIT) {
3848 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3849 strcmp(name, XATTR_LUSTRE_LOV) == 0);
3850 rc = lod_declare_layout_split(env, dt, buf, th);
3851 } else if (fl & LU_XATTR_PURGE) {
3852 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3853 strcmp(name, XATTR_LUSTRE_LOV) == 0);
3854 rc = lod_declare_layout_purge(env, dt, buf, th);
3855 } else if (S_ISREG(mode) &&
3856 strlen(name) >= sizeof(XATTR_LUSTRE_LOV) + 3 &&
3857 allowed_lustre_lov(name)) {
3859 * this is a request to modify object's striping.
3860 * add/set/del component(s).
3862 if (!dt_object_exists(dt))
3865 rc = lod_declare_modify_layout(env, dt, name, buf, th);
3866 } else if (S_ISDIR(mode)) {
3867 rc = lod_dir_declare_xattr_set(env, dt, buf, name, fl, th);
3868 } else if (strcmp(name, XATTR_NAME_FID) == 0) {
3869 rc = lod_replace_parent_fid(env, dt, buf, th, true);
3871 rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
3875 (strcmp(name, XATTR_NAME_LOV) == 0 ||
3876 strcmp(name, XATTR_LUSTRE_LOV) == 0 || allowed_lustre_lov(name)))
3877 rc = lod_save_layout_gen_intrans(info, lo);
3883 * Apply xattr changes to the object.
3885 * Applies xattr changes to the object and the stripes if the latter exist.
3887 * \param[in] env execution environment
3888 * \param[in] dt object
3889 * \param[in] buf buffer pointing to the new value of xattr
3890 * \param[in] name name of xattr
3891 * \param[in] fl flags
3892 * \param[in] th transaction handle
3894 * \retval 0 on success
3895 * \retval negative if failed
3897 static int lod_xattr_set_internal(const struct lu_env *env,
3898 struct dt_object *dt,
3899 const struct lu_buf *buf,
3900 const char *name, int fl,
3903 struct dt_object *next = dt_object_child(dt);
3904 struct lod_object *lo = lod_dt_obj(dt);
3909 rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
3910 if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3913 /* Note: Do not set LinkEA on sub-stripes, otherwise
3914 * it will confuse the fid2path process(see mdt_path_current()).
3915 * The linkEA between master and sub-stripes is set in
3916 * lod_xattr_set_lmv(). */
3917 if (lo->ldo_dir_stripe_count == 0 || strcmp(name, XATTR_NAME_LINK) == 0)
3920 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3921 if (!lo->ldo_stripe[i])
3924 if (!dt_object_exists(lo->ldo_stripe[i]))
3927 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], buf, name,
3937 * Delete an extended attribute.
3939 * Deletes specified xattr from the object and the stripes if the latter exist.
3941 * \param[in] env execution environment
3942 * \param[in] dt object
3943 * \param[in] name name of xattr
3944 * \param[in] th transaction handle
3946 * \retval 0 on success
3947 * \retval negative if failed
3949 static int lod_xattr_del_internal(const struct lu_env *env,
3950 struct dt_object *dt,
3951 const char *name, struct thandle *th)
3953 struct dt_object *next = dt_object_child(dt);
3954 struct lod_object *lo = lod_dt_obj(dt);
3960 rc = lod_sub_xattr_del(env, next, name, th);
3961 if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3964 if (lo->ldo_dir_stripe_count == 0)
3967 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3968 if (!lo->ldo_stripe[i])
3971 if (!dt_object_exists(lo->ldo_stripe[i]))
3974 rc = lod_sub_xattr_del(env, lo->ldo_stripe[i], name, th);
3983 * Set default striping on a directory.
3985 * Sets specified striping on a directory object unless it matches the default
3986 * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
3987 * EA. This striping will be used when regular file is being created in this
3990 * \param[in] env execution environment
3991 * \param[in] dt the striped object
3992 * \param[in] buf buffer with the striping
3993 * \param[in] name name of EA
3994 * \param[in] fl xattr flag (see OSD API description)
3995 * \param[in] th transaction handle
3997 * \retval 0 on success
3998 * \retval negative if failed
/*
 * Visible flow: the buffer is interpreted according to lum->lmm_magic.
 *  - LOV_USER_MAGIC_SPECIFIC / LOV_USER_MAGIC_V3: a reserved pool name
 *    (lov_pool_is_reserved()) is zeroed in the caller's buffer, otherwise a
 *    non-empty pool name is remembered in pool_name;
 *  - LOV_USER_MAGIC_V1: the request is logged and is_del is computed with
 *    LOVEA_DELETE_VALUES() from the stripe size, count and offset;
 *  - LOV_USER_MAGIC_COMP_V1: if any entry carries LCME_FL_EXTENSION the
 *    layout magic is rewritten to LOV_MAGIC_SEL;
 *  - any other magic is reported with CERROR().
 * The xattr is then either removed with lod_xattr_del_internal() or stored
 * with lod_xattr_set_internal().
 *
 * NOTE(review): the V3 case presumably falls through into the V1 case (the
 * V1 debug message prints v3 fields); the line between them is missing from
 * this extract - confirm against the full file.
 * NOTE(review): lmm_magic is compared without an endian conversion while the
 * composite branch uses le16_to_cpu()/cpu_to_le32(); confirm the expected
 * byte order of @buf.
 * NOTE(review): the assignments of lum, v3 and of is_del in the composite
 * branch, and the statements selecting delete vs. set, are on lines missing
 * from this extract.
 */
4000 static int lod_xattr_set_lov_on_dir(const struct lu_env *env,
4001 struct dt_object *dt,
4002 const struct lu_buf *buf,
4003 const char *name, int fl,
4006 struct lov_user_md_v1 *lum;
4007 struct lov_user_md_v3 *v3 = NULL;
4008 const char *pool_name = NULL;
4013 LASSERT(buf != NULL && buf->lb_buf != NULL);
4016 switch (lum->lmm_magic) {
4017 case LOV_USER_MAGIC_SPECIFIC:
4018 case LOV_USER_MAGIC_V3:
4020 if (lov_pool_is_reserved(v3->lmm_pool_name))
4021 memset(v3->lmm_pool_name, 0, sizeof(v3->lmm_pool_name));
4022 else if (v3->lmm_pool_name[0] != '\0')
4023 pool_name = v3->lmm_pool_name;
4025 case LOV_USER_MAGIC_V1:
4026 /* if { size, offset, count } = { 0, -1, 0 } and no pool
4027 * (i.e. all default values specified) then delete default
4028 * striping from dir. */
4030 "set default striping: sz %u # %u offset %d %s %s\n",
4031 (unsigned)lum->lmm_stripe_size,
4032 (unsigned)lum->lmm_stripe_count,
4033 (int)lum->lmm_stripe_offset,
4034 v3 ? "from" : "", v3 ? v3->lmm_pool_name : "");
4036 is_del = LOVEA_DELETE_VALUES(lum->lmm_stripe_size,
4037 lum->lmm_stripe_count,
4038 lum->lmm_stripe_offset,
4041 case LOV_USER_MAGIC_COMP_V1:
4043 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lum;
4044 struct lov_comp_md_entry_v1 *lcme;
4047 comp_cnt = le16_to_cpu(lcm->lcm_entry_count);
4048 for (i = 0; i < comp_cnt; i++) {
4049 lcme = &lcm->lcm_entries[i];
4050 if (lcme->lcme_flags & cpu_to_le32(LCME_FL_EXTENSION)) {
4051 lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_SEL);
4060 CERROR("Invalid magic %x\n", lum->lmm_magic);
4065 rc = lod_xattr_del_internal(env, dt, name, th);
4069 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4075 static int lod_get_default_lov_striping(const struct lu_env *env,
4076 struct lod_object *lo,
4077 struct lod_default_striping *lds,
4078 struct dt_allocation_hint *ah);
4081 * Helper function to convert compound layout to compound layout with
4084 * Copy lcm_entries array of \a src to \a tgt. Replace lov_user_md_v1
4085 * components of \a src with lov_user_md_v3 using \a pool.
4087 * \param[in] src source layout
4088 * \param[in] pool pool to use in \a tgt
4089 * \param[out] tgt target layout
4091 static void embed_pool_to_comp_v1(const struct lov_comp_md_v1 *src,
4093 struct lov_comp_md_v1 *tgt)
4096 struct lov_user_md_v1 *lum;
4097 struct lov_user_md_v3 *lum3;
4098 struct lov_comp_md_entry_v1 *entry;
4102 entry = tgt->lcm_entries;
4104 for (i = 0; i < le16_to_cpu(src->lcm_entry_count); i++, entry++) {
4105 *entry = src->lcm_entries[i];
4106 offset = le32_to_cpu(src->lcm_entries[i].lcme_offset);
4107 entry->lcme_offset = cpu_to_le32(offset + shift);
4109 lum = (struct lov_user_md_v1 *)((char *)src + offset);
4110 lum3 = (struct lov_user_md_v3 *)((char *)tgt + offset + shift);
4111 *(struct lov_user_md_v1 *)lum3 = *lum;
4112 if (lum->lmm_pattern & cpu_to_le32(LOV_PATTERN_MDT)) {
4113 lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
4115 lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4116 entry->lcme_size = cpu_to_le32(sizeof(*lum3));
4117 strscpy(lum3->lmm_pool_name, pool,
4118 sizeof(lum3->lmm_pool_name));
4119 shift += sizeof(*lum3) - sizeof(*lum);
4125 * Set default striping on a directory.
4127 * Sets specified striping on a directory object unless it matches the default
4128 * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4129 * EA. This striping will be used when regular file is being created in this
4131 * If current default striping includes a pool but specified striping
4132 * does not - retain the pool if it exists.
4134 * \param[in] env execution environment
4135 * \param[in] dt the striped object
4136 * \param[in] buf buffer with the striping
4137 * \param[in] name name of EA
4138 * \param[in] fl xattr flag (see OSD API description)
4139 * \param[in] th transaction handle
4141 * \retval 0 on success
4142 * \retval negative if failed
/*
 * Visible flow:
 *  1. read the directory's current default striping into lds and, when
 *     lds_def_striping_set is 1, extract its pool name with
 *     lod_layout_get_pool();
 *  2. compute is_del with LOVEA_DELETE_VALUES() from the new v1 values;
 *  3. LOV_USER_MAGIC_V1 request while a pool is known: build a
 *     lov_user_md_v3 copy in info->lti_ea_store carrying that pool and store
 *     it instead of @buf (the tail of this condition is elided in this
 *     extract);
 *  4. LOV_USER_MAGIC_COMP_V1 request, a pool is known and !is_del: if the
 *     scan reaches entry_count, i.e. every component is a V1 component,
 *     rebuild the layout in lti_ea_store with embed_pool_to_comp_v1() and
 *     store that; otherwise store @buf unchanged;
 *  5. in every other case store @buf unchanged;
 *  6. release lds->lds_def_comp_entries if it was populated.
 *
 * NOTE(review): in step 3 info->lti_ea_store is written for sizeof(*v3)
 * bytes with no visible size check or lod_ea_store_resize() call, unlike
 * step 4 - presumably the store was sized by the earlier EA read; confirm.
 * NOTE(review): the lcme_offset values taken from @buf are used to index the
 * buffer without a visible bounds check against buf->lb_len.
 * NOTE(review): v1->lmm_magic is compared in host order while the v3 copy is
 * filled through cpu_to_le32(); confirm the expected byte order of @buf.
 */
4144 static int lod_xattr_set_default_lov_on_dir(const struct lu_env *env,
4145 struct dt_object *dt,
4146 const struct lu_buf *buf,
4147 const char *name, int fl,
4150 struct lod_default_striping *lds = lod_lds_buf_get(env);
4151 struct lov_user_md_v1 *v1 = buf->lb_buf;
4152 char pool[LOV_MAXPOOLNAME + 1];
4158 /* get existing striping config */
4159 rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds, NULL);
4163 memset(pool, 0, sizeof(pool));
4164 if (lds->lds_def_striping_set == 1)
4165 lod_layout_get_pool(lds->lds_def_comp_entries,
4166 lds->lds_def_comp_cnt, pool,
4169 is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
4170 v1->lmm_stripe_count,
4171 v1->lmm_stripe_offset,
4174 /* Retain the pool name if it is not given */
4175 if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
4177 struct lod_thread_info *info = lod_env_info(env);
4178 struct lov_user_md_v3 *v3 = info->lti_ea_store;
4180 memset(v3, 0, sizeof(*v3));
4181 v3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4182 v3->lmm_pattern = cpu_to_le32(v1->lmm_pattern);
4183 v3->lmm_stripe_count = cpu_to_le32(v1->lmm_stripe_count);
4184 v3->lmm_stripe_offset = cpu_to_le32(v1->lmm_stripe_offset);
4185 v3->lmm_stripe_size = cpu_to_le32(v1->lmm_stripe_size);
4187 strscpy(v3->lmm_pool_name, pool, sizeof(v3->lmm_pool_name));
4189 info->lti_buf.lb_buf = v3;
4190 info->lti_buf.lb_len = sizeof(*v3);
4191 rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4193 } else if (v1->lmm_magic == LOV_USER_MAGIC_COMP_V1 &&
4194 pool[0] != '\0' && !is_del) {
4196 * try to retain the pool from default layout if the
4197 * specified component layout does not provide pool
4200 struct lod_thread_info *info = lod_env_info(env);
4201 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
4202 struct lov_comp_md_v1 *comp_v1p;
4203 struct lov_user_md_v1 *lum;
4207 struct lov_comp_md_entry_v1 *entry;
4210 entry_count = le16_to_cpu(comp_v1->lcm_entry_count);
4211 size = sizeof(*comp_v1) +
4212 entry_count * sizeof(comp_v1->lcm_entries[0]);
4213 entry = comp_v1->lcm_entries;
4214 for (i = 0; i < entry_count; i++, entry++) {
4215 offset = le32_to_cpu(entry->lcme_offset);
4216 lum = (struct lov_user_md_v1 *)((char *)comp_v1 +
4218 if (le32_to_cpu(lum->lmm_magic) != LOV_USER_MAGIC_V1)
4219 /* the i-th component includes pool info */
4221 if (lum->lmm_pattern & cpu_to_le32(LOV_PATTERN_MDT))
4222 size += sizeof(struct lov_user_md_v1);
4224 size += sizeof(struct lov_user_md_v3);
4227 if (i == entry_count) {
4229 * re-compose the layout to include the pool for
4232 if (info->lti_ea_store_size < size)
4233 rc = lod_ea_store_resize(info, size);
4236 comp_v1p = info->lti_ea_store;
4237 *comp_v1p = *comp_v1;
4238 comp_v1p->lcm_size = cpu_to_le32(size);
4239 embed_pool_to_comp_v1(comp_v1, pool, comp_v1p);
4241 info->lti_buf.lb_buf = comp_v1p;
4242 info->lti_buf.lb_len = size;
4243 rc = lod_xattr_set_lov_on_dir(env, dt,
4248 rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl,
4252 rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl, th);
4255 if (lds->lds_def_striping_set == 1 && lds->lds_def_comp_entries != NULL)
4256 lod_free_def_comp_entries(lds);
4262 * Set default striping on a directory object.
4264 * Sets specified striping on a directory object unless it matches the default
4265 * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4266 * EA. This striping will be used when a new directory is being created in the
4269 * \param[in] env execution environment
4270 * \param[in] dt the striped object
4271 * \param[in] buf buffer with the striping
4272 * \param[in] name name of EA
4273 * \param[in] fl xattr flag (see OSD API description)
4274 * \param[in] th transaction handle
4276 * \retval 0 on success
4277 * \retval negative if failed
/*
 * Visible flow: the requested stripe count, stripe offset and hash type are
 * logged (all read through le32_to_cpu()).  The xattr is removed with
 * lod_xattr_del_internal() when LMVEA_DELETE_VALUES() holds for the stripe
 * count/offset and lum_magic equals LMV_USER_MAGIC; lod_xattr_set_internal()
 * is the other visible call.
 *
 * NOTE(review): the lines that assign lum, open the debug call and connect
 * the delete and set calls are missing from this extract (gaps in the
 * embedded numbering), so the error handling after either call is not
 * visible here.
 */
4279 static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env,
4280 struct dt_object *dt,
4281 const struct lu_buf *buf,
4282 const char *name, int fl,
4285 struct lmv_user_md_v1 *lum;
4290 LASSERT(buf != NULL && buf->lb_buf != NULL);
4294 "set default stripe_count # %u stripe_offset %d hash %u\n",
4295 le32_to_cpu(lum->lum_stripe_count),
4296 (int)le32_to_cpu(lum->lum_stripe_offset),
4297 le32_to_cpu(lum->lum_hash_type));
4299 if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)),
4300 le32_to_cpu(lum->lum_stripe_offset)) &&
4301 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) {
4302 rc = lod_xattr_del_internal(env, dt, name, th);
4306 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4315 * Turn directory into a striped directory.
4317 * During replay the client sends the striping created before MDT
4318 * failure, then the layer above LOD sends this defined striping
4319 * using ->do_xattr_set(), so LOD uses this method to replay creation
4320 * of the stripes. Notice the original information for the striping
4321 * (#stripes, FIDs, etc) was transferred in declare path.
4323 * \param[in] env execution environment
4324 * \param[in] dt the striped object
4325 * \param[in] buf buf lmv_user_md for create, or lmv_mds_md for replay
4326 * \param[in] name not used currently
4327 * \param[in] fl xattr flag (see OSD API description)
4328 * \param[in] th transaction handle
4330 * \retval 0 on success
4331 * \retval negative if failed
/*
 * Visible flow of the execute phase:
 *  - with no stripes allocated (ldo_dir_stripe_count == 0) only a foreign
 *    directory gets its LMV xattr written to the child object;
 *  - otherwise the master's attributes are read and masked down to the
 *    time/flags/mode/uid/gid/type/projid bits, the master LMV is prepared
 *    with lod_prep_lmv_md() and a slave LMV template is allocated and
 *    filled by lod_prep_slave_lmv_md();
 *  - for each stripe: unless it belongs to the source part of a migrating
 *    directory, to the already existing part of a splitting directory, or
 *    was already created on a remote MDT before a replay (lum_magic ==
 *    LMV_MAGIC_V1), it is created with lod_sub_create() under
 *    dt_write_lock(), gets a reference and a "." entry; the slave LMV xattr
 *    is written; existing stripes of a splitting directory are not
 *    re-inserted; a stripe that was not created earlier apparently gets its
 *    ".." entry and a link EA naming the master; the "<FID>:<index>" name
 *    is inserted into the master and a reference is added on the master;
 *  - finally the master LMV xattr is written and the slave template freed.
 * Several CFS_FAIL_CHECK() fault-injection points alter individual steps.
 *
 * NOTE(review): the error-handling statements after most calls, and the
 * braces that define the exact nesting, are among the lines missing from
 * this extract (gaps in the embedded numbering); verify the nesting against
 * the full file before relying on this summary.
 */
4333 static int lod_xattr_set_lmv(const struct lu_env *env, struct dt_object *dt,
4334 const struct lu_buf *buf, const char *name,
4335 int fl, struct thandle *th)
4337 struct lod_object *lo = lod_dt_obj(dt);
4338 struct lod_thread_info *info = lod_env_info(env);
4339 struct lu_attr *attr = &info->lti_attr;
4340 struct dt_object_format *dof = &info->lti_format;
4341 struct lu_buf lmv_buf;
4342 struct lu_buf slave_lmv_buf;
4343 struct lmv_user_md *lum = buf->lb_buf;
4344 struct lmv_mds_md_v1 *lmm;
4345 struct lmv_mds_md_v1 *slave_lmm = NULL;
4346 struct dt_insert_rec *rec = &info->lti_dt_rec;
4351 /* lum is used to know whether it's replay */
4353 if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
4356 /* The stripes are supposed to be allocated in declare phase,
4357 * if there are no stripes being allocated, it will skip */
4358 if (lo->ldo_dir_stripe_count == 0) {
4359 if (lo->ldo_is_foreign) {
4360 rc = lod_sub_xattr_set(env, dt_object_child(dt), buf,
4361 XATTR_NAME_LMV, fl, th);
4368 rc = dt_attr_get(env, dt_object_child(dt), attr);
4372 attr->la_valid &= LA_ATIME | LA_MTIME | LA_CTIME | LA_FLAGS |
4373 LA_MODE | LA_UID | LA_GID | LA_TYPE | LA_PROJID;
4374 dof->dof_type = DFT_DIR;
4376 rc = lod_prep_lmv_md(env, dt, &lmv_buf);
4379 lmm = lmv_buf.lb_buf;
4381 OBD_ALLOC_PTR(slave_lmm);
4382 if (slave_lmm == NULL)
4385 lod_prep_slave_lmv_md(slave_lmm, lmm);
4386 slave_lmv_buf.lb_buf = slave_lmm;
4387 slave_lmv_buf.lb_len = sizeof(*slave_lmm);
4389 rec->rec_type = S_IFDIR;
4390 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
4391 struct dt_object *dto = lo->ldo_stripe[i];
4392 char *stripe_name = info->lti_key;
4393 struct lu_name *sname;
4394 struct linkea_data ldata = { NULL };
4395 struct lu_buf linkea_buf;
4396 bool stripe_created = false;
4398 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
4402 /* fail a remote stripe creation */
4403 if (i && CFS_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_CREATE))
4406 /* if it's replay by client request, and stripe exists on remote
4407 * MDT, it means mkdir was partially executed: stripe was
4408 * created on remote MDT successfully, but target not in last
4411 if (unlikely((le32_to_cpu(lum->lum_magic) == LMV_MAGIC_V1) &&
4412 dt_object_exists(dto) && dt_object_remote(dto)))
4413 stripe_created = true;
4415 /* don't create stripe if:
4416 * 1. it's source stripe of migrating directory
4417 * 2. it's existed stripe of splitting directory
4419 if ((lod_is_migrating(lo) && i >= lo->ldo_dir_migrate_offset) ||
4420 (lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
4421 if (!dt_object_exists(dto))
4422 GOTO(out, rc = -EINVAL);
4423 } else if (!stripe_created) {
4424 dt_write_lock(env, dto, DT_TGT_CHILD);
4425 rc = lod_sub_create(env, dto, attr, NULL, dof, th);
4427 dt_write_unlock(env, dto);
4431 rc = lod_sub_ref_add(env, dto, th);
4432 dt_write_unlock(env, dto);
4436 rec->rec_fid = lu_object_fid(&dto->do_lu);
4437 rc = lod_sub_insert(env, dto,
4438 (const struct dt_rec *)rec,
4439 (const struct dt_key *)dot, th);
4444 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
4445 cfs_fail_val != i) {
4446 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
4448 slave_lmm->lmv_master_mdt_index =
4451 slave_lmm->lmv_master_mdt_index =
4454 rc = lod_sub_xattr_set(env, dto, &slave_lmv_buf,
4455 XATTR_NAME_LMV, 0, th);
4460 /* don't insert stripe if it's existed stripe of splitting
4461 * directory (this directory is striped).
4462 * NB, plain directory will insert itself as the first
4465 if (lod_is_splitting(lo) && lo->ldo_dir_split_offset > 1 &&
4466 lo->ldo_dir_split_offset > i)
4469 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
4471 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
4472 PFID(lu_object_fid(&dto->do_lu)), i + 1);
4474 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
4475 PFID(lu_object_fid(&dto->do_lu)), i);
4477 if (!stripe_created) {
4478 rec->rec_fid = lu_object_fid(&dt->do_lu);
4479 rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
4480 (const struct dt_key *)dotdot, th);
4484 sname = lod_name_get(env, stripe_name,
4485 strlen(stripe_name));
4486 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
4487 sname, lu_object_fid(&dt->do_lu));
4491 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
4492 linkea_buf.lb_len = ldata.ld_leh->leh_len;
4493 rc = lod_sub_xattr_set(env, dto, &linkea_buf,
4494 XATTR_NAME_LINK, 0, th);
4499 rec->rec_fid = lu_object_fid(&dto->do_lu);
4500 rc = lod_sub_insert(env, dt_object_child(dt),
4501 (const struct dt_rec *)rec,
4502 (const struct dt_key *)stripe_name, th);
4506 rc = lod_sub_ref_add(env, dt_object_child(dt), th);
4511 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MASTER_LMV))
4512 rc = lod_sub_xattr_set(env, dt_object_child(dt),
4513 &lmv_buf, XATTR_NAME_LMV, fl, th);
4515 if (slave_lmm != NULL)
4516 OBD_FREE_PTR(slave_lmm);
4522 * Helper function to declare/execute creation of a striped directory
4524 * Called in declare/create object path, prepare striping for a directory
4525 * and prepare defaults data striping for the objects to be created in
4526 * that directory. Notice the function calls "declaration" or "execution"
4527 * methods depending on \a declare param. This is a consequence of the
4528 * current approach while we don't have natural distributed transactions:
4529 * we basically execute non-local updates in the declare phase. So, the
4530 * arguments for the both phases are the same and this is the reason for
4531 * this function to exist.
4533 * \param[in] env execution environment
4534 * \param[in] dt object
4535 * \param[in] attr attributes the stripes will be created with
4536 * \param[in] lmu lmv_user_md if MDT indices are specified
4537 * \param[in] dof format of stripes (see OSD API description)
4538 * \param[in] th transaction handle
4539 * \param[in] declare where to call "declare" or "execute" methods
4541 * \retval 0 on success
4542 * \retval negative if failed
/*
 * Visible flow (each step has a declare variant and an execute variant; the
 * lines that choose between them are missing from this extract):
 *  1. if the object's own stripe count/offset are not the
 *     LMVEA_DELETE_VALUES() values, a default lmv_user_md_v1 may be built in
 *     info->lti_ea_store (growing the store if it is too small) and is
 *     passed to lod_declare_xattr_set_lmv() / lod_xattr_set_lmv(); the
 *     following branches handle a foreign LMV buffer (LMV_MAGIC_FOREIGN) and
 *     a foreign object (ldo_foreign_lmv);
 *  2. if the parent's default LMV is set and its max-inherit value is
 *     neither LMV_INHERIT_END nor LMV_INHERIT_NONE, a lmv_user_md_v1 with
 *     the next inherit depths (lmv_inherit_next() / lmv_inherit_rr_next())
 *     is stored as XATTR_NAME_DEFAULT_LMV;
 *  3. if the parent's default LOV is set with at least one component, the
 *     LOV EA is generated with lod_generate_lovea() and stored as
 *     XATTR_NAME_LOV;
 *  4. lo->ldo_def_striping is cleared.
 * The LASSERT() at the top requires that a non-NULL lds has at least one of
 * lds_def_striping_set / lds_dir_def_striping_set.
 *
 * NOTE(review): the error handling after each call and the remaining
 * parameters of the signature are on lines missing from this extract.
 */
4544 static int lod_dir_striping_create_internal(const struct lu_env *env,
4545 struct dt_object *dt,
4546 struct lu_attr *attr,
4547 const struct lu_buf *lmu,
4548 struct dt_object_format *dof,
4552 struct lod_thread_info *info = lod_env_info(env);
4553 struct lod_object *lo = lod_dt_obj(dt);
4554 const struct lod_default_striping *lds = lo->ldo_def_striping;
4558 LASSERT(ergo(lds != NULL,
4559 lds->lds_def_striping_set ||
4560 lds->lds_dir_def_striping_set));
4563 if (!LMVEA_DELETE_VALUES(lo->ldo_dir_stripe_count,
4564 lo->ldo_dir_stripe_offset)) {
4566 /* mkdir by default LMV */
4567 struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4568 int stripe_count = lo->ldo_dir_stripe_count;
4570 if (info->lti_ea_store_size < sizeof(*v1)) {
4571 rc = lod_ea_store_resize(info, sizeof(*v1));
4574 v1 = info->lti_ea_store;
4577 memset(v1, 0, sizeof(*v1));
4578 v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4579 v1->lum_stripe_count = cpu_to_le32(stripe_count);
4580 v1->lum_stripe_offset =
4581 cpu_to_le32(lo->ldo_dir_stripe_offset);
4583 info->lti_buf.lb_buf = v1;
4584 info->lti_buf.lb_len = sizeof(*v1);
4585 lmu = &info->lti_buf;
4589 rc = lod_declare_xattr_set_lmv(env, dt, attr, lmu, dof,
4592 rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
4596 } else if (lmu->lb_buf) {
4597 /* foreign LMV EA case */
4599 struct lmv_foreign_md *lfm = lmu->lb_buf;
4601 if (lfm->lfm_magic == LMV_MAGIC_FOREIGN)
4602 rc = lod_declare_xattr_set_lmv(env, dt, attr,
4604 } else if (lo->ldo_is_foreign) {
4605 LASSERT(lo->ldo_foreign_lmv != NULL &&
4606 lo->ldo_foreign_lmv_size > 0);
4607 info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
4608 info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
4609 lmu = &info->lti_buf;
4610 rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
4615 /* Transfer default LMV striping from the parent */
4616 if (lds != NULL && lds->lds_dir_def_striping_set &&
4617 lds->lds_dir_def_max_inherit != LMV_INHERIT_END &&
4618 lds->lds_dir_def_max_inherit != LMV_INHERIT_NONE &&
4619 !(LMVEA_DELETE_VALUES(lds->lds_dir_def_stripe_count,
4620 lds->lds_dir_def_stripe_offset) &&
4621 le32_to_cpu(lds->lds_dir_def_hash_type) !=
4622 LMV_HASH_TYPE_UNKNOWN)) {
4623 struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4625 if (info->lti_ea_store_size < sizeof(*v1)) {
4626 rc = lod_ea_store_resize(info, sizeof(*v1));
4629 v1 = info->lti_ea_store;
4632 memset(v1, 0, sizeof(*v1));
4633 v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4634 v1->lum_stripe_count =
4635 cpu_to_le32(lds->lds_dir_def_stripe_count);
4636 v1->lum_stripe_offset =
4637 cpu_to_le32(lds->lds_dir_def_stripe_offset);
4639 cpu_to_le32(lds->lds_dir_def_hash_type);
4640 v1->lum_max_inherit =
4641 lmv_inherit_next(lds->lds_dir_def_max_inherit);
4642 v1->lum_max_inherit_rr =
4643 lmv_inherit_rr_next(lds->lds_dir_def_max_inherit_rr);
4645 info->lti_buf.lb_buf = v1;
4646 info->lti_buf.lb_len = sizeof(*v1);
4648 rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4649 XATTR_NAME_DEFAULT_LMV,
4652 rc = lod_xattr_set_default_lmv_on_dir(env, dt,
4654 XATTR_NAME_DEFAULT_LMV, 0,
4660 /* Transfer default LOV striping from the parent */
4661 if (lds != NULL && lds->lds_def_striping_set &&
4662 lds->lds_def_comp_cnt != 0) {
4663 struct lov_mds_md *lmm;
4664 int lmm_size = lod_comp_md_size(lo, true);
4666 if (info->lti_ea_store_size < lmm_size) {
4667 rc = lod_ea_store_resize(info, lmm_size);
4671 lmm = info->lti_ea_store;
4673 rc = lod_generate_lovea(env, lo, lmm, &lmm_size, true);
4677 info->lti_buf.lb_buf = lmm;
4678 info->lti_buf.lb_len = lmm_size;
4681 rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4682 XATTR_NAME_LOV, 0, th);
4684 rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4685 XATTR_NAME_LOV, 0, th);
4690 /* ldo_def_striping is not allocated, clear after use, in case directory
4691 * layout is changed later.
4694 lo->ldo_def_striping = NULL;
/*
 * Declare-phase entry point for striped directory creation: forwards its
 * arguments to lod_dir_striping_create_internal().
 * NOTE(review): the lmu parameter line, the transaction handle parameter and
 * the final argument of the call are on lines missing from this extract;
 * presumably the final argument selects the "declare" variant - confirm
 * against the full file.
 */
4699 static int lod_declare_dir_striping_create(const struct lu_env *env,
4700 struct dt_object *dt,
4701 struct lu_attr *attr,
4703 struct dt_object_format *dof,
4706 return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
4710 static int lod_dir_striping_create(const struct lu_env *env,
4711 struct dt_object *dt,
4712 struct lu_attr *attr,
4713 const struct lu_buf *lmu,
4714 struct dt_object_format *dof,
4717 return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
4722 * Make LOV EA for striped object.
4724 * Generate striping information and store it in the LOV EA of the given
4725 * object. The caller must ensure nobody else is calling the function
4726 * against the object concurrently. The transaction must be started.
4727 * FLDB service must be running as well; it's used to map FID to the target,
4728 * which is stored in LOV EA.
4730 * \param[in] env execution environment for this thread
4731 * \param[in] lo LOD object
4732 * \param[in] th transaction handle
4734 * \retval 0 if LOV EA is stored successfully
4735 * \retval negative error number on failure
4737 static int lod_generate_and_set_lovea(const struct lu_env *env,
4738 struct lod_object *lo,
4741 struct lod_thread_info *info = lod_env_info(env);
4742 struct dt_object *next = dt_object_child(&lo->ldo_obj);
4743 struct lov_mds_md_v1 *lmm;
4749 if (lo->ldo_comp_cnt == 0 && !lo->ldo_is_foreign) {
4750 lod_striping_free_nolock(env, lo);
4751 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
4755 lmm_size = lod_comp_md_size(lo, false);
4756 if (info->lti_ea_store_size < lmm_size) {
4757 rc = lod_ea_store_resize(info, lmm_size);
4761 lmm = info->lti_ea_store;
4763 rc = lod_generate_lovea(env, lo, lmm, &lmm_size, false);
4767 info->lti_buf.lb_buf = lmm;
4768 info->lti_buf.lb_len = lmm_size;
4769 rc = lod_sub_xattr_set(env, next, &info->lti_buf,
4770 XATTR_NAME_LOV, 0, th);
4774 static __u32 lod_gen_component_id(struct lod_object *lo,
4775 int mirror_id, int comp_idx);
4778 * Repeat an existing component
4780 * Creates a new layout by replicating an existing component. Uses striping
4781 * policy from previous component as a template for the striping for the new
4784 * New component starts with zero length, will be extended (or removed) before
4785 * returning layout to client.
4787 * NB: Reallocates layout components array (lo->ldo_comp_entries), invalidating
4788 * any pre-existing pointers to components. Handle with care.
4790 * \param[in] env execution environment for this thread
4791 * \param[in,out] lo object to update the layout of
4792 * \param[in] index index of component to copy
4794 * \retval 0 on success
4795 * \retval negative errno on error
4797 static int lod_layout_repeat_comp(const struct lu_env *env,
4798 struct lod_object *lo, int index)
4800 struct lod_layout_component *lod_comp;
4801 struct lod_layout_component *new_comp = NULL;
4802 struct lod_layout_component *comp_array;
4803 int rc = 0, i, new_cnt = lo->ldo_comp_cnt + 1;
4808 lod_comp = &lo->ldo_comp_entries[index];
4809 LASSERT(lod_comp_inited(lod_comp) && lod_comp->llc_id != LCME_ID_INVAL);
4811 CDEBUG(D_LAYOUT, "repeating component %d\n", index);
4813 OBD_ALLOC_PTR_ARRAY(comp_array, new_cnt);
4814 if (comp_array == NULL)
4815 GOTO(out, rc = -ENOMEM);
4817 for (i = 0; i < lo->ldo_comp_cnt; i++) {
4818 memcpy(&comp_array[i + offset], &lo->ldo_comp_entries[i],
4819 sizeof(*comp_array));
4821 /* Duplicate this component in to the next slot */
4823 new_comp = &comp_array[i + 1];
4824 memcpy(&comp_array[i + 1], &lo->ldo_comp_entries[i],
4825 sizeof(*comp_array));
4826 /* We must now skip this new component when copying */
4831 /* Set up copied component */
4832 new_comp->llc_flags &= ~LCME_FL_INIT;
4833 new_comp->llc_stripe = NULL;
4834 new_comp->llc_stripes_allocated = 0;
4835 new_comp->llc_ost_indices = NULL;
4836 new_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
4837 /* for uninstantiated components, layout gen stores default stripe
4839 new_comp->llc_layout_gen = lod_comp->llc_stripe_offset;
4840 /* This makes the repeated component zero-length, placed at the end of
4841 * the preceding component */
4842 new_comp->llc_extent.e_start = new_comp->llc_extent.e_end;
4843 new_comp->llc_timestamp = lod_comp->llc_timestamp;
4844 new_comp->llc_pool = NULL;
4846 rc = lod_set_pool(&new_comp->llc_pool, lod_comp->llc_pool);
4850 if (new_comp->llc_ostlist.op_array) {
4851 __u32 *op_array = NULL;
4853 OBD_ALLOC(op_array, new_comp->llc_ostlist.op_size);
4855 GOTO(out, rc = -ENOMEM);
4856 memcpy(op_array, &new_comp->llc_ostlist.op_array,
4857 new_comp->llc_ostlist.op_size);
4858 new_comp->llc_ostlist.op_array = op_array;
4861 OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
4862 lo->ldo_comp_entries = comp_array;
4863 lo->ldo_comp_cnt = new_cnt;
4865 /* Generate an id for the new component */
4866 mirror_id = mirror_id_of(new_comp->llc_id);
4867 new_comp->llc_id = LCME_ID_INVAL;
4868 new_comp->llc_id = lod_gen_component_id(lo, mirror_id, index + 1);
4869 if (new_comp->llc_id == LCME_ID_INVAL)
4870 GOTO(out, rc = -ERANGE);
4875 OBD_FREE_PTR_ARRAY(comp_array, new_cnt);
4880 static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
4884 /* clear memory region that will be used for layout change */
4885 memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
4886 info->lti_count = 0;
4888 if (info->lti_comp_size >= comp_cnt)
4891 if (info->lti_comp_size > 0) {
4892 OBD_FREE_PTR_ARRAY(info->lti_comp_idx, info->lti_comp_size);
4893 info->lti_comp_size = 0;
4896 OBD_ALLOC_PTR_ARRAY(info->lti_comp_idx, comp_cnt);
4897 if (!info->lti_comp_idx)
4900 info->lti_comp_size = comp_cnt;
4905 * Prepare new layout minus deleted components
4907 * Removes components marked for deletion (LCME_ID_INVAL) by copying to a new
4908 * layout and skipping those components. Removes stripe objects if any exist.
4911 * Reallocates layout components array (lo->ldo_comp_entries), invalidating
4912 * any pre-existing pointers to components.
4914 * Caller is responsible for updating mirror end (ldo_mirror[].lme_end).
4916 * \param[in] env execution environment for this thread
4917 * \param[in,out] lo object to update the layout of
4918 * \param[in] th transaction handle for this operation
4920 * \retval # of components deleted
4921 * \retval negative errno on error
/*
 * Visible flow:
 *  - lod_layout_data_init() sizes info->lti_comp_idx for ldo_comp_cnt
 *    entries and resets info->lti_count;
 *  - components whose llc_id is not LCME_ID_INVAL are recorded in
 *    lti_comp_idx as the ones to keep;
 *  - for the others the pool is cleared with lod_obj_set_pool(lo, i, NULL)
 *    and the OST list is freed; if the component has a stripe array, stripes
 *    of initialized components are destroyed with lod_sub_destroy(), each
 *    stripe reference is dropped with lu_object_put(), and the llc_stripe
 *    and llc_ost_indices arrays are freed;
 *  - when something is kept, the kept components are copied into a freshly
 *    allocated array that replaces lo->ldo_comp_entries;
 *    lod_free_comp_entries() is the other visible outcome (presumably when
 *    nothing is kept).
 * The LASSERTF() requires lti_count < ldo_comp_cnt, i.e. at least one
 * component must not have been kept.
 * Returns rc if it is non-zero, otherwise the number of deleted components.
 *
 * NOTE(review): what happens to a LOV_MAGIC_FOREIGN component marked for
 * deletion, and the error handling after lod_layout_data_init() and
 * lod_sub_destroy(), are on lines missing from this extract.
 */
4923 static int lod_layout_del_prep_layout(const struct lu_env *env,
4924 struct lod_object *lo,
4927 struct lod_layout_component *lod_comp;
4928 struct lod_thread_info *info = lod_env_info(env);
4929 int rc = 0, i, j, deleted = 0;
4933 LASSERT(lo->ldo_is_composite);
4934 LASSERT(lo->ldo_comp_cnt > 0 && lo->ldo_comp_entries != NULL);
4936 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
4940 for (i = 0; i < lo->ldo_comp_cnt; i++) {
4941 lod_comp = &lo->ldo_comp_entries[i];
4943 if (lod_comp->llc_id != LCME_ID_INVAL) {
4944 /* Build array of things to keep */
4945 info->lti_comp_idx[info->lti_count++] = i;
4949 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN)
4952 lod_obj_set_pool(lo, i, NULL);
4953 if (lod_comp->llc_ostlist.op_array) {
4954 OBD_FREE(lod_comp->llc_ostlist.op_array,
4955 lod_comp->llc_ostlist.op_size);
4956 lod_comp->llc_ostlist.op_array = NULL;
4957 lod_comp->llc_ostlist.op_size = 0;
4961 CDEBUG(D_LAYOUT, "deleting comp %d, left %d\n", i,
4962 lo->ldo_comp_cnt - deleted);
4964 /* No striping info for this component */
4965 if (lod_comp->llc_stripe == NULL)
4968 LASSERT(lod_comp->llc_stripe_count > 0);
4969 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
4970 struct dt_object *obj = lod_comp->llc_stripe[j];
4975 /* components which are not init have no sub objects
4977 if (lod_comp_inited(lod_comp)) {
4978 rc = lod_sub_destroy(env, obj, th);
4983 lu_object_put(env, &obj->do_lu);
4984 lod_comp->llc_stripe[j] = NULL;
4986 OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
4987 lod_comp->llc_stripes_allocated);
4988 lod_comp->llc_stripe = NULL;
4989 OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
4990 lod_comp->llc_stripes_allocated);
4991 lod_comp->llc_ost_indices = NULL;
4992 lod_comp->llc_stripes_allocated = 0;
4995 /* info->lti_count has the amount of left components */
4996 LASSERTF(info->lti_count >= 0 && info->lti_count < lo->ldo_comp_cnt,
4997 "left = %d, lo->ldo_comp_cnt %d\n", (int)info->lti_count,
4998 (int)lo->ldo_comp_cnt);
5000 if (info->lti_count > 0) {
5001 struct lod_layout_component *comp_array;
5003 OBD_ALLOC_PTR_ARRAY(comp_array, info->lti_count);
5004 if (comp_array == NULL)
5005 GOTO(out, rc = -ENOMEM);
5007 for (i = 0; i < info->lti_count; i++) {
5008 memcpy(&comp_array[i],
5009 &lo->ldo_comp_entries[info->lti_comp_idx[i]],
5010 sizeof(*comp_array));
5013 OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
5014 lo->ldo_comp_entries = comp_array;
5015 lo->ldo_comp_cnt = info->lti_count;
5017 lod_free_comp_entries(lo);
5022 return rc ? rc : deleted;
/*
 * Reviewer notes for lod_layout_del().  This listing elides some source
 * lines (braces, error checks, returns), so only the visible statements
 * are described:
 *  - asserts that the object has exactly one mirror, then does all of the
 *    visible work with lo->ldo_layout_mutex held;
 *  - lod_layout_del_prep_layout() is called first and its result is kept
 *    in rc;
 *  - when components remain (ldo_comp_cnt > 0), mirror 0 is made to end at
 *    the last component and the layout generation is incremented;
 *  - the attributes of the child object are read and, when la_size > 0,
 *    lod_sub_attr_set() is issued with la_valid = LA_SIZE (the statement
 *    between the size test and the la_valid assignment is elided here -
 *    TODO confirm what it does);
 *  - lod_generate_and_set_lovea() then writes out the layout;
 *  - lod_striping_free_nolock() appears right before the unlock;
 *    presumably it is reached only on failure - verify against the full
 *    source.
 */
5026 * Delete layout component(s)
5028 * This function sets up the layout data in the env and does the setattrs
5029 * required to write out the new layout. The layout itself is modified in
5030 * lod_layout_del_prep_layout.
5032 * \param[in] env execution environment for this thread
5033 * \param[in] dt object
5034 * \param[in] th transaction handle
5036 * \retval 0 on success
5037 * \retval negative error number on failure
5039 static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
5042 struct lod_object *lo = lod_dt_obj(dt);
5043 struct dt_object *next = dt_object_child(dt);
5044 struct lu_attr *attr = &lod_env_info(env)->lti_attr;
5047 LASSERT(lo->ldo_mirror_count == 1);
5049 mutex_lock(&lo->ldo_layout_mutex);
5051 rc = lod_layout_del_prep_layout(env, lo, th);
5055 /* Only do this if we didn't delete all components */
5056 if (lo->ldo_comp_cnt > 0) {
5057 lo->ldo_mirrors[0].lme_end = lo->ldo_comp_cnt - 1;
5058 lod_obj_inc_layout_gen(lo);
5061 LASSERT(dt_object_exists(dt));
5062 rc = dt_attr_get(env, next, attr);
5066 if (attr->la_size > 0) {
5068 attr->la_valid = LA_SIZE;
5069 rc = lod_sub_attr_set(env, next, attr, th);
5074 rc = lod_generate_and_set_lovea(env, lo, th);
5078 lod_striping_free_nolock(env, lo);
5080 mutex_unlock(&lo->ldo_layout_mutex);
/*
 * Reviewer notes for lod_xattr_set().  This listing elides some source
 * lines (the switch header, break/return statements, error checks), so
 * the summary below covers only the visible dispatch:
 *
 *  directory + XATTR_NAME_LMV:
 *      LU_XATTR_CREATE  -> lod_dir_striping_create()
 *      LU_XATTR_REPLACE -> lod_dir_layout_set()
 *  directory + XATTR_NAME_LOV         -> lod_xattr_set_default_lov_on_dir()
 *  directory + XATTR_NAME_DEFAULT_LMV -> lod_xattr_set_default_lmv_on_dir()
 *  regular file + XATTR_NAME_LOV, XATTR_LUSTRE_LOV or a name accepted by
 *  allowed_lustre_lov():
 *      lod_check_layout_gen_intrans() runs first; the debug message that
 *      follows indexes info->lti_gen[rc - 1], so a positive rc is used as
 *      a 1-based index there.  Then, in the visible order of tests:
 *      LU_XATTR_REPLACE -> lod_striping_free(), then lod_sub_xattr_set()
 *      LU_XATTR_SPLIT   -> lod_sub_xattr_set(), lod_striping_reload() with
 *                          LVF_ALL_STALE and, for a mirrored object with
 *                          LA_LAYOUT_VERSION valid, an attr_set callback
 *                          run over the stripes
 *      LU_XATTR_PURGE   -> lod_layout_purge()
 *      remote object    -> lod_sub_xattr_set() only
 *      name XATTR_LUSTRE_LOV".del" -> lod_layout_del(); asserts that the
 *                          components are cached
 *      otherwise        -> lod_striped_create(); with LU_XATTR_MERGE on a
 *                          mirrored object the layout version is pushed to
 *                          the stripes the same way as for SPLIT
 *  XATTR_NAME_FID -> lod_replace_parent_fid()
 *  anything else  -> lod_xattr_set_internal()
 *
 * NOTE(review): the value assigned to layout_attr->la_layout_version in
 * the SPLIT and MERGE branches is on an elided line - TODO confirm.
 */
5087 * Implementation of dt_object_operations::do_xattr_set.
5089 * Sets specified extended attribute on the object. Three types of EAs are
5091 * LOV EA - stores striping for a regular file or default striping (when set
5093 * LMV EA - stores a marker for the striped directories
5094 * DMV EA - stores default directory striping
5096 * When striping is applied to a non-striped existing object (this is called
5097 * late striping), then LOD notices the caller wants to turn the object into a
5098 * striped one. The stripe objects are created and appropriate EA is set:
5099 * LOV EA storing all the stripes directly or LMV EA storing just a small header
5100 * with striping configuration.
5102 * \see dt_object_operations::do_xattr_set() in the API description for details.
5104 static int lod_xattr_set(const struct lu_env *env,
5105 struct dt_object *dt, const struct lu_buf *buf,
5106 const char *name, int fl, struct thandle *th)
5108 struct lod_thread_info *info = lod_env_info(env);
5109 struct dt_object *next = dt_object_child(dt);
5110 struct lu_attr *layout_attr = &info->lti_layout_attr;
5111 struct lod_object *lo = lod_dt_obj(dt);
5112 struct lod_obj_stripe_cb_data data = { {0} };
5117 if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5118 !strcmp(name, XATTR_NAME_LMV)) {
5120 case LU_XATTR_CREATE:
5121 rc = lod_dir_striping_create(env, dt, NULL, buf, NULL,
5125 case LU_XATTR_REPLACE:
5126 rc = lod_dir_layout_set(env, dt, buf, fl, th);
5133 } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5134 strcmp(name, XATTR_NAME_LOV) == 0) {
5135 rc = lod_xattr_set_default_lov_on_dir(env, dt, buf, name, fl,
5138 } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5139 strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
5141 rc = lod_xattr_set_default_lmv_on_dir(env, dt, buf, name, fl,
5144 } else if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
5145 (strcmp(name, XATTR_NAME_LOV) == 0 ||
5146 strcmp(name, XATTR_LUSTRE_LOV) == 0 ||
5147 allowed_lustre_lov(name))) {
5148 /* layout has been changed by others in the transaction */
5149 rc = lod_check_layout_gen_intrans(info, lo);
5152 "%s: obj "DFID" gen changed from %d to %d in transaction, retry the transaction\n",
5153 dt->do_lu.lo_dev->ld_obd->obd_name,
5154 PFID(lu_object_fid(&dt->do_lu)),
5155 info->lti_gen[rc - 1], lo->ldo_layout_gen);
5159 /* in case of lov EA swap, just set it
5160 * if not, it is a replay so check striping match what we
5161 * already have during req replay, declare_xattr_set()
5162 * defines striping, then create() does the work */
5163 if (fl & LU_XATTR_REPLACE) {
5164 /* free stripes, then update disk */
5165 lod_striping_free(env, lod_dt_obj(dt));
5167 rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
5168 } else if (fl & LU_XATTR_SPLIT) {
5169 rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
5173 rc = lod_striping_reload(env, lo, buf, LVF_ALL_STALE);
5177 if (lo->ldo_mirror_count > 1 &&
5178 layout_attr->la_valid & LA_LAYOUT_VERSION) {
5180 layout_attr->la_layout_version =
5182 data.locd_attr = layout_attr;
5183 data.locd_declare = false;
5184 data.locd_stripe_cb =
5185 lod_obj_stripe_attr_set_cb;
5186 rc = lod_obj_for_each_stripe(env, lo, th,
5191 } else if (fl & LU_XATTR_PURGE) {
5192 rc = lod_layout_purge(env, dt, buf, th);
5193 } else if (dt_object_remote(dt)) {
5194 /* This only happens during migration, see
5195 * mdd_migrate_create(), in which Master MDT will
5196 * create a remote target object, and only set
5197 * (migrating) stripe EA on the remote object,
5198 * and does not need creating each stripes. */
5199 rc = lod_sub_xattr_set(env, next, buf, name,
5201 } else if (strcmp(name, XATTR_LUSTRE_LOV".del") == 0) {
5202 /* delete component(s) */
5203 LASSERT(lod_dt_obj(dt)->ldo_comp_cached);
5204 rc = lod_layout_del(env, dt, th);
5207 * When 'name' is XATTR_LUSTRE_LOV or XATTR_NAME_LOV,
5208 * it's going to create file with specified
5209 * component(s), the striping must have not being
5210 * cached in this case;
5212 * Otherwise, it's going to add/change component(s) to
5213 * an existing file, the striping must have been cached
5216 if (!(fl & LU_XATTR_MERGE))
5217 LASSERT(equi(!strcmp(name, XATTR_LUSTRE_LOV) ||
5218 !strcmp(name, XATTR_NAME_LOV),
5219 !lod_dt_obj(dt)->ldo_comp_cached));
5221 rc = lod_striped_create(env, dt, NULL, NULL, th);
5225 if (fl & LU_XATTR_MERGE && lo->ldo_mirror_count > 1 &&
5226 layout_attr->la_valid & LA_LAYOUT_VERSION) {
5227 /* mirror merge exec phase */
5228 layout_attr->la_layout_version =
5230 data.locd_attr = layout_attr;
5231 data.locd_declare = false;
5232 data.locd_stripe_cb =
5233 lod_obj_stripe_attr_set_cb;
5234 rc = lod_obj_for_each_stripe(env, lo, th,
5241 } else if (strcmp(name, XATTR_NAME_FID) == 0) {
5242 rc = lod_replace_parent_fid(env, dt, buf, th, false);
5247 /* then all other xattr */
5248 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
/*
 * Reviewer notes for lod_declare_xattr_del().  This listing elides some
 * source lines (returns, error checks, part of the loop body), so only the
 * visible steps are described:
 *  - the deletion is always declared on the child object first;
 *  - the remaining steps are guarded by tests for "not a directory", for
 *    the name XATTR_NAME_LMV and for a zero directory stripe count; the
 *    statement taken by each test is elided - presumably an early return,
 *    TODO confirm;
 *  - the striping is loaded with lod_striping_load() and the deletion is
 *    then declared on each entry of lo->ldo_stripe[]; entries that fail
 *    dt_object_exists() are tested for just before that call (the action
 *    taken is elided - presumably they are skipped).
 */
5254 * Implementation of dt_object_operations::do_declare_xattr_del.
5256 * \see dt_object_operations::do_declare_xattr_del() in the API description
5259 static int lod_declare_xattr_del(const struct lu_env *env,
5260 struct dt_object *dt, const char *name,
5263 struct lod_object *lo = lod_dt_obj(dt);
5264 struct dt_object *next = dt_object_child(dt);
5269 rc = lod_sub_declare_xattr_del(env, next, name, th);
5273 if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
5276 /* NB: don't delete stripe LMV, because when we do this, normally we
5277 * will remove stripes, besides, if directory LMV is corrupt, this will
5278 * prevent deleting its LMV and fixing it (via LFSCK).
5280 if (!strcmp(name, XATTR_NAME_LMV))
5283 rc = lod_striping_load(env, lo);
5287 if (lo->ldo_dir_stripe_count == 0)
5290 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5291 struct dt_object *dto = lo->ldo_stripe[i];
5296 if (!dt_object_exists(dto))
5299 rc = lod_sub_declare_xattr_del(env, dto, name, th);
/*
 * Reviewer notes for lod_xattr_del().  This listing elides the local
 * declarations and the return statement.  Visible behaviour: when the name
 * is XATTR_NAME_LOV or XATTR_NAME_LMV the in-memory striping of the object
 * is released with lod_striping_free() before the attribute is removed;
 * the removal itself is delegated to lod_xattr_del_internal() and its
 * result is stored in rc.
 */
5308 * Implementation of dt_object_operations::do_xattr_del.
5310 * If EA storing a regular striping is being deleted, then release
5311 * all the references to the stripe objects in core.
5313 * \see dt_object_operations::do_xattr_del() in the API description for details.
5315 static int lod_xattr_del(const struct lu_env *env, struct dt_object *dt,
5316 const char *name, struct thandle *th)
5322 if (!strcmp(name, XATTR_NAME_LOV) || !strcmp(name, XATTR_NAME_LMV))
5323 lod_striping_free(env, lod_dt_obj(dt));
5325 rc = lod_xattr_del_internal(env, dt, name, th);
/**
 * Implementation of dt_object_operations::do_xattr_list.
 *
 * The request is forwarded unchanged to the child object obtained with
 * dt_object_child(); the result of dt_xattr_list() on that object is
 * returned as is.
 *
 * \see dt_object_operations::do_xattr_list() in the API description
 * for details.
 */
static int lod_xattr_list(const struct lu_env *env,
			  struct dt_object *dt, const struct lu_buf *buf)
{
	struct dt_object *next = dt_object_child(dt);

	return dt_xattr_list(env, next, buf);
}
5342 static inline int lod_object_will_be_striped(int is_reg, const struct lu_fid *fid)
5344 return (is_reg && fid_seq(fid) != FID_SEQ_LOCAL_FILE);
/*
 * Reviewer notes for lod_comp_copy_ost_lists().  This listing elides some
 * source lines (declarations, a jump between the reuse path and the copy
 * loop, the returns), so only the visible behaviour is described:
 *  - the input is modified: when v3->lmm_stripe_offset is
 *    LOV_OFFSET_DEFAULT it is overwritten with the OST index of the first
 *    object in v3->lmm_objects[];
 *  - an existing llc_ostlist.op_array is kept when its op_size can hold
 *    lmm_stripe_count entries of type __u32 (only op_count is updated on
 *    that path); otherwise it is freed and a new array of exactly that
 *    size is allocated;
 *  - the loop copies l_ost_idx of every object into op_array.
 *
 * NOTE(review): lmm_objects[0] is read without a visible check that
 * lmm_stripe_count is non-zero - confirm that callers validate this.
 */
5348 * Copy OST list from layout provided by user.
5350 * \param[in] lod_comp layout_component to be filled
5351 * \param[in] v3 LOV EA V3 user data
5353 * \retval 0 on success
5354 * \retval negative if failed
5356 int lod_comp_copy_ost_lists(struct lod_layout_component *lod_comp,
5357 struct lov_user_md_v3 *v3)
5363 if (v3->lmm_stripe_offset == LOV_OFFSET_DEFAULT)
5364 v3->lmm_stripe_offset = v3->lmm_objects[0].l_ost_idx;
5366 if (lod_comp->llc_ostlist.op_array) {
5367 if (lod_comp->llc_ostlist.op_size >=
5368 v3->lmm_stripe_count * sizeof(__u32)) {
5369 lod_comp->llc_ostlist.op_count =
5370 v3->lmm_stripe_count;
5373 OBD_FREE(lod_comp->llc_ostlist.op_array,
5374 lod_comp->llc_ostlist.op_size);
5377 /* copy ost list from lmm */
5378 lod_comp->llc_ostlist.op_count = v3->lmm_stripe_count;
5379 lod_comp->llc_ostlist.op_size = v3->lmm_stripe_count * sizeof(__u32);
5380 OBD_ALLOC(lod_comp->llc_ostlist.op_array,
5381 lod_comp->llc_ostlist.op_size);
5382 if (!lod_comp->llc_ostlist.op_array)
5385 for (j = 0; j < v3->lmm_stripe_count; j++) {
5386 lod_comp->llc_ostlist.op_array[j] =
5387 v3->lmm_objects[j].l_ost_idx;
/*
 * Reviewer notes for lod_get_default_lov_striping().  This listing elides
 * some source lines (the switch header, several case labels, error
 * returns), so only the visible steps are described:
 *  - lds->lds_def_striping_set is cleared up front and set to 1 only at
 *    the very end, so any earlier exit leaves it cleared;
 *  - the LOV EA is fetched with lod_get_lov_ea(); its return value is
 *    compared against sizeof(struct lov_user_md) as a byte count;
 *  - the buffer in info->lti_ea_store is byte-swapped in place when the
 *    magic matches the swabbed form of a known user magic;
 *  - for a composite layout the entry count comes from lcm_entry_count and
 *    the mirror count is lcm_mirror_count + 1;
 *  - when an append stripe count or append pool is supplied through
 *    \a dah, want_composite is forced to false (the two statements just
 *    before that assignment are elided - TODO confirm what else is reset);
 *  - each default component is reset with memset() only up to the offset
 *    of llc_pool, so llc_pool and the members after it keep their old
 *    values until they are overwritten further down;
 *  - stripe count, size, offset and pattern are copied from the user md;
 *    the append parameters then override pattern (LOV_PATTERN_RAID0) and
 *    stripe count;
 *  - the pool comes from the V3 user md unless an append pool is given;
 *  - an explicit OST list is copied only for LOV_USER_MAGIC_SPECIFIC
 *    without append parameters; otherwise an existing non-empty list is
 *    filled with -1 and its op_count reset to 0.
 *
 * \param[in] dah	allocation hint; may be NULL, in which case no append
 *			stripe count or append pool is applied
 */
5395 * Get default striping.
5397 * \param[in] env execution environment
5398 * \param[in] lo object
5399 * \param[out] lds default striping
5401 * \retval 0 on success
5402 * \retval negative if failed
5404 static int lod_get_default_lov_striping(const struct lu_env *env,
5405 struct lod_object *lo,
5406 struct lod_default_striping *lds,
5407 struct dt_allocation_hint *dah)
5409 struct lod_thread_info *info = lod_env_info(env);
5410 struct lov_user_md_v1 *v1 = NULL;
5411 struct lov_user_md_v3 *v3 = NULL;
5412 struct lov_comp_md_v1 *lcm = NULL;
5414 int append_stripe_count = dah != NULL ? dah->dah_append_stripe_count : 0;
5415 const char *append_pool = (dah != NULL &&
5416 dah->dah_append_pool != NULL &&
5417 dah->dah_append_pool[0] != '\0') ?
5418 dah->dah_append_pool : NULL;
5419 __u16 entry_count = 1;
5420 __u16 mirror_count = 0;
5421 bool want_composite = false;
5426 lds->lds_def_striping_set = 0;
5428 rc = lod_get_lov_ea(env, lo);
5432 if (rc < (typeof(rc))sizeof(struct lov_user_md))
5435 magic = *(__u32 *)info->lti_ea_store;
5436 if (magic == __swab32(LOV_USER_MAGIC_V1)) {
5437 lustre_swab_lov_user_md_v1(info->lti_ea_store);
5438 } else if (magic == __swab32(LOV_USER_MAGIC_V3)) {
5439 lustre_swab_lov_user_md_v3(info->lti_ea_store);
5440 } else if (magic == __swab32(LOV_USER_MAGIC_SPECIFIC)) {
5441 v3 = (struct lov_user_md_v3 *)info->lti_ea_store;
5442 lustre_swab_lov_user_md_v3(v3);
5443 lustre_swab_lov_user_md_objects(v3->lmm_objects,
5444 v3->lmm_stripe_count);
5445 } else if (magic == __swab32(LOV_USER_MAGIC_COMP_V1) ||
5446 magic == __swab32(LOV_USER_MAGIC_SEL)) {
5447 lustre_swab_lov_comp_md_v1(info->lti_ea_store);
5453 case LOV_USER_MAGIC_SPECIFIC:
5454 v1 = info->lti_ea_store;
5456 case LOV_MAGIC_COMP_V1:
5458 lcm = info->lti_ea_store;
5459 entry_count = lcm->lcm_entry_count;
5460 if (entry_count == 0)
5463 mirror_count = lcm->lcm_mirror_count + 1;
5464 want_composite = true;
5470 if (append_stripe_count != 0 || append_pool != NULL) {
5473 want_composite = false;
5476 /* realloc default comp entries if necessary */
5477 rc = lod_def_striping_comp_resize(lds, entry_count);
5481 lds->lds_def_comp_cnt = entry_count;
5482 lds->lds_def_striping_is_composite = want_composite;
5483 lds->lds_def_mirror_cnt = mirror_count;
5485 for (i = 0; i < entry_count; i++) {
5486 struct lod_layout_component *llc = &lds->lds_def_comp_entries[i];
5490 * reset llc values, llc_stripes is always NULL in the
5491 * default striping template, llc_pool will be reset
5492 * later below using lod_set_pool().
5494 * XXX At this point llc_pool may point to valid (!)
5495 * kmalloced strings from previous RPCs.
5497 memset(llc, 0, offsetof(typeof(*llc), llc_pool));
5500 v1 = (struct lov_user_md *)((char *)lcm +
5501 lcm->lcm_entries[i].lcme_offset);
5503 if (want_composite) {
5504 llc->llc_extent = lcm->lcm_entries[i].lcme_extent;
5505 /* We only inherit certain flags from the layout */
5506 llc->llc_flags = lcm->lcm_entries[i].lcme_flags &
5507 LCME_TEMPLATE_FLAGS;
5511 CDEBUG(D_LAYOUT, DFID" magic = %#08x, pattern = %#x, stripe_count = %hu, stripe_size = %u, stripe_offset = %hu, append_pool = '%s', append_stripe_count = %d\n",
5512 PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5515 v1->lmm_stripe_count,
5516 v1->lmm_stripe_size,
5517 v1->lmm_stripe_offset,
5519 append_stripe_count);
5521 if (!lov_pattern_supported(v1->lmm_pattern) &&
5522 !(v1->lmm_pattern & LOV_PATTERN_F_RELEASED)) {
5523 lod_free_def_comp_entries(lds);
5527 llc->llc_stripe_count = v1->lmm_stripe_count;
5528 llc->llc_stripe_size = v1->lmm_stripe_size;
5529 llc->llc_stripe_offset = v1->lmm_stripe_offset;
5530 llc->llc_pattern = v1->lmm_pattern;
5532 if (append_stripe_count != 0 || append_pool != NULL)
5533 llc->llc_pattern = LOV_PATTERN_RAID0;
5535 if (append_stripe_count != 0)
5536 llc->llc_stripe_count = append_stripe_count;
5539 if (append_pool != NULL) {
5541 } else if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
5542 /* XXX: sanity check here */
5543 v3 = (struct lov_user_md_v3 *)v1;
5544 if (v3->lmm_pool_name[0] != '\0')
5545 pool = v3->lmm_pool_name;
5548 lod_set_pool(&llc->llc_pool, pool);
5550 if (v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC &&
5551 append_stripe_count == 0 &&
5552 append_pool == NULL) {
5553 v3 = (struct lov_user_md_v3 *)v1;
5554 rc = lod_comp_copy_ost_lists(llc, v3);
5557 } else if (llc->llc_ostlist.op_array &&
5558 llc->llc_ostlist.op_count) {
5559 for (j = 0; j < llc->llc_ostlist.op_count; j++)
5560 llc->llc_ostlist.op_array[j] = -1;
5561 llc->llc_ostlist.op_count = 0;
5565 lds->lds_def_striping_set = 1;
5569 static inline void lod_lum2lds(struct lod_default_striping *lds,
5570 const struct lmv_user_md *lum)
5572 lds->lds_dir_def_stripe_count = le32_to_cpu(lum->lum_stripe_count);
5573 lds->lds_dir_def_stripe_offset = le32_to_cpu(lum->lum_stripe_offset);
5574 lds->lds_dir_def_hash_type = le32_to_cpu(lum->lum_hash_type);
5575 lds->lds_dir_def_max_inherit = lum->lum_max_inherit;
5576 lds->lds_dir_def_max_inherit_rr = lum->lum_max_inherit_rr;
5577 lds->lds_dir_def_striping_set = 1;
/*
 * Reviewer notes for lod_get_default_lmv_striping().  This listing elides
 * some source lines (a declaration, the error check after the EA read, the
 * return).  Visible behaviour:
 *  - lds->lds_dir_def_striping_set is cleared first, so it stays cleared
 *    unless the EA is successfully decoded below;
 *  - lod_get_default_lmv_ea() fills info->lti_ea_store and its return
 *    value is used as the EA length;
 *  - only when that length is at least sizeof(struct lmv_user_md) is the
 *    buffer handed to lod_lum2lds(), which also sets the flag; a shorter
 *    EA is not decoded.
 */
5581 * Get default directory striping.
5583 * \param[in] env execution environment
5584 * \param[in] lo object
5585 * \param[out] lds default striping
5587 * \retval 0 on success
5588 * \retval negative if failed
5590 static int lod_get_default_lmv_striping(const struct lu_env *env,
5591 struct lod_object *lo,
5592 struct lod_default_striping *lds)
5594 struct lmv_user_md *lmu;
5597 lds->lds_dir_def_striping_set = 0;
5599 rc = lod_get_default_lmv_ea(env, lo);
5603 if (rc >= (int)sizeof(*lmu)) {
5604 struct lod_thread_info *info = lod_env_info(env);
5606 lmu = info->lti_ea_store;
5607 lod_lum2lds(lds, lmu);
/*
 * Reviewer notes for lod_get_default_striping().  This listing elides some
 * source lines (declarations, one test, the return).  Visible behaviour:
 *  - the default LOV striping is read with a NULL allocation hint, i.e.
 *    without any append stripe count or append pool;
 *  - when a default LOV striping was found it is checked with
 *    lod_verify_striping() against info->lti_buf, and
 *    lds_def_striping_set is cleared again afterwards (the condition on
 *    that line is elided - presumably verification failure);
 *  - the default LMV striping comes from one of three sources, tested in
 *    this order: the EA data carried in \a ah when ah->dah_eadata_is_dmv
 *    is set; nothing at all when ah->dah_dmv_imp_inherit is set; otherwise
 *    the EA stored on \a lo;
 *  - a failure of the LMV read (rc1 < 0) matters only when rc is still 0
 *    (the statement taken is elided - presumably rc = rc1).
 *
 * \param[in] ah	allocation hint; dereferenced unconditionally here
 */
5614 * Get default striping in the object.
5616 * Get object default striping and default directory striping.
5618 * \param[in] env execution environment
5619 * \param[in] lo object
5620 * \param[out] lds default striping
5622 * \retval 0 on success
5623 * \retval negative if failed
5625 static int lod_get_default_striping(const struct lu_env *env,
5626 struct lod_object *lo,
5627 struct dt_allocation_hint *ah,
5628 struct lod_default_striping *lds)
5632 rc = lod_get_default_lov_striping(env, lo, lds, NULL);
5633 if (lds->lds_def_striping_set) {
5634 struct lod_thread_info *info = lod_env_info(env);
5635 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5637 rc = lod_verify_striping(env, d, lo, &info->lti_buf, false);
5639 lds->lds_def_striping_set = 0;
5642 if (ah->dah_eadata_is_dmv) {
5643 lod_lum2lds(lds, ah->dah_eadata);
5644 } else if (ah->dah_dmv_imp_inherit) {
5645 lds->lds_dir_def_striping_set = 0;
5647 rc1 = lod_get_default_lmv_striping(env, lo, lds);
5648 if (rc == 0 && rc1 < 0)
/*
 * Reviewer notes for lod_striping_from_default().  This listing elides
 * some source lines (declarations, error returns, a continue), so only the
 * visible steps are described:
 *
 * Regular file with a default LOV striping:
 *  - component entries are allocated for the default mirror and component
 *    counts; ldo_is_composite is inherited, and ldo_flr_state becomes
 *    LCM_FL_RDONLY when the default has more than one mirror;
 *  - each default component is copied by structure assignment, which also
 *    copies the llc_pool and llc_ostlist.op_array pointers; both are then
 *    replaced: the pool is cleared and set again through
 *    lod_obj_set_pool(), and a non-empty OST list is duplicated into a new
 *    allocation of op_size bytes (an empty one has its array pointer
 *    cleared);
 *  - lod_adjust_stripe_info() is called after a test for a non-composite
 *    layout (the statement taken by that test is elided - presumably the
 *    adjustment is skipped for plain layouts, as the comment above it
 *    suggests).
 *
 * Directory with a default LMV striping:
 *  - stripe count, stripe offset and hash type are taken from the default
 *    only when the object still holds 0, -1 and LMV_HASH_TYPE_UNKNOWN
 *    respectively.
 *
 * NOTE(review): when the OST list allocation fails, the statement taken is
 * elided; the copied op_size and op_count stay in obj_comp at that point -
 * confirm that callers cope with a NULL op_array.
 */
5656 * Apply default striping on object.
5658 * If object striping pattern is not set, set to the one in default striping.
5659 * The default striping is from parent or fs.
5661 * \param[in] lo new object
5662 * \param[in] lds default striping
5663 * \param[in] mode new object's mode
5665 static void lod_striping_from_default(struct lod_object *lo,
5666 const struct lod_default_striping *lds,
5669 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5672 if (lds->lds_def_striping_set && S_ISREG(mode)) {
5673 struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
5675 rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
5676 lds->lds_def_comp_cnt);
5680 lo->ldo_is_composite = lds->lds_def_striping_is_composite;
5681 if (lds->lds_def_mirror_cnt > 1)
5682 lo->ldo_flr_state = LCM_FL_RDONLY;
5684 for (i = 0; i < lo->ldo_comp_cnt; i++) {
5685 struct lod_layout_component *obj_comp =
5686 &lo->ldo_comp_entries[i];
5687 struct lod_layout_component *def_comp =
5688 &lds->lds_def_comp_entries[i];
5691 "inherit "DFID" file layout from default: flags=%#x size=%u nr=%u offset=%u pattern=%#x pool=%s\n",
5692 PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5693 def_comp->llc_flags,
5694 def_comp->llc_stripe_size,
5695 def_comp->llc_stripe_count,
5696 def_comp->llc_stripe_offset,
5697 def_comp->llc_pattern,
5698 def_comp->llc_pool ?: "");
5700 *obj_comp = *def_comp;
5701 if (def_comp->llc_pool != NULL) {
5702 /* pointer was copied from def_comp */
5703 obj_comp->llc_pool = NULL;
5704 lod_obj_set_pool(lo, i, def_comp->llc_pool);
5708 if (def_comp->llc_ostlist.op_array &&
5709 def_comp->llc_ostlist.op_count) {
5710 OBD_ALLOC(obj_comp->llc_ostlist.op_array,
5711 obj_comp->llc_ostlist.op_size);
5712 if (!obj_comp->llc_ostlist.op_array)
5714 memcpy(obj_comp->llc_ostlist.op_array,
5715 def_comp->llc_ostlist.op_array,
5716 obj_comp->llc_ostlist.op_size);
5717 } else if (def_comp->llc_ostlist.op_array) {
5718 obj_comp->llc_ostlist.op_array = NULL;
5722 * Don't initialize these fields for plain layout
5723 * (v1/v3) here, they are inherited in the order of
5724 * 'parent' -> 'fs default (root)' -> 'global default
5725 * values for stripe_count & stripe_size'.
5727 * see lod_ah_init().
5729 if (!lo->ldo_is_composite)
5732 lod_adjust_stripe_info(obj_comp, desc, 0);
5734 } else if (lds->lds_dir_def_striping_set && S_ISDIR(mode)) {
5735 if (lo->ldo_dir_stripe_count == 0)
5736 lo->ldo_dir_stripe_count =
5737 lds->lds_dir_def_stripe_count;
5738 if (lo->ldo_dir_stripe_offset == -1)
5739 lo->ldo_dir_stripe_offset =
5740 lds->lds_dir_def_stripe_offset;
5741 if (lo->ldo_dir_hash_type == LMV_HASH_TYPE_UNKNOWN)
5742 lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
5745 "inherit "DFID" dir layout from default: count=%hu offset=%u hash_type=%x\n",
5746 PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5747 lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
5748 lo->ldo_dir_hash_type);
/*
 * Reviewer notes for lod_need_inherit_more().  Every return statement of
 * this predicate is elided in this listing, so the value produced by each
 * test cannot be stated from here; only the tests and their order are
 * visible:
 *  1. the object has no component entries at all;
 *  2. the layout is composite;
 *  3. the first component has a stripe count or stripe size <= 0;
 *  4. \a from_root is set and the first component lacks a pool or still
 *     has LOV_OFFSET_DEFAULT as its stripe offset;
 *  5. a non-empty \a append_pool was supplied.
 * Judging by the name, and by the callers in lod_ah_init() that inherit
 * more when the result is true, tests 1, 3, 4 and 5 presumably answer
 * true and test 2 false - TODO confirm against the full source.
 */
5752 static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root,
5753 const char *append_pool)
5755 struct lod_layout_component *lod_comp;
5757 if (lo->ldo_comp_cnt == 0)
5760 if (lo->ldo_is_composite)
5763 lod_comp = &lo->ldo_comp_entries[0];
5765 if (lod_comp->llc_stripe_count <= 0 ||
5766 lod_comp->llc_stripe_size <= 0)
5769 if (from_root && (lod_comp->llc_pool == NULL ||
5770 lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT))
5773 if (append_pool && append_pool[0])
/*
 * Reviewer notes for lod_ah_init().  This listing elides many source lines
 * (declarations, braces, else branches, returns), so only the visible
 * steps are described.
 *
 * Common part:
 *  - an append stripe count of -1 in \a ah is replaced by ld_tgt_count of
 *    the OST descriptor;
 *  - \a parent may be NULL; nextp and lp are set only when it is not;
 *  - the child must not be striped yet (asserted); for a regular file any
 *    component entries already attached to it are freed;
 *  - when the lower-layer child object does not exist yet, its own
 *    do_ah_init method is invoked.
 *
 * Directory (S_ISDIR(child_mode)):
 *  - the stripe offset starts as LMV_OFFSET_DEFAULT;
 *  - EA data whose magic is LMV_MAGIC_FOREIGN marks the child as foreign;
 *  - defaults are read from the parent with lod_get_default_striping();
 *  - EA data that is not a default LMV and carries LMV_USER_MAGIC,
 *    LMV_USER_MAGIC_SPECIFIC or LMV_MAGIC_V1 sets stripe count, offset and
 *    hash type directly; with active MDT targets, a stripe count below 2
 *    and lum_max_inherit other than LMV_INHERIT_NONE, a default LMV is
 *    also built in \a lds for the new directory;
 *  - otherwise the defaults are applied via lod_striping_from_default();
 *  - a stripe count of exactly 1 is turned into 0; a count of (__u16)(-1)
 *    becomes lod_max_mdt_stripecount when that is non-zero; the count is
 *    capped at lod_remote_mdt_count + 1, multiplied by
 *    lod_max_stripes_per_mdt for overstriped hash types;
 *  - an unknown hash type is replaced by the known flag bits combined with
 *    ld_pattern of the LMV descriptor;
 *  - with LOHA_FSCRYPT_MD set on the child, the stripe count is forced to
 *    0 and a default LMV pinned to the local node id is installed.
 *
 * Regular file:
 *  - nothing more is done unless lod_object_will_be_striped() accepts the
 *    child (the statement taken is elided - presumably a return);
 *  - defaults are tried from the parent first, then from the root object
 *    cached in d->lod_md_root, and finally lod_adjust_stripe_info() is
 *    applied with the LOV descriptor; the last two steps run only when
 *    lod_need_inherit_more() asks for them.
 *
 * NOTE(review): several directory-branch tests evaluate
 * fid_is_root(lod_object_fid(lp)) although lp is assigned only for a
 * non-NULL parent - confirm that directories are never created with a NULL
 * parent here.
 * NOTE(review): d->lod_md_root is tested for NULL before lod_lock is
 * taken; under the lock the visible code drops a reference that was
 * installed in the meantime and stores the newly located root.
 */
5780 * Implementation of dt_object_operations::do_ah_init.
5782 * This method is used to make a decision on the striping configuration for the
5783 * object being created. It can be taken from the \a parent object if it exists,
5784 * or filesystem's default. The resulting configuration (number of stripes,
5785 * stripe size/offset, pool name, hash_type, etc.) is stored in the object
5786 * itself and will be used by the methods like ->doo_declare_create().
5788 * \see dt_object_operations::do_ah_init() in the API description for details.
5790 static void lod_ah_init(const struct lu_env *env,
5791 struct dt_allocation_hint *ah,
5792 struct dt_object *parent,
5793 struct dt_object *child,
5796 struct lod_device *d = lu2lod_dev(child->do_lu.lo_dev);
5797 struct lod_thread_info *info = lod_env_info(env);
5798 struct lod_default_striping *lds = lod_lds_buf_get(env);
5799 struct dt_object *nextp = NULL;
5800 struct dt_object *nextc;
5801 struct lod_object *lp = NULL;
5802 struct lod_object *lc;
5803 struct lov_desc *desc;
5804 struct lod_layout_component *lod_comp;
5810 if (ah->dah_append_stripe_count == -1)
5811 ah->dah_append_stripe_count =
5812 d->lod_ost_descs.ltd_lov_desc.ld_tgt_count;
5814 if (likely(parent)) {
5815 nextp = dt_object_child(parent);
5816 lp = lod_dt_obj(parent);
5819 nextc = dt_object_child(child);
5820 lc = lod_dt_obj(child);
5822 LASSERT(!lod_obj_is_striped(child));
5823 /* default layout template may have been set on the regular file
5824 * when this is called from mdd_create_data() */
5825 if (S_ISREG(child_mode))
5826 lod_free_comp_entries(lc);
5828 if (!dt_object_exists(nextc))
5829 nextc->do_ops->do_ah_init(env, ah, nextp, nextc, child_mode);
5831 if (S_ISDIR(child_mode)) {
5832 const struct lmv_user_md_v1 *lum1 = ah->dah_eadata;
5833 int max_stripe_count;
5835 /* other default values are 0 */
5836 lc->ldo_dir_stripe_offset = LMV_OFFSET_DEFAULT;
5838 /* no default striping configuration is needed for
5841 if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
5842 le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_FOREIGN) {
5843 lc->ldo_is_foreign = true;
5844 /* keep stripe_count 0 and stripe_offset -1 */
5845 CDEBUG(D_INFO, "no default striping for foreign dir\n");
5849 if (likely(lp != NULL))
5850 lod_get_default_striping(env, lp, ah, lds);
5852 /* It should always honour the specified stripes */
5853 if (ah->dah_eadata && ah->dah_eadata_len &&
5854 !ah->dah_eadata_is_dmv &&
5855 (le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC ||
5856 le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC ||
5857 le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_V1)) {
5858 lc->ldo_dir_stripe_count =
5859 le32_to_cpu(lum1->lum_stripe_count);
5860 lc->ldo_dir_stripe_offset =
5861 le32_to_cpu(lum1->lum_stripe_offset);
5862 lc->ldo_dir_hash_type =
5863 le32_to_cpu(lum1->lum_hash_type);
5865 "set dirstripe: count %hu, offset %d, hash %x\n",
5866 lc->ldo_dir_stripe_count,
5867 (int)lc->ldo_dir_stripe_offset,
5868 lc->ldo_dir_hash_type);
5870 if (d->lod_mdt_descs.ltd_lmv_desc.ld_active_tgt_count &&
5871 lc->ldo_dir_stripe_count < 2 &&
5872 lum1->lum_max_inherit != LMV_INHERIT_NONE) {
5873 /* when filesystem-wide default LMV is set, dirs
5874 * will be created on MDT by space usage, but if
5875 * dir is created with "lfs mkdir -c 1 ...", its
5876 * subdirs should be kept on the same MDT. To
5877 * guarantee this, set default LMV for such dir.
5879 lds->lds_dir_def_stripe_count =
5880 le32_to_cpu(lum1->lum_stripe_count);
5881 /* if "-1" stripe offset is set, save current
5882 * MDT index in default LMV.
5884 if (le32_to_cpu(lum1->lum_stripe_offset) ==
5886 lds->lds_dir_def_stripe_offset =
5887 lod2lu_dev(d)->ld_site->ld_seq_site->ss_node_id;
5889 lds->lds_dir_def_stripe_offset =
5890 le32_to_cpu(lum1->lum_stripe_offset);
5891 lds->lds_dir_def_hash_type =
5892 le32_to_cpu(lum1->lum_hash_type);
5893 lds->lds_dir_def_max_inherit =
5894 lum1->lum_max_inherit;
5895 /* it will be decreased by 1 later in setting */
5896 if (lum1->lum_max_inherit >= LMV_INHERIT_END &&
5897 lum1->lum_max_inherit < LMV_INHERIT_MAX)
5898 lds->lds_dir_def_max_inherit++;
5899 lds->lds_dir_def_max_inherit_rr =
5900 lum1->lum_max_inherit_rr;
5901 lds->lds_dir_def_striping_set = 1;
5902 /* don't inherit LOV from ROOT */
5903 if (lds->lds_def_striping_set &&
5904 fid_is_root(lod_object_fid(lp)))
5905 lds->lds_def_striping_set = 0;
5906 lc->ldo_def_striping = lds;
5907 } else if (lds->lds_def_striping_set &&
5908 !fid_is_root(lod_object_fid(lp))) {
5909 /* don't inherit default LMV for "lfs mkdir" */
5910 lds->lds_dir_def_striping_set = 0;
5911 lc->ldo_def_striping = lds;
5914 /* inherit default striping except ROOT */
5915 if ((lds->lds_def_striping_set ||
5916 lds->lds_dir_def_striping_set) &&
5917 !fid_is_root(lod_object_fid(lp)))
5918 lc->ldo_def_striping = lds;
5920 /* transfer defaults LMV to new directory */
5921 lod_striping_from_default(lc, lds, child_mode);
5923 /* set count 0 to create normal directory */
5924 if (lc->ldo_dir_stripe_count == 1)
5925 lc->ldo_dir_stripe_count = 0;
5927 /* do not save default LMV on server */
5928 if (ah->dah_dmv_imp_inherit) {
5929 lds->lds_dir_def_striping_set = 0;
5930 if (!lds->lds_def_striping_set)
5931 lc->ldo_def_striping = NULL;
5935 /* shrink the stripe count to max_mdt_stripecount if it is -1
5936 * and max_mdt_stripecount is not 0
5938 if (lc->ldo_dir_stripe_count == (__u16)(-1) &&
5939 d->lod_max_mdt_stripecount)
5940 lc->ldo_dir_stripe_count = d->lod_max_mdt_stripecount;
5942 max_stripe_count = d->lod_remote_mdt_count + 1;
5943 if (lc->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED)
5944 max_stripe_count *= d->lod_max_stripes_per_mdt;
5946 /* shrink the stripe_count to max stripe count */
5947 if (lc->ldo_dir_stripe_count > max_stripe_count &&
5948 !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)) {
5949 lc->ldo_dir_stripe_count = max_stripe_count;
5950 if (lc->ldo_dir_stripe_count == 1)
5951 lc->ldo_dir_stripe_count = 0;
5954 if (!lmv_is_known_hash_type(lc->ldo_dir_hash_type))
5955 lc->ldo_dir_hash_type =
5956 (lc->ldo_dir_hash_type & LMV_HASH_FLAG_KNOWN) |
5957 d->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
5959 /* make sure all fscrypt metadata stays on same mdt */
5960 if (child->do_lu.lo_header->loh_attr & LOHA_FSCRYPT_MD) {
5961 lc->ldo_dir_stripe_count = 0;
5962 lds->lds_dir_def_stripe_offset =
5963 lod2lu_dev(d)->ld_site->ld_seq_site->ss_node_id;
5964 lds->lds_dir_def_striping_set = 1;
5965 lc->ldo_def_striping = lds;
5968 CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%x\n",
5969 lc->ldo_dir_stripe_count,
5970 (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type);
5975 /* child object regular file*/
5977 if (!lod_object_will_be_striped(S_ISREG(child_mode),
5978 lu_object_fid(&child->do_lu)))
5981 /* If object is going to be striped over OSTs, transfer default
5982 * striping information to the child, so that we can use it
5983 * during declaration and creation.
5985 * Try from the parent first.
5987 if (likely(lp != NULL)) {
5988 rc = lod_get_default_lov_striping(env, lp, lds, ah);
5989 if (rc == 0 && lds->lds_def_striping_set) {
5990 rc = lod_verify_striping(env, d, lp, &info->lti_buf,
5993 lod_striping_from_default(lc, lds, child_mode);
5997 /* Initialize lod_device::lod_md_root object reference */
5998 if (d->lod_md_root == NULL) {
5999 struct dt_object *root;
6000 struct lod_object *lroot;
6002 lu_root_fid(&info->lti_fid);
6003 root = dt_locate(env, &d->lod_dt_dev, &info->lti_fid);
6004 if (!IS_ERR(root)) {
6005 lroot = lod_dt_obj(root);
6007 spin_lock(&d->lod_lock);
6008 if (d->lod_md_root != NULL)
6009 dt_object_put(env, &d->lod_md_root->ldo_obj);
6010 d->lod_md_root = lroot;
6011 spin_unlock(&d->lod_lock);
6015 /* try inherit layout from the root object (fs default) when:
6016 * - parent does not have default layout; or
6017 * - parent has plain(v1/v3) default layout, and some attributes
6018 * are not specified in the default layout;
6020 if (d->lod_md_root != NULL &&
6021 lod_need_inherit_more(lc, true, ah->dah_append_pool)) {
6022 rc = lod_get_default_lov_striping(env, d->lod_md_root, lds,
6024 if (rc || !lds->lds_def_striping_set)
6027 rc = lod_verify_striping(env, d, d->lod_md_root, &info->lti_buf,
6032 if (lc->ldo_comp_cnt == 0) {
6033 lod_striping_from_default(lc, lds, child_mode);
6034 } else if (!lds->lds_def_striping_is_composite) {
6035 struct lod_layout_component *def_comp;
6037 LASSERT(!lc->ldo_is_composite);
6038 lod_comp = &lc->ldo_comp_entries[0];
6039 def_comp = &lds->lds_def_comp_entries[0];
6041 if (lod_comp->llc_stripe_count <= 0)
6042 lod_comp->llc_stripe_count =
6043 def_comp->llc_stripe_count;
6044 if (lod_comp->llc_stripe_size <= 0)
6045 lod_comp->llc_stripe_size =
6046 def_comp->llc_stripe_size;
6047 if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT &&
6048 (!lod_comp->llc_pool || !lod_comp->llc_pool[0]))
6049 lod_comp->llc_stripe_offset =
6050 def_comp->llc_stripe_offset;
6051 if (lod_comp->llc_pool == NULL)
6052 lod_qos_set_pool(lc, 0, def_comp->llc_pool);
6057 * fs default striping may not be explicitly set, or historically set
6058 * in config log, use them.
6060 if (lod_need_inherit_more(lc, false, ah->dah_append_pool)) {
6061 if (lc->ldo_comp_cnt == 0) {
6062 rc = lod_alloc_comp_entries(lc, 0, 1);
6064 /* fail to allocate memory, will create a
6065 * non-striped file. */
6067 lc->ldo_is_composite = 0;
6068 lod_comp = &lc->ldo_comp_entries[0];
6069 lod_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
6071 LASSERT(!lc->ldo_is_composite);
6072 lod_comp = &lc->ldo_comp_entries[0];
6073 desc = &d->lod_ost_descs.ltd_lov_desc;
6074 lod_adjust_stripe_info(lod_comp, desc,
6075 ah->dah_append_stripe_count);
6076 if (ah->dah_append_pool && ah->dah_append_pool[0])
6077 lod_qos_set_pool(lc, 0, ah->dah_append_pool);
/*
 * Reviewer notes for lod_declare_init_size().  This listing elides some
 * source lines (early returns, error checks, a break), so only the visible
 * steps are described:
 *  - a test for an unstriped object comes first (the statement taken is
 *    elided - presumably an early return);
 *  - the size is read from the child object; a one-byte extent
 *    [size - 1, size) is built to find the component that holds the last
 *    byte of the file;
 *  - components without llc_stripe are tested for (presumably skipped);
 *    for a non-composite layout the first component with stripes is used
 *    without consulting its extent;
 *  - stripe arithmetic, using do_div(), which divides its first argument
 *    in place and returns the remainder:
 *        size / stripe_size   -> index of the last stripe-size chunk
 *        chunk % stripe_count -> stripe object holding that chunk
 *        chunk / stripe_count -> number of full rounds before it
 *    the size declared on that stripe object is
 *        rounds * stripe_size + (la_size % stripe_size);
 *  - attr (the per-thread lti_attr) is overwritten with LA_SIZE and that
 *    per-stripe size before lod_sub_declare_attr_set() is called.
 *
 * NOTE(review): LASSERT(attr->la_valid & LA_SIZE) is placed directly after
 * dt_attr_get(), before any visible check of rc - confirm that a failed
 * dt_attr_get() cannot trip the assertion.
 */
6084 * Size initialization on late striping.
6086 * Propagate the size of a truncated object to a deferred striping.
6087 * This function handles a special case when truncate was done on a
6088 * non-striped object and now while the striping is being created
6089 * we can't lose that size, so we have to propagate it to the stripes
6092 * \param[in] env execution environment
6093 * \param[in] dt object
6094 * \param[in] th transaction handle
6096 * \retval 0 on success
6097 * \retval negative if failed
6099 static int lod_declare_init_size(const struct lu_env *env,
6100 struct dt_object *dt, struct thandle *th)
6102 struct dt_object *next = dt_object_child(dt);
6103 struct lod_object *lo = lod_dt_obj(dt);
6104 struct dt_object **objects = NULL;
6105 struct lu_attr *attr = &lod_env_info(env)->lti_attr;
6106 uint64_t size, offs;
6107 int i, rc, stripe, stripe_count = 0, stripe_size = 0;
6108 struct lu_extent size_ext;
6111 if (!lod_obj_is_striped(dt))
6114 rc = dt_attr_get(env, next, attr);
6115 LASSERT(attr->la_valid & LA_SIZE);
6119 size = attr->la_size;
6123 size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
6124 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6125 struct lod_layout_component *lod_comp;
6126 struct lu_extent *extent;
6128 lod_comp = &lo->ldo_comp_entries[i];
6130 if (lod_comp->llc_stripe == NULL)
6133 extent = &lod_comp->llc_extent;
6134 CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
6135 if (!lo->ldo_is_composite ||
6136 lu_extent_is_overlapped(extent, &size_ext)) {
6137 objects = lod_comp->llc_stripe;
6138 stripe_count = lod_comp->llc_stripe_count;
6139 stripe_size = lod_comp->llc_stripe_size;
6142 if (stripe_count == 0)
6145 LASSERT(objects != NULL && stripe_size != 0);
6146 do_div(size, stripe_size);
6147 stripe = do_div(size, stripe_count);
6148 LASSERT(objects[stripe] != NULL);
6150 size = size * stripe_size;
6151 offs = attr->la_size;
6152 size += do_div(offs, stripe_size);
6154 attr->la_valid = LA_SIZE;
6155 attr->la_size = size;
6157 rc = lod_sub_declare_attr_set(env, objects[stripe],
6166 * Declare creation of striped object.
6168 * The function declares creation stripes for a regular object. The function
6169 * also declares whether the stripes will be created with non-zero size if
6170 * previously size was set non-zero on the master object. If object \a dt is
6171 * not local, then only fully defined striping can be applied in \a lovea.
6172 * Otherwise \a lovea can be in the form of pattern, see lod_qos_parse_config()
6175 * \param[in] env execution environment
6176 * \param[in] dt object
6177 * \param[in] attr attributes the stripes will be created with
6178 * \param[in] lovea a buffer containing striping description
6179 * \param[in] th transaction handle
6181 * \retval 0 on success
6182 * \retval negative if failed
/*
 * Visible flow: for a non-remote child, lod_prepare_create() is called and
 * the size of the striping xattr is taken from lod_comp_md_size(); for a
 * remote child the caller-provided \a lovea buffer is used as is.  The LOV
 * xattr update is then declared on the child and, if the child already
 * exists, lod_declare_init_size() is called.
 *
 * NOTE(review): braces, error checks and the "out" label are elided from
 * this chunk; the comments describe only the statements that are shown.
 */
6184 int lod_declare_striped_create(const struct lu_env *env, struct dt_object *dt,
6185 struct lu_attr *attr,
6186 const struct lu_buf *lovea, struct thandle *th)
6188 struct lod_thread_info *info = lod_env_info(env);
6189 struct dt_object *next = dt_object_child(dt);
6190 struct lod_object *lo = lod_dt_obj(dt);
/* fault-injection hook: fail with -ENOMEM when the fail check fires */
6194 if (CFS_FAIL_CHECK(OBD_FAIL_MDS_ALLOC_OBDO))
6195 GOTO(out, rc = -ENOMEM);
6197 if (!dt_object_remote(next)) {
6198 /* choose OST and generate appropriate objects */
6199 rc = lod_prepare_create(env, lo, attr, lovea, th);
6204 * declare storage for striping data
6206 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6208 /* LOD can not choose OST objects for remote objects, i.e.
6209 * stripes must be ready before that. Right now, it can only
6210 * happen during migrate, i.e. migrate process needs to create
6211 * remote regular file (mdd_migrate_create), then the migrate
6212 * process will provide stripeEA. */
6213 LASSERT(lovea != NULL);
6214 info->lti_buf = *lovea;
/* declare the LOV xattr update on the child using the buffer set above */
6217 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
6218 XATTR_NAME_LOV, 0, th);
6223 * if striping is created with local object's size > 0,
6224 * we have to propagate this size to specific object
6225 * the case is possible only when local object was created previously
6227 if (dt_object_exists(next))
6228 rc = lod_declare_init_size(env, dt, th);
6231 /* failed to create striping or to set initial size, let's reset
6232 * config so that others don't get confused */
6234 lod_striping_free(env, lo);
6240 * Whether subdirectories under \a dt should be created on MDTs by space QoS
6242 * If LMV_HASH_FLAG_SPACE is set on directory default layout, its subdirectories
6243 * should be created on MDT by space QoS.
6245 * \param[in] env execution environment
6246 * \param[in] dev lu device
6247 * \param[in] dt object
6249 * \retval 1 if directory should create subdir by space usage
6251 * \retval negative errno if failed
/*
 * Visible flow: find the LOD slice of \a dt on \a dev, call
 * lod_get_default_lmv_ea() on it, and report whether the stripe offset in
 * the buffer at info->lti_ea_store equals LMV_OFFSET_DEFAULT.
 *
 * NOTE(review): the description preceding this function refers to
 * LMV_HASH_FLAG_SPACE, while the visible code compares lum_stripe_offset
 * with LMV_OFFSET_DEFAULT -- confirm which one is intended.
 * NOTE(review): the reference is released through \a dt, not through the
 * object returned by lu_object_find_slice(); presumably both refer to the
 * same object -- confirm.
 * NOTE(review): some lines (error checks, returns) are elided from this
 * chunk.
 */
6253 static inline int dt_object_qos_mkdir(const struct lu_env *env,
6254 struct lu_device *dev,
6255 struct dt_object *dt)
6257 struct lod_thread_info *info = lod_env_info(env);
6258 struct lu_object *obj;
6259 struct lod_object *lo;
6260 struct lmv_user_md *lmu;
6263 obj = lu_object_find_slice(env, dev, lu_object_fid(&dt->do_lu), NULL);
6265 return PTR_ERR(obj);
6267 lo = lu2lod_obj(obj);
6269 rc = lod_get_default_lmv_ea(env, lo);
6270 dt_object_put(env, dt);
/* rc is compared against the size of struct lmv_user_md, so a positive rc
 * is presumably the length of the EA that was read -- confirm */
6274 if (rc < (int)sizeof(*lmu))
6277 lmu = info->lti_ea_store;
6278 return le32_to_cpu(lmu->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
6282 * Implementation of dt_object_operations::do_declare_create.
6284 * The method declares creation of a new object. If the object will be striped,
6285 * then helper functions are called to find FIDs for the stripes, declare
6286 * creation of the stripes and declare initialization of the striping
6287 * information to be stored in the master object.
6289 * \see dt_object_operations::do_declare_create() in the API description
/*
 * Visible flow: declare creation of the local (child) object first, then
 * branch on dof->dof_type.  DFT_REGULAR with dof_reg.striped set goes to
 * lod_declare_striped_create(); DFT_DIR may bail out with -EREMOTE or
 * -EINVAL when the requested stripe offset names another MDT, and otherwise
 * goes to lod_declare_dir_striping_create().
 *
 * NOTE(review): braces, error checks, trailing call arguments and the "out"
 * label are elided from this chunk; comments cover the visible statements.
 */
6292 static int lod_declare_create(const struct lu_env *env, struct dt_object *dt,
6293 struct lu_attr *attr,
6294 struct dt_allocation_hint *hint,
6295 struct dt_object_format *dof, struct thandle *th)
6297 struct dt_object *next = dt_object_child(dt);
6298 struct lod_object *lo = lod_dt_obj(dt);
6307 * first of all, we declare creation of local object
6309 rc = lod_sub_declare_create(env, next, attr, hint, dof, th);
6314 * it's lod_ah_init() that has decided the object will be striped
6316 if (dof->dof_type == DFT_REGULAR) {
6317 /* callers don't want stripes */
6318 /* XXX: all tricky interactions with ->ah_make_hint() decided
6319 * to use striping, then ->declare_create() behaving differently
6320 * should be cleaned */
6321 if (dof->u.dof_reg.striped != 0)
6322 rc = lod_declare_striped_create(env, dt, attr,
6324 } else if (dof->dof_type == DFT_DIR) {
6325 struct seq_server_site *ss;
6326 struct lu_buf buf = { NULL };
6328 ss = lu_site2seq(dt->do_lu.lo_dev->ld_site);
6330 /* If the parent has default stripeEA, and client
6331 * did not find it before sending create request,
6332 * then MDT will return -EREMOTE, and client will
6333 * retrieve the default stripeEA and re-create the
6336 * Note: if dah_eadata != NULL, it means creating the
6337 * striped directory with specified stripeEA, then it
6338 * should ignore the default stripeEA */
6339 if (hint != NULL && hint->dah_eadata == NULL) {
6340 if (CFS_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT))
6341 GOTO(out, rc = -EREMOTE);
/* the stripe offset is explicit and differs from this node's id
 * (ss->ss_node_id): look for an MDT with that index */
6343 if (lo->ldo_dir_stripe_offset != LMV_OFFSET_DEFAULT &&
6344 lo->ldo_dir_stripe_offset != ss->ss_node_id) {
6345 struct lod_device *lod;
6346 struct lu_tgt_desc *mdt = NULL;
6347 bool found_mdt = false;
6349 lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6350 lod_foreach_mdt(lod, mdt) {
6351 if (mdt->ltd_index ==
6352 lo->ldo_dir_stripe_offset) {
6358 /* If the MDT indicated by stripe_offset can be
6359 * found, then tell client to resend the create
6360 * request to the correct MDT, otherwise return
6361 * error to client */
6363 GOTO(out, rc = -EREMOTE);
6365 GOTO(out, rc = -EINVAL);
/* the caller supplied a striping EA: pass it on through buf */
6367 } else if (hint && hint->dah_eadata) {
6368 buf.lb_buf = (void *)hint->dah_eadata;
6369 buf.lb_len = hint->dah_eadata_len;
6372 rc = lod_declare_dir_striping_create(env, dt, attr, &buf, dof,
6376 /* failed to create striping or to set initial size, let's reset
6377 * config so that others don't get confused */
6379 lod_striping_free(env, lo);
6384 * Generate component ID for new created component.
6386 * \param[in] lo LOD object
6387 * \param[in] comp_idx index of ldo_comp_entries
6389 * \retval component ID on success
6390 * \retval LCME_ID_INVAL on failure
/*
 * Pick a component ID for the entry at \a comp_idx and combine it with
 * \a mirror_id through pflr_id().  The layout generation is bumped and used
 * directly while it does not exceed SEQ_ID_MAX; otherwise candidate IDs are
 * compared against the IDs of all existing components until one is found
 * that none of them uses.  LCME_ID_INVAL is returned when the search fails.
 *
 * NOTE(review): several lines (local declarations, the initialisation of
 * "end", loop exits and the retry logic) are elided from this chunk.
 */
6392 static __u32 lod_gen_component_id(struct lod_object *lo,
6393 int mirror_id, int comp_idx)
6395 struct lod_layout_component *lod_comp;
6396 __u32 id, start, end;
/* the target entry must not have an ID assigned yet */
6399 LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
6401 lod_obj_inc_layout_gen(lo);
6402 id = lo->ldo_layout_gen;
6403 if (likely(id <= SEQ_ID_MAX))
6404 RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
6406 /* Layout generation wraps, need to check collisions. */
6407 start = id & SEQ_ID_MASK;
6410 for (id = start; id <= end; id++) {
6411 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6412 lod_comp = &lo->ldo_comp_entries[i];
6413 if (pflr_id(mirror_id, id) == lod_comp->llc_id)
6416 /* Found the unused ID */
6417 if (i == lo->ldo_comp_cnt)
6418 RETURN(pflr_id(mirror_id, id));
6421 if (end == SEQ_ID_MAX) {
6422 end = min_t(__u32, start, SEQ_ID_MAX) - 1;
6427 RETURN(LCME_ID_INVAL);
6431 * Creation of a striped regular object.
6433 * The function is called to create the stripe objects for a regular
6434 * striped file. This can happen at the initial object creation or
6435 * when the caller asks LOD to do so using ->do_xattr_set() method
6436 * (so called late striping). Notice all the information are already
6437 * prepared in the form of the list of objects (ldo_stripe field).
6438 * This is done during declare phase.
6440 * \param[in] env execution environment
6441 * \param[in] dt object
6442 * \param[in] attr attributes the stripes will be created with
6443 * \param[in] dof format of stripes (see OSD API description)
6444 * \param[in] th transaction handle
6446 * \retval 0 on success
6447 * \retval negative if failed
/*
 * Visible flow, performed with lo->ldo_layout_mutex held: determine the
 * highest mirror ID already present, walk all components assigning an ID to
 * those that have none and creating the stripe objects of those that are
 * not yet initialised, then call lod_fill_mirrors(), mark the layout as
 * cached and store it with lod_generate_and_set_lovea().  On the failure
 * path the striping is released with lod_striping_free_nolock() before the
 * mutex is dropped.
 *
 * NOTE(review): braces, local declarations, error checks, "continue"
 * statements and labels are elided from this chunk, so the exact conditions
 * guarding each statement cannot be confirmed from here.
 */
6449 int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
6450 struct lu_attr *attr, struct dt_object_format *dof,
6453 struct lod_layout_component *lod_comp;
6454 struct lod_object *lo = lod_dt_obj(dt);
6459 mutex_lock(&lo->ldo_layout_mutex);
/* a non-foreign layout must have at least one component entry */
6461 LASSERT((lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL) ||
6462 lo->ldo_is_foreign);
6464 mirror_id = 0; /* non-flr file's mirror_id is 0 */
/* with more than one mirror, start from the highest mirror ID in use */
6465 if (lo->ldo_mirror_count > 1) {
6466 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6467 lod_comp = &lo->ldo_comp_entries[i];
6468 if (lod_comp->llc_id != LCME_ID_INVAL &&
6469 mirror_id_of(lod_comp->llc_id) > mirror_id)
6470 mirror_id = mirror_id_of(lod_comp->llc_id);
6474 /* create all underlying objects */
6475 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6476 lod_comp = &lo->ldo_comp_entries[i];
6478 if (lod_comp->llc_id == LCME_ID_INVAL) {
6479 /* only the component of FLR layout with more than 1
6480 * mirror has mirror ID in its component ID.
6482 if (lod_comp->llc_extent.e_start == 0 &&
6483 lo->ldo_mirror_count > 1)
6486 lod_comp->llc_id = lod_gen_component_id(lo,
6488 if (lod_comp->llc_id == LCME_ID_INVAL)
6489 GOTO(out, rc = -ERANGE);
6492 if (lod_comp_inited(lod_comp))
6495 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN) {
6496 lod_comp_set_init(lod_comp);
6500 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
6501 lod_comp_set_init(lod_comp);
6503 if (lov_pattern(lod_comp->llc_pattern) & LOV_PATTERN_MDT)
6504 lod_comp_set_init(lod_comp);
6506 if (lod_comp->llc_stripe == NULL)
6509 LASSERT(lod_comp->llc_stripe_count);
6510 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6511 struct dt_object *object = lod_comp->llc_stripe[j];
6512 LASSERT(object != NULL);
6513 rc = lod_sub_create(env, object, attr, NULL, dof, th);
6517 lod_comp_set_init(lod_comp);
6520 rc = lod_fill_mirrors(lo);
6524 lo->ldo_comp_cached = 1;
6526 rc = lod_generate_and_set_lovea(env, lo, th);
6530 mutex_unlock(&lo->ldo_layout_mutex);
6535 lod_striping_free_nolock(env, lo);
6536 mutex_unlock(&lo->ldo_layout_mutex);
/*
 * Predicate on \a dt built from four visible checks: the child object
 * exists, the object is not a directory, the layout has at least one
 * component, and a pattern bit test on the first component.
 *
 * NOTE(review): the statements executed when a check fails and the mask
 * used in the final bit test are elided from this chunk; presumably the
 * early exits return false and the mask is the Data-on-MDT pattern --
 * confirm against the full source.
 */
6541 static inline bool lod_obj_is_dom(struct dt_object *dt)
6543 struct lod_object *lo = lod_dt_obj(dt);
6545 if (!dt_object_exists(dt_object_child(dt)))
6548 if (S_ISDIR(dt->do_lu.lo_header->loh_attr))
6551 if (!lo->ldo_comp_cnt)
6554 return (lov_pattern(lo->ldo_comp_entries[0].llc_pattern) &
6559 * Implementation of dt_object_operations::do_create.
6561 * If any of preceding methods (like ->do_declare_create(),
6562 * ->do_ah_init(), etc) chose to create a striped object,
6563 * then this method will create the master and the stripes.
6565 * \see dt_object_operations::do_create() in the API description for details.
/*
 * Visible flow: create the local (child) object, then, for a regular file
 * that is striped or Data-on-MDT and for which the caller requested
 * striping (dof_reg.striped != 0), create the stripes through
 * lod_striped_create().
 *
 * NOTE(review): local declarations, error checks and the return statement
 * are elided from this chunk.
 */
6567 static int lod_create(const struct lu_env *env, struct dt_object *dt,
6568 struct lu_attr *attr, struct dt_allocation_hint *hint,
6569 struct dt_object_format *dof, struct thandle *th)
6574 /* create local object */
6575 rc = lod_sub_create(env, dt_object_child(dt), attr, hint, dof, th);
6579 if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
6580 (lod_obj_is_striped(dt) || lod_obj_is_dom(dt)) &&
6581 dof->u.dof_reg.striped != 0) {
/* the layout must not have been marked as cached before this point */
6582 LASSERT(lod_dt_obj(dt)->ldo_comp_cached == 0);
6583 rc = lod_striped_create(env, dt, attr, dof, th);
/*
 * Per-stripe callback used by the destroy paths (it is installed as
 * locd_stripe_cb by lod_declare_destroy() and lod_destroy()).  When
 * data->locd_declare is set it only declares the destroy of stripe \a dt;
 * otherwise it destroys the stripe, unless the OBD_FAIL_LFSCK_LOST_SPEOBJ
 * fail check fires and \a stripe_idx differs from cfs_fail_val.
 *
 * NOTE(review): the line holding the return type and the final return are
 * elided from this chunk.
 */
6590 lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
6591 struct dt_object *dt, struct thandle *th,
6592 int comp_idx, int stripe_idx,
6593 struct lod_obj_stripe_cb_data *data)
6595 if (data->locd_declare)
6596 return lod_sub_declare_destroy(env, dt, th);
6598 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6599 stripe_idx == cfs_fail_val)
6600 return lod_sub_destroy(env, dt, th);
6606 * Implementation of dt_object_operations::do_declare_destroy.
6608 * If the object is a striped directory, then the function declares reference
6609 * removal from the master object (this is an index) to the stripes and declares
6610 * destroy of all the stripes. In all the cases, it declares an intention to
6611 * destroy the object itself.
6613 * \see dt_object_operations::do_declare_destroy() in the API description
/*
 * Visible flow: load the striping, then for a directory declare, per
 * stripe, a reference drop on the master and the deletion of the stripe's
 * name entry (the key is built from the stripe FID and index); declare the
 * destroy of the local object; finally declare the destroy of every stripe,
 * either directly (directories) or through lod_obj_for_each_stripe() with
 * lod_obj_stripe_destroy_cb() in declare mode.
 *
 * NOTE(review): braces, error checks, "continue" statements, early returns
 * and part of the snprintf() call are elided from this chunk.
 */
6616 static int lod_declare_destroy(const struct lu_env *env, struct dt_object *dt,
6619 struct dt_object *next = dt_object_child(dt);
6620 struct lod_object *lo = lod_dt_obj(dt);
6621 struct lod_thread_info *info = lod_env_info(env);
6622 struct dt_object *stripe;
/* per-thread key buffer reused to format each stripe name */
6623 char *stripe_name = info->lti_key;
6629 * load striping information, notice we don't do this when object
6630 * is being initialized as we don't need this information till
6631 * few specific cases like destroy, chown
6633 rc = lod_striping_load(env, lo);
6637 /* declare destroy for all underlying objects */
6638 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6639 rc = next->do_ops->do_index_try(env, next,
6640 &dt_directory_features);
6644 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6645 stripe = lo->ldo_stripe[i];
6649 rc = lod_sub_declare_ref_del(env, next, th);
6653 snprintf(stripe_name, sizeof(info->lti_key),
6655 PFID(lu_object_fid(&stripe->do_lu)), i);
6656 rc = lod_sub_declare_delete(env, next,
6657 (const struct dt_key *)stripe_name, th);
6664 * we declare destroy for the local object
6666 rc = lod_sub_declare_destroy(env, next, th);
/* fault-injection hooks (LFSCK "lost MDT object" scenarios) */
6670 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6671 CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6674 if (!lod_obj_is_striped(dt))
6677 /* declare destroy all striped objects */
6678 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6679 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6680 stripe = lo->ldo_stripe[i];
6684 if (!dt_object_exists(stripe))
6687 rc = lod_sub_declare_ref_del(env, stripe, th);
6691 rc = lod_sub_declare_destroy(env, stripe, th);
6696 struct lod_obj_stripe_cb_data data = { { 0 } };
6698 data.locd_declare = true;
6699 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6700 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6707 * Implementation of dt_object_operations::do_destroy.
6709 * If the object is a striped directory, then the function removes references
6710 * from the master object (this is an index) to the stripes and destroys all
6711 * the stripes. In all the cases, the function destroys the object itself.
6713 * \see dt_object_operations::do_destroy() in the API description for details.
/*
 * Visible flow, mirroring lod_declare_destroy(): for a directory, drop one
 * reference on the master per stripe and delete the "<FID>:<index>" name
 * entry of each stripe; destroy the local object; then destroy every
 * stripe, either directly (directories, with a reference drop under the
 * stripe's write lock) or through lod_obj_for_each_stripe() with
 * lod_obj_stripe_destroy_cb() in execute mode.
 *
 * NOTE(review): braces, error checks, "continue" statements and early
 * returns are elided from this chunk.
 */
6715 static int lod_destroy(const struct lu_env *env, struct dt_object *dt,
6718 struct dt_object *next = dt_object_child(dt);
6719 struct lod_object *lo = lod_dt_obj(dt);
6720 struct lod_thread_info *info = lod_env_info(env);
/* per-thread key buffer reused to format each stripe name */
6721 char *stripe_name = info->lti_key;
6722 struct dt_object *stripe;
6728 /* destroy sub-stripe of master object */
6729 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6730 rc = next->do_ops->do_index_try(env, next,
6731 &dt_directory_features);
6735 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6736 stripe = lo->ldo_stripe[i];
6740 rc = lod_sub_ref_del(env, next, th);
6744 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
6745 PFID(lu_object_fid(&stripe->do_lu)), i);
6747 CDEBUG(D_INFO, DFID" delete stripe %s "DFID"\n",
6748 PFID(lu_object_fid(&dt->do_lu)), stripe_name,
6749 PFID(lu_object_fid(&stripe->do_lu)));
6751 rc = lod_sub_delete(env, next,
6752 (const struct dt_key *)stripe_name, th);
6758 rc = lod_sub_destroy(env, next, th);
/* fault-injection hooks (LFSCK "lost MDT object" scenarios) */
6762 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6763 CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6766 if (!lod_obj_is_striped(dt))
6769 /* destroy all striped objects */
6770 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6771 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6772 stripe = lo->ldo_stripe[i];
6776 if (!dt_object_exists(stripe))
/* same fail-check gate as in lod_obj_stripe_destroy_cb() */
6779 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6780 i == cfs_fail_val) {
6781 dt_write_lock(env, stripe, DT_TGT_CHILD);
6782 rc = lod_sub_ref_del(env, stripe, th);
6783 dt_write_unlock(env, stripe);
6787 rc = lod_sub_destroy(env, stripe, th);
6793 struct lod_obj_stripe_cb_data data = { { 0 } };
6795 data.locd_declare = false;
6796 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6797 rc = lod_obj_for_each_stripe(env, lo, th, &data);
/**
 * Implementation of dt_object_operations::do_declare_ref_add.
 *
 * The declaration is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_add() in the API description
 * for details.
 */
static int lod_declare_ref_add(const struct lu_env *env,
			       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_declare_ref_add(env, next, th);
}
/**
 * Implementation of dt_object_operations::do_ref_add.
 *
 * The operation is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_add() in the API description for details.
 */
static int lod_ref_add(const struct lu_env *env,
		       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_ref_add(env, next, th);
}
/**
 * Implementation of dt_object_operations::do_declare_ref_del.
 *
 * The declaration is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_del() in the API description
 * for details.
 */
static int lod_declare_ref_del(const struct lu_env *env,
			       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_declare_ref_del(env, next, th);
}
/**
 * Implementation of dt_object_operations::do_ref_del
 *
 * The operation is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_del() in the API description for details.
 */
static int lod_ref_del(const struct lu_env *env,
		       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_ref_del(env, next, th);
}
6850 * Implementation of dt_object_operations::do_object_sync.
6852 * \see dt_object_operations::do_object_sync() in the API description
6855 static int lod_object_sync(const struct lu_env *env, struct dt_object *dt,
6856 __u64 start, __u64 end)
6858 return dt_object_sync(env, dt_object_child(dt), start, end);
6862 * Implementation of dt_object_operations::do_object_unlock.
6864 * Used to release LDLM lock(s).
6866 * \see dt_object_operations::do_object_unlock() in the API description
/*
 * Visible flow: take the slave-lock handle array stored in
 * einfo->ei_cbdata (set up by lod_object_lock()), assert that every handle
 * is already unused, invalidate every existing stripe of the directory,
 * then free the array and clear einfo->ei_cbdata.
 *
 * NOTE(review): local declarations, the early return for a NULL array and
 * the final return are elided from this chunk.
 */
6869 static int lod_object_unlock(const struct lu_env *env, struct dt_object *dt,
6870 struct ldlm_enqueue_info *einfo,
6871 union ldlm_policy_data *policy)
6873 struct lod_object *lo = lod_dt_obj(dt);
6874 struct lustre_handle_array *slave_locks = einfo->ei_cbdata;
6875 int slave_locks_size;
6879 if (slave_locks == NULL)
6882 LASSERT(S_ISDIR(dt->do_lu.lo_header->loh_attr));
6883 /* Note: for remote lock for single stripe dir, MDT will cancel
6884 * the lock by lockh directly */
6885 LASSERT(!dt_object_remote(dt_object_child(dt)));
6887 /* locks were unlocked in MDT layer */
6888 for (i = 0; i < slave_locks->ha_count; i++)
6889 LASSERT(!lustre_handle_is_used(&slave_locks->ha_handles[i]));
6892 * NB, ha_count may not equal to ldo_dir_stripe_count, because dir
6893 * layout may change, e.g., shrink dir layout after migration.
6895 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6896 if (lo->ldo_stripe[i])
6897 dt_invalidate(env, lo->ldo_stripe[i]);
/* the size is recomputed from ha_count, the count stored at allocation */
6900 slave_locks_size = offsetof(typeof(*slave_locks),
6901 ha_handles[slave_locks->ha_count]);
6902 OBD_FREE(slave_locks, slave_locks_size);
6903 einfo->ei_cbdata = NULL;
6909 * Implementation of dt_object_operations::do_object_lock.
6911 * Used to get LDLM lock on the non-striped and striped objects.
6913 * \see dt_object_operations::do_object_lock() in the API description
/*
 * Visible flow: when einfo->ei_enq_slave is not set, the request is passed
 * to the child object (which is asserted to be remote).  Otherwise, for a
 * directory with more than one stripe, a handle array sized for
 * ldo_dir_stripe_count is allocated and one lock per stripe is taken:
 * through dt_object_lock() for remote stripes, through
 * ldlm_cli_enqueue_local() for local ones.  On failure, locks are released
 * with ldlm_lock_decref_and_cancel() and the array is freed; on success the
 * array is handed to the caller in einfo->ei_cbdata.
 *
 * NOTE(review): braces, error checks, early returns, part of the local
 * enqueue arguments and the loop around the cancel call are elided from
 * this chunk.
 */
6916 static int lod_object_lock(const struct lu_env *env,
6917 struct dt_object *dt,
6918 struct lustre_handle *lh,
6919 struct ldlm_enqueue_info *einfo,
6920 union ldlm_policy_data *policy)
6922 struct lod_object *lo = lod_dt_obj(dt);
6923 int slave_locks_size;
6924 struct lustre_handle_array *slave_locks = NULL;
6929 /* remote object lock */
6930 if (!einfo->ei_enq_slave) {
6931 LASSERT(dt_object_remote(dt));
6932 return dt_object_lock(env, dt_object_child(dt), lh, einfo,
6936 if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
6939 rc = lod_striping_load(env, lo);
6944 if (lo->ldo_dir_stripe_count <= 1)
/* header plus one handle slot per directory stripe */
6947 slave_locks_size = offsetof(typeof(*slave_locks),
6948 ha_handles[lo->ldo_dir_stripe_count]);
6949 /* Freed in lod_object_unlock */
6950 OBD_ALLOC(slave_locks, slave_locks_size);
6953 slave_locks->ha_count = lo->ldo_dir_stripe_count;
6955 /* striped directory lock */
6956 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6957 struct lustre_handle lockh;
6958 struct ldlm_res_id *res_id;
6959 struct dt_object *stripe;
6961 stripe = lo->ldo_stripe[i];
/* the lock resource name is derived from the stripe FID */
6965 res_id = &lod_env_info(env)->lti_res_id;
6966 fid_build_reg_res_name(lu_object_fid(&stripe->do_lu), res_id);
6967 einfo->ei_res_id = res_id;
6969 if (dt_object_remote(stripe)) {
/* bit i of ha_map records that stripe i is remote */
6970 set_bit(i, (void *)slave_locks->ha_map);
6971 rc = dt_object_lock(env, stripe, &lockh, einfo, policy);
6973 struct ldlm_namespace *ns = einfo->ei_namespace;
6974 ldlm_blocking_callback blocking = einfo->ei_cb_local_bl;
6975 ldlm_completion_callback completion = einfo->ei_cb_cp;
6976 __u64 dlmflags = LDLM_FL_ATOMIC_CB;
6978 LASSERT(ns != NULL);
6979 rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_IBITS,
6980 policy, einfo->ei_mode,
6981 &dlmflags, blocking,
6983 NULL, 0, LVB_T_NONE,
6988 ldlm_lock_decref_and_cancel(
6989 &slave_locks->ha_handles[i],
6991 OBD_FREE(slave_locks, slave_locks_size);
6994 slave_locks->ha_handles[i] = lockh;
6996 einfo->ei_cbdata = slave_locks;
/**
 * Implementation of dt_object_operations::do_invalidate.
 *
 * The request is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_invalidate() in the API description for details
 */
static int lod_invalidate(const struct lu_env *env, struct dt_object *dt)
{
	struct dt_object *next = dt_object_child(dt);

	return dt_invalidate(env, next);
}
/*
 * Visible flow: for each of the info->lti_count component indexes listed in
 * info->lti_comp_idx[], call lod_qos_prep_create() with the given reserve,
 * then declare the update of the LOV xattr on the child object using the
 * size returned by lod_comp_md_size().
 *
 * NOTE(review): part of the parameter list, local declarations, error
 * checks and the return statement are elided from this chunk.
 */
7011 static int lod_declare_instantiate_components(const struct lu_env *env,
7012 struct lod_object *lo,
7016 struct lod_thread_info *info = lod_env_info(env);
/* the number of components to instantiate must stay below the total */
7021 LASSERT(info->lti_count < lo->ldo_comp_cnt);
7023 for (i = 0; i < info->lti_count; i++) {
7024 rc = lod_qos_prep_create(env, lo, NULL, th,
7025 info->lti_comp_idx[i], reserve);
7031 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
7032 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
7033 &info->lti_buf, XATTR_NAME_LOV, 0, th);
7040 * Check OSTs for an existing component for further extension
7042 * Checks if OSTs are still healthy and not out of space. Gets free space
7043 * on OSTs (relative to allocation watermark rmb_low) and compares to
7044 * the proposed new_end for this component.
7046 * Decides whether or not to extend a component on its current OSTs.
7048 * \param[in] env execution environment for this thread
7049 * \param[in] lo object we're checking
7050 * \param[in] index index of this component
7051 * \param[in] reserve space needed on each OST for the extension
7052 * \param[in] extent layout extent for requested operation
7053 * \param[in] comp_extent extension component extent
7054 * \param[in] write if this is write operation
7056 * \retval true - OK to extend on current OSTs
7057 * \retval false - do not extend on current OSTs
/*
 * Visible flow: for every stripe of component \a index, count how often its
 * OST repeats inside the component, check that the OST is still present in
 * lod_ost_bitmap, query its statfs data, reject unusable states (ENOSPC,
 * read-only, no-create, degraded) and compare "reserve * repeated" with the
 * space available above the low reservation watermark.  The OST descriptor
 * table is referenced for the duration of the loop.
 *
 * NOTE(review): braces, the result variable, "continue"/"break" statements
 * and the return are elided from this chunk.
 * NOTE(review): the loop-local "index" below shadows the \a index
 * parameter; after this point "index" means the OST index.
 */
7059 static bool lod_sel_osts_allowed(const struct lu_env *env,
7060 struct lod_object *lo,
7061 int index, __u64 reserve,
7062 struct lu_extent *extent,
7063 struct lu_extent *comp_extent, int write)
7065 struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[index];
7066 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7067 struct lod_thread_info *tinfo = lod_env_info(env);
7068 struct obd_statfs *sfs = &tinfo->lti_osfs;
7069 __u64 available = 0;
7075 LASSERT(lod_comp->llc_stripe_count != 0);
7077 lod_getref(&lod->lod_ost_descs);
7078 for (i = 0; i < lod_comp->llc_stripe_count; i++) {
7079 int index = lod_comp->llc_ost_indices[i];
7080 struct lod_tgt_desc *ost = OST_TGT(lod, index);
7081 struct obd_statfs_info info = { 0 };
7082 int j, repeated = 0;
7086 /* Get the number of times this OST repeats in this component.
7087 * Note: inter-component repeats are not counted as this is
7088 * considered as a rare case: we try to not repeat OST in other
7089 * components if possible. */
7090 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
7091 if (index != lod_comp->llc_ost_indices[j])
7094 /* already handled */
7100 if (j < lod_comp->llc_stripe_count)
7103 if (!test_bit(index, lod->lod_ost_bitmap)) {
7104 CDEBUG(D_LAYOUT, "ost %d no longer present\n", index);
7109 rc = dt_statfs_info(env, ost->ltd_tgt, sfs, &info);
7111 CDEBUG(D_LAYOUT, "statfs failed for ost %d, error %d\n",
7117 if (sfs->os_state & OS_STATFS_ENOSPC ||
7118 sfs->os_state & OS_STATFS_READONLY ||
7119 sfs->os_state & OS_STATFS_NOCREATE ||
7120 sfs->os_state & OS_STATFS_DEGRADED) {
7122 "OST%04x unusable for SEL extension, state %x\n",
7123 index, sfs->os_state);
7129 available = sfs->os_bavail * sfs->os_bsize;
7130 /* 'available' is relative to the allocation threshold */
/* NOTE(review): "available" is unsigned; if the reserved amount exceeds
 * it, this subtraction wraps to a very large value -- confirm that this
 * cannot happen or is handled elsewhere */
7131 available -= (__u64) info.os_reserved_mb_low << 20;
/* NOTE(review): the percentages below divide by sfs->os_blocks; presumably
 * it is never zero for a usable OST -- confirm */
7133 CDEBUG(D_LAYOUT, "ost %d lowwm: %d highwm: %d, "
7134 "%llu %% blocks available, %llu %% blocks free\n",
7135 index, info.os_reserved_mb_low, info.os_reserved_mb_high,
7136 (100ull * sfs->os_bavail) / sfs->os_blocks,
7137 (100ull * sfs->os_bfree) / sfs->os_blocks);
/* an OST used "repeated" times must hold that many reservations */
7139 if (reserve * repeated > available) {
7141 CDEBUG(D_LAYOUT, "low space on ost %d, available %llu "
7142 "< extension size %llu repeated %d\n", index,
7143 available, reserve, repeated);
7147 lod_putref(lod, &lod->lod_ost_descs);
7153 * Adjust extents after component removal
7155 * When we remove an extension component, we move the start of the next
7156 * component to match the start of the extension component, so no space is left
7159 * \param[in] env execution environment for this thread
7160 * \param[in] lo object
7161 * \param[in] max_comp layout component
7162 * \param[in] index index of this component
7164 * This function returns no value; the component extents are updated
7165 * in place.
/*
 * Visible flow: starting from the extension component at \a index, choose a
 * new start offset (the start of the previous component when that one is
 * marked with LCME_ID_INVAL, otherwise the start of the extension component
 * itself) and apply it to the components from index + 1 up to \a max_comp;
 * a component whose start equalled its end also gets its end moved.
 *
 * NOTE(review): braces, local declarations, the "else" keyword and the loop
 * exit are elided from this chunk, so how many components are actually
 * adjusted cannot be confirmed from here.
 * NOTE(review): prev/next are addresses of array elements at index - 1 and
 * index + 1, so the NULL checks below cannot detect an out-of-range index;
 * presumably callers guarantee 0 < index < max_comp - 1 -- confirm.
 */
7167 static void lod_sel_adjust_extents(const struct lu_env *env,
7168 struct lod_object *lo,
7169 int max_comp, int index)
7171 struct lod_layout_component *lod_comp = NULL;
7172 struct lod_layout_component *next = NULL;
7173 struct lod_layout_component *prev = NULL;
7174 __u64 new_start = 0;
7178 /* Extension space component */
7179 lod_comp = &lo->ldo_comp_entries[index];
7180 next = &lo->ldo_comp_entries[index + 1];
7181 prev = &lo->ldo_comp_entries[index - 1];
7183 LASSERT(lod_comp != NULL && prev != NULL && next != NULL);
7184 LASSERT(lod_comp->llc_flags & LCME_FL_EXTENSION);
7186 /* Previous is being removed */
7187 if (prev && prev->llc_id == LCME_ID_INVAL)
7188 new_start = prev->llc_extent.e_start;
7190 new_start = lod_comp->llc_extent.e_start;
7192 for (i = index + 1; i < max_comp; i++) {
7193 lod_comp = &lo->ldo_comp_entries[i];
/* remember the old start before overwriting it: it is needed for the
 * zero-length test below */
7195 start = lod_comp->llc_extent.e_start;
7196 lod_comp->llc_extent.e_start = new_start;
7198 /* We only move zero length extendable components */
7199 if (!(start == lod_comp->llc_extent.e_end))
7202 LASSERT(!(lod_comp->llc_flags & LCME_FL_INIT));
7204 lod_comp->llc_extent.e_end = new_start;
7208 /* Calculate the proposed 'new end' for a component we're extending */
/*
 * Visible flow: when \a extent_end is OBD_OBJECT_EOF the proposed end is
 * OBD_OBJECT_EOF; otherwise the distance from \a component_end to
 * \a extent_end (at least \a extension_size) is rounded up to a multiple of
 * \a extension_size, added to \a component_end and, if needed, rounded up
 * to a multiple of \a stripe_size.  A result smaller than \a extent_end is
 * replaced by OBD_OBJECT_EOF, and the result is capped at
 * \a extension_end.
 *
 * NOTE(review): braces, the declaration of new_end, the "else" keyword and
 * the return statement are elided from this chunk.
 * NOTE(review): "extent_end - component_end" is unsigned; presumably
 * callers guarantee extent_end >= component_end -- confirm.
 */
7209 static __u64 lod_extension_new_end(__u64 extension_size, __u64 extent_end,
7210 __u32 stripe_size, __u64 component_end,
7211 __u64 extension_end)
/* both sizes are used as rounding units and as a modulus below */
7215 LASSERT(extension_size != 0 && stripe_size != 0);
7217 /* Round up to extension size */
7218 if (extent_end == OBD_OBJECT_EOF) {
7219 new_end = OBD_OBJECT_EOF;
7221 /* Add at least extension_size to the previous component_end,
7222 * covering the req layout extent */
7223 new_end = max(extent_end - component_end, extension_size);
7224 new_end = roundup(new_end, extension_size);
7225 new_end += component_end;
7227 /* Component end must be min stripe size aligned */
7228 if (new_end % stripe_size) {
7229 CDEBUG(D_LAYOUT, "new component end is not aligned "
7230 "by the stripe size %u: [%llu, %llu) ext size "
7231 "%llu new end %llu, aligning\n",
7232 stripe_size, component_end, extent_end,
7233 extension_size, new_end);
7234 new_end = roundup(new_end, stripe_size);
/* a result below the requested end means the arithmetic above wrapped */
7238 if (new_end < extent_end)
7239 new_end = OBD_OBJECT_EOF;
7242 /* Don't extend past the end of the extension component */
7243 if (new_end > extension_end)
7244 new_end = extension_end;
7250 * Calculate the exact reservation (per-OST extension_size) on the OSTs being
7251 * instantiated. It needs to be calculated in advance and taken into account at
7252 * the instantiation time, because otherwise lod_statfs_and_check() may consider
7253 * an OST as OK, but SEL needs its extension_size to fit the free space and the
7254 * OST may turn out to be low-on-space, thus inappropriate OST may be used and
7257 * \param[in] lod_comp lod component we are checking
7259 * \retval size to reserve on each OST of lod_comp's stripe.
/*
 * Returns llc_stripe_size scaled by SEL_UNIT_SIZE and divided by the
 * component's stripe count.
 *
 * NOTE(review): the division assumes llc_stripe_count is non-zero;
 * presumably callers only pass components with a stripe count set --
 * confirm.
 * NOTE(review): whether the multiplication is carried out in 64 bits
 * depends on the type of SEL_UNIT_SIZE, which is not visible here.
 */
7261 static __u64 lod_sel_stripe_reserved(struct lod_layout_component *lod_comp)
7263 /* extension_size is file level, so we must divide by stripe count to
7264 * compare it to available space on a single OST */
7265 return lod_comp->llc_stripe_size * SEL_UNIT_SIZE /
7266 lod_comp->llc_stripe_count;
7269 /* As lod_sel_handler() could be re-entered for the same component several
7270 * times, this is the data for the next call. Fields could be changed to
7271 * component indexes when needed, (e.g. if there is no need to instantiate
7272 * all the previous components up to the current position) to tell the caller
7273 * where to start over from. */
7280 * Process extent updates for a particular layout component
7282 * Handle layout updates for a particular extension space component touched by
7283 * a layout update operation. Core function of self-extending PFL feature.
7285 * In general, this function processes exactly *one* stage of an extension
7286 * operation, modifying the layout accordingly, then returns to the caller.
7287 * The caller is responsible for restarting processing with the new layout,
7288 * which may repeatedly return to this function until the extension updates
7291 * This function does one of a few things to the layout:
7292 * 1. Extends the component before the current extension space component to
7293 * allow it to accomodate the requested operation (if space/policy permit that
7294 * component to continue on its current OSTs)
7296 * 2. If extension of the existing component fails, we do one of two things:
7297 * a. If there is a component after the extension space, we remove the
7298 * extension space component, move the start of the next component down
7299 * accordingly, then notify the caller to restart processing w/the new
7301 * b. If there is no following component, we try repeating the current
7302 * component, creating a new component using the current one as a
7303 * template (keeping its stripe properties but not specific striping),
7304 * and try assigning striping for this component. If there is sufficient
7305 * free space on the OSTs chosen for this component, it is instantiated
7306 * and i/o continues there.
7308 * If there is not sufficient space on the new OSTs, we remove this new
7309 * component & extend the current component.
7311 * Note further that uninited components followed by extension space can be zero
7312 * length meaning that we will try to extend them before initializing them, and
7313 * if that fails, they will be removed without initialization.
7315 * 3. If we extend to/beyond the end of an extension space component, that
7316 * component is exhausted (all of its range has been given to real components),
7317 * so we remove it and restart processing.
7319 * \param[in] env execution environment for this thread
7320 * \param[in,out] lo object to update the layout of
7321 * \param[in] extent layout extent for requested operation, update
7322 * layout to fit this operation
7323 * \param[in] th transaction handle for this operation
7324 * \param[in,out] max_comp the highest comp for the portion of the layout
7325 * we are operating on (For FLR, the chosen
7326 * replica). Updated because we may remove
7328 * \param[in] index index of the extension space component we're
7330 * \param[in] write if this is write op
7331 * \param[in,out] sd sd_force: if the extension is to be forced; set here
7332 to force it on the 2nd call for the same
7335 * \retval 0 on success
7336 * \retval negative errno on error
/*
 * Implementation notes for lod_sel_handler():
 *
 * - Works on the extension space component at \a index together with its
 *   neighbours: "prev" is entry index - 1 and "next" is entry index + 1,
 *   the latter only when index + 1 is below *max_comp.
 * - The extension size is the llc_stripe_size of the extension component
 *   multiplied by SEL_UNIT_SIZE.
 * - When prev has no stripe array yet, it is queued as the single entry of
 *   info->lti_comp_idx and declared for instantiation with the reserve
 *   obtained from lod_sel_stripe_reserved().
 * - Unless sd->sd_force is set, rc receives the negated answer of
 *   lod_sel_osts_allowed(), so rc == 1 means the previous component could
 *   not be extended.
 * - Components to be dropped are tagged with LCME_ID_INVAL and removed by
 *   lod_layout_del_prep_layout(); "change" is asserted to equal the negated
 *   return value of that call and is added to *max_comp, after which the
 *   prev/lod_comp pointers are re-read from lo->ldo_comp_entries.
 */
7338 static int lod_sel_handler(const struct lu_env *env,
7339 struct lod_object *lo,
7340 struct lu_extent *extent,
7341 struct thandle *th, int *max_comp,
7342 int index, int write,
7343 struct sel_data *sd)
7345 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7346 struct lod_thread_info *info = lod_env_info(env);
7347 struct lod_layout_component *lod_comp;
7348 struct lod_layout_component *prev;
7349 struct lod_layout_component *next = NULL;
7350 __u64 extension_size, reserve;
7357 /* First component cannot be extension space */
7359 CERROR("%s: "DFID" first component cannot be extension space\n",
7360 lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7364 lod_comp = &lo->ldo_comp_entries[index];
7365 prev = &lo->ldo_comp_entries[index - 1];
7366 if ((index + 1) < *max_comp)
7367 next = &lo->ldo_comp_entries[index + 1];
7369 /* extension size uses the stripe size field as KiB */
7370 extension_size = lod_comp->llc_stripe_size * SEL_UNIT_SIZE;
7372 CDEBUG(D_LAYOUT, "prev start %llu, extension start %llu, extension end"
7373 " %llu, extension size %llu\n", prev->llc_extent.e_start,
7374 lod_comp->llc_extent.e_start, lod_comp->llc_extent.e_end,
7377 /* Two extension space components cannot be adjacent & extension space
7378 * components cannot be init */
7379 if ((prev->llc_flags & LCME_FL_EXTENSION) ||
7380 !(ergo(next, !(next->llc_flags & LCME_FL_EXTENSION))) ||
7381 lod_comp_inited(lod_comp)) {
7382 CERROR("%s: "DFID" invalid extension space components\n",
7383 lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7387 reserve = lod_sel_stripe_reserved(lod_comp);
7389 if (!prev->llc_stripe) {
7390 CDEBUG(D_LAYOUT, "Previous component not inited\n");
7391 info->lti_count = 1;
7392 info->lti_comp_idx[0] = index - 1;
7393 rc = lod_declare_instantiate_components(env, lo, th, reserve);
7394 /* ENOSPC tells us we can't use this component. If there is
7395 * a next or we are repeating, we either spill over (next) or
7396 * extend the original comp (repeat). Otherwise, return the
7397 * error to the user. */
7398 if (rc == -ENOSPC && (next || sd->sd_repeat))
7404 if (sd->sd_force == 0 && rc == 0)
7405 rc = !lod_sel_osts_allowed(env, lo, index - 1, reserve, extent,
7406 &lod_comp->llc_extent, write);
7408 repeated = !!(sd->sd_repeat);
7412 /* Extend previous component */
7414 new_end = lod_extension_new_end(extension_size, extent->e_end,
7415 prev->llc_stripe_size,
7416 prev->llc_extent.e_end,
7417 lod_comp->llc_extent.e_end);
7419 CDEBUG(D_LAYOUT, "new end %llu\n", new_end);
7420 lod_comp->llc_extent.e_start = new_end;
7421 prev->llc_extent.e_end = new_end;
7423 if (prev->llc_extent.e_end == lod_comp->llc_extent.e_end) {
7424 CDEBUG(D_LAYOUT, "Extension component exhausted\n");
7425 lod_comp->llc_id = LCME_ID_INVAL;
7429 /* rc == 1, failed to extend current component */
7432 /* Normal 'spillover' case - Remove the extension
7433 * space component & bring down the start of the next
7435 lod_comp->llc_id = LCME_ID_INVAL;
7437 if (!(prev->llc_flags & LCME_FL_INIT)) {
7438 prev->llc_id = LCME_ID_INVAL;
7441 lod_sel_adjust_extents(env, lo, *max_comp, index);
7442 } else if (lod_comp_inited(prev)) {
7443 /* If there is no next, and the previous component is
7444 * INIT'ed, try repeating the previous component. */
7445 LASSERT(repeated == 0);
7446 rc = lod_layout_repeat_comp(env, lo, index - 1);
7450 /* The previous component is a repeated component.
7451 * Record this so we don't keep trying to repeat it. */
7454 /* If the previous component is not INIT'ed, this may
7455 * be a component we have just instantiated but failed
7456 * to extend. Or even a repeated component we failed
7457 * to prepare a striping for. Do not repeat but instead
7458 * remove the repeated component & force the extension
7459 * of the original one */
7462 prev->llc_id = LCME_ID_INVAL;
7469 rc = lod_layout_del_prep_layout(env, lo, NULL);
7472 LASSERTF(-rc == change,
7473 "number deleted %d != requested %d\n", -rc,
7476 *max_comp = *max_comp + change;
7478 /* lod_del_prep_layout reallocates ldo_comp_entries, so we must
7479 * refresh these pointers before using them */
7480 lod_comp = &lo->ldo_comp_entries[index];
7481 prev = &lo->ldo_comp_entries[index - 1];
7482 CDEBUG(D_LAYOUT, "After extent updates: prev start %llu, current start "
7483 "%llu, current end %llu max_comp %d ldo_comp_cnt %d\n",
7484 prev->llc_extent.e_start, lod_comp->llc_extent.e_start,
7485 lod_comp->llc_extent.e_end, *max_comp, lo->ldo_comp_cnt);
7487 /* Layout changed successfully */
7492 * Declare layout extent updates
7494 * Handles extensions. Identifies extension components touched by current
7495 * operation and passes them to processing function.
7497 * Restarts with updated layouts from the processing function until the current
7498 * operation no longer touches an extension space component.
7500 * \param[in] env execution environment for this thread
7501 * \param[in,out] lo object to update the layout of
7502 * \param[in] extent layout extent for requested operation, update layout to
7503 * fit this operation
7504 * \param[in] th transaction handle for this operation
7505 * \param[in] pick identifies chosen mirror for FLR layouts
7506 * \param[in] write if this is write op
7508 * \retval 1 on layout changed, 0 on no change
7509 * \retval negative errno on error
/*
 * Implementation notes for lod_declare_update_extents():
 *
 * - For a mirrored layout (lo->ldo_mirrors set) the scan range comes from
 *   lme_start/lme_end of mirror \a pick; max_comp is otherwise
 *   lo->ldo_comp_cnt.  For LCM_FL_NONE layouts the range is asserted to be
 *   the whole component array.
 * - Each component start is compared with extent->e_end to detect that the
 *   components of interest have been passed.
 * - Every component flagged LCME_FL_EXTENSION sets layout_changed and is
 *   handed to lod_sel_handler(), which may modify max_comp.
 * - Afterwards, for mirrored layouts, lme_end of the picked mirror and
 *   lme_start/lme_end of all later mirrors are shifted by the difference
 *   between the new max_comp - 1 and the old lme_end.
 * - lod_layout_data_init() is re-run for the current lo->ldo_comp_cnt.
 * - The return value is rc when negative, else rc2 when negative, else
 *   layout_changed.
 */
7511 static int lod_declare_update_extents(const struct lu_env *env,
7512 struct lod_object *lo, struct lu_extent *extent,
7513 struct thandle *th, int pick, int write)
7515 struct lod_thread_info *info = lod_env_info(env);
7516 struct lod_layout_component *lod_comp;
7517 bool layout_changed = false;
7518 struct sel_data sd = { 0 };
7526 /* This makes us work on the components of the chosen mirror */
7527 if (lo->ldo_mirrors) {
7528 start_index = lo->ldo_mirrors[pick].lme_start;
7529 max_comp = lo->ldo_mirrors[pick].lme_end + 1;
7532 max_comp = lo->ldo_comp_cnt;
7534 if (lo->ldo_flr_state == LCM_FL_NONE)
7535 LASSERT(start_index == 0 && max_comp == lo->ldo_comp_cnt);
7537 CDEBUG(D_LAYOUT, "extent->e_start %llu, extent->e_end %llu\n",
7538 extent->e_start, extent->e_end);
7539 for (i = start_index; i < max_comp; i++) {
7540 lod_comp = &lo->ldo_comp_entries[i];
7542 /* We've passed all components of interest */
7543 if (lod_comp->llc_extent.e_start >= extent->e_end)
7546 if (lod_comp->llc_flags & LCME_FL_EXTENSION) {
7547 layout_changed = true;
7548 rc = lod_sel_handler(env, lo, extent, th, &max_comp,
7553 /* Nothing has changed behind the prev one */
7559 /* We may have added or removed components. If so, we must update the
7560 * start & ends of all the mirrors after the current one, and the end
7561 * of the current mirror. */
7562 if (lo->ldo_mirrors) {
7563 change = max_comp - 1 - lo->ldo_mirrors[pick].lme_end;
7565 lo->ldo_mirrors[pick].lme_end += change;
7566 for (i = pick + 1; i < lo->ldo_mirror_count; i++) {
7567 lo->ldo_mirrors[i].lme_start += change;
7568 lo->ldo_mirrors[i].lme_end += change;
7575 /* The amount of components has changed, adjust the lti_comp_idx */
7576 rc2 = lod_layout_data_init(info, lo->ldo_comp_cnt);
7578 return rc < 0 ? rc : rc2 < 0 ? rc2 : layout_changed;
7581 /* Tell whether \a comp still needs to be instantiated.
 *
 * Components with LOV_MAGIC_FOREIGN magic are tested first.  For all
 * others the answer is "no" when the component already has a stripe array
 * (llc_stripe) or is an INIT'ed component whose pattern has
 * LOV_PATTERN_MDT set (DOM); otherwise it is "yes". */
7582 static bool lod_is_instantiation_needed(struct lod_layout_component *comp)
7584 if (comp->llc_magic == LOV_MAGIC_FOREIGN)
7587 return !(((lov_pattern(comp->llc_pattern) & LOV_PATTERN_MDT) &&
7588 lod_comp_inited(comp)) || comp->llc_stripe);
7592 * Declare layout update for a non-FLR layout.
7594 * \param[in] env execution environment for this thread
7595 * \param[in,out] lo object to update the layout of
7596 * \param[in] layout layout intent for requested operation, "update" is
7597 * a process of reacting to this
7598 * \param[in] buf buffer containing lov ea (see comment on usage inline)
7599 * \param[in] th transaction handle for this operation
7601 * \retval 0 on success
7602 * \retval negative errno on error
/*
 * Implementation notes for lod_declare_update_plain():
 *
 * - Only valid for layouts whose ldo_flr_state is LCM_FL_NONE (asserted).
 * - Replay path (non-empty \a buf): the buffer must carry the
 *   LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1 magic in either byte order,
 *   otherwise -EINVAL.  The striping is then taken from the buffer with
 *   lod_use_defined_striping(), ldo_comp_cached is set, and the on-disk EA
 *   is read with lod_get_lov_ea() so that comp_v1 points at it inside
 *   info->lti_buf.
 * - Normal path: the striping is loaded with lod_striping_load().
 * - A layout with more than one component whose last component ends before
 *   layout->lai_extent.e_end (and not at OBD_OBJECT_EOF) is rejected with
 *   -EINVAL.
 * - Extension components are processed by lod_declare_update_extents()
 *   using mirror index 0.
 * - The final loop compares each component start with lai_extent.e_end,
 *   consults lod_is_instantiation_needed(), and records component indices
 *   in info->lti_comp_idx; a component with LOV_PATTERN_F_RELEASED set
 *   yields -EINVAL.
 * - When layout_changed is set, the layout generation is bumped and the
 *   recorded components are declared for instantiation with a reserve of 0.
 */
7604 static int lod_declare_update_plain(const struct lu_env *env,
7605 struct lod_object *lo, struct layout_intent *layout,
7606 const struct lu_buf *buf, struct thandle *th)
7608 struct lod_thread_info *info = lod_env_info(env);
7609 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7610 struct lod_layout_component *lod_comp;
7611 struct lov_comp_md_v1 *comp_v1 = NULL;
7612 bool layout_changed = false;
7613 bool replay = false;
7617 LASSERT(lo->ldo_flr_state == LCM_FL_NONE);
7620 * In case the client is passing lovea, which only happens during
7621 * the replay of layout intent write RPC for now, we may need to
7622 * parse the lovea and apply new layout configuration.
7624 if (buf && buf->lb_len) {
7625 struct lov_user_md_v1 *v1 = buf->lb_buf;
7627 if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
7628 v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
7629 LOV_MAGIC_COMP_V1)) {
7630 CERROR("%s: the replay buffer of layout extend "
7631 "(magic %#x) does not contain expected "
7632 "composite layout.\n",
7633 lod2obd(d)->obd_name, v1->lmm_magic);
7634 GOTO(out, rc = -EINVAL);
7637 rc = lod_use_defined_striping(env, lo, buf);
7640 lo->ldo_comp_cached = 1;
7642 rc = lod_get_lov_ea(env, lo);
7645 /* old on-disk EA is stored in info->lti_buf */
7646 comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
7648 layout_changed = true;
7650 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
7654 /* non replay path */
7655 rc = lod_striping_load(env, lo);
7660 /* Make sure defined layout covers the requested write range. */
7661 lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
7662 if (lo->ldo_comp_cnt > 1 &&
7663 lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
7664 lod_comp->llc_extent.e_end < layout->lai_extent.e_end) {
7665 CDEBUG_LIMIT(replay ? D_ERROR : D_LAYOUT,
7666 "%s: the defined layout [0, %#llx) does not "
7667 "covers the write range "DEXT"\n",
7668 lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
7669 PEXT(&layout->lai_extent));
7670 GOTO(out, rc = -EINVAL);
7673 CDEBUG(D_LAYOUT, "%s: "DFID": update components "DEXT"\n",
7674 lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
7675 PEXT(&layout->lai_extent));
7678 rc = lod_declare_update_extents(env, lo, &layout->lai_extent,
7679 th, 0, layout->lai_opc == LAYOUT_INTENT_WRITE);
7683 layout_changed = true;
7687 * Iterate ld->ldo_comp_entries, find the component whose extent under
7688 * the write range and not instantianted.
7690 for (i = 0; i < lo->ldo_comp_cnt; i++) {
7691 lod_comp = &lo->ldo_comp_entries[i];
7693 if (lod_comp->llc_extent.e_start >= layout->lai_extent.e_end)
7697 /* If striping is instantiated or INIT'ed DOM skip */
7698 if (!lod_is_instantiation_needed(lod_comp))
7702 * In replay path, lod_comp is the EA passed by
7703 * client replay buffer, comp_v1 is the pre-recovery
7704 * on-disk EA, we'd sift out those components which
7705 * were init-ed in the on-disk EA.
7707 if (le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags) &
7712 * this component hasn't instantiated in normal path, or during
7713 * replay it needs replay the instantiation.
7716 /* A released component is being extended */
7717 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
7718 GOTO(out, rc = -EINVAL);
7720 LASSERT(info->lti_comp_idx != NULL);
7721 info->lti_comp_idx[info->lti_count++] = i;
7722 layout_changed = true;
7725 if (!layout_changed)
7728 lod_obj_inc_layout_gen(lo);
7729 rc = lod_declare_instantiate_components(env, lo, th, 0);
7733 lod_striping_free(env, lo);
/*
 * Return the position of \a lod_comp inside lo->ldo_comp_entries.
 *
 * The pointer is asserted to lie between the first and the last entry of
 * the array; the index is the pointer difference from the array start.
 */
7737 static inline int lod_comp_index(struct lod_object *lo,
7738 struct lod_layout_component *lod_comp)
7740 LASSERT(lod_comp >= lo->ldo_comp_entries &&
7741 lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
7743 return lod_comp - lo->ldo_comp_entries;
7747 * Stale other mirrors by writing extent.
/*
 * Implementation notes for lod_stale_components():
 *
 * - Each component of mirror \a primary that overlaps \a extent contributes
 *   its own extent (copied into pri_extent) as the range to work on.
 * - Inside the loop over all mirrors, extension components are first
 *   updated through lod_declare_update_extents() for pri_extent; a layout
 *   change there may reallocate the component array, hence the restart
 *   mentioned below.
 * - Components of the visited mirror that pass the overlap test get
 *   LCME_FL_STALE, the mirror gets lme_stale = 1, and components for which
 *   lod_is_hsm() is true additionally get HS_DIRTY in llc_foreign_flags.
 */
7749 static int lod_stale_components(const struct lu_env *env, struct lod_object *lo,
7750 int primary, struct lu_extent *extent,
7753 struct lod_layout_component *pri_comp, *lod_comp;
7754 struct lu_extent pri_extent;
7759 /* The writing extent decides which components in the primary
7760 * are affected... */
7761 CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
7764 lod_foreach_mirror_comp(pri_comp, lo, primary) {
7765 if (!lu_extent_is_overlapped(extent, &pri_comp->llc_extent))
7768 CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
7769 lod_comp_index(lo, pri_comp),
7770 PEXT(&pri_comp->llc_extent));
7772 pri_extent.e_start = pri_comp->llc_extent.e_start;
7773 pri_extent.e_end = pri_comp->llc_extent.e_end;
7775 for (i = 0; i < lo->ldo_mirror_count; i++) {
7779 rc = lod_declare_update_extents(env, lo, &pri_extent,
7781 /* if update_extents changed the layout, it may have
7782 * reallocated the component array, so start over to
7783 * avoid using stale pointers */
7789 /* ... and then stale other components that are
7790 * overlapping with primary components */
7791 lod_foreach_mirror_comp(lod_comp, lo, i) {
7792 if (!lu_extent_is_overlapped(
7794 &lod_comp->llc_extent))
7797 CDEBUG(D_LAYOUT, "stale: %u / %u\n",
7798 i, lod_comp_index(lo, lod_comp));
7800 lod_comp->llc_flags |= LCME_FL_STALE;
7801 lo->ldo_mirrors[i].lme_stale = 1;
7802 if (lod_is_hsm(lod_comp))
7803 lod_comp->llc_foreign_flags |= HS_DIRTY;
7812 * check an OST's availability
7813 * \param[in] env execution environment
7814 * \param[in] lo lod object
7815 * \param[in] dt dt object
7816 * \param[in] index mirror index
7818 * \retval negative if failed
7819 * \retval 1 if \a dt is available
7820 * \retval 0 if \a dt is not available
/*
 * Implementation notes for lod_check_ost_avail():
 *
 * The FID of \a dt is resolved with lod_fld_lookup(), starting from
 * sequence type LU_SEQ_RANGE_OST, to obtain a target index; a failed
 * lookup is reported with CERROR.  The target descriptor is then fetched
 * with OST_TGT() and its ltd_active field is tested; an inactive target is
 * reported via CDEBUG together with the mirror \a index.
 */
7822 static inline int lod_check_ost_avail(const struct lu_env *env,
7823 struct lod_object *lo,
7824 struct dt_object *dt, int index)
7826 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7827 struct lod_tgt_desc *ost;
7829 int type = LU_SEQ_RANGE_OST;
7832 rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
7834 CERROR("%s: can't locate "DFID":rc = %d\n",
7835 lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
7840 ost = OST_TGT(lod, idx);
7841 if (ost->ltd_active == 0) {
7842 CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail\n",
7843 PFID(lod_object_fid(lo)), index, idx);
7851 * Pick primary mirror for write
7852 * \param[in] env execution environment
7853 * \param[in] lo object
7854 * \param[in] extent write range
/*
 * Implementation notes for lod_primary_pick():
 *
 * - Mirrors are visited in the order (i + seq) % ldo_mirror_count.  seq is
 *   0 unless the OBD_FAIL_FLR_RANDOM_PICK_MIRROR fail check fires, in which
 *   case it is a random value reduced modulo the mirror count.
 * - Before the scan, OST statistics are refreshed with
 *   lod_qos_statfs_update() and the mirror table with lod_fill_mirrors().
 * - Mirrors with lme_stale set are logged as stale.
 * - second_pick remembers the first visited mirror with lme_prefer set;
 *   third_pick is considered only while neither fallback has been recorded.
 * - OST availability is checked with lod_check_ost_avail() for every stripe
 *   object of the components that are INIT'ed and have a stripe array.
 * - After the scan, "picked" can fall back to second_pick and then to
 *   third_pick.
 */
7856 static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
7857 struct lu_extent *extent)
7859 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7860 unsigned int seq = 0;
7861 struct lod_layout_component *lod_comp;
7863 int picked = -1, second_pick = -1, third_pick = -1;
7866 if (CFS_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
7867 get_random_bytes(&seq, sizeof(seq));
7868 seq %= lo->ldo_mirror_count;
7872 * Pick a mirror as the primary, and check the availability of OSTs.
7874 * This algo can be revised later after knowing the topology of
7877 lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
7879 rc = lod_fill_mirrors(lo);
7883 for (i = 0; i < lo->ldo_mirror_count; i++) {
7884 bool ost_avail = true;
7885 int index = (i + seq) % lo->ldo_mirror_count;
7887 if (lo->ldo_mirrors[index].lme_stale) {
7888 CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
7889 PFID(lod_object_fid(lo)), index);
7893 /* 2nd pick is for the primary mirror containing unavail OST */
7894 if (lo->ldo_mirrors[index].lme_prefer && second_pick < 0)
7895 second_pick = index;
7897 /* 3rd pick is for non-primary mirror containing unavail OST */
7898 if (second_pick < 0 && third_pick < 0)
7902 * we found a non-primary 1st pick, we'd like to find a
7903 * potential pirmary mirror.
7905 if (picked >= 0 && !lo->ldo_mirrors[index].lme_prefer)
7908 /* check the availability of OSTs */
7909 lod_foreach_mirror_comp(lod_comp, lo, index) {
7910 if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
7913 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
7914 struct dt_object *dt = lod_comp->llc_stripe[j];
7916 rc = lod_check_ost_avail(env, lo, dt, index);
7923 } /* for all dt object in one component */
7926 } /* for all components in a mirror */
7929 * the OSTs where allocated objects locates in the components
7930 * of the mirror are available.
7935 /* this mirror has all OSTs available */
7939 * primary with all OSTs are available, this is the perfect
7942 if (lo->ldo_mirrors[index].lme_prefer)
7944 } /* for all mirrors */
7946 /* failed to pick a sound mirror, lower our expectation */
7948 picked = second_pick;
7950 picked = third_pick;
/*
 * Collect the components to instantiate when resyncing by mirror id.
 *
 * The mirror id argument may carry the MIRROR_ID_NEG bit; the bit is
 * remembered in "neg" and stripped from the id.  Each mirror's lme_id is
 * then compared with the id: without neg the test singles out mirrors whose
 * id differs, with neg the mirror whose id matches.
 * NOTE(review): presumably the mirrors singled out by that test are the
 * ones skipped -- confirm.
 * Within a mirror, components are tested with lod_comp_inited() and their
 * indices, obtained from lod_comp_index(), are appended to
 * info->lti_comp_idx.
 */
7957 static int lod_prepare_resync_mirror(const struct lu_env *env,
7958 struct lod_object *lo,
7961 struct lod_thread_info *info = lod_env_info(env);
7962 struct lod_layout_component *lod_comp;
7963 bool neg = !!(MIRROR_ID_NEG & mirror_id);
7966 mirror_id &= ~MIRROR_ID_NEG;
7968 for (i = 0; i < lo->ldo_mirror_count; i++) {
7969 if ((!neg && lo->ldo_mirrors[i].lme_id != mirror_id) ||
7970 (neg && lo->ldo_mirrors[i].lme_id == mirror_id))
7973 lod_foreach_mirror_comp(lod_comp, lo, i) {
7974 if (lod_comp_inited(lod_comp))
7977 info->lti_comp_idx[info->lti_count++] =
7978 lod_comp_index(lo, lod_comp);
7986 * Figure out which components should be instantiated for resync.
/*
 * Implementation notes for lod_prepare_resync():
 *
 * Mirrors are examined one by one with a test on lme_stale; within a
 * mirror, components are tested for overlap with \a extent and with
 * lod_comp_inited().  Indices of the selected components, obtained from
 * lod_comp_index(), are appended to info->lti_comp_idx.
 *
 * Returns 0 when need_sync is non-zero and -EALREADY otherwise.
 */
7988 static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
7989 struct lu_extent *extent)
7991 struct lod_thread_info *info = lod_env_info(env);
7992 struct lod_layout_component *lod_comp;
7993 unsigned int need_sync = 0;
7997 DFID": instantiate all stale components in "DEXT"\n",
7998 PFID(lod_object_fid(lo)), PEXT(extent));
8001 * instantiate all components within this extent, even non-stale
8004 for (i = 0; i < lo->ldo_mirror_count; i++) {
8005 if (!lo->ldo_mirrors[i].lme_stale)
8008 lod_foreach_mirror_comp(lod_comp, lo, i) {
8009 if (!lu_extent_is_overlapped(extent,
8010 &lod_comp->llc_extent))
8015 if (lod_comp_inited(lod_comp))
8018 CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
8019 i, lod_comp_index(lo, lod_comp));
8020 info->lti_comp_idx[info->lti_count++] =
8021 lod_comp_index(lo, lod_comp);
8025 return need_sync ? 0 : -EALREADY;
/*
 * Locate the HSM component of a composite layout.
 *
 * The result pointer starts out as NULL and ldo_is_composite is tested
 * before anything else.  lo->ldo_mirrors is then scanned for an entry with
 * lme_hsm set.  Such a mirror is asserted to consist of exactly one
 * component (lme_start == lme_end), and that component is asserted to be an
 * HSM component covering [0, LUSTRE_EOF].
 *
 * NOTE(review): hsm_mirror_id looks like an optional output for the index
 * of the HSM mirror (callers pass a pointer or NULL) -- confirm.
 */
8028 static struct lod_layout_component *
8029 lod_locate_comp_hsm(struct lod_object *lo, int *hsm_mirror_id)
8031 struct lod_layout_component *lod_comp = NULL;
8034 if (!lo->ldo_is_composite)
8037 for (i = 0; i < lo->ldo_mirror_count; i++) {
8039 * FIXME: In the current design, there is only one HSM
8040 * mirror component in range [0, EOF] for a FLR file. This
8041 * should be fixed to support multiple HSM mirror components
8042 * with different HSM backend types and partial file ranges
8045 if (lo->ldo_mirrors[i].lme_hsm) {
8051 start_idx = lo->ldo_mirrors[i].lme_start;
8052 end_idx = lo->ldo_mirrors[i].lme_end;
8053 LASSERT(start_idx == end_idx);
8054 lod_comp = &lo->ldo_comp_entries[start_idx];
8055 LASSERT(lo->ldo_is_composite && lod_is_hsm(lod_comp) &&
8056 lod_comp->llc_extent.e_start == 0 &&
8057 lod_comp->llc_extent.e_end == LUSTRE_EOF);
/*
 * Declare the layout update that marks a file PCC read-only.
 *
 * - The striping is loaded and ldo_flr_state is tested for
 *   LCM_FL_PCC_RDONLY before any change is made.
 * - Existing HSM component (found by lod_locate_comp_hsm()): a component
 *   that already has HS_PCCRO is logged as "bad HSM flags".  Otherwise the
 *   layout generation is bumped, HS_PCCRO is set, HS_DIRTY and
 *   LCME_FL_STALE are cleared, the HSM mirror's lme_stale is reset,
 *   LCM_FL_PCC_RDONLY is added to ldo_flr_state and the LOV xattr update is
 *   declared.
 * - No HSM component: new mirror and component arrays are allocated with
 *   one extra slot each.  Existing components are given FLR ids via
 *   pflr_id(1, i + 1) when the file had no mirrors and their mirror id is
 *   0; a non-composite layout is turned into a single INIT'ed
 *   [0, LUSTRE_EOF] component.  The old components are copied over and the
 *   extra slot becomes a LOV_MAGIC_FOREIGN component of type
 *   LU_FOREIGN_TYPE_PCCRO covering [0, LUSTRE_EOF] with flags
 *   HS_EXISTS | HS_ARCHIVED | HS_PCCRO and a zeroed llc_hsm.  The old
 *   arrays are freed, the new ones installed, the new component gets an id
 *   from lod_gen_component_id(), and the flr state becomes LCM_FL_RDONLY
 *   (if it was LCM_FL_NONE) plus LCM_FL_PCC_RDONLY before the LOV xattr
 *   update is declared.
 */
8065 static int lod_declare_pccro_set(const struct lu_env *env,
8066 struct dt_object *dt, struct thandle *th)
8068 struct lod_thread_info *info = lod_env_info(env);
8069 struct lu_buf *buf = &info->lti_buf;
8070 struct lod_object *lo = lod_dt_obj(dt);
8071 struct lod_layout_component *lod_comp;
8072 struct lod_layout_component *comp_array;
8073 struct lod_mirror_entry *mirror_array;
8083 rc = lod_striping_load(env, lo);
8087 if (lo->ldo_flr_state & LCM_FL_PCC_RDONLY)
8090 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
8094 lod_comp = lod_locate_comp_hsm(lo, &hsm_mirror_id);
8096 if (lod_comp->llc_foreign_flags & HS_PCCRO) {
8097 CDEBUG(D_LAYOUT, "bad HSM flags: %#x\n",
8098 lod_comp->llc_foreign_flags);
8102 lod_obj_inc_layout_gen(lo);
8103 lod_comp->llc_foreign_flags |= HS_PCCRO;
8104 lod_comp->llc_foreign_flags &= ~HS_DIRTY;
8105 lod_comp->llc_flags &= ~LCME_FL_STALE;
8106 lo->ldo_mirrors[hsm_mirror_id].lme_stale = 0;
8107 lo->ldo_flr_state |= LCM_FL_PCC_RDONLY;
8108 buf->lb_len = lod_comp_md_size(lo, false);
8109 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8110 buf, XATTR_NAME_LOV, 0, th);
8115 * Create an new composite layout with only one HSM component.
8116 * Field @lhm_archive_uuid is used to be the identifier within HSM
8117 * backend for the archive copy. In the PCC case with a POSIX archive,
8118 * This can just be the original inode FID. This is important because
8119 * the inode FID may change due to layout swaps or migration to a new
8120 * MDT, and we do not want that to cause problems with finding the copy
8123 mirror_cnt = lo->ldo_mirror_count + 1;
8124 if (!lo->ldo_is_composite) {
8125 LASSERT(lo->ldo_mirror_count == 0);
8129 OBD_ALLOC_PTR_ARRAY(mirror_array, mirror_cnt);
8130 if (mirror_array == NULL)
8133 new_cnt = lo->ldo_comp_cnt + 1;
8134 OBD_ALLOC_PTR_ARRAY(comp_array, new_cnt);
8135 if (comp_array == NULL) {
8136 OBD_FREE_PTR_ARRAY(mirror_array, mirror_cnt);
8141 for (i = 0; i < lo->ldo_comp_cnt; i++) {
8142 lod_comp = &lo->ldo_comp_entries[i];
8145 * Add mirror from a non-flr file, create new mirror ID.
8146 * Otherwise, keep existing mirror's component ID, used
8147 * for mirror extension.
8149 if (lo->ldo_mirror_count == 0 &&
8150 mirror_id_of(lod_comp->llc_id) == 0)
8151 lod_comp->llc_id = pflr_id(1, i + 1);
8153 if (lod_comp->llc_id != LCME_ID_INVAL &&
8154 mirror_id_of(lod_comp->llc_id) > mirror_id)
8155 mirror_id = mirror_id_of(lod_comp->llc_id);
8157 if (!lo->ldo_is_composite) {
8158 lod_comp->llc_extent.e_start = 0;
8159 lod_comp->llc_extent.e_end = LUSTRE_EOF;
8160 lod_comp_set_init(lod_comp);
8164 memcpy(comp_array, lo->ldo_comp_entries,
8165 sizeof(*comp_array) * lo->ldo_comp_cnt);
8167 lod_comp = &comp_array[new_cnt - 1];
8168 lod_comp->llc_magic = LOV_MAGIC_FOREIGN;
8169 lod_comp->llc_extent.e_start = 0;
8170 lod_comp->llc_extent.e_end = LUSTRE_EOF;
8171 lod_comp->llc_length = sizeof(struct lov_hsm_base);
8172 lod_comp->llc_type = LU_FOREIGN_TYPE_PCCRO;
8173 lod_comp->llc_foreign_flags = HS_EXISTS | HS_ARCHIVED | HS_PCCRO;
8174 memset(&lod_comp->llc_hsm, 0, sizeof(lod_comp->llc_hsm));
8176 if (lo->ldo_mirrors)
8177 OBD_FREE_PTR_ARRAY(lo->ldo_mirrors, lo->ldo_mirror_count);
8178 OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
8181 * The @ldo_mirror will be refilled by lod_fill_mirrors() when
8182 * call lod_striped_create() for layout change.
8184 lo->ldo_mirrors = mirror_array;
8185 lo->ldo_mirror_count = mirror_cnt;
8186 lo->ldo_comp_entries = comp_array;
8187 lo->ldo_comp_cnt = new_cnt;
8188 lo->ldo_is_composite = 1;
8191 lod_comp->llc_id = LCME_ID_INVAL;
8192 lod_comp->llc_id = lod_gen_component_id(lo, mirror_id, new_cnt - 1);
8194 if (lo->ldo_flr_state == LCM_FL_NONE)
8195 lo->ldo_flr_state = LCM_FL_RDONLY;
8196 lo->ldo_flr_state |= LCM_FL_PCC_RDONLY;
8197 buf->lb_len = lod_comp_md_size(lo, false);
8198 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8199 buf, XATTR_NAME_LOV, 0, th);
8201 lod_striping_free(env, lo);
8207 * TODO: Clearing the LCM_FL_PCC_RDONLY flag from the layout means the file
8208 * is going to be modified. Currently this needs two RPCs: the first one clears
8209 * the LCM_FL_PCC_RDONLY flag; the second one picks the primary mirror and marks
8210 * the file as LCM_FL_WRITE_PENDING.
8211 * These two RPCs could be combined into a single RPC call.
/*
 * Declare the layout update that removes the PCC read-only state.
 *
 * The striping is loaded and ldo_flr_state is tested for LCM_FL_PCC_RDONLY
 * first.  The HSM component is looked up with lod_locate_comp_hsm(); when
 * none is found the call fails with -EINVAL.  Otherwise HS_PCCRO is cleared
 * from the component, LCM_FL_PCC_RDONLY is cleared from ldo_flr_state, the
 * layout generation is bumped and the LOV xattr update is declared with a
 * buffer length taken from lod_comp_md_size().
 */
8213 static int lod_declare_pccro_clear(const struct lu_env *env,
8214 struct dt_object *dt, struct thandle *th)
8216 struct lod_thread_info *info = lod_env_info(env);
8217 struct lod_object *lo = lod_dt_obj(dt);
8218 struct lod_layout_component *lod_comp;
8223 rc = lod_striping_load(env, lo);
8227 if (!(lo->ldo_flr_state & LCM_FL_PCC_RDONLY))
8230 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
8234 lod_comp = lod_locate_comp_hsm(lo, NULL);
8235 if (lod_comp == NULL) {
8236 CDEBUG(D_LAYOUT, "Not found any HSM component\n");
8237 GOTO(out, rc = -EINVAL);
8240 lod_comp->llc_foreign_flags &= ~HS_PCCRO;
8241 lo->ldo_flr_state &= ~LCM_FL_PCC_RDONLY;
8242 lod_obj_inc_layout_gen(lo);
8243 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
8244 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8245 &info->lti_buf, XATTR_NAME_LOV, 0, th);
8248 lod_striping_free(env, lo);
/*
 * Dispatch a PCC read-only layout intent.
 *
 * The opcode is read from mlc->mlc_intent->lai_opc:
 * LAYOUT_INTENT_PCCRO_SET is handled by lod_declare_pccro_set() and
 * LAYOUT_INTENT_PCCRO_CLEAR by lod_declare_pccro_clear().
 */
8253 static int lod_declare_update_pccro(const struct lu_env *env,
8254 struct dt_object *dt,
8255 struct md_layout_change *mlc,
8258 struct layout_intent *intent = mlc->mlc_intent;
8261 switch (intent->lai_opc) {
8262 case LAYOUT_INTENT_PCCRO_SET:
8263 rc = lod_declare_pccro_set(env, dt, th);
8265 case LAYOUT_INTENT_PCCRO_CLEAR:
8266 rc = lod_declare_pccro_clear(env, dt, th);
/*
 * Declare a layout change for a mirrored file in the LCM_FL_RDONLY state.
 *
 * Asserts that the state is LCM_FL_RDONLY, that the operation is
 * MD_LAYOUT_WRITE or MD_LAYOUT_RESYNC, and that there is at least one
 * mirror.
 *
 * - MD_LAYOUT_WRITE: a primary mirror is chosen with lod_primary_pick() and
 *   its extension components are updated for the intent extent.  For a
 *   LAYOUT_INTENT_TRUNC intent the extent is temporarily switched to
 *   [e_end, OBD_OBJECT_EOF) while lod_stale_components() runs.  Components
 *   of the picked mirror that overlap the extent and still need
 *   instantiation are recorded in info->lti_comp_idx, and the state becomes
 *   LCM_FL_WRITE_PENDING.
 * - MD_LAYOUT_RESYNC: with mlc_mirror_id == 0 the non-stale mirrors are
 *   scanned to build the extent passed to lod_prepare_resync(); otherwise
 *   lod_prepare_resync_mirror() is used for the given mirror id.  The state
 *   becomes LCM_FL_SYNC_PENDING.
 *
 * In both cases the layout generation is bumped, the recorded components
 * are declared for instantiation, and an attribute update carrying
 * LA_LAYOUT_VERSION is declared (la_layout_version is LU_LAYOUT_RESYNC for
 * resync, 0 otherwise).
 */
8276 static int lod_declare_update_rdonly(const struct lu_env *env,
8277 struct lod_object *lo, struct md_layout_change *mlc,
8280 struct lod_thread_info *info = lod_env_info(env);
8281 struct lu_attr *layout_attr = &info->lti_layout_attr;
8282 struct lod_layout_component *lod_comp;
8283 struct lu_extent extent = { 0 };
8287 LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
8288 LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
8289 mlc->mlc_opc == MD_LAYOUT_RESYNC);
8290 LASSERT(lo->ldo_mirror_count > 0);
8292 if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8293 struct layout_intent *layout = mlc->mlc_intent;
8294 int write = layout->lai_opc == LAYOUT_INTENT_WRITE;
8297 extent = layout->lai_extent;
8298 CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
8299 PFID(lod_object_fid(lo)), PEXT(&extent));
8301 picked = lod_primary_pick(env, lo, &extent);
8305 CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
8306 PFID(lod_object_fid(lo)),
8307 lo->ldo_mirrors[picked].lme_id);
8309 /* Update extents of primary before staling */
8310 rc = lod_declare_update_extents(env, lo, &extent, th, picked,
8315 if (layout->lai_opc == LAYOUT_INTENT_TRUNC) {
8317 * trunc transfers [0, size) in the intent extent, we'd
8318 * stale components overlapping [size, eof).
8320 extent.e_start = extent.e_end;
8321 extent.e_end = OBD_OBJECT_EOF;
8324 /* stale overlapping components from other mirrors */
8325 rc = lod_stale_components(env, lo, picked, &extent, th);
8329 /* restore truncate intent extent */
8330 if (layout->lai_opc == LAYOUT_INTENT_TRUNC)
8331 extent.e_end = extent.e_start;
8333 /* instantiate components for the picked mirror, start from 0 */
8336 lod_foreach_mirror_comp(lod_comp, lo, picked) {
8337 if (!lu_extent_is_overlapped(&extent,
8338 &lod_comp->llc_extent))
8341 if (!lod_is_instantiation_needed(lod_comp))
8344 info->lti_comp_idx[info->lti_count++] =
8345 lod_comp_index(lo, lod_comp);
8348 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
8349 } else { /* MD_LAYOUT_RESYNC */
8353 * could contain multiple non-stale mirrors, so we need to
8354 * prep uninited all components assuming any non-stale mirror
8355 * could be picked as the primary mirror.
8357 if (mlc->mlc_mirror_id == 0) {
8359 for (i = 0; i < lo->ldo_mirror_count; i++) {
8360 if (lo->ldo_mirrors[i].lme_stale)
8363 lod_foreach_mirror_comp(lod_comp, lo, i) {
8364 if (!lod_comp_inited(lod_comp))
8368 lod_comp->llc_extent.e_end)
8370 lod_comp->llc_extent.e_end;
8373 rc = lod_prepare_resync(env, lo, &extent);
8377 /* mirror write, try to init its all components */
8378 rc = lod_prepare_resync_mirror(env, lo,
8379 mlc->mlc_mirror_id);
8384 /* change the file state to SYNC_PENDING */
8385 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
8388 /* Reset the layout version once it's becoming too large.
8389 * This way it can make sure that the layout version is
8390 * monotonously increased in this writing era. */
8391 lod_obj_inc_layout_gen(lo);
8393 rc = lod_declare_instantiate_components(env, lo, th, 0);
8397 layout_attr->la_valid = LA_LAYOUT_VERSION;
8398 layout_attr->la_layout_version = 0;
8399 if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8400 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8401 rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8407 lod_striping_free(env, lo);
/*
 * Declare a layout change for a mirrored file in the LCM_FL_WRITE_PENDING
 * state.
 *
 * Asserts the state and that the operation is MD_LAYOUT_WRITE or
 * MD_LAYOUT_RESYNC.  The primary mirror is searched for among the mirrors
 * by testing lme_stale, lme_prefer and lme_hsm, with a second pass over the
 * in-sync mirrors when no preferred one is found; failing to find one is
 * reported with CERROR and -ENODATA.  The chosen primary is asserted not to
 * be stale.
 *
 * The per-operation steps are described in the comment inside the function.
 * Both paths end by declaring the instantiation of the components recorded
 * in info->lti_comp_idx, bumping the layout generation and declaring an
 * attribute update carrying LA_LAYOUT_VERSION (la_layout_version is
 * LU_LAYOUT_RESYNC for resync, 0 otherwise).
 *
 * NOTE(review): in the MD_LAYOUT_WRITE branch layout->lai_opc is read in
 * the initializer of "write" before LASSERT(layout != NULL) is evaluated,
 * so the assertion cannot catch a NULL intent.
 */
8411 static int lod_declare_update_write_pending(const struct lu_env *env,
8412 struct lod_object *lo, struct md_layout_change *mlc,
8415 struct lod_thread_info *info = lod_env_info(env);
8416 struct lu_attr *layout_attr = &info->lti_layout_attr;
8417 struct lod_layout_component *lod_comp;
8418 struct lu_extent extent = { 0 };
8424 LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
8425 LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
8426 mlc->mlc_opc == MD_LAYOUT_RESYNC);
8428 /* look for the first preferred mirror */
8429 for (i = 0; i < lo->ldo_mirror_count; i++) {
8430 if (lo->ldo_mirrors[i].lme_stale)
8432 if (lo->ldo_mirrors[i].lme_prefer == 0)
8434 if (lo->ldo_mirrors[i].lme_hsm)
8441 /* no primary, use any in-sync */
8442 for (i = 0; i < lo->ldo_mirror_count; i++) {
8443 if (lo->ldo_mirrors[i].lme_stale)
8449 CERROR(DFID ": doesn't have a primary mirror\n",
8450 PFID(lod_object_fid(lo)));
8451 GOTO(out, rc = -ENODATA);
8455 CDEBUG(D_LAYOUT, DFID": found primary %u\n",
8456 PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
8458 LASSERT(!lo->ldo_mirrors[primary].lme_stale);
8460 /* for LAYOUT_WRITE opc, it has to do the following operations:
8461 * 1. stale overlapping components from stale mirrors;
8462 * 2. instantiate components of the primary mirror;
8463 * 3. transfer layout version to all objects of the primary;
8465 * for LAYOUT_RESYNC opc, it will do:
8466 * 1. instantiate components of all stale mirrors;
8467 * 2. transfer layout version to all objects to close write era. */
8469 if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8470 struct layout_intent *layout = mlc->mlc_intent;
8471 int write = layout->lai_opc == LAYOUT_INTENT_WRITE;
8473 LASSERT(layout != NULL);
8475 extent = layout->lai_extent;
8477 CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
8478 PFID(lod_object_fid(lo)), PEXT(&extent));
8480 /* 1. Update extents of primary before staling */
8481 rc = lod_declare_update_extents(env, lo, &extent, th, primary,
8486 if (layout->lai_opc == LAYOUT_INTENT_TRUNC) {
8488 * trunc transfers [0, size) in the intent extent, we'd
8489 * stale components overlapping [size, eof).
8491 extent.e_start = extent.e_end;
8492 extent.e_end = OBD_OBJECT_EOF;
8495 /* 2. stale overlapping components */
8496 rc = lod_stale_components(env, lo, primary, &extent, th);
8500 /* 3. find the components which need instantiating.
8501 * instantiate [0, mlc->mlc_intent->e_end) */
8503 /* restore truncate intent extent */
8504 if (layout->lai_opc == LAYOUT_INTENT_TRUNC)
8505 extent.e_end = extent.e_start;
8508 lod_foreach_mirror_comp(lod_comp, lo, primary) {
8509 if (!lu_extent_is_overlapped(&extent,
8510 &lod_comp->llc_extent))
8513 if (!lod_is_instantiation_needed(lod_comp))
8516 CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
8517 primary, lod_comp_index(lo, lod_comp));
8518 info->lti_comp_idx[info->lti_count++] =
8519 lod_comp_index(lo, lod_comp);
8521 } else { /* MD_LAYOUT_RESYNC */
8522 if (mlc->mlc_mirror_id == 0) {
8524 lod_foreach_mirror_comp(lod_comp, lo, primary) {
8525 if (!lod_comp_inited(lod_comp))
8528 extent.e_end = lod_comp->llc_extent.e_end;
8531 rc = lod_prepare_resync(env, lo, &extent);
8535 /* mirror write, try to init its all components */
8536 rc = lod_prepare_resync_mirror(env, lo,
8537 mlc->mlc_mirror_id);
8542 /* change the file state to SYNC_PENDING */
8543 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
8546 rc = lod_declare_instantiate_components(env, lo, th, 0);
8550 lod_obj_inc_layout_gen(lo);
8552 /* 3. transfer layout version to OST objects.
8553 * transfer new layout version to OST objects so that stale writes
8554 * can be denied. It also ends an era of writing by setting
8555 * LU_LAYOUT_RESYNC. Normal client can never use this bit to
8556 * send write RPC; only resync RPCs could do it. */
8557 layout_attr->la_valid = LA_LAYOUT_VERSION;
8558 layout_attr->la_layout_version = 0;
8559 if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8560 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8561 rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8566 lod_striping_free(env, lo);
/*
 * Declare a layout change for a mirrored file in the LCM_FL_SYNC_PENDING
 * state.
 *
 * Asserts the state and that the operation is MD_LAYOUT_RESYNC_DONE or
 * MD_LAYOUT_WRITE.
 *
 * - MD_LAYOUT_WRITE (a write racing with the resync): the state is switched
 *   to LCM_FL_WRITE_PENDING and the request is handed to
 *   lod_declare_update_write_pending().
 * - MD_LAYOUT_RESYNC_DONE: every component is tested for LCME_FL_STALE.  A
 *   component whose llc_id matches an entry of mlc->mlc_resync_ids has the
 *   flag cleared, the matching entry is overwritten with LCME_ID_INVAL and
 *   resync_components is incremented.  Any entry of mlc_resync_ids that is
 *   still different from LCME_ID_INVAL afterwards makes the call fail with
 *   -EINVAL, as does having no sync_components, or having resync ids but no
 *   resynced component.  On success the state returns to LCM_FL_RDONLY, the
 *   layout generation is bumped, and both an attribute update
 *   (LA_LAYOUT_VERSION, version 0) and the LOV xattr update are declared.
 */
8570 static int lod_declare_update_sync_pending(const struct lu_env *env,
8571 struct lod_object *lo, struct md_layout_change *mlc,
8574 struct lod_thread_info *info = lod_env_info(env);
8575 struct lu_attr *layout_attr = &info->lti_layout_attr;
8576 unsigned sync_components = 0;
8577 unsigned resync_components = 0;
8582 LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
8583 LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
8584 mlc->mlc_opc == MD_LAYOUT_WRITE);
8586 CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
8587 PFID(lod_object_fid(lo)), mlc->mlc_opc);
8589 if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8590 CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
8591 PFID(lod_object_fid(lo)));
8593 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
8594 return lod_declare_update_write_pending(env, lo, mlc, th);
8597 /* MD_LAYOUT_RESYNC_DONE */
8599 for (i = 0; i < lo->ldo_comp_cnt; i++) {
8600 struct lod_layout_component *lod_comp;
8603 lod_comp = &lo->ldo_comp_entries[i];
8605 if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
8610 for (j = 0; j < mlc->mlc_resync_count; j++) {
8611 if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
8614 mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
8615 lod_comp->llc_flags &= ~LCME_FL_STALE;
8616 resync_components++;
8622 for (i = 0; i < mlc->mlc_resync_count; i++) {
8623 if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
8626 CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zd) not exist "
8627 "or already synced\n", PFID(lod_object_fid(lo)),
8628 mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
8629 GOTO(out, rc = -EINVAL);
8632 if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
8633 CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
8634 PFID(lod_object_fid(lo)));
8636 /* tend to return an error code here to prevent
8637 * the MDT from setting SoM attribute */
8638 GOTO(out, rc = -EINVAL);
8641 CDEBUG(D_LAYOUT, DFID": synced %u resynced %u/%zu components\n",
8642 PFID(lod_object_fid(lo)),
8643 sync_components, resync_components, mlc->mlc_resync_count);
8645 lo->ldo_flr_state = LCM_FL_RDONLY;
8646 lod_obj_inc_layout_gen(lo);
8648 layout_attr->la_valid = LA_LAYOUT_VERSION;
8649 layout_attr->la_layout_version = 0;
8650 rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8654 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
8655 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8656 &info->lti_buf, XATTR_NAME_LOV, 0, th);
8661 lod_striping_free(env, lo);
/*
 * Signature shared by the directory layout-change handlers that are stored
 * in dir_mlc_declare_ops[] and dir_mlc_ops[] and selected by mlc->mlc_opc.
 */
8665 typedef int (*mlc_handler)(const struct lu_env *env, struct dt_object *dt,
8666 const struct md_layout_change *mlc,
8667 struct thandle *th);
8670 * Attach stripes after the target's own stripes for a migrating directory. NB, we
8671 * only need to declare this; the actual work is done inside
8672 * lod_xattr_set_lmv().
8674 * \param[in] env execution environment
8675 * \param[in] dt target object
8676 * \param[in] mlc layout change data
8677 * \param[in] th transaction handle
8679 * \retval 0 on success
8680 * \retval negative if failed
8682 static int lod_dir_declare_layout_attach(const struct lu_env *env,
8683 struct dt_object *dt,
8684 const struct md_layout_change *mlc,
8687 struct lod_thread_info *info = lod_env_info(env);
8688 struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
8689 struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
8690 struct lod_object *lo = lod_dt_obj(dt);
8691 struct dt_object *next = dt_object_child(dt);
8692 struct dt_object_format *dof = &info->lti_format;
8693 struct lmv_mds_md_v1 *lmv = mlc->mlc_buf.lb_buf;
8694 struct dt_object **stripes;
8695 __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
8696 struct lu_fid *fid = &info->lti_fid;
8697 struct lod_tgt_desc *tgt;
8698 struct dt_object *dto;
8699 struct dt_device *tgt_dt;
8700 int type = LU_SEQ_RANGE_ANY;
8701 struct dt_insert_rec *rec = &info->lti_dt_rec;
8702 char *stripe_name = info->lti_key;
8703 struct lu_name *sname;
8704 struct linkea_data ldata = { NULL };
8705 struct lu_buf linkea_buf;
/*
 * NOTE(review): stripe_count was already read from lmv in the declarations
 * above, i.e. before this sanity check runs - confirm that mlc_buf.lb_buf
 * can never be NULL on this path.
 */
8712 if (!lmv_is_sane(lmv))
8715 if (!dt_try_as_dir(env, dt, false))
8718 dof->dof_type = DFT_DIR;
/* the new array holds the existing stripes followed by the attached ones */
8720 OBD_ALLOC_PTR_ARRAY(stripes, (lo->ldo_dir_stripe_count + stripe_count));
8724 for (i = 0; i < lo->ldo_dir_stripe_count; i++)
8725 stripes[i] = lo->ldo_stripe[i];
8727 rec->rec_type = S_IFDIR;
8729 for (i = 0; i < stripe_count; i++) {
8731 &lmv->lmv_stripe_fids[i]);
8732 if (!fid_is_sane(fid))
8735 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
/* FID served by this node: use the local child device, else the MDT target */
8739 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
8740 tgt_dt = lod->lod_child;
8742 tgt = LTD_TGT(ltd, idx);
8744 GOTO(out, rc = -ESTALE);
8745 tgt_dt = tgt->ltd_tgt;
8748 dto = dt_locate_at(env, tgt_dt, fid,
8749 lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
8752 GOTO(out, rc = PTR_ERR(dto));
/* attached stripes are stored after the stripes that already exist */
8754 stripes[i + lo->ldo_dir_stripe_count] = dto;
8756 if (!dt_try_as_dir(env, dto, true))
8757 GOTO(out, rc = -ENOTDIR);
8759 rc = lod_sub_declare_ref_add(env, dto, th);
/* declare the "." and ".." entries of the attached stripe */
8763 rec->rec_fid = lu_object_fid(&dto->do_lu);
8764 rc = lod_sub_declare_insert(env, dto,
8765 (const struct dt_rec *)rec,
8766 (const struct dt_key *)dot, th);
8770 rc = lod_sub_declare_insert(env, dto,
8771 (const struct dt_rec *)rec,
8772 (const struct dt_key *)dotdot, th);
8776 rc = lod_sub_declare_xattr_set(env, dto, &mlc->mlc_buf,
8777 XATTR_NAME_LMV, 0, th);
/* name of the stripe entry in the master: "<FID>:<index in merged array>" */
8781 snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
8782 PFID(lu_object_fid(&dto->do_lu)),
8783 i + lo->ldo_dir_stripe_count);
/* link EA of the stripe points back to the master under that name */
8785 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
8786 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
8787 sname, lu_object_fid(&dt->do_lu));
8791 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
8792 linkea_buf.lb_len = ldata.ld_leh->leh_len;
8793 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
8794 XATTR_NAME_LINK, 0, th);
8798 rc = lod_sub_declare_insert(env, next,
8799 (const struct dt_rec *)rec,
8800 (const struct dt_key *)stripe_name,
8805 rc = lod_sub_declare_ref_add(env, next, th);
/*
 * Replace the old stripe array with the merged one; the previous stripe
 * count becomes the migrate offset and the hash type from the attached
 * LMV becomes the migrate hash.
 */
8811 OBD_FREE_PTR_ARRAY(lo->ldo_stripe,
8812 lo->ldo_dir_stripes_allocated);
8813 lo->ldo_stripe = stripes;
8814 lo->ldo_is_foreign = 0;
8815 lo->ldo_dir_migrate_offset = lo->ldo_dir_stripe_count;
8816 lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_hash_type);
8817 lo->ldo_dir_stripe_count += stripe_count;
8818 lo->ldo_dir_layout_version++;
8819 lo->ldo_dir_stripes_allocated += stripe_count;
8821 /* plain directory split creates target as a plain directory, while
8822 * after source attached as the first stripe, it becomes a striped
8823 * directory, set correct do_index_ops, otherwise it can't be unlinked.
8825 dt->do_index_ops = &lod_striped_index_ops;
8829 i = lo->ldo_dir_stripe_count;
8830 while (i < lo->ldo_dir_stripe_count + stripe_count && stripes[i])
8831 dt_object_put(env, stripes[i++]);
8833 OBD_FREE_PTR_ARRAY(stripes, stripe_count + lo->ldo_dir_stripe_count);
/**
 * Declare detaching the stripes from a directory master object.
 *
 * With no stripes (plain directory) only the deletion of ".." from the
 * master's child object is declared. Otherwise, for each stripe, the
 * deletion of its ".." entry, the deletion of the "<FID>:<index>" name
 * entry from the master and a reference drop on the master are declared.
 * The layout change argument is not used.
 *
 * \param[in] env	execution environment
 * \param[in] dt	directory master object
 * \param[in] unused	layout change data (ignored)
 * \param[in] th	transaction handle
 *
 * \retval		result of the declare calls
 */
8837 static int lod_dir_declare_layout_detach(const struct lu_env *env,
8838 struct dt_object *dt,
8839 const struct md_layout_change *unused,
8842 struct lod_thread_info *info = lod_env_info(env);
8843 struct lod_object *lo = lod_dt_obj(dt);
8844 struct dt_object *next = dt_object_child(dt);
8845 char *stripe_name = info->lti_key;
8846 struct dt_object *dto;
8850 if (!dt_try_as_dir(env, dt, true))
/* plain directory: only ".." has to go */
8853 if (!lo->ldo_dir_stripe_count)
8854 return lod_sub_declare_delete(env, next,
8855 (const struct dt_key *)dotdot, th);
8857 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8858 dto = lo->ldo_stripe[i];
8862 if (!dt_try_as_dir(env, dto, true))
8865 rc = lod_sub_declare_delete(env, dto,
8866 (const struct dt_key *)dotdot, th);
/* the stripe is registered in the master under "<FID>:<index>" */
8870 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8871 PFID(lu_object_fid(&dto->do_lu)), i);
8873 rc = lod_sub_declare_delete(env, next,
8874 (const struct dt_key *)stripe_name, th);
8878 rc = lod_sub_declare_ref_del(env, next, th);
/**
 * Probe directory \a obj through its index iterator.
 *
 * The iterator is created with LUDA_64BITHASH, positioned with get() on the
 * empty key and then advanced with next() at most three times, stopping at
 * the first non-zero return. It is released with fini(). If init() fails,
 * its error is returned via PTR_ERR().
 *
 * NOTE(review): presumably the first entries are "." and "..", so the loop
 * checks whether any further entry exists - confirm how the final rc of
 * next() is mapped to the return value.
 *
 * \param[in] env	execution environment
 * \param[in] obj	directory object to probe
 */
8886 static int dt_dir_is_empty(const struct lu_env *env,
8887 struct dt_object *obj)
8890 const struct dt_it_ops *iops;
8895 if (!dt_try_as_dir(env, obj, true))
8898 iops = &obj->do_index_ops->dio_it;
8899 it = iops->init(env, obj, LUDA_64BITHASH);
8901 RETURN(PTR_ERR(it));
/* start iterating from the smallest key */
8903 rc = iops->get(env, it, (const struct dt_key *)"");
8907 for (rc = 0, i = 0; rc == 0 && i < 3; ++i)
8908 rc = iops->next(env, it);
8914 /* Huh? Index contains no zero key? */
8919 iops->fini(env, it);
/**
 * Declare shrinking a striped directory to the stripe count carried in the
 * lmv_user_md of \a mlc.
 *
 * The requested count must be non-zero and smaller than the current stripe
 * count (asserted), and the directory must be striped (asserted). For
 * stripes with an index below the final count an LMV xattr replacement is
 * declared. The remaining declarations in the loop (emptiness check,
 * reference drop and destroy of the stripe, deletion of its
 * "<FID>:<index>" name from the master and a reference drop on the master)
 * presumably apply only to stripes at or beyond the final count - TODO
 * confirm. Finally an LMV xattr replacement on the master's child object is
 * declared.
 *
 * \param[in] env	execution environment
 * \param[in] dt	directory master object
 * \param[in] mlc	layout change data, mlc_buf holds a struct lmv_user_md
 * \param[in] th	transaction handle
 *
 * \retval		result of the declare calls
 */
8924 static int lod_dir_declare_layout_shrink(const struct lu_env *env,
8925 struct dt_object *dt,
8926 const struct md_layout_change *mlc,
8929 struct lod_thread_info *info = lod_env_info(env);
8930 struct lod_object *lo = lod_dt_obj(dt);
8931 struct dt_object *next = dt_object_child(dt);
8932 struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
8933 char *stripe_name = info->lti_key;
8934 struct lu_buf *lmv_buf = &info->lti_buf;
8935 __u32 final_stripe_count;
8936 struct dt_object *dto;
8942 if (!dt_try_as_dir(env, dt, true))
8945 /* shouldn't be called on plain directory */
8946 LASSERT(lo->ldo_dir_stripe_count);
/* only the buffer location and size matter for the declarations below */
8948 lmv_buf->lb_buf = &info->lti_lmv.lmv_md_v1;
8949 lmv_buf->lb_len = sizeof(info->lti_lmv.lmv_md_v1);
8951 final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
8952 LASSERT(final_stripe_count &&
8953 final_stripe_count < lo->ldo_dir_stripe_count);
8955 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8956 dto = lo->ldo_stripe[i];
/* stripes that are kept get their LMV xattr replaced */
8960 if (i < final_stripe_count) {
8961 rc = lod_sub_declare_xattr_set(env, dto, lmv_buf,
8963 LU_XATTR_REPLACE, th);
8970 rc = dt_dir_is_empty(env, dto);
8974 rc = lod_sub_declare_ref_del(env, dto, th);
8978 rc = lod_sub_declare_destroy(env, dto, th);
8982 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8983 PFID(lu_object_fid(&dto->do_lu)), i);
8985 rc = lod_sub_declare_delete(env, next,
8986 (const struct dt_key *)stripe_name, th);
8990 rc = lod_sub_declare_ref_del(env, next, th);
/* the master's own LMV xattr is replaced as well */
8995 rc = lod_sub_declare_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
8996 LU_XATTR_REPLACE, th);
9001 * Allocate stripes for split directory.
9003 * \param[in] env execution environment
9004 * \param[in] dt target object
9005 * \param[in] mlc layout change data
9006 * \param[in] th transaction handle
9008 * \retval 0 on success
9009 * \retval negative if failed
9011 static int lod_dir_declare_layout_split(const struct lu_env *env,
9012 struct dt_object *dt,
9013 const struct md_layout_change *mlc,
9016 struct lod_thread_info *info = lod_env_info(env);
9017 struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
9018 struct lod_object *lo = lod_dt_obj(dt);
9019 struct dt_object_format *dof = &info->lti_format;
9020 struct lmv_user_md_v1 *lum = mlc->mlc_spec->u.sp_ea.eadata;
9021 struct dt_object **stripes;
/* the local MDT counts as one target in addition to the remote ones */
9022 int mdt_count = lod->lod_remote_mdt_count + 1;
9030 LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC);
9031 LASSERT(le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT);
/* existing slots are kept; new stripes are allocated after them */
9033 saved_count = lo->ldo_dir_stripes_allocated;
9034 stripe_count = le32_to_cpu(lum->lum_stripe_count);
9036 /* if the split target is overstriped, we need to put that flag in the
9037 * current layout so it can allocate the larger number of stripes
9039 * Note we need to pick up any hash *flags* which affect allocation
9040 * *before* allocation, so they're used in allocating the directory,
9041 * rather than after when we finalize directory setup (at the end of
9044 if (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_FLAG_OVERSTRIPED) {
9045 /* silently clamp stripe count if it exceeds limit */
9046 if (stripe_count > mdt_count * lod->lod_max_stripes_per_mdt)
9047 stripe_count = mdt_count * lod->lod_max_stripes_per_mdt;
9048 if (stripe_count > mdt_count)
9049 lo->ldo_dir_hash_type |= LMV_HASH_FLAG_OVERSTRIPED;
9050 } else if (stripe_count > mdt_count) {
9054 if (stripe_count <= saved_count)
9057 dof->dof_type = DFT_DIR;
9059 OBD_ALLOC(stripes, sizeof(*stripes) * stripe_count);
/* carry the existing stripe pointers over to the new array */
9063 for (i = 0; i < lo->ldo_dir_stripes_allocated; i++)
9064 stripes[i] = lo->ldo_stripe[i];
9066 lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
/*
 * QoS allocation is tried first; round-robin allocation presumably serves
 * as the fallback - TODO confirm the condition that selects it.
 */
9068 rc = lod_mdt_alloc_qos(env, lo, stripes, saved_count, stripe_count);
9070 rc = lod_mdt_alloc_rr(env, lo, stripes, saved_count,
9073 OBD_FREE(stripes, sizeof(*stripes) * stripe_count);
/* rc is used as the new stripe count below, so it must exceed the old one */
9077 LASSERT(rc > saved_count);
9078 OBD_FREE(lo->ldo_stripe,
9079 sizeof(*stripes) * lo->ldo_dir_stripes_allocated);
9080 lo->ldo_stripe = stripes;
9081 lo->ldo_is_foreign = 0;
9082 lo->ldo_dir_striped = 1;
9083 lo->ldo_dir_stripe_count = rc;
9084 lo->ldo_dir_stripes_allocated = stripe_count;
/* remember the previous hash type before installing the requested one */
9085 lo->ldo_dir_split_hash = lo->ldo_dir_hash_type;
9086 lo->ldo_dir_hash_type = le32_to_cpu(lum->lum_hash_type);
/* unknown hash type: fall back to the pattern of the MDT descriptor */
9087 if (!lmv_is_known_hash_type(lo->ldo_dir_hash_type))
9088 lo->ldo_dir_hash_type =
9089 lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
9090 lo->ldo_dir_hash_type |= LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MIGRATION;
/* stripes from this offset on are the newly allocated ones */
9091 lo->ldo_dir_split_offset = saved_count;
9092 lo->ldo_dir_layout_version++;
9093 lo->ldo_dir_stripe_loaded = 1;
9095 rc = lod_dir_declare_create_stripes(env, dt, mlc->mlc_attr, dof, th);
9097 lod_striping_free(env, lo);
9103 * Detach all stripes from the dir master object. NB, stripes are not destroyed, but
9104 * deleted from their parent namespace. This function is called in two places:
9105 * 1. mdd_migrate_mdt() detaches stripes from the source, and attaches them to the target.
9107 * 2. mdd_dir_layout_update() detaches the stripe before turning a 1-stripe directory into
9108 * a plain directory.
9110 * \param[in] env execution environment
9111 * \param[in] dt target object
9112 * \param[in] mlc layout change data
9113 * \param[in] th transaction handle
9115 * \retval 0 on success
9116 * \retval negative if failed
9118 static int lod_dir_layout_detach(const struct lu_env *env,
9119 struct dt_object *dt,
9120 const struct md_layout_change *mlc,
9123 struct lod_thread_info *info = lod_env_info(env);
9124 struct lod_object *lo = lod_dt_obj(dt);
9125 struct dt_object *next = dt_object_child(dt);
9126 char *stripe_name = info->lti_key;
9127 struct dt_object *dto;
9133 if (!lo->ldo_dir_stripe_count) {
9134 /* plain directory delete .. */
9135 rc = lod_sub_delete(env, next,
9136 (const struct dt_key *)dotdot, th);
/* striped directory: unlink every stripe from the master */
9140 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9141 dto = lo->ldo_stripe[i];
9145 rc = lod_sub_delete(env, dto,
9146 (const struct dt_key *)dotdot, th);
/* same "<FID>:<index>" name that was declared for deletion */
9150 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
9151 PFID(lu_object_fid(&dto->do_lu)), i);
9153 rc = lod_sub_delete(env, next,
9154 (const struct dt_key *)stripe_name, th);
9158 rc = lod_sub_ref_del(env, next, th);
/* drop the in-memory references and forget the stripe array */
9163 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9164 dto = lo->ldo_stripe[i];
9166 dt_object_put(env, dto);
9168 OBD_FREE_PTR_ARRAY(lo->ldo_stripe, lo->ldo_dir_stripes_allocated);
9169 lo->ldo_stripe = NULL;
9170 lo->ldo_dir_stripes_allocated = 0;
9171 lo->ldo_dir_stripe_count = 0;
/* no stripes remain, so the non-striped index operations apply again */
9172 dt->do_index_ops = &lod_index_ops;
/**
 * Execute the shrink of a striped directory to the stripe count carried in
 * the lmv_user_md of \a mlc.
 *
 * A new LMV EA (reduced stripe count, layout version + 1, migrate fields
 * zeroed) is built in the thread info. It is written with LMV_MAGIC_STRIPE
 * and the stripe's own MDT index to the stripes below the final count, and
 * with LMV_MAGIC_V1 and the master's MDT index to the master's child object.
 * The loop also drops a reference on a stripe under its write lock, destroys
 * it, deletes its "<FID>:<index>" name from the master and drops a master
 * reference; presumably only for stripes at or beyond the final count - TODO
 * confirm. At the end the references on the stripes past the final count are
 * put and ldo_dir_stripe_count is lowered.
 *
 * \param[in] env	execution environment
 * \param[in] dt	directory master object
 * \param[in] mlc	layout change data, mlc_buf holds a struct lmv_user_md
 * \param[in] th	transaction handle
 *
 * \retval		result of the sub-operations
 */
9177 static int lod_dir_layout_shrink(const struct lu_env *env,
9178 struct dt_object *dt,
9179 const struct md_layout_change *mlc,
9182 struct lod_thread_info *info = lod_env_info(env);
9183 struct lod_object *lo = lod_dt_obj(dt);
9184 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
9185 struct dt_object *next = dt_object_child(dt);
9186 struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
9187 __u32 final_stripe_count;
9188 char *stripe_name = info->lti_key;
9189 struct dt_object *dto;
9190 struct lu_buf *lmv_buf = &info->lti_buf;
9191 struct lmv_mds_md_v1 *lmv = &info->lti_lmv.lmv_md_v1;
9193 int type = LU_SEQ_RANGE_ANY;
9199 final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
/* build the EA once; magic and master MDT index are adjusted per target */
9201 lmv_buf->lb_buf = lmv;
9202 lmv_buf->lb_len = sizeof(*lmv);
9203 lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
9204 lmv->lmv_stripe_count = cpu_to_le32(final_stripe_count);
/* only the hash type bits and the FIXED flag are carried over */
9205 lmv->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type) &
9206 cpu_to_le32(LMV_HASH_TYPE_MASK |
9207 LMV_HASH_FLAG_FIXED);
9208 lmv->lmv_layout_version =
9209 cpu_to_le32(lo->ldo_dir_layout_version + 1);
9210 lmv->lmv_migrate_offset = 0;
9211 lmv->lmv_migrate_hash = 0;
9213 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9214 dto = lo->ldo_stripe[i];
/* kept stripe: rewrite its LMV EA with its own MDT index */
9218 if (i < final_stripe_count) {
9219 rc = lod_fld_lookup(env, lod,
9220 lu_object_fid(&dto->do_lu),
9225 lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
9226 rc = lod_sub_xattr_set(env, dto, lmv_buf,
9228 LU_XATTR_REPLACE, th);
/* the reference drop on the stripe is done under its write lock */
9235 dt_write_lock(env, dto, DT_TGT_CHILD);
9236 rc = lod_sub_ref_del(env, dto, th);
9237 dt_write_unlock(env, dto);
9241 rc = lod_sub_destroy(env, dto, th);
9245 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
9246 PFID(lu_object_fid(&dto->do_lu)), i);
9248 rc = lod_sub_delete(env, next,
9249 (const struct dt_key *)stripe_name, th);
9253 rc = lod_sub_ref_del(env, next, th);
/* master: same EA, but with LMV_MAGIC_V1 and the master's MDT index */
9258 rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &mdtidx,
9263 lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_V1);
9264 lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
9265 rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
9266 LU_XATTR_REPLACE, th);
/* release the in-memory references on the stripes past the new count */
9270 for (i = final_stripe_count; i < lo->ldo_dir_stripe_count; i++) {
9271 dto = lo->ldo_stripe[i];
9273 dt_object_put(env, dto);
9275 lo->ldo_dir_stripe_count = final_stripe_count;
/*
 * Declare-phase handlers for directory layout changes, indexed by
 * mlc->mlc_opc. lod_declare_layout_change() asserts that the selected slot
 * is set before calling it.
 */
9280 static mlc_handler dir_mlc_declare_ops[MD_LAYOUT_MAX] = {
9281 [MD_LAYOUT_ATTACH] = lod_dir_declare_layout_attach,
9282 [MD_LAYOUT_DETACH] = lod_dir_declare_layout_detach,
9283 [MD_LAYOUT_SHRINK] = lod_dir_declare_layout_shrink,
9284 [MD_LAYOUT_SPLIT] = lod_dir_declare_layout_split,
/*
 * Execute-phase handlers for directory layout changes, indexed by
 * mlc->mlc_opc. Only DETACH and SHRINK have an entry; lod_layout_change()
 * asserts that the selected slot is set, so other opcodes must not reach it
 * for directories.
 */
9287 static mlc_handler dir_mlc_ops[MD_LAYOUT_MAX] = {
9288 [MD_LAYOUT_DETACH] = lod_dir_layout_detach,
9289 [MD_LAYOUT_SHRINK] = lod_dir_layout_shrink,
/**
 * Implementation of dt_object_operations::do_declare_layout_change.
 *
 * Directories are dispatched through dir_mlc_declare_ops[] by mlc->mlc_opc.
 * For other objects, a MD_LAYOUT_WRITE whose intent is
 * LAYOUT_INTENT_PCCRO_SET or LAYOUT_INTENT_PCCRO_CLEAR is handled by
 * lod_declare_update_pccro(). Otherwise the striping is loaded and the
 * request is dispatched on the current FLR state (lo->ldo_flr_state) to one
 * of the lod_declare_update_*() helpers, after which
 * lod_save_layout_gen_intrans() is called.
 *
 * \param[in] env	execution environment
 * \param[in] dt	object whose layout is to change
 * \param[in] mlc	layout change request
 * \param[in] th	transaction handle
 *
 * \retval		result of the selected handler
 */
9292 static int lod_declare_layout_change(const struct lu_env *env,
9293 struct dt_object *dt, struct md_layout_change *mlc,
9296 struct lod_thread_info *info = lod_env_info(env);
9297 struct lod_object *lo = lod_dt_obj(dt);
9302 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
9303 LASSERT(dir_mlc_declare_ops[mlc->mlc_opc]);
9304 rc = dir_mlc_declare_ops[mlc->mlc_opc](env, dt, mlc, th);
/* presumably the code below is for existing, local regular files only */
9308 if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
9309 dt_object_remote(dt_object_child(dt)))
9312 if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
9313 struct layout_intent *intent = mlc->mlc_intent;
/* PCC-RO set/clear intents have their own declare helper */
9315 if (intent->lai_opc == LAYOUT_INTENT_PCCRO_SET ||
9316 intent->lai_opc == LAYOUT_INTENT_PCCRO_CLEAR) {
9317 if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
9320 rc = lod_declare_update_pccro(env, dt, mlc, th);
9325 rc = lod_striping_load(env, lo);
9329 LASSERT(lo->ldo_comp_cnt > 0);
9331 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
/* the handler depends on the current FLR state of the layout */
9335 switch (lo->ldo_flr_state) {
9337 rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
9341 rc = lod_declare_update_rdonly(env, lo, mlc, th);
9343 case LCM_FL_WRITE_PENDING:
9344 rc = lod_declare_update_write_pending(env, lo, mlc, th);
9346 case LCM_FL_SYNC_PENDING:
9347 rc = lod_declare_update_sync_pending(env, lo, mlc, th);
9354 rc = lod_save_layout_gen_intrans(info, lo);
9361 * Instantiate layout component objects that cover the intent write offset.
9363 static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
9364 struct md_layout_change *mlc, struct thandle *th)
9366 struct lod_thread_info *info = lod_env_info(env);
9367 struct lu_attr *attr = &lod_env_info(env)->lti_attr;
9368 struct lu_attr *layout_attr = &info->lti_layout_attr;
9369 struct lod_object *lo = lod_dt_obj(dt);
/* directories use the execute-phase table indexed by the opcode */
9374 if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
9375 LASSERT(dir_mlc_ops[mlc->mlc_opc]);
9376 rc = dir_mlc_ops[mlc->mlc_opc](env, dt, mlc, th);
/*
 * NOTE(review): presumably compares the layout generation against the one
 * recorded by lod_save_layout_gen_intrans() at declare time; a positive rc
 * is used as a 1-based index into info->lti_gen[] in the message below.
 */
9380 rc = lod_check_layout_gen_intrans(info, lo);
9383 "%s: obj "DFID" gen changed from %d to %d in transaction, retry the transaction \n",
9384 dt->do_lu.lo_dev->ld_obd->obd_name,
9385 PFID(lu_object_fid(&dt->do_lu)),
9386 info->lti_gen[rc - 1], lo->ldo_layout_gen);
9390 rc = lod_striped_create(env, dt, attr, NULL, th);
/*
 * The declare phase left either 0 or a flag such as LU_LAYOUT_RESYNC in
 * la_layout_version; the current layout generation is OR-ed in before the
 * attribute is set.
 */
9391 if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
9392 layout_attr->la_layout_version |= lo->ldo_layout_gen;
9393 rc = lod_attr_set(env, dt, layout_attr, th);
/*
 * dt_object_operations method table of the LOD layer; every slot points to
 * the lod_*() implementation of the corresponding OSD API method.
 */
9399 const struct dt_object_operations lod_obj_ops = {
9400 .do_read_lock = lod_read_lock,
9401 .do_write_lock = lod_write_lock,
9402 .do_read_unlock = lod_read_unlock,
9403 .do_write_unlock = lod_write_unlock,
9404 .do_write_locked = lod_write_locked,
9405 .do_attr_get = lod_attr_get,
9406 .do_declare_attr_set = lod_declare_attr_set,
9407 .do_attr_set = lod_attr_set,
9408 .do_xattr_get = lod_xattr_get,
9409 .do_declare_xattr_set = lod_declare_xattr_set,
9410 .do_xattr_set = lod_xattr_set,
9411 .do_declare_xattr_del = lod_declare_xattr_del,
9412 .do_xattr_del = lod_xattr_del,
9413 .do_xattr_list = lod_xattr_list,
9414 .do_ah_init = lod_ah_init,
9415 .do_declare_create = lod_declare_create,
9416 .do_create = lod_create,
9417 .do_declare_destroy = lod_declare_destroy,
9418 .do_destroy = lod_destroy,
9419 .do_index_try = lod_index_try,
9420 .do_declare_ref_add = lod_declare_ref_add,
9421 .do_ref_add = lod_ref_add,
9422 .do_declare_ref_del = lod_declare_ref_del,
9423 .do_ref_del = lod_ref_del,
9424 .do_object_sync = lod_object_sync,
9425 .do_object_lock = lod_object_lock,
9426 .do_object_unlock = lod_object_unlock,
9427 .do_invalidate = lod_invalidate,
9428 .do_declare_layout_change = lod_declare_layout_change,
9429 .do_layout_change = lod_layout_change,
9433 * Implementation of dt_body_operations::dbo_read.
9435 * \see dt_body_operations::dbo_read() in the API description for details.
9437 static ssize_t lod_read(const struct lu_env *env, struct dt_object *dt,
9438 struct lu_buf *buf, loff_t *pos)
9440 struct dt_object *next = dt_object_child(dt);
/* only regular files and symlinks are expected; the child does the read */
9442 LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9443 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9444 return next->do_body_ops->dbo_read(env, next, buf, pos);
9448 * Implementation of dt_body_operations::dbo_declare_write.
9450 * \see dt_body_operations::dbo_declare_write() in the API description
9453 static ssize_t lod_declare_write(const struct lu_env *env,
9454 struct dt_object *dt,
9455 const struct lu_buf *buf, loff_t pos,
/* forwarded unchanged to the child object; no file type check is done here */
9458 return lod_sub_declare_write(env, dt_object_child(dt), buf, pos, th);
9462 * Implementation of dt_body_operations::dbo_write.
9464 * \see dt_body_operations::dbo_write() in the API description for details.
9466 static ssize_t lod_write(const struct lu_env *env, struct dt_object *dt,
9467 const struct lu_buf *buf, loff_t *pos,
/* only regular files and symlinks are expected; the child does the write */
9470 LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9471 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9472 return lod_sub_write(env, dt_object_child(dt), buf, pos, th);
/*
 * Declare a punch of the range given by start/end on the child object.
 *
 * NOTE(review): remote objects take the early branch below instead of being
 * passed to lod_sub_declare_punch() - confirm the value returned there.
 */
9475 static int lod_declare_punch(const struct lu_env *env, struct dt_object *dt,
9476 __u64 start, __u64 end, struct thandle *th)
9478 if (dt_object_remote(dt))
9481 return lod_sub_declare_punch(env, dt_object_child(dt), start, end, th);
/*
 * Punch the range given by start/end on the child object. The object is
 * asserted to be a regular file.
 *
 * NOTE(review): remote objects take the early branch below instead of being
 * passed to lod_sub_punch() - confirm the value returned there.
 */
9484 static int lod_punch(const struct lu_env *env, struct dt_object *dt,
9485 __u64 start, __u64 end, struct thandle *th)
9487 if (dt_object_remote(dt))
9490 LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
9491 return lod_sub_punch(env, dt_object_child(dt), start, end, th);
9495 * Different types of files use the same body_ops because an object may be created
9496 * in OUT, where there is no chance to set the correct body_ops for each type, so the
9497 * body_ops themselves check the file type inside; see lod_read/write/punch for details.
/* body operations installed on every LOD object by lod_object_init() */
9500 static const struct dt_body_operations lod_body_ops = {
9501 .dbo_read = lod_read,
9502 .dbo_declare_write = lod_declare_write,
9503 .dbo_write = lod_write,
9504 .dbo_declare_punch = lod_declare_punch,
9505 .dbo_punch = lod_punch,
9509 * Implementation of lu_object_operations::loo_object_init.
9511 * The function determines the type and the index of the target device using
9512 * sequence of the object's FID. Then passes control down to the
9513 * corresponding device:
9514 * OSD for the local objects, OSP for remote
9516 * \see lu_object_operations::loo_object_init() in the API description
9519 static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
9520 const struct lu_object_conf *conf)
9522 struct lod_device *lod = lu2lod_dev(lo->lo_dev);
9523 struct lu_device *cdev = NULL;
9524 struct lu_object *cobj;
9525 struct lod_tgt_descs *ltd = NULL;
9526 struct lod_tgt_desc *tgt;
9528 int type = LU_SEQ_RANGE_ANY;
/* the FID's sequence tells which kind of target, and which index, owns it */
9532 rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
/* MDT sequence owned by this node: the child is the local device */
9536 if (type == LU_SEQ_RANGE_MDT &&
9537 idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
9538 cdev = &lod->lod_child->dd_lu_dev;
/* other MDT, or OST: the child device is looked up in the matching table */
9539 } else if (type == LU_SEQ_RANGE_MDT) {
9540 ltd = &lod->lod_mdt_descs;
9542 } else if (type == LU_SEQ_RANGE_OST) {
9543 ltd = &lod->lod_ost_descs;
/* idx must lie inside the table and be set in the target bitmap */
9550 if (ltd->ltd_tgts_size > idx &&
9551 test_bit(idx, ltd->ltd_tgt_bitmap)) {
9552 tgt = LTD_TGT(ltd, idx);
9554 LASSERT(tgt != NULL);
9555 LASSERT(tgt->ltd_tgt != NULL);
9557 cdev = &(tgt->ltd_tgt->dd_lu_dev);
9559 lod_putref(lod, ltd);
9562 if (unlikely(cdev == NULL))
/* allocate the lower-layer object on the chosen device and stack it below */
9565 cobj = cdev->ld_ops->ldo_object_alloc(env, lo->lo_header, cdev);
9566 if (unlikely(cobj == NULL))
/* one body_ops for all file types; the methods check the type themselves */
9569 lu2lod_obj(lo)->ldo_obj.do_body_ops = &lod_body_ops;
9571 lu_object_add(lo, cobj);
9578 * Release resources associated with striping.
9580 * If the object is striped (regular or directory), then release
9581 * the stripe objects references and free the ldo_stripe array.
9583 * \param[in] env execution environment
9584 * \param[in] lo object
/*
 * Variant without locking: lod_striping_free() calls this with
 * lo->ldo_layout_mutex held. Exactly one of three cases is handled:
 * a foreign layout, a striped directory (ldo_stripe set) or a file layout
 * with component entries (ldo_comp_entries set).
 */
9586 void lod_striping_free_nolock(const struct lu_env *env, struct lod_object *lo)
9588 struct lod_layout_component *lod_comp;
9589 __u32 obj_attr = lo->ldo_obj.do_lu.lo_header->loh_attr;
/* foreign layout: the helper used depends on regular file vs directory */
9592 if (unlikely(lo->ldo_is_foreign)) {
9593 if (S_ISREG(obj_attr)) {
9594 lod_free_foreign_lov(lo);
9595 lo->ldo_comp_cached = 0;
9596 } else if (S_ISDIR(obj_attr)) {
9597 lod_free_foreign_lmv(lo);
9598 lo->ldo_dir_stripe_loaded = 0;
/* striped directory: put each stripe, then free the pointer array */
9600 } else if (lo->ldo_stripe != NULL) {
9601 LASSERT(lo->ldo_comp_entries == NULL);
9602 LASSERT(lo->ldo_dir_stripes_allocated > 0);
9604 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9605 if (lo->ldo_stripe[i])
9606 dt_object_put(env, lo->ldo_stripe[i]);
/* j is reused here as the byte size of the array being freed */
9609 j = sizeof(struct dt_object *) * lo->ldo_dir_stripes_allocated;
9610 OBD_FREE(lo->ldo_stripe, j);
9611 lo->ldo_stripe = NULL;
9612 lo->ldo_dir_stripes_allocated = 0;
9613 lo->ldo_dir_stripe_loaded = 0;
9614 lo->ldo_dir_stripe_count = 0;
9615 lo->ldo_obj.do_index_ops = NULL;
/* file layout: free the per-component arrays, then the component entries */
9616 } else if (lo->ldo_comp_entries != NULL) {
9617 for (i = 0; i < lo->ldo_comp_cnt; i++) {
9618 /* free lod_layout_component::llc_stripe array */
9619 lod_comp = &lo->ldo_comp_entries[i];
9621 /* HSM layout component */
9622 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN)
9624 if (lod_comp->llc_stripe == NULL)
9626 LASSERT(lod_comp->llc_stripes_allocated != 0);
9627 for (j = 0; j < lod_comp->llc_stripes_allocated; j++) {
9628 if (lod_comp->llc_stripe[j] != NULL)
9630 &lod_comp->llc_stripe[j]->do_lu);
/* both arrays were sized by llc_stripes_allocated */
9632 OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
9633 lod_comp->llc_stripes_allocated);
9634 lod_comp->llc_stripe = NULL;
9635 OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
9636 lod_comp->llc_stripes_allocated);
9637 lod_comp->llc_ost_indices = NULL;
9638 lod_comp->llc_stripes_allocated = 0;
9640 lod_free_comp_entries(lo);
9641 lo->ldo_comp_cached = 0;
/*
 * Release the striping of \a lo: runs lod_striping_free_nolock() with
 * lo->ldo_layout_mutex held.
 */
9645 void lod_striping_free(const struct lu_env *env, struct lod_object *lo)
9647 mutex_lock(&lo->ldo_layout_mutex);
9648 lod_striping_free_nolock(env, lo);
9649 mutex_unlock(&lo->ldo_layout_mutex);
9653 * Implementation of lu_object_operations::loo_object_free.
9655 * \see lu_object_operations::loo_object_free() in the API description
/* Frees the striping state first, then returns the object to its slab cache. */
9658 static void lod_object_free(const struct lu_env *env, struct lu_object *o)
9660 struct lod_object *lo = lu2lod_obj(o);
9662 /* release all underlying object pinned */
9663 lod_striping_free(env, lo);
9665 /* lo doesn't contain a lu_object_header, so we don't need call_rcu */
9666 OBD_SLAB_FREE_PTR(lo, lod_object_kmem);
9670 * Implementation of lu_object_operations::loo_object_release.
9672 * \see lu_object_operations::loo_object_release() in the API description
/* No work is done here at present; see the open question in the body. */
9675 static void lod_object_release(const struct lu_env *env, struct lu_object *o)
9677 /* XXX: shouldn't we release everything here in case if object
9678 * creation failed before? */
9682 * Implementation of lu_object_operations::loo_object_print.
9684 * \see lu_object_operations::loo_object_print() in the API description
/* Prints LUSTRE_LOD_NAME followed by "-object@" and the object's address. */
9687 static int lod_object_print(const struct lu_env *env, void *cookie,
9688 lu_printer_t p, const struct lu_object *l)
/* the cast only drops const for lu2lod_obj(); the object is not modified */
9690 struct lod_object *o = lu2lod_obj((struct lu_object *) l);
9692 return (*p)(env, cookie, LUSTRE_LOD_NAME"-object@%p", o);
/* lu_object_operations method table of the LOD layer */
9695 const struct lu_object_operations lod_lu_obj_ops = {
9696 .loo_object_init = lod_object_init,
9697 .loo_object_free = lod_object_free,
9698 .loo_object_release = lod_object_release,
9699 .loo_object_print = lod_object_print,