Whamcloud - gitweb
LU-15720 dne: add crush2 hash type
[fs/lustre-release.git] / lustre / lod / lod_object.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright  2009 Sun Microsystems, Inc. All rights reserved
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * lustre/lod/lod_object.c
30  *
31  * This file contains implementations of methods for the OSD API
32  * for the Logical Object Device (LOD) layer, which provides a virtual
33  * local OSD object interface to the MDD layer, and abstracts the
34  * addressing of local (OSD) and remote (OSP) objects. The API is
35  * described in the file lustre/include/dt_object.h and in
36  * Documentation/osd-api.txt.
37  *
38  * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_MDS
42
43 #include <linux/random.h>
44
45 #include <obd.h>
46 #include <obd_class.h>
47 #include <obd_support.h>
48
49 #include <lustre_fid.h>
50 #include <lustre_linkea.h>
51 #include <lustre_lmv.h>
52 #include <uapi/linux/lustre/lustre_param.h>
53 #include <lustre_swab.h>
54 #include <uapi/linux/lustre/lustre_ver.h>
55 #include <lprocfs_status.h>
56 #include <md_object.h>
57
58 #include "lod_internal.h"
59
/* Entry name that lod_striped_lookup() answers with the object's own FID. */
static const char dot[] = ".";
/* Entry name that lod_striped_lookup() resolves through the child object. */
static const char dotdot[] = "..";
62
63 /**
64  * Implementation of dt_index_operations::dio_lookup
65  *
66  * Used with regular (non-striped) objects.
67  *
68  * \see dt_index_operations::dio_lookup() in the API description for details.
69  */
70 static int lod_lookup(const struct lu_env *env, struct dt_object *dt,
71                       struct dt_rec *rec, const struct dt_key *key)
72 {
73         struct dt_object *next = dt_object_child(dt);
74         return next->do_index_ops->dio_lookup(env, next, rec, key);
75 }
76
/**
 * Implementation of dt_index_operations::dio_declare_insert.
 *
 * Used with regular (non-striped) objects: delegates to
 * lod_sub_declare_insert() on the child object.
 *
 * \see dt_index_operations::dio_declare_insert() in the API description
 * for details.
 */
static int lod_declare_insert(const struct lu_env *env, struct dt_object *dt,
			      const struct dt_rec *rec,
			      const struct dt_key *key, struct thandle *th)
{
	struct dt_object *child = dt_object_child(dt);

	return lod_sub_declare_insert(env, child, rec, key, th);
}
91
/**
 * Implementation of dt_index_operations::dio_insert.
 *
 * Used with regular (non-striped) objects: delegates to lod_sub_insert()
 * on the child object.
 *
 * \see dt_index_operations::dio_insert() in the API description for details.
 */
static int lod_insert(const struct lu_env *env, struct dt_object *dt,
		      const struct dt_rec *rec, const struct dt_key *key,
		      struct thandle *th)
{
	struct dt_object *child = dt_object_child(dt);

	return lod_sub_insert(env, child, rec, key, th);
}
105
/**
 * Implementation of dt_index_operations::dio_declare_delete.
 *
 * Used with regular (non-striped) objects: delegates to
 * lod_sub_declare_delete() on the child object.
 *
 * \see dt_index_operations::dio_declare_delete() in the API description
 * for details.
 */
static int lod_declare_delete(const struct lu_env *env, struct dt_object *dt,
			      const struct dt_key *key, struct thandle *th)
{
	struct dt_object *child = dt_object_child(dt);

	return lod_sub_declare_delete(env, child, key, th);
}
119
/**
 * Implementation of dt_index_operations::dio_delete.
 *
 * Used with regular (non-striped) objects: delegates to lod_sub_delete()
 * on the child object.
 *
 * \see dt_index_operations::dio_delete() in the API description for details.
 */
static int lod_delete(const struct lu_env *env, struct dt_object *dt,
		      const struct dt_key *key, struct thandle *th)
{
	struct dt_object *child = dt_object_child(dt);

	return lod_sub_delete(env, child, key, th);
}
132
133 /**
134  * Implementation of dt_it_ops::init.
135  *
136  * Used with regular (non-striped) objects.
137  *
138  * \see dt_it_ops::init() in the API description for details.
139  */
140 static struct dt_it *lod_it_init(const struct lu_env *env,
141                                  struct dt_object *dt, __u32 attr)
142 {
143         struct dt_object        *next = dt_object_child(dt);
144         struct lod_it           *it = &lod_env_info(env)->lti_it;
145         struct dt_it            *it_next;
146
147         it_next = next->do_index_ops->dio_it.init(env, next, attr);
148         if (IS_ERR(it_next))
149                 return it_next;
150
151         /* currently we do not use more than one iterator per thread
152          * so we store it in thread info. if at some point we need
153          * more active iterators in a single thread, we can allocate
154          * additional ones */
155         LASSERT(it->lit_obj == NULL);
156
157         it->lit_it = it_next;
158         it->lit_obj = next;
159
160         return (struct dt_it *)it;
161 }
162
/*
 * Sanity check for a non-striped iterator: both the object being iterated
 * and the underlying sub-iterator must be set. The env argument is not
 * referenced by the expansion.
 */
#define LOD_CHECK_IT(env, it)					\
do {								\
	LASSERT((it)->lit_obj != NULL);				\
	LASSERT((it)->lit_it != NULL);				\
} while (0)
168
169 /**
170  * Implementation of dt_index_operations::dio_it.fini.
171  *
172  * Used with regular (non-striped) objects.
173  *
174  * \see dt_index_operations::dio_it.fini() in the API description for details.
175  */
176 static void lod_it_fini(const struct lu_env *env, struct dt_it *di)
177 {
178         struct lod_it *it = (struct lod_it *)di;
179
180         LOD_CHECK_IT(env, it);
181         it->lit_obj->do_index_ops->dio_it.fini(env, it->lit_it);
182
183         /* the iterator not in use any more */
184         it->lit_obj = NULL;
185         it->lit_it = NULL;
186 }
187
188 /**
189  * Implementation of dt_it_ops::get.
190  *
191  * Used with regular (non-striped) objects.
192  *
193  * \see dt_it_ops::get() in the API description for details.
194  */
195 static int lod_it_get(const struct lu_env *env, struct dt_it *di,
196                       const struct dt_key *key)
197 {
198         const struct lod_it *it = (const struct lod_it *)di;
199
200         LOD_CHECK_IT(env, it);
201         return it->lit_obj->do_index_ops->dio_it.get(env, it->lit_it, key);
202 }
203
204 /**
205  * Implementation of dt_it_ops::put.
206  *
207  * Used with regular (non-striped) objects.
208  *
209  * \see dt_it_ops::put() in the API description for details.
210  */
211 static void lod_it_put(const struct lu_env *env, struct dt_it *di)
212 {
213         struct lod_it *it = (struct lod_it *)di;
214
215         LOD_CHECK_IT(env, it);
216         return it->lit_obj->do_index_ops->dio_it.put(env, it->lit_it);
217 }
218
219 /**
220  * Implementation of dt_it_ops::next.
221  *
222  * Used with regular (non-striped) objects
223  *
224  * \see dt_it_ops::next() in the API description for details.
225  */
226 static int lod_it_next(const struct lu_env *env, struct dt_it *di)
227 {
228         struct lod_it *it = (struct lod_it *)di;
229
230         LOD_CHECK_IT(env, it);
231         return it->lit_obj->do_index_ops->dio_it.next(env, it->lit_it);
232 }
233
234 /**
235  * Implementation of dt_it_ops::key.
236  *
237  * Used with regular (non-striped) objects.
238  *
239  * \see dt_it_ops::key() in the API description for details.
240  */
241 static struct dt_key *lod_it_key(const struct lu_env *env,
242                                  const struct dt_it *di)
243 {
244         const struct lod_it *it = (const struct lod_it *)di;
245
246         LOD_CHECK_IT(env, it);
247         return it->lit_obj->do_index_ops->dio_it.key(env, it->lit_it);
248 }
249
250 /**
251  * Implementation of dt_it_ops::key_size.
252  *
253  * Used with regular (non-striped) objects.
254  *
255  * \see dt_it_ops::key_size() in the API description for details.
256  */
257 static int lod_it_key_size(const struct lu_env *env, const struct dt_it *di)
258 {
259         struct lod_it *it = (struct lod_it *)di;
260
261         LOD_CHECK_IT(env, it);
262         return it->lit_obj->do_index_ops->dio_it.key_size(env, it->lit_it);
263 }
264
265 /**
266  * Implementation of dt_it_ops::rec.
267  *
268  * Used with regular (non-striped) objects.
269  *
270  * \see dt_it_ops::rec() in the API description for details.
271  */
272 static int lod_it_rec(const struct lu_env *env, const struct dt_it *di,
273                       struct dt_rec *rec, __u32 attr)
274 {
275         const struct lod_it *it = (const struct lod_it *)di;
276
277         LOD_CHECK_IT(env, it);
278         return it->lit_obj->do_index_ops->dio_it.rec(env, it->lit_it, rec,
279                                                      attr);
280 }
281
282 /**
283  * Implementation of dt_it_ops::rec_size.
284  *
285  * Used with regular (non-striped) objects.
286  *
287  * \see dt_it_ops::rec_size() in the API description for details.
288  */
289 static int lod_it_rec_size(const struct lu_env *env, const struct dt_it *di,
290                            __u32 attr)
291 {
292         const struct lod_it *it = (const struct lod_it *)di;
293
294         LOD_CHECK_IT(env, it);
295         return it->lit_obj->do_index_ops->dio_it.rec_size(env, it->lit_it,
296                                                           attr);
297 }
298
299 /**
300  * Implementation of dt_it_ops::store.
301  *
302  * Used with regular (non-striped) objects.
303  *
304  * \see dt_it_ops::store() in the API description for details.
305  */
306 static __u64 lod_it_store(const struct lu_env *env, const struct dt_it *di)
307 {
308         const struct lod_it *it = (const struct lod_it *)di;
309
310         LOD_CHECK_IT(env, it);
311         return it->lit_obj->do_index_ops->dio_it.store(env, it->lit_it);
312 }
313
314 /**
315  * Implementation of dt_it_ops::load.
316  *
317  * Used with regular (non-striped) objects.
318  *
319  * \see dt_it_ops::load() in the API description for details.
320  */
321 static int lod_it_load(const struct lu_env *env, const struct dt_it *di,
322                        __u64 hash)
323 {
324         const struct lod_it *it = (const struct lod_it *)di;
325
326         LOD_CHECK_IT(env, it);
327         return it->lit_obj->do_index_ops->dio_it.load(env, it->lit_it, hash);
328 }
329
330 /**
331  * Implementation of dt_it_ops::key_rec.
332  *
333  * Used with regular (non-striped) objects.
334  *
335  * \see dt_it_ops::rec() in the API description for details.
336  */
337 static int lod_it_key_rec(const struct lu_env *env, const struct dt_it *di,
338                           void *key_rec)
339 {
340         const struct lod_it *it = (const struct lod_it *)di;
341
342         LOD_CHECK_IT(env, it);
343         return it->lit_obj->do_index_ops->dio_it.key_rec(env, it->lit_it,
344                                                          key_rec);
345 }
346
347 static const struct dt_index_operations lod_index_ops = {
348         .dio_lookup             = lod_lookup,
349         .dio_declare_insert     = lod_declare_insert,
350         .dio_insert             = lod_insert,
351         .dio_declare_delete     = lod_declare_delete,
352         .dio_delete             = lod_delete,
353         .dio_it = {
354                 .init           = lod_it_init,
355                 .fini           = lod_it_fini,
356                 .get            = lod_it_get,
357                 .put            = lod_it_put,
358                 .next           = lod_it_next,
359                 .key            = lod_it_key,
360                 .key_size       = lod_it_key_size,
361                 .rec            = lod_it_rec,
362                 .rec_size       = lod_it_rec_size,
363                 .store          = lod_it_store,
364                 .load           = lod_it_load,
365                 .key_rec        = lod_it_key_rec,
366         }
367 };
368
369 /**
370  * Implementation of dt_index_operations::dio_lookup
371  *
372  * Used with striped directories.
373  *
374  * \see dt_index_operations::dio_lookup() in the API description for details.
375  */
376 static int lod_striped_lookup(const struct lu_env *env, struct dt_object *dt,
377                       struct dt_rec *rec, const struct dt_key *key)
378 {
379         struct lod_object *lo = lod_dt_obj(dt);
380         struct dt_object *next;
381         const char *name = (const char *)key;
382
383         LASSERT(lo->ldo_dir_stripe_count > 0);
384
385         if (strcmp(name, dot) == 0) {
386                 struct lu_fid *fid = (struct lu_fid *)rec;
387
388                 *fid = *lod_object_fid(lo);
389                 return 1;
390         }
391
392         if (strcmp(name, dotdot) == 0) {
393                 next = dt_object_child(dt);
394         } else {
395                 int index;
396
397                 index = __lmv_name_to_stripe_index(lo->ldo_dir_hash_type,
398                                                    lo->ldo_dir_stripe_count,
399                                                    lo->ldo_dir_migrate_hash,
400                                                    lo->ldo_dir_migrate_offset,
401                                                    name, strlen(name), true);
402                 if (index < 0)
403                         return index;
404
405                 next = lo->ldo_stripe[index];
406                 if (!next || !dt_object_exists(next))
407                         return -ENODEV;
408         }
409
410         return next->do_index_ops->dio_lookup(env, next, rec, key);
411 }
412
413 /**
414  * Implementation of dt_it_ops::init.
415  *
416  * Used with striped objects. Internally just initializes the iterator
417  * on the first stripe.
418  *
419  * \see dt_it_ops::init() in the API description for details.
420  */
421 static struct dt_it *lod_striped_it_init(const struct lu_env *env,
422                                          struct dt_object *dt, __u32 attr)
423 {
424         struct lod_object *lo = lod_dt_obj(dt);
425         struct dt_object *next;
426         struct lod_it *it = &lod_env_info(env)->lti_it;
427         struct dt_it *it_next;
428         __u16 index = 0;
429
430         LASSERT(lo->ldo_dir_stripe_count > 0);
431
432         do {
433                 next = lo->ldo_stripe[index];
434                 if (next && dt_object_exists(next))
435                         break;
436         } while (++index < lo->ldo_dir_stripe_count);
437
438         /* no valid stripe */
439         if (!next || !dt_object_exists(next))
440                 return ERR_PTR(-ENODEV);
441
442         LASSERT(next->do_index_ops != NULL);
443
444         it_next = next->do_index_ops->dio_it.init(env, next, attr);
445         if (IS_ERR(it_next))
446                 return it_next;
447
448         /* currently we do not use more than one iterator per thread
449          * so we store it in thread info. if at some point we need
450          * more active iterators in a single thread, we can allocate
451          * additional ones */
452         LASSERT(it->lit_obj == NULL);
453
454         it->lit_stripe_index = index;
455         it->lit_attr = attr;
456         it->lit_it = it_next;
457         it->lit_obj = dt;
458
459         return (struct dt_it *)it;
460 }
461
/*
 * Sanity check for a striped iterator: the iterated object and the
 * sub-iterator must be set, the directory must have at least one stripe
 * and the current stripe index must lie within the stripe count. The env
 * argument is not referenced by the expansion.
 */
#define LOD_CHECK_STRIPED_IT(env, it, lo)				\
do {									\
	LASSERT((it)->lit_obj != NULL);					\
	LASSERT((it)->lit_it != NULL);					\
	LASSERT((lo)->ldo_dir_stripe_count > 0);			\
	LASSERT((it)->lit_stripe_index < (lo)->ldo_dir_stripe_count);	\
} while (0)
469
470 /**
471  * Implementation of dt_it_ops::fini.
472  *
473  * Used with striped objects.
474  *
475  * \see dt_it_ops::fini() in the API description for details.
476  */
477 static void lod_striped_it_fini(const struct lu_env *env, struct dt_it *di)
478 {
479         struct lod_it           *it = (struct lod_it *)di;
480         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
481         struct dt_object        *next;
482
483         /* If lit_it == NULL, then it means the sub_it has been finished,
484          * which only happens in failure cases, see lod_striped_it_next() */
485         if (it->lit_it != NULL) {
486                 LOD_CHECK_STRIPED_IT(env, it, lo);
487
488                 next = lo->ldo_stripe[it->lit_stripe_index];
489                 if (next) {
490                         LASSERT(next->do_index_ops != NULL);
491                         next->do_index_ops->dio_it.fini(env, it->lit_it);
492                 }
493         }
494
495         /* the iterator not in use any more */
496         it->lit_obj = NULL;
497         it->lit_it = NULL;
498         it->lit_stripe_index = 0;
499 }
500
501 /**
502  * Implementation of dt_it_ops::get.
503  *
504  * Right now it's not used widely, only to reset the iterator to the
505  * initial position. It should be possible to implement a full version
506  * which chooses a correct stripe to be able to position with any key.
507  *
508  * \see dt_it_ops::get() in the API description for details.
509  */
510 static int lod_striped_it_get(const struct lu_env *env, struct dt_it *di,
511                               const struct dt_key *key)
512 {
513         const struct lod_it *it = (const struct lod_it *)di;
514         struct lod_object *lo = lod_dt_obj(it->lit_obj);
515         struct dt_object *next;
516
517         LOD_CHECK_STRIPED_IT(env, it, lo);
518
519         next = lo->ldo_stripe[it->lit_stripe_index];
520         LASSERT(next != NULL);
521         LASSERT(dt_object_exists(next));
522         LASSERT(next->do_index_ops != NULL);
523
524         return next->do_index_ops->dio_it.get(env, it->lit_it, key);
525 }
526
527 /**
528  * Implementation of dt_it_ops::put.
529  *
530  * Used with striped objects.
531  *
532  * \see dt_it_ops::put() in the API description for details.
533  */
534 static void lod_striped_it_put(const struct lu_env *env, struct dt_it *di)
535 {
536         struct lod_it *it = (struct lod_it *)di;
537         struct lod_object *lo = lod_dt_obj(it->lit_obj);
538         struct dt_object *next;
539
540         /*
541          * If lit_it == NULL, then it means the sub_it has been finished,
542          * which only happens in failure cases, see lod_striped_it_next()
543          */
544         if (!it->lit_it)
545                 return;
546
547         LOD_CHECK_STRIPED_IT(env, it, lo);
548
549         next = lo->ldo_stripe[it->lit_stripe_index];
550         LASSERT(next != NULL);
551         LASSERT(next->do_index_ops != NULL);
552
553         return next->do_index_ops->dio_it.put(env, it->lit_it);
554 }
555
556 /**
557  * Implementation of dt_it_ops::next.
558  *
559  * Used with striped objects. When the end of the current stripe is
560  * reached, the method takes the next stripe's iterator.
561  *
562  * \see dt_it_ops::next() in the API description for details.
563  */
564 static int lod_striped_it_next(const struct lu_env *env, struct dt_it *di)
565 {
566         struct lod_it *it = (struct lod_it *)di;
567         struct lod_object *lo = lod_dt_obj(it->lit_obj);
568         struct dt_object *next;
569         struct dt_it *it_next;
570         __u32 index;
571         int rc;
572
573         ENTRY;
574
575         LOD_CHECK_STRIPED_IT(env, it, lo);
576
577         next = lo->ldo_stripe[it->lit_stripe_index];
578         LASSERT(next != NULL);
579         LASSERT(dt_object_exists(next));
580         LASSERT(next->do_index_ops != NULL);
581 again:
582         rc = next->do_index_ops->dio_it.next(env, it->lit_it);
583         if (rc < 0)
584                 RETURN(rc);
585
586         if (rc == 0 && it->lit_stripe_index == 0)
587                 RETURN(rc);
588
589         if (rc == 0 && it->lit_stripe_index > 0) {
590                 struct lu_dirent *ent;
591
592                 ent = (struct lu_dirent *)lod_env_info(env)->lti_key;
593
594                 rc = next->do_index_ops->dio_it.rec(env, it->lit_it,
595                                                     (struct dt_rec *)ent,
596                                                     it->lit_attr);
597                 if (rc != 0)
598                         RETURN(rc);
599
600                 /* skip . and .. for slave stripe */
601                 if ((strncmp(ent->lde_name, ".",
602                              le16_to_cpu(ent->lde_namelen)) == 0 &&
603                      le16_to_cpu(ent->lde_namelen) == 1) ||
604                     (strncmp(ent->lde_name, "..",
605                              le16_to_cpu(ent->lde_namelen)) == 0 &&
606                      le16_to_cpu(ent->lde_namelen) == 2))
607                         goto again;
608
609                 RETURN(rc);
610         }
611
612         next->do_index_ops->dio_it.put(env, it->lit_it);
613         next->do_index_ops->dio_it.fini(env, it->lit_it);
614         it->lit_it = NULL;
615
616         /* go to next stripe */
617         index = it->lit_stripe_index;
618         while (++index < lo->ldo_dir_stripe_count) {
619                 next = lo->ldo_stripe[index];
620                 if (!next)
621                         continue;
622
623                 if (!dt_object_exists(next))
624                         continue;
625
626                 rc = next->do_ops->do_index_try(env, next,
627                                                 &dt_directory_features);
628                 if (rc != 0)
629                         RETURN(rc);
630
631                 LASSERT(next->do_index_ops != NULL);
632
633                 it_next = next->do_index_ops->dio_it.init(env, next,
634                                                           it->lit_attr);
635                 if (IS_ERR(it_next))
636                         RETURN(PTR_ERR(it_next));
637
638                 rc = next->do_index_ops->dio_it.get(env, it_next,
639                                                     (const struct dt_key *)"");
640                 if (rc <= 0)
641                         RETURN(rc == 0 ? -EIO : rc);
642
643                 it->lit_it = it_next;
644                 it->lit_stripe_index = index;
645                 goto again;
646
647         }
648
649         RETURN(1);
650 }
651
652 /**
653  * Implementation of dt_it_ops::key.
654  *
655  * Used with striped objects.
656  *
657  * \see dt_it_ops::key() in the API description for details.
658  */
659 static struct dt_key *lod_striped_it_key(const struct lu_env *env,
660                                          const struct dt_it *di)
661 {
662         const struct lod_it     *it = (const struct lod_it *)di;
663         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
664         struct dt_object        *next;
665
666         LOD_CHECK_STRIPED_IT(env, it, lo);
667
668         next = lo->ldo_stripe[it->lit_stripe_index];
669         LASSERT(next != NULL);
670         LASSERT(next->do_index_ops != NULL);
671
672         return next->do_index_ops->dio_it.key(env, it->lit_it);
673 }
674
675 /**
676  * Implementation of dt_it_ops::key_size.
677  *
678  * Used with striped objects.
679  *
680  * \see dt_it_ops::size() in the API description for details.
681  */
682 static int lod_striped_it_key_size(const struct lu_env *env,
683                                    const struct dt_it *di)
684 {
685         struct lod_it           *it = (struct lod_it *)di;
686         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
687         struct dt_object        *next;
688
689         LOD_CHECK_STRIPED_IT(env, it, lo);
690
691         next = lo->ldo_stripe[it->lit_stripe_index];
692         LASSERT(next != NULL);
693         LASSERT(next->do_index_ops != NULL);
694
695         return next->do_index_ops->dio_it.key_size(env, it->lit_it);
696 }
697
698 /**
699  * Implementation of dt_it_ops::rec.
700  *
701  * Used with striped objects.
702  *
703  * \see dt_it_ops::rec() in the API description for details.
704  */
705 static int lod_striped_it_rec(const struct lu_env *env, const struct dt_it *di,
706                               struct dt_rec *rec, __u32 attr)
707 {
708         const struct lod_it     *it = (const struct lod_it *)di;
709         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
710         struct dt_object        *next;
711
712         LOD_CHECK_STRIPED_IT(env, it, lo);
713
714         next = lo->ldo_stripe[it->lit_stripe_index];
715         LASSERT(next != NULL);
716         LASSERT(next->do_index_ops != NULL);
717
718         return next->do_index_ops->dio_it.rec(env, it->lit_it, rec, attr);
719 }
720
721 /**
722  * Implementation of dt_it_ops::rec_size.
723  *
724  * Used with striped objects.
725  *
726  * \see dt_it_ops::rec_size() in the API description for details.
727  */
728 static int lod_striped_it_rec_size(const struct lu_env *env,
729                                    const struct dt_it *di, __u32 attr)
730 {
731         struct lod_it           *it = (struct lod_it *)di;
732         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
733         struct dt_object        *next;
734
735         LOD_CHECK_STRIPED_IT(env, it, lo);
736
737         next = lo->ldo_stripe[it->lit_stripe_index];
738         LASSERT(next != NULL);
739         LASSERT(next->do_index_ops != NULL);
740
741         return next->do_index_ops->dio_it.rec_size(env, it->lit_it, attr);
742 }
743
744 /**
745  * Implementation of dt_it_ops::store.
746  *
747  * Used with striped objects.
748  *
749  * \see dt_it_ops::store() in the API description for details.
750  */
751 static __u64 lod_striped_it_store(const struct lu_env *env,
752                                   const struct dt_it *di)
753 {
754         const struct lod_it     *it = (const struct lod_it *)di;
755         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
756         struct dt_object        *next;
757
758         LOD_CHECK_STRIPED_IT(env, it, lo);
759
760         next = lo->ldo_stripe[it->lit_stripe_index];
761         LASSERT(next != NULL);
762         LASSERT(next->do_index_ops != NULL);
763
764         return next->do_index_ops->dio_it.store(env, it->lit_it);
765 }
766
767 /**
768  * Implementation of dt_it_ops::load.
769  *
770  * Used with striped objects.
771  *
772  * \see dt_it_ops::load() in the API description for details.
773  */
774 static int lod_striped_it_load(const struct lu_env *env,
775                                const struct dt_it *di, __u64 hash)
776 {
777         const struct lod_it     *it = (const struct lod_it *)di;
778         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
779         struct dt_object        *next;
780
781         LOD_CHECK_STRIPED_IT(env, it, lo);
782
783         next = lo->ldo_stripe[it->lit_stripe_index];
784         LASSERT(next != NULL);
785         LASSERT(next->do_index_ops != NULL);
786
787         return next->do_index_ops->dio_it.load(env, it->lit_it, hash);
788 }
789
790 static const struct dt_index_operations lod_striped_index_ops = {
791         .dio_lookup             = lod_striped_lookup,
792         .dio_declare_insert     = lod_declare_insert,
793         .dio_insert             = lod_insert,
794         .dio_declare_delete     = lod_declare_delete,
795         .dio_delete             = lod_delete,
796         .dio_it = {
797                 .init           = lod_striped_it_init,
798                 .fini           = lod_striped_it_fini,
799                 .get            = lod_striped_it_get,
800                 .put            = lod_striped_it_put,
801                 .next           = lod_striped_it_next,
802                 .key            = lod_striped_it_key,
803                 .key_size       = lod_striped_it_key_size,
804                 .rec            = lod_striped_it_rec,
805                 .rec_size       = lod_striped_it_rec_size,
806                 .store          = lod_striped_it_store,
807                 .load           = lod_striped_it_load,
808         }
809 };
810
811 /**
812  * Append the FID for each shard of the striped directory after the
813  * given LMV EA header.
814  *
815  * To simplify striped directory and the consistency verification,
816  * we only store the LMV EA header on disk, for both master object
817  * and slave objects. When someone wants to know the whole LMV EA,
818  * such as client readdir(), we can build the entire LMV EA on the
819  * MDT side (in RAM) via iterating the sub-directory entries that
820  * are contained in the master object of the stripe directory.
821  *
822  * For the master object of the striped directory, the valid name
823  * for each shard is composed of the ${shard_FID}:${shard_idx}.
824  *
825  * There may be holes in the LMV EA if some shards' name entries
826  * are corrupted or lost.
827  *
828  * \param[in] env       pointer to the thread context
829  * \param[in] lo        pointer to the master object of the striped directory
830  * \param[in] buf       pointer to the lu_buf which will hold the LMV EA
831  * \param[in] resize    whether re-allocate the buffer if it is not big enough
832  *
833  * \retval              positive size of the LMV EA
834  * \retval              0 for nothing to be loaded
835  * \retval              negative error number on failure
836  */
837 int lod_load_lmv_shards(const struct lu_env *env, struct lod_object *lo,
838                         struct lu_buf *buf, bool resize)
839 {
840         struct lu_dirent        *ent    =
841                         (struct lu_dirent *)lod_env_info(env)->lti_key;
842         struct lod_device       *lod    = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
843         struct dt_object        *obj    = dt_object_child(&lo->ldo_obj);
844         struct lmv_mds_md_v1    *lmv1   = buf->lb_buf;
845         struct dt_it            *it;
846         const struct dt_it_ops  *iops;
847         __u32                    stripes;
848         __u32                    magic  = le32_to_cpu(lmv1->lmv_magic);
849         size_t                   lmv1_size;
850         int                      rc;
851         ENTRY;
852
853         if (magic != LMV_MAGIC_V1)
854                 RETURN(0);
855
856         stripes = le32_to_cpu(lmv1->lmv_stripe_count);
857         if (stripes < 1)
858                 RETURN(0);
859
860         rc = lmv_mds_md_size(stripes, magic);
861         if (rc < 0)
862                 RETURN(rc);
863         lmv1_size = rc;
864         if (buf->lb_len < lmv1_size) {
865                 struct lu_buf tbuf;
866
867                 if (!resize)
868                         RETURN(-ERANGE);
869
870                 tbuf = *buf;
871                 buf->lb_buf = NULL;
872                 buf->lb_len = 0;
873                 lu_buf_alloc(buf, lmv1_size);
874                 lmv1 = buf->lb_buf;
875                 if (lmv1 == NULL)
876                         RETURN(-ENOMEM);
877
878                 memcpy(buf->lb_buf, tbuf.lb_buf, tbuf.lb_len);
879         }
880
881         if (unlikely(!dt_try_as_dir(env, obj)))
882                 RETURN(-ENOTDIR);
883
884         memset(&lmv1->lmv_stripe_fids[0], 0, stripes * sizeof(struct lu_fid));
885         iops = &obj->do_index_ops->dio_it;
886         it = iops->init(env, obj, LUDA_64BITHASH);
887         if (IS_ERR(it))
888                 RETURN(PTR_ERR(it));
889
890         rc = iops->load(env, it, 0);
891         if (rc == 0)
892                 rc = iops->next(env, it);
893         else if (rc > 0)
894                 rc = 0;
895
896         while (rc == 0) {
897                 char             name[FID_LEN + 2] = "";
898                 struct lu_fid    fid;
899                 __u32            index;
900                 int              len;
901
902                 rc = iops->rec(env, it, (struct dt_rec *)ent, LUDA_64BITHASH);
903                 if (rc != 0)
904                         break;
905
906                 rc = -EIO;
907
908                 fid_le_to_cpu(&fid, &ent->lde_fid);
909                 ent->lde_namelen = le16_to_cpu(ent->lde_namelen);
910                 if (ent->lde_name[0] == '.') {
911                         if (ent->lde_namelen == 1)
912                                 goto next;
913
914                         if (ent->lde_namelen == 2 && ent->lde_name[1] == '.')
915                                 goto next;
916                 }
917
918                 len = scnprintf(name, sizeof(name),
919                                 DFID":", PFID(&ent->lde_fid));
920                 /* The ent->lde_name is composed of ${FID}:${index} */
921                 if (ent->lde_namelen < len + 1 ||
922                     memcmp(ent->lde_name, name, len) != 0) {
923                         CDEBUG_LIMIT(lod->lod_lmv_failout ? D_ERROR : D_INFO,
924                                      "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
925                                      lod2obd(lod)->obd_name, ent->lde_namelen,
926                                      ent->lde_name, PFID(&fid),
927                                      PFID(lu_object_fid(&obj->do_lu)),
928                                      lod->lod_lmv_failout ? "failout" : "skip");
929
930                         if (lod->lod_lmv_failout)
931                                 break;
932
933                         goto next;
934                 }
935
936                 index = 0;
937                 do {
938                         if (ent->lde_name[len] < '0' ||
939                             ent->lde_name[len] > '9') {
940                                 CDEBUG_LIMIT(lod->lod_lmv_failout ?
941                                              D_ERROR : D_INFO,
942                                              "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
943                                              lod2obd(lod)->obd_name,
944                                              ent->lde_namelen,
945                                              ent->lde_name, PFID(&fid),
946                                              PFID(lu_object_fid(&obj->do_lu)),
947                                              lod->lod_lmv_failout ?
948                                              "failout" : "skip");
949
950                                 if (lod->lod_lmv_failout)
951                                         break;
952
953                                 goto next;
954                         }
955
956                         index = index * 10 + ent->lde_name[len++] - '0';
957                 } while (len < ent->lde_namelen);
958
959                 if (len == ent->lde_namelen) {
960                         /* Out of LMV EA range. */
961                         if (index >= stripes) {
962                                 CERROR("%s: the shard %.*s for the striped "
963                                        "directory "DFID" is out of the known "
964                                        "LMV EA range [0 - %u], failout\n",
965                                        lod2obd(lod)->obd_name, ent->lde_namelen,
966                                        ent->lde_name,
967                                        PFID(lu_object_fid(&obj->do_lu)),
968                                        stripes - 1);
969
970                                 break;
971                         }
972
973                         /* The slot has been occupied. */
974                         if (!fid_is_zero(&lmv1->lmv_stripe_fids[index])) {
975                                 struct lu_fid fid0;
976
977                                 fid_le_to_cpu(&fid0,
978                                         &lmv1->lmv_stripe_fids[index]);
979                                 CERROR("%s: both the shard "DFID" and "DFID
980                                        " for the striped directory "DFID
981                                        " claim the same LMV EA slot at the "
982                                        "index %d, failout\n",
983                                        lod2obd(lod)->obd_name,
984                                        PFID(&fid0), PFID(&fid),
985                                        PFID(lu_object_fid(&obj->do_lu)), index);
986
987                                 break;
988                         }
989
990                         /* stored as LE mode */
991                         lmv1->lmv_stripe_fids[index] = ent->lde_fid;
992
993 next:
994                         rc = iops->next(env, it);
995                 }
996         }
997
998         iops->put(env, it);
999         iops->fini(env, it);
1000
1001         RETURN(rc > 0 ? lmv_mds_md_size(stripes, magic) : rc);
1002 }
1003
1004 /**
1005  * Implementation of dt_object_operations::do_index_try.
1006  *
1007  * \see dt_object_operations::do_index_try() in the API description for details.
1008  */
1009 static int lod_index_try(const struct lu_env *env, struct dt_object *dt,
1010                          const struct dt_index_features *feat)
1011 {
1012         struct lod_object       *lo = lod_dt_obj(dt);
1013         struct dt_object        *next = dt_object_child(dt);
1014         int                     rc;
1015         ENTRY;
1016
1017         LASSERT(next->do_ops);
1018         LASSERT(next->do_ops->do_index_try);
1019
1020         rc = lod_striping_load(env, lo);
1021         if (rc != 0)
1022                 RETURN(rc);
1023
1024         rc = next->do_ops->do_index_try(env, next, feat);
1025         if (rc != 0)
1026                 RETURN(rc);
1027
1028         if (lo->ldo_dir_stripe_count > 0) {
1029                 int i;
1030
1031                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1032                         if (!lo->ldo_stripe[i])
1033                                 continue;
1034                         if (!dt_object_exists(lo->ldo_stripe[i]))
1035                                 continue;
1036                         rc = lo->ldo_stripe[i]->do_ops->do_index_try(env,
1037                                                 lo->ldo_stripe[i], feat);
1038                         if (rc != 0)
1039                                 RETURN(rc);
1040                 }
1041                 dt->do_index_ops = &lod_striped_index_ops;
1042         } else {
1043                 dt->do_index_ops = &lod_index_ops;
1044         }
1045
1046         RETURN(rc);
1047 }
1048
/**
 * Implementation of dt_object_operations::do_read_lock.
 *
 * Forwards the read lock request to the child object of \a dt.
 *
 * \see dt_object_operations::do_read_lock() in the API description for details.
 */
static void lod_read_lock(const struct lu_env *env, struct dt_object *dt,
			  unsigned role)
{
	struct dt_object *child = dt_object_child(dt);

	dt_read_lock(env, child, role);
}
1059
/**
 * Implementation of dt_object_operations::do_write_lock.
 *
 * Forwards the write lock request to the child object of \a dt.
 *
 * \see dt_object_operations::do_write_lock() in the API description for
 * details.
 */
static void lod_write_lock(const struct lu_env *env, struct dt_object *dt,
			   unsigned role)
{
	struct dt_object *child = dt_object_child(dt);

	dt_write_lock(env, child, role);
}
1071
/**
 * Implementation of dt_object_operations::do_read_unlock.
 *
 * Forwards the read unlock request to the child object of \a dt.
 *
 * \see dt_object_operations::do_read_unlock() in the API description for
 * details.
 */
static void lod_read_unlock(const struct lu_env *env, struct dt_object *dt)
{
	struct dt_object *child = dt_object_child(dt);

	dt_read_unlock(env, child);
}
1082
/**
 * Implementation of dt_object_operations::do_write_unlock.
 *
 * Forwards the write unlock request to the child object of \a dt.
 *
 * \see dt_object_operations::do_write_unlock() in the API description for
 * details.
 */
static void lod_write_unlock(const struct lu_env *env, struct dt_object *dt)
{
	struct dt_object *child = dt_object_child(dt);

	dt_write_unlock(env, child);
}
1093
/**
 * Implementation of dt_object_operations::do_write_locked.
 *
 * Returns whatever dt_write_locked() reports for the child object of \a dt.
 *
 * \see dt_object_operations::do_write_locked() in the API description for
 * details.
 */
static int lod_write_locked(const struct lu_env *env, struct dt_object *dt)
{
	struct dt_object *child = dt_object_child(dt);

	return dt_write_locked(env, child);
}
1104
/**
 * Implementation of dt_object_operations::do_attr_get.
 *
 * Fetches the attributes from the child (master) object only.
 *
 * \see dt_object_operations::do_attr_get() in the API description for details.
 */
static int lod_attr_get(const struct lu_env *env,
			struct dt_object *dt,
			struct lu_attr *attr)
{
	struct dt_object *master = dt_object_child(dt);

	/* Striped directories: the client merges the attributes of all
	 * sub-stripes itself (see lmv_merge_attr()), and no MDD logic
	 * depends on directory nlink/size/time, so for now the master
	 * inode's nlink and size are always sufficient. */
	return dt_attr_get(env, master, attr);
}
1120
1121 void lod_adjust_stripe_size(struct lod_layout_component *comp,
1122                             __u32 def_stripe_size)
1123 {
1124         __u64 comp_end = comp->llc_extent.e_end;
1125
1126         /* Choose stripe size if not set. Note that default stripe size can't
1127          * be used as is, because it must be multiplier of given component end.
1128          *  - first check if default stripe size can be used
1129          *  - if not than select the lowest set bit from component end and use
1130          *    that value as stripe size
1131          */
1132         if (!comp->llc_stripe_size) {
1133                 if (comp_end == LUSTRE_EOF || !(comp_end % def_stripe_size))
1134                         comp->llc_stripe_size = def_stripe_size;
1135                 else
1136                         comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1137         } else {
1138                 if (comp_end != LUSTRE_EOF &&
1139                     comp_end & (LOV_MIN_STRIPE_SIZE - 1)) {
1140                         CWARN("Component end %llu is not a multiple of min size %u\n",
1141                               comp_end, LOV_MIN_STRIPE_SIZE);
1142                         comp_end = round_up(comp_end, LOV_MIN_STRIPE_SIZE);
1143                 }
1144                 /* check stripe size is multiplier of comp_end */
1145                 if (comp_end != LUSTRE_EOF &&
1146                     comp_end != comp->llc_extent.e_start &&
1147                     comp_end % comp->llc_stripe_size) {
1148                         /* fix that even for defined stripe size but warn
1149                          * about the problem, that must not happen
1150                          */
1151                         CWARN("Component end %llu is not aligned by the stripe size %u\n",
1152                               comp_end, comp->llc_stripe_size);
1153                         comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1154                 }
1155         }
1156 }
1157
1158 static inline void lod_adjust_stripe_info(struct lod_layout_component *comp,
1159                                           struct lov_desc *desc,
1160                                           int append_stripes)
1161 {
1162         if (comp->llc_pattern != LOV_PATTERN_MDT) {
1163                 if (append_stripes) {
1164                         comp->llc_stripe_count = append_stripes;
1165                 } else if (!comp->llc_stripe_count) {
1166                         comp->llc_stripe_count =
1167                                 desc->ld_default_stripe_count;
1168                 }
1169         }
1170
1171         lod_adjust_stripe_size(comp, desc->ld_default_stripe_size);
1172 }
1173
1174 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
1175                             struct thandle *th,
1176                             struct lod_obj_stripe_cb_data *data)
1177 {
1178         struct lod_layout_component *lod_comp;
1179         int i, j, rc = 0;
1180         ENTRY;
1181
1182         mutex_lock(&lo->ldo_layout_mutex);
1183         for (i = 0; i < lo->ldo_comp_cnt; i++) {
1184                 lod_comp = &lo->ldo_comp_entries[i];
1185
1186                 if (lod_comp->llc_stripe == NULL)
1187                         continue;
1188
1189                 /* has stripe but not inited yet, this component has been
1190                  * declared to be created, but hasn't created yet.
1191                  */
1192                 if (!lod_comp_inited(lod_comp))
1193                         continue;
1194
1195                 if (data->locd_comp_skip_cb &&
1196                     data->locd_comp_skip_cb(env, lo, i, data))
1197                         continue;
1198
1199                 if (data->locd_comp_cb) {
1200                         rc = data->locd_comp_cb(env, lo, i, data);
1201                         if (rc)
1202                                 GOTO(unlock, rc);
1203                 }
1204
1205                 /* could used just to do sth about component, not each
1206                  * stripes
1207                  */
1208                 if (!data->locd_stripe_cb)
1209                         continue;
1210
1211                 LASSERT(lod_comp->llc_stripe_count > 0);
1212                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
1213                         struct dt_object *dt = lod_comp->llc_stripe[j];
1214
1215                         if (dt == NULL)
1216                                 continue;
1217                         rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
1218                         if (rc != 0)
1219                                 GOTO(unlock, rc);
1220                 }
1221         }
1222 unlock:
1223         mutex_unlock(&lo->ldo_layout_mutex);
1224         RETURN(rc);
1225 }
1226
1227 static bool lod_obj_attr_set_comp_skip_cb(const struct lu_env *env,
1228                 struct lod_object *lo, int comp_idx,
1229                 struct lod_obj_stripe_cb_data *data)
1230 {
1231         struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
1232         bool skipped = false;
1233
1234         if (!(data->locd_attr->la_valid & LA_LAYOUT_VERSION))
1235                 return skipped;
1236
1237         switch (lo->ldo_flr_state) {
1238         case LCM_FL_WRITE_PENDING: {
1239                 int i;
1240
1241                 /* skip stale components */
1242                 if (lod_comp->llc_flags & LCME_FL_STALE) {
1243                         skipped = true;
1244                         break;
1245                 }
1246
1247                 /* skip valid and overlapping components, therefore any
1248                  * attempts to write overlapped components will never succeed
1249                  * because client will get EINPROGRESS. */
1250                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
1251                         if (i == comp_idx)
1252                                 continue;
1253
1254                         if (lo->ldo_comp_entries[i].llc_flags & LCME_FL_STALE)
1255                                 continue;
1256
1257                         if (lu_extent_is_overlapped(&lod_comp->llc_extent,
1258                                         &lo->ldo_comp_entries[i].llc_extent)) {
1259                                 skipped = true;
1260                                 break;
1261                         }
1262                 }
1263                 break;
1264         }
1265         case LCM_FL_RDONLY:
1266         case LCM_FL_SYNC_PENDING:
1267                 break;
1268         default:
1269                 LASSERTF(0, "impossible: %d\n", lo->ldo_flr_state);
1270                 break;
1271         }
1272
1273         CDEBUG(D_LAYOUT, DFID": %s to set component %x to version: %u\n",
1274                PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
1275                skipped ? "skipped" : "chose", lod_comp->llc_id,
1276                data->locd_attr->la_layout_version);
1277
1278         return skipped;
1279 }
1280
1281 static inline int
1282 lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
1283                            struct dt_object *dt, struct thandle *th,
1284                            int comp_idx, int stripe_idx,
1285                            struct lod_obj_stripe_cb_data *data)
1286 {
1287         if (data->locd_declare)
1288                 return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
1289
1290         if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
1291                 CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
1292                        PFID(lu_object_fid(&dt->do_lu)),
1293                        data->locd_attr->la_layout_version, comp_idx);
1294         }
1295
1296         return lod_sub_attr_set(env, dt, data->locd_attr, th);
1297 }
1298
1299 /**
1300  * Implementation of dt_object_operations::do_declare_attr_set.
1301  *
1302  * If the object is striped, then apply the changes to all the stripes.
1303  *
1304  * \see dt_object_operations::do_declare_attr_set() in the API description
1305  * for details.
1306  */
1307 static int lod_declare_attr_set(const struct lu_env *env,
1308                                 struct dt_object *dt,
1309                                 const struct lu_attr *attr,
1310                                 struct thandle *th)
1311 {
1312         struct dt_object  *next = dt_object_child(dt);
1313         struct lod_object *lo = lod_dt_obj(dt);
1314         int                rc, i;
1315         ENTRY;
1316
1317         /*
1318          * declare setattr on the local object
1319          */
1320         rc = lod_sub_declare_attr_set(env, next, attr, th);
1321         if (rc)
1322                 RETURN(rc);
1323
1324         /* osp_declare_attr_set() ignores all attributes other than
1325          * UID, GID, PROJID, and size, and osp_attr_set() ignores all
1326          * but UID, GID and PROJID. Declaration of size attr setting
1327          * happens through lod_declare_init_size(), and not through
1328          * this function. Therefore we need not load striping unless
1329          * ownership is changing.  This should save memory and (we hope)
1330          * speed up rename().
1331          */
1332         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1333                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1334                         RETURN(rc);
1335
1336                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1337                         RETURN(0);
1338         } else {
1339                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID | LA_MODE |
1340                                         LA_ATIME | LA_MTIME | LA_CTIME |
1341                                         LA_FLAGS)))
1342                         RETURN(rc);
1343         }
1344         /*
1345          * load striping information, notice we don't do this when object
1346          * is being initialized as we don't need this information till
1347          * few specific cases like destroy, chown
1348          */
1349         rc = lod_striping_load(env, lo);
1350         if (rc)
1351                 RETURN(rc);
1352
1353         if (!lod_obj_is_striped(dt))
1354                 RETURN(0);
1355
1356         /*
1357          * if object is striped declare changes on the stripes
1358          */
1359         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1360                 LASSERT(lo->ldo_stripe);
1361                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1362                         if (lo->ldo_stripe[i] == NULL)
1363                                 continue;
1364                         if (!dt_object_exists(lo->ldo_stripe[i]))
1365                                 continue;
1366                         rc = lod_sub_declare_attr_set(env, lo->ldo_stripe[i],
1367                                                       attr, th);
1368                         if (rc != 0)
1369                                 RETURN(rc);
1370                 }
1371         } else {
1372                 struct lod_obj_stripe_cb_data data = { { 0 } };
1373
1374                 data.locd_attr = attr;
1375                 data.locd_declare = true;
1376                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1377                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1378         }
1379
1380         if (rc)
1381                 RETURN(rc);
1382
1383         if (!dt_object_exists(next) || dt_object_remote(next) ||
1384             !S_ISREG(attr->la_mode))
1385                 RETURN(0);
1386
1387         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1388                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
1389                 RETURN(rc);
1390         }
1391
1392         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) ||
1393             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1394                 struct lod_thread_info *info = lod_env_info(env);
1395                 struct lu_buf *buf = &info->lti_buf;
1396
1397                 buf->lb_buf = info->lti_ea_store;
1398                 buf->lb_len = info->lti_ea_store_size;
1399                 rc = lod_sub_declare_xattr_set(env, next, buf, XATTR_NAME_LOV,
1400                                                LU_XATTR_REPLACE, th);
1401         }
1402
1403         RETURN(rc);
1404 }
1405
1406 /**
1407  * Implementation of dt_object_operations::do_attr_set.
1408  *
1409  * If the object is striped, then apply the changes to all or subset of
1410  * the stripes depending on the object type and specific attributes.
1411  *
1412  * \see dt_object_operations::do_attr_set() in the API description for details.
1413  */
1414 static int lod_attr_set(const struct lu_env *env,
1415                         struct dt_object *dt,
1416                         const struct lu_attr *attr,
1417                         struct thandle *th)
1418 {
1419         struct dt_object        *next = dt_object_child(dt);
1420         struct lod_object       *lo = lod_dt_obj(dt);
1421         int                     rc, i;
1422         ENTRY;
1423
1424         /*
1425          * apply changes to the local object
1426          */
1427         rc = lod_sub_attr_set(env, next, attr, th);
1428         if (rc)
1429                 RETURN(rc);
1430
1431         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1432                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1433                         RETURN(rc);
1434
1435                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1436                         RETURN(0);
1437         } else {
1438                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_MODE | LA_PROJID |
1439                                         LA_ATIME | LA_MTIME | LA_CTIME |
1440                                         LA_FLAGS)))
1441                         RETURN(rc);
1442         }
1443
1444         /* FIXME: a tricky case in the code path of mdd_layout_change():
1445          * the in-memory striping information has been freed in lod_xattr_set()
1446          * due to layout change. It has to load stripe here again. It only
1447          * changes flags of layout so declare_attr_set() is still accurate */
1448         rc = lod_striping_load(env, lo);
1449         if (rc)
1450                 RETURN(rc);
1451
1452         if (!lod_obj_is_striped(dt))
1453                 RETURN(0);
1454
1455         /*
1456          * if object is striped, apply changes to all the stripes
1457          */
1458         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1459                 LASSERT(lo->ldo_stripe);
1460                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1461                         if (unlikely(lo->ldo_stripe[i] == NULL))
1462                                 continue;
1463
1464                         if ((dt_object_exists(lo->ldo_stripe[i]) == 0))
1465                                 continue;
1466
1467                         rc = lod_sub_attr_set(env, lo->ldo_stripe[i], attr, th);
1468                         if (rc != 0)
1469                                 break;
1470                 }
1471         } else {
1472                 struct lod_obj_stripe_cb_data data = { { 0 } };
1473
1474                 data.locd_attr = attr;
1475                 data.locd_declare = false;
1476                 data.locd_comp_skip_cb = lod_obj_attr_set_comp_skip_cb;
1477                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1478                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1479         }
1480
1481         if (rc)
1482                 RETURN(rc);
1483
1484         if (!dt_object_exists(next) || dt_object_remote(next) ||
1485             !S_ISREG(attr->la_mode))
1486                 RETURN(0);
1487
1488         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1489                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
1490                 RETURN(rc);
1491         }
1492
1493         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE)) {
1494                 struct lod_thread_info *info = lod_env_info(env);
1495                 struct lu_buf *buf = &info->lti_buf;
1496                 struct ost_id *oi = &info->lti_ostid;
1497                 struct lu_fid *fid = &info->lti_fid;
1498                 struct lov_mds_md_v1 *lmm;
1499                 struct lov_ost_data_v1 *objs;
1500                 __u32 magic;
1501
1502                 rc = lod_get_lov_ea(env, lo);
1503                 if (rc <= 0)
1504                         RETURN(rc);
1505
1506                 buf->lb_buf = info->lti_ea_store;
1507                 buf->lb_len = info->lti_ea_store_size;
1508                 lmm = info->lti_ea_store;
1509                 magic = le32_to_cpu(lmm->lmm_magic);
1510                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
1511                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1512                         struct lov_comp_md_entry_v1 *lcme =
1513                                                 &lcm->lcm_entries[0];
1514
1515                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
1516                         magic = le32_to_cpu(lmm->lmm_magic);
1517                 }
1518
1519                 if (magic == LOV_MAGIC_V1)
1520                         objs = &(lmm->lmm_objects[0]);
1521                 else
1522                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1523                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
1524                 ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
1525                 fid->f_oid--;
1526                 fid_to_ostid(fid, oi);
1527                 ostid_cpu_to_le(oi, &objs->l_ost_oi);
1528
1529                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1530                                        LU_XATTR_REPLACE, th);
1531         } else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1532                 struct lod_thread_info *info = lod_env_info(env);
1533                 struct lu_buf *buf = &info->lti_buf;
1534                 struct lov_comp_md_v1 *lcm;
1535                 struct lov_comp_md_entry_v1 *lcme;
1536
1537                 rc = lod_get_lov_ea(env, lo);
1538                 if (rc <= 0)
1539                         RETURN(rc);
1540
1541                 buf->lb_buf = info->lti_ea_store;
1542                 buf->lb_len = info->lti_ea_store_size;
1543                 lcm = buf->lb_buf;
1544                 if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1 &&
1545                     le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_SEL)
1546                         RETURN(-EINVAL);
1547
1548                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1549                 lcme = &lcm->lcm_entries[0];
1550                 le64_add_cpu(&lcme->lcme_extent.e_start, 1);
1551                 le64_add_cpu(&lcme->lcme_extent.e_end, -1);
1552
1553                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1554                                        LU_XATTR_REPLACE, th);
1555         }
1556
1557         RETURN(rc);
1558 }
1559
1560 /**
1561  * Implementation of dt_object_operations::do_xattr_get.
1562  *
1563  * If LOV EA is requested from the root object and it's not
1564  * found, then return default striping for the filesystem.
1565  *
1566  * \see dt_object_operations::do_xattr_get() in the API description for details.
1567  */
1568 static int lod_xattr_get(const struct lu_env *env, struct dt_object *dt,
1569                          struct lu_buf *buf, const char *name)
1570 {
1571         struct lod_thread_info *info = lod_env_info(env);
1572         struct lod_device *dev = lu2lod_dev(dt->do_lu.lo_dev);
1573         int is_root;
1574         int rc;
1575         ENTRY;
1576
1577         rc = dt_xattr_get(env, dt_object_child(dt), buf, name);
1578         if (strcmp(name, XATTR_NAME_LMV) == 0) {
1579                 struct lmv_mds_md_v1    *lmv1;
1580                 struct lmv_foreign_md   *lfm;
1581                 int                      rc1 = 0;
1582
1583                 if (rc > (typeof(rc))sizeof(*lmv1))
1584                         RETURN(rc);
1585
1586                 /* short (<= sizeof(struct lmv_mds_md_v1)) foreign LMV case */
1587                 /* XXX empty foreign LMV is not allowed */
1588                 if (rc <= offsetof(typeof(*lfm), lfm_value))
1589                         RETURN(rc = rc > 0 ? -EINVAL : rc);
1590
1591                 if (buf->lb_buf == NULL || buf->lb_len == 0) {
1592                         BUILD_BUG_ON(sizeof(*lmv1) > sizeof(info->lti_key));
1593
1594                         /* lti_buf is large enough for *lmv1 or a short
1595                          * (<= sizeof(struct lmv_mds_md_v1)) foreign LMV
1596                          */
1597                         info->lti_buf.lb_buf = info->lti_key;
1598                         info->lti_buf.lb_len = sizeof(*lmv1);
1599                         rc = dt_xattr_get(env, dt_object_child(dt),
1600                                           &info->lti_buf, name);
1601                         if (unlikely(rc <= offsetof(typeof(*lfm),
1602                                                     lfm_value)))
1603                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1604
1605                         lfm = info->lti_buf.lb_buf;
1606                         if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN)
1607                                 RETURN(rc);
1608
1609                         if (unlikely(rc != sizeof(*lmv1)))
1610                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1611
1612                         lmv1 = info->lti_buf.lb_buf;
1613                         /* The on-disk LMV EA only contains header, but the
1614                          * returned LMV EA size should contain the space for
1615                          * the FIDs of all shards of the striped directory. */
1616                         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_V1)
1617                                 rc = lmv_mds_md_size(
1618                                         le32_to_cpu(lmv1->lmv_stripe_count),
1619                                         le32_to_cpu(lmv1->lmv_magic));
1620                 } else {
1621                         lmv1 = buf->lb_buf;
1622                         if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
1623                                 RETURN(rc);
1624
1625                         if (rc != sizeof(*lmv1))
1626                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1627
1628                         rc1 = lod_load_lmv_shards(env, lod_dt_obj(dt),
1629                                                   buf, false);
1630                 }
1631
1632                 RETURN(rc = rc1 != 0 ? rc1 : rc);
1633         }
1634
1635         if ((rc > 0) && buf->lb_buf && strcmp(name, XATTR_NAME_LOV) == 0) {
1636                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
1637
1638                 if (lcm->lcm_magic == cpu_to_le32(LOV_MAGIC_SEL))
1639                         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1640         }
1641
1642         if (rc != -ENODATA || !S_ISDIR(dt->do_lu.lo_header->loh_attr & S_IFMT))
1643                 RETURN(rc);
1644
1645         /*
1646          * XXX: Only used by lfsck
1647          *
1648          * lod returns default striping on the real root of the device
1649          * this is like the root stores default striping for the whole
1650          * filesystem. historically we've been using a different approach
1651          * and store it in the config.
1652          */
1653         dt_root_get(env, dev->lod_child, &info->lti_fid);
1654         is_root = lu_fid_eq(&info->lti_fid, lu_object_fid(&dt->do_lu));
1655
1656         if (is_root && strcmp(XATTR_NAME_LOV, name) == 0) {
1657                 struct lov_user_md *lum = buf->lb_buf;
1658                 struct lov_desc *desc = &dev->lod_ost_descs.ltd_lov_desc;
1659
1660                 if (buf->lb_buf == NULL) {
1661                         rc = sizeof(*lum);
1662                 } else if (buf->lb_len >= sizeof(*lum)) {
1663                         lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
1664                         lmm_oi_set_seq(&lum->lmm_oi, FID_SEQ_LOV_DEFAULT);
1665                         lmm_oi_set_id(&lum->lmm_oi, 0);
1666                         lmm_oi_cpu_to_le(&lum->lmm_oi, &lum->lmm_oi);
1667                         lum->lmm_pattern = cpu_to_le32(desc->ld_pattern);
1668                         lum->lmm_stripe_size = cpu_to_le32(
1669                                                 desc->ld_default_stripe_size);
1670                         lum->lmm_stripe_count = cpu_to_le16(
1671                                                 desc->ld_default_stripe_count);
1672                         lum->lmm_stripe_offset = cpu_to_le16(
1673                                                 desc->ld_default_stripe_offset);
1674                         rc = sizeof(*lum);
1675                 } else {
1676                         rc = -ERANGE;
1677                 }
1678         }
1679
1680         RETURN(rc);
1681 }
1682
1683 /**
1684  * Verify LVM EA.
1685  *
1686  * Checks that the magic of the stripe is sane.
1687  *
1688  * \param[in] lod       lod device
1689  * \param[in] lum       a buffer storing LMV EA to verify
1690  *
1691  * \retval              0 if the EA is sane
1692  * \retval              negative otherwise
1693  */
1694 static int lod_verify_md_striping(struct lod_device *lod,
1695                                   const struct lmv_user_md_v1 *lum)
1696 {
1697         if (unlikely(le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC)) {
1698                 CERROR("%s: invalid lmv_user_md: magic = %x, "
1699                        "stripe_offset = %d, stripe_count = %u: rc = %d\n",
1700                        lod2obd(lod)->obd_name, le32_to_cpu(lum->lum_magic),
1701                        (int)le32_to_cpu(lum->lum_stripe_offset),
1702                        le32_to_cpu(lum->lum_stripe_count), -EINVAL);
1703                 return -EINVAL;
1704         }
1705
1706         return 0;
1707 }
1708
1709 /**
1710  * Initialize LMV EA for a slave.
1711  *
1712  * Initialize slave's LMV EA from the master's LMV EA.
1713  *
1714  * \param[in] master_lmv        a buffer containing master's EA
1715  * \param[out] slave_lmv        a buffer where slave's EA will be stored
1716  *
1717  */
1718 static void lod_prep_slave_lmv_md(struct lmv_mds_md_v1 *slave_lmv,
1719                                   const struct lmv_mds_md_v1 *master_lmv)
1720 {
1721         *slave_lmv = *master_lmv;
1722         slave_lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
1723 }
1724
1725 /**
1726  * Generate LMV EA.
1727  *
1728  * Generate LMV EA from the object passed as \a dt. The object must have
1729  * the stripes created and initialized.
1730  *
1731  * \param[in] env       execution environment
1732  * \param[in] dt        object
1733  * \param[out] lmv_buf  buffer storing generated LMV EA
1734  *
1735  * \retval              0 on success
1736  * \retval              negative if failed
1737  */
1738 static int lod_prep_lmv_md(const struct lu_env *env, struct dt_object *dt,
1739                            struct lu_buf *lmv_buf)
1740 {
1741         struct lod_thread_info  *info = lod_env_info(env);
1742         struct lod_device       *lod = lu2lod_dev(dt->do_lu.lo_dev);
1743         struct lod_object       *lo = lod_dt_obj(dt);
1744         struct lmv_mds_md_v1    *lmm1;
1745         int                     stripe_count;
1746         int                     type = LU_SEQ_RANGE_ANY;
1747         int                     rc;
1748         __u32                   mdtidx;
1749         ENTRY;
1750
1751         LASSERT(lo->ldo_dir_striped != 0);
1752         LASSERT(lo->ldo_dir_stripe_count > 0);
1753         stripe_count = lo->ldo_dir_stripe_count;
1754         /* Only store the LMV EA heahder on the disk. */
1755         if (info->lti_ea_store_size < sizeof(*lmm1)) {
1756                 rc = lod_ea_store_resize(info, sizeof(*lmm1));
1757                 if (rc != 0)
1758                         RETURN(rc);
1759         } else {
1760                 memset(info->lti_ea_store, 0, sizeof(*lmm1));
1761         }
1762
1763         lmm1 = (struct lmv_mds_md_v1 *)info->lti_ea_store;
1764         memset(lmm1, 0, sizeof(*lmm1));
1765         lmm1->lmv_magic = cpu_to_le32(LMV_MAGIC);
1766         lmm1->lmv_stripe_count = cpu_to_le32(stripe_count);
1767         lmm1->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type);
1768         lmm1->lmv_layout_version = cpu_to_le32(lo->ldo_dir_layout_version);
1769         if (lod_is_layout_changing(lo)) {
1770                 lmm1->lmv_migrate_hash = cpu_to_le32(lo->ldo_dir_migrate_hash);
1771                 lmm1->lmv_migrate_offset =
1772                         cpu_to_le32(lo->ldo_dir_migrate_offset);
1773         }
1774         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu),
1775                             &mdtidx, &type);
1776         if (rc != 0)
1777                 RETURN(rc);
1778
1779         lmm1->lmv_master_mdt_index = cpu_to_le32(mdtidx);
1780         lmv_buf->lb_buf = info->lti_ea_store;
1781         lmv_buf->lb_len = sizeof(*lmm1);
1782
1783         RETURN(rc);
1784 }
1785
1786 /**
1787  * Create in-core represenation for a striped directory.
1788  *
1789  * Parse the buffer containing LMV EA and instantiate LU objects
1790  * representing the stripe objects. The pointers to the objects are
1791  * stored in ldo_stripe field of \a lo. This function is used when
1792  * we need to access an already created object (i.e. load from a disk).
1793  *
1794  * \param[in] env       execution environment
1795  * \param[in] lo        lod object
1796  * \param[in] buf       buffer containing LMV EA
1797  *
1798  * \retval              0 on success
1799  * \retval              negative if failed
1800  */
1801 int lod_parse_dir_striping(const struct lu_env *env, struct lod_object *lo,
1802                            const struct lu_buf *buf)
1803 {
1804         struct lod_thread_info  *info = lod_env_info(env);
1805         struct lod_device       *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1806         struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
1807         struct dt_object        **stripe;
1808         union lmv_mds_md        *lmm = buf->lb_buf;
1809         struct lmv_mds_md_v1    *lmv1 = &lmm->lmv_md_v1;
1810         struct lu_fid           *fid = &info->lti_fid;
1811         unsigned int            i;
1812         int                     rc = 0;
1813         ENTRY;
1814
1815         LASSERT(mutex_is_locked(&lo->ldo_layout_mutex));
1816
1817         /* XXX may be useless as not called for foreign LMV ?? */
1818         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_FOREIGN)
1819                 RETURN(0);
1820
1821         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_STRIPE) {
1822                 lo->ldo_dir_slave_stripe = 1;
1823                 RETURN(0);
1824         }
1825
1826         if (!lmv_is_sane(lmv1))
1827                 RETURN(-EINVAL);
1828
1829         LASSERT(lo->ldo_stripe == NULL);
1830         OBD_ALLOC_PTR_ARRAY(stripe, le32_to_cpu(lmv1->lmv_stripe_count));
1831         if (stripe == NULL)
1832                 RETURN(-ENOMEM);
1833
1834         for (i = 0; i < le32_to_cpu(lmv1->lmv_stripe_count); i++) {
1835                 struct dt_device        *tgt_dt;
1836                 struct dt_object        *dto;
1837                 int                     type = LU_SEQ_RANGE_ANY;
1838                 __u32                   idx;
1839
1840                 fid_le_to_cpu(fid, &lmv1->lmv_stripe_fids[i]);
1841                 if (!fid_is_sane(fid)) {
1842                         stripe[i] = NULL;
1843                         continue;
1844                 }
1845
1846                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
1847                 if (rc != 0)
1848                         GOTO(out, rc);
1849
1850                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
1851                         tgt_dt = lod->lod_child;
1852                 } else {
1853                         struct lod_tgt_desc     *tgt;
1854
1855                         tgt = LTD_TGT(ltd, idx);
1856                         if (tgt == NULL)
1857                                 GOTO(out, rc = -ESTALE);
1858                         tgt_dt = tgt->ltd_tgt;
1859                 }
1860
1861                 dto = dt_locate_at(env, tgt_dt, fid,
1862                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1863                                   NULL);
1864                 if (IS_ERR(dto))
1865                         GOTO(out, rc = PTR_ERR(dto));
1866
1867                 stripe[i] = dto;
1868         }
1869 out:
1870         lo->ldo_stripe = stripe;
1871         lo->ldo_is_foreign = 0;
1872         lo->ldo_dir_stripe_count = le32_to_cpu(lmv1->lmv_stripe_count);
1873         lo->ldo_dir_stripes_allocated = le32_to_cpu(lmv1->lmv_stripe_count);
1874         lo->ldo_dir_layout_version = le32_to_cpu(lmv1->lmv_layout_version);
1875         lo->ldo_dir_migrate_offset = le32_to_cpu(lmv1->lmv_migrate_offset);
1876         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv1->lmv_migrate_hash);
1877         lo->ldo_dir_hash_type = le32_to_cpu(lmv1->lmv_hash_type);
1878         if (rc != 0)
1879                 lod_striping_free_nolock(env, lo);
1880
1881         RETURN(rc);
1882 }
1883
1884 /**
1885  * Declare create a striped directory.
1886  *
1887  * Declare creating a striped directory with a given stripe pattern on the
1888  * specified MDTs. A striped directory is represented as a regular directory
1889  * - an index listing all the stripes. The stripes point back to the master
1890  * object with ".." and LinkEA. The master object gets LMV EA which
1891  * identifies it as a striped directory. The function allocates FIDs
1892  * for all stripes.
1893  *
1894  * \param[in] env       execution environment
1895  * \param[in] dt        object
1896  * \param[in] attr      attributes to initialize the objects with
1897  * \param[in] dof       type of objects to be created
1898  * \param[in] th        transaction handle
1899  *
1900  * \retval              0 on success
1901  * \retval              negative if failed
1902  */
1903 static int lod_dir_declare_create_stripes(const struct lu_env *env,
1904                                           struct dt_object *dt,
1905                                           struct lu_attr *attr,
1906                                           struct dt_object_format *dof,
1907                                           struct thandle *th)
1908 {
1909         struct lod_thread_info  *info = lod_env_info(env);
1910         struct lu_buf           lmv_buf;
1911         struct lu_buf           slave_lmv_buf;
1912         struct lmv_mds_md_v1    *lmm;
1913         struct lmv_mds_md_v1    *slave_lmm = NULL;
1914         struct dt_insert_rec    *rec = &info->lti_dt_rec;
1915         struct lod_object       *lo = lod_dt_obj(dt);
1916         int                     rc;
1917         __u32                   i;
1918         ENTRY;
1919
1920         rc = lod_prep_lmv_md(env, dt, &lmv_buf);
1921         if (rc != 0)
1922                 GOTO(out, rc);
1923         lmm = lmv_buf.lb_buf;
1924
1925         OBD_ALLOC_PTR(slave_lmm);
1926         if (slave_lmm == NULL)
1927                 GOTO(out, rc = -ENOMEM);
1928
1929         lod_prep_slave_lmv_md(slave_lmm, lmm);
1930         slave_lmv_buf.lb_buf = slave_lmm;
1931         slave_lmv_buf.lb_len = sizeof(*slave_lmm);
1932
1933         if (!dt_try_as_dir(env, dt_object_child(dt)))
1934                 GOTO(out, rc = -EINVAL);
1935
1936         rec->rec_type = S_IFDIR;
1937         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1938                 struct dt_object        *dto = lo->ldo_stripe[i];
1939                 char                    *stripe_name = info->lti_key;
1940                 struct lu_name          *sname;
1941                 struct linkea_data       ldata          = { NULL };
1942                 struct lu_buf           linkea_buf;
1943
1944                 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
1945                 if (!dto)
1946                         continue;
1947
1948                 /* directory split skip create for existing stripes */
1949                 if (!(lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
1950                         rc = lod_sub_declare_create(env, dto, attr, NULL, dof,
1951                                                     th);
1952                         if (rc != 0)
1953                                 GOTO(out, rc);
1954
1955                         if (!dt_try_as_dir(env, dto))
1956                                 GOTO(out, rc = -EINVAL);
1957
1958                         rc = lod_sub_declare_ref_add(env, dto, th);
1959                         if (rc != 0)
1960                                 GOTO(out, rc);
1961
1962                         rec->rec_fid = lu_object_fid(&dto->do_lu);
1963                         rc = lod_sub_declare_insert(env, dto,
1964                                                     (const struct dt_rec *)rec,
1965                                                     (const struct dt_key *)dot,
1966                                                     th);
1967                         if (rc != 0)
1968                                 GOTO(out, rc);
1969
1970                         /* master stripe FID will be put to .. */
1971                         rec->rec_fid = lu_object_fid(&dt->do_lu);
1972                         rc = lod_sub_declare_insert(env, dto,
1973                                                   (const struct dt_rec *)rec,
1974                                                   (const struct dt_key *)dotdot,
1975                                                   th);
1976                         if (rc != 0)
1977                                 GOTO(out, rc);
1978
1979                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
1980                             cfs_fail_val == i)
1981                                 snprintf(stripe_name, sizeof(info->lti_key),
1982                                          DFID":%u",
1983                                          PFID(lu_object_fid(&dto->do_lu)),
1984                                          i + 1);
1985                         else
1986                                 snprintf(stripe_name, sizeof(info->lti_key),
1987                                          DFID":%u",
1988                                          PFID(lu_object_fid(&dto->do_lu)), i);
1989
1990                         sname = lod_name_get(env, stripe_name,
1991                                              strlen(stripe_name));
1992                         rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
1993                                               sname, lu_object_fid(&dt->do_lu));
1994                         if (rc != 0)
1995                                 GOTO(out, rc);
1996
1997                         linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
1998                         linkea_buf.lb_len = ldata.ld_leh->leh_len;
1999                         rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
2000                                                        XATTR_NAME_LINK, 0, th);
2001                         if (rc != 0)
2002                                 GOTO(out, rc);
2003
2004                         rec->rec_fid = lu_object_fid(&dto->do_lu);
2005                         rc = lod_sub_declare_insert(env, dt_object_child(dt),
2006                                         (const struct dt_rec *)rec,
2007                                         (const struct dt_key *)stripe_name, th);
2008                         if (rc != 0)
2009                                 GOTO(out, rc);
2010
2011                         rc = lod_sub_declare_ref_add(env, dt_object_child(dt),
2012                                                      th);
2013                         if (rc != 0)
2014                                 GOTO(out, rc);
2015                 }
2016
2017                 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
2018                     cfs_fail_val != i) {
2019                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
2020                             cfs_fail_val == i)
2021                                 slave_lmm->lmv_master_mdt_index =
2022                                                         cpu_to_le32(i + 1);
2023                         else
2024                                 slave_lmm->lmv_master_mdt_index =
2025                                                         cpu_to_le32(i);
2026                         rc = lod_sub_declare_xattr_set(env, dto, &slave_lmv_buf,
2027                                                        XATTR_NAME_LMV, 0, th);
2028                         if (rc != 0)
2029                                 GOTO(out, rc);
2030                 }
2031         }
2032
2033         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt),
2034                                        &lmv_buf, XATTR_NAME_LMV, 0, th);
2035         if (rc != 0)
2036                 GOTO(out, rc);
2037 out:
2038         if (slave_lmm != NULL)
2039                 OBD_FREE_PTR(slave_lmm);
2040
2041         RETURN(rc);
2042 }
2043
2044 /**
2045  * Allocate a striping on a predefined set of MDTs.
2046  *
2047  * Allocates new striping using the MDT index range provided by the data from
2048  * the lum_obejcts contained in the lmv_user_md passed to this method if
2049  * \a is_specific is true; or allocates new layout starting from MDT index in
2050  * lo->ldo_dir_stripe_offset. The exact order of MDTs is not important and
2051  * varies depending on MDT status. The number of stripes needed and stripe
2052  * offset are taken from the object. If that number cannot be met, then the
2053  * function returns an error and then it's the caller's responsibility to
2054  * release the stripes allocated. All the internal structures are protected,
2055  * but no concurrent allocation is allowed on the same objects.
2056  *
2057  * \param[in] env               execution environment for this thread
2058  * \param[in] lo                LOD object
2059  * \param[out] stripes          striping created
2060  * \param[out] mdt_indices      MDT indices of striping created
2061  * \param[in] is_specific       true if the MDTs are provided by lum; false if
2062  *                              only the starting MDT index is provided
2063  *
2064  * \retval positive     stripes allocated, including the first stripe allocated
2065  *                      outside
2066  * \retval negative     errno on failure
2067  */
2068 static int lod_mdt_alloc_specific(const struct lu_env *env,
2069                                   struct lod_object *lo,
2070                                   struct dt_object **stripes,
2071                                   __u32 *mdt_indices, bool is_specific)
2072 {
2073         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
2074         struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
2075         struct lu_tgt_desc *tgt = NULL;
2076         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2077         struct dt_device *tgt_dt = NULL;
2078         struct lu_fid fid = { 0 };
2079         struct dt_object *dto;
2080         u32 master_index;
2081         u32 stripe_count = lo->ldo_dir_stripe_count;
2082         int stripe_idx = 1;
2083         int j;
2084         int idx;
2085         int rc;
2086
2087         master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2088         if (!is_specific && stripe_count > 1)
2089                 /* Set the start index for the 2nd stripe allocation */
2090                 mdt_indices[1] = (mdt_indices[0] + 1) %
2091                                         (lod->lod_remote_mdt_count + 1);
2092
2093         for (; stripe_idx < stripe_count; stripe_idx++) {
2094                 /* Try to find next avaible target */
2095                 idx = mdt_indices[stripe_idx];
2096                 for (j = 0; j < lod->lod_remote_mdt_count;
2097                      j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) {
2098                         bool already_allocated = false;
2099                         __u32 k;
2100
2101                         CDEBUG(D_INFO, "try idx %d, mdt cnt %u, allocated %u\n",
2102                                idx, lod->lod_remote_mdt_count + 1, stripe_idx);
2103
2104                         if (likely(!is_specific &&
2105                                    !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) {
2106                                 /* check whether the idx already exists
2107                                  * in current allocated array */
2108                                 for (k = 0; k < stripe_idx; k++) {
2109                                         if (mdt_indices[k] == idx) {
2110                                                 already_allocated = true;
2111                                                 break;
2112                                         }
2113                                 }
2114
2115                                 if (already_allocated)
2116                                         continue;
2117                         }
2118
2119                         /* Sigh, this index is not in the bitmap, let's check
2120                          * next available target */
2121                         if (!test_bit(idx, ltd->ltd_tgt_bitmap) &&
2122                             idx != master_index)
2123                                 continue;
2124
2125                         if (idx == master_index) {
2126                                 /* Allocate the FID locally */
2127                                 tgt_dt = lod->lod_child;
2128                                 rc = dt_fid_alloc(env, tgt_dt, &fid, NULL,
2129                                                   NULL);
2130                                 if (rc < 0)
2131                                         continue;
2132                                 break;
2133                         }
2134
2135                         /* check the status of the OSP */
2136                         tgt = LTD_TGT(ltd, idx);
2137                         if (!tgt)
2138                                 continue;
2139
2140                         tgt_dt = tgt->ltd_tgt;
2141                         if (!tgt->ltd_active)
2142                                 /* this OSP doesn't feel well */
2143                                 continue;
2144
2145                         rc = dt_fid_alloc(env, tgt_dt, &fid, NULL, NULL);
2146                         if (rc < 0)
2147                                 continue;
2148
2149                         break;
2150                 }
2151
2152                 /* Can not allocate more stripes */
2153                 if (j == lod->lod_remote_mdt_count) {
2154                         CDEBUG(D_INFO, "%s: require stripes %u only get %d\n",
2155                                lod2obd(lod)->obd_name, stripe_count,
2156                                stripe_idx);
2157                         break;
2158                 }
2159
2160                 CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n",
2161                        idx, stripe_idx, PFID(&fid));
2162                 mdt_indices[stripe_idx] = idx;
2163                 /* Set the start index for next stripe allocation */
2164                 if (!is_specific && stripe_idx < stripe_count - 1) {
2165                         /*
2166                          * for large dir test, put all other slaves on one
2167                          * remote MDT, otherwise we may save too many local
2168                          * slave locks which will exceed RS_MAX_LOCKS.
2169                          */
2170                         if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)))
2171                                 idx = master_index;
2172                         mdt_indices[stripe_idx + 1] = (idx + 1) %
2173                                            (lod->lod_remote_mdt_count + 1);
2174                 }
2175                 /* tgt_dt and fid must be ready after search avaible OSP
2176                  * in the above loop */
2177                 LASSERT(tgt_dt != NULL);
2178                 LASSERT(fid_is_sane(&fid));
2179
2180                 /* fail a remote stripe FID allocation */
2181                 if (stripe_idx && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID))
2182                         continue;
2183
2184                 dto = dt_locate_at(env, tgt_dt, &fid,
2185                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
2186                                   &conf);
2187                 if (IS_ERR(dto)) {
2188                         rc = PTR_ERR(dto);
2189                         goto error;
2190                 }
2191
2192                 stripes[stripe_idx] = dto;
2193         }
2194
2195         return stripe_idx;
2196
2197 error:
2198         for (j = 1; j < stripe_idx; j++) {
2199                 LASSERT(stripes[j] != NULL);
2200                 dt_object_put(env, stripes[j]);
2201                 stripes[j] = NULL;
2202         }
2203         return rc;
2204 }
2205
2206 static int lod_prep_md_striped_create(const struct lu_env *env,
2207                                       struct dt_object *dt,
2208                                       struct lu_attr *attr,
2209                                       const struct lmv_user_md_v1 *lum,
2210                                       struct dt_object_format *dof,
2211                                       struct thandle *th)
2212 {
2213         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
2214         struct lod_object *lo = lod_dt_obj(dt);
2215         struct dt_object **stripes;
2216         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2217         struct lu_fid fid = { 0 };
2218         __u32 stripe_count;
2219         int i;
2220         int rc = 0;
2221
2222         ENTRY;
2223
2224         /* The lum has been verifed in lod_verify_md_striping */
2225         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
2226                 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
2227
2228         stripe_count = lo->ldo_dir_stripe_count;
2229
2230         OBD_ALLOC_PTR_ARRAY(stripes, stripe_count);
2231         if (!stripes)
2232                 RETURN(-ENOMEM);
2233
2234         /* Allocate the first stripe locally */
2235         rc = dt_fid_alloc(env, lod->lod_child, &fid, NULL, NULL);
2236         if (rc < 0)
2237                 GOTO(out, rc);
2238
2239         stripes[0] = dt_locate_at(env, lod->lod_child, &fid,
2240                                   dt->do_lu.lo_dev->ld_site->ls_top_dev, &conf);
2241         if (IS_ERR(stripes[0]))
2242                 GOTO(out, rc = PTR_ERR(stripes[0]));
2243
2244         if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) {
2245                 lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
2246                 rc = lod_mdt_alloc_qos(env, lo, stripes, 1, stripe_count);
2247                 if (rc == -EAGAIN)
2248                         rc = lod_mdt_alloc_rr(env, lo, stripes, 1,
2249                                               stripe_count);
2250         } else {
2251                 int *idx_array;
2252                 bool is_specific = false;
2253
2254                 OBD_ALLOC_PTR_ARRAY(idx_array, stripe_count);
2255                 if (!idx_array)
2256                         GOTO(out, rc = -ENOMEM);
2257
2258                 if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
2259                         is_specific = true;
2260                         for (i = 0; i < stripe_count; i++)
2261                                 idx_array[i] =
2262                                        le32_to_cpu(lum->lum_objects[i].lum_mds);
2263                 }
2264
2265                 /* stripe 0 is local */
2266                 idx_array[0] =
2267                         lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2268                 rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array,
2269                                             is_specific);
2270                 OBD_FREE_PTR_ARRAY(idx_array, stripe_count);
2271         }
2272
2273         if (rc < 0)
2274                 GOTO(out, rc);
2275
2276         LASSERT(rc > 0);
2277
2278         lo->ldo_dir_striped = 1;
2279         lo->ldo_stripe = stripes;
2280         lo->ldo_dir_stripe_count = rc;
2281         lo->ldo_dir_stripes_allocated = stripe_count;
2282         smp_mb();
2283         lo->ldo_dir_stripe_loaded = 1;
2284
2285         rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2286         if (rc < 0)
2287                 lod_striping_free(env, lo);
2288
2289         RETURN(rc);
2290
2291 out:
2292         LASSERT(rc < 0);
2293         if (!IS_ERR_OR_NULL(stripes[0]))
2294                 dt_object_put(env, stripes[0]);
2295         for (i = 1; i < stripe_count; i++)
2296                 LASSERT(!stripes[i]);
2297         OBD_FREE_PTR_ARRAY(stripes, stripe_count);
2298
2299         return rc;
2300 }
2301
2302 /**
2303  *
2304  * Alloc cached foreign LOV
2305  *
2306  * \param[in] lo        object
2307  * \param[in] size      size of foreign LOV
2308  *
2309  * \retval              0 on success
2310  * \retval              negative if failed
2311  */
2312 int lod_alloc_foreign_lov(struct lod_object *lo, size_t size)
2313 {
2314         OBD_ALLOC_LARGE(lo->ldo_foreign_lov, size);
2315         if (lo->ldo_foreign_lov == NULL)
2316                 return -ENOMEM;
2317         lo->ldo_foreign_lov_size = size;
2318         lo->ldo_is_foreign = 1;
2319         return 0;
2320 }
2321
2322 /**
2323  *
2324  * Free cached foreign LOV
2325  *
2326  * \param[in] lo        object
2327  */
2328 void lod_free_foreign_lov(struct lod_object *lo)
2329 {
2330         if (lo->ldo_foreign_lov != NULL)
2331                 OBD_FREE_LARGE(lo->ldo_foreign_lov, lo->ldo_foreign_lov_size);
2332         lo->ldo_foreign_lov = NULL;
2333         lo->ldo_foreign_lov_size = 0;
2334         lo->ldo_is_foreign = 0;
2335 }
2336
2337 /**
2338  *
2339  * Alloc cached foreign LMV
2340  *
2341  * \param[in] lo        object
2342  * \param[in] size      size of foreign LMV
2343  *
2344  * \retval              0 on success
2345  * \retval              negative if failed
2346  */
2347 int lod_alloc_foreign_lmv(struct lod_object *lo, size_t size)
2348 {
2349         OBD_ALLOC_LARGE(lo->ldo_foreign_lmv, size);
2350         if (lo->ldo_foreign_lmv == NULL)
2351                 return -ENOMEM;
2352         lo->ldo_foreign_lmv_size = size;
2353         lo->ldo_is_foreign = 1;
2354
2355         return 0;
2356 }
2357
2358 /**
2359  *
2360  * Free cached foreign LMV
2361  *
2362  * \param[in] lo        object
2363  */
2364 void lod_free_foreign_lmv(struct lod_object *lo)
2365 {
2366         if (lo->ldo_foreign_lmv != NULL)
2367                 OBD_FREE_LARGE(lo->ldo_foreign_lmv, lo->ldo_foreign_lmv_size);
2368         lo->ldo_foreign_lmv = NULL;
2369         lo->ldo_foreign_lmv_size = 0;
2370         lo->ldo_is_foreign = 0;
2371 }
2372
2373 /**
2374  * Declare create striped md object.
2375  *
2376  * The function declares intention to create a striped directory. This is a
2377  * wrapper for lod_prep_md_striped_create(). The only additional functionality
2378  * is to verify pattern \a lum_buf is good. Check that function for the details.
2379  *
2380  * \param[in] env       execution environment
2381  * \param[in] dt        object
2382  * \param[in] attr      attributes to initialize the objects with
2383  * \param[in] lum_buf   a pattern specifying the number of stripes and
2384  *                      MDT to start from
2385  * \param[in] dof       type of objects to be created
2386  * \param[in] th        transaction handle
2387  *
2388  * \retval              0 on success
2389  * \retval              negative if failed
2390  *
2391  */
2392 static int lod_declare_xattr_set_lmv(const struct lu_env *env,
2393                                      struct dt_object *dt,
2394                                      struct lu_attr *attr,
2395                                      const struct lu_buf *lum_buf,
2396                                      struct dt_object_format *dof,
2397                                      struct thandle *th)
2398 {
2399         struct lod_object *lo = lod_dt_obj(dt);
2400         struct lmv_user_md_v1 *lum = lum_buf->lb_buf;
2401         int rc;
2402
2403         ENTRY;
2404         LASSERT(lum != NULL);
2405
2406         CDEBUG(D_INFO,
2407                "lum magic=%x hash=%x count=%u offset=%d inherit=%u rr=%u\n",
2408                le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_hash_type),
2409                le32_to_cpu(lum->lum_stripe_count),
2410                (int)le32_to_cpu(lum->lum_stripe_offset),
2411                lum->lum_max_inherit, lum->lum_max_inherit_rr);
2412
2413         if (lo->ldo_dir_stripe_count == 0) {
2414                 if (lo->ldo_is_foreign) {
2415                         rc = lod_alloc_foreign_lmv(lo, lum_buf->lb_len);
2416                         if (rc != 0)
2417                                 GOTO(out, rc);
2418                         memcpy(lo->ldo_foreign_lmv, lum, lum_buf->lb_len);
2419                         lo->ldo_dir_stripe_loaded = 1;
2420                 }
2421                 GOTO(out, rc = 0);
2422         }
2423
2424         /* prepare dir striped objects */
2425         rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
2426         if (rc != 0) {
2427                 /* failed to create striping, let's reset
2428                  * config so that others don't get confused */
2429                 lod_striping_free(env, lo);
2430                 GOTO(out, rc);
2431         }
2432 out:
2433         RETURN(rc);
2434 }
2435
2436 /**
2437  * Set or replace striped directory layout, and LFSCK may set layout on a plain
2438  * directory, so don't check stripe count.
2439  *
2440  * \param[in] env       execution environment
2441  * \param[in] dt        target object
2442  * \param[in] lmv_buf   LMV buf which contains source stripe FIDs
2443  * \param[in] fl        set or replace
2444  * \param[in] th        transaction handle
2445  *
2446  * \retval              0 on success
2447  * \retval              negative if failed
2448  */
2449 static int lod_dir_layout_set(const struct lu_env *env,
2450                               struct dt_object *dt,
2451                               const struct lu_buf *lmv_buf,
2452                               int fl,
2453                               struct thandle *th)
2454 {
2455         struct dt_object *next = dt_object_child(dt);
2456         struct lod_object *lo = lod_dt_obj(dt);
2457         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2458         struct lmv_mds_md_v1 *lmv = lmv_buf->lb_buf;
2459         struct lmv_mds_md_v1 *slave_lmv;
2460         struct lu_buf slave_buf;
2461         int i;
2462         int rc;
2463
2464         ENTRY;
2465
2466         if (!lmv_is_sane2(lmv))
2467                 RETURN(-EINVAL);
2468
2469         /* adjust hash for dir merge, which may not be set in user command */
2470         if (lmv_is_merging(lmv) &&
2471             !(lmv->lmv_migrate_hash & LMV_HASH_TYPE_MASK))
2472                 lmv->lmv_merge_hash |=
2473                         lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern &
2474                         LMV_HASH_TYPE_MASK;
2475
2476         LMV_DEBUG(D_INFO, lmv, "set");
2477
2478         rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV, fl, th);
2479         if (rc)
2480                 RETURN(rc);
2481
2482         /* directory restripe may update stripe LMV directly */
2483         if (!lo->ldo_dir_stripe_count)
2484                 RETURN(0);
2485
2486         lo->ldo_dir_hash_type = le32_to_cpu(lmv->lmv_hash_type);
2487         lo->ldo_dir_migrate_offset = le32_to_cpu(lmv->lmv_migrate_offset);
2488         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_migrate_hash);
2489         lo->ldo_dir_layout_version = le32_to_cpu(lmv->lmv_layout_version);
2490
2491         OBD_ALLOC_PTR(slave_lmv);
2492         if (!slave_lmv)
2493                 RETURN(-ENOMEM);
2494
2495         lod_prep_slave_lmv_md(slave_lmv, lmv);
2496         slave_buf.lb_buf = slave_lmv;
2497         slave_buf.lb_len = sizeof(*slave_lmv);
2498
2499         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2500                 if (!lo->ldo_stripe[i])
2501                         continue;
2502
2503                 if (!dt_object_exists(lo->ldo_stripe[i]))
2504                         continue;
2505
2506                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], &slave_buf,
2507                                        XATTR_NAME_LMV, fl, th);
2508                 if (rc)
2509                         break;
2510         }
2511
2512         OBD_FREE_PTR(slave_lmv);
2513
2514         RETURN(rc);
2515 }
2516
2517 /**
2518  * Implementation of dt_object_operations::do_declare_xattr_set.
2519  *
2520  * Used with regular (non-striped) objects. Basically it
2521  * initializes the striping information and applies the
2522  * change to all the stripes.
2523  *
2524  * \see dt_object_operations::do_declare_xattr_set() in the API description
2525  * for details.
2526  */
2527 static int lod_dir_declare_xattr_set(const struct lu_env *env,
2528                                      struct dt_object *dt,
2529                                      const struct lu_buf *buf,
2530                                      const char *name, int fl,
2531                                      struct thandle *th)
2532 {
2533         struct dt_object        *next = dt_object_child(dt);
2534         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2535         struct lod_object       *lo = lod_dt_obj(dt);
2536         int                     i;
2537         int                     rc;
2538         ENTRY;
2539
2540         if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
2541                 struct lmv_user_md_v1 *lum;
2542
2543                 LASSERT(buf != NULL && buf->lb_buf != NULL);
2544                 lum = buf->lb_buf;
2545                 rc = lod_verify_md_striping(d, lum);
2546                 if (rc != 0)
2547                         RETURN(rc);
2548         } else if (strcmp(name, XATTR_NAME_LOV) == 0) {
2549                 rc = lod_verify_striping(env, d, lo, buf, false);
2550                 if (rc != 0)
2551                         RETURN(rc);
2552         }
2553
2554         rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
2555         if (rc != 0)
2556                 RETURN(rc);
2557
2558         /* Note: Do not set LinkEA on sub-stripes, otherwise
2559          * it will confuse the fid2path process(see mdt_path_current()).
2560          * The linkEA between master and sub-stripes is set in
2561          * lod_xattr_set_lmv(). */
2562         if (strcmp(name, XATTR_NAME_LINK) == 0)
2563                 RETURN(0);
2564
2565         /* set xattr to each stripes, if needed */
2566         rc = lod_striping_load(env, lo);
2567         if (rc != 0)
2568                 RETURN(rc);
2569
2570         if (lo->ldo_dir_stripe_count == 0)
2571                 RETURN(0);
2572
2573         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2574                 if (!lo->ldo_stripe[i])
2575                         continue;
2576
2577                 if (!dt_object_exists(lo->ldo_stripe[i]))
2578                         continue;
2579
2580                 rc = lod_sub_declare_xattr_set(env, lo->ldo_stripe[i],
2581                                                buf, name, fl, th);
2582                 if (rc != 0)
2583                         break;
2584         }
2585
2586         RETURN(rc);
2587 }
2588
2589 static int
2590 lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
2591                                      struct lod_object *lo,
2592                                      struct dt_object *dt, struct thandle *th,
2593                                      int comp_idx, int stripe_idx,
2594                                      struct lod_obj_stripe_cb_data *data)
2595 {
2596         struct lod_thread_info *info = lod_env_info(env);
2597         struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
2598         struct filter_fid *ff = &info->lti_ff;
2599         struct lu_buf *buf = &info->lti_buf;
2600         int rc;
2601
2602         buf->lb_buf = ff;
2603         buf->lb_len = sizeof(*ff);
2604         rc = dt_xattr_get(env, dt, buf, XATTR_NAME_FID);
2605         if (rc < 0) {
2606                 if (rc == -ENODATA)
2607                         return 0;
2608                 return rc;
2609         }
2610
2611         /*
2612          * locd_buf is set if it's called by dir migration, which doesn't check
2613          * pfid and comp id.
2614          */
2615         if (data->locd_buf) {
2616                 memset(ff, 0, sizeof(*ff));
2617                 ff->ff_parent = *(struct lu_fid *)data->locd_buf->lb_buf;
2618         } else {
2619                 filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
2620
2621                 if (lu_fid_eq(lod_object_fid(lo), &ff->ff_parent) &&
2622                     ff->ff_layout.ol_comp_id == comp->llc_id)
2623                         return 0;
2624
2625                 memset(ff, 0, sizeof(*ff));
2626                 ff->ff_parent = *lu_object_fid(&lo->ldo_obj.do_lu);
2627         }
2628
2629         /* rewrite filter_fid */
2630         ff->ff_parent.f_ver = stripe_idx;
2631         ff->ff_layout.ol_stripe_size = comp->llc_stripe_size;
2632         ff->ff_layout.ol_stripe_count = comp->llc_stripe_count;
2633         ff->ff_layout.ol_comp_id = comp->llc_id;
2634         ff->ff_layout.ol_comp_start = comp->llc_extent.e_start;
2635         ff->ff_layout.ol_comp_end = comp->llc_extent.e_end;
2636         filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
2637
2638         if (data->locd_declare)
2639                 rc = lod_sub_declare_xattr_set(env, dt, buf, XATTR_NAME_FID,
2640                                                LU_XATTR_REPLACE, th);
2641         else
2642                 rc = lod_sub_xattr_set(env, dt, buf, XATTR_NAME_FID,
2643                                        LU_XATTR_REPLACE, th);
2644
2645         return rc;
2646 }
2647
2648 /**
2649  * Reset parent FID on OST object
2650  *
2651  * Replace parent FID with @dt object FID, which is only called during migration
2652  * to reset the parent FID after the MDT object is migrated to the new MDT, i.e.
2653  * the FID is changed.
2654  *
2655  * \param[in] env execution environment
2656  * \param[in] dt dt_object whose stripes's parent FID will be reset
2657  * \parem[in] th thandle
2658  * \param[in] declare if it is declare
2659  *
2660  * \retval      0 if reset succeeds
2661  * \retval      negative errno if reset fails
2662  */
2663 static int lod_replace_parent_fid(const struct lu_env *env,
2664                                   struct dt_object *dt,
2665                                   const struct lu_buf *buf,
2666                                   struct thandle *th, bool declare)
2667 {
2668         struct lod_object *lo = lod_dt_obj(dt);
2669         struct lod_thread_info  *info = lod_env_info(env);
2670         struct filter_fid *ff;
2671         struct lod_obj_stripe_cb_data data = { { 0 } };
2672         int rc;
2673         ENTRY;
2674
2675         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
2676
2677         /* set xattr to each stripes, if needed */
2678         rc = lod_striping_load(env, lo);
2679         if (rc != 0)
2680                 RETURN(rc);
2681
2682         if (!lod_obj_is_striped(dt))
2683                 RETURN(0);
2684
2685         if (info->lti_ea_store_size < sizeof(*ff)) {
2686                 rc = lod_ea_store_resize(info, sizeof(*ff));
2687                 if (rc != 0)
2688                         RETURN(rc);
2689         }
2690
2691         data.locd_declare = declare;
2692         data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
2693         data.locd_buf = buf;
2694         rc = lod_obj_for_each_stripe(env, lo, th, &data);
2695
2696         RETURN(rc);
2697 }
2698
2699 __u16 lod_comp_entry_stripe_count(struct lod_object *lo,
2700                                   int comp_idx, bool is_dir)
2701 {
2702         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2703         struct lod_layout_component *entry;
2704
2705         if (is_dir)
2706                 return  0;
2707
2708         entry = &lo->ldo_comp_entries[comp_idx];
2709         if (lod_comp_inited(entry))
2710                 return entry->llc_stripe_count;
2711         else if ((__u16)-1 == entry->llc_stripe_count)
2712                 return lod->lod_ost_count;
2713         else
2714                 return lod_get_stripe_count(lod, lo, comp_idx,
2715                                             entry->llc_stripe_count,
2716                                             entry->llc_pattern &
2717                                             LOV_PATTERN_OVERSTRIPING);
2718 }
2719
2720 static int lod_comp_md_size(struct lod_object *lo, bool is_dir)
2721 {
2722         int magic, size = 0, i;
2723         struct lod_layout_component *comp_entries;
2724         __u16 comp_cnt;
2725         bool is_composite, is_foreign = false;
2726
2727         if (is_dir) {
2728                 comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
2729                 comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
2730                 is_composite =
2731                         lo->ldo_def_striping->lds_def_striping_is_composite;
2732         } else {
2733                 comp_cnt = lo->ldo_comp_cnt;
2734                 comp_entries = lo->ldo_comp_entries;
2735                 is_composite = lo->ldo_is_composite;
2736                 is_foreign = lo->ldo_is_foreign;
2737         }
2738
2739         if (is_foreign)
2740                 return lo->ldo_foreign_lov_size;
2741
2742         LASSERT(comp_cnt != 0 && comp_entries != NULL);
2743         if (is_composite) {
2744                 size = sizeof(struct lov_comp_md_v1) +
2745                        sizeof(struct lov_comp_md_entry_v1) * comp_cnt;
2746                 LASSERT(size % sizeof(__u64) == 0);
2747         }
2748
2749         for (i = 0; i < comp_cnt; i++) {
2750                 __u16 stripe_count;
2751
2752                 magic = comp_entries[i].llc_pool ? LOV_MAGIC_V3 : LOV_MAGIC_V1;
2753                 stripe_count = lod_comp_entry_stripe_count(lo, i, is_dir);
2754                 if (!is_dir && is_composite)
2755                         lod_comp_shrink_stripe_count(&comp_entries[i],
2756                                                      &stripe_count);
2757
2758                 size += lov_user_md_size(stripe_count, magic);
2759                 LASSERT(size % sizeof(__u64) == 0);
2760         }
2761         return size;
2762 }
2763
2764 /**
2765  * Declare component add. The xattr name is XATTR_LUSTRE_LOV.add, and
2766  * the xattr value is binary lov_comp_md_v1 which contains component(s)
2767  * to be added.
2768   *
2769  * \param[in] env       execution environment
2770  * \param[in] dt        dt_object to add components on
2771  * \param[in] buf       buffer contains components to be added
2772  * \parem[in] th        thandle
2773  *
2774  * \retval      0 on success
2775  * \retval      negative errno on failure
2776  */
2777 static int lod_declare_layout_add(const struct lu_env *env,
2778                                   struct dt_object *dt,
2779                                   const struct lu_buf *buf,
2780                                   struct thandle *th)
2781 {
2782         struct lod_thread_info  *info = lod_env_info(env);
2783         struct lod_layout_component *comp_array, *lod_comp, *old_array;
2784         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
2785         struct dt_object *next = dt_object_child(dt);
2786         struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
2787         struct lod_object *lo = lod_dt_obj(dt);
2788         struct lov_user_md_v3 *v3;
2789         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
2790         __u32 magic;
2791         int i, rc, array_cnt, old_array_cnt;
2792         ENTRY;
2793
2794         LASSERT(lo->ldo_is_composite);
2795
2796         if (lo->ldo_flr_state != LCM_FL_NONE)
2797                 RETURN(-EBUSY);
2798
2799         rc = lod_verify_striping(env, d, lo, buf, false);
2800         if (rc != 0)
2801                 RETURN(rc);
2802
2803         magic = comp_v1->lcm_magic;
2804         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2805                 lustre_swab_lov_comp_md_v1(comp_v1);
2806                 magic = comp_v1->lcm_magic;
2807         }
2808
2809         if (magic != LOV_USER_MAGIC_COMP_V1)
2810                 RETURN(-EINVAL);
2811
2812         mutex_lock(&lo->ldo_layout_mutex);
2813
2814         array_cnt = lo->ldo_comp_cnt + comp_v1->lcm_entry_count;
2815         OBD_ALLOC_PTR_ARRAY(comp_array, array_cnt);
2816         if (comp_array == NULL) {
2817                 mutex_unlock(&lo->ldo_layout_mutex);
2818                 RETURN(-ENOMEM);
2819         }
2820
2821
2822         memcpy(comp_array, lo->ldo_comp_entries,
2823                sizeof(*comp_array) * lo->ldo_comp_cnt);
2824
2825         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2826                 struct lov_user_md_v1 *v1;
2827                 struct lu_extent *ext;
2828
2829                 v1 = (struct lov_user_md *)((char *)comp_v1 +
2830                                 comp_v1->lcm_entries[i].lcme_offset);
2831                 ext = &comp_v1->lcm_entries[i].lcme_extent;
2832
2833                 lod_comp = &comp_array[lo->ldo_comp_cnt + i];
2834                 lod_comp->llc_extent.e_start = ext->e_start;
2835                 lod_comp->llc_extent.e_end = ext->e_end;
2836                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
2837                 lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
2838
2839                 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
2840                 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
2841                 lod_adjust_stripe_info(lod_comp, desc, 0);
2842
2843                 if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
2844                         v3 = (struct lov_user_md_v3 *) v1;
2845                         if (v3->lmm_pool_name[0] != '\0') {
2846                                 rc = lod_set_pool(&lod_comp->llc_pool,
2847                                                   v3->lmm_pool_name);
2848                                 if (rc)
2849                                         GOTO(error, rc);
2850                         }
2851                 }
2852         }
2853
2854         old_array = lo->ldo_comp_entries;
2855         old_array_cnt = lo->ldo_comp_cnt;
2856
2857         lo->ldo_comp_entries = comp_array;
2858         lo->ldo_comp_cnt = array_cnt;
2859
2860         /* No need to increase layout generation here, it will be increased
2861          * later when generating component ID for the new components */
2862
2863         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
2864         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
2865                                               XATTR_NAME_LOV, 0, th);
2866         if (rc) {
2867                 lo->ldo_comp_entries = old_array;
2868                 lo->ldo_comp_cnt = old_array_cnt;
2869                 GOTO(error, rc);
2870         }
2871
2872         OBD_FREE_PTR_ARRAY(old_array, old_array_cnt);
2873
2874         LASSERT(lo->ldo_mirror_count == 1);
2875         lo->ldo_mirrors[0].lme_end = array_cnt - 1;
2876
2877         mutex_unlock(&lo->ldo_layout_mutex);
2878
2879         RETURN(0);
2880
2881 error:
2882         for (i = lo->ldo_comp_cnt; i < array_cnt; i++) {
2883                 lod_comp = &comp_array[i];
2884                 if (lod_comp->llc_pool != NULL) {
2885                         OBD_FREE(lod_comp->llc_pool,
2886                                  strlen(lod_comp->llc_pool) + 1);
2887                         lod_comp->llc_pool = NULL;
2888                 }
2889         }
2890         OBD_FREE_PTR_ARRAY(comp_array, array_cnt);
2891         mutex_unlock(&lo->ldo_layout_mutex);
2892
2893         RETURN(rc);
2894 }
2895
2896 /**
2897  * lod_last_non_stale_mirror() - Check if a mirror is the last non-stale mirror.
2898  * @mirror_id: Mirror id to be checked.
2899  * @lo:        LOD object.
2900  *
2901  * This function checks if a mirror with specified @mirror_id is the last
2902  * non-stale mirror of a LOD object @lo.
2903  *
2904  * Return: true or false.
2905  */
2906 static inline
2907 bool lod_last_non_stale_mirror(__u16 mirror_id, struct lod_object *lo)
2908 {
2909         struct lod_layout_component *lod_comp;
2910         bool has_stale_flag;
2911         int i;
2912
2913         for (i = 0; i < lo->ldo_mirror_count; i++) {
2914                 if (lo->ldo_mirrors[i].lme_id == mirror_id ||
2915                     lo->ldo_mirrors[i].lme_stale)
2916                         continue;
2917
2918                 has_stale_flag = false;
2919                 lod_foreach_mirror_comp(lod_comp, lo, i) {
2920                         if (lod_comp->llc_flags & LCME_FL_STALE) {
2921                                 has_stale_flag = true;
2922                                 break;
2923                         }
2924                 }
2925                 if (!has_stale_flag)
2926                         return false;
2927         }
2928
2929         return true;
2930 }
2931
2932 /**
2933  * Declare component set. The xattr is name XATTR_LUSTRE_LOV.set.$field,
2934  * the '$field' can only be 'flags' now. The xattr value is binary
2935  * lov_comp_md_v1 which contains the component ID(s) and the value of
2936  * the field to be modified.
2937  * Please update allowed_lustre_lov macro if $field groks more values
2938  * in the future.
2939  *
2940  * \param[in] env       execution environment
2941  * \param[in] dt        dt_object to be modified
2942  * \param[in] op        operation string, like "set.flags"
2943  * \param[in] buf       buffer contains components to be set
2944  * \parem[in] th        thandle
2945  *
2946  * \retval      0 on success
2947  * \retval      negative errno on failure
2948  */
2949 static int lod_declare_layout_set(const struct lu_env *env,
2950                                   struct dt_object *dt,
2951                                   char *op, const struct lu_buf *buf,
2952                                   struct thandle *th)
2953 {
2954         struct lod_layout_component     *lod_comp;
2955         struct lod_thread_info  *info = lod_env_info(env);
2956         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2957         struct lod_object       *lo = lod_dt_obj(dt);
2958         struct lov_comp_md_v1   *comp_v1 = buf->lb_buf;
2959         __u32   magic;
2960         int     i, j, rc;
2961         bool    changed = false;
2962         ENTRY;
2963
2964         /* Please update allowed_lustre_lov macro if op
2965          * groks more values in the future
2966          */
2967         if (strcmp(op, "set.flags") != 0) {
2968                 CDEBUG(D_LAYOUT, "%s: operation (%s) not supported.\n",
2969                        lod2obd(d)->obd_name, op);
2970                 RETURN(-ENOTSUPP);
2971         }
2972
2973         magic = comp_v1->lcm_magic;
2974         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2975                 lustre_swab_lov_comp_md_v1(comp_v1);
2976                 magic = comp_v1->lcm_magic;
2977         }
2978
2979         if (magic != LOV_USER_MAGIC_COMP_V1)
2980                 RETURN(-EINVAL);
2981
2982         if (comp_v1->lcm_entry_count == 0) {
2983                 CDEBUG(D_LAYOUT, "%s: entry count is zero.\n",
2984                        lod2obd(d)->obd_name);
2985                 RETURN(-EINVAL);
2986         }
2987
2988         mutex_lock(&lo->ldo_layout_mutex);
2989         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2990                 __u32 id = comp_v1->lcm_entries[i].lcme_id;
2991                 __u32 flags = comp_v1->lcm_entries[i].lcme_flags;
2992                 __u32 mirror_flag = flags & LCME_MIRROR_FLAGS;
2993                 __u16 mirror_id = mirror_id_of(id);
2994                 bool neg = flags & LCME_FL_NEG;
2995
2996                 if (flags & LCME_FL_INIT) {
2997                         if (changed)
2998                                 lod_striping_free_nolock(env, lo);
2999                         mutex_unlock(&lo->ldo_layout_mutex);
3000                         RETURN(-EINVAL);
3001                 }
3002
3003                 flags &= ~(LCME_MIRROR_FLAGS | LCME_FL_NEG);
3004                 for (j = 0; j < lo->ldo_comp_cnt; j++) {
3005                         lod_comp = &lo->ldo_comp_entries[j];
3006
3007                         /* lfs only put one flag in each entry */
3008                         if ((flags && id != lod_comp->llc_id) ||
3009                             (mirror_flag && mirror_id !=
3010                                             mirror_id_of(lod_comp->llc_id)))
3011                                 continue;
3012
3013                         if (neg) {
3014                                 if (flags)
3015                                         lod_comp->llc_flags &= ~flags;
3016                                 if (mirror_flag)
3017                                         lod_comp->llc_flags &= ~mirror_flag;
3018                         } else {
3019                                 if (flags) {
3020                                         if ((flags & LCME_FL_STALE) &&
3021                                             lod_last_non_stale_mirror(mirror_id,
3022                                                                       lo)) {
3023                                                 mutex_unlock(
3024                                                         &lo->ldo_layout_mutex);
3025                                                 RETURN(-EUCLEAN);
3026                                         }
3027                                         lod_comp->llc_flags |= flags;
3028                                 }
3029                                 if (mirror_flag) {
3030                                         lod_comp->llc_flags |= mirror_flag;
3031                                         if (mirror_flag & LCME_FL_NOSYNC)
3032                                                 lod_comp->llc_timestamp =
3033                                                        ktime_get_real_seconds();
3034                                 }
3035                         }
3036                         changed = true;
3037                 }
3038         }
3039         mutex_unlock(&lo->ldo_layout_mutex);
3040
3041         if (!changed) {
3042                 CDEBUG(D_LAYOUT, "%s: requested component(s) not found.\n",
3043                        lod2obd(d)->obd_name);
3044                 RETURN(-EINVAL);
3045         }
3046
3047         lod_obj_inc_layout_gen(lo);
3048
3049         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3050         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), &info->lti_buf,
3051                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3052         RETURN(rc);
3053 }
3054
3055 /**
3056  * Declare component deletion. The xattr name is XATTR_LUSTRE_LOV.del,
3057  * and the xattr value is a unique component ID or a special lcme_id.
3058  *
3059  * \param[in] env       execution environment
3060  * \param[in] dt        dt_object to be operated on
3061  * \param[in] buf       buffer contains component ID or lcme_id
3062  * \parem[in] th        thandle
3063  *
3064  * \retval      0 on success
3065  * \retval      negative errno on failure
3066  */
3067 static int lod_declare_layout_del(const struct lu_env *env,
3068                                   struct dt_object *dt,
3069                                   const struct lu_buf *buf,
3070                                   struct thandle *th)
3071 {
3072         struct lod_thread_info  *info = lod_env_info(env);
3073         struct dt_object *next = dt_object_child(dt);
3074         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3075         struct lod_object *lo = lod_dt_obj(dt);
3076         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
3077         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3078         __u32 magic, id, flags, neg_flags = 0;
3079         int rc, i, j, left;
3080         ENTRY;
3081
3082         LASSERT(lo->ldo_is_composite);
3083
3084         if (lo->ldo_flr_state != LCM_FL_NONE)
3085                 RETURN(-EBUSY);
3086
3087         magic = comp_v1->lcm_magic;
3088         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
3089                 lustre_swab_lov_comp_md_v1(comp_v1);
3090                 magic = comp_v1->lcm_magic;
3091         }
3092
3093         if (magic != LOV_USER_MAGIC_COMP_V1)
3094                 RETURN(-EINVAL);
3095
3096         id = comp_v1->lcm_entries[0].lcme_id;
3097         flags = comp_v1->lcm_entries[0].lcme_flags;
3098
3099         if (id > LCME_ID_MAX || (flags & ~LCME_KNOWN_FLAGS)) {
3100                 CDEBUG(D_LAYOUT, "%s: invalid component id %#x, flags %#x\n",
3101                        lod2obd(d)->obd_name, id, flags);
3102                 RETURN(-EINVAL);
3103         }
3104
3105         if (id != LCME_ID_INVAL && flags != 0) {
3106                 CDEBUG(D_LAYOUT, "%s: specified both id and flags.\n",
3107                        lod2obd(d)->obd_name);
3108                 RETURN(-EINVAL);
3109         }
3110
3111         if (id == LCME_ID_INVAL && !flags) {
3112                 CDEBUG(D_LAYOUT, "%s: no id or flags specified.\n",
3113                        lod2obd(d)->obd_name);
3114                 RETURN(-EINVAL);
3115         }
3116
3117         if (flags & LCME_FL_NEG) {
3118                 neg_flags = flags & ~LCME_FL_NEG;
3119                 flags = 0;
3120         }
3121
3122         mutex_lock(&lo->ldo_layout_mutex);
3123
3124         left = lo->ldo_comp_cnt;
3125         if (left <= 0) {
3126                 mutex_unlock(&lo->ldo_layout_mutex);
3127                 RETURN(-EINVAL);
3128         }
3129
3130         for (i = (lo->ldo_comp_cnt - 1); i >= 0; i--) {
3131                 struct lod_layout_component *lod_comp;
3132
3133                 lod_comp = &lo->ldo_comp_entries[i];
3134
3135                 if (id != LCME_ID_INVAL && id != lod_comp->llc_id)
3136                         continue;
3137                 else if (flags && !(flags & lod_comp->llc_flags))
3138                         continue;
3139                 else if (neg_flags && (neg_flags & lod_comp->llc_flags))
3140                         continue;
3141
3142                 if (left != (i + 1)) {
3143                         CDEBUG(D_LAYOUT, "%s: this deletion will create "
3144                                "a hole.\n", lod2obd(d)->obd_name);
3145                         mutex_unlock(&lo->ldo_layout_mutex);
3146                         RETURN(-EINVAL);
3147                 }
3148                 left--;
3149
3150                 /* Mark the component as deleted */
3151                 lod_comp->llc_id = LCME_ID_INVAL;
3152
3153                 /* Not instantiated component */
3154                 if (lod_comp->llc_stripe == NULL)
3155                         continue;
3156
3157                 LASSERT(lod_comp->llc_stripe_count > 0);
3158                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
3159                         struct dt_object *obj = lod_comp->llc_stripe[j];
3160
3161                         if (obj == NULL)
3162                                 continue;
3163                         rc = lod_sub_declare_destroy(env, obj, th);
3164                         if (rc) {
3165                                 mutex_unlock(&lo->ldo_layout_mutex);
3166                                 RETURN(rc);
3167                         }
3168                 }
3169         }
3170
3171         LASSERTF(left >= 0, "left = %d\n", left);
3172         if (left == lo->ldo_comp_cnt) {
3173                 CDEBUG(D_LAYOUT, "%s: requested component id:%#x not found\n",
3174                        lod2obd(d)->obd_name, id);
3175                 mutex_unlock(&lo->ldo_layout_mutex);
3176                 RETURN(-EINVAL);
3177         }
3178
3179         mutex_unlock(&lo->ldo_layout_mutex);
3180
3181         memset(attr, 0, sizeof(*attr));
3182         attr->la_valid = LA_SIZE;
3183         rc = lod_sub_declare_attr_set(env, next, attr, th);
3184         if (rc)
3185                 RETURN(rc);
3186
3187         if (left > 0) {
3188                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3189                 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
3190                                                XATTR_NAME_LOV, 0, th);
3191         } else {
3192                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
3193         }
3194
3195         RETURN(rc);
3196 }
3197
3198 /**
3199  * Declare layout add/set/del operations issued by special xattr names:
3200  *
3201  * XATTR_LUSTRE_LOV.add         add component(s) to existing file
3202  * XATTR_LUSTRE_LOV.del         delete component(s) from existing file
3203  * XATTR_LUSTRE_LOV.set.$field  set specified field of certain component(s)
3204  *
3205  * \param[in] env       execution environment
3206  * \param[in] dt        object
3207  * \param[in] name      name of xattr
3208  * \param[in] buf       lu_buf contains xattr value
3209  * \param[in] th        transaction handle
3210  *
3211  * \retval              0 on success
3212  * \retval              negative if failed
3213  */
3214 static int lod_declare_modify_layout(const struct lu_env *env,
3215                                      struct dt_object *dt,
3216                                      const char *name,
3217                                      const struct lu_buf *buf,
3218                                      struct thandle *th)
3219 {
3220         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3221         struct lod_object *lo = lod_dt_obj(dt);
3222         char *op;
3223         int rc, len = strlen(XATTR_LUSTRE_LOV);
3224         ENTRY;
3225
3226         LASSERT(dt_object_exists(dt));
3227
3228         if (strlen(name) <= len || name[len] != '.') {
3229                 CDEBUG(D_LAYOUT, "%s: invalid xattr name: %s\n",
3230                        lod2obd(d)->obd_name, name);
3231                 RETURN(-EINVAL);
3232         }
3233         len++;
3234
3235         rc = lod_striping_load(env, lo);
3236         if (rc)
3237                 GOTO(unlock, rc);
3238
3239         /* the layout to be modified must be a composite layout */
3240         if (!lo->ldo_is_composite) {
3241                 CDEBUG(D_LAYOUT, "%s: object "DFID" isn't a composite file.\n",
3242                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3243                 GOTO(unlock, rc = -EINVAL);
3244         }
3245
3246         op = (char *)name + len;
3247         if (strcmp(op, "add") == 0) {
3248                 rc = lod_declare_layout_add(env, dt, buf, th);
3249         } else if (strcmp(op, "del") == 0) {
3250                 rc = lod_declare_layout_del(env, dt, buf, th);
3251         } else if (strncmp(op, "set", strlen("set")) == 0) {
3252                 rc = lod_declare_layout_set(env, dt, op, buf, th);
3253         } else  {
3254                 CDEBUG(D_LAYOUT, "%s: unsupported xattr name:%s\n",
3255                        lod2obd(d)->obd_name, name);
3256                 GOTO(unlock, rc = -ENOTSUPP);
3257         }
3258 unlock:
3259         if (rc)
3260                 lod_striping_free(env, lo);
3261
3262         RETURN(rc);
3263 }
3264
3265 /**
3266  * Convert a plain file lov_mds_md to a composite layout.
3267  *
3268  * \param[in,out] info  the thread info::lti_ea_store buffer contains little
3269  *                      endian plain file layout
3270  *
3271  * \retval              0 on success, <0 on failure
3272  */
3273 static int lod_layout_convert(struct lod_thread_info *info)
3274 {
3275         struct lov_mds_md *lmm = info->lti_ea_store;
3276         struct lov_mds_md *lmm_save;
3277         struct lov_comp_md_v1 *lcm;
3278         struct lov_comp_md_entry_v1 *lcme;
3279         size_t size;
3280         __u32 blob_size;
3281         int rc = 0;
3282         ENTRY;
3283
3284         /* realloc buffer to a composite layout which contains one component */
3285         blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
3286                                     le32_to_cpu(lmm->lmm_magic));
3287         size = sizeof(*lcm) + sizeof(*lcme) + blob_size;
3288
3289         OBD_ALLOC_LARGE(lmm_save, blob_size);
3290         if (!lmm_save)
3291                 GOTO(out, rc = -ENOMEM);
3292
3293         memcpy(lmm_save, lmm, blob_size);
3294
3295         if (info->lti_ea_store_size < size) {
3296                 rc = lod_ea_store_resize(info, size);
3297                 if (rc)
3298                         GOTO(out, rc);
3299         }
3300
3301         lcm = info->lti_ea_store;
3302         memset(lcm, 0, sizeof(*lcm) + sizeof(*lcme));
3303         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
3304         lcm->lcm_size = cpu_to_le32(size);
3305         lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
3306                                                 lmm_save->lmm_layout_gen));
3307         lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
3308         lcm->lcm_entry_count = cpu_to_le16(1);
3309
3310         lcme = &lcm->lcm_entries[0];
3311         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
3312         lcme->lcme_extent.e_start = 0;
3313         lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
3314         lcme->lcme_offset = cpu_to_le32(sizeof(*lcm) + sizeof(*lcme));
3315         lcme->lcme_size = cpu_to_le32(blob_size);
3316
3317         memcpy((char *)lcm + lcme->lcme_offset, (char *)lmm_save, blob_size);
3318
3319         EXIT;
3320 out:
3321         if (lmm_save)
3322                 OBD_FREE_LARGE(lmm_save, blob_size);
3323         return rc;
3324 }
3325
3326 /**
3327  * Merge layouts to form a mirrored file.
3328  */
3329 static int lod_declare_layout_merge(const struct lu_env *env,
3330                 struct dt_object *dt, const struct lu_buf *mbuf,
3331                 struct thandle *th)
3332 {
3333         struct lod_thread_info *info = lod_env_info(env);
3334         struct lu_attr *layout_attr = &info->lti_layout_attr;
3335         struct lu_buf *buf = &info->lti_buf;
3336         struct lod_object *lo = lod_dt_obj(dt);
3337         struct lov_comp_md_v1 *lcm;
3338         struct lov_comp_md_v1 *cur_lcm;
3339         struct lov_comp_md_v1 *merge_lcm;
3340         struct lov_comp_md_entry_v1 *lcme;
3341         struct lov_mds_md_v1 *lmm;
3342         size_t size = 0;
3343         size_t offset;
3344         __u16 cur_entry_count;
3345         __u16 merge_entry_count;
3346         __u32 id = 0;
3347         __u16 mirror_id = 0;
3348         __u32 mirror_count;
3349         int rc, i;
3350         bool merge_has_dom;
3351
3352         ENTRY;
3353
3354         merge_lcm = mbuf->lb_buf;
3355         if (mbuf->lb_len < sizeof(*merge_lcm))
3356                 RETURN(-EINVAL);
3357
3358         /* must be an existing layout from disk */
3359         if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
3360                 RETURN(-EINVAL);
3361
3362         merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
3363
3364         /* do not allow to merge two mirrored files */
3365         if (le16_to_cpu(merge_lcm->lcm_mirror_count))
3366                 RETURN(-EBUSY);
3367
3368         /* verify the target buffer */
3369         rc = lod_get_lov_ea(env, lo);
3370         if (rc <= 0)
3371                 RETURN(rc ? : -ENODATA);
3372
3373         cur_lcm = info->lti_ea_store;
3374         switch (le32_to_cpu(cur_lcm->lcm_magic)) {
3375         case LOV_MAGIC_V1:
3376         case LOV_MAGIC_V3:
3377                 rc = lod_layout_convert(info);
3378                 break;
3379         case LOV_MAGIC_COMP_V1:
3380         case LOV_MAGIC_SEL:
3381                 rc = 0;
3382                 break;
3383         default:
3384                 rc = -EINVAL;
3385         }
3386         if (rc)
3387                 RETURN(rc);
3388
3389         /* info->lti_ea_store could be reallocated in lod_layout_convert() */
3390         cur_lcm = info->lti_ea_store;
3391         cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
3392
3393         /* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
3394         mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
3395         if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
3396                 RETURN(-ERANGE);
3397
3398         /* size of new layout */
3399         size = le32_to_cpu(cur_lcm->lcm_size) +
3400                le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
3401
3402         memset(buf, 0, sizeof(*buf));
3403         lu_buf_alloc(buf, size);
3404         if (buf->lb_buf == NULL)
3405                 RETURN(-ENOMEM);
3406
3407         lcm = buf->lb_buf;
3408         memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
3409
3410         offset = sizeof(*lcm) +
3411                  sizeof(*lcme) * (cur_entry_count + merge_entry_count);
3412         for (i = 0; i < cur_entry_count; i++) {
3413                 struct lov_comp_md_entry_v1 *cur_lcme;
3414
3415                 lcme = &lcm->lcm_entries[i];
3416                 cur_lcme = &cur_lcm->lcm_entries[i];
3417
3418                 lcme->lcme_offset = cpu_to_le32(offset);
3419                 memcpy((char *)lcm + offset,
3420                        (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
3421                        le32_to_cpu(lcme->lcme_size));
3422
3423                 offset += le32_to_cpu(lcme->lcme_size);
3424
3425                 if (mirror_count == 1 &&
3426                     mirror_id_of(le32_to_cpu(lcme->lcme_id)) == 0) {
3427                         /* Add mirror from a non-flr file, create new mirror ID.
3428                          * Otherwise, keep existing mirror's component ID, used
3429                          * for mirror extension.
3430                          */
3431                         id = pflr_id(1, i + 1);
3432                         lcme->lcme_id = cpu_to_le32(id);
3433                 }
3434
3435                 id = max(le32_to_cpu(lcme->lcme_id), id);
3436         }
3437
3438         mirror_id = mirror_id_of(id) + 1;
3439
3440         /* check if first entry in new layout is DOM */
3441         lmm = (struct lov_mds_md_v1 *)((char *)merge_lcm +
3442                                         merge_lcm->lcm_entries[0].lcme_offset);
3443         merge_has_dom = lov_pattern(le32_to_cpu(lmm->lmm_pattern)) ==
3444                         LOV_PATTERN_MDT;
3445
3446         for (i = 0; i < merge_entry_count; i++) {
3447                 struct lov_comp_md_entry_v1 *merge_lcme;
3448
3449                 merge_lcme = &merge_lcm->lcm_entries[i];
3450                 lcme = &lcm->lcm_entries[cur_entry_count + i];
3451
3452                 *lcme = *merge_lcme;
3453                 lcme->lcme_offset = cpu_to_le32(offset);
3454                 if (merge_has_dom && i == 0)
3455                         lcme->lcme_flags |= cpu_to_le32(LCME_FL_STALE);
3456
3457                 id = pflr_id(mirror_id, i + 1);
3458                 lcme->lcme_id = cpu_to_le32(id);
3459
3460                 memcpy((char *)lcm + offset,
3461                        (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
3462                        le32_to_cpu(lcme->lcme_size));
3463
3464                 offset += le32_to_cpu(lcme->lcme_size);
3465         }
3466
3467         /* fixup layout information */
3468         lcm->lcm_size = cpu_to_le32(size);
3469         lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
3470         lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
3471         if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NONE)
3472                 lcm->lcm_flags = cpu_to_le32(LCM_FL_RDONLY);
3473
3474         rc = lod_striping_reload(env, lo, buf, 0);
3475         if (rc)
3476                 GOTO(out, rc);
3477
3478         lod_obj_inc_layout_gen(lo);
3479         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3480
3481         /* transfer layout version to OST objects. */
3482         if (lo->ldo_mirror_count > 1) {
3483                 struct lod_obj_stripe_cb_data data = { {0} };
3484
3485                 layout_attr->la_valid = LA_LAYOUT_VERSION;
3486                 layout_attr->la_layout_version = 0;
3487                 data.locd_attr = layout_attr;
3488                 data.locd_declare = true;
3489                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3490                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3491                 if (rc)
3492                         GOTO(out, rc);
3493         }
3494
3495         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
3496                                         XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3497
3498 out:
3499         lu_buf_free(buf);
3500         RETURN(rc);
3501 }
3502
3503 /**
3504  * Split layouts, just set the LOVEA with the layout from mbuf.
3505  */
3506 static int lod_declare_layout_split(const struct lu_env *env,
3507                 struct dt_object *dt, const struct lu_buf *mbuf,
3508                 struct thandle *th)
3509 {
3510         struct lod_thread_info *info = lod_env_info(env);
3511         struct lu_attr *layout_attr = &info->lti_layout_attr;
3512         struct lod_object *lo = lod_dt_obj(dt);
3513         struct lov_comp_md_v1 *lcm = mbuf->lb_buf;
3514         int rc;
3515         ENTRY;
3516
3517         rc = lod_striping_reload(env, lo, mbuf, LVF_ALL_STALE);
3518         if (rc)
3519                 RETURN(rc);
3520
3521         lod_obj_inc_layout_gen(lo);
3522         /* fix on-disk layout gen */
3523         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3524
3525
3526         /* transfer layout version to OST objects. */
3527         if (lo->ldo_mirror_count > 1) {
3528                 struct lod_obj_stripe_cb_data data = { {0} };
3529
3530                 layout_attr->la_valid = LA_LAYOUT_VERSION;
3531                 layout_attr->la_layout_version = 0;
3532                 data.locd_attr = layout_attr;
3533                 data.locd_declare = true;
3534                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3535                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3536                 if (rc)
3537                         RETURN(rc);
3538         }
3539
3540         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), mbuf,
3541                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3542         RETURN(rc);
3543 }
3544
3545 static int lod_layout_declare_or_purge_mirror(const struct lu_env *env,
3546                         struct dt_object *dt, const struct lu_buf *buf,
3547                         struct thandle *th, bool declare)
3548 {
3549         struct lod_thread_info *info = lod_env_info(env);
3550         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3551         struct lod_object *lo = lod_dt_obj(dt);
3552         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3553         struct lov_comp_md_entry_v1 *entry;
3554         struct lov_mds_md_v1 *lmm;
3555         struct dt_object **sub_objs = NULL;
3556         int rc = 0, i, k, array_count = 0;
3557
3558         ENTRY;
3559
3560         /**
3561          * other ops (like lod_declare_destroy) could destroying sub objects
3562          * as well.
3563          */
3564         mutex_lock(&lo->ldo_layout_mutex);
3565
3566         if (!declare) {
3567                 /* prepare sub-objects array */
3568                 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3569                         entry = &comp_v1->lcm_entries[i];
3570
3571                         if (!(entry->lcme_flags & LCME_FL_INIT))
3572                                 continue;
3573
3574                         lmm = (struct lov_mds_md_v1 *)
3575                                         ((char *)comp_v1 + entry->lcme_offset);
3576                         array_count += lmm->lmm_stripe_count;
3577                 }
3578                 OBD_ALLOC_PTR_ARRAY(sub_objs, array_count);
3579                 if (sub_objs == NULL) {
3580                         mutex_unlock(&lo->ldo_layout_mutex);
3581                         RETURN(-ENOMEM);
3582                 }
3583         }
3584
3585         k = 0;  /* sub_objs index */
3586         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3587                 struct lov_ost_data_v1 *objs;
3588                 struct lu_object *o, *n;
3589                 struct dt_object *dto;
3590                 struct lu_device *nd;
3591                 struct lov_mds_md_v3 *v3;
3592                 __u32 idx;
3593                 int j;
3594
3595                 entry = &comp_v1->lcm_entries[i];
3596
3597                 if (!(entry->lcme_flags & LCME_FL_INIT))
3598                         continue;
3599
3600                 lmm = (struct lov_mds_md_v1 *)
3601                                 ((char *)comp_v1 + entry->lcme_offset);
3602                 v3 = (struct lov_mds_md_v3 *)lmm;
3603                 if (lmm->lmm_magic == LOV_MAGIC_V3)
3604                         objs = &v3->lmm_objects[0];
3605                 else
3606                         objs = &lmm->lmm_objects[0];
3607
3608                 for (j = 0; j < lmm->lmm_stripe_count; j++) {
3609                         idx = objs[j].l_ost_idx;
3610                         rc = ostid_to_fid(&info->lti_fid, &objs[j].l_ost_oi,
3611                                           idx);
3612                         if (rc)
3613                                 GOTO(out, rc);
3614
3615                         if (!fid_is_sane(&info->lti_fid)) {
3616                                 CERROR("%s: sub-object insane fid "DFID"\n",
3617                                        lod2obd(d)->obd_name,
3618                                        PFID(&info->lti_fid));
3619                                 GOTO(out, rc = -EINVAL);
3620                         }
3621
3622                         lod_getref(&d->lod_ost_descs);
3623
3624                         rc = validate_lod_and_idx(d, idx);
3625                         if (unlikely(rc)) {
3626                                 lod_putref(d, &d->lod_ost_descs);
3627                                 GOTO(out, rc);
3628                         }
3629
3630                         nd = &OST_TGT(d, idx)->ltd_tgt->dd_lu_dev;
3631                         lod_putref(d, &d->lod_ost_descs);
3632
3633                         o = lu_object_find_at(env, nd, &info->lti_fid, NULL);
3634                         if (IS_ERR(o))
3635                                 GOTO(out, rc = PTR_ERR(o));
3636
3637                         n = lu_object_locate(o->lo_header, nd->ld_type);
3638                         if (unlikely(!n)) {
3639                                 lu_object_put(env, n);
3640                                 GOTO(out, rc = -ENOENT);
3641                         }
3642
3643                         dto = container_of(n, struct dt_object, do_lu);
3644
3645                         if (declare) {
3646                                 rc = lod_sub_declare_destroy(env, dto, th);
3647                                 dt_object_put(env, dto);
3648                                 if (rc)
3649                                         GOTO(out, rc);
3650                         } else {
3651                                 /**
3652                                  * collect to-be-destroyed sub objects, the
3653                                  * reference would be released after actual
3654                                  * deletion.
3655                                  */
3656                                 sub_objs[k] = dto;
3657                                 k++;
3658                         }
3659                 } /* for each stripe */
3660         } /* for each component in the mirror */
3661 out:
3662         if (!declare) {
3663                 i = 0;
3664                 if (!rc) {
3665                         /* destroy the sub objects */
3666                         for (; i < k; i++) {
3667                                 rc = lod_sub_destroy(env, sub_objs[i], th);
3668                                 if (rc)
3669                                         break;
3670                                 dt_object_put(env, sub_objs[i]);
3671                         }
3672                 }
3673                 /**
3674                  * if a sub object destroy failed, we'd release sub objects
3675                  * reference get from above sub_objs collection.
3676                  */
3677                 for (; i < k; i++)
3678                         dt_object_put(env, sub_objs[i]);
3679
3680                 OBD_FREE_PTR_ARRAY(sub_objs, array_count);
3681         }
3682         mutex_unlock(&lo->ldo_layout_mutex);
3683
3684         RETURN(rc);
3685 }
3686
3687 /**
3688  * Purge layouts, delete sub objects in the mirror stored in the vic_buf,
3689  * and set the LOVEA with the layout from mbuf.
3690  */
3691 static int lod_declare_layout_purge(const struct lu_env *env,
3692                 struct dt_object *dt, const struct lu_buf *buf,
3693                 struct thandle *th)
3694 {
3695         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3696         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3697         int rc;
3698
3699         ENTRY;
3700
3701         if (le32_to_cpu(comp_v1->lcm_magic) != LOV_MAGIC_COMP_V1) {
3702                 CERROR("%s: invalid layout magic %#x != %#x\n",
3703                        lod2obd(d)->obd_name, le32_to_cpu(comp_v1->lcm_magic),
3704                        LOV_MAGIC_COMP_V1);
3705                 RETURN(-EINVAL);
3706         }
3707
3708         if (cpu_to_le32(LOV_MAGIC_COMP_V1) != LOV_MAGIC_COMP_V1)
3709                 lustre_swab_lov_comp_md_v1(comp_v1);
3710
3711         /* from now on, @buf contains cpu endian data */
3712
3713         if (comp_v1->lcm_mirror_count != 0) {
3714                 CERROR("%s: can only purge one mirror from "DFID"\n",
3715                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3716                 RETURN(-EINVAL);
3717         }
3718
3719         /* delcare sub objects deletion in the mirror stored in @buf */
3720         rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, true);
3721         RETURN(rc);
3722 }
3723
3724 /* delete sub objects from the mirror stored in @buf */
3725 static int lod_layout_purge(const struct lu_env *env, struct dt_object *dt,
3726                             const struct lu_buf *buf, struct thandle *th)
3727 {
3728         int rc;
3729
3730         ENTRY;
3731         rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, false);
3732         RETURN(rc);
3733 }
3734
3735 /**
3736  * Implementation of dt_object_operations::do_declare_xattr_set.
3737  *
3738  * \see dt_object_operations::do_declare_xattr_set() in the API description
3739  * for details.
3740  *
3741  * the extension to the API:
3742  *   - declaring LOVEA requests striping creation
3743  *   - LU_XATTR_REPLACE means layout swap
3744  */
3745 static int lod_declare_xattr_set(const struct lu_env *env,
3746                                  struct dt_object *dt,
3747                                  const struct lu_buf *buf,
3748                                  const char *name, int fl,
3749                                  struct thandle *th)
3750 {
3751         struct dt_object *next = dt_object_child(dt);
3752         struct lu_attr   *attr = &lod_env_info(env)->lti_attr;
3753         __u32             mode;
3754         int               rc;
3755         ENTRY;
3756
3757         mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
3758         if ((S_ISREG(mode) || mode == 0) &&
3759             !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE | LU_XATTR_SPLIT |
3760                     LU_XATTR_PURGE)) &&
3761             (strcmp(name, XATTR_NAME_LOV) == 0 ||
3762              strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
3763                 /*
3764                  * this is a request to create object's striping.
3765                  *
3766                  * allow to declare predefined striping on a new (!mode) object
3767                  * which is supposed to be replay of regular file creation
3768                  * (when LOV setting is declared)
3769                  *
3770                  * LU_XATTR_REPLACE is set to indicate a layout swap
3771                  */
3772                 if (dt_object_exists(dt)) {
3773                         rc = dt_attr_get(env, next, attr);
3774                         if (rc)
3775                                 RETURN(rc);
3776                 } else {
3777                         memset(attr, 0, sizeof(*attr));
3778                         attr->la_valid = LA_TYPE | LA_MODE;
3779                         attr->la_mode = S_IFREG;
3780                 }
3781                 rc = lod_declare_striped_create(env, dt, attr, buf, th);
3782         } else if (fl & LU_XATTR_MERGE) {
3783                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3784                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3785                 rc = lod_declare_layout_merge(env, dt, buf, th);
3786         } else if (fl & LU_XATTR_SPLIT) {
3787                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3788                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3789                 rc = lod_declare_layout_split(env, dt, buf, th);
3790         } else if (fl & LU_XATTR_PURGE) {
3791                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3792                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3793                 rc = lod_declare_layout_purge(env, dt, buf, th);
3794         } else if (S_ISREG(mode) &&
3795                    strlen(name) >= sizeof(XATTR_LUSTRE_LOV) + 3 &&
3796                    allowed_lustre_lov(name)) {
3797                 /*
3798                  * this is a request to modify object's striping.
3799                  * add/set/del component(s).
3800                  */
3801                 if (!dt_object_exists(dt))
3802                         RETURN(-ENOENT);
3803
3804                 rc = lod_declare_modify_layout(env, dt, name, buf, th);
3805         } else if (S_ISDIR(mode)) {
3806                 rc = lod_dir_declare_xattr_set(env, dt, buf, name, fl, th);
3807         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
3808                 rc = lod_replace_parent_fid(env, dt, buf, th, true);
3809         } else {
3810                 rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
3811         }
3812
3813         RETURN(rc);
3814 }
3815
3816 /**
3817  * Apply xattr changes to the object.
3818  *
3819  * Applies xattr changes to the object and the stripes if the latter exist.
3820  *
3821  * \param[in] env       execution environment
3822  * \param[in] dt        object
3823  * \param[in] buf       buffer pointing to the new value of xattr
3824  * \param[in] name      name of xattr
3825  * \param[in] fl        flags
3826  * \param[in] th        transaction handle
3827  *
3828  * \retval              0 on success
3829  * \retval              negative if failed
3830  */
3831 static int lod_xattr_set_internal(const struct lu_env *env,
3832                                   struct dt_object *dt,
3833                                   const struct lu_buf *buf,
3834                                   const char *name, int fl,
3835                                   struct thandle *th)
3836 {
3837         struct dt_object        *next = dt_object_child(dt);
3838         struct lod_object       *lo = lod_dt_obj(dt);
3839         int                     rc;
3840         int                     i;
3841         ENTRY;
3842
3843         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
3844         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3845                 RETURN(rc);
3846
3847         /* Note: Do not set LinkEA on sub-stripes, otherwise
3848          * it will confuse the fid2path process(see mdt_path_current()).
3849          * The linkEA between master and sub-stripes is set in
3850          * lod_xattr_set_lmv(). */
3851         if (lo->ldo_dir_stripe_count == 0 || strcmp(name, XATTR_NAME_LINK) == 0)
3852                 RETURN(0);
3853
3854         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3855                 if (!lo->ldo_stripe[i])
3856                         continue;
3857
3858                 if (!dt_object_exists(lo->ldo_stripe[i]))
3859                         continue;
3860
3861                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], buf, name,
3862                                        fl, th);
3863                 if (rc != 0)
3864                         break;
3865         }
3866
3867         RETURN(rc);
3868 }
3869
3870 /**
3871  * Delete an extended attribute.
3872  *
3873  * Deletes specified xattr from the object and the stripes if the latter exist.
3874  *
3875  * \param[in] env       execution environment
3876  * \param[in] dt        object
3877  * \param[in] name      name of xattr
3878  * \param[in] th        transaction handle
3879  *
3880  * \retval              0 on success
3881  * \retval              negative if failed
3882  */
3883 static int lod_xattr_del_internal(const struct lu_env *env,
3884                                   struct dt_object *dt,
3885                                   const char *name, struct thandle *th)
3886 {
3887         struct dt_object *next = dt_object_child(dt);
3888         struct lod_object *lo = lod_dt_obj(dt);
3889         int i;
3890         int rc;
3891
3892         ENTRY;
3893
3894         rc = lod_sub_xattr_del(env, next, name, th);
3895         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3896                 RETURN(rc);
3897
3898         if (lo->ldo_dir_stripe_count == 0)
3899                 RETURN(rc);
3900
3901         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3902                 if (!lo->ldo_stripe[i])
3903                         continue;
3904
3905                 if (!dt_object_exists(lo->ldo_stripe[i]))
3906                         continue;
3907
3908                 rc = lod_sub_xattr_del(env, lo->ldo_stripe[i], name, th);
3909                 if (rc != 0)
3910                         break;
3911         }
3912
3913         RETURN(rc);
3914 }
3915
3916 /**
3917  * Set default striping on a directory.
3918  *
3919  * Sets specified striping on a directory object unless it matches the default
3920  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
3921  * EA. This striping will be used when regular file is being created in this
3922  * directory.
3923  *
3924  * \param[in] env       execution environment
3925  * \param[in] dt        the striped object
3926  * \param[in] buf       buffer with the striping
3927  * \param[in] name      name of EA
3928  * \param[in] fl        xattr flag (see OSD API description)
3929  * \param[in] th        transaction handle
3930  *
3931  * \retval              0 on success
3932  * \retval              negative if failed
3933  */
3934 static int lod_xattr_set_lov_on_dir(const struct lu_env *env,
3935                                     struct dt_object *dt,
3936                                     const struct lu_buf *buf,
3937                                     const char *name, int fl,
3938                                     struct thandle *th)
3939 {
3940         struct lov_user_md_v1   *lum;
3941         struct lov_user_md_v3   *v3 = NULL;
3942         const char              *pool_name = NULL;
3943         int                      rc;
3944         bool                     is_del;
3945         ENTRY;
3946
3947         LASSERT(buf != NULL && buf->lb_buf != NULL);
3948         lum = buf->lb_buf;
3949
3950         switch (lum->lmm_magic) {
3951         case LOV_USER_MAGIC_SPECIFIC:
3952         case LOV_USER_MAGIC_V3:
3953                 v3 = buf->lb_buf;
3954                 if (v3->lmm_pool_name[0] != '\0')
3955                         pool_name = v3->lmm_pool_name;
3956                 fallthrough;
3957         case LOV_USER_MAGIC_V1:
3958                 /* if { size, offset, count } = { 0, -1, 0 } and no pool
3959                  * (i.e. all default values specified) then delete default
3960                  * striping from dir. */
3961                 CDEBUG(D_LAYOUT,
3962                        "set default striping: sz %u # %u offset %d %s %s\n",
3963                        (unsigned)lum->lmm_stripe_size,
3964                        (unsigned)lum->lmm_stripe_count,
3965                        (int)lum->lmm_stripe_offset,
3966                        v3 ? "from" : "", v3 ? v3->lmm_pool_name : "");
3967
3968                 is_del = LOVEA_DELETE_VALUES(lum->lmm_stripe_size,
3969                                              lum->lmm_stripe_count,
3970                                              lum->lmm_stripe_offset,
3971                                              pool_name);
3972                 break;
3973         case LOV_USER_MAGIC_COMP_V1:
3974         {
3975                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lum;
3976                 struct lov_comp_md_entry_v1 *lcme;
3977                 int i, comp_cnt;
3978
3979                 comp_cnt = le16_to_cpu(lcm->lcm_entry_count);
3980                 for (i = 0; i < comp_cnt; i++) {
3981                         lcme = &lcm->lcm_entries[i];
3982                         if (lcme->lcme_flags & cpu_to_le32(LCME_FL_EXTENSION)) {
3983                                 lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_SEL);
3984                                 break;
3985                         }
3986                 }
3987
3988                 is_del = false;
3989                 break;
3990         }
3991         default:
3992                 CERROR("Invalid magic %x\n", lum->lmm_magic);
3993                 RETURN(-EINVAL);
3994         }
3995
3996         if (is_del) {
3997                 rc = lod_xattr_del_internal(env, dt, name, th);
3998                 if (rc == -ENODATA)
3999                         rc = 0;
4000         } else {
4001                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4002         }
4003
4004         RETURN(rc);
4005 }
4006
/*
 * Forward declaration: lod_xattr_set_default_lov_on_dir() below calls this
 * helper to fetch the existing default striping configuration of \a lo
 * into \a lds before the helper's definition is visible.
 */
static int lod_get_default_lov_striping(const struct lu_env *env,
                                        struct lod_object *lo,
                                        struct lod_default_striping *lds,
                                        struct dt_allocation_hint *ah);
4011
4012 /**
4013  * Helper function to convert compound layout to compound layout with
4014  * pool
4015  *
4016  * Copy lcm_entries array of \a src to \a tgt. Replace lov_user_md_v1
4017  * components of \a src with lov_user_md_v3 using \a pool.
4018  *
4019  * \param[in] src       source layout
4020  * \param[in] pool      pool to use in \a tgt
4021  * \param[out] tgt      target layout
4022  */
4023 static void embed_pool_to_comp_v1(const struct lov_comp_md_v1 *src,
4024                                   const char *pool,
4025                                   struct lov_comp_md_v1 *tgt)
4026 {
4027         size_t shift;
4028         struct lov_user_md_v1 *lum;
4029         struct lov_user_md_v3 *lum3;
4030         struct lov_comp_md_entry_v1 *entry;
4031         int i;
4032         __u32 offset;
4033
4034         entry = tgt->lcm_entries;
4035         shift = 0;
4036         for (i = 0; i < le16_to_cpu(src->lcm_entry_count); i++, entry++) {
4037                 *entry = src->lcm_entries[i];
4038                 offset = le32_to_cpu(src->lcm_entries[i].lcme_offset);
4039                 entry->lcme_offset = cpu_to_le32(offset + shift);
4040
4041                 lum = (struct lov_user_md_v1 *)((char *)src + offset);
4042                 lum3 = (struct lov_user_md_v3 *)((char *)tgt + offset + shift);
4043                 *(struct lov_user_md_v1 *)lum3 = *lum;
4044                 if (lum->lmm_pattern == cpu_to_le32(LOV_PATTERN_MDT)) {
4045                         lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
4046                 } else {
4047                         lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4048                         entry->lcme_size = cpu_to_le32(sizeof(*lum3));
4049                         strlcpy(lum3->lmm_pool_name, pool,
4050                                 sizeof(lum3->lmm_pool_name));
4051                         shift += sizeof(*lum3) - sizeof(*lum);
4052                 }
4053         }
4054 }
4055
4056 /**
4057  * Set default striping on a directory.
4058  *
4059  * Sets specified striping on a directory object unless it matches the default
4060  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4061  * EA. This striping will be used when regular file is being created in this
4062  * directory.
4063  * If current default striping includes a pool but specifed striping
4064  * does not - retain the pool if it exists.
4065  *
4066  * \param[in] env       execution environment
4067  * \param[in] dt        the striped object
4068  * \param[in] buf       buffer with the striping
4069  * \param[in] name      name of EA
4070  * \param[in] fl        xattr flag (see OSD API description)
4071  * \param[in] th        transaction handle
4072  *
4073  * \retval              0 on success
4074  * \retval              negative if failed
4075  */
4076 static int lod_xattr_set_default_lov_on_dir(const struct lu_env *env,
4077                                             struct dt_object *dt,
4078                                             const struct lu_buf *buf,
4079                                             const char *name, int fl,
4080                                             struct thandle *th)
4081 {
4082         struct lod_default_striping     *lds = lod_lds_buf_get(env);
4083         struct lov_user_md_v1           *v1 = buf->lb_buf;
4084         char                             pool[LOV_MAXPOOLNAME + 1];
4085         bool                             is_del;
4086         int                              rc;
4087
4088         ENTRY;
4089
4090         /* get existing striping config */
4091         rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds, NULL);
4092         if (rc)
4093                 RETURN(rc);
4094
4095         memset(pool, 0, sizeof(pool));
4096         if (lds->lds_def_striping_set == 1)
4097                 lod_layout_get_pool(lds->lds_def_comp_entries,
4098                                     lds->lds_def_comp_cnt, pool,
4099                                     sizeof(pool));
4100
4101         is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
4102                                      v1->lmm_stripe_count,
4103                                      v1->lmm_stripe_offset,
4104                                      NULL);
4105
4106         /* Retain the pool name if it is not given */
4107         if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
4108             !is_del) {
4109                 struct lod_thread_info *info = lod_env_info(env);
4110                 struct lov_user_md_v3 *v3  = info->lti_ea_store;
4111
4112                 memset(v3, 0, sizeof(*v3));
4113                 v3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4114                 v3->lmm_pattern = cpu_to_le32(v1->lmm_pattern);
4115                 v3->lmm_stripe_count = cpu_to_le32(v1->lmm_stripe_count);
4116                 v3->lmm_stripe_offset = cpu_to_le32(v1->lmm_stripe_offset);
4117                 v3->lmm_stripe_size = cpu_to_le32(v1->lmm_stripe_size);
4118
4119                 strlcpy(v3->lmm_pool_name, pool, sizeof(v3->lmm_pool_name));
4120
4121                 info->lti_buf.lb_buf = v3;
4122                 info->lti_buf.lb_len = sizeof(*v3);
4123                 rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4124                                               name, fl, th);
4125         } else if (v1->lmm_magic == LOV_USER_MAGIC_COMP_V1 &&
4126                    pool[0] != '\0' && !is_del) {
4127                 /*
4128                  * try to retain the pool from default layout if the
4129                  * specified component layout does not provide pool
4130                  * info explicitly
4131                  */
4132                 struct lod_thread_info *info = lod_env_info(env);
4133                 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
4134                 struct lov_comp_md_v1 *comp_v1p;
4135                 struct lov_user_md_v1 *lum;
4136                 int entry_count;
4137                 int i;
4138                 __u32 offset;
4139                 struct lov_comp_md_entry_v1 *entry;
4140                 int size;
4141
4142                 entry_count = le16_to_cpu(comp_v1->lcm_entry_count);
4143                 size = sizeof(*comp_v1) +
4144                         entry_count * sizeof(comp_v1->lcm_entries[0]);
4145                 entry = comp_v1->lcm_entries;
4146                 for (i = 0; i < entry_count; i++, entry++) {
4147                         offset = le32_to_cpu(entry->lcme_offset);
4148                         lum = (struct lov_user_md_v1 *)((char *)comp_v1 +
4149                                                         offset);
4150                         if (le32_to_cpu(lum->lmm_magic) != LOV_USER_MAGIC_V1)
4151                                 /* the i-th component includes pool info */
4152                                 break;
4153                         if (lum->lmm_pattern == cpu_to_le32(LOV_PATTERN_MDT))
4154                                 size += sizeof(struct lov_user_md_v1);
4155                         else
4156                                 size += sizeof(struct lov_user_md_v3);
4157                 }
4158
4159                 if (i == entry_count) {
4160                         /*
4161                          * re-compose the layout to include the pool for
4162                          * each component
4163                          */
4164                         if (info->lti_ea_store_size < size)
4165                                 rc = lod_ea_store_resize(info, size);
4166
4167                         if (rc == 0) {
4168                                 comp_v1p = info->lti_ea_store;
4169                                 *comp_v1p = *comp_v1;
4170                                 comp_v1p->lcm_size = cpu_to_le32(size);
4171                                 embed_pool_to_comp_v1(comp_v1, pool, comp_v1p);
4172
4173                                 info->lti_buf.lb_buf = comp_v1p;
4174                                 info->lti_buf.lb_len = size;
4175                                 rc = lod_xattr_set_lov_on_dir(env, dt,
4176                                                               &info->lti_buf,
4177                                                               name, fl, th);
4178                         }
4179                 } else {
4180                         rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl,
4181                                                       th);
4182                 }
4183         } else {
4184                 rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl, th);
4185         }
4186
4187         if (lds->lds_def_striping_set == 1 && lds->lds_def_comp_entries != NULL)
4188                 lod_free_def_comp_entries(lds);
4189
4190         RETURN(rc);
4191 }
4192
4193 /**
4194  * Set default striping on a directory object.
4195  *
4196  * Sets specified striping on a directory object unless it matches the default
4197  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4198  * EA. This striping will be used when a new directory is being created in the
4199  * directory.
4200  *
4201  * \param[in] env       execution environment
4202  * \param[in] dt        the striped object
4203  * \param[in] buf       buffer with the striping
4204  * \param[in] name      name of EA
4205  * \param[in] fl        xattr flag (see OSD API description)
4206  * \param[in] th        transaction handle
4207  *
4208  * \retval              0 on success
4209  * \retval              negative if failed
4210  */
4211 static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env,
4212                                             struct dt_object *dt,
4213                                             const struct lu_buf *buf,
4214                                             const char *name, int fl,
4215                                             struct thandle *th)
4216 {
4217         struct lmv_user_md_v1 *lum;
4218         int rc;
4219
4220         ENTRY;
4221
4222         LASSERT(buf != NULL && buf->lb_buf != NULL);
4223         lum = buf->lb_buf;
4224
4225         CDEBUG(D_INFO,
4226                "set default stripe_count # %u stripe_offset %d hash %u\n",
4227               le32_to_cpu(lum->lum_stripe_count),
4228               (int)le32_to_cpu(lum->lum_stripe_offset),
4229               le32_to_cpu(lum->lum_hash_type));
4230
4231         if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)),
4232                                  le32_to_cpu(lum->lum_stripe_offset)) &&
4233             le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) {
4234                 rc = lod_xattr_del_internal(env, dt, name, th);
4235                 if (rc == -ENODATA)
4236                         rc = 0;
4237         } else {
4238                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4239                 if (rc != 0)
4240                         RETURN(rc);
4241         }
4242
4243         RETURN(rc);
4244 }
4245
/**
 * Turn directory into a striped directory.
 *
 * During replay the client sends the striping created before MDT
 * failure, then the layer above LOD sends this defined striping
 * using ->do_xattr_set(), so LOD uses this method to replay creation
 * of the stripes. Notice the original information for the striping
 * (#stripes, FIDs, etc) was transferred in declare path.
 *
 * For each allocated stripe the function creates the stripe object (unless
 * it is a pre-existing stripe of a migrating/splitting directory), sets the
 * slave LMV EA on it, inserts its parent reference, sets its linkEA, and
 * inserts a "<FID>:<index>" name for the stripe into the master object.
 * Finally the master LMV EA is set on the master object.
 *
 * If the object has no stripes, nothing is done unless the object is marked
 * foreign, in which case \a buf is stored as the LMV EA as is.
 *
 * \param[in] env       execution environment
 * \param[in] dt        the striped object
 * \param[in] buf       LMV EA to store for a foreign directory; not used
 *                      otherwise
 * \param[in] name      not used currently (XATTR_NAME_LMV is always used)
 * \param[in] fl        xattr flag (see OSD API description)
 * \param[in] th        transaction handle
 *
 * \retval              0 on success
 * \retval              -ENOTDIR if \a dt is not a directory
 * \retval              -ENOMEM if the slave LMV buffer cannot be allocated
 * \retval              -EINVAL if a stripe expected to exist does not
 * \retval              negative if failed
 */
static int lod_xattr_set_lmv(const struct lu_env *env, struct dt_object *dt,
                             const struct lu_buf *buf, const char *name,
                             int fl, struct thandle *th)
{
        struct lod_object       *lo = lod_dt_obj(dt);
        struct lod_thread_info  *info = lod_env_info(env);
        struct lu_attr          *attr = &info->lti_attr;
        struct dt_object_format *dof = &info->lti_format;
        struct lu_buf           lmv_buf;
        struct lu_buf           slave_lmv_buf;
        struct lmv_mds_md_v1    *lmm;
        struct lmv_mds_md_v1    *slave_lmm = NULL;
        struct dt_insert_rec    *rec = &info->lti_dt_rec;
        int                     i;
        int                     rc;
        ENTRY;

        if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
                RETURN(-ENOTDIR);

        /* The stripes are supposed to be allocated in declare phase,
         * if there are no stripes being allocated, it will skip */
        if (lo->ldo_dir_stripe_count == 0) {
                /* a foreign directory stores the caller's buffer verbatim */
                if (lo->ldo_is_foreign) {
                        rc = lod_sub_xattr_set(env, dt_object_child(dt), buf,
                                               XATTR_NAME_LMV, fl, th);
                        if (rc != 0)
                                RETURN(rc);
                }
                RETURN(0);
        }

        /* stripes are created with attributes read from the master object */
        rc = dt_attr_get(env, dt_object_child(dt), attr);
        if (rc != 0)
                RETURN(rc);

        attr->la_valid = LA_ATIME | LA_MTIME | LA_CTIME | LA_FLAGS |
                         LA_MODE | LA_UID | LA_GID | LA_TYPE | LA_PROJID;
        dof->dof_type = DFT_DIR;

        /* master LMV EA, written to the master object after the loop */
        rc = lod_prep_lmv_md(env, dt, &lmv_buf);
        if (rc != 0)
                RETURN(rc);
        lmm = lmv_buf.lb_buf;

        /* slave LMV EA template; freed at the "out" label */
        OBD_ALLOC_PTR(slave_lmm);
        if (slave_lmm == NULL)
                RETURN(-ENOMEM);

        lod_prep_slave_lmv_md(slave_lmm, lmm);
        slave_lmv_buf.lb_buf = slave_lmm;
        slave_lmv_buf.lb_len = sizeof(*slave_lmm);

        rec->rec_type = S_IFDIR;
        for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
                struct dt_object *dto = lo->ldo_stripe[i];
                char *stripe_name = info->lti_key;
                struct lu_name *sname;
                struct linkea_data ldata = { NULL };
                struct lu_buf linkea_buf;

                /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
                if (!dto)
                        continue;

                /* fail a remote stripe creation */
                if (i && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_CREATE))
                        continue;

                /* don't create stripe if:
                 * 1. it's source stripe of migrating directory
                 * 2. it's existed stripe of splitting directory
                 */
                if ((lod_is_migrating(lo) && i >= lo->ldo_dir_migrate_offset) ||
                    (lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
                        /* such a stripe must already be there */
                        if (!dt_object_exists(dto))
                                GOTO(out, rc = -EINVAL);
                } else {
                        /* create the stripe and take a reference on it
                         * under the stripe's write lock */
                        dt_write_lock(env, dto, DT_TGT_CHILD);
                        rc = lod_sub_create(env, dto, attr, NULL, dof, th);
                        if (rc != 0) {
                                dt_write_unlock(env, dto);
                                GOTO(out, rc);
                        }

                        rc = lod_sub_ref_add(env, dto, th);
                        dt_write_unlock(env, dto);
                        if (rc != 0)
                                GOTO(out, rc);

                        /* insert the stripe's own FID under the "dot" key */
                        rec->rec_fid = lu_object_fid(&dto->do_lu);
                        rc = lod_sub_insert(env, dto,
                                            (const struct dt_rec *)rec,
                                            (const struct dt_key *)dot, th);
                        if (rc != 0)
                                GOTO(out, rc);
                }

                /* set the slave LMV EA, unless fault injection asks to lose
                 * it for this stripe; another fail point deliberately
                 * records a wrong index (i + 1) */
                if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
                    cfs_fail_val != i) {
                        if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
                            cfs_fail_val == i)
                                slave_lmm->lmv_master_mdt_index =
                                                        cpu_to_le32(i + 1);
                        else
                                slave_lmm->lmv_master_mdt_index =
                                                        cpu_to_le32(i);

                        rc = lod_sub_xattr_set(env, dto, &slave_lmv_buf,
                                               XATTR_NAME_LMV, 0, th);
                        if (rc != 0)
                                GOTO(out, rc);
                }

                /* don't insert stripe if it's existed stripe of splitting
                 * directory (this directory is striped).
                 * NB, plain directory will insert itself as the first
                 * stripe in target.
                 */
                if (lod_is_splitting(lo) && lo->ldo_dir_split_offset > 1 &&
                    lo->ldo_dir_split_offset > i)
                        continue;

                /* insert the master's FID under the "dotdot" key */
                rec->rec_fid = lu_object_fid(&dt->do_lu);
                rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
                                    (const struct dt_key *)dotdot, th);
                if (rc != 0)
                        GOTO(out, rc);

                /* stripe name is "<stripe FID>:<index>"; a fail point
                 * deliberately records a wrong index (i + 1) */
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
                    cfs_fail_val == i)
                        snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
                                 PFID(lu_object_fid(&dto->do_lu)), i + 1);
                else
                        snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
                                 PFID(lu_object_fid(&dto->do_lu)), i);

                /* linkEA of the stripe: stripe name under the master's FID */
                sname = lod_name_get(env, stripe_name, strlen(stripe_name));
                rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
                                      sname, lu_object_fid(&dt->do_lu));
                if (rc != 0)
                        GOTO(out, rc);

                linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
                linkea_buf.lb_len = ldata.ld_leh->leh_len;
                rc = lod_sub_xattr_set(env, dto, &linkea_buf,
                                       XATTR_NAME_LINK, 0, th);
                if (rc != 0)
                        GOTO(out, rc);

                /* insert the stripe into the master under its stripe name
                 * and take a reference on the master for it */
                rec->rec_fid = lu_object_fid(&dto->do_lu);
                rc = lod_sub_insert(env, dt_object_child(dt),
                                    (const struct dt_rec *)rec,
                                    (const struct dt_key *)stripe_name, th);
                if (rc != 0)
                        GOTO(out, rc);

                rc = lod_sub_ref_add(env, dt_object_child(dt), th);
                if (rc != 0)
                        GOTO(out, rc);
        }

        /* finally set the master LMV EA, unless fault injection drops it */
        if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MASTER_LMV))
                rc = lod_sub_xattr_set(env, dt_object_child(dt),
                                       &lmv_buf, XATTR_NAME_LMV, fl, th);
out:
        if (slave_lmm != NULL)
                OBD_FREE_PTR(slave_lmm);

        RETURN(rc);
}
4436
4437 /**
4438  * Helper function to declare/execute creation of a striped directory
4439  *
4440  * Called in declare/create object path, prepare striping for a directory
4441  * and prepare defaults data striping for the objects to be created in
4442  * that directory. Notice the function calls "declaration" or "execution"
4443  * methods depending on \a declare param. This is a consequence of the
4444  * current approach while we don't have natural distributed transactions:
4445  * we basically execute non-local updates in the declare phase. So, the
4446  * arguments for the both phases are the same and this is the reason for
4447  * this function to exist.
4448  *
4449  * \param[in] env       execution environment
4450  * \param[in] dt        object
4451  * \param[in] attr      attributes the stripes will be created with
4452  * \param[in] lmu       lmv_user_md if MDT indices are specified
4453  * \param[in] dof       format of stripes (see OSD API description)
4454  * \param[in] th        transaction handle
4455  * \param[in] declare   where to call "declare" or "execute" methods
4456  *
4457  * \retval              0 on success
4458  * \retval              negative if failed
4459  */
4460 static int lod_dir_striping_create_internal(const struct lu_env *env,
4461                                             struct dt_object *dt,
4462                                             struct lu_attr *attr,
4463                                             const struct lu_buf *lmu,
4464                                             struct dt_object_format *dof,
4465                                             struct thandle *th,
4466                                             bool declare)
4467 {
4468         struct lod_thread_info *info = lod_env_info(env);
4469         struct lod_object *lo = lod_dt_obj(dt);
4470         const struct lod_default_striping *lds = lo->ldo_def_striping;
4471         int rc;
4472         ENTRY;
4473
4474         LASSERT(ergo(lds != NULL,
4475                      lds->lds_def_striping_set ||
4476                      lds->lds_dir_def_striping_set));
4477
4478         if (!LMVEA_DELETE_VALUES(lo->ldo_dir_stripe_count,
4479                                  lo->ldo_dir_stripe_offset)) {
4480                 if (!lmu) {
4481                         struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4482                         int stripe_count = lo->ldo_dir_stripe_count;
4483
4484                         if (info->lti_ea_store_size < sizeof(*v1)) {
4485                                 rc = lod_ea_store_resize(info, sizeof(*v1));
4486                                 if (rc != 0)
4487                                         RETURN(rc);
4488                                 v1 = info->lti_ea_store;
4489                         }
4490
4491                         memset(v1, 0, sizeof(*v1));
4492                         v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4493                         v1->lum_stripe_count = cpu_to_le32(stripe_count);
4494                         v1->lum_stripe_offset =
4495                                         cpu_to_le32(lo->ldo_dir_stripe_offset);
4496
4497                         info->lti_buf.lb_buf = v1;
4498                         info->lti_buf.lb_len = sizeof(*v1);
4499                         lmu = &info->lti_buf;
4500                 }
4501
4502                 if (declare)
4503                         rc = lod_declare_xattr_set_lmv(env, dt, attr, lmu, dof,
4504                                                        th);
4505                 else
4506                         rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
4507                                                th);
4508                 if (rc != 0)
4509                         RETURN(rc);
4510         } else {
4511                 /* foreign LMV EA case */
4512                 if (lmu) {
4513                         struct lmv_foreign_md *lfm = lmu->lb_buf;
4514
4515                         if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) {
4516                                 rc = lod_declare_xattr_set_lmv(env, dt, attr,
4517                                                                lmu, dof, th);
4518                         }
4519                 } else {
4520                         if (lo->ldo_is_foreign) {
4521                                 LASSERT(lo->ldo_foreign_lmv != NULL &&
4522                                         lo->ldo_foreign_lmv_size > 0);
4523                                 info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
4524                                 info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
4525                                 lmu = &info->lti_buf;
4526                                 rc = lod_xattr_set_lmv(env, dt, lmu,
4527                                                        XATTR_NAME_LMV, 0, th);
4528                         }
4529                 }
4530         }
4531
4532         /* Transfer default LMV striping from the parent */
4533         if (lds != NULL && lds->lds_dir_def_striping_set &&
4534             lds->lds_dir_def_max_inherit != LMV_INHERIT_END &&
4535             lds->lds_dir_def_max_inherit != LMV_INHERIT_NONE &&
4536             !(LMVEA_DELETE_VALUES(lds->lds_dir_def_stripe_count,
4537                                  lds->lds_dir_def_stripe_offset) &&
4538               le32_to_cpu(lds->lds_dir_def_hash_type) !=
4539               LMV_HASH_TYPE_UNKNOWN)) {
4540                 struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4541
4542                 if (info->lti_ea_store_size < sizeof(*v1)) {
4543                         rc = lod_ea_store_resize(info, sizeof(*v1));
4544                         if (rc != 0)
4545                                 RETURN(rc);
4546                         v1 = info->lti_ea_store;
4547                 }
4548
4549                 memset(v1, 0, sizeof(*v1));
4550                 v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4551                 v1->lum_stripe_count =
4552                         cpu_to_le32(lds->lds_dir_def_stripe_count);
4553                 v1->lum_stripe_offset =
4554                         cpu_to_le32(lds->lds_dir_def_stripe_offset);
4555                 v1->lum_hash_type =
4556                         cpu_to_le32(lds->lds_dir_def_hash_type);
4557                 v1->lum_max_inherit =
4558                         lmv_inherit_next(lds->lds_dir_def_max_inherit);
4559                 v1->lum_max_inherit_rr =
4560                         lmv_inherit_rr_next(lds->lds_dir_def_max_inherit_rr);
4561
4562                 info->lti_buf.lb_buf = v1;
4563                 info->lti_buf.lb_len = sizeof(*v1);
4564                 if (declare)
4565                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4566                                                        XATTR_NAME_DEFAULT_LMV,
4567                                                        0, th);
4568                 else
4569                         rc = lod_xattr_set_default_lmv_on_dir(env, dt,
4570                                                   &info->lti_buf,
4571                                                   XATTR_NAME_DEFAULT_LMV, 0,
4572                                                   th);
4573                 if (rc != 0)
4574                         RETURN(rc);
4575         }
4576
4577         /* Transfer default LOV striping from the parent */
4578         if (lds != NULL && lds->lds_def_striping_set &&
4579             lds->lds_def_comp_cnt != 0) {
4580                 struct lov_mds_md *lmm;
4581                 int lmm_size = lod_comp_md_size(lo, true);
4582
4583                 if (info->lti_ea_store_size < lmm_size) {
4584                         rc = lod_ea_store_resize(info, lmm_size);
4585                         if (rc != 0)
4586                                 RETURN(rc);
4587                 }
4588                 lmm = info->lti_ea_store;
4589
4590                 rc = lod_generate_lovea(env, lo, lmm, &lmm_size, true);
4591                 if (rc != 0)
4592                         RETURN(rc);
4593
4594                 info->lti_buf.lb_buf = lmm;
4595                 info->lti_buf.lb_len = lmm_size;
4596
4597                 if (declare)
4598                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4599                                                        XATTR_NAME_LOV, 0, th);
4600                 else
4601                         rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4602                                                       XATTR_NAME_LOV, 0, th);
4603                 if (rc != 0)
4604                         RETURN(rc);
4605         }
4606
4607         /* ldo_def_striping is not allocated, clear after use, in case directory
4608          * layout is changed later.
4609          */
4610         if (!declare)
4611                 lo->ldo_def_striping = NULL;
4612
4613         RETURN(0);
4614 }
4615
/**
 * Declare the creation of directory striping.
 *
 * Thin wrapper that forwards every argument to
 * lod_dir_striping_create_internal() with the declare flag set.
 *
 * \param[in] env       execution environment
 * \param[in] dt        directory object
 * \param[in] attr      attributes, passed through unchanged
 * \param[in] lmu       striping buffer, passed through unchanged
 * \param[in] dof       object format, passed through unchanged
 * \param[in] th        transaction handle
 *
 * \retval              whatever lod_dir_striping_create_internal() returns
 */
static int lod_declare_dir_striping_create(const struct lu_env *env,
                                           struct dt_object *dt,
                                           struct lu_attr *attr,
                                           struct lu_buf *lmu,
                                           struct dt_object_format *dof,
                                           struct thandle *th)
{
        const bool declare_only = true;

        return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
                                                declare_only);
}
4626
/**
 * Execute the creation of directory striping.
 *
 * Thin wrapper that calls lod_dir_striping_create_internal() with no
 * striping buffer and with the declare flag cleared.
 *
 * \param[in] env       execution environment
 * \param[in] dt        directory object
 * \param[in] attr      attributes, passed through unchanged
 * \param[in] dof       object format, passed through unchanged
 * \param[in] th        transaction handle
 *
 * \retval              whatever lod_dir_striping_create_internal() returns
 */
static int lod_dir_striping_create(const struct lu_env *env,
                                   struct dt_object *dt,
                                   struct lu_attr *attr,
                                   struct dt_object_format *dof,
                                   struct thandle *th)
{
        const bool declare_only = false;

        return lod_dir_striping_create_internal(env, dt, attr, NULL, dof, th,
                                                declare_only);
}
4636
4637 /**
4638  * Make LOV EA for striped object.
4639  *
4640  * Generate striping information and store it in the LOV EA of the given
4641  * object. The caller must ensure nobody else is calling the function
4642  * against the object concurrently. The transaction must be started.
4643  * FLDB service must be running as well; it's used to map FID to the target,
4644  * which is stored in LOV EA.
4645  *
4646  * \param[in] env               execution environment for this thread
4647  * \param[in] lo                LOD object
4648  * \param[in] th                transaction handle
4649  *
4650  * \retval                      0 if LOV EA is stored successfully
4651  * \retval                      negative error number on failure
4652  */
4653 static int lod_generate_and_set_lovea(const struct lu_env *env,
4654                                       struct lod_object *lo,
4655                                       struct thandle *th)
4656 {
4657         struct lod_thread_info  *info = lod_env_info(env);
4658         struct dt_object        *next = dt_object_child(&lo->ldo_obj);
4659         struct lov_mds_md_v1    *lmm;
4660         int                      rc, lmm_size;
4661         ENTRY;
4662
4663         LASSERT(lo);
4664
4665         if (lo->ldo_comp_cnt == 0 && !lo->ldo_is_foreign) {
4666                 lod_striping_free_nolock(env, lo);
4667                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
4668                 RETURN(rc);
4669         }
4670
4671         lmm_size = lod_comp_md_size(lo, false);
4672         if (info->lti_ea_store_size < lmm_size) {
4673                 rc = lod_ea_store_resize(info, lmm_size);
4674                 if (rc)
4675                         RETURN(rc);
4676         }
4677         lmm = info->lti_ea_store;
4678
4679         rc = lod_generate_lovea(env, lo, lmm, &lmm_size, false);
4680         if (rc)
4681                 RETURN(rc);
4682
4683         info->lti_buf.lb_buf = lmm;
4684         info->lti_buf.lb_len = lmm_size;
4685         rc = lod_sub_xattr_set(env, next, &info->lti_buf,
4686                                XATTR_NAME_LOV, 0, th);
4687         RETURN(rc);
4688 }
4689
/*
 * Forward declaration so that lod_layout_repeat_comp() can call this
 * function ahead of its definition.  That caller passes the mirror ID and
 * the array index of the component needing an ID, and treats a returned
 * LCME_ID_INVAL as failure.
 */
static __u32 lod_gen_component_id(struct lod_object *lo,
                                  int mirror_id, int comp_idx);
4692
4693 /**
4694  * Repeat an existing component
4695  *
4696  * Creates a new layout by replicating an existing component.  Uses striping
4697  * policy from previous component as a template for the striping for the new
4698  * new component.
4699  *
4700  * New component starts with zero length, will be extended (or removed) before
4701  * returning layout to client.
4702  *
4703  * NB: Reallocates layout components array (lo->ldo_comp_entries), invalidating
4704  * any pre-existing pointers to components.  Handle with care.
4705  *
4706  * \param[in] env       execution environment for this thread
4707  * \param[in,out] lo    object to update the layout of
4708  * \param[in] index     index of component to copy
4709  *
4710  * \retval      0 on success
4711  * \retval      negative errno on error
4712  */
4713 static int lod_layout_repeat_comp(const struct lu_env *env,
4714                                   struct lod_object *lo, int index)
4715 {
4716         struct lod_layout_component *lod_comp;
4717         struct lod_layout_component *new_comp = NULL;
4718         struct lod_layout_component *comp_array;
4719         int rc = 0, i, new_cnt = lo->ldo_comp_cnt + 1;
4720         __u16 mirror_id;
4721         int offset = 0;
4722         ENTRY;
4723
4724         lod_comp = &lo->ldo_comp_entries[index];
4725         LASSERT(lod_comp_inited(lod_comp) && lod_comp->llc_id != LCME_ID_INVAL);
4726
4727         CDEBUG(D_LAYOUT, "repeating component %d\n", index);
4728
4729         OBD_ALLOC_PTR_ARRAY(comp_array, new_cnt);
4730         if (comp_array == NULL)
4731                 GOTO(out, rc = -ENOMEM);
4732
4733         for (i = 0; i < lo->ldo_comp_cnt; i++) {
4734                 memcpy(&comp_array[i + offset], &lo->ldo_comp_entries[i],
4735                        sizeof(*comp_array));
4736
4737                 /* Duplicate this component in to the next slot */
4738                 if (i == index) {
4739                         new_comp = &comp_array[i + 1];
4740                         memcpy(&comp_array[i + 1], &lo->ldo_comp_entries[i],
4741                                sizeof(*comp_array));
4742                         /* We must now skip this new component when copying */
4743                         offset = 1;
4744                 }
4745         }
4746
4747         /* Set up copied component */
4748         new_comp->llc_flags &= ~LCME_FL_INIT;
4749         new_comp->llc_stripe = NULL;
4750         new_comp->llc_stripes_allocated = 0;
4751         new_comp->llc_ost_indices = NULL;
4752         new_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
4753         /* for uninstantiated components, layout gen stores default stripe
4754          * offset */
4755         new_comp->llc_layout_gen = lod_comp->llc_stripe_offset;
4756         /* This makes the repeated component zero-length, placed at the end of
4757          * the preceding component */
4758         new_comp->llc_extent.e_start = new_comp->llc_extent.e_end;
4759         new_comp->llc_timestamp = lod_comp->llc_timestamp;
4760         new_comp->llc_pool = NULL;
4761
4762         rc = lod_set_pool(&new_comp->llc_pool, lod_comp->llc_pool);
4763         if (rc)
4764                 GOTO(out, rc);
4765
4766         if (new_comp->llc_ostlist.op_array) {
4767                 __u32 *op_array = NULL;
4768
4769                 OBD_ALLOC(op_array, new_comp->llc_ostlist.op_size);
4770                 if (!op_array)
4771                         GOTO(out, rc = -ENOMEM);
4772                 memcpy(op_array, &new_comp->llc_ostlist.op_array,
4773                        new_comp->llc_ostlist.op_size);
4774                 new_comp->llc_ostlist.op_array = op_array;
4775         }
4776
4777         OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
4778         lo->ldo_comp_entries = comp_array;
4779         lo->ldo_comp_cnt = new_cnt;
4780
4781         /* Generate an id for the new component */
4782         mirror_id = mirror_id_of(new_comp->llc_id);
4783         new_comp->llc_id = LCME_ID_INVAL;
4784         new_comp->llc_id = lod_gen_component_id(lo, mirror_id, index + 1);
4785         if (new_comp->llc_id == LCME_ID_INVAL)
4786                 GOTO(out, rc = -ERANGE);
4787
4788         EXIT;
4789 out:
4790         if (rc)
4791                 OBD_FREE_PTR_ARRAY(comp_array, new_cnt);
4792
4793         return rc;
4794 }
4795
4796 static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
4797 {
4798         ENTRY;
4799
4800         /* clear memory region that will be used for layout change */
4801         memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
4802         info->lti_count = 0;
4803
4804         if (info->lti_comp_size >= comp_cnt)
4805                 RETURN(0);
4806
4807         if (info->lti_comp_size > 0) {
4808                 OBD_FREE_PTR_ARRAY(info->lti_comp_idx, info->lti_comp_size);
4809                 info->lti_comp_size = 0;
4810         }
4811
4812         OBD_ALLOC_PTR_ARRAY(info->lti_comp_idx, comp_cnt);
4813         if (!info->lti_comp_idx)
4814                 RETURN(-ENOMEM);
4815
4816         info->lti_comp_size = comp_cnt;
4817         RETURN(0);
4818 }
4819
/**
 * Prepare new layout minus deleted components
 *
 * Removes components marked for deletion (LCME_ID_INVAL) by copying to a new
 * layout and skipping those components.  Removes stripe objects if any exist.
 *
 * NB:
 * Reallocates layout components array (lo->ldo_comp_entries), invalidating
 * any pre-existing pointers to components.
 *
 * Caller is responsible for updating mirror end (ldo_mirror[].lme_end).
 *
 * At least one component must be marked for deletion; this is asserted.
 * On error the function returns before the components array is rebuilt, so
 * components processed so far have already had their resources released
 * while lo->ldo_comp_entries still holds the old array.
 *
 * \param[in] env       execution environment for this thread
 * \param[in,out] lo    object to update the layout of
 * \param[in] th        transaction handle for this operation
 *
 * \retval      # of components deleted
 * \retval      negative errno on error
 */
static int lod_layout_del_prep_layout(const struct lu_env *env,
                                      struct lod_object *lo,
                                      struct thandle *th)
{
        struct lod_layout_component     *lod_comp;
        struct lod_thread_info  *info = lod_env_info(env);
        int rc = 0, i, j, deleted = 0;

        ENTRY;

        LASSERT(lo->ldo_is_composite);
        LASSERT(lo->ldo_comp_cnt > 0 && lo->ldo_comp_entries != NULL);

        /* resets info->lti_count and sizes info->lti_comp_idx for all
         * current components */
        rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
        if (rc)
                RETURN(rc);

        for (i = 0; i < lo->ldo_comp_cnt; i++) {
                lod_comp = &lo->ldo_comp_entries[i];

                if (lod_comp->llc_id != LCME_ID_INVAL) {
                        /* Build array of things to keep */
                        info->lti_comp_idx[info->lti_count++] = i;
                        continue;
                }

                /* component is marked for deletion: drop its pool name and
                 * its OST list first */
                lod_obj_set_pool(lo, i, NULL);
                if (lod_comp->llc_ostlist.op_array) {
                        OBD_FREE(lod_comp->llc_ostlist.op_array,
                                 lod_comp->llc_ostlist.op_size);
                        lod_comp->llc_ostlist.op_array = NULL;
                        lod_comp->llc_ostlist.op_size = 0;
                }

                deleted++;
                CDEBUG(D_LAYOUT, "deleting comp %d, left %d\n", i,
                       lo->ldo_comp_cnt - deleted);

                /* No striping info for this component */
                if (lod_comp->llc_stripe == NULL)
                        continue;

                LASSERT(lod_comp->llc_stripe_count > 0);
                for (j = 0; j < lod_comp->llc_stripe_count; j++) {
                        struct dt_object *obj = lod_comp->llc_stripe[j];

                        if (obj == NULL)
                                continue;

                        /* components which are not init have no sub objects
                         * to destroy */
                        if (lod_comp_inited(lod_comp)) {
                                rc = lod_sub_destroy(env, obj, th);
                                if (rc)
                                        GOTO(out, rc);
                        }

                        /* release the reference and forget the stripe */
                        lu_object_put(env, &obj->do_lu);
                        lod_comp->llc_stripe[j] = NULL;
                }
                /* both per-stripe arrays are sized by llc_stripes_allocated,
                 * so the counter is cleared only after both are freed */
                OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
                                   lod_comp->llc_stripes_allocated);
                lod_comp->llc_stripe = NULL;
                OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
                                   lod_comp->llc_stripes_allocated);
                lod_comp->llc_ost_indices = NULL;
                lod_comp->llc_stripes_allocated = 0;
        }

        /* info->lti_count has the amount of left components */
        LASSERTF(info->lti_count >= 0 && info->lti_count < lo->ldo_comp_cnt,
                 "left = %d, lo->ldo_comp_cnt %d\n", (int)info->lti_count,
                 (int)lo->ldo_comp_cnt);

        if (info->lti_count > 0) {
                struct lod_layout_component *comp_array;

                OBD_ALLOC_PTR_ARRAY(comp_array, info->lti_count);
                if (comp_array == NULL)
                        GOTO(out, rc = -ENOMEM);

                /* copy the surviving components, in their original order */
                for (i = 0; i < info->lti_count; i++) {
                        memcpy(&comp_array[i],
                               &lo->ldo_comp_entries[info->lti_comp_idx[i]],
                               sizeof(*comp_array));
                }

                /* swap in the compacted array; the old one is freed using
                 * the old component count */
                OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
                lo->ldo_comp_entries = comp_array;
                lo->ldo_comp_cnt = info->lti_count;
        } else {
                /* every component was deleted */
                lod_free_comp_entries(lo);
        }

        EXIT;
out:
        /* a non-zero rc is an error; otherwise report how many were deleted */
        return rc ? rc : deleted;
}
4937
4938 /**
4939  * Delete layout component(s)
4940  *
4941  * This function sets up the layout data in the env and does the setattrs
4942  * required to write out the new layout.  The layout itself is modified in
4943  * lod_layout_del_prep_layout.
4944  *
4945  * \param[in] env       execution environment for this thread
4946  * \param[in] dt        object
4947  * \param[in] th        transaction handle
4948  *
4949  * \retval      0 on success
4950  * \retval      negative error number on failure
4951  */
4952 static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
4953                           struct thandle *th)
4954 {
4955         struct lod_object *lo = lod_dt_obj(dt);
4956         struct dt_object *next = dt_object_child(dt);
4957         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
4958         int rc;
4959
4960         LASSERT(lo->ldo_mirror_count == 1);
4961
4962         mutex_lock(&lo->ldo_layout_mutex);
4963
4964         rc = lod_layout_del_prep_layout(env, lo, th);
4965         if (rc < 0)
4966                 GOTO(out, rc);
4967
4968         /* Only do this if we didn't delete all components */
4969         if (lo->ldo_comp_cnt > 0) {
4970                 lo->ldo_mirrors[0].lme_end = lo->ldo_comp_cnt - 1;
4971                 lod_obj_inc_layout_gen(lo);
4972         }
4973
4974         LASSERT(dt_object_exists(dt));
4975         rc = dt_attr_get(env, next, attr);
4976         if (rc)
4977                 GOTO(out, rc);
4978
4979         if (attr->la_size > 0) {
4980                 attr->la_size = 0;
4981                 attr->la_valid = LA_SIZE;
4982                 rc = lod_sub_attr_set(env, next, attr, th);
4983                 if (rc)
4984                         GOTO(out, rc);
4985         }
4986
4987         rc = lod_generate_and_set_lovea(env, lo, th);
4988         EXIT;
4989 out:
4990         if (rc)
4991                 lod_striping_free_nolock(env, lo);
4992
4993         mutex_unlock(&lo->ldo_layout_mutex);
4994
4995         return rc;
4996 }
4997
4998
/**
 * Implementation of dt_object_operations::do_xattr_set.
 *
 * Sets specified extended attribute on the object. Three types of EAs are
 * special:
 *   LOV EA - stores striping for a regular file or default striping (when set
 *            on a directory)
 *   LMV EA - stores a marker for the striped directories
 *   DMV EA - stores default directory striping
 *
 * When striping is applied to a non-striped existing object (this is called
 * late striping), then LOD notices the caller wants to turn the object into a
 * striped one. The stripe objects are created and appropriate EA is set:
 * LOV EA storing all the stripes directly or LMV EA storing just a small header
 * with striping configuration.
 *
 * The handler is selected from the object type (directory or regular file),
 * the EA \a name and the flags \a fl; any combination not matched below is
 * passed to lod_xattr_set_internal().
 *
 * \param[in] env       execution environment
 * \param[in] dt        object the EA is set on
 * \param[in] buf       EA value
 * \param[in] name      EA name
 * \param[in] fl        LU_XATTR_* flags
 * \param[in] th        transaction handle
 *
 * \retval              result of the selected handler
 *
 * \see dt_object_operations::do_xattr_set() in the API description for details.
 */
static int lod_xattr_set(const struct lu_env *env,
                         struct dt_object *dt, const struct lu_buf *buf,
                         const char *name, int fl, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);
        struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
        struct lod_object *lo = lod_dt_obj(dt);
        struct lod_obj_stripe_cb_data data = { {0} };
        int rc = 0;

        ENTRY;

        if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
            !strcmp(name, XATTR_NAME_LMV)) {
                /* LMV EA on a directory: create striping or set the layout,
                 * depending on the flags; other flag values are a bug */
                switch (fl) {
                case LU_XATTR_CREATE:
                        rc = lod_dir_striping_create(env, dt, NULL, NULL, th);
                        break;
                case 0:
                case LU_XATTR_REPLACE:
                        rc = lod_dir_layout_set(env, dt, buf, fl, th);
                        break;
                default:
                        LBUG();
                }

                RETURN(rc);
        } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
                   strcmp(name, XATTR_NAME_LOV) == 0) {
                /* LOV EA on a directory: default file striping */
                rc = lod_xattr_set_default_lov_on_dir(env, dt, buf, name, fl,
                                                      th);
                RETURN(rc);
        } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
                   strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
                /* default LMVEA */
                rc = lod_xattr_set_default_lmv_on_dir(env, dt, buf, name, fl,
                                                      th);
                RETURN(rc);
        } else if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
                   (strcmp(name, XATTR_NAME_LOV) == 0 ||
                    strcmp(name, XATTR_LUSTRE_LOV) == 0 ||
                    allowed_lustre_lov(name))) {
                /* in case of lov EA swap, just set it
                 * if not, it is a replay so check striping match what we
                 * already have during req replay, declare_xattr_set()
                 * defines striping, then create() does the work */
                if (fl & LU_XATTR_REPLACE) {
                        /* free stripes, then update disk */
                        lod_striping_free(env, lod_dt_obj(dt));

                        rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
                } else if (fl & LU_XATTR_SPLIT) {
                        /* store the EA first, then reload the in-memory
                         * striping from the buffer just written */
                        rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
                        if (rc)
                                RETURN(rc);

                        rc = lod_striping_reload(env, lo, buf, LVF_ALL_STALE);
                        if (rc)
                                RETURN(rc);

                        if (lo->ldo_mirror_count > 1 &&
                            layout_attr->la_valid & LA_LAYOUT_VERSION) {
                                /* mirror split */
                                /* set the current layout generation as the
                                 * layout version on every stripe */
                                layout_attr->la_layout_version =
                                                lo->ldo_layout_gen;
                                data.locd_attr = layout_attr;
                                data.locd_declare = false;
                                data.locd_stripe_cb =
                                                lod_obj_stripe_attr_set_cb;
                                rc = lod_obj_for_each_stripe(env, lo, th,
                                                             &data);
                                if (rc)
                                        RETURN(rc);
                        }
                } else if (fl & LU_XATTR_PURGE) {
                        rc = lod_layout_purge(env, dt, buf, th);
                } else if (dt_object_remote(dt)) {
                        /* This only happens during migration, see
                         * mdd_migrate_create(), in which Master MDT will
                         * create a remote target object, and only set
                         * (migrating) stripe EA on the remote object,
                         * and does not need creating each stripes. */
                        rc = lod_sub_xattr_set(env, next, buf, name,
                                                      fl, th);
                } else if (strcmp(name, XATTR_LUSTRE_LOV".del") == 0) {
                        /* delete component(s) */
                        LASSERT(lod_dt_obj(dt)->ldo_comp_cached);
                        rc = lod_layout_del(env, dt, th);
                } else {
                        /*
                         * When 'name' is XATTR_LUSTRE_LOV or XATTR_NAME_LOV,
                         * it's going to create a file with the specified
                         * component(s), the striping must not have been
                         * cached in this case;
                         *
                         * Otherwise, it's going to add/change component(s) to
                         * an existing file, the striping must have been cached
                         * in this case.
                         */
                        LASSERT(equi(!strcmp(name, XATTR_LUSTRE_LOV) ||
                                     !strcmp(name, XATTR_NAME_LOV),
                                !lod_dt_obj(dt)->ldo_comp_cached));

                        rc = lod_striped_create(env, dt, NULL, NULL, th);
                        if (rc)
                                RETURN(rc);

                        if (fl & LU_XATTR_MERGE && lo->ldo_mirror_count > 1 &&
                            layout_attr->la_valid & LA_LAYOUT_VERSION) {
                                /* mirror merge exec phase */
                                /* same per-stripe layout version update as
                                 * in the mirror split case above */
                                layout_attr->la_layout_version =
                                                lo->ldo_layout_gen;
                                data.locd_attr = layout_attr;
                                data.locd_declare = false;
                                data.locd_stripe_cb =
                                                lod_obj_stripe_attr_set_cb;
                                rc = lod_obj_for_each_stripe(env, lo, th,
                                                             &data);
                                if (rc)
                                        RETURN(rc);
                        }
                }
                RETURN(rc);
        } else if (strcmp(name, XATTR_NAME_FID) == 0) {
                /* FID EA, for any object type */
                rc = lod_replace_parent_fid(env, dt, buf, th, false);

                RETURN(rc);
        }

        /* then all other xattr */
        rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);

        RETURN(rc);
}
5151
5152 /**
5153  * Implementation of dt_object_operations::do_declare_xattr_del.
5154  *
5155  * \see dt_object_operations::do_declare_xattr_del() in the API description
5156  * for details.
5157  */
5158 static int lod_declare_xattr_del(const struct lu_env *env,
5159                                  struct dt_object *dt, const char *name,
5160                                  struct thandle *th)
5161 {
5162         struct lod_object *lo = lod_dt_obj(dt);
5163         struct dt_object *next = dt_object_child(dt);
5164         int i;
5165         int rc;
5166         ENTRY;
5167
5168         rc = lod_sub_declare_xattr_del(env, next, name, th);
5169         if (rc != 0)
5170                 RETURN(rc);
5171
5172         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
5173                 RETURN(0);
5174
5175         /* NB: don't delete stripe LMV, because when we do this, normally we
5176          * will remove stripes, besides, if directory LMV is corrupt, this will
5177          * prevent deleting its LMV and fixing it (via LFSCK).
5178          */
5179         if (!strcmp(name, XATTR_NAME_LMV))
5180                 RETURN(0);
5181
5182         rc = lod_striping_load(env, lo);
5183         if (rc != 0)
5184                 RETURN(rc);
5185
5186         if (lo->ldo_dir_stripe_count == 0)
5187                 RETURN(0);
5188
5189         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5190                 struct dt_object *dto = lo->ldo_stripe[i];
5191
5192                 if (!dto)
5193                         continue;
5194
5195                 if (!dt_object_exists(dto))
5196                         continue;
5197
5198                 rc = lod_sub_declare_xattr_del(env, dto, name, th);
5199                 if (rc != 0)
5200                         break;
5201         }
5202
5203         RETURN(rc);
5204 }
5205
5206 /**
5207  * Implementation of dt_object_operations::do_xattr_del.
5208  *
5209  * If EA storing a regular striping is being deleted, then release
5210  * all the references to the stripe objects in core.
5211  *
5212  * \see dt_object_operations::do_xattr_del() in the API description for details.
5213  */
5214 static int lod_xattr_del(const struct lu_env *env, struct dt_object *dt,
5215                          const char *name, struct thandle *th)
5216 {
5217         int rc;
5218
5219         ENTRY;
5220
5221         if (!strcmp(name, XATTR_NAME_LOV) || !strcmp(name, XATTR_NAME_LMV))
5222                 lod_striping_free(env, lod_dt_obj(dt));
5223
5224         rc = lod_xattr_del_internal(env, dt, name, th);
5225
5226         RETURN(rc);
5227 }
5228
/**
 * Implementation of dt_object_operations::do_xattr_list.
 *
 * The request is passed unchanged to the child object.
 *
 * \see dt_object_operations::do_xattr_list() in the API description
 * for details.
 */
static int lod_xattr_list(const struct lu_env *env,
                          struct dt_object *dt, const struct lu_buf *buf)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_xattr_list(env, child, buf);
}
5240
5241 static inline int lod_object_will_be_striped(int is_reg, const struct lu_fid *fid)
5242 {
5243         return (is_reg && fid_seq(fid) != FID_SEQ_LOCAL_FILE);
5244 }
5245
5246 /**
5247  * Copy OST list from layout provided by user.
5248  *
5249  * \param[in] lod_comp          layout_component to be filled
5250  * \param[in] v3                LOV EA V3 user data
5251  *
5252  * \retval              0 on success
5253  * \retval              negative if failed
5254  */
5255 int lod_comp_copy_ost_lists(struct lod_layout_component *lod_comp,
5256                             struct lov_user_md_v3 *v3)
5257 {
5258         int j;
5259
5260         ENTRY;
5261
5262         if (v3->lmm_stripe_offset == LOV_OFFSET_DEFAULT)
5263                 v3->lmm_stripe_offset = v3->lmm_objects[0].l_ost_idx;
5264
5265         if (lod_comp->llc_ostlist.op_array) {
5266                 if (lod_comp->llc_ostlist.op_size >=
5267                     v3->lmm_stripe_count * sizeof(__u32))  {
5268                         lod_comp->llc_ostlist.op_count =
5269                                         v3->lmm_stripe_count;
5270                         goto skip;
5271                 }
5272                 OBD_FREE(lod_comp->llc_ostlist.op_array,
5273                          lod_comp->llc_ostlist.op_size);
5274         }
5275
5276         /* copy ost list from lmm */
5277         lod_comp->llc_ostlist.op_count = v3->lmm_stripe_count;
5278         lod_comp->llc_ostlist.op_size = v3->lmm_stripe_count * sizeof(__u32);
5279         OBD_ALLOC(lod_comp->llc_ostlist.op_array,
5280                   lod_comp->llc_ostlist.op_size);
5281         if (!lod_comp->llc_ostlist.op_array)
5282                 RETURN(-ENOMEM);
5283 skip:
5284         for (j = 0; j < v3->lmm_stripe_count; j++) {
5285                 lod_comp->llc_ostlist.op_array[j] =
5286                         v3->lmm_objects[j].l_ost_idx;
5287         }
5288
5289         RETURN(0);
5290 }
5291
5292
5293 /**
5294  * Get default striping.
5295  *
5296  * \param[in] env               execution environment
5297  * \param[in] lo                object
5298  * \param[out] lds              default striping
5299  *
5300  * \retval              0 on success
5301  * \retval              negative if failed
5302  */
5303 static int lod_get_default_lov_striping(const struct lu_env *env,
5304                                         struct lod_object *lo,
5305                                         struct lod_default_striping *lds,
5306                                         struct dt_allocation_hint *ah)
5307 {
5308         struct lod_thread_info *info = lod_env_info(env);
5309         struct lov_user_md_v1 *v1 = NULL;
5310         struct lov_user_md_v3 *v3 = NULL;
5311         struct lov_comp_md_v1 *comp_v1 = NULL;
5312         __u16 comp_cnt;
5313         __u16 mirror_cnt;
5314         bool composite;
5315         int rc, i, j;
5316
5317         ENTRY;
5318
5319         lds->lds_def_striping_set = 0;
5320
5321         rc = lod_get_lov_ea(env, lo);
5322         if (rc < 0)
5323                 RETURN(rc);
5324
5325         if (rc < (typeof(rc))sizeof(struct lov_user_md))
5326                 RETURN(0);
5327
5328         v1 = info->lti_ea_store;
5329         if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
5330                 lustre_swab_lov_user_md_v1(v1);
5331         } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
5332                 v3 = (struct lov_user_md_v3 *)v1;
5333                 lustre_swab_lov_user_md_v3(v3);
5334         } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_SPECIFIC)) {
5335                 v3 = (struct lov_user_md_v3 *)v1;
5336                 lustre_swab_lov_user_md_v3(v3);
5337                 lustre_swab_lov_user_md_objects(v3->lmm_objects,
5338                                                 v3->lmm_stripe_count);
5339         } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_COMP_V1) ||
5340                    v1->lmm_magic == __swab32(LOV_USER_MAGIC_SEL)) {
5341                 comp_v1 = (struct lov_comp_md_v1 *)v1;
5342                 lustre_swab_lov_comp_md_v1(comp_v1);
5343         }
5344
5345         if (v1->lmm_magic != LOV_MAGIC_V3 && v1->lmm_magic != LOV_MAGIC_V1 &&
5346             v1->lmm_magic != LOV_MAGIC_COMP_V1 &&
5347             v1->lmm_magic != LOV_MAGIC_SEL &&
5348             v1->lmm_magic != LOV_USER_MAGIC_SPECIFIC)
5349                 RETURN(-ENOTSUPP);
5350
5351         if ((v1->lmm_magic == LOV_MAGIC_COMP_V1 ||
5352             v1->lmm_magic == LOV_MAGIC_SEL) &&
5353              !(ah && ah->dah_append_stripes)) {
5354                 comp_v1 = (struct lov_comp_md_v1 *)v1;
5355                 comp_cnt = comp_v1->lcm_entry_count;
5356                 if (comp_cnt == 0)
5357                         RETURN(-EINVAL);
5358                 mirror_cnt = comp_v1->lcm_mirror_count + 1;
5359                 composite = true;
5360         } else {
5361                 comp_cnt = 1;
5362                 mirror_cnt = 0;
5363                 composite = false;
5364         }
5365
5366         /* realloc default comp entries if necessary */
5367         rc = lod_def_striping_comp_resize(lds, comp_cnt);
5368         if (rc < 0)
5369                 RETURN(rc);
5370
5371         lds->lds_def_comp_cnt = comp_cnt;
5372         lds->lds_def_striping_is_composite = composite;
5373         lds->lds_def_mirror_cnt = mirror_cnt;
5374
5375         for (i = 0; i < comp_cnt; i++) {
5376                 struct lod_layout_component *lod_comp;
5377                 char *pool;
5378
5379                 lod_comp = &lds->lds_def_comp_entries[i];
5380                 /*
5381                  * reset lod_comp values, llc_stripes is always NULL in
5382                  * the default striping template, llc_pool will be reset
5383                  * later below.
5384                  */
5385                 memset(lod_comp, 0, offsetof(typeof(*lod_comp), llc_pool));
5386
5387                 if (composite) {
5388                         v1 = (struct lov_user_md *)((char *)comp_v1 +
5389                                         comp_v1->lcm_entries[i].lcme_offset);
5390                         lod_comp->llc_extent =
5391                                         comp_v1->lcm_entries[i].lcme_extent;
5392                         /* We only inherit certain flags from the layout */
5393                         lod_comp->llc_flags =
5394                                         comp_v1->lcm_entries[i].lcme_flags &
5395                                         LCME_TEMPLATE_FLAGS;
5396                 }
5397
5398                 if (!lov_pattern_supported(v1->lmm_pattern) &&
5399                     !(v1->lmm_pattern & LOV_PATTERN_F_RELEASED)) {
5400                         lod_free_def_comp_entries(lds);
5401                         RETURN(-EINVAL);
5402                 }
5403
5404                 CDEBUG(D_LAYOUT, DFID" stripe_count=%d stripe_size=%d stripe_offset=%d append_stripes=%d\n",
5405                        PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5406                        (int)v1->lmm_stripe_count, (int)v1->lmm_stripe_size,
5407                        (int)v1->lmm_stripe_offset,
5408                        ah ? ah->dah_append_stripes : 0);
5409
5410                 if (ah && ah->dah_append_stripes)
5411                         lod_comp->llc_stripe_count = ah->dah_append_stripes;
5412                 else
5413                         lod_comp->llc_stripe_count = v1->lmm_stripe_count;
5414                 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
5415                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
5416                 lod_comp->llc_pattern = v1->lmm_pattern;
5417
5418                 pool = NULL;
5419                 if (ah && ah->dah_append_pool && ah->dah_append_pool[0]) {
5420                         pool = ah->dah_append_pool;
5421                 } else if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
5422                         /* XXX: sanity check here */
5423                         v3 = (struct lov_user_md_v3 *) v1;
5424                         if (v3->lmm_pool_name[0] != '\0')
5425                                 pool = v3->lmm_pool_name;
5426                 }
5427                 lod_set_def_pool(lds, i, pool);
5428                 if (v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
5429                         v3 = (struct lov_user_md_v3 *)v1;
5430                         rc = lod_comp_copy_ost_lists(lod_comp, v3);
5431                         if (rc)
5432                                 RETURN(rc);
5433                 } else if (lod_comp->llc_ostlist.op_array &&
5434                            lod_comp->llc_ostlist.op_count) {
5435                         for (j = 0; j < lod_comp->llc_ostlist.op_count; j++)
5436                                 lod_comp->llc_ostlist.op_array[j] = -1;
5437                         lod_comp->llc_ostlist.op_count = 0;
5438                 }
5439         }
5440
5441         lds->lds_def_striping_set = 1;
5442         RETURN(rc);
5443 }
5444
5445 /**
5446  * Get default directory striping.
5447  *
5448  * \param[in] env               execution environment
5449  * \param[in] lo                object
5450  * \param[out] lds              default striping
5451  *
5452  * \retval              0 on success
5453  * \retval              negative if failed
5454  */
5455 static int lod_get_default_lmv_striping(const struct lu_env *env,
5456                                         struct lod_object *lo,
5457                                         struct lod_default_striping *lds)
5458 {
5459         struct lmv_user_md *lmu;
5460         int rc;
5461
5462         lds->lds_dir_def_striping_set = 0;
5463
5464         rc = lod_get_default_lmv_ea(env, lo);
5465         if (rc < 0)
5466                 return rc;
5467
5468         if (rc >= (int)sizeof(*lmu)) {
5469                 struct lod_thread_info *info = lod_env_info(env);
5470
5471                 lmu = info->lti_ea_store;
5472
5473                 lds->lds_dir_def_stripe_count =
5474                                 le32_to_cpu(lmu->lum_stripe_count);
5475                 lds->lds_dir_def_stripe_offset =
5476                                 le32_to_cpu(lmu->lum_stripe_offset);
5477                 lds->lds_dir_def_hash_type =
5478                                 le32_to_cpu(lmu->lum_hash_type);
5479                 lds->lds_dir_def_max_inherit = lmu->lum_max_inherit;
5480                 lds->lds_dir_def_max_inherit_rr = lmu->lum_max_inherit_rr;
5481                 lds->lds_dir_def_striping_set = 1;
5482         }
5483
5484         return 0;
5485 }
5486
5487 /**
5488  * Get default striping in the object.
5489  *
5490  * Get object default striping and default directory striping.
5491  *
5492  * \param[in] env               execution environment
5493  * \param[in] lo                object
5494  * \param[out] lds              default striping
5495  *
5496  * \retval              0 on success
5497  * \retval              negative if failed
5498  */
5499 static int lod_get_default_striping(const struct lu_env *env,
5500                                     struct lod_object *lo,
5501                                     struct lod_default_striping *lds)
5502 {
5503         int rc, rc1;
5504
5505         rc = lod_get_default_lov_striping(env, lo, lds, NULL);
5506         if (lds->lds_def_striping_set) {
5507                 struct lod_thread_info *info = lod_env_info(env);
5508                 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5509
5510                 rc = lod_verify_striping(env, d, lo, &info->lti_buf, false);
5511                 if (rc)
5512                         lds->lds_def_striping_set = 0;
5513         }
5514
5515         rc1 = lod_get_default_lmv_striping(env, lo, lds);
5516         if (rc == 0 && rc1 < 0)
5517                 rc = rc1;
5518
5519         return rc;
5520 }
5521
5522 /**
5523  * Apply default striping on object.
5524  *
5525  * If object striping pattern is not set, set to the one in default striping.
5526  * The default striping is from parent or fs.
5527  *
5528  * \param[in] lo                new object
5529  * \param[in] lds               default striping
5530  * \param[in] mode              new object's mode
5531  */
5532 static void lod_striping_from_default(struct lod_object *lo,
5533                                       const struct lod_default_striping *lds,
5534                                       umode_t mode)
5535 {
5536         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5537         int i, rc;
5538
5539         if (lds->lds_def_striping_set && S_ISREG(mode)) {
5540                 struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
5541
5542                 rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
5543                                             lds->lds_def_comp_cnt);
5544                 if (rc != 0)
5545                         return;
5546
5547                 lo->ldo_is_composite = lds->lds_def_striping_is_composite;
5548                 if (lds->lds_def_mirror_cnt > 1)
5549                         lo->ldo_flr_state = LCM_FL_RDONLY;
5550
5551                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
5552                         struct lod_layout_component *obj_comp =
5553                                                 &lo->ldo_comp_entries[i];
5554                         struct lod_layout_component *def_comp =
5555                                                 &lds->lds_def_comp_entries[i];
5556
5557                         CDEBUG(D_LAYOUT,
5558                                "inherit "DFID" file layout from default: flags=%#x size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
5559                                PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5560                                def_comp->llc_flags,
5561                                def_comp->llc_stripe_size,
5562                                def_comp->llc_stripe_count,
5563                                def_comp->llc_stripe_offset,
5564                                def_comp->llc_pattern,
5565                                def_comp->llc_pool ?: "");
5566
5567                         *obj_comp = *def_comp;
5568                         if (def_comp->llc_pool != NULL) {
5569                                 /* pointer was copied from def_comp */
5570                                 obj_comp->llc_pool = NULL;
5571                                 lod_obj_set_pool(lo, i, def_comp->llc_pool);
5572                         }
5573
5574                         /* copy ost list */
5575                         if (def_comp->llc_ostlist.op_array &&
5576                             def_comp->llc_ostlist.op_count) {
5577                                 OBD_ALLOC(obj_comp->llc_ostlist.op_array,
5578                                           obj_comp->llc_ostlist.op_size);
5579                                 if (!obj_comp->llc_ostlist.op_array)
5580                                         return;
5581                                 memcpy(obj_comp->llc_ostlist.op_array,
5582                                        def_comp->llc_ostlist.op_array,
5583                                        obj_comp->llc_ostlist.op_size);
5584                         } else if (def_comp->llc_ostlist.op_array) {
5585                                 obj_comp->llc_ostlist.op_array = NULL;
5586                         }
5587
5588                         /*
5589                          * Don't initialize these fields for plain layout
5590                          * (v1/v3) here, they are inherited in the order of
5591                          * 'parent' -> 'fs default (root)' -> 'global default
5592                          * values for stripe_count & stripe_size'.
5593                          *
5594                          * see lod_ah_init().
5595                          */
5596                         if (!lo->ldo_is_composite)
5597                                 continue;
5598
5599                         lod_adjust_stripe_info(obj_comp, desc, 0);
5600                 }
5601         } else if (lds->lds_dir_def_striping_set && S_ISDIR(mode)) {
5602                 if (lo->ldo_dir_stripe_count == 0)
5603                         lo->ldo_dir_stripe_count =
5604                                 lds->lds_dir_def_stripe_count;
5605                 if (lo->ldo_dir_stripe_offset == -1)
5606                         lo->ldo_dir_stripe_offset =
5607                                 lds->lds_dir_def_stripe_offset;
5608                 if (lo->ldo_dir_hash_type == LMV_HASH_TYPE_UNKNOWN)
5609                         lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
5610
5611                 CDEBUG(D_LAYOUT,
5612                        "inherit "DFID" dir layout from default: count=%hu offset=%u hash_type=%x\n",
5613                        PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5614                        lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
5615                        lo->ldo_dir_hash_type);
5616         }
5617 }
5618
5619 static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root,
5620                                          char *append_pool)
5621 {
5622         struct lod_layout_component *lod_comp;
5623
5624         if (lo->ldo_comp_cnt == 0)
5625                 return true;
5626
5627         if (lo->ldo_is_composite)
5628                 return false;
5629
5630         lod_comp = &lo->ldo_comp_entries[0];
5631
5632         if (lod_comp->llc_stripe_count <= 0 ||
5633             lod_comp->llc_stripe_size <= 0)
5634                 return true;
5635
5636         if (from_root && (lod_comp->llc_pool == NULL ||
5637                           lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT))
5638                 return true;
5639
5640         if (append_pool && append_pool[0])
5641                 return true;
5642
5643         return false;
5644 }
5645
/**
 * Implementation of dt_object_operations::do_ah_init.
 *
 * This method is used to make a decision on the striping configuration for the
 * object being created. It can be taken from the \a parent object if it exists,
 * or filesystem's default. The resulting configuration (number of stripes,
 * stripe size/offset, pool name, hash_type, etc.) is stored in the object
 * itself and will be used by the methods like ->doo_declare_create().
 *
 * For a directory the explicit LMV in ah->dah_eadata (if any) is honoured
 * first, otherwise the parent's default LMV is applied. For any other object
 * that will be striped, the default LOV layout is taken from the parent, then
 * completed from the root object, then from the LOV descriptor defaults.
 *
 * \param[in] env               execution environment
 * \param[in,out] ah            allocation hint; a dah_append_stripes of -1 is
 *                              replaced by the LOV descriptor's target count
 * \param[in] parent            parent object, may be NULL
 * \param[in] child             object being created, must not be striped
 * \param[in] child_mode        mode of the object being created
 *
 * \see dt_object_operations::do_ah_init() in the API description for details.
 */
static void lod_ah_init(const struct lu_env *env,
                        struct dt_allocation_hint *ah,
                        struct dt_object *parent,
                        struct dt_object *child,
                        umode_t child_mode)
{
        struct lod_device *d = lu2lod_dev(child->do_lu.lo_dev);
        struct lod_thread_info *info = lod_env_info(env);
        struct lod_default_striping *lds = lod_lds_buf_get(env);
        struct dt_object *nextp = NULL;
        struct dt_object *nextc;
        struct lod_object *lp = NULL;
        struct lod_object *lc;
        struct lov_desc *desc;
        struct lod_layout_component *lod_comp;
        int rc;
        ENTRY;

        LASSERT(child);

        /* -1 is replaced by the target count of the LOV descriptor */
        if (ah->dah_append_stripes == -1)
                ah->dah_append_stripes =
                        d->lod_ost_descs.ltd_lov_desc.ld_tgt_count;

        if (likely(parent)) {
                nextp = dt_object_child(parent);
                lp = lod_dt_obj(parent);
        }

        nextc = dt_object_child(child);
        lc = lod_dt_obj(child);

        LASSERT(!lod_obj_is_striped(child));
        /* default layout template may have been set on the regular file
         * when this is called from mdd_create_data() */
        if (S_ISREG(child_mode))
                lod_free_comp_entries(lc);

        /* pass the hint down to the next layer for a child that does not
         * exist yet */
        if (!dt_object_exists(nextc))
                nextc->do_ops->do_ah_init(env, ah, nextp, nextc, child_mode);

        if (S_ISDIR(child_mode)) {
                const struct lmv_user_md_v1 *lum1 = ah->dah_eadata;

                /* other default values are 0 */
                lc->ldo_dir_stripe_offset = -1;

                /* no default striping configuration is needed for
                 * foreign dirs
                 */
                if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
                    le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_FOREIGN) {
                        lc->ldo_is_foreign = true;
                        /* keep stripe_count 0 and stripe_offset -1 */
                        CDEBUG(D_INFO, "no default striping for foreign dir\n");
                        RETURN_EXIT;
                }

                /* load the parent's default LOV and LMV striping into lds;
                 * the return value is not checked, only the *_set flags in
                 * lds are consulted below */
                if (likely(lp != NULL))
                        lod_get_default_striping(env, lp, lds);

                /* It should always honour the specified stripes */
                /* Note: old client (< 2.7) might also do lfs mkdir, whose EA
                 * will have old magic. In this case, we should ignore the
                 * stripe count and try to create dir by default stripe.
                 */
                if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
                    (le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC ||
                     le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC)) {
                        lc->ldo_dir_stripe_count =
                                le32_to_cpu(lum1->lum_stripe_count);
                        lc->ldo_dir_stripe_offset =
                                le32_to_cpu(lum1->lum_stripe_offset);
                        lc->ldo_dir_hash_type =
                                le32_to_cpu(lum1->lum_hash_type);
                        CDEBUG(D_INFO,
                               "set dirstripe: count %hu, offset %d, hash %x\n",
                                lc->ldo_dir_stripe_count,
                                (int)lc->ldo_dir_stripe_offset,
                                lc->ldo_dir_hash_type);

                        if (d->lod_mdt_descs.ltd_lmv_desc.ld_active_tgt_count &&
                            lc->ldo_dir_stripe_count < 2 &&
                            lum1->lum_max_inherit != LMV_INHERIT_NONE) {
                                /* when filesystem-wide default LMV is set, dirs
                                 * will be created on MDT by space usage, but if
                                 * dir is created with "lfs mkdir -c 1 ...", its
                                 * subdirs should be kept on the same MDT. To
                                 * guarantee this, set default LMV for such dir.
                                 */
                                lds->lds_dir_def_stripe_count =
                                        le32_to_cpu(lum1->lum_stripe_count);
                                /* if "-1" stripe offset is set, save current
                                 * MDT index in default LMV.
                                 */
                                if (le32_to_cpu(lum1->lum_stripe_offset) ==
                                    LMV_OFFSET_DEFAULT)
                                        lds->lds_dir_def_stripe_offset =
                                                lod2lu_dev(d)->ld_site->ld_seq_site->ss_node_id;
                                else
                                        lds->lds_dir_def_stripe_offset =
                                                le32_to_cpu(lum1->lum_stripe_offset);
                                lds->lds_dir_def_hash_type =
                                        le32_to_cpu(lum1->lum_hash_type);
                                lds->lds_dir_def_max_inherit =
                                        lum1->lum_max_inherit;
                                /* it will be decreased by 1 later in setting */
                                if (lum1->lum_max_inherit >= LMV_INHERIT_END &&
                                    lum1->lum_max_inherit < LMV_INHERIT_MAX)
                                        lds->lds_dir_def_max_inherit++;
                                lds->lds_dir_def_max_inherit_rr =
                                        lum1->lum_max_inherit_rr;
                                lds->lds_dir_def_striping_set = 1;
                                /* don't inherit LOV from ROOT */
                                if (lds->lds_def_striping_set &&
                                    fid_is_root(lod_object_fid(lp)))
                                        lds->lds_def_striping_set = 0;
                                lc->ldo_def_striping = lds;
                        } else if (lds->lds_def_striping_set &&
                                   !fid_is_root(lod_object_fid(lp))) {
                                /* don't inherit default LMV for "lfs mkdir" */
                                lds->lds_dir_def_striping_set = 0;
                                lc->ldo_def_striping = lds;
                        }
                } else {
                        /* inherit default striping except ROOT */
                        if ((lds->lds_def_striping_set ||
                             lds->lds_dir_def_striping_set) &&
                            !fid_is_root(lod_object_fid(lp)))
                                lc->ldo_def_striping = lds;

                        /* transfer defaults LMV to new directory */
                        lod_striping_from_default(lc, lds, child_mode);

                        /* set count 0 to create normal directory */
                        if (lc->ldo_dir_stripe_count == 1)
                                lc->ldo_dir_stripe_count = 0;
                }

                /* shrink the stripe count to max_mdt_stripecount if it is -1
                 * and max_mdt_stripecount is not 0
                 */
                if (lc->ldo_dir_stripe_count == (__u16)(-1) &&
                    d->lod_max_mdt_stripecount)
                        lc->ldo_dir_stripe_count = d->lod_max_mdt_stripecount;

                /* shrink the stripe_count to the available MDT count; a
                 * resulting count of 1 is stored as 0 */
                if (lc->ldo_dir_stripe_count > d->lod_remote_mdt_count + 1 &&
                    !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)) {
                        lc->ldo_dir_stripe_count = d->lod_remote_mdt_count + 1;
                        if (lc->ldo_dir_stripe_count == 1)
                                lc->ldo_dir_stripe_count = 0;
                }

                /* an unknown hash type is replaced by the LMV descriptor's
                 * pattern; only the bits in LMV_HASH_FLAG_KNOWN are kept */
                if (!lmv_is_known_hash_type(lc->ldo_dir_hash_type))
                        lc->ldo_dir_hash_type =
                                (lc->ldo_dir_hash_type & LMV_HASH_FLAG_KNOWN) |
                                d->lod_mdt_descs.ltd_lmv_desc.ld_pattern;

                CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%u\n",
                       lc->ldo_dir_stripe_count,
                       (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type);

                RETURN_EXIT;
        }

        /* the child is not a directory from here on */

        if (!lod_object_will_be_striped(S_ISREG(child_mode),
                                        lu_object_fid(&child->do_lu)))
                RETURN_EXIT;

        /* If object is going to be striped over OSTs, transfer default
         * striping information to the child, so that we can use it
         * during declaration and creation.
         *
         * Try from the parent first.
         */
        if (likely(lp != NULL)) {
                rc = lod_get_default_lov_striping(env, lp, lds, ah);
                if (rc == 0 && lds->lds_def_striping_set) {
                        rc = lod_verify_striping(env, d, lp, &info->lti_buf,
                                                 false);
                        if (rc == 0)
                                lod_striping_from_default(lc, lds, child_mode);
                }
        }

        /* Initialize lod_device::lod_md_root object reference */
        if (d->lod_md_root == NULL) {
                struct dt_object *root;
                struct lod_object *lroot;

                lu_root_fid(&info->lti_fid);
                root = dt_locate(env, &d->lod_dt_dev, &info->lti_fid);
                if (!IS_ERR(root)) {
                        lroot = lod_dt_obj(root);

                        /* recheck under lod_lock: a reference that got
                         * installed since the unlocked test above is
                         * dropped and replaced by the one just located */
                        spin_lock(&d->lod_lock);
                        if (d->lod_md_root != NULL)
                                dt_object_put(env, &d->lod_md_root->ldo_obj);
                        d->lod_md_root = lroot;
                        spin_unlock(&d->lod_lock);
                }
        }

        /* try inherit layout from the root object (fs default) when:
         *  - parent does not have default layout; or
         *  - parent has plain(v1/v3) default layout, and some attributes
         *    are not specified in the default layout;
         */
        if (d->lod_md_root != NULL &&
            lod_need_inherit_more(lc, true, ah->dah_append_pool)) {
                rc = lod_get_default_lov_striping(env, d->lod_md_root, lds,
                                                  ah);
                if (rc || !lds->lds_def_striping_set)
                        goto out;

                rc = lod_verify_striping(env, d, d->lod_md_root, &info->lti_buf,
                                         false);
                if (rc)
                        goto out;

                if (lc->ldo_comp_cnt == 0) {
                        /* nothing came from the parent: take the root
                         * layout as a whole */
                        lod_striping_from_default(lc, lds, child_mode);
                } else if (!lds->lds_def_striping_is_composite) {
                        struct lod_layout_component *def_comp;

                        /* plain layout from the parent: fill only the
                         * attributes that are still unset from the plain
                         * root layout */
                        LASSERT(!lc->ldo_is_composite);
                        lod_comp = &lc->ldo_comp_entries[0];
                        def_comp = &lds->lds_def_comp_entries[0];

                        if (lod_comp->llc_stripe_count <= 0)
                                lod_comp->llc_stripe_count =
                                        def_comp->llc_stripe_count;
                        if (lod_comp->llc_stripe_size <= 0)
                                lod_comp->llc_stripe_size =
                                        def_comp->llc_stripe_size;
                        if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT &&
                            (!lod_comp->llc_pool || !lod_comp->llc_pool[0]))
                                lod_comp->llc_stripe_offset =
                                        def_comp->llc_stripe_offset;
                        if (lod_comp->llc_pool == NULL)
                                lod_obj_set_pool(lc, 0, def_comp->llc_pool);
                }
        }
out:
        /*
         * fs default striping may not be explicitly set, or historically set
         * in config log, use them.
         */
        if (lod_need_inherit_more(lc, false, ah->dah_append_pool)) {
                if (lc->ldo_comp_cnt == 0) {
                        rc = lod_alloc_comp_entries(lc, 0, 1);
                        if (rc)
                                /* fail to allocate memory, will create a
                                 * non-striped file. */
                                RETURN_EXIT;
                        lc->ldo_is_composite = 0;
                        lod_comp = &lc->ldo_comp_entries[0];
                        lod_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
                }
                LASSERT(!lc->ldo_is_composite);
                lod_comp = &lc->ldo_comp_entries[0];
                desc = &d->lod_ost_descs.ltd_lov_desc;
                /* adjust the component against the LOV descriptor, passing
                 * the append stripe count from the hint */
                lod_adjust_stripe_info(lod_comp, desc, ah->dah_append_stripes);
                if (ah->dah_append_pool && ah->dah_append_pool[0])
                        lod_obj_set_pool(lc, 0, ah->dah_append_pool);
        }

        EXIT;
}
5929
5930 /**
5931  * Size initialization on late striping.
5932  *
5933  * Propagate the size of a truncated object to a deferred striping.
5934  * This function handles a special case when truncate was done on a
5935  * non-striped object and now while the striping is being created
5936  * we can't lose that size, so we have to propagate it to the stripes
5937  * being created.
5938  *
5939  * \param[in] env       execution environment
5940  * \param[in] dt        object
5941  * \param[in] th        transaction handle
5942  *
5943  * \retval              0 on success
5944  * \retval              negative if failed
5945  */
5946 static int lod_declare_init_size(const struct lu_env *env,
5947                                  struct dt_object *dt, struct thandle *th)
5948 {
5949         struct dt_object        *next = dt_object_child(dt);
5950         struct lod_object       *lo = lod_dt_obj(dt);
5951         struct dt_object        **objects = NULL;
5952         struct lu_attr  *attr = &lod_env_info(env)->lti_attr;
5953         uint64_t        size, offs;
5954         int     i, rc, stripe, stripe_count = 0, stripe_size = 0;
5955         struct lu_extent size_ext;
5956         ENTRY;
5957
5958         if (!lod_obj_is_striped(dt))
5959                 RETURN(0);
5960
5961         rc = dt_attr_get(env, next, attr);
5962         LASSERT(attr->la_valid & LA_SIZE);
5963         if (rc)
5964                 RETURN(rc);
5965
5966         size = attr->la_size;
5967         if (size == 0)
5968                 RETURN(0);
5969
5970         size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
5971         for (i = 0; i < lo->ldo_comp_cnt; i++) {
5972                 struct lod_layout_component *lod_comp;
5973                 struct lu_extent *extent;
5974
5975                 lod_comp = &lo->ldo_comp_entries[i];
5976
5977                 if (lod_comp->llc_stripe == NULL)
5978                         continue;
5979
5980                 extent = &lod_comp->llc_extent;
5981                 CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
5982                 if (!lo->ldo_is_composite ||
5983                     lu_extent_is_overlapped(extent, &size_ext)) {
5984                         objects = lod_comp->llc_stripe;
5985                         stripe_count = lod_comp->llc_stripe_count;
5986                         stripe_size = lod_comp->llc_stripe_size;
5987
5988                         /* next mirror */
5989                         if (stripe_count == 0)
5990                                 continue;
5991
5992                         LASSERT(objects != NULL && stripe_size != 0);
5993                         do_div(size, stripe_size);
5994                         stripe = do_div(size, stripe_count);
5995                         LASSERT(objects[stripe] != NULL);
5996
5997                         size = size * stripe_size;
5998                         offs = attr->la_size;
5999                         size += do_div(offs, stripe_size);
6000
6001                         attr->la_valid = LA_SIZE;
6002                         attr->la_size = size;
6003
6004                         rc = lod_sub_declare_attr_set(env, objects[stripe],
6005                                                       attr, th);
6006                 }
6007         }
6008
6009         RETURN(rc);
6010 }
6011
6012 /**
6013  * Declare creation of striped object.
6014  *
6015  * The function declares creation stripes for a regular object. The function
6016  * also declares whether the stripes will be created with non-zero size if
6017  * previously size was set non-zero on the master object. If object \a dt is
6018  * not local, then only fully defined striping can be applied in \a lovea.
6019  * Otherwise \a lovea can be in the form of pattern, see lod_qos_parse_config()
6020  * for the details.
6021  *
6022  * \param[in] env       execution environment
6023  * \param[in] dt        object
6024  * \param[in] attr      attributes the stripes will be created with
6025  * \param[in] lovea     a buffer containing striping description
6026  * \param[in] th        transaction handle
6027  *
6028  * \retval              0 on success
6029  * \retval              negative if failed
6030  */
6031 int lod_declare_striped_create(const struct lu_env *env, struct dt_object *dt,
6032                                struct lu_attr *attr,
6033                                const struct lu_buf *lovea, struct thandle *th)
6034 {
6035         struct lod_thread_info  *info = lod_env_info(env);
6036         struct dt_object        *next = dt_object_child(dt);
6037         struct lod_object       *lo = lod_dt_obj(dt);
6038         int                      rc;
6039         ENTRY;
6040
6041         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_ALLOC_OBDO))
6042                 GOTO(out, rc = -ENOMEM);
6043
6044         if (!dt_object_remote(next)) {
6045                 /* choose OST and generate appropriate objects */
6046                 rc = lod_prepare_create(env, lo, attr, lovea, th);
6047                 if (rc)
6048                         GOTO(out, rc);
6049
6050                 /*
6051                  * declare storage for striping data
6052                  */
6053                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6054         } else {
6055                 /* LOD can not choose OST objects for remote objects, i.e.
6056                  * stripes must be ready before that. Right now, it can only
6057                  * happen during migrate, i.e. migrate process needs to create
6058                  * remote regular file (mdd_migrate_create), then the migrate
6059                  * process will provide stripeEA. */
6060                 LASSERT(lovea != NULL);
6061                 info->lti_buf = *lovea;
6062         }
6063
6064         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
6065                                        XATTR_NAME_LOV, 0, th);
6066         if (rc)
6067                 GOTO(out, rc);
6068
6069         /*
6070          * if striping is created with local object's size > 0,
6071          * we have to propagate this size to specific object
6072          * the case is possible only when local object was created previously
6073          */
6074         if (dt_object_exists(next))
6075                 rc = lod_declare_init_size(env, dt, th);
6076
6077 out:
6078         /* failed to create striping or to set initial size, let's reset
6079          * config so that others don't get confused */
6080         if (rc)
6081                 lod_striping_free(env, lo);
6082
6083         RETURN(rc);
6084 }
6085
6086 /*
6087  * Whether subdirectories under \a dt should be created on MDTs by space QoS
6088  *
6089  * If LMV_HASH_FLAG_SPACE is set on directory default layout, its subdirectories
6090  * should be created on MDT by space QoS.
6091  *
6092  * \param[in] env       execution environment
6093  * \param[in] dev       lu device
6094  * \param[in] dt        object
6095  *
6096  * \retval              1 if directory should create subdir by space usage
6097  * \retval              0 if not
6098  * \retval              -ev if failed
6099  */
6100 static inline int dt_object_qos_mkdir(const struct lu_env *env,
6101                                       struct lu_device *dev,
6102                                       struct dt_object *dt)
6103 {
6104         struct lod_thread_info *info = lod_env_info(env);
6105         struct lu_object *obj;
6106         struct lod_object *lo;
6107         struct lmv_user_md *lmu;
6108         int rc;
6109
6110         obj = lu_object_find_slice(env, dev, lu_object_fid(&dt->do_lu), NULL);
6111         if (IS_ERR(obj))
6112                 return PTR_ERR(obj);
6113
6114         lo = lu2lod_obj(obj);
6115
6116         rc = lod_get_default_lmv_ea(env, lo);
6117         dt_object_put(env, dt);
6118         if (rc <= 0)
6119                 return rc;
6120
6121         if (rc < (int)sizeof(*lmu))
6122                 return -EINVAL;
6123
6124         lmu = info->lti_ea_store;
6125         return le32_to_cpu(lmu->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
6126 }
6127
6128 /**
6129  * Implementation of dt_object_operations::do_declare_create.
6130  *
6131  * The method declares creation of a new object. If the object will be striped,
6132  * then helper functions are called to find FIDs for the stripes, declare
6133  * creation of the stripes and declare initialization of the striping
6134  * information to be stored in the master object.
6135  *
6136  * \see dt_object_operations::do_declare_create() in the API description
6137  * for details.
6138  */
6139 static int lod_declare_create(const struct lu_env *env, struct dt_object *dt,
6140                               struct lu_attr *attr,
6141                               struct dt_allocation_hint *hint,
6142                               struct dt_object_format *dof, struct thandle *th)
6143 {
6144         struct dt_object   *next = dt_object_child(dt);
6145         struct lod_object  *lo = lod_dt_obj(dt);
6146         int                 rc;
6147         ENTRY;
6148
6149         LASSERT(dof);
6150         LASSERT(attr);
6151         LASSERT(th);
6152
6153         /*
6154          * first of all, we declare creation of local object
6155          */
6156         rc = lod_sub_declare_create(env, next, attr, hint, dof, th);
6157         if (rc != 0)
6158                 GOTO(out, rc);
6159
6160         /*
6161          * it's lod_ah_init() that has decided the object will be striped
6162          */
6163         if (dof->dof_type == DFT_REGULAR) {
6164                 /* callers don't want stripes */
6165                 /* XXX: all tricky interactions with ->ah_make_hint() decided
6166                  * to use striping, then ->declare_create() behaving differently
6167                  * should be cleaned */
6168                 if (dof->u.dof_reg.striped != 0)
6169                         rc = lod_declare_striped_create(env, dt, attr,
6170                                                         NULL, th);
6171         } else if (dof->dof_type == DFT_DIR) {
6172                 struct seq_server_site *ss;
6173                 struct lu_buf buf = { NULL };
6174                 struct lu_buf *lmu = NULL;
6175
6176                 ss = lu_site2seq(dt->do_lu.lo_dev->ld_site);
6177
6178                 /* If the parent has default stripeEA, and client
6179                  * did not find it before sending create request,
6180                  * then MDT will return -EREMOTE, and client will
6181                  * retrieve the default stripeEA and re-create the
6182                  * sub directory.
6183                  *
6184                  * Note: if dah_eadata != NULL, it means creating the
6185                  * striped directory with specified stripeEA, then it
6186                  * should ignore the default stripeEA */
6187                 if (hint != NULL && hint->dah_eadata == NULL) {
6188                         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT))
6189                                 GOTO(out, rc = -EREMOTE);
6190
6191                         if (lo->ldo_dir_stripe_offset != LMV_OFFSET_DEFAULT &&
6192                             lo->ldo_dir_stripe_offset != ss->ss_node_id) {
6193                                 struct lod_device *lod;
6194                                 struct lu_tgt_desc *mdt = NULL;
6195                                 bool found_mdt = false;
6196
6197                                 lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6198                                 lod_foreach_mdt(lod, mdt) {
6199                                         if (mdt->ltd_index ==
6200                                                 lo->ldo_dir_stripe_offset) {
6201                                                 found_mdt = true;
6202                                                 break;
6203                                         }
6204                                 }
6205
6206                                 /* If the MDT indicated by stripe_offset can be
6207                                  * found, then tell client to resend the create
6208                                  * request to the correct MDT, otherwise return
6209                                  * error to client */
6210                                 if (found_mdt)
6211                                         GOTO(out, rc = -EREMOTE);
6212                                 else
6213                                         GOTO(out, rc = -EINVAL);
6214                         }
6215                 } else if (hint && hint->dah_eadata) {
6216                         lmu = &buf;
6217                         lmu->lb_buf = (void *)hint->dah_eadata;
6218                         lmu->lb_len = hint->dah_eadata_len;
6219                 }
6220
6221                 rc = lod_declare_dir_striping_create(env, dt, attr, lmu, dof,
6222                                                      th);
6223         }
6224 out:
6225         /* failed to create striping or to set initial size, let's reset
6226          * config so that others don't get confused */
6227         if (rc)
6228                 lod_striping_free(env, lo);
6229         RETURN(rc);
6230 }
6231
6232 /**
6233  * Generate component ID for new created component.
6234  *
6235  * \param[in] lo                LOD object
6236  * \param[in] comp_idx          index of ldo_comp_entries
6237  *
6238  * \retval                      component ID on success
6239  * \retval                      LCME_ID_INVAL on failure
6240  */
6241 static __u32 lod_gen_component_id(struct lod_object *lo,
6242                                   int mirror_id, int comp_idx)
6243 {
6244         struct lod_layout_component *lod_comp;
6245         __u32   id, start, end;
6246         int     i;
6247
6248         LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
6249
6250         lod_obj_inc_layout_gen(lo);
6251         id = lo->ldo_layout_gen;
6252         if (likely(id <= SEQ_ID_MAX))
6253                 RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
6254
6255         /* Layout generation wraps, need to check collisions. */
6256         start = id & SEQ_ID_MASK;
6257         end = SEQ_ID_MAX;
6258 again:
6259         for (id = start; id <= end; id++) {
6260                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6261                         lod_comp = &lo->ldo_comp_entries[i];
6262                         if (pflr_id(mirror_id, id) == lod_comp->llc_id)
6263                                 break;
6264                 }
6265                 /* Found the ununsed ID */
6266                 if (i == lo->ldo_comp_cnt)
6267                         RETURN(pflr_id(mirror_id, id));
6268         }
6269         if (end == LCME_ID_MAX) {
6270                 start = 1;
6271                 end = min(lo->ldo_layout_gen & LCME_ID_MASK,
6272                           (__u32)(LCME_ID_MAX - 1));
6273                 goto again;
6274         }
6275
6276         RETURN(LCME_ID_INVAL);
6277 }
6278
6279 /**
6280  * Creation of a striped regular object.
6281  *
6282  * The function is called to create the stripe objects for a regular
6283  * striped file. This can happen at the initial object creation or
6284  * when the caller asks LOD to do so using ->do_xattr_set() method
6285  * (so called late striping). Notice all the information are already
6286  * prepared in the form of the list of objects (ldo_stripe field).
6287  * This is done during declare phase.
6288  *
6289  * \param[in] env       execution environment
6290  * \param[in] dt        object
6291  * \param[in] attr      attributes the stripes will be created with
6292  * \param[in] dof       format of stripes (see OSD API description)
6293  * \param[in] th        transaction handle
6294  *
6295  * \retval              0 on success
6296  * \retval              negative if failed
6297  */
6298 int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
6299                        struct lu_attr *attr, struct dt_object_format *dof,
6300                        struct thandle *th)
6301 {
6302         struct lod_layout_component     *lod_comp;
6303         struct lod_object       *lo = lod_dt_obj(dt);
6304         __u16   mirror_id;
6305         int     rc = 0, i, j;
6306         ENTRY;
6307
6308         mutex_lock(&lo->ldo_layout_mutex);
6309
6310         LASSERT((lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL) ||
6311                 lo->ldo_is_foreign);
6312
6313         mirror_id = 0; /* non-flr file's mirror_id is 0 */
6314         if (lo->ldo_mirror_count > 1) {
6315                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6316                         lod_comp = &lo->ldo_comp_entries[i];
6317                         if (lod_comp->llc_id != LCME_ID_INVAL &&
6318                             mirror_id_of(lod_comp->llc_id) > mirror_id)
6319                                 mirror_id = mirror_id_of(lod_comp->llc_id);
6320                 }
6321         }
6322
6323         /* create all underlying objects */
6324         for (i = 0; i < lo->ldo_comp_cnt; i++) {
6325                 lod_comp = &lo->ldo_comp_entries[i];
6326
6327                 if (lod_comp->llc_id == LCME_ID_INVAL) {
6328                         /* only the component of FLR layout with more than 1
6329                          * mirror has mirror ID in its component ID.
6330                          */
6331                         if (lod_comp->llc_extent.e_start == 0 &&
6332                             lo->ldo_mirror_count > 1)
6333                                 ++mirror_id;
6334
6335                         lod_comp->llc_id = lod_gen_component_id(lo,
6336                                                                 mirror_id, i);
6337                         if (lod_comp->llc_id == LCME_ID_INVAL)
6338                                 GOTO(out, rc = -ERANGE);
6339                 }
6340
6341                 if (lod_comp_inited(lod_comp))
6342                         continue;
6343
6344                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
6345                         lod_comp_set_init(lod_comp);
6346
6347                 if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
6348                         lod_comp_set_init(lod_comp);
6349
6350                 if (lod_comp->llc_stripe == NULL)
6351                         continue;
6352
6353                 LASSERT(lod_comp->llc_stripe_count);
6354                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6355                         struct dt_object *object = lod_comp->llc_stripe[j];
6356                         LASSERT(object != NULL);
6357                         rc = lod_sub_create(env, object, attr, NULL, dof, th);
6358                         if (rc)
6359                                 GOTO(out, rc);
6360                 }
6361                 lod_comp_set_init(lod_comp);
6362         }
6363
6364         rc = lod_fill_mirrors(lo);
6365         if (rc)
6366                 GOTO(out, rc);
6367
6368         lo->ldo_comp_cached = 1;
6369
6370         rc = lod_generate_and_set_lovea(env, lo, th);
6371         if (rc)
6372                 GOTO(out, rc);
6373
6374         mutex_unlock(&lo->ldo_layout_mutex);
6375
6376         RETURN(0);
6377
6378 out:
6379         lod_striping_free_nolock(env, lo);
6380         mutex_unlock(&lo->ldo_layout_mutex);
6381
6382         RETURN(rc);
6383 }
6384
6385 static inline bool lod_obj_is_dom(struct dt_object *dt)
6386 {
6387         struct lod_object *lo = lod_dt_obj(dt);
6388
6389         if (!dt_object_exists(dt_object_child(dt)))
6390                 return false;
6391
6392         if (S_ISDIR(dt->do_lu.lo_header->loh_attr))
6393                 return false;
6394
6395         if (!lo->ldo_comp_cnt)
6396                 return false;
6397
6398         return (lov_pattern(lo->ldo_comp_entries[0].llc_pattern) ==
6399                 LOV_PATTERN_MDT);
6400 }
6401
6402 /**
6403  * Implementation of dt_object_operations::do_create.
6404  *
6405  * If any of preceeding methods (like ->do_declare_create(),
6406  * ->do_ah_init(), etc) chose to create a striped object,
6407  * then this method will create the master and the stripes.
6408  *
6409  * \see dt_object_operations::do_create() in the API description for details.
6410  */
6411 static int lod_create(const struct lu_env *env, struct dt_object *dt,
6412                       struct lu_attr *attr, struct dt_allocation_hint *hint,
6413                       struct dt_object_format *dof, struct thandle *th)
6414 {
6415         int                 rc;
6416         ENTRY;
6417
6418         /* create local object */
6419         rc = lod_sub_create(env, dt_object_child(dt), attr, hint, dof, th);
6420         if (rc != 0)
6421                 RETURN(rc);
6422
6423         if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
6424             (lod_obj_is_striped(dt) || lod_obj_is_dom(dt)) &&
6425             dof->u.dof_reg.striped != 0) {
6426                 LASSERT(lod_dt_obj(dt)->ldo_comp_cached == 0);
6427                 rc = lod_striped_create(env, dt, attr, dof, th);
6428         }
6429
6430         RETURN(rc);
6431 }
6432
6433 static inline int
6434 lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
6435                           struct dt_object *dt, struct thandle *th,
6436                           int comp_idx, int stripe_idx,
6437                           struct lod_obj_stripe_cb_data *data)
6438 {
6439         if (data->locd_declare)
6440                 return lod_sub_declare_destroy(env, dt, th);
6441
6442         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6443             stripe_idx == cfs_fail_val)
6444                 return lod_sub_destroy(env, dt, th);
6445
6446         return 0;
6447 }
6448
6449 /**
6450  * Implementation of dt_object_operations::do_declare_destroy.
6451  *
6452  * If the object is a striped directory, then the function declares reference
6453  * removal from the master object (this is an index) to the stripes and declares
6454  * destroy of all the stripes. In all the cases, it declares an intention to
6455  * destroy the object itself.
6456  *
6457  * \see dt_object_operations::do_declare_destroy() in the API description
6458  * for details.
6459  */
6460 static int lod_declare_destroy(const struct lu_env *env, struct dt_object *dt,
6461                                struct thandle *th)
6462 {
6463         struct dt_object *next = dt_object_child(dt);
6464         struct lod_object *lo = lod_dt_obj(dt);
6465         struct lod_thread_info *info = lod_env_info(env);
6466         struct dt_object *stripe;
6467         char *stripe_name = info->lti_key;
6468         int rc, i;
6469
6470         ENTRY;
6471
6472         /*
6473          * load striping information, notice we don't do this when object
6474          * is being initialized as we don't need this information till
6475          * few specific cases like destroy, chown
6476          */
6477         rc = lod_striping_load(env, lo);
6478         if (rc)
6479                 RETURN(rc);
6480
6481         /* declare destroy for all underlying objects */
6482         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6483                 rc = next->do_ops->do_index_try(env, next,
6484                                                 &dt_directory_features);
6485                 if (rc != 0)
6486                         RETURN(rc);
6487
6488                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6489                         stripe = lo->ldo_stripe[i];
6490                         if (!stripe)
6491                                 continue;
6492
6493                         rc = lod_sub_declare_ref_del(env, next, th);
6494                         if (rc != 0)
6495                                 RETURN(rc);
6496
6497                         snprintf(stripe_name, sizeof(info->lti_key),
6498                                  DFID":%d",
6499                                  PFID(lu_object_fid(&stripe->do_lu)), i);
6500                         rc = lod_sub_declare_delete(env, next,
6501                                         (const struct dt_key *)stripe_name, th);
6502                         if (rc != 0)
6503                                 RETURN(rc);
6504                 }
6505         }
6506
6507         /*
6508          * we declare destroy for the local object
6509          */
6510         rc = lod_sub_declare_destroy(env, next, th);
6511         if (rc)
6512                 RETURN(rc);
6513
6514         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6515             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6516                 RETURN(0);
6517
6518         if (!lod_obj_is_striped(dt))
6519                 RETURN(0);
6520
6521         /* declare destroy all striped objects */
6522         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6523                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6524                         stripe = lo->ldo_stripe[i];
6525                         if (!stripe)
6526                                 continue;
6527
6528                         if (!dt_object_exists(stripe))
6529                                 continue;
6530
6531                         rc = lod_sub_declare_ref_del(env, stripe, th);
6532                         if (rc != 0)
6533                                 break;
6534
6535                         rc = lod_sub_declare_destroy(env, stripe, th);
6536                         if (rc != 0)
6537                                 break;
6538                 }
6539         } else {
6540                 struct lod_obj_stripe_cb_data data = { { 0 } };
6541
6542                 data.locd_declare = true;
6543                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6544                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6545         }
6546
6547         RETURN(rc);
6548 }
6549
6550 /**
6551  * Implementation of dt_object_operations::do_destroy.
6552  *
6553  * If the object is a striped directory, then the function removes references
6554  * from the master object (this is an index) to the stripes and destroys all
6555  * the stripes. In all the cases, the function destroys the object itself.
6556  *
6557  * \see dt_object_operations::do_destroy() in the API description for details.
6558  */
6559 static int lod_destroy(const struct lu_env *env, struct dt_object *dt,
6560                        struct thandle *th)
6561 {
6562         struct dt_object  *next = dt_object_child(dt);
6563         struct lod_object *lo = lod_dt_obj(dt);
6564         struct lod_thread_info *info = lod_env_info(env);
6565         char *stripe_name = info->lti_key;
6566         struct dt_object *stripe;
6567         unsigned int i;
6568         int rc;
6569
6570         ENTRY;
6571
6572         /* destroy sub-stripe of master object */
6573         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6574                 rc = next->do_ops->do_index_try(env, next,
6575                                                 &dt_directory_features);
6576                 if (rc != 0)
6577                         RETURN(rc);
6578
6579                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6580                         stripe = lo->ldo_stripe[i];
6581                         if (!stripe)
6582                                 continue;
6583
6584                         rc = lod_sub_ref_del(env, next, th);
6585                         if (rc != 0)
6586                                 RETURN(rc);
6587
6588                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
6589                                 PFID(lu_object_fid(&stripe->do_lu)), i);
6590
6591                         CDEBUG(D_INFO, DFID" delete stripe %s "DFID"\n",
6592                                PFID(lu_object_fid(&dt->do_lu)), stripe_name,
6593                                PFID(lu_object_fid(&stripe->do_lu)));
6594
6595                         rc = lod_sub_delete(env, next,
6596                                        (const struct dt_key *)stripe_name, th);
6597                         if (rc != 0)
6598                                 RETURN(rc);
6599                 }
6600         }
6601
6602         rc = lod_sub_destroy(env, next, th);
6603         if (rc != 0)
6604                 RETURN(rc);
6605
6606         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6607             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6608                 RETURN(0);
6609
6610         if (!lod_obj_is_striped(dt))
6611                 RETURN(0);
6612
6613         /* destroy all striped objects */
6614         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6615                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6616                         stripe = lo->ldo_stripe[i];
6617                         if (!stripe)
6618                                 continue;
6619
6620                         if (!dt_object_exists(stripe))
6621                                 continue;
6622
6623                         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6624                             i == cfs_fail_val) {
6625                                 dt_write_lock(env, stripe, DT_TGT_CHILD);
6626                                 rc = lod_sub_ref_del(env, stripe, th);
6627                                 dt_write_unlock(env, stripe);
6628                                 if (rc != 0)
6629                                         break;
6630
6631                                 rc = lod_sub_destroy(env, stripe, th);
6632                                 if (rc != 0)
6633                                         break;
6634                         }
6635                 }
6636         } else {
6637                 struct lod_obj_stripe_cb_data data = { { 0 } };
6638
6639                 data.locd_declare = false;
6640                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6641                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6642         }
6643
6644         RETURN(rc);
6645 }
6646
/**
 * Implementation of dt_object_operations::do_declare_ref_add.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_add() in the API description
 * for details.
 */
static int lod_declare_ref_add(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_declare_ref_add(env, next, th);
}
6658
/**
 * Implementation of dt_object_operations::do_ref_add.
 *
 * Forwards the operation to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_add() in the API description for details.
 */
static int lod_ref_add(const struct lu_env *env,
                       struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_ref_add(env, next, th);
}
6669
/**
 * Implementation of dt_object_operations::do_declare_ref_del.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_del() in the API description
 * for details.
 */
static int lod_declare_ref_del(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_declare_ref_del(env, next, th);
}
6681
/**
 * Implementation of dt_object_operations::do_ref_del
 *
 * Forwards the operation to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_del() in the API description for details.
 */
static int lod_ref_del(const struct lu_env *env,
                       struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_ref_del(env, next, th);
}
6692
6693 /**
6694  * Implementation of dt_object_operations::do_object_sync.
6695  *
6696  * \see dt_object_operations::do_object_sync() in the API description
6697  * for details.
6698  */
6699 static int lod_object_sync(const struct lu_env *env, struct dt_object *dt,
6700                            __u64 start, __u64 end)
6701 {
6702         return dt_object_sync(env, dt_object_child(dt), start, end);
6703 }
6704
6705 /**
6706  * Implementation of dt_object_operations::do_object_unlock.
6707  *
6708  * Used to release LDLM lock(s).
6709  *
6710  * \see dt_object_operations::do_object_unlock() in the API description
6711  * for details.
6712  */
6713 static int lod_object_unlock(const struct lu_env *env, struct dt_object *dt,
6714                              struct ldlm_enqueue_info *einfo,
6715                              union ldlm_policy_data *policy)
6716 {
6717         struct lod_object *lo = lod_dt_obj(dt);
6718         struct lustre_handle_array *slave_locks = einfo->ei_cbdata;
6719         int slave_locks_size;
6720         int i;
6721         ENTRY;
6722
6723         if (slave_locks == NULL)
6724                 RETURN(0);
6725
6726         LASSERT(S_ISDIR(dt->do_lu.lo_header->loh_attr));
6727         /* Note: for remote lock for single stripe dir, MDT will cancel
6728          * the lock by lockh directly */
6729         LASSERT(!dt_object_remote(dt_object_child(dt)));
6730
6731         /* locks were unlocked in MDT layer */
6732         for (i = 0; i < slave_locks->ha_count; i++)
6733                 LASSERT(!lustre_handle_is_used(&slave_locks->ha_handles[i]));
6734
6735         /*
6736          * NB, ha_count may not equal to ldo_dir_stripe_count, because dir
6737          * layout may change, e.g., shrink dir layout after migration.
6738          */
6739         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6740                 if (lo->ldo_stripe[i])
6741                         dt_invalidate(env, lo->ldo_stripe[i]);
6742         }
6743
6744         slave_locks_size = offsetof(typeof(*slave_locks),
6745                                     ha_handles[slave_locks->ha_count]);
6746         OBD_FREE(slave_locks, slave_locks_size);
6747         einfo->ei_cbdata = NULL;
6748
6749         RETURN(0);
6750 }
6751
6752 /**
6753  * Implementation of dt_object_operations::do_object_lock.
6754  *
6755  * Used to get LDLM lock on the non-striped and striped objects.
6756  *
6757  * \see dt_object_operations::do_object_lock() in the API description
6758  * for details.
6759  */
6760 static int lod_object_lock(const struct lu_env *env,
6761                            struct dt_object *dt,
6762                            struct lustre_handle *lh,
6763                            struct ldlm_enqueue_info *einfo,
6764                            union ldlm_policy_data *policy)
6765 {
6766         struct lod_object *lo = lod_dt_obj(dt);
6767         int slave_locks_size;
6768         struct lustre_handle_array *slave_locks = NULL;
6769         int i;
6770         int rc;
6771         ENTRY;
6772
6773         /* remote object lock */
6774         if (!einfo->ei_enq_slave) {
6775                 LASSERT(dt_object_remote(dt));
6776                 return dt_object_lock(env, dt_object_child(dt), lh, einfo,
6777                                       policy);
6778         }
6779
6780         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
6781                 RETURN(-ENOTDIR);
6782
6783         rc = lod_striping_load(env, lo);
6784         if (rc != 0)
6785                 RETURN(rc);
6786
6787         /* No stripes */
6788         if (lo->ldo_dir_stripe_count <= 1)
6789                 RETURN(0);
6790
6791         slave_locks_size = offsetof(typeof(*slave_locks),
6792                                     ha_handles[lo->ldo_dir_stripe_count]);
6793         /* Freed in lod_object_unlock */
6794         OBD_ALLOC(slave_locks, slave_locks_size);
6795         if (!slave_locks)
6796                 RETURN(-ENOMEM);
6797         slave_locks->ha_count = lo->ldo_dir_stripe_count;
6798
6799         /* striped directory lock */
6800         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6801                 struct lustre_handle lockh;
6802                 struct ldlm_res_id *res_id;
6803                 struct dt_object *stripe;
6804
6805                 stripe = lo->ldo_stripe[i];
6806                 if (!stripe)
6807                         continue;
6808
6809                 res_id = &lod_env_info(env)->lti_res_id;
6810                 fid_build_reg_res_name(lu_object_fid(&stripe->do_lu), res_id);
6811                 einfo->ei_res_id = res_id;
6812
6813                 if (dt_object_remote(stripe)) {
6814                         set_bit(i, (void *)slave_locks->ha_map);
6815                         rc = dt_object_lock(env, stripe, &lockh, einfo, policy);
6816                 } else {
6817                         struct ldlm_namespace *ns = einfo->ei_namespace;
6818                         ldlm_blocking_callback blocking = einfo->ei_cb_local_bl;
6819                         ldlm_completion_callback completion = einfo->ei_cb_cp;
6820                         __u64 dlmflags = LDLM_FL_ATOMIC_CB;
6821
6822                         if (einfo->ei_mode == LCK_PW ||
6823                             einfo->ei_mode == LCK_EX)
6824                                 dlmflags |= LDLM_FL_COS_INCOMPAT;
6825
6826                         LASSERT(ns != NULL);
6827                         rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_IBITS,
6828                                                     policy, einfo->ei_mode,
6829                                                     &dlmflags, blocking,
6830                                                     completion, NULL,
6831                                                     NULL, 0, LVB_T_NONE,
6832                                                     NULL, &lockh);
6833                 }
6834                 if (rc) {
6835                         while (i--)
6836                                 ldlm_lock_decref_and_cancel(
6837                                                 &slave_locks->ha_handles[i],
6838                                                 einfo->ei_mode);
6839                         OBD_FREE(slave_locks, slave_locks_size);
6840                         RETURN(rc);
6841                 }
6842                 slave_locks->ha_handles[i] = lockh;
6843         }
6844         einfo->ei_cbdata = slave_locks;
6845
6846         RETURN(0);
6847 }
6848
/**
 * Implementation of dt_object_operations::do_invalidate.
 *
 * Simply forwards the invalidation to the child object of \a dt.
 *
 * \see dt_object_operations::do_invalidate() in the API description for details
 */
static int lod_invalidate(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_invalidate(env, child);
}
6858
6859 static int lod_declare_instantiate_components(const struct lu_env *env,
6860                                               struct lod_object *lo,
6861                                               struct thandle *th,
6862                                               __u64 reserve)
6863 {
6864         struct lod_thread_info *info = lod_env_info(env);
6865         int i;
6866         int rc = 0;
6867         ENTRY;
6868
6869         LASSERT(info->lti_count < lo->ldo_comp_cnt);
6870
6871         for (i = 0; i < info->lti_count; i++) {
6872                 rc = lod_qos_prep_create(env, lo, NULL, th,
6873                                          info->lti_comp_idx[i], reserve);
6874                 if (rc)
6875                         break;
6876         }
6877
6878         if (!rc) {
6879                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6880                 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
6881                                 &info->lti_buf, XATTR_NAME_LOV, 0, th);
6882         }
6883
6884         RETURN(rc);
6885 }
6886
6887 /**
6888  * Check OSTs for an existing component for further extension
6889  *
6890  * Checks if OSTs are still healthy and not out of space.  Gets free space
6891  * on OSTs (relative to allocation watermark rmb_low) and compares to
6892  * the proposed new_end for this component.
6893  *
6894  * Decides whether or not to extend a component on its current OSTs.
6895  *
6896  * \param[in] env               execution environment for this thread
6897  * \param[in] lo                object we're checking
6898  * \param[in] index             index of this component
6899  * \param[in] extension_size    extension size for this component
6900  * \param[in] extent            layout extent for requested operation
6901  * \param[in] comp_extent       extension component extent
6902  * \param[in] write             if this is write operation
6903  *
6904  * \retval      true - OK to extend on current OSTs
6905  * \retval      false - do not extend on current OSTs
6906  */
6907 static bool lod_sel_osts_allowed(const struct lu_env *env,
6908                                  struct lod_object *lo,
6909                                  int index, __u64 reserve,
6910                                  struct lu_extent *extent,
6911                                  struct lu_extent *comp_extent, int write)
6912 {
6913         struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[index];
6914         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6915         struct lod_thread_info *tinfo = lod_env_info(env);
6916         struct obd_statfs *sfs = &tinfo->lti_osfs;
6917         __u64 available = 0;
6918         bool ret = true;
6919         int i, rc;
6920
6921         ENTRY;
6922
6923         LASSERT(lod_comp->llc_stripe_count != 0);
6924
6925         lod_getref(&lod->lod_ost_descs);
6926         for (i = 0; i < lod_comp->llc_stripe_count; i++) {
6927                 int index = lod_comp->llc_ost_indices[i];
6928                 struct lod_tgt_desc *ost = OST_TGT(lod, index);
6929                 struct obd_statfs_info info = { 0 };
6930                 int j, repeated = 0;
6931
6932                 LASSERT(ost);
6933
6934                 /* Get the number of times this OST repeats in this component.
6935                  * Note: inter-component repeats are not counted as this is
6936                  * considered as a rare case: we try to not repeat OST in other
6937                  * components if possible. */
6938                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6939                         if (index != lod_comp->llc_ost_indices[j])
6940                                 continue;
6941
6942                         /* already handled */
6943                         if (j < i)
6944                                 break;
6945
6946                         repeated++;
6947                 }
6948                 if (j < lod_comp->llc_stripe_count)
6949                         continue;
6950
6951                 if (!test_bit(index, lod->lod_ost_bitmap)) {
6952                         CDEBUG(D_LAYOUT, "ost %d no longer present\n", index);
6953                         ret = false;
6954                         break;
6955                 }
6956
6957                 rc = dt_statfs_info(env, ost->ltd_tgt, sfs, &info);
6958                 if (rc) {
6959                         CDEBUG(D_LAYOUT, "statfs failed for ost %d, error %d\n",
6960                                index, rc);
6961                         ret = false;
6962                         break;
6963                 }
6964
6965                 if (sfs->os_state & OS_STATFS_ENOSPC ||
6966                     sfs->os_state & OS_STATFS_READONLY ||
6967                     sfs->os_state & OS_STATFS_DEGRADED) {
6968                         CDEBUG(D_LAYOUT, "ost %d is not availble for SEL "
6969                                "extension, state %u\n", index, sfs->os_state);
6970                         ret = false;
6971                         break;
6972                 }
6973
6974                 /* In bytes */
6975                 available = sfs->os_bavail * sfs->os_bsize;
6976                 /* 'available' is relative to the allocation threshold */
6977                 available -= (__u64) info.os_reserved_mb_low << 20;
6978
6979                 CDEBUG(D_LAYOUT, "ost %d lowwm: %d highwm: %d, "
6980                        "%llu %% blocks available, %llu %% blocks free\n",
6981                        index, info.os_reserved_mb_low, info.os_reserved_mb_high,
6982                        (100ull * sfs->os_bavail) / sfs->os_blocks,
6983                        (100ull * sfs->os_bfree) / sfs->os_blocks);
6984
6985                 if (reserve * repeated > available) {
6986                         ret = false;
6987                         CDEBUG(D_LAYOUT, "low space on ost %d, available %llu "
6988                                "< extension size %llu repeated %d\n", index,
6989                                available, reserve, repeated);
6990                         break;
6991                 }
6992         }
6993         lod_putref(lod, &lod->lod_ost_descs);
6994
6995         RETURN(ret);
6996 }
6997
6998 /**
6999  * Adjust extents after component removal
7000  *
7001  * When we remove an extension component, we move the start of the next
7002  * component to match the start of the extension component, so no space is left
7003  * without layout.
7004  *
7005  * \param[in] env       execution environment for this thread
7006  * \param[in] lo        object
7007  * \param[in] max_comp  layout component
7008  * \param[in] index     index of this component
7009  *
7010  * \retval              0 on success
7011  * \retval              negative errno on error
7012  */
7013 static void lod_sel_adjust_extents(const struct lu_env *env,
7014                                    struct lod_object *lo,
7015                                    int max_comp, int index)
7016 {
7017         struct lod_layout_component *lod_comp = NULL;
7018         struct lod_layout_component *next = NULL;
7019         struct lod_layout_component *prev = NULL;
7020         __u64 new_start = 0;
7021         __u64 start;
7022         int i;
7023
7024         /* Extension space component */
7025         lod_comp = &lo->ldo_comp_entries[index];
7026         next = &lo->ldo_comp_entries[index + 1];
7027         prev = &lo->ldo_comp_entries[index - 1];
7028
7029         LASSERT(lod_comp != NULL && prev != NULL && next != NULL);
7030         LASSERT(lod_comp->llc_flags & LCME_FL_EXTENSION);
7031
7032         /* Previous is being removed */
7033         if (prev && prev->llc_id == LCME_ID_INVAL)
7034                 new_start = prev->llc_extent.e_start;
7035         else
7036                 new_start = lod_comp->llc_extent.e_start;
7037
7038         for (i = index + 1; i < max_comp; i++) {
7039                 lod_comp = &lo->ldo_comp_entries[i];
7040
7041                 start = lod_comp->llc_extent.e_start;
7042                 lod_comp->llc_extent.e_start = new_start;
7043
7044                 /* We only move zero length extendable components */
7045                 if (!(start == lod_comp->llc_extent.e_end))
7046                         break;
7047
7048                 LASSERT(!(lod_comp->llc_flags & LCME_FL_INIT));
7049
7050                 lod_comp->llc_extent.e_end = new_start;
7051         }
7052 }
7053
7054 /* Calculate the proposed 'new end' for a component we're extending */
7055 static __u64 lod_extension_new_end(__u64 extension_size, __u64 extent_end,
7056                                    __u32 stripe_size, __u64 component_end,
7057                                    __u64 extension_end)
7058 {
7059         __u64 new_end;
7060
7061         LASSERT(extension_size != 0 && stripe_size != 0);
7062
7063         /* Round up to extension size */
7064         if (extent_end == OBD_OBJECT_EOF) {
7065                 new_end = OBD_OBJECT_EOF;
7066         } else {
7067                 /* Add at least extension_size to the previous component_end,
7068                  * covering the req layout extent */
7069                 new_end = max(extent_end - component_end, extension_size);
7070                 new_end = roundup(new_end, extension_size);
7071                 new_end += component_end;
7072
7073                 /* Component end must be min stripe size aligned */
7074                 if (new_end % stripe_size) {
7075                         CDEBUG(D_LAYOUT, "new component end is not aligned "
7076                                "by the stripe size %u: [%llu, %llu) ext size "
7077                                "%llu new end %llu, aligning\n",
7078                                stripe_size, component_end, extent_end,
7079                                extension_size, new_end);
7080                         new_end = roundup(new_end, stripe_size);
7081                 }
7082
7083                 /* Overflow */
7084                 if (new_end < extent_end)
7085                         new_end = OBD_OBJECT_EOF;
7086         }
7087
7088         /* Don't extend past the end of the extension component */
7089         if (new_end > extension_end)
7090                 new_end = extension_end;
7091
7092         return new_end;
7093 }
7094
7095 /**
7096  * Calculate the exact reservation (per-OST extension_size) on the OSTs being
7097  * instantiated. It needs to be calculated in advance and taken into account at
7098  * the instantiation time, because otherwise lod_statfs_and_check() may consider
7099  * an OST as OK, but SEL needs its extension_size to fit the free space and the
7100  * OST may turn out to be low-on-space, thus inappropriate OST may be used and
7101  * ENOSPC occurs.
7102  *
7103  * \param[in] lod_comp          lod component we are checking
7104  *
7105  * \retval      size to reserved on each OST of lod_comp's stripe.
7106  */
7107 static __u64 lod_sel_stripe_reserved(struct lod_layout_component *lod_comp)
7108 {
7109         /* extension_size is file level, so we must divide by stripe count to
7110          * compare it to available space on a single OST */
7111         return  lod_comp->llc_stripe_size * SEL_UNIT_SIZE /
7112                 lod_comp->llc_stripe_count;
7113 }
7114
/* As lod_sel_handler() could be re-entered for the same component several
 * times, this is the data for the next call. Fields could be changed to
 * component indexes when needed, (e.g. if there is no need to instantiate
 * all the previous components up to the current position) to tell the caller
 * where to start over from.
 *
 * lod_sel_handler() reads both fields and resets them to 0 on every call
 * before deciding whether to set them again for the following call. */
struct sel_data {
        /* when non-zero, the next lod_sel_handler() call skips the
         * lod_sel_osts_allowed() check and extends the previous component */
        int sd_force;
        /* set after lod_sel_handler() repeated the previous component, so
         * the next call knows it is dealing with a repeated component */
        int sd_repeat;
};
7124
7125 /**
7126  * Process extent updates for a particular layout component
7127  *
7128  * Handle layout updates for a particular extension space component touched by
7129  * a layout update operation.  Core function of self-extending PFL feature.
7130  *
7131  * In general, this function processes exactly *one* stage of an extension
7132  * operation, modifying the layout accordingly, then returns to the caller.
7133  * The caller is responsible for restarting processing with the new layout,
7134  * which may repeatedly return to this function until the extension updates
7135  * are complete.
7136  *
7137  * This function does one of a few things to the layout:
7138  * 1. Extends the component before the current extension space component to
7139  * allow it to accomodate the requested operation (if space/policy permit that
7140  * component to continue on its current OSTs)
7141  *
7142  * 2. If extension of the existing component fails, we do one of two things:
7143  *    a. If there is a component after the extension space, we remove the
7144  *       extension space component, move the start of the next component down
7145  *       accordingly, then notify the caller to restart processing w/the new
7146  *       layout.
7147  *    b. If there is no following component, we try repeating the current
7148  *       component, creating a new component using the current one as a
7149  *       template (keeping its stripe properties but not specific striping),
7150  *       and try assigning striping for this component.  If there is sufficient
7151  *       free space on the OSTs chosen for this component, it is instantiated
7152  *       and i/o continues there.
7153  *
7154  *       If there is not sufficient space on the new OSTs, we remove this new
7155  *       component & extend the current component.
7156  *
7157  * Note further that uninited components followed by extension space can be zero
7158  * length meaning that we will try to extend them before initializing them, and
7159  * if that fails, they will be removed without initialization.
7160  *
7161  * 3. If we extend to/beyond the end of an extension space component, that
7162  * component is exhausted (all of its range has been given to real components),
7163  * so we remove it and restart processing.
7164  *
7165  * \param[in] env               execution environment for this thread
7166  * \param[in,out] lo            object to update the layout of
7167  * \param[in] extent            layout extent for requested operation, update
7168  *                              layout to fit this operation
7169  * \param[in] th                transaction handle for this operation
7170  * \param[in,out] max_comp      the highest comp for the portion of the layout
7171  *                              we are operating on (For FLR, the chosen
7172  *                              replica).  Updated because we may remove
7173  *                              components.
7174  * \param[in] index             index of the extension space component we're
7175  *                              working on
7176  * \param[in] write             if this is write op
7177  * \param[in,out] force         if the extension is to be forced; set here
7178                                 to force it on the 2nd call for the same
7179                                 extension component
7180  *
7181  * \retval      0 on success
7182  * \retval      negative errno on error
7183  */
7184 static int lod_sel_handler(const struct lu_env *env,
7185                           struct lod_object *lo,
7186                           struct lu_extent *extent,
7187                           struct thandle *th, int *max_comp,
7188                           int index, int write,
7189                           struct sel_data *sd)
7190 {
7191         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7192         struct lod_thread_info *info = lod_env_info(env);
7193         struct lod_layout_component *lod_comp;
7194         struct lod_layout_component *prev;
7195         struct lod_layout_component *next = NULL;
7196         __u64 extension_size, reserve;
7197         __u64 new_end = 0;
7198         bool repeated;
7199         int change = 0;
7200         int rc = 0;
7201         ENTRY;
7202
7203         /* First component cannot be extension space */
7204         if (index == 0) {
7205                 CERROR("%s: "DFID" first component cannot be extension space\n",
7206                        lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7207                 RETURN(-EINVAL);
7208         }
7209
7210         lod_comp = &lo->ldo_comp_entries[index];
7211         prev = &lo->ldo_comp_entries[index - 1];
7212         if ((index + 1) < *max_comp)
7213                 next = &lo->ldo_comp_entries[index + 1];
7214
7215         /* extension size uses the stripe size field as KiB */
7216         extension_size = lod_comp->llc_stripe_size * SEL_UNIT_SIZE;
7217
7218         CDEBUG(D_LAYOUT, "prev start %llu, extension start %llu, extension end"
7219                " %llu, extension size %llu\n", prev->llc_extent.e_start,
7220                lod_comp->llc_extent.e_start, lod_comp->llc_extent.e_end,
7221                extension_size);
7222
7223         /* Two extension space components cannot be adjacent & extension space
7224          * components cannot be init */
7225         if ((prev->llc_flags & LCME_FL_EXTENSION) ||
7226             !(ergo(next, !(next->llc_flags & LCME_FL_EXTENSION))) ||
7227              lod_comp_inited(lod_comp)) {
7228                 CERROR("%s: "DFID" invalid extension space components\n",
7229                        lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7230                 RETURN(-EINVAL);
7231         }
7232
7233         reserve = lod_sel_stripe_reserved(lod_comp);
7234
7235         if (!prev->llc_stripe) {
7236                 CDEBUG(D_LAYOUT, "Previous component not inited\n");
7237                 info->lti_count = 1;
7238                 info->lti_comp_idx[0] = index - 1;
7239                 rc = lod_declare_instantiate_components(env, lo, th, reserve);
7240                 /* ENOSPC tells us we can't use this component.  If there is
7241                  * a next or we are repeating, we either spill over (next) or
7242                  * extend the original comp (repeat).  Otherwise, return the
7243                  * error to the user. */
7244                 if (rc == -ENOSPC && (next || sd->sd_repeat))
7245                         rc = 1;
7246                 if (rc < 0)
7247                         RETURN(rc);
7248         }
7249
7250         if (sd->sd_force == 0 && rc == 0)
7251                 rc = !lod_sel_osts_allowed(env, lo, index - 1, reserve, extent,
7252                                            &lod_comp->llc_extent, write);
7253
7254         repeated = !!(sd->sd_repeat);
7255         sd->sd_repeat = 0;
7256         sd->sd_force = 0;
7257
7258         /* Extend previous component */
7259         if (rc == 0) {
7260                 new_end = lod_extension_new_end(extension_size, extent->e_end,
7261                                                 prev->llc_stripe_size,
7262                                                 prev->llc_extent.e_end,
7263                                                 lod_comp->llc_extent.e_end);
7264
7265                 CDEBUG(D_LAYOUT, "new end %llu\n", new_end);
7266                 lod_comp->llc_extent.e_start = new_end;
7267                 prev->llc_extent.e_end = new_end;
7268
7269                 if (prev->llc_extent.e_end == lod_comp->llc_extent.e_end) {
7270                         CDEBUG(D_LAYOUT, "Extension component exhausted\n");
7271                         lod_comp->llc_id = LCME_ID_INVAL;
7272                         change--;
7273                 }
7274         } else {
7275                 /* rc == 1, failed to extend current component */
7276                 LASSERT(rc == 1);
7277                 if (next) {
7278                         /* Normal 'spillover' case - Remove the extension
7279                          * space component & bring down the start of the next
7280                          * component. */
7281                         lod_comp->llc_id = LCME_ID_INVAL;
7282                         change--;
7283                         if (!(prev->llc_flags & LCME_FL_INIT)) {
7284                                 prev->llc_id = LCME_ID_INVAL;
7285                                 change--;
7286                         }
7287                         lod_sel_adjust_extents(env, lo, *max_comp, index);
7288                 } else if (lod_comp_inited(prev)) {
7289                         /* If there is no next, and the previous component is
7290                          * INIT'ed, try repeating the previous component. */
7291                         LASSERT(repeated == 0);
7292                         rc = lod_layout_repeat_comp(env, lo, index - 1);
7293                         if (rc < 0)
7294                                 RETURN(rc);
7295                         change++;
7296                         /* The previous component is a repeated component.
7297                          * Record this so we don't keep trying to repeat it. */
7298                         sd->sd_repeat = 1;
7299                 } else {
7300                         /* If the previous component is not INIT'ed, this may
7301                          * be a component we have just instantiated but failed
7302                          * to extend. Or even a repeated component we failed
7303                          * to prepare a striping for. Do not repeat but instead
7304                          * remove the repeated component & force the extention
7305                          * of the original one */
7306                         sd->sd_force = 1;
7307                         if (repeated) {
7308                                 prev->llc_id = LCME_ID_INVAL;
7309                                 change--;
7310                         }
7311                 }
7312         }
7313
7314         if (change < 0) {
7315                 rc = lod_layout_del_prep_layout(env, lo, NULL);
7316                 if (rc < 0)
7317                         RETURN(rc);
7318                 LASSERTF(-rc == change,
7319                          "number deleted %d != requested %d\n", -rc,
7320                          change);
7321         }
7322         *max_comp = *max_comp + change;
7323
7324         /* lod_del_prep_layout reallocates ldo_comp_entries, so we must
7325          * refresh these pointers before using them */
7326         lod_comp = &lo->ldo_comp_entries[index];
7327         prev = &lo->ldo_comp_entries[index - 1];
7328         CDEBUG(D_LAYOUT, "After extent updates: prev start %llu, current start "
7329                "%llu, current end %llu max_comp %d ldo_comp_cnt %d\n",
7330                prev->llc_extent.e_start, lod_comp->llc_extent.e_start,
7331                lod_comp->llc_extent.e_end, *max_comp, lo->ldo_comp_cnt);
7332
7333         /* Layout changed successfully */
7334         RETURN(0);
7335 }
7336
7337 /**
7338  * Declare layout extent updates
7339  *
7340  * Handles extensions.  Identifies extension components touched by current
7341  * operation and passes them to processing function.
7342  *
7343  * Restarts with updated layouts from the processing function until the current
7344  * operation no longer touches an extension space component.
7345  *
7346  * \param[in] env       execution environment for this thread
7347  * \param[in,out] lo    object to update the layout of
7348  * \param[in] extent    layout extent for requested operation, update layout to
7349  *                      fit this operation
7350  * \param[in] th        transaction handle for this operation
7351  * \param[in] pick      identifies chosen mirror for FLR layouts
7352  * \param[in] write     if this is write op
7353  *
7354  * \retval      1 on layout changed, 0 on no change
7355  * \retval      negative errno on error
7356  */
7357 static int lod_declare_update_extents(const struct lu_env *env,
7358                 struct lod_object *lo, struct lu_extent *extent,
7359                 struct thandle *th, int pick, int write)
7360 {
7361         struct lod_thread_info *info = lod_env_info(env);
7362         struct lod_layout_component *lod_comp;
7363         bool layout_changed = false;
7364         struct sel_data sd = { 0 };
7365         int start_index;
7366         int i = 0;
7367         int max_comp = 0;
7368         int rc = 0, rc2;
7369         int change = 0;
7370         ENTRY;
7371
7372         /* This makes us work on the components of the chosen mirror */
7373         start_index = lo->ldo_mirrors[pick].lme_start;
7374         max_comp = lo->ldo_mirrors[pick].lme_end + 1;
7375         if (lo->ldo_flr_state == LCM_FL_NONE)
7376                 LASSERT(start_index == 0 && max_comp == lo->ldo_comp_cnt);
7377
7378         CDEBUG(D_LAYOUT, "extent->e_start %llu, extent->e_end %llu\n",
7379                extent->e_start, extent->e_end);
7380         for (i = start_index; i < max_comp; i++) {
7381                 lod_comp = &lo->ldo_comp_entries[i];
7382
7383                 /* We've passed all components of interest */
7384                 if (lod_comp->llc_extent.e_start >= extent->e_end)
7385                         break;
7386
7387                 if (lod_comp->llc_flags & LCME_FL_EXTENSION) {
7388                         layout_changed = true;
7389                         rc = lod_sel_handler(env, lo, extent, th, &max_comp,
7390                                              i, write, &sd);
7391                         if (rc < 0)
7392                                 GOTO(out, rc);
7393
7394                         /* Nothing has changed behind the prev one */
7395                         i -= 2;
7396                         continue;
7397                 }
7398         }
7399
7400         /* We may have added or removed components.  If so, we must update the
7401          * start & ends of all the mirrors after the current one, and the end
7402          * of the current mirror. */
7403         change = max_comp - 1 - lo->ldo_mirrors[pick].lme_end;
7404         if (change) {
7405                 lo->ldo_mirrors[pick].lme_end += change;
7406                 for (i = pick + 1; i < lo->ldo_mirror_count; i++) {
7407                         lo->ldo_mirrors[i].lme_start += change;
7408                         lo->ldo_mirrors[i].lme_end += change;
7409                 }
7410         }
7411
7412         EXIT;
7413 out:
7414         /* The amount of components has changed, adjust the lti_comp_idx */
7415         rc2 = lod_layout_data_init(info, lo->ldo_comp_cnt);
7416
7417         return rc < 0 ? rc : rc2 < 0 ? rc2 : layout_changed;
7418 }
7419
7420 /* If striping is already instantiated or INIT'ed DOM? */
7421 static bool lod_is_instantiation_needed(struct lod_layout_component *comp)
7422 {
7423         return !(((lov_pattern(comp->llc_pattern) == LOV_PATTERN_MDT) &&
7424                   lod_comp_inited(comp)) || comp->llc_stripe);
7425 }
7426
7427 /**
7428  * Declare layout update for a non-FLR layout.
7429  *
7430  * \param[in] env       execution environment for this thread
7431  * \param[in,out] lo    object to update the layout of
7432  * \param[in] layout    layout intent for requested operation, "update" is
7433  *                      a process of reacting to this
7434  * \param[in] buf       buffer containing lov ea (see comment on usage inline)
7435  * \param[in] th        transaction handle for this operation
7436  *
7437  * \retval      0 on success
7438  * \retval      negative errno on error
7439  */
7440 static int lod_declare_update_plain(const struct lu_env *env,
7441                 struct lod_object *lo, struct layout_intent *layout,
7442                 const struct lu_buf *buf, struct thandle *th)
7443 {
7444         struct lod_thread_info *info = lod_env_info(env);
7445         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7446         struct lod_layout_component *lod_comp;
7447         struct lov_comp_md_v1 *comp_v1 = NULL;
7448         bool layout_changed = false;
7449         bool replay = false;
7450         int i, rc;
7451         ENTRY;
7452
7453         LASSERT(lo->ldo_flr_state == LCM_FL_NONE);
7454
7455         /*
7456          * In case the client is passing lovea, which only happens during
7457          * the replay of layout intent write RPC for now, we may need to
7458          * parse the lovea and apply new layout configuration.
7459          */
7460         if (buf && buf->lb_len)  {
7461                 struct lov_user_md_v1 *v1 = buf->lb_buf;
7462
7463                 if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
7464                     v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
7465                                               LOV_MAGIC_COMP_V1)) {
7466                         CERROR("%s: the replay buffer of layout extend "
7467                                "(magic %#x) does not contain expected "
7468                                "composite layout.\n",
7469                                lod2obd(d)->obd_name, v1->lmm_magic);
7470                         GOTO(out, rc = -EINVAL);
7471                 }
7472
7473                 rc = lod_use_defined_striping(env, lo, buf);
7474                 if (rc)
7475                         GOTO(out, rc);
7476                 lo->ldo_comp_cached = 1;
7477
7478                 rc = lod_get_lov_ea(env, lo);
7479                 if (rc <= 0)
7480                         GOTO(out, rc);
7481                 /* old on-disk EA is stored in info->lti_buf */
7482                 comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
7483                 replay = true;
7484                 layout_changed = true;
7485
7486                 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
7487                 if (rc)
7488                         GOTO(out, rc);
7489         } else {
7490                 /* non replay path */
7491                 rc = lod_striping_load(env, lo);
7492                 if (rc)
7493                         GOTO(out, rc);
7494         }
7495
7496         /* Make sure defined layout covers the requested write range. */
7497         lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
7498         if (lo->ldo_comp_cnt > 1 &&
7499             lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
7500             lod_comp->llc_extent.e_end < layout->li_extent.e_end) {
7501                 CDEBUG_LIMIT(replay ? D_ERROR : D_LAYOUT,
7502                              "%s: the defined layout [0, %#llx) does not "
7503                              "covers the write range "DEXT"\n",
7504                              lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
7505                              PEXT(&layout->li_extent));
7506                 GOTO(out, rc = -EINVAL);
7507         }
7508
7509         CDEBUG(D_LAYOUT, "%s: "DFID": update components "DEXT"\n",
7510                lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
7511                PEXT(&layout->li_extent));
7512
7513         if (!replay) {
7514                 rc = lod_declare_update_extents(env, lo, &layout->li_extent,
7515                                 th, 0, layout->li_opc == LAYOUT_INTENT_WRITE);
7516                 if (rc < 0)
7517                         GOTO(out, rc);
7518                 else if (rc)
7519                         layout_changed = true;
7520         }
7521
7522         /*
7523          * Iterate ld->ldo_comp_entries, find the component whose extent under
7524          * the write range and not instantianted.
7525          */
7526         for (i = 0; i < lo->ldo_comp_cnt; i++) {
7527                 lod_comp = &lo->ldo_comp_entries[i];
7528
7529                 if (lod_comp->llc_extent.e_start >= layout->li_extent.e_end)
7530                         break;
7531
7532                 if (!replay) {
7533                         /* If striping is instantiated or INIT'ed DOM skip */
7534                         if (!lod_is_instantiation_needed(lod_comp))
7535                                 continue;
7536                 } else {
7537                         /**
7538                          * In replay path, lod_comp is the EA passed by
7539                          * client replay buffer,  comp_v1 is the pre-recovery
7540                          * on-disk EA, we'd sift out those components which
7541                          * were init-ed in the on-disk EA.
7542                          */
7543                         if (le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags) &
7544                             LCME_FL_INIT)
7545                                 continue;
7546                 }
7547                 /*
7548                  * this component hasn't instantiated in normal path, or during
7549                  * replay it needs replay the instantiation.
7550                  */
7551
7552                 /* A released component is being extended */
7553                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
7554                         GOTO(out, rc = -EINVAL);
7555
7556                 LASSERT(info->lti_comp_idx != NULL);
7557                 info->lti_comp_idx[info->lti_count++] = i;
7558                 layout_changed = true;
7559         }
7560
7561         if (!layout_changed)
7562                 RETURN(-EALREADY);
7563
7564         lod_obj_inc_layout_gen(lo);
7565         rc = lod_declare_instantiate_components(env, lo, th, 0);
7566         EXIT;
7567 out:
7568         if (rc)
7569                 lod_striping_free(env, lo);
7570         return rc;
7571 }
7572
7573 static inline int lod_comp_index(struct lod_object *lo,
7574                                  struct lod_layout_component *lod_comp)
7575 {
7576         LASSERT(lod_comp >= lo->ldo_comp_entries &&
7577                 lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
7578
7579         return lod_comp - lo->ldo_comp_entries;
7580 }
7581
7582 /**
7583  * Stale other mirrors by writing extent.
7584  */
7585 static int lod_stale_components(const struct lu_env *env, struct lod_object *lo,
7586                                 int primary, struct lu_extent *extent,
7587                                 struct thandle *th)
7588 {
7589         struct lod_layout_component *pri_comp, *lod_comp;
7590         struct lu_extent pri_extent;
7591         int rc = 0;
7592         int i;
7593         ENTRY;
7594
7595         /* The writing extent decides which components in the primary
7596          * are affected... */
7597         CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
7598
7599 restart:
7600         lod_foreach_mirror_comp(pri_comp, lo, primary) {
7601                 if (!lu_extent_is_overlapped(extent, &pri_comp->llc_extent))
7602                         continue;
7603
7604                 CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
7605                        lod_comp_index(lo, pri_comp),
7606                        PEXT(&pri_comp->llc_extent));
7607
7608                 pri_extent.e_start = pri_comp->llc_extent.e_start;
7609                 pri_extent.e_end = pri_comp->llc_extent.e_end;
7610
7611                 for (i = 0; i < lo->ldo_mirror_count; i++) {
7612                         if (i == primary)
7613                                 continue;
7614                         rc = lod_declare_update_extents(env, lo, &pri_extent,
7615                                                         th, i, 0);
7616                         /* if update_extents changed the layout, it may have
7617                          * reallocated the component array, so start over to
7618                          * avoid using stale pointers */
7619                         if (rc == 1)
7620                                 goto restart;
7621                         if (rc < 0)
7622                                 RETURN(rc);
7623
7624                         /* ... and then stale other components that are
7625                          * overlapping with primary components */
7626                         lod_foreach_mirror_comp(lod_comp, lo, i) {
7627                                 if (!lu_extent_is_overlapped(
7628                                                         &pri_extent,
7629                                                         &lod_comp->llc_extent))
7630                                         continue;
7631
7632                                 CDEBUG(D_LAYOUT, "stale: %u / %u\n",
7633                                       i, lod_comp_index(lo, lod_comp));
7634
7635                                 lod_comp->llc_flags |= LCME_FL_STALE;
7636                                 lo->ldo_mirrors[i].lme_stale = 1;
7637                         }
7638                 }
7639         }
7640
7641         RETURN(rc);
7642 }
7643
7644 /**
7645  * check an OST's availability
7646  * \param[in] env       execution environment
7647  * \param[in] lo        lod object
7648  * \param[in] dt        dt object
7649  * \param[in] index     mirror index
7650  *
7651  * \retval      negative if failed
7652  * \retval      1 if \a dt is available
7653  * \retval      0 if \a dt is not available
7654  */
7655 static inline int lod_check_ost_avail(const struct lu_env *env,
7656                                       struct lod_object *lo,
7657                                       struct dt_object *dt, int index)
7658 {
7659         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7660         struct lod_tgt_desc *ost;
7661         __u32 idx;
7662         int type = LU_SEQ_RANGE_OST;
7663         int rc;
7664
7665         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
7666         if (rc < 0) {
7667                 CERROR("%s: can't locate "DFID":rc = %d\n",
7668                        lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
7669                        rc);
7670                 return rc;
7671         }
7672
7673         ost = OST_TGT(lod, idx);
7674         if (ost->ltd_statfs.os_state &
7675                 (OS_STATFS_READONLY | OS_STATFS_ENOSPC | OS_STATFS_ENOINO |
7676                  OS_STATFS_NOPRECREATE) ||
7677             ost->ltd_active == 0) {
7678                 CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n",
7679                        PFID(lod_object_fid(lo)), index, idx, rc);
7680                 return 0;
7681         }
7682
7683         return 1;
7684 }
7685
7686 /**
7687  * Pick primary mirror for write
7688  * \param[in] env       execution environment
7689  * \param[in] lo        object
7690  * \param[in] extent    write range
7691  */
7692 static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
7693                             struct lu_extent *extent)
7694 {
7695         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7696         unsigned int seq = 0;
7697         struct lod_layout_component *lod_comp;
7698         int i, j, rc;
7699         int picked = -1, second_pick = -1, third_pick = -1;
7700         ENTRY;
7701
7702         if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
7703                 get_random_bytes(&seq, sizeof(seq));
7704                 seq %= lo->ldo_mirror_count;
7705         }
7706
7707         /**
7708          * Pick a mirror as the primary, and check the availability of OSTs.
7709          *
7710          * This algo can be revised later after knowing the topology of
7711          * cluster.
7712          */
7713         lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
7714
7715         rc = lod_fill_mirrors(lo);
7716         if (rc)
7717                 RETURN(rc);
7718
7719         for (i = 0; i < lo->ldo_mirror_count; i++) {
7720                 bool ost_avail = true;
7721                 int index = (i + seq) % lo->ldo_mirror_count;
7722
7723                 if (lo->ldo_mirrors[index].lme_stale) {
7724                         CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
7725                                PFID(lod_object_fid(lo)), index);
7726                         continue;
7727                 }
7728
7729                 /* 2nd pick is for the primary mirror containing unavail OST */
7730                 if (lo->ldo_mirrors[index].lme_prefer && second_pick < 0)
7731                         second_pick = index;
7732
7733                 /* 3rd pick is for non-primary mirror containing unavail OST */
7734                 if (second_pick < 0 && third_pick < 0)
7735                         third_pick = index;
7736
7737                 /**
7738                  * we found a non-primary 1st pick, we'd like to find a
7739                  * potential pirmary mirror.
7740                  */
7741                 if (picked >= 0 && !lo->ldo_mirrors[index].lme_prefer)
7742                         continue;
7743
7744                 /* check the availability of OSTs */
7745                 lod_foreach_mirror_comp(lod_comp, lo, index) {
7746                         if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
7747                                 continue;
7748
7749                         for (j = 0; j < lod_comp->llc_stripe_count; j++) {
7750                                 struct dt_object *dt = lod_comp->llc_stripe[j];
7751
7752                                 rc = lod_check_ost_avail(env, lo, dt, index);
7753                                 if (rc < 0)
7754                                         RETURN(rc);
7755
7756                                 ost_avail = !!rc;
7757                                 if (!ost_avail)
7758                                         break;
7759                         } /* for all dt object in one component */
7760                         if (!ost_avail)
7761                                 break;
7762                 } /* for all components in a mirror */
7763
7764                 /**
7765                  * the OSTs where allocated objects locates in the components
7766                  * of the mirror are available.
7767                  */
7768                 if (!ost_avail)
7769                         continue;
7770
7771                 /* this mirror has all OSTs available */
7772                 picked = index;
7773
7774                 /**
7775                  * primary with all OSTs are available, this is the perfect
7776                  * 1st pick.
7777                  */
7778                 if (lo->ldo_mirrors[index].lme_prefer)
7779                         break;
7780         } /* for all mirrors */
7781
7782         /* failed to pick a sound mirror, lower our expectation */
7783         if (picked < 0)
7784                 picked = second_pick;
7785         if (picked < 0)
7786                 picked = third_pick;
7787         if (picked < 0)
7788                 RETURN(-ENODATA);
7789
7790         RETURN(picked);
7791 }
7792
7793 static int lod_prepare_resync_mirror(const struct lu_env *env,
7794                                      struct lod_object *lo,
7795                                      __u16 mirror_id)
7796 {
7797         struct lod_thread_info *info = lod_env_info(env);
7798         struct lod_layout_component *lod_comp;
7799         bool neg = !!(MIRROR_ID_NEG & mirror_id);
7800         int i;
7801
7802         mirror_id &= ~MIRROR_ID_NEG;
7803
7804         for (i = 0; i < lo->ldo_mirror_count; i++) {
7805                 if ((!neg && lo->ldo_mirrors[i].lme_id != mirror_id) ||
7806                     (neg && lo->ldo_mirrors[i].lme_id == mirror_id))
7807                         continue;
7808
7809                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7810                         if (lod_comp_inited(lod_comp))
7811                                 continue;
7812
7813                         info->lti_comp_idx[info->lti_count++] =
7814                                 lod_comp_index(lo, lod_comp);
7815                 }
7816         }
7817
7818         return 0;
7819 }
7820
7821 /**
7822  * figure out the components should be instantiated for resync.
7823  */
7824 static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
7825                               struct lu_extent *extent)
7826 {
7827         struct lod_thread_info *info = lod_env_info(env);
7828         struct lod_layout_component *lod_comp;
7829         unsigned int need_sync = 0;
7830         int i;
7831
7832         CDEBUG(D_LAYOUT,
7833                DFID": instantiate all stale components in "DEXT"\n",
7834                PFID(lod_object_fid(lo)), PEXT(extent));
7835
7836         /**
7837          * instantiate all components within this extent, even non-stale
7838          * components.
7839          */
7840         for (i = 0; i < lo->ldo_mirror_count; i++) {
7841                 if (!lo->ldo_mirrors[i].lme_stale)
7842                         continue;
7843
7844                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7845                         if (!lu_extent_is_overlapped(extent,
7846                                                 &lod_comp->llc_extent))
7847                                 break;
7848
7849                         need_sync++;
7850
7851                         if (lod_comp_inited(lod_comp))
7852                                 continue;
7853
7854                         CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
7855                                i, lod_comp_index(lo, lod_comp));
7856                         info->lti_comp_idx[info->lti_count++] =
7857                                         lod_comp_index(lo, lod_comp);
7858                 }
7859         }
7860
7861         return need_sync ? 0 : -EALREADY;
7862 }
7863
7864 static int lod_declare_update_rdonly(const struct lu_env *env,
7865                 struct lod_object *lo, struct md_layout_change *mlc,
7866                 struct thandle *th)
7867 {
7868         struct lod_thread_info *info = lod_env_info(env);
7869         struct lu_attr *layout_attr = &info->lti_layout_attr;
7870         struct lod_layout_component *lod_comp;
7871         struct lu_extent extent = { 0 };
7872         int rc;
7873         ENTRY;
7874
7875         LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
7876         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
7877                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
7878         LASSERT(lo->ldo_mirror_count > 0);
7879
7880         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
7881                 struct layout_intent *layout = mlc->mlc_intent;
7882                 int write = layout->li_opc == LAYOUT_INTENT_WRITE;
7883                 int picked;
7884
7885                 extent = layout->li_extent;
7886                 CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
7887                        PFID(lod_object_fid(lo)), PEXT(&extent));
7888
7889                 picked = lod_primary_pick(env, lo, &extent);
7890                 if (picked < 0)
7891                         RETURN(picked);
7892
7893                 CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
7894                        PFID(lod_object_fid(lo)),
7895                        lo->ldo_mirrors[picked].lme_id);
7896
7897                 /* Update extents of primary before staling */
7898                 rc = lod_declare_update_extents(env, lo, &extent, th, picked,
7899                                                 write);
7900                 if (rc < 0)
7901                         GOTO(out, rc);
7902
7903                 if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
7904                         /**
7905                          * trunc transfers [0, size) in the intent extent, we'd
7906                          * stale components overlapping [size, eof).
7907                          */
7908                         extent.e_start = extent.e_end;
7909                         extent.e_end = OBD_OBJECT_EOF;
7910                 }
7911
7912                 /* stale overlapping components from other mirrors */
7913                 rc = lod_stale_components(env, lo, picked, &extent, th);
7914                 if (rc < 0)
7915                         GOTO(out, rc);
7916
7917                 /* restore truncate intent extent */
7918                 if (layout->li_opc == LAYOUT_INTENT_TRUNC)
7919                         extent.e_end = extent.e_start;
7920
7921                 /* instantiate components for the picked mirror, start from 0 */
7922                 extent.e_start = 0;
7923
7924                 lod_foreach_mirror_comp(lod_comp, lo, picked) {
7925                         if (!lu_extent_is_overlapped(&extent,
7926                                                      &lod_comp->llc_extent))
7927                                 break;
7928
7929                         if (!lod_is_instantiation_needed(lod_comp))
7930                                 continue;
7931
7932                         info->lti_comp_idx[info->lti_count++] =
7933                                                 lod_comp_index(lo, lod_comp);
7934                 }
7935
7936                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
7937         } else { /* MD_LAYOUT_RESYNC */
7938                 int i;
7939
7940                 /**
7941                  * could contain multiple non-stale mirrors, so we need to
7942                  * prep uninited all components assuming any non-stale mirror
7943                  * could be picked as the primary mirror.
7944                  */
7945                 if (mlc->mlc_mirror_id == 0) {
7946                         /* normal resync */
7947                         for (i = 0; i < lo->ldo_mirror_count; i++) {
7948                                 if (lo->ldo_mirrors[i].lme_stale)
7949                                         continue;
7950
7951                                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7952                                         if (!lod_comp_inited(lod_comp))
7953                                                 break;
7954
7955                                         if (extent.e_end <
7956                                                 lod_comp->llc_extent.e_end)
7957                                                 extent.e_end =
7958                                                      lod_comp->llc_extent.e_end;
7959                                 }
7960                         }
7961                         rc = lod_prepare_resync(env, lo, &extent);
7962                         if (rc)
7963                                 GOTO(out, rc);
7964                 } else {
7965                         /* mirror write, try to init its all components */
7966                         rc = lod_prepare_resync_mirror(env, lo,
7967                                                        mlc->mlc_mirror_id);
7968                         if (rc)
7969                                 GOTO(out, rc);
7970                 }
7971
7972                 /* change the file state to SYNC_PENDING */
7973                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
7974         }
7975
7976         /* Reset the layout version once it's becoming too large.
7977          * This way it can make sure that the layout version is
7978          * monotonously increased in this writing era. */
7979         lod_obj_inc_layout_gen(lo);
7980         if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
7981                 __u32 layout_version;
7982
7983                 get_random_bytes(&layout_version, sizeof(layout_version));
7984                 lo->ldo_layout_gen = layout_version & 0xffff;
7985         }
7986
7987         rc = lod_declare_instantiate_components(env, lo, th, 0);
7988         if (rc)
7989                 GOTO(out, rc);
7990
7991         layout_attr->la_valid = LA_LAYOUT_VERSION;
7992         layout_attr->la_layout_version = 0; /* set current version */
7993         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
7994                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
7995         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
7996         if (rc)
7997                 GOTO(out, rc);
7998
7999 out:
8000         if (rc)
8001                 lod_striping_free(env, lo);
8002         RETURN(rc);
8003 }
8004
8005 static int lod_declare_update_write_pending(const struct lu_env *env,
8006                 struct lod_object *lo, struct md_layout_change *mlc,
8007                 struct thandle *th)
8008 {
8009         struct lod_thread_info *info = lod_env_info(env);
8010         struct lu_attr *layout_attr = &info->lti_layout_attr;
8011         struct lod_layout_component *lod_comp;
8012         struct lu_extent extent = { 0 };
8013         int primary = -1;
8014         int i;
8015         int rc;
8016         ENTRY;
8017
8018         LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
8019         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
8020                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
8021
8022         /* look for the first preferred mirror */
8023         for (i = 0; i < lo->ldo_mirror_count; i++) {
8024                 if (lo->ldo_mirrors[i].lme_stale)
8025                         continue;
8026                 if (lo->ldo_mirrors[i].lme_prefer == 0)
8027                         continue;
8028
8029                 primary = i;
8030                 break;
8031         }
8032         if (primary < 0) {
8033                 /* no primary, use any in-sync */
8034                 for (i = 0; i < lo->ldo_mirror_count; i++) {
8035                         if (lo->ldo_mirrors[i].lme_stale)
8036                                 continue;
8037                         primary = i;
8038                         break;
8039                 }
8040                 if (primary < 0) {
8041                         CERROR(DFID ": doesn't have a primary mirror\n",
8042                                PFID(lod_object_fid(lo)));
8043                         GOTO(out, rc = -ENODATA);
8044                 }
8045         }
8046
8047         CDEBUG(D_LAYOUT, DFID": found primary %u\n",
8048                PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
8049
8050         LASSERT(!lo->ldo_mirrors[primary].lme_stale);
8051
8052         /* for LAYOUT_WRITE opc, it has to do the following operations:
8053          * 1. stale overlapping componets from stale mirrors;
8054          * 2. instantiate components of the primary mirror;
8055          * 3. transfter layout version to all objects of the primary;
8056          *
8057          * for LAYOUT_RESYNC opc, it will do:
8058          * 1. instantiate components of all stale mirrors;
8059          * 2. transfer layout version to all objects to close write era. */
8060
8061         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8062                 struct layout_intent *layout = mlc->mlc_intent;
8063                 int write = layout->li_opc == LAYOUT_INTENT_WRITE;
8064
8065                 LASSERT(mlc->mlc_intent != NULL);
8066
8067                 extent = mlc->mlc_intent->li_extent;
8068
8069                 CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
8070                        PFID(lod_object_fid(lo)), PEXT(&extent));
8071
8072                 /* 1. Update extents of primary before staling */
8073                 rc = lod_declare_update_extents(env, lo, &extent, th, primary,
8074                                                 write);
8075                 if (rc < 0)
8076                         GOTO(out, rc);
8077
8078                 if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
8079                         /**
8080                          * trunc transfers [0, size) in the intent extent, we'd
8081                          * stale components overlapping [size, eof).
8082                          */
8083                         extent.e_start = extent.e_end;
8084                         extent.e_end = OBD_OBJECT_EOF;
8085                 }
8086
8087                 /* 2. stale overlapping components */
8088                 rc = lod_stale_components(env, lo, primary, &extent, th);
8089                 if (rc < 0)
8090                         GOTO(out, rc);
8091
8092                 /* 3. find the components which need instantiating.
8093                  * instantiate [0, mlc->mlc_intent->e_end) */
8094
8095                 /* restore truncate intent extent */
8096                 if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC)
8097                         extent.e_end = extent.e_start;
8098                 extent.e_start = 0;
8099
8100                 lod_foreach_mirror_comp(lod_comp, lo, primary) {
8101                         if (!lu_extent_is_overlapped(&extent,
8102                                                      &lod_comp->llc_extent))
8103                                 break;
8104
8105                         if (!lod_is_instantiation_needed(lod_comp))
8106                                 continue;
8107
8108                         CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
8109                                primary, lod_comp_index(lo, lod_comp));
8110                         info->lti_comp_idx[info->lti_count++] =
8111                                                 lod_comp_index(lo, lod_comp);
8112                 }
8113         } else { /* MD_LAYOUT_RESYNC */
8114                 if (mlc->mlc_mirror_id == 0) {
8115                         /* normal resync */
8116                         lod_foreach_mirror_comp(lod_comp, lo, primary) {
8117                                 if (!lod_comp_inited(lod_comp))
8118                                         break;
8119
8120                                 extent.e_end = lod_comp->llc_extent.e_end;
8121                         }
8122
8123                         rc = lod_prepare_resync(env, lo, &extent);
8124                         if (rc)
8125                                 GOTO(out, rc);
8126                 } else {
8127                         /* mirror write, try to init its all components */
8128                         rc = lod_prepare_resync_mirror(env, lo,
8129                                                        mlc->mlc_mirror_id);
8130                         if (rc)
8131                                 GOTO(out, rc);
8132                 }
8133
8134                 /* change the file state to SYNC_PENDING */
8135                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
8136         }
8137
8138         rc = lod_declare_instantiate_components(env, lo, th, 0);
8139         if (rc)
8140                 GOTO(out, rc);
8141
8142         /* 3. transfer layout version to OST objects.
8143          * transfer new layout version to OST objects so that stale writes
8144          * can be denied. It also ends an era of writing by setting
8145          * LU_LAYOUT_RESYNC. Normal client can never use this bit to
8146          * send write RPC; only resync RPCs could do it. */
8147         layout_attr->la_valid = LA_LAYOUT_VERSION;
8148         layout_attr->la_layout_version = 0; /* set current version */
8149         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8150                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8151         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8152         if (rc)
8153                 GOTO(out, rc);
8154
8155         lod_obj_inc_layout_gen(lo);
8156 out:
8157         if (rc)
8158                 lod_striping_free(env, lo);
8159         RETURN(rc);
8160 }
8161
8162 static int lod_declare_update_sync_pending(const struct lu_env *env,
8163                 struct lod_object *lo, struct md_layout_change *mlc,
8164                 struct thandle *th)
8165 {
8166         struct lod_thread_info  *info = lod_env_info(env);
8167         struct lu_attr *layout_attr = &info->lti_layout_attr;
8168         unsigned sync_components = 0;
8169         unsigned resync_components = 0;
8170         int i;
8171         int rc;
8172         ENTRY;
8173
8174         LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
8175         LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
8176                 mlc->mlc_opc == MD_LAYOUT_WRITE);
8177
8178         CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
8179                PFID(lod_object_fid(lo)), mlc->mlc_opc);
8180
8181         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8182                 CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
8183                        PFID(lod_object_fid(lo)));
8184
8185                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
8186                 return lod_declare_update_write_pending(env, lo, mlc, th);
8187         }
8188
8189         /* MD_LAYOUT_RESYNC_DONE */
8190
8191         for (i = 0; i < lo->ldo_comp_cnt; i++) {
8192                 struct lod_layout_component *lod_comp;
8193                 int j;
8194
8195                 lod_comp = &lo->ldo_comp_entries[i];
8196
8197                 if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
8198                         sync_components++;
8199                         continue;
8200                 }
8201
8202                 for (j = 0; j < mlc->mlc_resync_count; j++) {
8203                         if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
8204                                 continue;
8205
8206                         mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
8207                         lod_comp->llc_flags &= ~LCME_FL_STALE;
8208                         resync_components++;
8209                         break;
8210                 }
8211         }
8212
8213         /* valid check */
8214         for (i = 0; i < mlc->mlc_resync_count; i++) {
8215                 if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
8216                         continue;
8217
8218                 CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zd) not exist "
8219                        "or already synced\n", PFID(lod_object_fid(lo)),
8220                        mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
8221                 GOTO(out, rc = -EINVAL);
8222         }
8223
8224         if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
8225                 CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
8226                        PFID(lod_object_fid(lo)));
8227
8228                 /* tend to return an error code here to prevent
8229                  * the MDT from setting SoM attribute */
8230                 GOTO(out, rc = -EINVAL);
8231         }
8232
8233         CDEBUG(D_LAYOUT, DFID": synced %u resynced %u/%zu components\n",
8234                PFID(lod_object_fid(lo)),
8235                sync_components, resync_components, mlc->mlc_resync_count);
8236
8237         lo->ldo_flr_state = LCM_FL_RDONLY;
8238         lod_obj_inc_layout_gen(lo);
8239
8240         layout_attr->la_valid = LA_LAYOUT_VERSION;
8241         layout_attr->la_layout_version = 0; /* set current version */
8242         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8243         if (rc)
8244                 GOTO(out, rc);
8245
8246         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
8247         rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8248                                        &info->lti_buf, XATTR_NAME_LOV, 0, th);
8249         EXIT;
8250
8251 out:
8252         if (rc)
8253                 lod_striping_free(env, lo);
8254         RETURN(rc);
8255 }
8256
/*
 * Signature of a directory layout-change handler. The handlers are stored
 * in the dir_mlc_declare_ops[] and dir_mlc_ops[] tables and are selected by
 * md_layout_change::mlc_opc.
 */
typedef int (*mlc_handler)(const struct lu_env *env, struct dt_object *dt,
			   const struct md_layout_change *mlc,
			   struct thandle *th);
8260
8261 /**
8262  * Attach stripes after target's for migrating directory. NB, we
8263  * only need to declare this, the actual work is done inside
8264  * lod_xattr_set_lmv().
8265  *
8266  * \param[in] env       execution environment
8267  * \param[in] dt        target object
8268  * \param[in] mlc       layout change data
8269  * \param[in] th        transaction handle
8270  *
8271  * \retval              0 on success
8272  * \retval              negative if failed
8273  */
8274 static int lod_dir_declare_layout_attach(const struct lu_env *env,
8275                                          struct dt_object *dt,
8276                                          const struct md_layout_change *mlc,
8277                                          struct thandle *th)
8278 {
8279         struct lod_thread_info *info = lod_env_info(env);
8280         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
8281         struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
8282         struct lod_object *lo = lod_dt_obj(dt);
8283         struct dt_object *next = dt_object_child(dt);
8284         struct dt_object_format *dof = &info->lti_format;
8285         struct lmv_mds_md_v1 *lmv = mlc->mlc_buf.lb_buf;
8286         struct dt_object **stripes;
8287         __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
8288         struct lu_fid *fid = &info->lti_fid;
8289         struct lod_tgt_desc *tgt;
8290         struct dt_object *dto;
8291         struct dt_device *tgt_dt;
8292         int type = LU_SEQ_RANGE_ANY;
8293         struct dt_insert_rec *rec = &info->lti_dt_rec;
8294         char *stripe_name = info->lti_key;
8295         struct lu_name *sname;
8296         struct linkea_data ldata = { NULL };
8297         struct lu_buf linkea_buf;
8298         __u32 idx;
8299         int i;
8300         int rc;
8301
8302         ENTRY;
8303
8304         if (!lmv_is_sane(lmv))
8305                 RETURN(-EINVAL);
8306
8307         if (!dt_try_as_dir(env, dt))
8308                 return -ENOTDIR;
8309
8310         dof->dof_type = DFT_DIR;
8311
8312         OBD_ALLOC_PTR_ARRAY(stripes, (lo->ldo_dir_stripe_count + stripe_count));
8313         if (!stripes)
8314                 RETURN(-ENOMEM);
8315
8316         for (i = 0; i < lo->ldo_dir_stripe_count; i++)
8317                 stripes[i] = lo->ldo_stripe[i];
8318
8319         rec->rec_type = S_IFDIR;
8320
8321         for (i = 0; i < stripe_count; i++) {
8322                 fid_le_to_cpu(fid,
8323                         &lmv->lmv_stripe_fids[i]);
8324                 if (!fid_is_sane(fid))
8325                         continue;
8326
8327                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
8328                 if (rc)
8329                         GOTO(out, rc);
8330
8331                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
8332                         tgt_dt = lod->lod_child;
8333                 } else {
8334                         tgt = LTD_TGT(ltd, idx);
8335                         if (tgt == NULL)
8336                                 GOTO(out, rc = -ESTALE);
8337                         tgt_dt = tgt->ltd_tgt;
8338                 }
8339
8340                 dto = dt_locate_at(env, tgt_dt, fid,
8341                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
8342                                   NULL);
8343                 if (IS_ERR(dto))
8344                         GOTO(out, rc = PTR_ERR(dto));
8345
8346                 stripes[i + lo->ldo_dir_stripe_count] = dto;
8347
8348                 if (!dt_try_as_dir(env, dto))
8349                         GOTO(out, rc = -ENOTDIR);
8350
8351                 rc = lod_sub_declare_ref_add(env, dto, th);
8352                 if (rc)
8353                         GOTO(out, rc);
8354
8355                 rec->rec_fid = lu_object_fid(&dto->do_lu);
8356                 rc = lod_sub_declare_insert(env, dto,
8357                                             (const struct dt_rec *)rec,
8358                                             (const struct dt_key *)dot, th);
8359                 if (rc)
8360                         GOTO(out, rc);
8361
8362                 rc = lod_sub_declare_insert(env, dto,
8363                                             (const struct dt_rec *)rec,
8364                                             (const struct dt_key *)dotdot, th);
8365                 if (rc)
8366                         GOTO(out, rc);
8367
8368                 rc = lod_sub_declare_xattr_set(env, dto, &mlc->mlc_buf,
8369                                                 XATTR_NAME_LMV, 0, th);
8370                 if (rc)
8371                         GOTO(out, rc);
8372
8373                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
8374                          PFID(lu_object_fid(&dto->do_lu)),
8375                          i + lo->ldo_dir_stripe_count);
8376
8377                 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
8378                 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
8379                                       sname, lu_object_fid(&dt->do_lu));
8380                 if (rc)
8381                         GOTO(out, rc);
8382
8383                 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
8384                 linkea_buf.lb_len = ldata.ld_leh->leh_len;
8385                 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
8386                                                XATTR_NAME_LINK, 0, th);
8387                 if (rc)
8388                         GOTO(out, rc);
8389
8390                 rc = lod_sub_declare_insert(env, next,
8391                                             (const struct dt_rec *)rec,
8392                                             (const struct dt_key *)stripe_name,
8393                                             th);
8394                 if (rc)
8395                         GOTO(out, rc);
8396
8397                 rc = lod_sub_declare_ref_add(env, next, th);
8398                 if (rc)
8399                         GOTO(out, rc);
8400         }
8401
8402         if (lo->ldo_stripe)
8403                 OBD_FREE_PTR_ARRAY(lo->ldo_stripe,
8404                                    lo->ldo_dir_stripes_allocated);
8405         lo->ldo_stripe = stripes;
8406         lo->ldo_is_foreign = 0;
8407         lo->ldo_dir_migrate_offset = lo->ldo_dir_stripe_count;
8408         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_hash_type);
8409         lo->ldo_dir_stripe_count += stripe_count;
8410         lo->ldo_dir_stripes_allocated += stripe_count;
8411
8412         /* plain directory split creates target as a plain directory, while
8413          * after source attached as the first stripe, it becomes a striped
8414          * directory, set correct do_index_ops, otherwise it can't be unlinked.
8415          */
8416         dt->do_index_ops = &lod_striped_index_ops;
8417
8418         RETURN(0);
8419 out:
8420         i = lo->ldo_dir_stripe_count;
8421         while (i < lo->ldo_dir_stripe_count + stripe_count && stripes[i])
8422                 dt_object_put(env, stripes[i++]);
8423
8424         OBD_FREE_PTR_ARRAY(stripes, stripe_count + lo->ldo_dir_stripe_count);
8425         return rc;
8426 }
8427
8428 static int lod_dir_declare_layout_detach(const struct lu_env *env,
8429                                          struct dt_object *dt,
8430                                          const struct md_layout_change *unused,
8431                                          struct thandle *th)
8432 {
8433         struct lod_thread_info *info = lod_env_info(env);
8434         struct lod_object *lo = lod_dt_obj(dt);
8435         struct dt_object *next = dt_object_child(dt);
8436         char *stripe_name = info->lti_key;
8437         struct dt_object *dto;
8438         int i;
8439         int rc = 0;
8440
8441         if (!dt_try_as_dir(env, dt))
8442                 return -ENOTDIR;
8443
8444         if (!lo->ldo_dir_stripe_count)
8445                 return lod_sub_declare_delete(env, next,
8446                                         (const struct dt_key *)dotdot, th);
8447
8448         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8449                 dto = lo->ldo_stripe[i];
8450                 if (!dto)
8451                         continue;
8452
8453                 if (!dt_try_as_dir(env, dto))
8454                         return -ENOTDIR;
8455
8456                 rc = lod_sub_declare_delete(env, dto,
8457                                         (const struct dt_key *)dotdot, th);
8458                 if (rc)
8459                         return rc;
8460
8461                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8462                          PFID(lu_object_fid(&dto->do_lu)), i);
8463
8464                 rc = lod_sub_declare_delete(env, next,
8465                                         (const struct dt_key *)stripe_name, th);
8466                 if (rc)
8467                         return rc;
8468
8469                 rc = lod_sub_declare_ref_del(env, next, th);
8470                 if (rc)
8471                         return rc;
8472         }
8473
8474         return 0;
8475 }
8476
8477 static int dt_dir_is_empty(const struct lu_env *env,
8478                            struct dt_object *obj)
8479 {
8480         struct dt_it *it;
8481         const struct dt_it_ops *iops;
8482         int rc;
8483
8484         ENTRY;
8485
8486         if (!dt_try_as_dir(env, obj))
8487                 RETURN(-ENOTDIR);
8488
8489         iops = &obj->do_index_ops->dio_it;
8490         it = iops->init(env, obj, LUDA_64BITHASH);
8491         if (IS_ERR(it))
8492                 RETURN(PTR_ERR(it));
8493
8494         rc = iops->get(env, it, (const struct dt_key *)"");
8495         if (rc > 0) {
8496                 int i;
8497
8498                 for (rc = 0, i = 0; rc == 0 && i < 3; ++i)
8499                         rc = iops->next(env, it);
8500                 if (!rc)
8501                         rc = -ENOTEMPTY;
8502                 else if (rc == 1)
8503                         rc = 0;
8504         } else if (!rc) {
8505                 /* Huh? Index contains no zero key? */
8506                 rc = -EIO;
8507         }
8508
8509         iops->put(env, it);
8510         iops->fini(env, it);
8511
8512         RETURN(rc);
8513 }
8514
8515 static int lod_dir_declare_layout_shrink(const struct lu_env *env,
8516                                          struct dt_object *dt,
8517                                          const struct md_layout_change *mlc,
8518                                          struct thandle *th)
8519 {
8520         struct lod_thread_info *info = lod_env_info(env);
8521         struct lod_object *lo = lod_dt_obj(dt);
8522         struct dt_object *next = dt_object_child(dt);
8523         struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
8524         char *stripe_name = info->lti_key;
8525         struct lu_buf *lmv_buf = &info->lti_buf;
8526         __u32 final_stripe_count;
8527         struct dt_object *dto;
8528         int i;
8529         int rc;
8530
8531         LASSERT(lmu);
8532
8533         if (!dt_try_as_dir(env, dt))
8534                 return -ENOTDIR;
8535
8536         /* shouldn't be called on plain directory */
8537         LASSERT(lo->ldo_dir_stripe_count);
8538
8539         lmv_buf->lb_buf = &info->lti_lmv.lmv_md_v1;
8540         lmv_buf->lb_len = sizeof(info->lti_lmv.lmv_md_v1);
8541
8542         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
8543         LASSERT(final_stripe_count &&
8544                 final_stripe_count < lo->ldo_dir_stripe_count);
8545
8546         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8547                 dto = lo->ldo_stripe[i];
8548                 if (!dto)
8549                         continue;
8550
8551                 if (i < final_stripe_count) {
8552                         rc = lod_sub_declare_xattr_set(env, dto, lmv_buf,
8553                                                        XATTR_NAME_LMV,
8554                                                        LU_XATTR_REPLACE, th);
8555                         if (rc)
8556                                 return rc;
8557
8558                         continue;
8559                 }
8560
8561                 rc = dt_dir_is_empty(env, dto);
8562                 if (rc < 0)
8563                         return rc;
8564
8565                 rc = lod_sub_declare_ref_del(env, dto, th);
8566                 if (rc)
8567                         return rc;
8568
8569                 rc = lod_sub_declare_destroy(env, dto, th);
8570                 if (rc)
8571                         return rc;
8572
8573                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8574                          PFID(lu_object_fid(&dto->do_lu)), i);
8575
8576                 rc = lod_sub_declare_delete(env, next,
8577                                         (const struct dt_key *)stripe_name, th);
8578                 if (rc)
8579                         return rc;
8580
8581                 rc = lod_sub_declare_ref_del(env, next, th);
8582                 if (rc)
8583                         return rc;
8584         }
8585
8586         rc = lod_sub_declare_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
8587                                        LU_XATTR_REPLACE, th);
8588         return rc;
8589 }
8590
8591 /**
8592  * Allocate stripes for split directory.
8593  *
8594  * \param[in] env       execution environment
8595  * \param[in] dt        target object
8596  * \param[in] mlc       layout change data
8597  * \param[in] th        transaction handle
8598  *
8599  * \retval              0 on success
8600  * \retval              negative if failed
8601  */
8602 static int lod_dir_declare_layout_split(const struct lu_env *env,
8603                                         struct dt_object *dt,
8604                                         const struct md_layout_change *mlc,
8605                                         struct thandle *th)
8606 {
8607         struct lod_thread_info *info = lod_env_info(env);
8608         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
8609         struct lod_object *lo = lod_dt_obj(dt);
8610         struct dt_object_format *dof = &info->lti_format;
8611         struct lmv_user_md_v1 *lum = mlc->mlc_spec->u.sp_ea.eadata;
8612         struct dt_object **stripes;
8613         u32 stripe_count;
8614         u32 saved_count;
8615         int i;
8616         int rc;
8617
8618         ENTRY;
8619
8620         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC);
8621         LASSERT(le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT);
8622
8623         saved_count = lo->ldo_dir_stripes_allocated;
8624         stripe_count = le32_to_cpu(lum->lum_stripe_count);
8625         if (stripe_count <= saved_count)
8626                 RETURN(-EINVAL);
8627
8628         dof->dof_type = DFT_DIR;
8629
8630         OBD_ALLOC(stripes, sizeof(*stripes) * stripe_count);
8631         if (!stripes)
8632                 RETURN(-ENOMEM);
8633
8634         for (i = 0; i < lo->ldo_dir_stripes_allocated; i++)
8635                 stripes[i] = lo->ldo_stripe[i];
8636
8637         lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
8638         rc = lod_mdt_alloc_qos(env, lo, stripes, saved_count, stripe_count);
8639         if (rc == -EAGAIN)
8640                 rc = lod_mdt_alloc_rr(env, lo, stripes, saved_count,
8641                                       stripe_count);
8642         if (rc < 0) {
8643                 OBD_FREE(stripes, sizeof(*stripes) * stripe_count);
8644                 RETURN(rc);
8645         }
8646
8647         LASSERT(rc > saved_count);
8648         OBD_FREE(lo->ldo_stripe,
8649                  sizeof(*stripes) * lo->ldo_dir_stripes_allocated);
8650         lo->ldo_stripe = stripes;
8651         lo->ldo_is_foreign = 0;
8652         lo->ldo_dir_striped = 1;
8653         lo->ldo_dir_stripe_count = rc;
8654         lo->ldo_dir_stripes_allocated = stripe_count;
8655         lo->ldo_dir_split_hash = lo->ldo_dir_hash_type;
8656         lo->ldo_dir_hash_type = le32_to_cpu(lum->lum_hash_type);
8657         if (!lmv_is_known_hash_type(lo->ldo_dir_hash_type))
8658                 lo->ldo_dir_hash_type =
8659                         lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
8660         lo->ldo_dir_hash_type |= LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MIGRATION;
8661         lo->ldo_dir_split_offset = saved_count;
8662         lo->ldo_dir_layout_version++;
8663         lo->ldo_dir_stripe_loaded = 1;
8664
8665         rc = lod_dir_declare_create_stripes(env, dt, mlc->mlc_attr, dof, th);
8666         if (rc)
8667                 lod_striping_free(env, lo);
8668
8669         RETURN(rc);
8670 }
8671
8672 /*
8673  * detach all stripes from dir master object, NB, stripes are not destroyed, but
8674  * deleted from it's parent namespace, this function is called in two places:
8675  * 1. mdd_migrate_mdt() detach stripes from source, and attach them to
8676  *    target.
8677  * 2. mdd_dir_layout_update() detach stripe before turning 1-stripe directory to
8678  *    a plain directory.
8679  *
8680  * \param[in] env       execution environment
8681  * \param[in] dt        target object
8682  * \param[in] mlc       layout change data
8683  * \param[in] th        transaction handle
8684  *
8685  * \retval              0 on success
8686  * \retval              negative if failed
8687  */
8688 static int lod_dir_layout_detach(const struct lu_env *env,
8689                                  struct dt_object *dt,
8690                                  const struct md_layout_change *mlc,
8691                                  struct thandle *th)
8692 {
8693         struct lod_thread_info *info = lod_env_info(env);
8694         struct lod_object *lo = lod_dt_obj(dt);
8695         struct dt_object *next = dt_object_child(dt);
8696         char *stripe_name = info->lti_key;
8697         struct dt_object *dto;
8698         int i;
8699         int rc = 0;
8700
8701         ENTRY;
8702
8703         if (!lo->ldo_dir_stripe_count) {
8704                 /* plain directory delete .. */
8705                 rc = lod_sub_delete(env, next,
8706                                     (const struct dt_key *)dotdot, th);
8707                 RETURN(rc);
8708         }
8709
8710         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8711                 dto = lo->ldo_stripe[i];
8712                 if (!dto)
8713                         continue;
8714
8715                 rc = lod_sub_delete(env, dto,
8716                                     (const struct dt_key *)dotdot, th);
8717                 if (rc)
8718                         break;
8719
8720                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8721                          PFID(lu_object_fid(&dto->do_lu)), i);
8722
8723                 rc = lod_sub_delete(env, next,
8724                                     (const struct dt_key *)stripe_name, th);
8725                 if (rc)
8726                         break;
8727
8728                 rc = lod_sub_ref_del(env, next, th);
8729                 if (rc)
8730                         break;
8731         }
8732
8733         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8734                 dto = lo->ldo_stripe[i];
8735                 if (dto)
8736                         dt_object_put(env, dto);
8737         }
8738         OBD_FREE_PTR_ARRAY(lo->ldo_stripe, lo->ldo_dir_stripes_allocated);
8739         lo->ldo_stripe = NULL;
8740         lo->ldo_dir_stripes_allocated = 0;
8741         lo->ldo_dir_stripe_count = 0;
8742         dt->do_index_ops = &lod_index_ops;
8743
8744         RETURN(rc);
8745 }
8746
8747 static int lod_dir_layout_shrink(const struct lu_env *env,
8748                                  struct dt_object *dt,
8749                                  const struct md_layout_change *mlc,
8750                                  struct thandle *th)
8751 {
8752         struct lod_thread_info *info = lod_env_info(env);
8753         struct lod_object *lo = lod_dt_obj(dt);
8754         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
8755         struct dt_object *next = dt_object_child(dt);
8756         struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
8757         __u32 final_stripe_count;
8758         char *stripe_name = info->lti_key;
8759         struct dt_object *dto;
8760         struct lu_buf *lmv_buf = &info->lti_buf;
8761         struct lmv_mds_md_v1 *lmv = &info->lti_lmv.lmv_md_v1;
8762         u32 mdtidx;
8763         int type = LU_SEQ_RANGE_ANY;
8764         int i;
8765         int rc;
8766
8767         ENTRY;
8768
8769         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
8770
8771         lmv_buf->lb_buf = lmv;
8772         lmv_buf->lb_len = sizeof(*lmv);
8773         lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
8774         lmv->lmv_stripe_count = cpu_to_le32(final_stripe_count);
8775         lmv->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type) &
8776                              cpu_to_le32(LMV_HASH_TYPE_MASK |
8777                                          LMV_HASH_FLAG_FIXED);
8778         lmv->lmv_layout_version =
8779                         cpu_to_le32(lo->ldo_dir_layout_version + 1);
8780         lmv->lmv_migrate_offset = 0;
8781         lmv->lmv_migrate_hash = 0;
8782
8783         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8784                 dto = lo->ldo_stripe[i];
8785                 if (!dto)
8786                         continue;
8787
8788                 if (i < final_stripe_count) {
8789                         rc = lod_fld_lookup(env, lod,
8790                                             lu_object_fid(&dto->do_lu),
8791                                             &mdtidx, &type);
8792                         if (rc)
8793                                 RETURN(rc);
8794
8795                         lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
8796                         rc = lod_sub_xattr_set(env, dto, lmv_buf,
8797                                                XATTR_NAME_LMV,
8798                                                LU_XATTR_REPLACE, th);
8799                         if (rc)
8800                                 RETURN(rc);
8801
8802                         continue;
8803                 }
8804
8805                 dt_write_lock(env, dto, DT_TGT_CHILD);
8806                 rc = lod_sub_ref_del(env, dto, th);
8807                 dt_write_unlock(env, dto);
8808                 if (rc)
8809                         RETURN(rc);
8810
8811                 rc = lod_sub_destroy(env, dto, th);
8812                 if (rc)
8813                         RETURN(rc);
8814
8815                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8816                          PFID(lu_object_fid(&dto->do_lu)), i);
8817
8818                 rc = lod_sub_delete(env, next,
8819                                     (const struct dt_key *)stripe_name, th);
8820                 if (rc)
8821                         RETURN(rc);
8822
8823                 rc = lod_sub_ref_del(env, next, th);
8824                 if (rc)
8825                         RETURN(rc);
8826         }
8827
8828         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &mdtidx,
8829                             &type);
8830         if (rc)
8831                 RETURN(rc);
8832
8833         lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_V1);
8834         lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
8835         rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
8836                                LU_XATTR_REPLACE, th);
8837         if (rc)
8838                 RETURN(rc);
8839
8840         for (i = final_stripe_count; i < lo->ldo_dir_stripe_count; i++) {
8841                 dto = lo->ldo_stripe[i];
8842                 if (dto)
8843                         dt_object_put(env, dto);
8844         }
8845         lo->ldo_dir_stripe_count = final_stripe_count;
8846
8847         RETURN(rc);
8848 }
8849
/*
 * Declare-phase handlers for directory layout changes, indexed by
 * md_layout_change::mlc_opc. lod_declare_layout_change() asserts that the
 * slot for the requested operation is set before calling it.
 */
static mlc_handler dir_mlc_declare_ops[MD_LAYOUT_MAX] = {
	[MD_LAYOUT_ATTACH] = lod_dir_declare_layout_attach,
	[MD_LAYOUT_DETACH] = lod_dir_declare_layout_detach,
	[MD_LAYOUT_SHRINK] = lod_dir_declare_layout_shrink,
	[MD_LAYOUT_SPLIT]  = lod_dir_declare_layout_split,
};
8856
/*
 * Execute-phase handlers for directory layout changes, indexed by
 * md_layout_change::mlc_opc. Only detach and shrink have a handler here;
 * lod_layout_change() asserts that the slot for the requested operation
 * is set before calling it.
 */
static mlc_handler dir_mlc_ops[MD_LAYOUT_MAX] = {
	[MD_LAYOUT_DETACH] = lod_dir_layout_detach,
	[MD_LAYOUT_SHRINK] = lod_dir_layout_shrink,
};
8861
8862 static int lod_declare_layout_change(const struct lu_env *env,
8863                 struct dt_object *dt, struct md_layout_change *mlc,
8864                 struct thandle *th)
8865 {
8866         struct lod_thread_info  *info = lod_env_info(env);
8867         struct lod_object *lo = lod_dt_obj(dt);
8868         int rc;
8869
8870         ENTRY;
8871
8872         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
8873                 LASSERT(dir_mlc_declare_ops[mlc->mlc_opc]);
8874                 rc = dir_mlc_declare_ops[mlc->mlc_opc](env, dt, mlc, th);
8875                 RETURN(rc);
8876         }
8877
8878         if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
8879             dt_object_remote(dt_object_child(dt)))
8880                 RETURN(-EINVAL);
8881
8882         rc = lod_striping_load(env, lo);
8883         if (rc)
8884                 GOTO(out, rc);
8885
8886         LASSERT(lo->ldo_comp_cnt > 0);
8887
8888         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
8889         if (rc)
8890                 GOTO(out, rc);
8891
8892         switch (lo->ldo_flr_state) {
8893         case LCM_FL_NONE:
8894                 rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
8895                                               &mlc->mlc_buf, th);
8896                 break;
8897         case LCM_FL_RDONLY:
8898                 rc = lod_declare_update_rdonly(env, lo, mlc, th);
8899                 break;
8900         case LCM_FL_WRITE_PENDING:
8901                 rc = lod_declare_update_write_pending(env, lo, mlc, th);
8902                 break;
8903         case LCM_FL_SYNC_PENDING:
8904                 rc = lod_declare_update_sync_pending(env, lo, mlc, th);
8905                 break;
8906         default:
8907                 rc = -ENOTSUPP;
8908                 break;
8909         }
8910 out:
8911         RETURN(rc);
8912 }
8913
8914 /**
8915  * Instantiate layout component objects which covers the intent write offset.
8916  */
8917 static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
8918                              struct md_layout_change *mlc, struct thandle *th)
8919 {
8920         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
8921         struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
8922         struct lod_object *lo = lod_dt_obj(dt);
8923         int rc;
8924
8925         ENTRY;
8926
8927         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
8928                 LASSERT(dir_mlc_ops[mlc->mlc_opc]);
8929                 rc = dir_mlc_ops[mlc->mlc_opc](env, dt, mlc, th);
8930                 RETURN(rc);
8931         }
8932
8933         rc = lod_striped_create(env, dt, attr, NULL, th);
8934         if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
8935                 layout_attr->la_layout_version |= lo->ldo_layout_gen;
8936                 rc = lod_attr_set(env, dt, layout_attr, th);
8937         }
8938
8939         RETURN(rc);
8940 }
8941
/*
 * Object operation vector of the LOD layer: each dt_object_operations
 * method set here points at the matching lod_*() implementation in this
 * file.
 */
const struct dt_object_operations lod_obj_ops = {
	.do_read_lock		= lod_read_lock,
	.do_write_lock		= lod_write_lock,
	.do_read_unlock		= lod_read_unlock,
	.do_write_unlock	= lod_write_unlock,
	.do_write_locked	= lod_write_locked,
	.do_attr_get		= lod_attr_get,
	.do_declare_attr_set	= lod_declare_attr_set,
	.do_attr_set		= lod_attr_set,
	.do_xattr_get		= lod_xattr_get,
	.do_declare_xattr_set	= lod_declare_xattr_set,
	.do_xattr_set		= lod_xattr_set,
	.do_declare_xattr_del	= lod_declare_xattr_del,
	.do_xattr_del		= lod_xattr_del,
	.do_xattr_list		= lod_xattr_list,
	.do_ah_init		= lod_ah_init,
	.do_declare_create	= lod_declare_create,
	.do_create		= lod_create,
	.do_declare_destroy	= lod_declare_destroy,
	.do_destroy		= lod_destroy,
	.do_index_try		= lod_index_try,
	.do_declare_ref_add	= lod_declare_ref_add,
	.do_ref_add		= lod_ref_add,
	.do_declare_ref_del	= lod_declare_ref_del,
	.do_ref_del		= lod_ref_del,
	.do_object_sync		= lod_object_sync,
	.do_object_lock		= lod_object_lock,
	.do_object_unlock	= lod_object_unlock,
	.do_invalidate		= lod_invalidate,
	/* layout changes: declare phase, then execution */
	.do_declare_layout_change = lod_declare_layout_change,
	.do_layout_change	= lod_layout_change,
};
8974
8975 /**
8976  * Implementation of dt_body_operations::dbo_read.
8977  *
8978  * \see dt_body_operations::dbo_read() in the API description for details.
8979  */
8980 static ssize_t lod_read(const struct lu_env *env, struct dt_object *dt,
8981                         struct lu_buf *buf, loff_t *pos)
8982 {
8983         struct dt_object *next = dt_object_child(dt);
8984
8985         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
8986                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
8987         return next->do_body_ops->dbo_read(env, next, buf, pos);
8988 }
8989
8990 /**
8991  * Implementation of dt_body_operations::dbo_declare_write.
8992  *
8993  * \see dt_body_operations::dbo_declare_write() in the API description
8994  * for details.
8995  */
8996 static ssize_t lod_declare_write(const struct lu_env *env,
8997                                  struct dt_object *dt,
8998                                  const struct lu_buf *buf, loff_t pos,
8999                                  struct thandle *th)
9000 {
9001         return lod_sub_declare_write(env, dt_object_child(dt), buf, pos, th);
9002 }
9003
9004 /**
9005  * Implementation of dt_body_operations::dbo_write.
9006  *
9007  * \see dt_body_operations::dbo_write() in the API description for details.
9008  */
9009 static ssize_t lod_write(const struct lu_env *env, struct dt_object *dt,
9010                          const struct lu_buf *buf, loff_t *pos,
9011                          struct thandle *th)
9012 {
9013         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9014                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9015         return lod_sub_write(env, dt_object_child(dt), buf, pos, th);
9016 }
9017
9018 static int lod_declare_punch(const struct lu_env *env, struct dt_object *dt,
9019                              __u64 start, __u64 end, struct thandle *th)
9020 {
9021         if (dt_object_remote(dt))
9022                 return -ENOTSUPP;
9023
9024         return lod_sub_declare_punch(env, dt_object_child(dt), start, end, th);
9025 }
9026
9027 static int lod_punch(const struct lu_env *env, struct dt_object *dt,
9028                      __u64 start, __u64 end, struct thandle *th)
9029 {
9030         if (dt_object_remote(dt))
9031                 return -ENOTSUPP;
9032
9033         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
9034         return lod_sub_punch(env, dt_object_child(dt), start, end, th);
9035 }
9036
9037 /*
9038  * different type of files use the same body_ops because object may be created
9039  * in OUT, where there is no chance to set correct body_ops for each type, so
9040  * body_ops themselves will check file type inside, see lod_read/write/punch for
9041  * details.
9042  */
9043 static const struct dt_body_operations lod_body_ops = {
9044         .dbo_read               = lod_read,
9045         .dbo_declare_write      = lod_declare_write,
9046         .dbo_write              = lod_write,
9047         .dbo_declare_punch      = lod_declare_punch,
9048         .dbo_punch              = lod_punch,
9049 };
9050
9051 /**
9052  * Implementation of lu_object_operations::loo_object_init.
9053  *
9054  * The function determines the type and the index of the target device using
9055  * sequence of the object's FID. Then passes control down to the
9056  * corresponding device:
9057  *  OSD for the local objects, OSP for remote
9058  *
9059  * \see lu_object_operations::loo_object_init() in the API description
9060  * for details.
9061  */
9062 static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
9063                            const struct lu_object_conf *conf)
9064 {
9065         struct lod_device       *lod    = lu2lod_dev(lo->lo_dev);
9066         struct lu_device        *cdev   = NULL;
9067         struct lu_object        *cobj;
9068         struct lod_tgt_descs    *ltd    = NULL;
9069         struct lod_tgt_desc     *tgt;
9070         u32                      idx    = 0;
9071         int                      type   = LU_SEQ_RANGE_ANY;
9072         int                      rc;
9073         ENTRY;
9074
9075         rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
9076         if (rc != 0)
9077                 RETURN(rc);
9078
9079         if (type == LU_SEQ_RANGE_MDT &&
9080             idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
9081                 cdev = &lod->lod_child->dd_lu_dev;
9082         } else if (type == LU_SEQ_RANGE_MDT) {
9083                 ltd = &lod->lod_mdt_descs;
9084                 lod_getref(ltd);
9085         } else if (type == LU_SEQ_RANGE_OST) {
9086                 ltd = &lod->lod_ost_descs;
9087                 lod_getref(ltd);
9088         } else {
9089                 LBUG();
9090         }
9091
9092         if (ltd != NULL) {
9093                 if (ltd->ltd_tgts_size > idx &&
9094                     test_bit(idx, ltd->ltd_tgt_bitmap)) {
9095                         tgt = LTD_TGT(ltd, idx);
9096
9097                         LASSERT(tgt != NULL);
9098                         LASSERT(tgt->ltd_tgt != NULL);
9099
9100                         cdev = &(tgt->ltd_tgt->dd_lu_dev);
9101                 }
9102                 lod_putref(lod, ltd);
9103         }
9104
9105         if (unlikely(cdev == NULL))
9106                 RETURN(-ENOENT);
9107
9108         cobj = cdev->ld_ops->ldo_object_alloc(env, lo->lo_header, cdev);
9109         if (unlikely(cobj == NULL))
9110                 RETURN(-ENOMEM);
9111
9112         lu2lod_obj(lo)->ldo_obj.do_body_ops = &lod_body_ops;
9113
9114         lu_object_add(lo, cobj);
9115
9116         RETURN(0);
9117 }
9118
9119 /**
9120  *
9121  * Release resources associated with striping.
9122  *
9123  * If the object is striped (regular or directory), then release
9124  * the stripe objects references and free the ldo_stripe array.
9125  *
9126  * \param[in] env       execution environment
9127  * \param[in] lo        object
9128  */
9129 void lod_striping_free_nolock(const struct lu_env *env, struct lod_object *lo)
9130 {
9131         struct lod_layout_component *lod_comp;
9132         __u32 obj_attr = lo->ldo_obj.do_lu.lo_header->loh_attr;
9133         int i, j;
9134
9135         if (unlikely(lo->ldo_is_foreign)) {
9136                 if (S_ISREG(obj_attr)) {
9137                         lod_free_foreign_lov(lo);
9138                         lo->ldo_comp_cached = 0;
9139                 } else if (S_ISDIR(obj_attr)) {
9140                         lod_free_foreign_lmv(lo);
9141                         lo->ldo_dir_stripe_loaded = 0;
9142                 }
9143         } else if (lo->ldo_stripe != NULL) {
9144                 LASSERT(lo->ldo_comp_entries == NULL);
9145                 LASSERT(lo->ldo_dir_stripes_allocated > 0);
9146
9147                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9148                         if (lo->ldo_stripe[i])
9149                                 dt_object_put(env, lo->ldo_stripe[i]);
9150                 }
9151
9152                 j = sizeof(struct dt_object *) * lo->ldo_dir_stripes_allocated;
9153                 OBD_FREE(lo->ldo_stripe, j);
9154                 lo->ldo_stripe = NULL;
9155                 lo->ldo_dir_stripes_allocated = 0;
9156                 lo->ldo_dir_stripe_loaded = 0;
9157                 lo->ldo_dir_stripe_count = 0;
9158                 lo->ldo_obj.do_index_ops = NULL;
9159         } else if (lo->ldo_comp_entries != NULL) {
9160                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
9161                         /* free lod_layout_component::llc_stripe array */
9162                         lod_comp = &lo->ldo_comp_entries[i];
9163
9164                         if (lod_comp->llc_stripe == NULL)
9165                                 continue;
9166                         LASSERT(lod_comp->llc_stripes_allocated != 0);
9167                         for (j = 0; j < lod_comp->llc_stripes_allocated; j++) {
9168                                 if (lod_comp->llc_stripe[j] != NULL)
9169                                         lu_object_put(env,
9170                                                &lod_comp->llc_stripe[j]->do_lu);
9171                         }
9172                         OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
9173                                            lod_comp->llc_stripes_allocated);
9174                         lod_comp->llc_stripe = NULL;
9175                         OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
9176                                            lod_comp->llc_stripes_allocated);
9177                         lod_comp->llc_ost_indices = NULL;
9178                         lod_comp->llc_stripes_allocated = 0;
9179                 }
9180                 lod_free_comp_entries(lo);
9181                 lo->ldo_comp_cached = 0;
9182         }
9183 }
9184
9185 void lod_striping_free(const struct lu_env *env, struct lod_object *lo)
9186 {
9187         mutex_lock(&lo->ldo_layout_mutex);
9188         lod_striping_free_nolock(env, lo);
9189         mutex_unlock(&lo->ldo_layout_mutex);
9190 }
9191
9192 /**
9193  * Implementation of lu_object_operations::loo_object_free.
9194  *
9195  * \see lu_object_operations::loo_object_free() in the API description
9196  * for details.
9197  */
9198 static void lod_object_free(const struct lu_env *env, struct lu_object *o)
9199 {
9200         struct lod_object *lo = lu2lod_obj(o);
9201
9202         /* release all underlying object pinned */
9203         lod_striping_free(env, lo);
9204         lu_object_fini(o);
9205         /* lo doesn't contain a lu_object_header, so we don't need call_rcu */
9206         OBD_SLAB_FREE_PTR(lo, lod_object_kmem);
9207 }
9208
/**
 * Implementation of lu_object_operations::loo_object_release.
 *
 * Currently a no-op.
 *
 * \see lu_object_operations::loo_object_release() in the API description
 * for details.
 */
static void lod_object_release(const struct lu_env *env, struct lu_object *o)
{
        /*
         * XXX: should everything be released here in case the object
         * creation failed earlier?
         */
}
9220
9221 /**
9222  * Implementation of lu_object_operations::loo_object_print.
9223  *
9224  * \see lu_object_operations::loo_object_print() in the API description
9225  * for details.
9226  */
9227 static int lod_object_print(const struct lu_env *env, void *cookie,
9228                             lu_printer_t p, const struct lu_object *l)
9229 {
9230         struct lod_object *o = lu2lod_obj((struct lu_object *) l);
9231
9232         return (*p)(env, cookie, LUSTRE_LOD_NAME"-object@%p", o);
9233 }
9234
9235 const struct lu_object_operations lod_lu_obj_ops = {
9236         .loo_object_init        = lod_object_init,
9237         .loo_object_free        = lod_object_free,
9238         .loo_object_release     = lod_object_release,
9239         .loo_object_print       = lod_object_print,
9240 };