Whamcloud - gitweb
LU-15727 lod: honor append_pool with default composite layouts
[fs/lustre-release.git] / lustre / lod / lod_object.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright  2009 Sun Microsystems, Inc. All rights reserved
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * lustre/lod/lod_object.c
30  *
31  * This file contains implementations of methods for the OSD API
32  * for the Logical Object Device (LOD) layer, which provides a virtual
33  * local OSD object interface to the MDD layer, and abstracts the
34  * addressing of local (OSD) and remote (OSP) objects. The API is
35  * described in the file lustre/include/dt_object.h and in
36  * Documentation/osd-api.txt.
37  *
38  * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_MDS
42
43 #include <linux/random.h>
44
45 #include <obd.h>
46 #include <obd_class.h>
47 #include <obd_support.h>
48
49 #include <lustre_fid.h>
50 #include <lustre_linkea.h>
51 #include <lustre_lmv.h>
52 #include <uapi/linux/lustre/lustre_param.h>
53 #include <lustre_swab.h>
54 #include <uapi/linux/lustre/lustre_ver.h>
55 #include <lprocfs_status.h>
56 #include <md_object.h>
57
58 #include "lod_internal.h"
59
/* Entry name of a directory's reference to itself; compared against
 * lookup keys in lod_striped_lookup(). */
static const char dot[] = ".";
/* Entry name of a directory's reference to its parent; compared against
 * lookup keys in lod_striped_lookup(). */
static const char dotdot[] = "..";
62
63 /**
64  * Implementation of dt_index_operations::dio_lookup
65  *
66  * Used with regular (non-striped) objects.
67  *
68  * \see dt_index_operations::dio_lookup() in the API description for details.
69  */
70 static int lod_lookup(const struct lu_env *env, struct dt_object *dt,
71                       struct dt_rec *rec, const struct dt_key *key)
72 {
73         struct dt_object *next = dt_object_child(dt);
74         return next->do_index_ops->dio_lookup(env, next, rec, key);
75 }
76
/**
 * Implementation of dt_index_operations::dio_declare_insert.
 *
 * Used with regular (non-striped) objects: the declaration is passed to
 * lod_sub_declare_insert() for the child object of \a dt.
 *
 * \see dt_index_operations::dio_declare_insert() in the API description
 * for details.
 */
static int lod_declare_insert(const struct lu_env *env, struct dt_object *dt,
                              const struct dt_rec *rec,
                              const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_insert(env, child, rec, key, th);
}
91
/**
 * Implementation of dt_index_operations::dio_insert.
 *
 * Used with regular (non-striped) objects: the insertion is passed to
 * lod_sub_insert() for the child object of \a dt.
 *
 * \see dt_index_operations::dio_insert() in the API description for details.
 */
static int lod_insert(const struct lu_env *env, struct dt_object *dt,
                      const struct dt_rec *rec, const struct dt_key *key,
                      struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_insert(env, child, rec, key, th);
}
105
/**
 * Implementation of dt_index_operations::dio_declare_delete.
 *
 * Used with regular (non-striped) objects: the declaration is passed to
 * lod_sub_declare_delete() for the child object of \a dt.
 *
 * \see dt_index_operations::dio_declare_delete() in the API description
 * for details.
 */
static int lod_declare_delete(const struct lu_env *env, struct dt_object *dt,
                              const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_delete(env, child, key, th);
}
119
/**
 * Implementation of dt_index_operations::dio_delete.
 *
 * Used with regular (non-striped) objects: the deletion is passed to
 * lod_sub_delete() for the child object of \a dt.
 *
 * \see dt_index_operations::dio_delete() in the API description for details.
 */
static int lod_delete(const struct lu_env *env, struct dt_object *dt,
                      const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_delete(env, child, key, th);
}
132
133 /**
134  * Implementation of dt_it_ops::init.
135  *
136  * Used with regular (non-striped) objects.
137  *
138  * \see dt_it_ops::init() in the API description for details.
139  */
140 static struct dt_it *lod_it_init(const struct lu_env *env,
141                                  struct dt_object *dt, __u32 attr)
142 {
143         struct dt_object        *next = dt_object_child(dt);
144         struct lod_it           *it = &lod_env_info(env)->lti_it;
145         struct dt_it            *it_next;
146
147         it_next = next->do_index_ops->dio_it.init(env, next, attr);
148         if (IS_ERR(it_next))
149                 return it_next;
150
151         /* currently we do not use more than one iterator per thread
152          * so we store it in thread info. if at some point we need
153          * more active iterators in a single thread, we can allocate
154          * additional ones */
155         LASSERT(it->lit_obj == NULL);
156
157         it->lit_it = it_next;
158         it->lit_obj = next;
159
160         return (struct dt_it *)it;
161 }
162
/* Assert that a non-striped lod_it is in use, i.e. that both the object and
 * the underlying iterator have been set. The env argument is not used. */
#define LOD_CHECK_IT(env, iter)                                 \
do {                                                            \
        LASSERT((iter)->lit_obj != NULL);                       \
        LASSERT((iter)->lit_it != NULL);                        \
} while (0)
168
169 /**
170  * Implementation of dt_index_operations::dio_it.fini.
171  *
172  * Used with regular (non-striped) objects.
173  *
174  * \see dt_index_operations::dio_it.fini() in the API description for details.
175  */
176 static void lod_it_fini(const struct lu_env *env, struct dt_it *di)
177 {
178         struct lod_it *it = (struct lod_it *)di;
179
180         LOD_CHECK_IT(env, it);
181         it->lit_obj->do_index_ops->dio_it.fini(env, it->lit_it);
182
183         /* the iterator not in use any more */
184         it->lit_obj = NULL;
185         it->lit_it = NULL;
186 }
187
188 /**
189  * Implementation of dt_it_ops::get.
190  *
191  * Used with regular (non-striped) objects.
192  *
193  * \see dt_it_ops::get() in the API description for details.
194  */
195 static int lod_it_get(const struct lu_env *env, struct dt_it *di,
196                       const struct dt_key *key)
197 {
198         const struct lod_it *it = (const struct lod_it *)di;
199
200         LOD_CHECK_IT(env, it);
201         return it->lit_obj->do_index_ops->dio_it.get(env, it->lit_it, key);
202 }
203
204 /**
205  * Implementation of dt_it_ops::put.
206  *
207  * Used with regular (non-striped) objects.
208  *
209  * \see dt_it_ops::put() in the API description for details.
210  */
211 static void lod_it_put(const struct lu_env *env, struct dt_it *di)
212 {
213         struct lod_it *it = (struct lod_it *)di;
214
215         LOD_CHECK_IT(env, it);
216         return it->lit_obj->do_index_ops->dio_it.put(env, it->lit_it);
217 }
218
219 /**
220  * Implementation of dt_it_ops::next.
221  *
222  * Used with regular (non-striped) objects
223  *
224  * \see dt_it_ops::next() in the API description for details.
225  */
226 static int lod_it_next(const struct lu_env *env, struct dt_it *di)
227 {
228         struct lod_it *it = (struct lod_it *)di;
229
230         LOD_CHECK_IT(env, it);
231         return it->lit_obj->do_index_ops->dio_it.next(env, it->lit_it);
232 }
233
234 /**
235  * Implementation of dt_it_ops::key.
236  *
237  * Used with regular (non-striped) objects.
238  *
239  * \see dt_it_ops::key() in the API description for details.
240  */
241 static struct dt_key *lod_it_key(const struct lu_env *env,
242                                  const struct dt_it *di)
243 {
244         const struct lod_it *it = (const struct lod_it *)di;
245
246         LOD_CHECK_IT(env, it);
247         return it->lit_obj->do_index_ops->dio_it.key(env, it->lit_it);
248 }
249
250 /**
251  * Implementation of dt_it_ops::key_size.
252  *
253  * Used with regular (non-striped) objects.
254  *
255  * \see dt_it_ops::key_size() in the API description for details.
256  */
257 static int lod_it_key_size(const struct lu_env *env, const struct dt_it *di)
258 {
259         struct lod_it *it = (struct lod_it *)di;
260
261         LOD_CHECK_IT(env, it);
262         return it->lit_obj->do_index_ops->dio_it.key_size(env, it->lit_it);
263 }
264
265 /**
266  * Implementation of dt_it_ops::rec.
267  *
268  * Used with regular (non-striped) objects.
269  *
270  * \see dt_it_ops::rec() in the API description for details.
271  */
272 static int lod_it_rec(const struct lu_env *env, const struct dt_it *di,
273                       struct dt_rec *rec, __u32 attr)
274 {
275         const struct lod_it *it = (const struct lod_it *)di;
276
277         LOD_CHECK_IT(env, it);
278         return it->lit_obj->do_index_ops->dio_it.rec(env, it->lit_it, rec,
279                                                      attr);
280 }
281
282 /**
283  * Implementation of dt_it_ops::rec_size.
284  *
285  * Used with regular (non-striped) objects.
286  *
287  * \see dt_it_ops::rec_size() in the API description for details.
288  */
289 static int lod_it_rec_size(const struct lu_env *env, const struct dt_it *di,
290                            __u32 attr)
291 {
292         const struct lod_it *it = (const struct lod_it *)di;
293
294         LOD_CHECK_IT(env, it);
295         return it->lit_obj->do_index_ops->dio_it.rec_size(env, it->lit_it,
296                                                           attr);
297 }
298
299 /**
300  * Implementation of dt_it_ops::store.
301  *
302  * Used with regular (non-striped) objects.
303  *
304  * \see dt_it_ops::store() in the API description for details.
305  */
306 static __u64 lod_it_store(const struct lu_env *env, const struct dt_it *di)
307 {
308         const struct lod_it *it = (const struct lod_it *)di;
309
310         LOD_CHECK_IT(env, it);
311         return it->lit_obj->do_index_ops->dio_it.store(env, it->lit_it);
312 }
313
314 /**
315  * Implementation of dt_it_ops::load.
316  *
317  * Used with regular (non-striped) objects.
318  *
319  * \see dt_it_ops::load() in the API description for details.
320  */
321 static int lod_it_load(const struct lu_env *env, const struct dt_it *di,
322                        __u64 hash)
323 {
324         const struct lod_it *it = (const struct lod_it *)di;
325
326         LOD_CHECK_IT(env, it);
327         return it->lit_obj->do_index_ops->dio_it.load(env, it->lit_it, hash);
328 }
329
330 /**
331  * Implementation of dt_it_ops::key_rec.
332  *
333  * Used with regular (non-striped) objects.
334  *
335  * \see dt_it_ops::rec() in the API description for details.
336  */
337 static int lod_it_key_rec(const struct lu_env *env, const struct dt_it *di,
338                           void *key_rec)
339 {
340         const struct lod_it *it = (const struct lod_it *)di;
341
342         LOD_CHECK_IT(env, it);
343         return it->lit_obj->do_index_ops->dio_it.key_rec(env, it->lit_it,
344                                                          key_rec);
345 }
346
347 static const struct dt_index_operations lod_index_ops = {
348         .dio_lookup             = lod_lookup,
349         .dio_declare_insert     = lod_declare_insert,
350         .dio_insert             = lod_insert,
351         .dio_declare_delete     = lod_declare_delete,
352         .dio_delete             = lod_delete,
353         .dio_it = {
354                 .init           = lod_it_init,
355                 .fini           = lod_it_fini,
356                 .get            = lod_it_get,
357                 .put            = lod_it_put,
358                 .next           = lod_it_next,
359                 .key            = lod_it_key,
360                 .key_size       = lod_it_key_size,
361                 .rec            = lod_it_rec,
362                 .rec_size       = lod_it_rec_size,
363                 .store          = lod_it_store,
364                 .load           = lod_it_load,
365                 .key_rec        = lod_it_key_rec,
366         }
367 };
368
369 /**
370  * Implementation of dt_index_operations::dio_lookup
371  *
372  * Used with striped directories.
373  *
374  * \see dt_index_operations::dio_lookup() in the API description for details.
375  */
376 static int lod_striped_lookup(const struct lu_env *env, struct dt_object *dt,
377                       struct dt_rec *rec, const struct dt_key *key)
378 {
379         struct lod_object *lo = lod_dt_obj(dt);
380         struct dt_object *next;
381         const char *name = (const char *)key;
382
383         LASSERT(lo->ldo_dir_stripe_count > 0);
384
385         if (strcmp(name, dot) == 0) {
386                 struct lu_fid *fid = (struct lu_fid *)rec;
387
388                 *fid = *lod_object_fid(lo);
389                 return 1;
390         }
391
392         if (strcmp(name, dotdot) == 0) {
393                 next = dt_object_child(dt);
394         } else {
395                 int index;
396
397                 index = __lmv_name_to_stripe_index(lo->ldo_dir_hash_type,
398                                                    lo->ldo_dir_stripe_count,
399                                                    lo->ldo_dir_migrate_hash,
400                                                    lo->ldo_dir_migrate_offset,
401                                                    name, strlen(name), true);
402                 if (index < 0)
403                         return index;
404
405                 next = lo->ldo_stripe[index];
406                 if (!next || !dt_object_exists(next))
407                         return -ENODEV;
408         }
409
410         return next->do_index_ops->dio_lookup(env, next, rec, key);
411 }
412
413 /**
414  * Implementation of dt_it_ops::init.
415  *
416  * Used with striped objects. Internally just initializes the iterator
417  * on the first stripe.
418  *
419  * \see dt_it_ops::init() in the API description for details.
420  */
421 static struct dt_it *lod_striped_it_init(const struct lu_env *env,
422                                          struct dt_object *dt, __u32 attr)
423 {
424         struct lod_object *lo = lod_dt_obj(dt);
425         struct dt_object *next;
426         struct lod_it *it = &lod_env_info(env)->lti_it;
427         struct dt_it *it_next;
428         __u16 index = 0;
429
430         LASSERT(lo->ldo_dir_stripe_count > 0);
431
432         do {
433                 next = lo->ldo_stripe[index];
434                 if (next && dt_object_exists(next))
435                         break;
436         } while (++index < lo->ldo_dir_stripe_count);
437
438         /* no valid stripe */
439         if (!next || !dt_object_exists(next))
440                 return ERR_PTR(-ENODEV);
441
442         LASSERT(next->do_index_ops != NULL);
443
444         it_next = next->do_index_ops->dio_it.init(env, next, attr);
445         if (IS_ERR(it_next))
446                 return it_next;
447
448         /* currently we do not use more than one iterator per thread
449          * so we store it in thread info. if at some point we need
450          * more active iterators in a single thread, we can allocate
451          * additional ones */
452         LASSERT(it->lit_obj == NULL);
453
454         it->lit_stripe_index = index;
455         it->lit_attr = attr;
456         it->lit_it = it_next;
457         it->lit_obj = dt;
458
459         return (struct dt_it *)it;
460 }
461
/* Assert that a striped lod_it is in use and that its current stripe index
 * is within the stripe count of the directory. The env argument is not
 * used. */
#define LOD_CHECK_STRIPED_IT(env, iter, obj)                              \
do {                                                                      \
        LASSERT((iter)->lit_obj != NULL);                                 \
        LASSERT((iter)->lit_it != NULL);                                  \
        LASSERT((obj)->ldo_dir_stripe_count > 0);                         \
        LASSERT((iter)->lit_stripe_index < (obj)->ldo_dir_stripe_count);  \
} while (0)
469
470 /**
471  * Implementation of dt_it_ops::fini.
472  *
473  * Used with striped objects.
474  *
475  * \see dt_it_ops::fini() in the API description for details.
476  */
477 static void lod_striped_it_fini(const struct lu_env *env, struct dt_it *di)
478 {
479         struct lod_it           *it = (struct lod_it *)di;
480         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
481         struct dt_object        *next;
482
483         /* If lit_it == NULL, then it means the sub_it has been finished,
484          * which only happens in failure cases, see lod_striped_it_next() */
485         if (it->lit_it != NULL) {
486                 LOD_CHECK_STRIPED_IT(env, it, lo);
487
488                 next = lo->ldo_stripe[it->lit_stripe_index];
489                 if (next) {
490                         LASSERT(next->do_index_ops != NULL);
491                         next->do_index_ops->dio_it.fini(env, it->lit_it);
492                 }
493         }
494
495         /* the iterator not in use any more */
496         it->lit_obj = NULL;
497         it->lit_it = NULL;
498         it->lit_stripe_index = 0;
499 }
500
501 /**
502  * Implementation of dt_it_ops::get.
503  *
504  * Right now it's not used widely, only to reset the iterator to the
505  * initial position. It should be possible to implement a full version
506  * which chooses a correct stripe to be able to position with any key.
507  *
508  * \see dt_it_ops::get() in the API description for details.
509  */
510 static int lod_striped_it_get(const struct lu_env *env, struct dt_it *di,
511                               const struct dt_key *key)
512 {
513         const struct lod_it *it = (const struct lod_it *)di;
514         struct lod_object *lo = lod_dt_obj(it->lit_obj);
515         struct dt_object *next;
516
517         LOD_CHECK_STRIPED_IT(env, it, lo);
518
519         next = lo->ldo_stripe[it->lit_stripe_index];
520         LASSERT(next != NULL);
521         LASSERT(dt_object_exists(next));
522         LASSERT(next->do_index_ops != NULL);
523
524         return next->do_index_ops->dio_it.get(env, it->lit_it, key);
525 }
526
527 /**
528  * Implementation of dt_it_ops::put.
529  *
530  * Used with striped objects.
531  *
532  * \see dt_it_ops::put() in the API description for details.
533  */
534 static void lod_striped_it_put(const struct lu_env *env, struct dt_it *di)
535 {
536         struct lod_it *it = (struct lod_it *)di;
537         struct lod_object *lo = lod_dt_obj(it->lit_obj);
538         struct dt_object *next;
539
540         /*
541          * If lit_it == NULL, then it means the sub_it has been finished,
542          * which only happens in failure cases, see lod_striped_it_next()
543          */
544         if (!it->lit_it)
545                 return;
546
547         LOD_CHECK_STRIPED_IT(env, it, lo);
548
549         next = lo->ldo_stripe[it->lit_stripe_index];
550         LASSERT(next != NULL);
551         LASSERT(next->do_index_ops != NULL);
552
553         return next->do_index_ops->dio_it.put(env, it->lit_it);
554 }
555
556 /**
557  * Implementation of dt_it_ops::next.
558  *
559  * Used with striped objects. When the end of the current stripe is
560  * reached, the method takes the next stripe's iterator.
561  *
562  * \see dt_it_ops::next() in the API description for details.
563  */
564 static int lod_striped_it_next(const struct lu_env *env, struct dt_it *di)
565 {
566         struct lod_it *it = (struct lod_it *)di;
567         struct lod_object *lo = lod_dt_obj(it->lit_obj);
568         struct dt_object *next;
569         struct dt_it *it_next;
570         __u32 index;
571         int rc;
572
573         ENTRY;
574
575         LOD_CHECK_STRIPED_IT(env, it, lo);
576
577         next = lo->ldo_stripe[it->lit_stripe_index];
578         LASSERT(next != NULL);
579         LASSERT(dt_object_exists(next));
580         LASSERT(next->do_index_ops != NULL);
581 again:
582         rc = next->do_index_ops->dio_it.next(env, it->lit_it);
583         if (rc < 0)
584                 RETURN(rc);
585
586         if (rc == 0 && it->lit_stripe_index == 0)
587                 RETURN(rc);
588
589         if (rc == 0 && it->lit_stripe_index > 0) {
590                 struct lu_dirent *ent;
591
592                 ent = (struct lu_dirent *)lod_env_info(env)->lti_key;
593
594                 rc = next->do_index_ops->dio_it.rec(env, it->lit_it,
595                                                     (struct dt_rec *)ent,
596                                                     it->lit_attr);
597                 if (rc != 0)
598                         RETURN(rc);
599
600                 /* skip . and .. for slave stripe */
601                 if ((strncmp(ent->lde_name, ".",
602                              le16_to_cpu(ent->lde_namelen)) == 0 &&
603                      le16_to_cpu(ent->lde_namelen) == 1) ||
604                     (strncmp(ent->lde_name, "..",
605                              le16_to_cpu(ent->lde_namelen)) == 0 &&
606                      le16_to_cpu(ent->lde_namelen) == 2))
607                         goto again;
608
609                 RETURN(rc);
610         }
611
612         next->do_index_ops->dio_it.put(env, it->lit_it);
613         next->do_index_ops->dio_it.fini(env, it->lit_it);
614         it->lit_it = NULL;
615
616         /* go to next stripe */
617         index = it->lit_stripe_index;
618         while (++index < lo->ldo_dir_stripe_count) {
619                 next = lo->ldo_stripe[index];
620                 if (!next)
621                         continue;
622
623                 if (!dt_object_exists(next))
624                         continue;
625
626                 rc = next->do_ops->do_index_try(env, next,
627                                                 &dt_directory_features);
628                 if (rc != 0)
629                         RETURN(rc);
630
631                 LASSERT(next->do_index_ops != NULL);
632
633                 it_next = next->do_index_ops->dio_it.init(env, next,
634                                                           it->lit_attr);
635                 if (IS_ERR(it_next))
636                         RETURN(PTR_ERR(it_next));
637
638                 rc = next->do_index_ops->dio_it.get(env, it_next,
639                                                     (const struct dt_key *)"");
640                 if (rc <= 0)
641                         RETURN(rc == 0 ? -EIO : rc);
642
643                 it->lit_it = it_next;
644                 it->lit_stripe_index = index;
645                 goto again;
646
647         }
648
649         RETURN(1);
650 }
651
652 /**
653  * Implementation of dt_it_ops::key.
654  *
655  * Used with striped objects.
656  *
657  * \see dt_it_ops::key() in the API description for details.
658  */
659 static struct dt_key *lod_striped_it_key(const struct lu_env *env,
660                                          const struct dt_it *di)
661 {
662         const struct lod_it     *it = (const struct lod_it *)di;
663         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
664         struct dt_object        *next;
665
666         LOD_CHECK_STRIPED_IT(env, it, lo);
667
668         next = lo->ldo_stripe[it->lit_stripe_index];
669         LASSERT(next != NULL);
670         LASSERT(next->do_index_ops != NULL);
671
672         return next->do_index_ops->dio_it.key(env, it->lit_it);
673 }
674
675 /**
676  * Implementation of dt_it_ops::key_size.
677  *
678  * Used with striped objects.
679  *
680  * \see dt_it_ops::size() in the API description for details.
681  */
682 static int lod_striped_it_key_size(const struct lu_env *env,
683                                    const struct dt_it *di)
684 {
685         struct lod_it           *it = (struct lod_it *)di;
686         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
687         struct dt_object        *next;
688
689         LOD_CHECK_STRIPED_IT(env, it, lo);
690
691         next = lo->ldo_stripe[it->lit_stripe_index];
692         LASSERT(next != NULL);
693         LASSERT(next->do_index_ops != NULL);
694
695         return next->do_index_ops->dio_it.key_size(env, it->lit_it);
696 }
697
698 /**
699  * Implementation of dt_it_ops::rec.
700  *
701  * Used with striped objects.
702  *
703  * \see dt_it_ops::rec() in the API description for details.
704  */
705 static int lod_striped_it_rec(const struct lu_env *env, const struct dt_it *di,
706                               struct dt_rec *rec, __u32 attr)
707 {
708         const struct lod_it     *it = (const struct lod_it *)di;
709         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
710         struct dt_object        *next;
711
712         LOD_CHECK_STRIPED_IT(env, it, lo);
713
714         next = lo->ldo_stripe[it->lit_stripe_index];
715         LASSERT(next != NULL);
716         LASSERT(next->do_index_ops != NULL);
717
718         return next->do_index_ops->dio_it.rec(env, it->lit_it, rec, attr);
719 }
720
721 /**
722  * Implementation of dt_it_ops::rec_size.
723  *
724  * Used with striped objects.
725  *
726  * \see dt_it_ops::rec_size() in the API description for details.
727  */
728 static int lod_striped_it_rec_size(const struct lu_env *env,
729                                    const struct dt_it *di, __u32 attr)
730 {
731         struct lod_it           *it = (struct lod_it *)di;
732         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
733         struct dt_object        *next;
734
735         LOD_CHECK_STRIPED_IT(env, it, lo);
736
737         next = lo->ldo_stripe[it->lit_stripe_index];
738         LASSERT(next != NULL);
739         LASSERT(next->do_index_ops != NULL);
740
741         return next->do_index_ops->dio_it.rec_size(env, it->lit_it, attr);
742 }
743
744 /**
745  * Implementation of dt_it_ops::store.
746  *
747  * Used with striped objects.
748  *
749  * \see dt_it_ops::store() in the API description for details.
750  */
751 static __u64 lod_striped_it_store(const struct lu_env *env,
752                                   const struct dt_it *di)
753 {
754         const struct lod_it     *it = (const struct lod_it *)di;
755         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
756         struct dt_object        *next;
757
758         LOD_CHECK_STRIPED_IT(env, it, lo);
759
760         next = lo->ldo_stripe[it->lit_stripe_index];
761         LASSERT(next != NULL);
762         LASSERT(next->do_index_ops != NULL);
763
764         return next->do_index_ops->dio_it.store(env, it->lit_it);
765 }
766
767 /**
768  * Implementation of dt_it_ops::load.
769  *
770  * Used with striped objects.
771  *
772  * \see dt_it_ops::load() in the API description for details.
773  */
774 static int lod_striped_it_load(const struct lu_env *env,
775                                const struct dt_it *di, __u64 hash)
776 {
777         const struct lod_it     *it = (const struct lod_it *)di;
778         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
779         struct dt_object        *next;
780
781         LOD_CHECK_STRIPED_IT(env, it, lo);
782
783         next = lo->ldo_stripe[it->lit_stripe_index];
784         LASSERT(next != NULL);
785         LASSERT(next->do_index_ops != NULL);
786
787         return next->do_index_ops->dio_it.load(env, it->lit_it, hash);
788 }
789
790 static const struct dt_index_operations lod_striped_index_ops = {
791         .dio_lookup             = lod_striped_lookup,
792         .dio_declare_insert     = lod_declare_insert,
793         .dio_insert             = lod_insert,
794         .dio_declare_delete     = lod_declare_delete,
795         .dio_delete             = lod_delete,
796         .dio_it = {
797                 .init           = lod_striped_it_init,
798                 .fini           = lod_striped_it_fini,
799                 .get            = lod_striped_it_get,
800                 .put            = lod_striped_it_put,
801                 .next           = lod_striped_it_next,
802                 .key            = lod_striped_it_key,
803                 .key_size       = lod_striped_it_key_size,
804                 .rec            = lod_striped_it_rec,
805                 .rec_size       = lod_striped_it_rec_size,
806                 .store          = lod_striped_it_store,
807                 .load           = lod_striped_it_load,
808         }
809 };
810
811 /**
812  * Append the FID for each shard of the striped directory after the
813  * given LMV EA header.
814  *
815  * To simplify striped directory and the consistency verification,
816  * we only store the LMV EA header on disk, for both master object
817  * and slave objects. When someone wants to know the whole LMV EA,
818  * such as client readdir(), we can build the entire LMV EA on the
819  * MDT side (in RAM) via iterating the sub-directory entries that
820  * are contained in the master object of the stripe directory.
821  *
822  * For the master object of the striped directory, the valid name
823  * for each shard is composed of the ${shard_FID}:${shard_idx}.
824  *
825  * There may be holes in the LMV EA if some shards' name entries
826  * are corrupted or lost.
827  *
828  * \param[in] env       pointer to the thread context
829  * \param[in] lo        pointer to the master object of the striped directory
830  * \param[in] buf       pointer to the lu_buf which will hold the LMV EA
831  * \param[in] resize    whether re-allocate the buffer if it is not big enough
832  *
833  * \retval              positive size of the LMV EA
834  * \retval              0 for nothing to be loaded
835  * \retval              negative error number on failure
836  */
837 int lod_load_lmv_shards(const struct lu_env *env, struct lod_object *lo,
838                         struct lu_buf *buf, bool resize)
839 {
840         struct lu_dirent        *ent    =
841                         (struct lu_dirent *)lod_env_info(env)->lti_key;
842         struct lod_device       *lod    = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
843         struct dt_object        *obj    = dt_object_child(&lo->ldo_obj);
844         struct lmv_mds_md_v1    *lmv1   = buf->lb_buf;
845         struct dt_it            *it;
846         const struct dt_it_ops  *iops;
847         __u32                    stripes;
848         __u32                    magic  = le32_to_cpu(lmv1->lmv_magic);
849         size_t                   lmv1_size;
850         int                      rc;
851         ENTRY;
852
853         if (magic != LMV_MAGIC_V1)
854                 RETURN(0);
855
856         stripes = le32_to_cpu(lmv1->lmv_stripe_count);
857         if (stripes < 1)
858                 RETURN(0);
859
860         rc = lmv_mds_md_size(stripes, magic);
861         if (rc < 0)
862                 RETURN(rc);
863         lmv1_size = rc;
864         if (buf->lb_len < lmv1_size) {
865                 struct lu_buf tbuf;
866
867                 if (!resize)
868                         RETURN(-ERANGE);
869
870                 tbuf = *buf;
871                 buf->lb_buf = NULL;
872                 buf->lb_len = 0;
873                 lu_buf_alloc(buf, lmv1_size);
874                 lmv1 = buf->lb_buf;
875                 if (lmv1 == NULL)
876                         RETURN(-ENOMEM);
877
878                 memcpy(buf->lb_buf, tbuf.lb_buf, tbuf.lb_len);
879         }
880
881         if (unlikely(!dt_try_as_dir(env, obj)))
882                 RETURN(-ENOTDIR);
883
884         memset(&lmv1->lmv_stripe_fids[0], 0, stripes * sizeof(struct lu_fid));
885         iops = &obj->do_index_ops->dio_it;
886         it = iops->init(env, obj, LUDA_64BITHASH);
887         if (IS_ERR(it))
888                 RETURN(PTR_ERR(it));
889
890         rc = iops->load(env, it, 0);
891         if (rc == 0)
892                 rc = iops->next(env, it);
893         else if (rc > 0)
894                 rc = 0;
895
896         while (rc == 0) {
897                 char             name[FID_LEN + 2] = "";
898                 struct lu_fid    fid;
899                 __u32            index;
900                 int              len;
901
902                 rc = iops->rec(env, it, (struct dt_rec *)ent, LUDA_64BITHASH);
903                 if (rc != 0)
904                         break;
905
906                 rc = -EIO;
907
908                 fid_le_to_cpu(&fid, &ent->lde_fid);
909                 ent->lde_namelen = le16_to_cpu(ent->lde_namelen);
910                 if (ent->lde_name[0] == '.') {
911                         if (ent->lde_namelen == 1)
912                                 goto next;
913
914                         if (ent->lde_namelen == 2 && ent->lde_name[1] == '.')
915                                 goto next;
916                 }
917
918                 len = scnprintf(name, sizeof(name),
919                                 DFID":", PFID(&ent->lde_fid));
920                 /* The ent->lde_name is composed of ${FID}:${index} */
921                 if (ent->lde_namelen < len + 1 ||
922                     memcmp(ent->lde_name, name, len) != 0) {
923                         CDEBUG_LIMIT(lod->lod_lmv_failout ? D_ERROR : D_INFO,
924                                      "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
925                                      lod2obd(lod)->obd_name, ent->lde_namelen,
926                                      ent->lde_name, PFID(&fid),
927                                      PFID(lu_object_fid(&obj->do_lu)),
928                                      lod->lod_lmv_failout ? "failout" : "skip");
929
930                         if (lod->lod_lmv_failout)
931                                 break;
932
933                         goto next;
934                 }
935
936                 index = 0;
937                 do {
938                         if (ent->lde_name[len] < '0' ||
939                             ent->lde_name[len] > '9') {
940                                 CDEBUG_LIMIT(lod->lod_lmv_failout ?
941                                              D_ERROR : D_INFO,
942                                              "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
943                                              lod2obd(lod)->obd_name,
944                                              ent->lde_namelen,
945                                              ent->lde_name, PFID(&fid),
946                                              PFID(lu_object_fid(&obj->do_lu)),
947                                              lod->lod_lmv_failout ?
948                                              "failout" : "skip");
949
950                                 if (lod->lod_lmv_failout)
951                                         break;
952
953                                 goto next;
954                         }
955
956                         index = index * 10 + ent->lde_name[len++] - '0';
957                 } while (len < ent->lde_namelen);
958
959                 if (len == ent->lde_namelen) {
960                         /* Out of LMV EA range. */
961                         if (index >= stripes) {
962                                 CERROR("%s: the shard %.*s for the striped "
963                                        "directory "DFID" is out of the known "
964                                        "LMV EA range [0 - %u], failout\n",
965                                        lod2obd(lod)->obd_name, ent->lde_namelen,
966                                        ent->lde_name,
967                                        PFID(lu_object_fid(&obj->do_lu)),
968                                        stripes - 1);
969
970                                 break;
971                         }
972
973                         /* The slot has been occupied. */
974                         if (!fid_is_zero(&lmv1->lmv_stripe_fids[index])) {
975                                 struct lu_fid fid0;
976
977                                 fid_le_to_cpu(&fid0,
978                                         &lmv1->lmv_stripe_fids[index]);
979                                 CERROR("%s: both the shard "DFID" and "DFID
980                                        " for the striped directory "DFID
981                                        " claim the same LMV EA slot at the "
982                                        "index %d, failout\n",
983                                        lod2obd(lod)->obd_name,
984                                        PFID(&fid0), PFID(&fid),
985                                        PFID(lu_object_fid(&obj->do_lu)), index);
986
987                                 break;
988                         }
989
990                         /* stored as LE mode */
991                         lmv1->lmv_stripe_fids[index] = ent->lde_fid;
992
993 next:
994                         rc = iops->next(env, it);
995                 }
996         }
997
998         iops->put(env, it);
999         iops->fini(env, it);
1000
1001         RETURN(rc > 0 ? lmv_mds_md_size(stripes, magic) : rc);
1002 }
1003
1004 /**
1005  * Implementation of dt_object_operations::do_index_try.
1006  *
1007  * \see dt_object_operations::do_index_try() in the API description for details.
1008  */
1009 static int lod_index_try(const struct lu_env *env, struct dt_object *dt,
1010                          const struct dt_index_features *feat)
1011 {
1012         struct lod_object       *lo = lod_dt_obj(dt);
1013         struct dt_object        *next = dt_object_child(dt);
1014         int                     rc;
1015         ENTRY;
1016
1017         LASSERT(next->do_ops);
1018         LASSERT(next->do_ops->do_index_try);
1019
1020         rc = lod_striping_load(env, lo);
1021         if (rc != 0)
1022                 RETURN(rc);
1023
1024         rc = next->do_ops->do_index_try(env, next, feat);
1025         if (rc != 0)
1026                 RETURN(rc);
1027
1028         if (lo->ldo_dir_stripe_count > 0) {
1029                 int i;
1030
1031                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1032                         if (!lo->ldo_stripe[i])
1033                                 continue;
1034                         if (!dt_object_exists(lo->ldo_stripe[i]))
1035                                 continue;
1036                         rc = lo->ldo_stripe[i]->do_ops->do_index_try(env,
1037                                                 lo->ldo_stripe[i], feat);
1038                         if (rc != 0)
1039                                 RETURN(rc);
1040                 }
1041                 dt->do_index_ops = &lod_striped_index_ops;
1042         } else {
1043                 dt->do_index_ops = &lod_index_ops;
1044         }
1045
1046         RETURN(rc);
1047 }
1048
/**
 * Implementation of dt_object_operations::do_read_lock.
 *
 * The request is forwarded unchanged to the child object.
 *
 * \see dt_object_operations::do_read_lock() in the API description for details.
 */
static void lod_read_lock(const struct lu_env *env, struct dt_object *dt,
                          unsigned role)
{
        struct dt_object *child = dt_object_child(dt);

        dt_read_lock(env, child, role);
}
1059
/**
 * Implementation of dt_object_operations::do_write_lock.
 *
 * The request is forwarded unchanged to the child object.
 *
 * \see dt_object_operations::do_write_lock() in the API description for
 * details.
 */
static void lod_write_lock(const struct lu_env *env, struct dt_object *dt,
                           unsigned role)
{
        struct dt_object *child = dt_object_child(dt);

        dt_write_lock(env, child, role);
}
1071
/**
 * Implementation of dt_object_operations::do_read_unlock.
 *
 * The request is forwarded unchanged to the child object.
 *
 * \see dt_object_operations::do_read_unlock() in the API description for
 * details.
 */
static void lod_read_unlock(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        dt_read_unlock(env, child);
}
1082
/**
 * Implementation of dt_object_operations::do_write_unlock.
 *
 * The request is forwarded unchanged to the child object.
 *
 * \see dt_object_operations::do_write_unlock() in the API description for
 * details.
 */
static void lod_write_unlock(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        dt_write_unlock(env, child);
}
1093
/**
 * Implementation of dt_object_operations::do_write_locked.
 *
 * Returns whatever the child object reports.
 *
 * \see dt_object_operations::do_write_locked() in the API description for
 * details.
 */
static int lod_write_locked(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_write_locked(env, child);
}
1104
/**
 * Implementation of dt_object_operations::do_attr_get.
 *
 * Attributes are always read from the child (master) object only.
 *
 * \see dt_object_operations::do_attr_get() in the API description for details.
 */
static int lod_attr_get(const struct lu_env *env,
                        struct dt_object *dt,
                        struct lu_attr *attr)
{
        struct dt_object *child = dt_object_child(dt);

        /* For a striped directory the client merges the attributes of all
         * sub-stripes itself (see lmv_merge_attr()), and no MDD logic relies
         * on the directory nlink/size/time, so the master inode values are
         * sufficient here for now.
         */
        return dt_attr_get(env, child, attr);
}
1120
1121 void lod_adjust_stripe_size(struct lod_layout_component *comp,
1122                             __u32 def_stripe_size)
1123 {
1124         __u64 comp_end = comp->llc_extent.e_end;
1125
1126         /* Choose stripe size if not set. Note that default stripe size can't
1127          * be used as is, because it must be multiplier of given component end.
1128          *  - first check if default stripe size can be used
1129          *  - if not than select the lowest set bit from component end and use
1130          *    that value as stripe size
1131          */
1132         if (!comp->llc_stripe_size) {
1133                 if (comp_end == LUSTRE_EOF || !(comp_end % def_stripe_size))
1134                         comp->llc_stripe_size = def_stripe_size;
1135                 else
1136                         comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1137         } else {
1138                 if (comp_end != LUSTRE_EOF &&
1139                     comp_end & (LOV_MIN_STRIPE_SIZE - 1)) {
1140                         CWARN("Component end %llu is not a multiple of min size %u\n",
1141                               comp_end, LOV_MIN_STRIPE_SIZE);
1142                         comp_end = round_up(comp_end, LOV_MIN_STRIPE_SIZE);
1143                 }
1144                 /* check stripe size is multiplier of comp_end */
1145                 if (comp_end != LUSTRE_EOF &&
1146                     comp_end != comp->llc_extent.e_start &&
1147                     comp_end % comp->llc_stripe_size) {
1148                         /* fix that even for defined stripe size but warn
1149                          * about the problem, that must not happen
1150                          */
1151                         CWARN("Component end %llu is not aligned by the stripe size %u\n",
1152                               comp_end, comp->llc_stripe_size);
1153                         comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1154                 }
1155         }
1156 }
1157
1158 static inline void lod_adjust_stripe_info(struct lod_layout_component *comp,
1159                                           struct lov_desc *desc,
1160                                           int append_stripes)
1161 {
1162         if (comp->llc_pattern != LOV_PATTERN_MDT) {
1163                 if (append_stripes) {
1164                         comp->llc_stripe_count = append_stripes;
1165                 } else if (!comp->llc_stripe_count) {
1166                         comp->llc_stripe_count =
1167                                 desc->ld_default_stripe_count;
1168                 }
1169         }
1170
1171         lod_adjust_stripe_size(comp, desc->ld_default_stripe_size);
1172 }
1173
1174 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
1175                             struct thandle *th,
1176                             struct lod_obj_stripe_cb_data *data)
1177 {
1178         struct lod_layout_component *lod_comp;
1179         int i, j, rc = 0;
1180         ENTRY;
1181
1182         mutex_lock(&lo->ldo_layout_mutex);
1183         for (i = 0; i < lo->ldo_comp_cnt; i++) {
1184                 lod_comp = &lo->ldo_comp_entries[i];
1185
1186                 if (lod_comp->llc_stripe == NULL)
1187                         continue;
1188
1189                 /* has stripe but not inited yet, this component has been
1190                  * declared to be created, but hasn't created yet.
1191                  */
1192                 if (!lod_comp_inited(lod_comp))
1193                         continue;
1194
1195                 if (data->locd_comp_skip_cb &&
1196                     data->locd_comp_skip_cb(env, lo, i, data))
1197                         continue;
1198
1199                 if (data->locd_comp_cb) {
1200                         rc = data->locd_comp_cb(env, lo, i, data);
1201                         if (rc)
1202                                 GOTO(unlock, rc);
1203                 }
1204
1205                 /* could used just to do sth about component, not each
1206                  * stripes
1207                  */
1208                 if (!data->locd_stripe_cb)
1209                         continue;
1210
1211                 LASSERT(lod_comp->llc_stripe_count > 0);
1212                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
1213                         struct dt_object *dt = lod_comp->llc_stripe[j];
1214
1215                         if (dt == NULL)
1216                                 continue;
1217                         rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
1218                         if (rc != 0)
1219                                 GOTO(unlock, rc);
1220                 }
1221         }
1222 unlock:
1223         mutex_unlock(&lo->ldo_layout_mutex);
1224         RETURN(rc);
1225 }
1226
1227 static bool lod_obj_attr_set_comp_skip_cb(const struct lu_env *env,
1228                 struct lod_object *lo, int comp_idx,
1229                 struct lod_obj_stripe_cb_data *data)
1230 {
1231         struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
1232         bool skipped = false;
1233
1234         if (!(data->locd_attr->la_valid & LA_LAYOUT_VERSION))
1235                 return skipped;
1236
1237         switch (lo->ldo_flr_state) {
1238         case LCM_FL_WRITE_PENDING: {
1239                 int i;
1240
1241                 /* skip stale components */
1242                 if (lod_comp->llc_flags & LCME_FL_STALE) {
1243                         skipped = true;
1244                         break;
1245                 }
1246
1247                 /* skip valid and overlapping components, therefore any
1248                  * attempts to write overlapped components will never succeed
1249                  * because client will get EINPROGRESS. */
1250                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
1251                         if (i == comp_idx)
1252                                 continue;
1253
1254                         if (lo->ldo_comp_entries[i].llc_flags & LCME_FL_STALE)
1255                                 continue;
1256
1257                         if (lu_extent_is_overlapped(&lod_comp->llc_extent,
1258                                         &lo->ldo_comp_entries[i].llc_extent)) {
1259                                 skipped = true;
1260                                 break;
1261                         }
1262                 }
1263                 break;
1264         }
1265         case LCM_FL_RDONLY:
1266         case LCM_FL_SYNC_PENDING:
1267                 break;
1268         default:
1269                 LASSERTF(0, "impossible: %d\n", lo->ldo_flr_state);
1270                 break;
1271         }
1272
1273         CDEBUG(D_LAYOUT, DFID": %s to set component %x to version: %u\n",
1274                PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
1275                skipped ? "skipped" : "chose", lod_comp->llc_id,
1276                data->locd_attr->la_layout_version);
1277
1278         return skipped;
1279 }
1280
1281 static inline int
1282 lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
1283                            struct dt_object *dt, struct thandle *th,
1284                            int comp_idx, int stripe_idx,
1285                            struct lod_obj_stripe_cb_data *data)
1286 {
1287         if (data->locd_declare)
1288                 return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
1289
1290         if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
1291                 CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
1292                        PFID(lu_object_fid(&dt->do_lu)),
1293                        data->locd_attr->la_layout_version, comp_idx);
1294         }
1295
1296         return lod_sub_attr_set(env, dt, data->locd_attr, th);
1297 }
1298
1299 /**
1300  * Implementation of dt_object_operations::do_declare_attr_set.
1301  *
1302  * If the object is striped, then apply the changes to all the stripes.
1303  *
1304  * \see dt_object_operations::do_declare_attr_set() in the API description
1305  * for details.
1306  */
1307 static int lod_declare_attr_set(const struct lu_env *env,
1308                                 struct dt_object *dt,
1309                                 const struct lu_attr *attr,
1310                                 struct thandle *th)
1311 {
1312         struct dt_object  *next = dt_object_child(dt);
1313         struct lod_object *lo = lod_dt_obj(dt);
1314         int                rc, i;
1315         ENTRY;
1316
1317         /*
1318          * declare setattr on the local object
1319          */
1320         rc = lod_sub_declare_attr_set(env, next, attr, th);
1321         if (rc)
1322                 RETURN(rc);
1323
1324         /* osp_declare_attr_set() ignores all attributes other than
1325          * UID, GID, PROJID, and size, and osp_attr_set() ignores all
1326          * but UID, GID and PROJID. Declaration of size attr setting
1327          * happens through lod_declare_init_size(), and not through
1328          * this function. Therefore we need not load striping unless
1329          * ownership is changing.  This should save memory and (we hope)
1330          * speed up rename().
1331          */
1332         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1333                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1334                         RETURN(rc);
1335
1336                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1337                         RETURN(0);
1338         } else {
1339                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID | LA_MODE |
1340                                         LA_ATIME | LA_MTIME | LA_CTIME |
1341                                         LA_FLAGS)))
1342                         RETURN(rc);
1343         }
1344         /*
1345          * load striping information, notice we don't do this when object
1346          * is being initialized as we don't need this information till
1347          * few specific cases like destroy, chown
1348          */
1349         rc = lod_striping_load(env, lo);
1350         if (rc)
1351                 RETURN(rc);
1352
1353         if (!lod_obj_is_striped(dt))
1354                 RETURN(0);
1355
1356         /*
1357          * if object is striped declare changes on the stripes
1358          */
1359         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1360                 LASSERT(lo->ldo_stripe);
1361                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1362                         if (lo->ldo_stripe[i] == NULL)
1363                                 continue;
1364                         if (!dt_object_exists(lo->ldo_stripe[i]))
1365                                 continue;
1366                         rc = lod_sub_declare_attr_set(env, lo->ldo_stripe[i],
1367                                                       attr, th);
1368                         if (rc != 0)
1369                                 RETURN(rc);
1370                 }
1371         } else {
1372                 struct lod_obj_stripe_cb_data data = { { 0 } };
1373
1374                 data.locd_attr = attr;
1375                 data.locd_declare = true;
1376                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1377                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1378         }
1379
1380         if (rc)
1381                 RETURN(rc);
1382
1383         if (!dt_object_exists(next) || dt_object_remote(next) ||
1384             !S_ISREG(attr->la_mode))
1385                 RETURN(0);
1386
1387         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1388                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
1389                 RETURN(rc);
1390         }
1391
1392         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) ||
1393             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1394                 struct lod_thread_info *info = lod_env_info(env);
1395                 struct lu_buf *buf = &info->lti_buf;
1396
1397                 buf->lb_buf = info->lti_ea_store;
1398                 buf->lb_len = info->lti_ea_store_size;
1399                 rc = lod_sub_declare_xattr_set(env, next, buf, XATTR_NAME_LOV,
1400                                                LU_XATTR_REPLACE, th);
1401         }
1402
1403         RETURN(rc);
1404 }
1405
1406 /**
1407  * Implementation of dt_object_operations::do_attr_set.
1408  *
1409  * If the object is striped, then apply the changes to all or subset of
1410  * the stripes depending on the object type and specific attributes.
1411  *
1412  * \see dt_object_operations::do_attr_set() in the API description for details.
1413  */
1414 static int lod_attr_set(const struct lu_env *env,
1415                         struct dt_object *dt,
1416                         const struct lu_attr *attr,
1417                         struct thandle *th)
1418 {
1419         struct dt_object        *next = dt_object_child(dt);
1420         struct lod_object       *lo = lod_dt_obj(dt);
1421         int                     rc, i;
1422         ENTRY;
1423
1424         /*
1425          * apply changes to the local object
1426          */
1427         rc = lod_sub_attr_set(env, next, attr, th);
1428         if (rc)
1429                 RETURN(rc);
1430
1431         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1432                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1433                         RETURN(rc);
1434
1435                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1436                         RETURN(0);
1437         } else {
1438                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_MODE | LA_PROJID |
1439                                         LA_ATIME | LA_MTIME | LA_CTIME |
1440                                         LA_FLAGS)))
1441                         RETURN(rc);
1442         }
1443
1444         /* FIXME: a tricky case in the code path of mdd_layout_change():
1445          * the in-memory striping information has been freed in lod_xattr_set()
1446          * due to layout change. It has to load stripe here again. It only
1447          * changes flags of layout so declare_attr_set() is still accurate */
1448         rc = lod_striping_load(env, lo);
1449         if (rc)
1450                 RETURN(rc);
1451
1452         if (!lod_obj_is_striped(dt))
1453                 RETURN(0);
1454
1455         /*
1456          * if object is striped, apply changes to all the stripes
1457          */
1458         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1459                 LASSERT(lo->ldo_stripe);
1460                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1461                         if (unlikely(lo->ldo_stripe[i] == NULL))
1462                                 continue;
1463
1464                         if ((dt_object_exists(lo->ldo_stripe[i]) == 0))
1465                                 continue;
1466
1467                         rc = lod_sub_attr_set(env, lo->ldo_stripe[i], attr, th);
1468                         if (rc != 0)
1469                                 break;
1470                 }
1471         } else {
1472                 struct lod_obj_stripe_cb_data data = { { 0 } };
1473
1474                 data.locd_attr = attr;
1475                 data.locd_declare = false;
1476                 data.locd_comp_skip_cb = lod_obj_attr_set_comp_skip_cb;
1477                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1478                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1479         }
1480
1481         if (rc)
1482                 RETURN(rc);
1483
1484         if (!dt_object_exists(next) || dt_object_remote(next) ||
1485             !S_ISREG(attr->la_mode))
1486                 RETURN(0);
1487
1488         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1489                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
1490                 RETURN(rc);
1491         }
1492
1493         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE)) {
1494                 struct lod_thread_info *info = lod_env_info(env);
1495                 struct lu_buf *buf = &info->lti_buf;
1496                 struct ost_id *oi = &info->lti_ostid;
1497                 struct lu_fid *fid = &info->lti_fid;
1498                 struct lov_mds_md_v1 *lmm;
1499                 struct lov_ost_data_v1 *objs;
1500                 __u32 magic;
1501
1502                 rc = lod_get_lov_ea(env, lo);
1503                 if (rc <= 0)
1504                         RETURN(rc);
1505
1506                 buf->lb_buf = info->lti_ea_store;
1507                 buf->lb_len = info->lti_ea_store_size;
1508                 lmm = info->lti_ea_store;
1509                 magic = le32_to_cpu(lmm->lmm_magic);
1510                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
1511                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1512                         struct lov_comp_md_entry_v1 *lcme =
1513                                                 &lcm->lcm_entries[0];
1514
1515                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
1516                         magic = le32_to_cpu(lmm->lmm_magic);
1517                 }
1518
1519                 if (magic == LOV_MAGIC_V1)
1520                         objs = &(lmm->lmm_objects[0]);
1521                 else
1522                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1523                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
1524                 ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
1525                 fid->f_oid--;
1526                 fid_to_ostid(fid, oi);
1527                 ostid_cpu_to_le(oi, &objs->l_ost_oi);
1528
1529                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1530                                        LU_XATTR_REPLACE, th);
1531         } else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1532                 struct lod_thread_info *info = lod_env_info(env);
1533                 struct lu_buf *buf = &info->lti_buf;
1534                 struct lov_comp_md_v1 *lcm;
1535                 struct lov_comp_md_entry_v1 *lcme;
1536
1537                 rc = lod_get_lov_ea(env, lo);
1538                 if (rc <= 0)
1539                         RETURN(rc);
1540
1541                 buf->lb_buf = info->lti_ea_store;
1542                 buf->lb_len = info->lti_ea_store_size;
1543                 lcm = buf->lb_buf;
1544                 if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1 &&
1545                     le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_SEL)
1546                         RETURN(-EINVAL);
1547
1548                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1549                 lcme = &lcm->lcm_entries[0];
1550                 le64_add_cpu(&lcme->lcme_extent.e_start, 1);
1551                 le64_add_cpu(&lcme->lcme_extent.e_end, -1);
1552
1553                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1554                                        LU_XATTR_REPLACE, th);
1555         }
1556
1557         RETURN(rc);
1558 }
1559
1560 /**
1561  * Implementation of dt_object_operations::do_xattr_get.
1562  *
1563  * If LOV EA is requested from the root object and it's not
1564  * found, then return default striping for the filesystem.
1565  *
1566  * \see dt_object_operations::do_xattr_get() in the API description for details.
1567  */
1568 static int lod_xattr_get(const struct lu_env *env, struct dt_object *dt,
1569                          struct lu_buf *buf, const char *name)
1570 {
1571         struct lod_thread_info *info = lod_env_info(env);
1572         struct lod_device *dev = lu2lod_dev(dt->do_lu.lo_dev);
1573         int is_root;
1574         int rc;
1575         ENTRY;
1576
1577         rc = dt_xattr_get(env, dt_object_child(dt), buf, name);
1578         if (strcmp(name, XATTR_NAME_LMV) == 0) {
1579                 struct lmv_mds_md_v1    *lmv1;
1580                 struct lmv_foreign_md   *lfm;
1581                 int                      rc1 = 0;
1582
1583                 if (rc > (typeof(rc))sizeof(*lmv1))
1584                         RETURN(rc);
1585
1586                 /* short (<= sizeof(struct lmv_mds_md_v1)) foreign LMV case */
1587                 /* XXX empty foreign LMV is not allowed */
1588                 if (rc <= offsetof(typeof(*lfm), lfm_value))
1589                         RETURN(rc = rc > 0 ? -EINVAL : rc);
1590
1591                 if (buf->lb_buf == NULL || buf->lb_len == 0) {
1592                         BUILD_BUG_ON(sizeof(*lmv1) > sizeof(info->lti_key));
1593
1594                         /* lti_buf is large enough for *lmv1 or a short
1595                          * (<= sizeof(struct lmv_mds_md_v1)) foreign LMV
1596                          */
1597                         info->lti_buf.lb_buf = info->lti_key;
1598                         info->lti_buf.lb_len = sizeof(*lmv1);
1599                         rc = dt_xattr_get(env, dt_object_child(dt),
1600                                           &info->lti_buf, name);
1601                         if (unlikely(rc <= offsetof(typeof(*lfm),
1602                                                     lfm_value)))
1603                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1604
1605                         lfm = info->lti_buf.lb_buf;
1606                         if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN)
1607                                 RETURN(rc);
1608
1609                         if (unlikely(rc != sizeof(*lmv1)))
1610                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1611
1612                         lmv1 = info->lti_buf.lb_buf;
1613                         /* The on-disk LMV EA only contains header, but the
1614                          * returned LMV EA size should contain the space for
1615                          * the FIDs of all shards of the striped directory. */
1616                         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_V1)
1617                                 rc = lmv_mds_md_size(
1618                                         le32_to_cpu(lmv1->lmv_stripe_count),
1619                                         le32_to_cpu(lmv1->lmv_magic));
1620                 } else {
1621                         lmv1 = buf->lb_buf;
1622                         if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
1623                                 RETURN(rc);
1624
1625                         if (rc != sizeof(*lmv1))
1626                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1627
1628                         rc1 = lod_load_lmv_shards(env, lod_dt_obj(dt),
1629                                                   buf, false);
1630                 }
1631
1632                 RETURN(rc = rc1 != 0 ? rc1 : rc);
1633         }
1634
1635         if ((rc > 0) && buf->lb_buf && strcmp(name, XATTR_NAME_LOV) == 0) {
1636                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
1637
1638                 if (lcm->lcm_magic == cpu_to_le32(LOV_MAGIC_SEL))
1639                         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1640         }
1641
1642         if (rc != -ENODATA || !S_ISDIR(dt->do_lu.lo_header->loh_attr & S_IFMT))
1643                 RETURN(rc);
1644
1645         /*
1646          * XXX: Only used by lfsck
1647          *
1648          * lod returns default striping on the real root of the device
1649          * this is like the root stores default striping for the whole
1650          * filesystem. historically we've been using a different approach
1651          * and store it in the config.
1652          */
1653         dt_root_get(env, dev->lod_child, &info->lti_fid);
1654         is_root = lu_fid_eq(&info->lti_fid, lu_object_fid(&dt->do_lu));
1655
1656         if (is_root && strcmp(XATTR_NAME_LOV, name) == 0) {
1657                 struct lov_user_md *lum = buf->lb_buf;
1658                 struct lov_desc *desc = &dev->lod_ost_descs.ltd_lov_desc;
1659
1660                 if (buf->lb_buf == NULL) {
1661                         rc = sizeof(*lum);
1662                 } else if (buf->lb_len >= sizeof(*lum)) {
1663                         lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
1664                         lmm_oi_set_seq(&lum->lmm_oi, FID_SEQ_LOV_DEFAULT);
1665                         lmm_oi_set_id(&lum->lmm_oi, 0);
1666                         lmm_oi_cpu_to_le(&lum->lmm_oi, &lum->lmm_oi);
1667                         lum->lmm_pattern = cpu_to_le32(desc->ld_pattern);
1668                         lum->lmm_stripe_size = cpu_to_le32(
1669                                                 desc->ld_default_stripe_size);
1670                         lum->lmm_stripe_count = cpu_to_le16(
1671                                                 desc->ld_default_stripe_count);
1672                         lum->lmm_stripe_offset = cpu_to_le16(
1673                                                 desc->ld_default_stripe_offset);
1674                         rc = sizeof(*lum);
1675                 } else {
1676                         rc = -ERANGE;
1677                 }
1678         }
1679
1680         RETURN(rc);
1681 }
1682
1683 /**
1684  * Verify LVM EA.
1685  *
1686  * Checks that the magic of the stripe is sane.
1687  *
1688  * \param[in] lod       lod device
1689  * \param[in] lum       a buffer storing LMV EA to verify
1690  *
1691  * \retval              0 if the EA is sane
1692  * \retval              negative otherwise
1693  */
1694 static int lod_verify_md_striping(struct lod_device *lod,
1695                                   const struct lmv_user_md_v1 *lum)
1696 {
1697         if (unlikely(le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC)) {
1698                 CERROR("%s: invalid lmv_user_md: magic = %x, "
1699                        "stripe_offset = %d, stripe_count = %u: rc = %d\n",
1700                        lod2obd(lod)->obd_name, le32_to_cpu(lum->lum_magic),
1701                        (int)le32_to_cpu(lum->lum_stripe_offset),
1702                        le32_to_cpu(lum->lum_stripe_count), -EINVAL);
1703                 return -EINVAL;
1704         }
1705
1706         return 0;
1707 }
1708
1709 /**
1710  * Initialize LMV EA for a slave.
1711  *
1712  * Initialize slave's LMV EA from the master's LMV EA.
1713  *
1714  * \param[in] master_lmv        a buffer containing master's EA
1715  * \param[out] slave_lmv        a buffer where slave's EA will be stored
1716  *
1717  */
1718 static void lod_prep_slave_lmv_md(struct lmv_mds_md_v1 *slave_lmv,
1719                                   const struct lmv_mds_md_v1 *master_lmv)
1720 {
1721         *slave_lmv = *master_lmv;
1722         slave_lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
1723 }
1724
1725 /**
1726  * Generate LMV EA.
1727  *
1728  * Generate LMV EA from the object passed as \a dt. The object must have
1729  * the stripes created and initialized.
1730  *
1731  * \param[in] env       execution environment
1732  * \param[in] dt        object
1733  * \param[out] lmv_buf  buffer storing generated LMV EA
1734  *
1735  * \retval              0 on success
1736  * \retval              negative if failed
1737  */
1738 static int lod_prep_lmv_md(const struct lu_env *env, struct dt_object *dt,
1739                            struct lu_buf *lmv_buf)
1740 {
1741         struct lod_thread_info  *info = lod_env_info(env);
1742         struct lod_device       *lod = lu2lod_dev(dt->do_lu.lo_dev);
1743         struct lod_object       *lo = lod_dt_obj(dt);
1744         struct lmv_mds_md_v1    *lmm1;
1745         int                     stripe_count;
1746         int                     type = LU_SEQ_RANGE_ANY;
1747         int                     rc;
1748         __u32                   mdtidx;
1749         ENTRY;
1750
1751         LASSERT(lo->ldo_dir_striped != 0);
1752         LASSERT(lo->ldo_dir_stripe_count > 0);
1753         stripe_count = lo->ldo_dir_stripe_count;
1754         /* Only store the LMV EA heahder on the disk. */
1755         if (info->lti_ea_store_size < sizeof(*lmm1)) {
1756                 rc = lod_ea_store_resize(info, sizeof(*lmm1));
1757                 if (rc != 0)
1758                         RETURN(rc);
1759         } else {
1760                 memset(info->lti_ea_store, 0, sizeof(*lmm1));
1761         }
1762
1763         lmm1 = (struct lmv_mds_md_v1 *)info->lti_ea_store;
1764         memset(lmm1, 0, sizeof(*lmm1));
1765         lmm1->lmv_magic = cpu_to_le32(LMV_MAGIC);
1766         lmm1->lmv_stripe_count = cpu_to_le32(stripe_count);
1767         lmm1->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type);
1768         lmm1->lmv_layout_version = cpu_to_le32(lo->ldo_dir_layout_version);
1769         if (lod_is_layout_changing(lo)) {
1770                 lmm1->lmv_migrate_hash = cpu_to_le32(lo->ldo_dir_migrate_hash);
1771                 lmm1->lmv_migrate_offset =
1772                         cpu_to_le32(lo->ldo_dir_migrate_offset);
1773         }
1774         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu),
1775                             &mdtidx, &type);
1776         if (rc != 0)
1777                 RETURN(rc);
1778
1779         lmm1->lmv_master_mdt_index = cpu_to_le32(mdtidx);
1780         lmv_buf->lb_buf = info->lti_ea_store;
1781         lmv_buf->lb_len = sizeof(*lmm1);
1782
1783         RETURN(rc);
1784 }
1785
1786 /**
1787  * Create in-core represenation for a striped directory.
1788  *
1789  * Parse the buffer containing LMV EA and instantiate LU objects
1790  * representing the stripe objects. The pointers to the objects are
1791  * stored in ldo_stripe field of \a lo. This function is used when
1792  * we need to access an already created object (i.e. load from a disk).
1793  *
1794  * \param[in] env       execution environment
1795  * \param[in] lo        lod object
1796  * \param[in] buf       buffer containing LMV EA
1797  *
1798  * \retval              0 on success
1799  * \retval              negative if failed
1800  */
1801 int lod_parse_dir_striping(const struct lu_env *env, struct lod_object *lo,
1802                            const struct lu_buf *buf)
1803 {
1804         struct lod_thread_info  *info = lod_env_info(env);
1805         struct lod_device       *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1806         struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
1807         struct dt_object        **stripe;
1808         union lmv_mds_md        *lmm = buf->lb_buf;
1809         struct lmv_mds_md_v1    *lmv1 = &lmm->lmv_md_v1;
1810         struct lu_fid           *fid = &info->lti_fid;
1811         unsigned int            i;
1812         int                     rc = 0;
1813         ENTRY;
1814
1815         LASSERT(mutex_is_locked(&lo->ldo_layout_mutex));
1816
1817         /* XXX may be useless as not called for foreign LMV ?? */
1818         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_FOREIGN)
1819                 RETURN(0);
1820
1821         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_STRIPE) {
1822                 lo->ldo_dir_slave_stripe = 1;
1823                 RETURN(0);
1824         }
1825
1826         if (!lmv_is_sane(lmv1))
1827                 RETURN(-EINVAL);
1828
1829         LASSERT(lo->ldo_stripe == NULL);
1830         OBD_ALLOC_PTR_ARRAY(stripe, le32_to_cpu(lmv1->lmv_stripe_count));
1831         if (stripe == NULL)
1832                 RETURN(-ENOMEM);
1833
1834         for (i = 0; i < le32_to_cpu(lmv1->lmv_stripe_count); i++) {
1835                 struct dt_device        *tgt_dt;
1836                 struct dt_object        *dto;
1837                 int                     type = LU_SEQ_RANGE_ANY;
1838                 __u32                   idx;
1839
1840                 fid_le_to_cpu(fid, &lmv1->lmv_stripe_fids[i]);
1841                 if (!fid_is_sane(fid)) {
1842                         stripe[i] = NULL;
1843                         continue;
1844                 }
1845
1846                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
1847                 if (rc != 0)
1848                         GOTO(out, rc);
1849
1850                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
1851                         tgt_dt = lod->lod_child;
1852                 } else {
1853                         struct lod_tgt_desc     *tgt;
1854
1855                         tgt = LTD_TGT(ltd, idx);
1856                         if (tgt == NULL)
1857                                 GOTO(out, rc = -ESTALE);
1858                         tgt_dt = tgt->ltd_tgt;
1859                 }
1860
1861                 dto = dt_locate_at(env, tgt_dt, fid,
1862                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1863                                   NULL);
1864                 if (IS_ERR(dto))
1865                         GOTO(out, rc = PTR_ERR(dto));
1866
1867                 stripe[i] = dto;
1868         }
1869 out:
1870         lo->ldo_stripe = stripe;
1871         lo->ldo_is_foreign = 0;
1872         lo->ldo_dir_stripe_count = le32_to_cpu(lmv1->lmv_stripe_count);
1873         lo->ldo_dir_stripes_allocated = le32_to_cpu(lmv1->lmv_stripe_count);
1874         lo->ldo_dir_layout_version = le32_to_cpu(lmv1->lmv_layout_version);
1875         lo->ldo_dir_migrate_offset = le32_to_cpu(lmv1->lmv_migrate_offset);
1876         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv1->lmv_migrate_hash);
1877         lo->ldo_dir_hash_type = le32_to_cpu(lmv1->lmv_hash_type);
1878         if (rc != 0)
1879                 lod_striping_free_nolock(env, lo);
1880
1881         RETURN(rc);
1882 }
1883
1884 /**
1885  * Declare create a striped directory.
1886  *
1887  * Declare creating a striped directory with a given stripe pattern on the
1888  * specified MDTs. A striped directory is represented as a regular directory
1889  * - an index listing all the stripes. The stripes point back to the master
1890  * object with ".." and LinkEA. The master object gets LMV EA which
1891  * identifies it as a striped directory. The function allocates FIDs
1892  * for all stripes.
1893  *
1894  * \param[in] env       execution environment
1895  * \param[in] dt        object
1896  * \param[in] attr      attributes to initialize the objects with
1897  * \param[in] dof       type of objects to be created
1898  * \param[in] th        transaction handle
1899  *
1900  * \retval              0 on success
1901  * \retval              negative if failed
1902  */
1903 static int lod_dir_declare_create_stripes(const struct lu_env *env,
1904                                           struct dt_object *dt,
1905                                           struct lu_attr *attr,
1906                                           struct dt_object_format *dof,
1907                                           struct thandle *th)
1908 {
1909         struct lod_thread_info  *info = lod_env_info(env);
1910         struct lu_buf           lmv_buf;
1911         struct lu_buf           slave_lmv_buf;
1912         struct lmv_mds_md_v1    *lmm;
1913         struct lmv_mds_md_v1    *slave_lmm = NULL;
1914         struct dt_insert_rec    *rec = &info->lti_dt_rec;
1915         struct lod_object       *lo = lod_dt_obj(dt);
1916         int                     rc;
1917         __u32                   i;
1918         ENTRY;
1919
1920         rc = lod_prep_lmv_md(env, dt, &lmv_buf);
1921         if (rc != 0)
1922                 GOTO(out, rc);
1923         lmm = lmv_buf.lb_buf;
1924
1925         OBD_ALLOC_PTR(slave_lmm);
1926         if (slave_lmm == NULL)
1927                 GOTO(out, rc = -ENOMEM);
1928
1929         lod_prep_slave_lmv_md(slave_lmm, lmm);
1930         slave_lmv_buf.lb_buf = slave_lmm;
1931         slave_lmv_buf.lb_len = sizeof(*slave_lmm);
1932
1933         if (!dt_try_as_dir(env, dt_object_child(dt)))
1934                 GOTO(out, rc = -EINVAL);
1935
1936         rec->rec_type = S_IFDIR;
1937         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1938                 struct dt_object        *dto = lo->ldo_stripe[i];
1939                 char                    *stripe_name = info->lti_key;
1940                 struct lu_name          *sname;
1941                 struct linkea_data       ldata          = { NULL };
1942                 struct lu_buf           linkea_buf;
1943
1944                 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
1945                 if (!dto)
1946                         continue;
1947
1948                 /* directory split skip create for existing stripes */
1949                 if (!(lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
1950                         rc = lod_sub_declare_create(env, dto, attr, NULL, dof,
1951                                                     th);
1952                         if (rc != 0)
1953                                 GOTO(out, rc);
1954
1955                         if (!dt_try_as_dir(env, dto))
1956                                 GOTO(out, rc = -EINVAL);
1957
1958                         rc = lod_sub_declare_ref_add(env, dto, th);
1959                         if (rc != 0)
1960                                 GOTO(out, rc);
1961
1962                         rec->rec_fid = lu_object_fid(&dto->do_lu);
1963                         rc = lod_sub_declare_insert(env, dto,
1964                                                     (const struct dt_rec *)rec,
1965                                                     (const struct dt_key *)dot,
1966                                                     th);
1967                         if (rc != 0)
1968                                 GOTO(out, rc);
1969
1970                         /* master stripe FID will be put to .. */
1971                         rec->rec_fid = lu_object_fid(&dt->do_lu);
1972                         rc = lod_sub_declare_insert(env, dto,
1973                                                   (const struct dt_rec *)rec,
1974                                                   (const struct dt_key *)dotdot,
1975                                                   th);
1976                         if (rc != 0)
1977                                 GOTO(out, rc);
1978
1979                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
1980                             cfs_fail_val == i)
1981                                 snprintf(stripe_name, sizeof(info->lti_key),
1982                                          DFID":%u",
1983                                          PFID(lu_object_fid(&dto->do_lu)),
1984                                          i + 1);
1985                         else
1986                                 snprintf(stripe_name, sizeof(info->lti_key),
1987                                          DFID":%u",
1988                                          PFID(lu_object_fid(&dto->do_lu)), i);
1989
1990                         sname = lod_name_get(env, stripe_name,
1991                                              strlen(stripe_name));
1992                         rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
1993                                               sname, lu_object_fid(&dt->do_lu));
1994                         if (rc != 0)
1995                                 GOTO(out, rc);
1996
1997                         linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
1998                         linkea_buf.lb_len = ldata.ld_leh->leh_len;
1999                         rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
2000                                                        XATTR_NAME_LINK, 0, th);
2001                         if (rc != 0)
2002                                 GOTO(out, rc);
2003
2004                         rec->rec_fid = lu_object_fid(&dto->do_lu);
2005                         rc = lod_sub_declare_insert(env, dt_object_child(dt),
2006                                         (const struct dt_rec *)rec,
2007                                         (const struct dt_key *)stripe_name, th);
2008                         if (rc != 0)
2009                                 GOTO(out, rc);
2010
2011                         rc = lod_sub_declare_ref_add(env, dt_object_child(dt),
2012                                                      th);
2013                         if (rc != 0)
2014                                 GOTO(out, rc);
2015                 }
2016
2017                 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
2018                     cfs_fail_val != i) {
2019                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
2020                             cfs_fail_val == i)
2021                                 slave_lmm->lmv_master_mdt_index =
2022                                                         cpu_to_le32(i + 1);
2023                         else
2024                                 slave_lmm->lmv_master_mdt_index =
2025                                                         cpu_to_le32(i);
2026                         rc = lod_sub_declare_xattr_set(env, dto, &slave_lmv_buf,
2027                                                        XATTR_NAME_LMV, 0, th);
2028                         if (rc != 0)
2029                                 GOTO(out, rc);
2030                 }
2031         }
2032
2033         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt),
2034                                        &lmv_buf, XATTR_NAME_LMV, 0, th);
2035         if (rc != 0)
2036                 GOTO(out, rc);
2037 out:
2038         if (slave_lmm != NULL)
2039                 OBD_FREE_PTR(slave_lmm);
2040
2041         RETURN(rc);
2042 }
2043
2044 /**
2045  * Allocate a striping on a predefined set of MDTs.
2046  *
2047  * Allocates new striping using the MDT index range provided by the data from
2048  * the lum_obejcts contained in the lmv_user_md passed to this method if
2049  * \a is_specific is true; or allocates new layout starting from MDT index in
2050  * lo->ldo_dir_stripe_offset. The exact order of MDTs is not important and
2051  * varies depending on MDT status. The number of stripes needed and stripe
2052  * offset are taken from the object. If that number cannot be met, then the
2053  * function returns an error and then it's the caller's responsibility to
2054  * release the stripes allocated. All the internal structures are protected,
2055  * but no concurrent allocation is allowed on the same objects.
2056  *
2057  * \param[in] env               execution environment for this thread
2058  * \param[in] lo                LOD object
2059  * \param[out] stripes          striping created
2060  * \param[out] mdt_indices      MDT indices of striping created
2061  * \param[in] is_specific       true if the MDTs are provided by lum; false if
2062  *                              only the starting MDT index is provided
2063  *
2064  * \retval positive     stripes allocated, including the first stripe allocated
2065  *                      outside
2066  * \retval negative     errno on failure
2067  */
2068 static int lod_mdt_alloc_specific(const struct lu_env *env,
2069                                   struct lod_object *lo,
2070                                   struct dt_object **stripes,
2071                                   __u32 *mdt_indices, bool is_specific)
2072 {
2073         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
2074         struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
2075         struct lu_tgt_desc *tgt = NULL;
2076         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2077         struct dt_device *tgt_dt = NULL;
2078         struct lu_fid fid = { 0 };
2079         struct dt_object *dto;
2080         u32 master_index;
2081         u32 stripe_count = lo->ldo_dir_stripe_count;
2082         int stripe_idx = 1;
2083         int j;
2084         int idx;
2085         int rc;
2086
2087         master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2088         if (!is_specific && stripe_count > 1)
2089                 /* Set the start index for the 2nd stripe allocation */
2090                 mdt_indices[1] = (mdt_indices[0] + 1) %
2091                                         (lod->lod_remote_mdt_count + 1);
2092
2093         for (; stripe_idx < stripe_count; stripe_idx++) {
2094                 /* Try to find next avaible target */
2095                 idx = mdt_indices[stripe_idx];
2096                 for (j = 0; j < lod->lod_remote_mdt_count;
2097                      j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) {
2098                         bool already_allocated = false;
2099                         __u32 k;
2100
2101                         CDEBUG(D_INFO, "try idx %d, mdt cnt %u, allocated %u\n",
2102                                idx, lod->lod_remote_mdt_count + 1, stripe_idx);
2103
2104                         if (likely(!is_specific &&
2105                                    !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) {
2106                                 /* check whether the idx already exists
2107                                  * in current allocated array */
2108                                 for (k = 0; k < stripe_idx; k++) {
2109                                         if (mdt_indices[k] == idx) {
2110                                                 already_allocated = true;
2111                                                 break;
2112                                         }
2113                                 }
2114
2115                                 if (already_allocated)
2116                                         continue;
2117                         }
2118
2119                         /* Sigh, this index is not in the bitmap, let's check
2120                          * next available target */
2121                         if (!test_bit(idx, ltd->ltd_tgt_bitmap) &&
2122                             idx != master_index)
2123                                 continue;
2124
2125                         if (idx == master_index) {
2126                                 /* Allocate the FID locally */
2127                                 tgt_dt = lod->lod_child;
2128                                 rc = dt_fid_alloc(env, tgt_dt, &fid, NULL,
2129                                                   NULL);
2130                                 if (rc < 0)
2131                                         continue;
2132                                 break;
2133                         }
2134
2135                         /* check the status of the OSP */
2136                         tgt = LTD_TGT(ltd, idx);
2137                         if (!tgt)
2138                                 continue;
2139
2140                         tgt_dt = tgt->ltd_tgt;
2141                         if (!tgt->ltd_active)
2142                                 /* this OSP doesn't feel well */
2143                                 continue;
2144
2145                         rc = dt_fid_alloc(env, tgt_dt, &fid, NULL, NULL);
2146                         if (rc < 0)
2147                                 continue;
2148
2149                         break;
2150                 }
2151
2152                 /* Can not allocate more stripes */
2153                 if (j == lod->lod_remote_mdt_count) {
2154                         CDEBUG(D_INFO, "%s: require stripes %u only get %d\n",
2155                                lod2obd(lod)->obd_name, stripe_count,
2156                                stripe_idx);
2157                         break;
2158                 }
2159
2160                 CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n",
2161                        idx, stripe_idx, PFID(&fid));
2162                 mdt_indices[stripe_idx] = idx;
2163                 /* Set the start index for next stripe allocation */
2164                 if (!is_specific && stripe_idx < stripe_count - 1) {
2165                         /*
2166                          * for large dir test, put all other slaves on one
2167                          * remote MDT, otherwise we may save too many local
2168                          * slave locks which will exceed RS_MAX_LOCKS.
2169                          */
2170                         if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)))
2171                                 idx = master_index;
2172                         mdt_indices[stripe_idx + 1] = (idx + 1) %
2173                                            (lod->lod_remote_mdt_count + 1);
2174                 }
2175                 /* tgt_dt and fid must be ready after search avaible OSP
2176                  * in the above loop */
2177                 LASSERT(tgt_dt != NULL);
2178                 LASSERT(fid_is_sane(&fid));
2179
2180                 /* fail a remote stripe FID allocation */
2181                 if (stripe_idx && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID))
2182                         continue;
2183
2184                 dto = dt_locate_at(env, tgt_dt, &fid,
2185                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
2186                                   &conf);
2187                 if (IS_ERR(dto)) {
2188                         rc = PTR_ERR(dto);
2189                         goto error;
2190                 }
2191
2192                 stripes[stripe_idx] = dto;
2193         }
2194
2195         return stripe_idx;
2196
2197 error:
2198         for (j = 1; j < stripe_idx; j++) {
2199                 LASSERT(stripes[j] != NULL);
2200                 dt_object_put(env, stripes[j]);
2201                 stripes[j] = NULL;
2202         }
2203         return rc;
2204 }
2205
2206 static int lod_prep_md_striped_create(const struct lu_env *env,
2207                                       struct dt_object *dt,
2208                                       struct lu_attr *attr,
2209                                       const struct lmv_user_md_v1 *lum,
2210                                       struct dt_object_format *dof,
2211                                       struct thandle *th)
2212 {
2213         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
2214         struct lod_object *lo = lod_dt_obj(dt);
2215         struct dt_object **stripes;
2216         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2217         struct lu_fid fid = { 0 };
2218         __u32 stripe_count;
2219         int i;
2220         int rc = 0;
2221
2222         ENTRY;
2223
2224         /* The lum has been verifed in lod_verify_md_striping */
2225         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
2226                 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
2227
2228         stripe_count = lo->ldo_dir_stripe_count;
2229
2230         OBD_ALLOC_PTR_ARRAY(stripes, stripe_count);
2231         if (!stripes)
2232                 RETURN(-ENOMEM);
2233
2234         /* Allocate the first stripe locally */
2235         rc = dt_fid_alloc(env, lod->lod_child, &fid, NULL, NULL);
2236         if (rc < 0)
2237                 GOTO(out, rc);
2238
2239         stripes[0] = dt_locate_at(env, lod->lod_child, &fid,
2240                                   dt->do_lu.lo_dev->ld_site->ls_top_dev, &conf);
2241         if (IS_ERR(stripes[0]))
2242                 GOTO(out, rc = PTR_ERR(stripes[0]));
2243
2244         if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) {
2245                 lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
2246                 rc = lod_mdt_alloc_qos(env, lo, stripes, 1, stripe_count);
2247                 if (rc == -EAGAIN)
2248                         rc = lod_mdt_alloc_rr(env, lo, stripes, 1,
2249                                               stripe_count);
2250         } else {
2251                 int *idx_array;
2252                 bool is_specific = false;
2253
2254                 OBD_ALLOC_PTR_ARRAY(idx_array, stripe_count);
2255                 if (!idx_array)
2256                         GOTO(out, rc = -ENOMEM);
2257
2258                 if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
2259                         is_specific = true;
2260                         for (i = 0; i < stripe_count; i++)
2261                                 idx_array[i] =
2262                                        le32_to_cpu(lum->lum_objects[i].lum_mds);
2263                 }
2264
2265                 /* stripe 0 is local */
2266                 idx_array[0] =
2267                         lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2268                 rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array,
2269                                             is_specific);
2270                 OBD_FREE_PTR_ARRAY(idx_array, stripe_count);
2271         }
2272
2273         if (rc < 0)
2274                 GOTO(out, rc);
2275
2276         LASSERT(rc > 0);
2277
2278         lo->ldo_dir_striped = 1;
2279         lo->ldo_stripe = stripes;
2280         lo->ldo_dir_stripe_count = rc;
2281         lo->ldo_dir_stripes_allocated = stripe_count;
2282         smp_mb();
2283         lo->ldo_dir_stripe_loaded = 1;
2284
2285         rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2286         if (rc < 0)
2287                 lod_striping_free(env, lo);
2288
2289         RETURN(rc);
2290
2291 out:
2292         LASSERT(rc < 0);
2293         if (!IS_ERR_OR_NULL(stripes[0]))
2294                 dt_object_put(env, stripes[0]);
2295         for (i = 1; i < stripe_count; i++)
2296                 LASSERT(!stripes[i]);
2297         OBD_FREE_PTR_ARRAY(stripes, stripe_count);
2298
2299         return rc;
2300 }
2301
2302 /**
2303  *
2304  * Alloc cached foreign LOV
2305  *
2306  * \param[in] lo        object
2307  * \param[in] size      size of foreign LOV
2308  *
2309  * \retval              0 on success
2310  * \retval              negative if failed
2311  */
2312 int lod_alloc_foreign_lov(struct lod_object *lo, size_t size)
2313 {
2314         OBD_ALLOC_LARGE(lo->ldo_foreign_lov, size);
2315         if (lo->ldo_foreign_lov == NULL)
2316                 return -ENOMEM;
2317         lo->ldo_foreign_lov_size = size;
2318         lo->ldo_is_foreign = 1;
2319         return 0;
2320 }
2321
2322 /**
2323  *
2324  * Free cached foreign LOV
2325  *
2326  * \param[in] lo        object
2327  */
2328 void lod_free_foreign_lov(struct lod_object *lo)
2329 {
2330         if (lo->ldo_foreign_lov != NULL)
2331                 OBD_FREE_LARGE(lo->ldo_foreign_lov, lo->ldo_foreign_lov_size);
2332         lo->ldo_foreign_lov = NULL;
2333         lo->ldo_foreign_lov_size = 0;
2334         lo->ldo_is_foreign = 0;
2335 }
2336
2337 /**
2338  *
2339  * Alloc cached foreign LMV
2340  *
2341  * \param[in] lo        object
2342  * \param[in] size      size of foreign LMV
2343  *
2344  * \retval              0 on success
2345  * \retval              negative if failed
2346  */
2347 int lod_alloc_foreign_lmv(struct lod_object *lo, size_t size)
2348 {
2349         OBD_ALLOC_LARGE(lo->ldo_foreign_lmv, size);
2350         if (lo->ldo_foreign_lmv == NULL)
2351                 return -ENOMEM;
2352         lo->ldo_foreign_lmv_size = size;
2353         lo->ldo_is_foreign = 1;
2354
2355         return 0;
2356 }
2357
2358 /**
2359  *
2360  * Free cached foreign LMV
2361  *
2362  * \param[in] lo        object
2363  */
2364 void lod_free_foreign_lmv(struct lod_object *lo)
2365 {
2366         if (lo->ldo_foreign_lmv != NULL)
2367                 OBD_FREE_LARGE(lo->ldo_foreign_lmv, lo->ldo_foreign_lmv_size);
2368         lo->ldo_foreign_lmv = NULL;
2369         lo->ldo_foreign_lmv_size = 0;
2370         lo->ldo_is_foreign = 0;
2371 }
2372
2373 /**
2374  * Declare create striped md object.
2375  *
2376  * The function declares intention to create a striped directory. This is a
2377  * wrapper for lod_prep_md_striped_create(). The only additional functionality
2378  * is to verify pattern \a lum_buf is good. Check that function for the details.
2379  *
2380  * \param[in] env       execution environment
2381  * \param[in] dt        object
2382  * \param[in] attr      attributes to initialize the objects with
2383  * \param[in] lum_buf   a pattern specifying the number of stripes and
2384  *                      MDT to start from
2385  * \param[in] dof       type of objects to be created
2386  * \param[in] th        transaction handle
2387  *
2388  * \retval              0 on success
2389  * \retval              negative if failed
2390  *
2391  */
2392 static int lod_declare_xattr_set_lmv(const struct lu_env *env,
2393                                      struct dt_object *dt,
2394                                      struct lu_attr *attr,
2395                                      const struct lu_buf *lum_buf,
2396                                      struct dt_object_format *dof,
2397                                      struct thandle *th)
2398 {
2399         struct lod_object *lo = lod_dt_obj(dt);
2400         struct lmv_user_md_v1 *lum = lum_buf->lb_buf;
2401         int rc;
2402
2403         ENTRY;
2404         LASSERT(lum != NULL);
2405
2406         CDEBUG(D_INFO,
2407                "lum magic=%x hash=%x count=%u offset=%d inherit=%u rr=%u\n",
2408                le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_hash_type),
2409                le32_to_cpu(lum->lum_stripe_count),
2410                (int)le32_to_cpu(lum->lum_stripe_offset),
2411                lum->lum_max_inherit, lum->lum_max_inherit_rr);
2412
2413         if (lo->ldo_dir_stripe_count == 0) {
2414                 if (lo->ldo_is_foreign) {
2415                         rc = lod_alloc_foreign_lmv(lo, lum_buf->lb_len);
2416                         if (rc != 0)
2417                                 GOTO(out, rc);
2418                         memcpy(lo->ldo_foreign_lmv, lum, lum_buf->lb_len);
2419                         lo->ldo_dir_stripe_loaded = 1;
2420                 }
2421                 GOTO(out, rc = 0);
2422         }
2423
2424         /* prepare dir striped objects */
2425         rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
2426         if (rc != 0) {
2427                 /* failed to create striping, let's reset
2428                  * config so that others don't get confused */
2429                 lod_striping_free(env, lo);
2430                 GOTO(out, rc);
2431         }
2432 out:
2433         RETURN(rc);
2434 }
2435
2436 /**
2437  * Set or replace striped directory layout, and LFSCK may set layout on a plain
2438  * directory, so don't check stripe count.
2439  *
2440  * \param[in] env       execution environment
2441  * \param[in] dt        target object
2442  * \param[in] lmv_buf   LMV buf which contains source stripe FIDs
2443  * \param[in] fl        set or replace
2444  * \param[in] th        transaction handle
2445  *
2446  * \retval              0 on success
2447  * \retval              negative if failed
2448  */
2449 static int lod_dir_layout_set(const struct lu_env *env,
2450                               struct dt_object *dt,
2451                               const struct lu_buf *lmv_buf,
2452                               int fl,
2453                               struct thandle *th)
2454 {
2455         struct dt_object *next = dt_object_child(dt);
2456         struct lod_object *lo = lod_dt_obj(dt);
2457         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2458         struct lmv_mds_md_v1 *lmv = lmv_buf->lb_buf;
2459         struct lmv_mds_md_v1 *slave_lmv;
2460         struct lu_buf slave_buf;
2461         int i;
2462         int rc;
2463
2464         ENTRY;
2465
2466         if (!lmv_is_sane2(lmv))
2467                 RETURN(-EINVAL);
2468
2469         /* adjust hash for dir merge, which may not be set in user command */
2470         if (lmv_is_merging(lmv) &&
2471             !(lmv->lmv_migrate_hash & LMV_HASH_TYPE_MASK))
2472                 lmv->lmv_merge_hash |=
2473                         lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern &
2474                         LMV_HASH_TYPE_MASK;
2475
2476         LMV_DEBUG(D_INFO, lmv, "set");
2477
2478         rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV, fl, th);
2479         if (rc)
2480                 RETURN(rc);
2481
2482         /* directory restripe may update stripe LMV directly */
2483         if (!lo->ldo_dir_stripe_count)
2484                 RETURN(0);
2485
2486         lo->ldo_dir_hash_type = le32_to_cpu(lmv->lmv_hash_type);
2487         lo->ldo_dir_migrate_offset = le32_to_cpu(lmv->lmv_migrate_offset);
2488         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_migrate_hash);
2489         lo->ldo_dir_layout_version = le32_to_cpu(lmv->lmv_layout_version);
2490
2491         OBD_ALLOC_PTR(slave_lmv);
2492         if (!slave_lmv)
2493                 RETURN(-ENOMEM);
2494
2495         lod_prep_slave_lmv_md(slave_lmv, lmv);
2496         slave_buf.lb_buf = slave_lmv;
2497         slave_buf.lb_len = sizeof(*slave_lmv);
2498
2499         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2500                 if (!lo->ldo_stripe[i])
2501                         continue;
2502
2503                 if (!dt_object_exists(lo->ldo_stripe[i]))
2504                         continue;
2505
2506                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], &slave_buf,
2507                                        XATTR_NAME_LMV, fl, th);
2508                 if (rc)
2509                         break;
2510         }
2511
2512         OBD_FREE_PTR(slave_lmv);
2513
2514         RETURN(rc);
2515 }
2516
2517 /**
2518  * Implementation of dt_object_operations::do_declare_xattr_set.
2519  *
2520  * Used with regular (non-striped) objects. Basically it
2521  * initializes the striping information and applies the
2522  * change to all the stripes.
2523  *
2524  * \see dt_object_operations::do_declare_xattr_set() in the API description
2525  * for details.
2526  */
2527 static int lod_dir_declare_xattr_set(const struct lu_env *env,
2528                                      struct dt_object *dt,
2529                                      const struct lu_buf *buf,
2530                                      const char *name, int fl,
2531                                      struct thandle *th)
2532 {
2533         struct dt_object        *next = dt_object_child(dt);
2534         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2535         struct lod_object       *lo = lod_dt_obj(dt);
2536         int                     i;
2537         int                     rc;
2538         ENTRY;
2539
2540         if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
2541                 struct lmv_user_md_v1 *lum;
2542
2543                 LASSERT(buf != NULL && buf->lb_buf != NULL);
2544                 lum = buf->lb_buf;
2545                 rc = lod_verify_md_striping(d, lum);
2546                 if (rc != 0)
2547                         RETURN(rc);
2548         } else if (strcmp(name, XATTR_NAME_LOV) == 0) {
2549                 rc = lod_verify_striping(env, d, lo, buf, false);
2550                 if (rc != 0)
2551                         RETURN(rc);
2552         }
2553
2554         rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
2555         if (rc != 0)
2556                 RETURN(rc);
2557
2558         /* Note: Do not set LinkEA on sub-stripes, otherwise
2559          * it will confuse the fid2path process(see mdt_path_current()).
2560          * The linkEA between master and sub-stripes is set in
2561          * lod_xattr_set_lmv(). */
2562         if (strcmp(name, XATTR_NAME_LINK) == 0)
2563                 RETURN(0);
2564
2565         /* set xattr to each stripes, if needed */
2566         rc = lod_striping_load(env, lo);
2567         if (rc != 0)
2568                 RETURN(rc);
2569
2570         if (lo->ldo_dir_stripe_count == 0)
2571                 RETURN(0);
2572
2573         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2574                 if (!lo->ldo_stripe[i])
2575                         continue;
2576
2577                 if (!dt_object_exists(lo->ldo_stripe[i]))
2578                         continue;
2579
2580                 rc = lod_sub_declare_xattr_set(env, lo->ldo_stripe[i],
2581                                                buf, name, fl, th);
2582                 if (rc != 0)
2583                         break;
2584         }
2585
2586         RETURN(rc);
2587 }
2588
2589 static int
2590 lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
2591                                      struct lod_object *lo,
2592                                      struct dt_object *dt, struct thandle *th,
2593                                      int comp_idx, int stripe_idx,
2594                                      struct lod_obj_stripe_cb_data *data)
2595 {
2596         struct lod_thread_info *info = lod_env_info(env);
2597         struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
2598         struct filter_fid *ff = &info->lti_ff;
2599         struct lu_buf *buf = &info->lti_buf;
2600         int rc;
2601
2602         buf->lb_buf = ff;
2603         buf->lb_len = sizeof(*ff);
2604         rc = dt_xattr_get(env, dt, buf, XATTR_NAME_FID);
2605         if (rc < 0) {
2606                 if (rc == -ENODATA)
2607                         return 0;
2608                 return rc;
2609         }
2610
2611         /*
2612          * locd_buf is set if it's called by dir migration, which doesn't check
2613          * pfid and comp id.
2614          */
2615         if (data->locd_buf) {
2616                 memset(ff, 0, sizeof(*ff));
2617                 ff->ff_parent = *(struct lu_fid *)data->locd_buf->lb_buf;
2618         } else {
2619                 filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
2620
2621                 if (lu_fid_eq(lod_object_fid(lo), &ff->ff_parent) &&
2622                     ff->ff_layout.ol_comp_id == comp->llc_id)
2623                         return 0;
2624
2625                 memset(ff, 0, sizeof(*ff));
2626                 ff->ff_parent = *lu_object_fid(&lo->ldo_obj.do_lu);
2627         }
2628
2629         /* rewrite filter_fid */
2630         ff->ff_parent.f_ver = stripe_idx;
2631         ff->ff_layout.ol_stripe_size = comp->llc_stripe_size;
2632         ff->ff_layout.ol_stripe_count = comp->llc_stripe_count;
2633         ff->ff_layout.ol_comp_id = comp->llc_id;
2634         ff->ff_layout.ol_comp_start = comp->llc_extent.e_start;
2635         ff->ff_layout.ol_comp_end = comp->llc_extent.e_end;
2636         filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
2637
2638         if (data->locd_declare)
2639                 rc = lod_sub_declare_xattr_set(env, dt, buf, XATTR_NAME_FID,
2640                                                LU_XATTR_REPLACE, th);
2641         else
2642                 rc = lod_sub_xattr_set(env, dt, buf, XATTR_NAME_FID,
2643                                        LU_XATTR_REPLACE, th);
2644
2645         return rc;
2646 }
2647
2648 /**
2649  * Reset parent FID on OST object
2650  *
2651  * Replace parent FID with @dt object FID, which is only called during migration
2652  * to reset the parent FID after the MDT object is migrated to the new MDT, i.e.
2653  * the FID is changed.
2654  *
2655  * \param[in] env execution environment
2656  * \param[in] dt dt_object whose stripes's parent FID will be reset
2657  * \parem[in] th thandle
2658  * \param[in] declare if it is declare
2659  *
2660  * \retval      0 if reset succeeds
2661  * \retval      negative errno if reset fails
2662  */
2663 static int lod_replace_parent_fid(const struct lu_env *env,
2664                                   struct dt_object *dt,
2665                                   const struct lu_buf *buf,
2666                                   struct thandle *th, bool declare)
2667 {
2668         struct lod_object *lo = lod_dt_obj(dt);
2669         struct lod_thread_info  *info = lod_env_info(env);
2670         struct filter_fid *ff;
2671         struct lod_obj_stripe_cb_data data = { { 0 } };
2672         int rc;
2673         ENTRY;
2674
2675         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
2676
2677         /* set xattr to each stripes, if needed */
2678         rc = lod_striping_load(env, lo);
2679         if (rc != 0)
2680                 RETURN(rc);
2681
2682         if (!lod_obj_is_striped(dt))
2683                 RETURN(0);
2684
2685         if (info->lti_ea_store_size < sizeof(*ff)) {
2686                 rc = lod_ea_store_resize(info, sizeof(*ff));
2687                 if (rc != 0)
2688                         RETURN(rc);
2689         }
2690
2691         data.locd_declare = declare;
2692         data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
2693         data.locd_buf = buf;
2694         rc = lod_obj_for_each_stripe(env, lo, th, &data);
2695
2696         RETURN(rc);
2697 }
2698
2699 __u16 lod_comp_entry_stripe_count(struct lod_object *lo,
2700                                   int comp_idx, bool is_dir)
2701 {
2702         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2703         struct lod_layout_component *entry;
2704
2705         if (is_dir)
2706                 return  0;
2707
2708         entry = &lo->ldo_comp_entries[comp_idx];
2709         if (lod_comp_inited(entry))
2710                 return entry->llc_stripe_count;
2711         else if ((__u16)-1 == entry->llc_stripe_count)
2712                 return lod->lod_ost_count;
2713         else
2714                 return lod_get_stripe_count(lod, lo, comp_idx,
2715                                             entry->llc_stripe_count,
2716                                             entry->llc_pattern &
2717                                             LOV_PATTERN_OVERSTRIPING);
2718 }
2719
2720 static int lod_comp_md_size(struct lod_object *lo, bool is_dir)
2721 {
2722         int magic, size = 0, i;
2723         struct lod_layout_component *comp_entries;
2724         __u16 comp_cnt;
2725         bool is_composite, is_foreign = false;
2726
2727         if (is_dir) {
2728                 comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
2729                 comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
2730                 is_composite =
2731                         lo->ldo_def_striping->lds_def_striping_is_composite;
2732         } else {
2733                 comp_cnt = lo->ldo_comp_cnt;
2734                 comp_entries = lo->ldo_comp_entries;
2735                 is_composite = lo->ldo_is_composite;
2736                 is_foreign = lo->ldo_is_foreign;
2737         }
2738
2739         if (is_foreign)
2740                 return lo->ldo_foreign_lov_size;
2741
2742         LASSERT(comp_cnt != 0 && comp_entries != NULL);
2743         if (is_composite) {
2744                 size = sizeof(struct lov_comp_md_v1) +
2745                        sizeof(struct lov_comp_md_entry_v1) * comp_cnt;
2746                 LASSERT(size % sizeof(__u64) == 0);
2747         }
2748
2749         for (i = 0; i < comp_cnt; i++) {
2750                 __u16 stripe_count;
2751
2752                 magic = comp_entries[i].llc_pool ? LOV_MAGIC_V3 : LOV_MAGIC_V1;
2753                 stripe_count = lod_comp_entry_stripe_count(lo, i, is_dir);
2754                 if (!is_dir && is_composite)
2755                         lod_comp_shrink_stripe_count(&comp_entries[i],
2756                                                      &stripe_count);
2757
2758                 size += lov_user_md_size(stripe_count, magic);
2759                 LASSERT(size % sizeof(__u64) == 0);
2760         }
2761         return size;
2762 }
2763
2764 /**
2765  * Declare component add. The xattr name is XATTR_LUSTRE_LOV.add, and
2766  * the xattr value is binary lov_comp_md_v1 which contains component(s)
2767  * to be added.
2768   *
2769  * \param[in] env       execution environment
2770  * \param[in] dt        dt_object to add components on
2771  * \param[in] buf       buffer contains components to be added
2772  * \parem[in] th        thandle
2773  *
2774  * \retval      0 on success
2775  * \retval      negative errno on failure
2776  */
2777 static int lod_declare_layout_add(const struct lu_env *env,
2778                                   struct dt_object *dt,
2779                                   const struct lu_buf *buf,
2780                                   struct thandle *th)
2781 {
2782         struct lod_thread_info  *info = lod_env_info(env);
2783         struct lod_layout_component *comp_array, *lod_comp, *old_array;
2784         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
2785         struct dt_object *next = dt_object_child(dt);
2786         struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
2787         struct lod_object *lo = lod_dt_obj(dt);
2788         struct lov_user_md_v3 *v3;
2789         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
2790         __u32 magic;
2791         int i, rc, array_cnt, old_array_cnt;
2792         ENTRY;
2793
2794         LASSERT(lo->ldo_is_composite);
2795
2796         if (lo->ldo_flr_state != LCM_FL_NONE)
2797                 RETURN(-EBUSY);
2798
2799         rc = lod_verify_striping(env, d, lo, buf, false);
2800         if (rc != 0)
2801                 RETURN(rc);
2802
2803         magic = comp_v1->lcm_magic;
2804         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2805                 lustre_swab_lov_comp_md_v1(comp_v1);
2806                 magic = comp_v1->lcm_magic;
2807         }
2808
2809         if (magic != LOV_USER_MAGIC_COMP_V1)
2810                 RETURN(-EINVAL);
2811
2812         mutex_lock(&lo->ldo_layout_mutex);
2813
2814         array_cnt = lo->ldo_comp_cnt + comp_v1->lcm_entry_count;
2815         OBD_ALLOC_PTR_ARRAY(comp_array, array_cnt);
2816         if (comp_array == NULL) {
2817                 mutex_unlock(&lo->ldo_layout_mutex);
2818                 RETURN(-ENOMEM);
2819         }
2820
2821
2822         memcpy(comp_array, lo->ldo_comp_entries,
2823                sizeof(*comp_array) * lo->ldo_comp_cnt);
2824
2825         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2826                 struct lov_user_md_v1 *v1;
2827                 struct lu_extent *ext;
2828
2829                 v1 = (struct lov_user_md *)((char *)comp_v1 +
2830                                 comp_v1->lcm_entries[i].lcme_offset);
2831                 ext = &comp_v1->lcm_entries[i].lcme_extent;
2832
2833                 lod_comp = &comp_array[lo->ldo_comp_cnt + i];
2834                 lod_comp->llc_extent.e_start = ext->e_start;
2835                 lod_comp->llc_extent.e_end = ext->e_end;
2836                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
2837                 lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
2838
2839                 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
2840                 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
2841                 lod_adjust_stripe_info(lod_comp, desc, 0);
2842
2843                 if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
2844                         v3 = (struct lov_user_md_v3 *) v1;
2845                         if (v3->lmm_pool_name[0] != '\0') {
2846                                 rc = lod_set_pool(&lod_comp->llc_pool,
2847                                                   v3->lmm_pool_name);
2848                                 if (rc)
2849                                         GOTO(error, rc);
2850                         }
2851                 }
2852         }
2853
2854         old_array = lo->ldo_comp_entries;
2855         old_array_cnt = lo->ldo_comp_cnt;
2856
2857         lo->ldo_comp_entries = comp_array;
2858         lo->ldo_comp_cnt = array_cnt;
2859
2860         /* No need to increase layout generation here, it will be increased
2861          * later when generating component ID for the new components */
2862
2863         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
2864         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
2865                                               XATTR_NAME_LOV, 0, th);
2866         if (rc) {
2867                 lo->ldo_comp_entries = old_array;
2868                 lo->ldo_comp_cnt = old_array_cnt;
2869                 GOTO(error, rc);
2870         }
2871
2872         OBD_FREE_PTR_ARRAY(old_array, old_array_cnt);
2873
2874         LASSERT(lo->ldo_mirror_count == 1);
2875         lo->ldo_mirrors[0].lme_end = array_cnt - 1;
2876
2877         mutex_unlock(&lo->ldo_layout_mutex);
2878
2879         RETURN(0);
2880
2881 error:
2882         for (i = lo->ldo_comp_cnt; i < array_cnt; i++) {
2883                 lod_comp = &comp_array[i];
2884                 if (lod_comp->llc_pool != NULL) {
2885                         OBD_FREE(lod_comp->llc_pool,
2886                                  strlen(lod_comp->llc_pool) + 1);
2887                         lod_comp->llc_pool = NULL;
2888                 }
2889         }
2890         OBD_FREE_PTR_ARRAY(comp_array, array_cnt);
2891         mutex_unlock(&lo->ldo_layout_mutex);
2892
2893         RETURN(rc);
2894 }
2895
2896 /**
2897  * lod_last_non_stale_mirror() - Check if a mirror is the last non-stale mirror.
2898  * @mirror_id: Mirror id to be checked.
2899  * @lo:        LOD object.
2900  *
2901  * This function checks if a mirror with specified @mirror_id is the last
2902  * non-stale mirror of a LOD object @lo.
2903  *
2904  * Return: true or false.
2905  */
2906 static inline
2907 bool lod_last_non_stale_mirror(__u16 mirror_id, struct lod_object *lo)
2908 {
2909         struct lod_layout_component *lod_comp;
2910         bool has_stale_flag;
2911         int i;
2912
2913         for (i = 0; i < lo->ldo_mirror_count; i++) {
2914                 if (lo->ldo_mirrors[i].lme_id == mirror_id ||
2915                     lo->ldo_mirrors[i].lme_stale)
2916                         continue;
2917
2918                 has_stale_flag = false;
2919                 lod_foreach_mirror_comp(lod_comp, lo, i) {
2920                         if (lod_comp->llc_flags & LCME_FL_STALE) {
2921                                 has_stale_flag = true;
2922                                 break;
2923                         }
2924                 }
2925                 if (!has_stale_flag)
2926                         return false;
2927         }
2928
2929         return true;
2930 }
2931
2932 /**
2933  * Declare component set. The xattr is name XATTR_LUSTRE_LOV.set.$field,
2934  * the '$field' can only be 'flags' now. The xattr value is binary
2935  * lov_comp_md_v1 which contains the component ID(s) and the value of
2936  * the field to be modified.
2937  * Please update allowed_lustre_lov macro if $field groks more values
2938  * in the future.
2939  *
2940  * \param[in] env       execution environment
2941  * \param[in] dt        dt_object to be modified
2942  * \param[in] op        operation string, like "set.flags"
2943  * \param[in] buf       buffer contains components to be set
2944  * \parem[in] th        thandle
2945  *
2946  * \retval      0 on success
2947  * \retval      negative errno on failure
2948  */
2949 static int lod_declare_layout_set(const struct lu_env *env,
2950                                   struct dt_object *dt,
2951                                   char *op, const struct lu_buf *buf,
2952                                   struct thandle *th)
2953 {
2954         struct lod_layout_component     *lod_comp;
2955         struct lod_thread_info  *info = lod_env_info(env);
2956         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2957         struct lod_object       *lo = lod_dt_obj(dt);
2958         struct lov_comp_md_v1   *comp_v1 = buf->lb_buf;
2959         __u32   magic;
2960         int     i, j, rc;
2961         bool    changed = false;
2962         ENTRY;
2963
2964         /* Please update allowed_lustre_lov macro if op
2965          * groks more values in the future
2966          */
2967         if (strcmp(op, "set.flags") != 0) {
2968                 CDEBUG(D_LAYOUT, "%s: operation (%s) not supported.\n",
2969                        lod2obd(d)->obd_name, op);
2970                 RETURN(-ENOTSUPP);
2971         }
2972
2973         magic = comp_v1->lcm_magic;
2974         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2975                 lustre_swab_lov_comp_md_v1(comp_v1);
2976                 magic = comp_v1->lcm_magic;
2977         }
2978
2979         if (magic != LOV_USER_MAGIC_COMP_V1)
2980                 RETURN(-EINVAL);
2981
2982         if (comp_v1->lcm_entry_count == 0) {
2983                 CDEBUG(D_LAYOUT, "%s: entry count is zero.\n",
2984                        lod2obd(d)->obd_name);
2985                 RETURN(-EINVAL);
2986         }
2987
2988         mutex_lock(&lo->ldo_layout_mutex);
2989         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2990                 __u32 id = comp_v1->lcm_entries[i].lcme_id;
2991                 __u32 flags = comp_v1->lcm_entries[i].lcme_flags;
2992                 __u32 mirror_flag = flags & LCME_MIRROR_FLAGS;
2993                 __u16 mirror_id = mirror_id_of(id);
2994                 bool neg = flags & LCME_FL_NEG;
2995
2996                 if (flags & LCME_FL_INIT) {
2997                         if (changed)
2998                                 lod_striping_free_nolock(env, lo);
2999                         mutex_unlock(&lo->ldo_layout_mutex);
3000                         RETURN(-EINVAL);
3001                 }
3002
3003                 flags &= ~(LCME_MIRROR_FLAGS | LCME_FL_NEG);
3004                 for (j = 0; j < lo->ldo_comp_cnt; j++) {
3005                         lod_comp = &lo->ldo_comp_entries[j];
3006
3007                         /* lfs only put one flag in each entry */
3008                         if ((flags && id != lod_comp->llc_id) ||
3009                             (mirror_flag && mirror_id !=
3010                                             mirror_id_of(lod_comp->llc_id)))
3011                                 continue;
3012
3013                         if (neg) {
3014                                 if (flags)
3015                                         lod_comp->llc_flags &= ~flags;
3016                                 if (mirror_flag)
3017                                         lod_comp->llc_flags &= ~mirror_flag;
3018                         } else {
3019                                 if (flags) {
3020                                         if ((flags & LCME_FL_STALE) &&
3021                                             lod_last_non_stale_mirror(mirror_id,
3022                                                                       lo)) {
3023                                                 mutex_unlock(
3024                                                         &lo->ldo_layout_mutex);
3025                                                 RETURN(-EUCLEAN);
3026                                         }
3027                                         lod_comp->llc_flags |= flags;
3028                                 }
3029                                 if (mirror_flag) {
3030                                         lod_comp->llc_flags |= mirror_flag;
3031                                         if (mirror_flag & LCME_FL_NOSYNC)
3032                                                 lod_comp->llc_timestamp =
3033                                                        ktime_get_real_seconds();
3034                                 }
3035                         }
3036                         changed = true;
3037                 }
3038         }
3039         mutex_unlock(&lo->ldo_layout_mutex);
3040
3041         if (!changed) {
3042                 CDEBUG(D_LAYOUT, "%s: requested component(s) not found.\n",
3043                        lod2obd(d)->obd_name);
3044                 RETURN(-EINVAL);
3045         }
3046
3047         lod_obj_inc_layout_gen(lo);
3048
3049         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3050         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), &info->lti_buf,
3051                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3052         RETURN(rc);
3053 }
3054
3055 /**
3056  * Declare component deletion. The xattr name is XATTR_LUSTRE_LOV.del,
3057  * and the xattr value is a unique component ID or a special lcme_id.
3058  *
3059  * \param[in] env       execution environment
3060  * \param[in] dt        dt_object to be operated on
3061  * \param[in] buf       buffer contains component ID or lcme_id
3062  * \param[in] th        thandle
3063  *
3064  * \retval      0 on success
3065  * \retval      negative errno on failure
3066  */
3067 static int lod_declare_layout_del(const struct lu_env *env,
3068                                   struct dt_object *dt,
3069                                   const struct lu_buf *buf,
3070                                   struct thandle *th)
3071 {
3072         struct lod_thread_info  *info = lod_env_info(env);
3073         struct dt_object *next = dt_object_child(dt);
3074         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3075         struct lod_object *lo = lod_dt_obj(dt);
3076         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
3077         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3078         __u32 magic, id, flags, neg_flags = 0;
3079         int rc, i, j, left;
3080         ENTRY;
3081
3082         LASSERT(lo->ldo_is_composite);
3083
3084         if (lo->ldo_flr_state != LCM_FL_NONE)
3085                 RETURN(-EBUSY);
3086
3087         magic = comp_v1->lcm_magic;
3088         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
3089                 lustre_swab_lov_comp_md_v1(comp_v1);
3090                 magic = comp_v1->lcm_magic;
3091         }
3092
3093         if (magic != LOV_USER_MAGIC_COMP_V1)
3094                 RETURN(-EINVAL);
3095
3096         id = comp_v1->lcm_entries[0].lcme_id;
3097         flags = comp_v1->lcm_entries[0].lcme_flags;
3098
3099         if (id > LCME_ID_MAX || (flags & ~LCME_KNOWN_FLAGS)) {
3100                 CDEBUG(D_LAYOUT, "%s: invalid component id %#x, flags %#x\n",
3101                        lod2obd(d)->obd_name, id, flags);
3102                 RETURN(-EINVAL);
3103         }
3104
3105         if (id != LCME_ID_INVAL && flags != 0) {
3106                 CDEBUG(D_LAYOUT, "%s: specified both id and flags.\n",
3107                        lod2obd(d)->obd_name);
3108                 RETURN(-EINVAL);
3109         }
3110
3111         if (id == LCME_ID_INVAL && !flags) {
3112                 CDEBUG(D_LAYOUT, "%s: no id or flags specified.\n",
3113                        lod2obd(d)->obd_name);
3114                 RETURN(-EINVAL);
3115         }
3116
3117         if (flags & LCME_FL_NEG) {
3118                 neg_flags = flags & ~LCME_FL_NEG;
3119                 flags = 0;
3120         }
3121
3122         mutex_lock(&lo->ldo_layout_mutex);
3123
3124         left = lo->ldo_comp_cnt;
3125         if (left <= 0) {
3126                 mutex_unlock(&lo->ldo_layout_mutex);
3127                 RETURN(-EINVAL);
3128         }
3129
3130         for (i = (lo->ldo_comp_cnt - 1); i >= 0; i--) {
3131                 struct lod_layout_component *lod_comp;
3132
3133                 lod_comp = &lo->ldo_comp_entries[i];
3134
3135                 if (id != LCME_ID_INVAL && id != lod_comp->llc_id)
3136                         continue;
3137                 else if (flags && !(flags & lod_comp->llc_flags))
3138                         continue;
3139                 else if (neg_flags && (neg_flags & lod_comp->llc_flags))
3140                         continue;
3141
3142                 if (left != (i + 1)) {
3143                         CDEBUG(D_LAYOUT, "%s: this deletion will create "
3144                                "a hole.\n", lod2obd(d)->obd_name);
3145                         mutex_unlock(&lo->ldo_layout_mutex);
3146                         RETURN(-EINVAL);
3147                 }
3148                 left--;
3149
3150                 /* Mark the component as deleted */
3151                 lod_comp->llc_id = LCME_ID_INVAL;
3152
3153                 /* Not instantiated component */
3154                 if (lod_comp->llc_stripe == NULL)
3155                         continue;
3156
3157                 LASSERT(lod_comp->llc_stripe_count > 0);
3158                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
3159                         struct dt_object *obj = lod_comp->llc_stripe[j];
3160
3161                         if (obj == NULL)
3162                                 continue;
3163                         rc = lod_sub_declare_destroy(env, obj, th);
3164                         if (rc) {
3165                                 mutex_unlock(&lo->ldo_layout_mutex);
3166                                 RETURN(rc);
3167                         }
3168                 }
3169         }
3170
3171         LASSERTF(left >= 0, "left = %d\n", left);
3172         if (left == lo->ldo_comp_cnt) {
3173                 CDEBUG(D_LAYOUT, "%s: requested component id:%#x not found\n",
3174                        lod2obd(d)->obd_name, id);
3175                 mutex_unlock(&lo->ldo_layout_mutex);
3176                 RETURN(-EINVAL);
3177         }
3178
3179         mutex_unlock(&lo->ldo_layout_mutex);
3180
3181         memset(attr, 0, sizeof(*attr));
3182         attr->la_valid = LA_SIZE;
3183         rc = lod_sub_declare_attr_set(env, next, attr, th);
3184         if (rc)
3185                 RETURN(rc);
3186
3187         if (left > 0) {
3188                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3189                 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
3190                                                XATTR_NAME_LOV, 0, th);
3191         } else {
3192                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
3193         }
3194
3195         RETURN(rc);
3196 }
3197
3198 /**
3199  * Declare layout add/set/del operations issued by special xattr names:
3200  *
3201  * XATTR_LUSTRE_LOV.add         add component(s) to existing file
3202  * XATTR_LUSTRE_LOV.del         delete component(s) from existing file
3203  * XATTR_LUSTRE_LOV.set.$field  set specified field of certain component(s)
3204  *
3205  * \param[in] env       execution environment
3206  * \param[in] dt        object
3207  * \param[in] name      name of xattr
3208  * \param[in] buf       lu_buf contains xattr value
3209  * \param[in] th        transaction handle
3210  *
3211  * \retval              0 on success
3212  * \retval              negative if failed
3213  */
3214 static int lod_declare_modify_layout(const struct lu_env *env,
3215                                      struct dt_object *dt,
3216                                      const char *name,
3217                                      const struct lu_buf *buf,
3218                                      struct thandle *th)
3219 {
3220         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3221         struct lod_object *lo = lod_dt_obj(dt);
3222         char *op;
3223         int rc, len = strlen(XATTR_LUSTRE_LOV);
3224         ENTRY;
3225
3226         LASSERT(dt_object_exists(dt));
3227
3228         if (strlen(name) <= len || name[len] != '.') {
3229                 CDEBUG(D_LAYOUT, "%s: invalid xattr name: %s\n",
3230                        lod2obd(d)->obd_name, name);
3231                 RETURN(-EINVAL);
3232         }
3233         len++;
3234
3235         rc = lod_striping_load(env, lo);
3236         if (rc)
3237                 GOTO(unlock, rc);
3238
3239         /* the layout to be modified must be a composite layout */
3240         if (!lo->ldo_is_composite) {
3241                 CDEBUG(D_LAYOUT, "%s: object "DFID" isn't a composite file.\n",
3242                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3243                 GOTO(unlock, rc = -EINVAL);
3244         }
3245
3246         op = (char *)name + len;
3247         if (strcmp(op, "add") == 0) {
3248                 rc = lod_declare_layout_add(env, dt, buf, th);
3249         } else if (strcmp(op, "del") == 0) {
3250                 rc = lod_declare_layout_del(env, dt, buf, th);
3251         } else if (strncmp(op, "set", strlen("set")) == 0) {
3252                 rc = lod_declare_layout_set(env, dt, op, buf, th);
3253         } else  {
3254                 CDEBUG(D_LAYOUT, "%s: unsupported xattr name:%s\n",
3255                        lod2obd(d)->obd_name, name);
3256                 GOTO(unlock, rc = -ENOTSUPP);
3257         }
3258 unlock:
3259         if (rc)
3260                 lod_striping_free(env, lo);
3261
3262         RETURN(rc);
3263 }
3264
3265 /**
3266  * Convert a plain file lov_mds_md to a composite layout.
3267  *
3268  * \param[in,out] info  the thread info::lti_ea_store buffer contains little
3269  *                      endian plain file layout
3270  *
3271  * \retval              0 on success, <0 on failure
3272  */
3273 static int lod_layout_convert(struct lod_thread_info *info)
3274 {
3275         struct lov_mds_md *lmm = info->lti_ea_store;
3276         struct lov_mds_md *lmm_save;
3277         struct lov_comp_md_v1 *lcm;
3278         struct lov_comp_md_entry_v1 *lcme;
3279         size_t size;
3280         __u32 blob_size;
3281         int rc = 0;
3282         ENTRY;
3283
3284         /* realloc buffer to a composite layout which contains one component */
3285         blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
3286                                     le32_to_cpu(lmm->lmm_magic));
3287         size = sizeof(*lcm) + sizeof(*lcme) + blob_size;
3288
3289         OBD_ALLOC_LARGE(lmm_save, blob_size);
3290         if (!lmm_save)
3291                 GOTO(out, rc = -ENOMEM);
3292
3293         memcpy(lmm_save, lmm, blob_size);
3294
3295         if (info->lti_ea_store_size < size) {
3296                 rc = lod_ea_store_resize(info, size);
3297                 if (rc)
3298                         GOTO(out, rc);
3299         }
3300
3301         lcm = info->lti_ea_store;
3302         memset(lcm, 0, sizeof(*lcm) + sizeof(*lcme));
3303         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
3304         lcm->lcm_size = cpu_to_le32(size);
3305         lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
3306                                                 lmm_save->lmm_layout_gen));
3307         lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
3308         lcm->lcm_entry_count = cpu_to_le16(1);
3309
3310         lcme = &lcm->lcm_entries[0];
3311         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
3312         lcme->lcme_extent.e_start = 0;
3313         lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
3314         lcme->lcme_offset = cpu_to_le32(sizeof(*lcm) + sizeof(*lcme));
3315         lcme->lcme_size = cpu_to_le32(blob_size);
3316
3317         memcpy((char *)lcm + lcme->lcme_offset, (char *)lmm_save, blob_size);
3318
3319         EXIT;
3320 out:
3321         if (lmm_save)
3322                 OBD_FREE_LARGE(lmm_save, blob_size);
3323         return rc;
3324 }
3325
3326 /**
3327  * Merge layouts to form a mirrored file.
3328  */
3329 static int lod_declare_layout_merge(const struct lu_env *env,
3330                 struct dt_object *dt, const struct lu_buf *mbuf,
3331                 struct thandle *th)
3332 {
3333         struct lod_thread_info *info = lod_env_info(env);
3334         struct lu_attr *layout_attr = &info->lti_layout_attr;
3335         struct lu_buf *buf = &info->lti_buf;
3336         struct lod_object *lo = lod_dt_obj(dt);
3337         struct lov_comp_md_v1 *lcm;
3338         struct lov_comp_md_v1 *cur_lcm;
3339         struct lov_comp_md_v1 *merge_lcm;
3340         struct lov_comp_md_entry_v1 *lcme;
3341         struct lov_mds_md_v1 *lmm;
3342         size_t size = 0;
3343         size_t offset;
3344         __u16 cur_entry_count;
3345         __u16 merge_entry_count;
3346         __u32 id = 0;
3347         __u16 mirror_id = 0;
3348         __u32 mirror_count;
3349         int rc, i;
3350         bool merge_has_dom;
3351
3352         ENTRY;
3353
3354         merge_lcm = mbuf->lb_buf;
3355         if (mbuf->lb_len < sizeof(*merge_lcm))
3356                 RETURN(-EINVAL);
3357
3358         /* must be an existing layout from disk */
3359         if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
3360                 RETURN(-EINVAL);
3361
3362         merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
3363
3364         /* do not allow to merge two mirrored files */
3365         if (le16_to_cpu(merge_lcm->lcm_mirror_count))
3366                 RETURN(-EBUSY);
3367
3368         /* verify the target buffer */
3369         rc = lod_get_lov_ea(env, lo);
3370         if (rc <= 0)
3371                 RETURN(rc ? : -ENODATA);
3372
3373         cur_lcm = info->lti_ea_store;
3374         switch (le32_to_cpu(cur_lcm->lcm_magic)) {
3375         case LOV_MAGIC_V1:
3376         case LOV_MAGIC_V3:
3377                 rc = lod_layout_convert(info);
3378                 break;
3379         case LOV_MAGIC_COMP_V1:
3380         case LOV_MAGIC_SEL:
3381                 rc = 0;
3382                 break;
3383         default:
3384                 rc = -EINVAL;
3385         }
3386         if (rc)
3387                 RETURN(rc);
3388
3389         /* info->lti_ea_store could be reallocated in lod_layout_convert() */
3390         cur_lcm = info->lti_ea_store;
3391         cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
3392
3393         /* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
3394         mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
3395         if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
3396                 RETURN(-ERANGE);
3397
3398         /* size of new layout */
3399         size = le32_to_cpu(cur_lcm->lcm_size) +
3400                le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
3401
3402         memset(buf, 0, sizeof(*buf));
3403         lu_buf_alloc(buf, size);
3404         if (buf->lb_buf == NULL)
3405                 RETURN(-ENOMEM);
3406
3407         lcm = buf->lb_buf;
3408         memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
3409
3410         offset = sizeof(*lcm) +
3411                  sizeof(*lcme) * (cur_entry_count + merge_entry_count);
3412         for (i = 0; i < cur_entry_count; i++) {
3413                 struct lov_comp_md_entry_v1 *cur_lcme;
3414
3415                 lcme = &lcm->lcm_entries[i];
3416                 cur_lcme = &cur_lcm->lcm_entries[i];
3417
3418                 lcme->lcme_offset = cpu_to_le32(offset);
3419                 memcpy((char *)lcm + offset,
3420                        (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
3421                        le32_to_cpu(lcme->lcme_size));
3422
3423                 offset += le32_to_cpu(lcme->lcme_size);
3424
3425                 if (mirror_count == 1 &&
3426                     mirror_id_of(le32_to_cpu(lcme->lcme_id)) == 0) {
3427                         /* Add mirror from a non-flr file, create new mirror ID.
3428                          * Otherwise, keep existing mirror's component ID, used
3429                          * for mirror extension.
3430                          */
3431                         id = pflr_id(1, i + 1);
3432                         lcme->lcme_id = cpu_to_le32(id);
3433                 }
3434
3435                 id = max(le32_to_cpu(lcme->lcme_id), id);
3436         }
3437
3438         mirror_id = mirror_id_of(id) + 1;
3439
3440         /* check if first entry in new layout is DOM */
3441         lmm = (struct lov_mds_md_v1 *)((char *)merge_lcm +
3442                                         merge_lcm->lcm_entries[0].lcme_offset);
3443         merge_has_dom = lov_pattern(le32_to_cpu(lmm->lmm_pattern)) ==
3444                         LOV_PATTERN_MDT;
3445
3446         for (i = 0; i < merge_entry_count; i++) {
3447                 struct lov_comp_md_entry_v1 *merge_lcme;
3448
3449                 merge_lcme = &merge_lcm->lcm_entries[i];
3450                 lcme = &lcm->lcm_entries[cur_entry_count + i];
3451
3452                 *lcme = *merge_lcme;
3453                 lcme->lcme_offset = cpu_to_le32(offset);
3454                 if (merge_has_dom && i == 0)
3455                         lcme->lcme_flags |= cpu_to_le32(LCME_FL_STALE);
3456
3457                 id = pflr_id(mirror_id, i + 1);
3458                 lcme->lcme_id = cpu_to_le32(id);
3459
3460                 memcpy((char *)lcm + offset,
3461                        (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
3462                        le32_to_cpu(lcme->lcme_size));
3463
3464                 offset += le32_to_cpu(lcme->lcme_size);
3465         }
3466
3467         /* fixup layout information */
3468         lcm->lcm_size = cpu_to_le32(size);
3469         lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
3470         lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
3471         if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NONE)
3472                 lcm->lcm_flags = cpu_to_le32(LCM_FL_RDONLY);
3473
3474         rc = lod_striping_reload(env, lo, buf, 0);
3475         if (rc)
3476                 GOTO(out, rc);
3477
3478         lod_obj_inc_layout_gen(lo);
3479         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3480
3481         /* transfer layout version to OST objects. */
3482         if (lo->ldo_mirror_count > 1) {
3483                 struct lod_obj_stripe_cb_data data = { {0} };
3484
3485                 layout_attr->la_valid = LA_LAYOUT_VERSION;
3486                 layout_attr->la_layout_version = 0;
3487                 data.locd_attr = layout_attr;
3488                 data.locd_declare = true;
3489                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3490                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3491                 if (rc)
3492                         GOTO(out, rc);
3493         }
3494
3495         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
3496                                         XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3497
3498 out:
3499         lu_buf_free(buf);
3500         RETURN(rc);
3501 }
3502
3503 /**
3504  * Split layouts, just set the LOVEA with the layout from mbuf.
3505  */
3506 static int lod_declare_layout_split(const struct lu_env *env,
3507                 struct dt_object *dt, const struct lu_buf *mbuf,
3508                 struct thandle *th)
3509 {
3510         struct lod_thread_info *info = lod_env_info(env);
3511         struct lu_attr *layout_attr = &info->lti_layout_attr;
3512         struct lod_object *lo = lod_dt_obj(dt);
3513         struct lov_comp_md_v1 *lcm = mbuf->lb_buf;
3514         int rc;
3515         ENTRY;
3516
3517         rc = lod_striping_reload(env, lo, mbuf, LVF_ALL_STALE);
3518         if (rc)
3519                 RETURN(rc);
3520
3521         lod_obj_inc_layout_gen(lo);
3522         /* fix on-disk layout gen */
3523         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3524
3525
3526         /* transfer layout version to OST objects. */
3527         if (lo->ldo_mirror_count > 1) {
3528                 struct lod_obj_stripe_cb_data data = { {0} };
3529
3530                 layout_attr->la_valid = LA_LAYOUT_VERSION;
3531                 layout_attr->la_layout_version = 0;
3532                 data.locd_attr = layout_attr;
3533                 data.locd_declare = true;
3534                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3535                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3536                 if (rc)
3537                         RETURN(rc);
3538         }
3539
3540         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), mbuf,
3541                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3542         RETURN(rc);
3543 }
3544
3545 static int lod_layout_declare_or_purge_mirror(const struct lu_env *env,
3546                         struct dt_object *dt, const struct lu_buf *buf,
3547                         struct thandle *th, bool declare)
3548 {
3549         struct lod_thread_info *info = lod_env_info(env);
3550         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3551         struct lod_object *lo = lod_dt_obj(dt);
3552         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3553         struct lov_comp_md_entry_v1 *entry;
3554         struct lov_mds_md_v1 *lmm;
3555         struct dt_object **sub_objs = NULL;
3556         int rc = 0, i, k, array_count = 0;
3557
3558         ENTRY;
3559
3560         /**
3561          * other ops (like lod_declare_destroy) could destroying sub objects
3562          * as well.
3563          */
3564         mutex_lock(&lo->ldo_layout_mutex);
3565
3566         if (!declare) {
3567                 /* prepare sub-objects array */
3568                 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3569                         entry = &comp_v1->lcm_entries[i];
3570
3571                         if (!(entry->lcme_flags & LCME_FL_INIT))
3572                                 continue;
3573
3574                         lmm = (struct lov_mds_md_v1 *)
3575                                         ((char *)comp_v1 + entry->lcme_offset);
3576                         array_count += lmm->lmm_stripe_count;
3577                 }
3578                 OBD_ALLOC_PTR_ARRAY(sub_objs, array_count);
3579                 if (sub_objs == NULL) {
3580                         mutex_unlock(&lo->ldo_layout_mutex);
3581                         RETURN(-ENOMEM);
3582                 }
3583         }
3584
3585         k = 0;  /* sub_objs index */
3586         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3587                 struct lov_ost_data_v1 *objs;
3588                 struct lu_object *o, *n;
3589                 struct dt_object *dto;
3590                 struct lu_device *nd;
3591                 struct lov_mds_md_v3 *v3;
3592                 __u32 idx;
3593                 int j;
3594
3595                 entry = &comp_v1->lcm_entries[i];
3596
3597                 if (!(entry->lcme_flags & LCME_FL_INIT))
3598                         continue;
3599
3600                 lmm = (struct lov_mds_md_v1 *)
3601                                 ((char *)comp_v1 + entry->lcme_offset);
3602                 v3 = (struct lov_mds_md_v3 *)lmm;
3603                 if (lmm->lmm_magic == LOV_MAGIC_V3)
3604                         objs = &v3->lmm_objects[0];
3605                 else
3606                         objs = &lmm->lmm_objects[0];
3607
3608                 for (j = 0; j < lmm->lmm_stripe_count; j++) {
3609                         idx = objs[j].l_ost_idx;
3610                         rc = ostid_to_fid(&info->lti_fid, &objs[j].l_ost_oi,
3611                                           idx);
3612                         if (rc)
3613                                 GOTO(out, rc);
3614
3615                         if (!fid_is_sane(&info->lti_fid)) {
3616                                 CERROR("%s: sub-object insane fid "DFID"\n",
3617                                        lod2obd(d)->obd_name,
3618                                        PFID(&info->lti_fid));
3619                                 GOTO(out, rc = -EINVAL);
3620                         }
3621
3622                         lod_getref(&d->lod_ost_descs);
3623
3624                         rc = validate_lod_and_idx(d, idx);
3625                         if (unlikely(rc)) {
3626                                 lod_putref(d, &d->lod_ost_descs);
3627                                 GOTO(out, rc);
3628                         }
3629
3630                         nd = &OST_TGT(d, idx)->ltd_tgt->dd_lu_dev;
3631                         lod_putref(d, &d->lod_ost_descs);
3632
3633                         o = lu_object_find_at(env, nd, &info->lti_fid, NULL);
3634                         if (IS_ERR(o))
3635                                 GOTO(out, rc = PTR_ERR(o));
3636
3637                         n = lu_object_locate(o->lo_header, nd->ld_type);
3638                         if (unlikely(!n)) {
3639                                 lu_object_put(env, n);
3640                                 GOTO(out, rc = -ENOENT);
3641                         }
3642
3643                         dto = container_of(n, struct dt_object, do_lu);
3644
3645                         if (declare) {
3646                                 rc = lod_sub_declare_destroy(env, dto, th);
3647                                 dt_object_put(env, dto);
3648                                 if (rc)
3649                                         GOTO(out, rc);
3650                         } else {
3651                                 /**
3652                                  * collect to-be-destroyed sub objects, the
3653                                  * reference would be released after actual
3654                                  * deletion.
3655                                  */
3656                                 sub_objs[k] = dto;
3657                                 k++;
3658                         }
3659                 } /* for each stripe */
3660         } /* for each component in the mirror */
3661 out:
3662         if (!declare) {
3663                 i = 0;
3664                 if (!rc) {
3665                         /* destroy the sub objects */
3666                         for (; i < k; i++) {
3667                                 rc = lod_sub_destroy(env, sub_objs[i], th);
3668                                 if (rc)
3669                                         break;
3670                                 dt_object_put(env, sub_objs[i]);
3671                         }
3672                 }
3673                 /**
3674                  * if a sub object destroy failed, we'd release sub objects
3675                  * reference get from above sub_objs collection.
3676                  */
3677                 for (; i < k; i++)
3678                         dt_object_put(env, sub_objs[i]);
3679
3680                 OBD_FREE_PTR_ARRAY(sub_objs, array_count);
3681         }
3682         mutex_unlock(&lo->ldo_layout_mutex);
3683
3684         RETURN(rc);
3685 }
3686
3687 /**
3688  * Purge layouts, delete sub objects in the mirror stored in the vic_buf,
3689  * and set the LOVEA with the layout from mbuf.
3690  */
3691 static int lod_declare_layout_purge(const struct lu_env *env,
3692                 struct dt_object *dt, const struct lu_buf *buf,
3693                 struct thandle *th)
3694 {
3695         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3696         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3697         int rc;
3698
3699         ENTRY;
3700
3701         if (le32_to_cpu(comp_v1->lcm_magic) != LOV_MAGIC_COMP_V1) {
3702                 CERROR("%s: invalid layout magic %#x != %#x\n",
3703                        lod2obd(d)->obd_name, le32_to_cpu(comp_v1->lcm_magic),
3704                        LOV_MAGIC_COMP_V1);
3705                 RETURN(-EINVAL);
3706         }
3707
3708         if (cpu_to_le32(LOV_MAGIC_COMP_V1) != LOV_MAGIC_COMP_V1)
3709                 lustre_swab_lov_comp_md_v1(comp_v1);
3710
3711         /* from now on, @buf contains cpu endian data */
3712
3713         if (comp_v1->lcm_mirror_count != 0) {
3714                 CERROR("%s: can only purge one mirror from "DFID"\n",
3715                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3716                 RETURN(-EINVAL);
3717         }
3718
3719         /* delcare sub objects deletion in the mirror stored in @buf */
3720         rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, true);
3721         RETURN(rc);
3722 }
3723
3724 /* delete sub objects from the mirror stored in @buf */
3725 static int lod_layout_purge(const struct lu_env *env, struct dt_object *dt,
3726                             const struct lu_buf *buf, struct thandle *th)
3727 {
3728         int rc;
3729
3730         ENTRY;
3731         rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, false);
3732         RETURN(rc);
3733 }
3734
3735 /**
3736  * Implementation of dt_object_operations::do_declare_xattr_set.
3737  *
3738  * \see dt_object_operations::do_declare_xattr_set() in the API description
3739  * for details.
3740  *
3741  * the extension to the API:
3742  *   - declaring LOVEA requests striping creation
3743  *   - LU_XATTR_REPLACE means layout swap
3744  */
3745 static int lod_declare_xattr_set(const struct lu_env *env,
3746                                  struct dt_object *dt,
3747                                  const struct lu_buf *buf,
3748                                  const char *name, int fl,
3749                                  struct thandle *th)
3750 {
3751         struct dt_object *next = dt_object_child(dt);
3752         struct lu_attr   *attr = &lod_env_info(env)->lti_attr;
3753         __u32             mode;
3754         int               rc;
3755         ENTRY;
3756
3757         mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
3758         if ((S_ISREG(mode) || mode == 0) &&
3759             !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE | LU_XATTR_SPLIT |
3760                     LU_XATTR_PURGE)) &&
3761             (strcmp(name, XATTR_NAME_LOV) == 0 ||
3762              strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
3763                 /*
3764                  * this is a request to create object's striping.
3765                  *
3766                  * allow to declare predefined striping on a new (!mode) object
3767                  * which is supposed to be replay of regular file creation
3768                  * (when LOV setting is declared)
3769                  *
3770                  * LU_XATTR_REPLACE is set to indicate a layout swap
3771                  */
3772                 if (dt_object_exists(dt)) {
3773                         rc = dt_attr_get(env, next, attr);
3774                         if (rc)
3775                                 RETURN(rc);
3776                 } else {
3777                         memset(attr, 0, sizeof(*attr));
3778                         attr->la_valid = LA_TYPE | LA_MODE;
3779                         attr->la_mode = S_IFREG;
3780                 }
3781                 rc = lod_declare_striped_create(env, dt, attr, buf, th);
3782         } else if (fl & LU_XATTR_MERGE) {
3783                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3784                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3785                 rc = lod_declare_layout_merge(env, dt, buf, th);
3786         } else if (fl & LU_XATTR_SPLIT) {
3787                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3788                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3789                 rc = lod_declare_layout_split(env, dt, buf, th);
3790         } else if (fl & LU_XATTR_PURGE) {
3791                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3792                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3793                 rc = lod_declare_layout_purge(env, dt, buf, th);
3794         } else if (S_ISREG(mode) &&
3795                    strlen(name) >= sizeof(XATTR_LUSTRE_LOV) + 3 &&
3796                    allowed_lustre_lov(name)) {
3797                 /*
3798                  * this is a request to modify object's striping.
3799                  * add/set/del component(s).
3800                  */
3801                 if (!dt_object_exists(dt))
3802                         RETURN(-ENOENT);
3803
3804                 rc = lod_declare_modify_layout(env, dt, name, buf, th);
3805         } else if (S_ISDIR(mode)) {
3806                 rc = lod_dir_declare_xattr_set(env, dt, buf, name, fl, th);
3807         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
3808                 rc = lod_replace_parent_fid(env, dt, buf, th, true);
3809         } else {
3810                 rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
3811         }
3812
3813         RETURN(rc);
3814 }
3815
3816 /**
3817  * Apply xattr changes to the object.
3818  *
3819  * Applies xattr changes to the object and the stripes if the latter exist.
3820  *
3821  * \param[in] env       execution environment
3822  * \param[in] dt        object
3823  * \param[in] buf       buffer pointing to the new value of xattr
3824  * \param[in] name      name of xattr
3825  * \param[in] fl        flags
3826  * \param[in] th        transaction handle
3827  *
3828  * \retval              0 on success
3829  * \retval              negative if failed
3830  */
3831 static int lod_xattr_set_internal(const struct lu_env *env,
3832                                   struct dt_object *dt,
3833                                   const struct lu_buf *buf,
3834                                   const char *name, int fl,
3835                                   struct thandle *th)
3836 {
3837         struct dt_object        *next = dt_object_child(dt);
3838         struct lod_object       *lo = lod_dt_obj(dt);
3839         int                     rc;
3840         int                     i;
3841         ENTRY;
3842
3843         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
3844         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3845                 RETURN(rc);
3846
3847         /* Note: Do not set LinkEA on sub-stripes, otherwise
3848          * it will confuse the fid2path process(see mdt_path_current()).
3849          * The linkEA between master and sub-stripes is set in
3850          * lod_xattr_set_lmv(). */
3851         if (lo->ldo_dir_stripe_count == 0 || strcmp(name, XATTR_NAME_LINK) == 0)
3852                 RETURN(0);
3853
3854         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3855                 if (!lo->ldo_stripe[i])
3856                         continue;
3857
3858                 if (!dt_object_exists(lo->ldo_stripe[i]))
3859                         continue;
3860
3861                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], buf, name,
3862                                        fl, th);
3863                 if (rc != 0)
3864                         break;
3865         }
3866
3867         RETURN(rc);
3868 }
3869
3870 /**
3871  * Delete an extended attribute.
3872  *
3873  * Deletes specified xattr from the object and the stripes if the latter exist.
3874  *
3875  * \param[in] env       execution environment
3876  * \param[in] dt        object
3877  * \param[in] name      name of xattr
3878  * \param[in] th        transaction handle
3879  *
3880  * \retval              0 on success
3881  * \retval              negative if failed
3882  */
3883 static int lod_xattr_del_internal(const struct lu_env *env,
3884                                   struct dt_object *dt,
3885                                   const char *name, struct thandle *th)
3886 {
3887         struct dt_object *next = dt_object_child(dt);
3888         struct lod_object *lo = lod_dt_obj(dt);
3889         int i;
3890         int rc;
3891
3892         ENTRY;
3893
3894         rc = lod_sub_xattr_del(env, next, name, th);
3895         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3896                 RETURN(rc);
3897
3898         if (lo->ldo_dir_stripe_count == 0)
3899                 RETURN(rc);
3900
3901         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3902                 if (!lo->ldo_stripe[i])
3903                         continue;
3904
3905                 if (!dt_object_exists(lo->ldo_stripe[i]))
3906                         continue;
3907
3908                 rc = lod_sub_xattr_del(env, lo->ldo_stripe[i], name, th);
3909                 if (rc != 0)
3910                         break;
3911         }
3912
3913         RETURN(rc);
3914 }
3915
3916 /**
3917  * Set default striping on a directory.
3918  *
3919  * Sets specified striping on a directory object unless it matches the default
3920  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
3921  * EA. This striping will be used when regular file is being created in this
3922  * directory.
3923  *
3924  * \param[in] env       execution environment
3925  * \param[in] dt        the striped object
3926  * \param[in] buf       buffer with the striping
3927  * \param[in] name      name of EA
3928  * \param[in] fl        xattr flag (see OSD API description)
3929  * \param[in] th        transaction handle
3930  *
3931  * \retval              0 on success
3932  * \retval              negative if failed
3933  */
3934 static int lod_xattr_set_lov_on_dir(const struct lu_env *env,
3935                                     struct dt_object *dt,
3936                                     const struct lu_buf *buf,
3937                                     const char *name, int fl,
3938                                     struct thandle *th)
3939 {
3940         struct lov_user_md_v1   *lum;
3941         struct lov_user_md_v3   *v3 = NULL;
3942         const char              *pool_name = NULL;
3943         int                      rc;
3944         bool                     is_del;
3945         ENTRY;
3946
3947         LASSERT(buf != NULL && buf->lb_buf != NULL);
3948         lum = buf->lb_buf;
3949
3950         switch (lum->lmm_magic) {
3951         case LOV_USER_MAGIC_SPECIFIC:
3952         case LOV_USER_MAGIC_V3:
3953                 v3 = buf->lb_buf;
3954                 if (v3->lmm_pool_name[0] != '\0')
3955                         pool_name = v3->lmm_pool_name;
3956                 fallthrough;
3957         case LOV_USER_MAGIC_V1:
3958                 /* if { size, offset, count } = { 0, -1, 0 } and no pool
3959                  * (i.e. all default values specified) then delete default
3960                  * striping from dir. */
3961                 CDEBUG(D_LAYOUT,
3962                        "set default striping: sz %u # %u offset %d %s %s\n",
3963                        (unsigned)lum->lmm_stripe_size,
3964                        (unsigned)lum->lmm_stripe_count,
3965                        (int)lum->lmm_stripe_offset,
3966                        v3 ? "from" : "", v3 ? v3->lmm_pool_name : "");
3967
3968                 is_del = LOVEA_DELETE_VALUES(lum->lmm_stripe_size,
3969                                              lum->lmm_stripe_count,
3970                                              lum->lmm_stripe_offset,
3971                                              pool_name);
3972                 break;
3973         case LOV_USER_MAGIC_COMP_V1:
3974         {
3975                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lum;
3976                 struct lov_comp_md_entry_v1 *lcme;
3977                 int i, comp_cnt;
3978
3979                 comp_cnt = le16_to_cpu(lcm->lcm_entry_count);
3980                 for (i = 0; i < comp_cnt; i++) {
3981                         lcme = &lcm->lcm_entries[i];
3982                         if (lcme->lcme_flags & cpu_to_le32(LCME_FL_EXTENSION)) {
3983                                 lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_SEL);
3984                                 break;
3985                         }
3986                 }
3987
3988                 is_del = false;
3989                 break;
3990         }
3991         default:
3992                 CERROR("Invalid magic %x\n", lum->lmm_magic);
3993                 RETURN(-EINVAL);
3994         }
3995
3996         if (is_del) {
3997                 rc = lod_xattr_del_internal(env, dt, name, th);
3998                 if (rc == -ENODATA)
3999                         rc = 0;
4000         } else {
4001                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4002         }
4003
4004         RETURN(rc);
4005 }
4006
/*
 * Forward declaration: lod_xattr_set_default_lov_on_dir() calls this
 * function before its definition appears (presumably further down in
 * this file).
 */
static int lod_get_default_lov_striping(const struct lu_env *env,
					struct lod_object *lo,
					struct lod_default_striping *lds,
					struct dt_allocation_hint *ah);
4011
4012 /**
4013  * Helper function to convert compound layout to compound layout with
4014  * pool
4015  *
4016  * Copy lcm_entries array of \a src to \a tgt. Replace lov_user_md_v1
4017  * components of \a src with lov_user_md_v3 using \a pool.
4018  *
4019  * \param[in] src       source layout
4020  * \param[in] pool      pool to use in \a tgt
4021  * \param[out] tgt      target layout
4022  */
4023 static void embed_pool_to_comp_v1(const struct lov_comp_md_v1 *src,
4024                                   const char *pool,
4025                                   struct lov_comp_md_v1 *tgt)
4026 {
4027         size_t shift;
4028         struct lov_user_md_v1 *lum;
4029         struct lov_user_md_v3 *lum3;
4030         struct lov_comp_md_entry_v1 *entry;
4031         int i;
4032         __u32 offset;
4033
4034         entry = tgt->lcm_entries;
4035         shift = 0;
4036         for (i = 0; i < le16_to_cpu(src->lcm_entry_count); i++, entry++) {
4037                 *entry = src->lcm_entries[i];
4038                 offset = le32_to_cpu(src->lcm_entries[i].lcme_offset);
4039                 entry->lcme_offset = cpu_to_le32(offset + shift);
4040
4041                 lum = (struct lov_user_md_v1 *)((char *)src + offset);
4042                 lum3 = (struct lov_user_md_v3 *)((char *)tgt + offset + shift);
4043                 *(struct lov_user_md_v1 *)lum3 = *lum;
4044                 if (lum->lmm_pattern == cpu_to_le32(LOV_PATTERN_MDT)) {
4045                         lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
4046                 } else {
4047                         lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4048                         entry->lcme_size = cpu_to_le32(sizeof(*lum3));
4049                         strlcpy(lum3->lmm_pool_name, pool,
4050                                 sizeof(lum3->lmm_pool_name));
4051                         shift += sizeof(*lum3) - sizeof(*lum);
4052                 }
4053         }
4054 }
4055
4056 /**
4057  * Set default striping on a directory.
4058  *
4059  * Sets specified striping on a directory object unless it matches the default
4060  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4061  * EA. This striping will be used when regular file is being created in this
4062  * directory.
4063  * If current default striping includes a pool but specifed striping
4064  * does not - retain the pool if it exists.
4065  *
4066  * \param[in] env       execution environment
4067  * \param[in] dt        the striped object
4068  * \param[in] buf       buffer with the striping
4069  * \param[in] name      name of EA
4070  * \param[in] fl        xattr flag (see OSD API description)
4071  * \param[in] th        transaction handle
4072  *
4073  * \retval              0 on success
4074  * \retval              negative if failed
4075  */
4076 static int lod_xattr_set_default_lov_on_dir(const struct lu_env *env,
4077                                             struct dt_object *dt,
4078                                             const struct lu_buf *buf,
4079                                             const char *name, int fl,
4080                                             struct thandle *th)
4081 {
4082         struct lod_default_striping     *lds = lod_lds_buf_get(env);
4083         struct lov_user_md_v1           *v1 = buf->lb_buf;
4084         char                             pool[LOV_MAXPOOLNAME + 1];
4085         bool                             is_del;
4086         int                              rc;
4087
4088         ENTRY;
4089
4090         /* get existing striping config */
4091         rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds, NULL);
4092         if (rc)
4093                 RETURN(rc);
4094
4095         memset(pool, 0, sizeof(pool));
4096         if (lds->lds_def_striping_set == 1)
4097                 lod_layout_get_pool(lds->lds_def_comp_entries,
4098                                     lds->lds_def_comp_cnt, pool,
4099                                     sizeof(pool));
4100
4101         is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
4102                                      v1->lmm_stripe_count,
4103                                      v1->lmm_stripe_offset,
4104                                      NULL);
4105
4106         /* Retain the pool name if it is not given */
4107         if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
4108             !is_del) {
4109                 struct lod_thread_info *info = lod_env_info(env);
4110                 struct lov_user_md_v3 *v3  = info->lti_ea_store;
4111
4112                 memset(v3, 0, sizeof(*v3));
4113                 v3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4114                 v3->lmm_pattern = cpu_to_le32(v1->lmm_pattern);
4115                 v3->lmm_stripe_count = cpu_to_le32(v1->lmm_stripe_count);
4116                 v3->lmm_stripe_offset = cpu_to_le32(v1->lmm_stripe_offset);
4117                 v3->lmm_stripe_size = cpu_to_le32(v1->lmm_stripe_size);
4118
4119                 strlcpy(v3->lmm_pool_name, pool, sizeof(v3->lmm_pool_name));
4120
4121                 info->lti_buf.lb_buf = v3;
4122                 info->lti_buf.lb_len = sizeof(*v3);
4123                 rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4124                                               name, fl, th);
4125         } else if (v1->lmm_magic == LOV_USER_MAGIC_COMP_V1 &&
4126                    pool[0] != '\0' && !is_del) {
4127                 /*
4128                  * try to retain the pool from default layout if the
4129                  * specified component layout does not provide pool
4130                  * info explicitly
4131                  */
4132                 struct lod_thread_info *info = lod_env_info(env);
4133                 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
4134                 struct lov_comp_md_v1 *comp_v1p;
4135                 struct lov_user_md_v1 *lum;
4136                 int entry_count;
4137                 int i;
4138                 __u32 offset;
4139                 struct lov_comp_md_entry_v1 *entry;
4140                 int size;
4141
4142                 entry_count = le16_to_cpu(comp_v1->lcm_entry_count);
4143                 size = sizeof(*comp_v1) +
4144                         entry_count * sizeof(comp_v1->lcm_entries[0]);
4145                 entry = comp_v1->lcm_entries;
4146                 for (i = 0; i < entry_count; i++, entry++) {
4147                         offset = le32_to_cpu(entry->lcme_offset);
4148                         lum = (struct lov_user_md_v1 *)((char *)comp_v1 +
4149                                                         offset);
4150                         if (le32_to_cpu(lum->lmm_magic) != LOV_USER_MAGIC_V1)
4151                                 /* the i-th component includes pool info */
4152                                 break;
4153                         if (lum->lmm_pattern == cpu_to_le32(LOV_PATTERN_MDT))
4154                                 size += sizeof(struct lov_user_md_v1);
4155                         else
4156                                 size += sizeof(struct lov_user_md_v3);
4157                 }
4158
4159                 if (i == entry_count) {
4160                         /*
4161                          * re-compose the layout to include the pool for
4162                          * each component
4163                          */
4164                         if (info->lti_ea_store_size < size)
4165                                 rc = lod_ea_store_resize(info, size);
4166
4167                         if (rc == 0) {
4168                                 comp_v1p = info->lti_ea_store;
4169                                 *comp_v1p = *comp_v1;
4170                                 comp_v1p->lcm_size = cpu_to_le32(size);
4171                                 embed_pool_to_comp_v1(comp_v1, pool, comp_v1p);
4172
4173                                 info->lti_buf.lb_buf = comp_v1p;
4174                                 info->lti_buf.lb_len = size;
4175                                 rc = lod_xattr_set_lov_on_dir(env, dt,
4176                                                               &info->lti_buf,
4177                                                               name, fl, th);
4178                         }
4179                 } else {
4180                         rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl,
4181                                                       th);
4182                 }
4183         } else {
4184                 rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl, th);
4185         }
4186
4187         if (lds->lds_def_striping_set == 1 && lds->lds_def_comp_entries != NULL)
4188                 lod_free_def_comp_entries(lds);
4189
4190         RETURN(rc);
4191 }
4192
4193 /**
4194  * Set default striping on a directory object.
4195  *
4196  * Sets specified striping on a directory object unless it matches the default
4197  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4198  * EA. This striping will be used when a new directory is being created in the
4199  * directory.
4200  *
4201  * \param[in] env       execution environment
4202  * \param[in] dt        the striped object
4203  * \param[in] buf       buffer with the striping
4204  * \param[in] name      name of EA
4205  * \param[in] fl        xattr flag (see OSD API description)
4206  * \param[in] th        transaction handle
4207  *
4208  * \retval              0 on success
4209  * \retval              negative if failed
4210  */
4211 static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env,
4212                                             struct dt_object *dt,
4213                                             const struct lu_buf *buf,
4214                                             const char *name, int fl,
4215                                             struct thandle *th)
4216 {
4217         struct lmv_user_md_v1 *lum;
4218         int rc;
4219
4220         ENTRY;
4221
4222         LASSERT(buf != NULL && buf->lb_buf != NULL);
4223         lum = buf->lb_buf;
4224
4225         CDEBUG(D_INFO,
4226                "set default stripe_count # %u stripe_offset %d hash %u\n",
4227               le32_to_cpu(lum->lum_stripe_count),
4228               (int)le32_to_cpu(lum->lum_stripe_offset),
4229               le32_to_cpu(lum->lum_hash_type));
4230
4231         if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)),
4232                                  le32_to_cpu(lum->lum_stripe_offset)) &&
4233             le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) {
4234                 rc = lod_xattr_del_internal(env, dt, name, th);
4235                 if (rc == -ENODATA)
4236                         rc = 0;
4237         } else {
4238                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4239                 if (rc != 0)
4240                         RETURN(rc);
4241         }
4242
4243         RETURN(rc);
4244 }
4245
4246 /**
4247  * Turn directory into a striped directory.
4248  *
4249  * During replay the client sends the striping created before MDT
4250  * failure, then the layer above LOD sends this defined striping
4251  * using ->do_xattr_set(), so LOD uses this method to replay creation
4252  * of the stripes. Notice the original information for the striping
4253  * (#stripes, FIDs, etc) was transferred in declare path.
4254  *
4255  * \param[in] env       execution environment
4256  * \param[in] dt        the striped object
4257  * \param[in] buf       not used currently
4258  * \param[in] name      not used currently
4259  * \param[in] fl        xattr flag (see OSD API description)
4260  * \param[in] th        transaction handle
4261  *
4262  * \retval              0 on success
4263  * \retval              negative if failed
4264  */
4265 static int lod_xattr_set_lmv(const struct lu_env *env, struct dt_object *dt,
4266                              const struct lu_buf *buf, const char *name,
4267                              int fl, struct thandle *th)
4268 {
4269         struct lod_object       *lo = lod_dt_obj(dt);
4270         struct lod_thread_info  *info = lod_env_info(env);
4271         struct lu_attr          *attr = &info->lti_attr;
4272         struct dt_object_format *dof = &info->lti_format;
4273         struct lu_buf           lmv_buf;
4274         struct lu_buf           slave_lmv_buf;
4275         struct lmv_mds_md_v1    *lmm;
4276         struct lmv_mds_md_v1    *slave_lmm = NULL;
4277         struct dt_insert_rec    *rec = &info->lti_dt_rec;
4278         int                     i;
4279         int                     rc;
4280         ENTRY;
4281
4282         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
4283                 RETURN(-ENOTDIR);
4284
4285         /* The stripes are supposed to be allocated in declare phase,
4286          * if there are no stripes being allocated, it will skip */
4287         if (lo->ldo_dir_stripe_count == 0) {
4288                 if (lo->ldo_is_foreign) {
4289                         rc = lod_sub_xattr_set(env, dt_object_child(dt), buf,
4290                                                XATTR_NAME_LMV, fl, th);
4291                         if (rc != 0)
4292                                 RETURN(rc);
4293                 }
4294                 RETURN(0);
4295         }
4296
4297         rc = dt_attr_get(env, dt_object_child(dt), attr);
4298         if (rc != 0)
4299                 RETURN(rc);
4300
4301         attr->la_valid = LA_ATIME | LA_MTIME | LA_CTIME | LA_FLAGS |
4302                          LA_MODE | LA_UID | LA_GID | LA_TYPE | LA_PROJID;
4303         dof->dof_type = DFT_DIR;
4304
4305         rc = lod_prep_lmv_md(env, dt, &lmv_buf);
4306         if (rc != 0)
4307                 RETURN(rc);
4308         lmm = lmv_buf.lb_buf;
4309
4310         OBD_ALLOC_PTR(slave_lmm);
4311         if (slave_lmm == NULL)
4312                 RETURN(-ENOMEM);
4313
4314         lod_prep_slave_lmv_md(slave_lmm, lmm);
4315         slave_lmv_buf.lb_buf = slave_lmm;
4316         slave_lmv_buf.lb_len = sizeof(*slave_lmm);
4317
4318         rec->rec_type = S_IFDIR;
4319         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
4320                 struct dt_object *dto = lo->ldo_stripe[i];
4321                 char *stripe_name = info->lti_key;
4322                 struct lu_name *sname;
4323                 struct linkea_data ldata = { NULL };
4324                 struct lu_buf linkea_buf;
4325
4326                 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
4327                 if (!dto)
4328                         continue;
4329
4330                 /* fail a remote stripe creation */
4331                 if (i && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_CREATE))
4332                         continue;
4333
4334                 /* don't create stripe if:
4335                  * 1. it's source stripe of migrating directory
4336                  * 2. it's existed stripe of splitting directory
4337                  */
4338                 if ((lod_is_migrating(lo) && i >= lo->ldo_dir_migrate_offset) ||
4339                     (lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
4340                         if (!dt_object_exists(dto))
4341                                 GOTO(out, rc = -EINVAL);
4342                 } else {
4343                         dt_write_lock(env, dto, DT_TGT_CHILD);
4344                         rc = lod_sub_create(env, dto, attr, NULL, dof, th);
4345                         if (rc != 0) {
4346                                 dt_write_unlock(env, dto);
4347                                 GOTO(out, rc);
4348                         }
4349
4350                         rc = lod_sub_ref_add(env, dto, th);
4351                         dt_write_unlock(env, dto);
4352                         if (rc != 0)
4353                                 GOTO(out, rc);
4354
4355                         rec->rec_fid = lu_object_fid(&dto->do_lu);
4356                         rc = lod_sub_insert(env, dto,
4357                                             (const struct dt_rec *)rec,
4358                                             (const struct dt_key *)dot, th);
4359                         if (rc != 0)
4360                                 GOTO(out, rc);
4361                 }
4362
4363                 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
4364                     cfs_fail_val != i) {
4365                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
4366                             cfs_fail_val == i)
4367                                 slave_lmm->lmv_master_mdt_index =
4368                                                         cpu_to_le32(i + 1);
4369                         else
4370                                 slave_lmm->lmv_master_mdt_index =
4371                                                         cpu_to_le32(i);
4372
4373                         rc = lod_sub_xattr_set(env, dto, &slave_lmv_buf,
4374                                                XATTR_NAME_LMV, 0, th);
4375                         if (rc != 0)
4376                                 GOTO(out, rc);
4377                 }
4378
4379                 /* don't insert stripe if it's existed stripe of splitting
4380                  * directory (this directory is striped).
4381                  * NB, plain directory will insert itself as the first
4382                  * stripe in target.
4383                  */
4384                 if (lod_is_splitting(lo) && lo->ldo_dir_split_offset > 1 &&
4385                     lo->ldo_dir_split_offset > i)
4386                         continue;
4387
4388                 rec->rec_fid = lu_object_fid(&dt->do_lu);
4389                 rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
4390                                     (const struct dt_key *)dotdot, th);
4391                 if (rc != 0)
4392                         GOTO(out, rc);
4393
4394                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
4395                     cfs_fail_val == i)
4396                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
4397                                  PFID(lu_object_fid(&dto->do_lu)), i + 1);
4398                 else
4399                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
4400                                  PFID(lu_object_fid(&dto->do_lu)), i);
4401
4402                 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
4403                 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
4404                                       sname, lu_object_fid(&dt->do_lu));
4405                 if (rc != 0)
4406                         GOTO(out, rc);
4407
4408                 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
4409                 linkea_buf.lb_len = ldata.ld_leh->leh_len;
4410                 rc = lod_sub_xattr_set(env, dto, &linkea_buf,
4411                                        XATTR_NAME_LINK, 0, th);
4412                 if (rc != 0)
4413                         GOTO(out, rc);
4414
4415                 rec->rec_fid = lu_object_fid(&dto->do_lu);
4416                 rc = lod_sub_insert(env, dt_object_child(dt),
4417                                     (const struct dt_rec *)rec,
4418                                     (const struct dt_key *)stripe_name, th);
4419                 if (rc != 0)
4420                         GOTO(out, rc);
4421
4422                 rc = lod_sub_ref_add(env, dt_object_child(dt), th);
4423                 if (rc != 0)
4424                         GOTO(out, rc);
4425         }
4426
4427         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MASTER_LMV))
4428                 rc = lod_sub_xattr_set(env, dt_object_child(dt),
4429                                        &lmv_buf, XATTR_NAME_LMV, fl, th);
4430 out:
4431         if (slave_lmm != NULL)
4432                 OBD_FREE_PTR(slave_lmm);
4433
4434         RETURN(rc);
4435 }
4436
4437 /**
4438  * Helper function to declare/execute creation of a striped directory
4439  *
4440  * Called in declare/create object path, prepare striping for a directory
4441  * and prepare defaults data striping for the objects to be created in
4442  * that directory. Notice the function calls "declaration" or "execution"
4443  * methods depending on \a declare param. This is a consequence of the
4444  * current approach while we don't have natural distributed transactions:
4445  * we basically execute non-local updates in the declare phase. So, the
4446  * arguments for the both phases are the same and this is the reason for
4447  * this function to exist.
4448  *
4449  * \param[in] env       execution environment
4450  * \param[in] dt        object
4451  * \param[in] attr      attributes the stripes will be created with
4452  * \param[in] lmu       lmv_user_md if MDT indices are specified
4453  * \param[in] dof       format of stripes (see OSD API description)
4454  * \param[in] th        transaction handle
4455  * \param[in] declare   where to call "declare" or "execute" methods
4456  *
4457  * \retval              0 on success
4458  * \retval              negative if failed
4459  */
4460 static int lod_dir_striping_create_internal(const struct lu_env *env,
4461                                             struct dt_object *dt,
4462                                             struct lu_attr *attr,
4463                                             const struct lu_buf *lmu,
4464                                             struct dt_object_format *dof,
4465                                             struct thandle *th,
4466                                             bool declare)
4467 {
4468         struct lod_thread_info *info = lod_env_info(env);
4469         struct lod_object *lo = lod_dt_obj(dt);
4470         const struct lod_default_striping *lds = lo->ldo_def_striping;
4471         int rc;
4472         ENTRY;
4473
4474         LASSERT(ergo(lds != NULL,
4475                      lds->lds_def_striping_set ||
4476                      lds->lds_dir_def_striping_set));
4477
4478         if (!LMVEA_DELETE_VALUES(lo->ldo_dir_stripe_count,
4479                                  lo->ldo_dir_stripe_offset)) {
4480                 if (!lmu) {
4481                         struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4482                         int stripe_count = lo->ldo_dir_stripe_count;
4483
4484                         if (info->lti_ea_store_size < sizeof(*v1)) {
4485                                 rc = lod_ea_store_resize(info, sizeof(*v1));
4486                                 if (rc != 0)
4487                                         RETURN(rc);
4488                                 v1 = info->lti_ea_store;
4489                         }
4490
4491                         memset(v1, 0, sizeof(*v1));
4492                         v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4493                         v1->lum_stripe_count = cpu_to_le32(stripe_count);
4494                         v1->lum_stripe_offset =
4495                                         cpu_to_le32(lo->ldo_dir_stripe_offset);
4496
4497                         info->lti_buf.lb_buf = v1;
4498                         info->lti_buf.lb_len = sizeof(*v1);
4499                         lmu = &info->lti_buf;
4500                 }
4501
4502                 if (declare)
4503                         rc = lod_declare_xattr_set_lmv(env, dt, attr, lmu, dof,
4504                                                        th);
4505                 else
4506                         rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
4507                                                th);
4508                 if (rc != 0)
4509                         RETURN(rc);
4510         } else {
4511                 /* foreign LMV EA case */
4512                 if (lmu) {
4513                         struct lmv_foreign_md *lfm = lmu->lb_buf;
4514
4515                         if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) {
4516                                 rc = lod_declare_xattr_set_lmv(env, dt, attr,
4517                                                                lmu, dof, th);
4518                         }
4519                 } else {
4520                         if (lo->ldo_is_foreign) {
4521                                 LASSERT(lo->ldo_foreign_lmv != NULL &&
4522                                         lo->ldo_foreign_lmv_size > 0);
4523                                 info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
4524                                 info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
4525                                 lmu = &info->lti_buf;
4526                                 rc = lod_xattr_set_lmv(env, dt, lmu,
4527                                                        XATTR_NAME_LMV, 0, th);
4528                         }
4529                 }
4530         }
4531
4532         /* Transfer default LMV striping from the parent */
4533         if (lds != NULL && lds->lds_dir_def_striping_set &&
4534             lds->lds_dir_def_max_inherit != LMV_INHERIT_END &&
4535             lds->lds_dir_def_max_inherit != LMV_INHERIT_NONE &&
4536             !(LMVEA_DELETE_VALUES(lds->lds_dir_def_stripe_count,
4537                                  lds->lds_dir_def_stripe_offset) &&
4538               le32_to_cpu(lds->lds_dir_def_hash_type) !=
4539               LMV_HASH_TYPE_UNKNOWN)) {
4540                 struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4541
4542                 if (info->lti_ea_store_size < sizeof(*v1)) {
4543                         rc = lod_ea_store_resize(info, sizeof(*v1));
4544                         if (rc != 0)
4545                                 RETURN(rc);
4546                         v1 = info->lti_ea_store;
4547                 }
4548
4549                 memset(v1, 0, sizeof(*v1));
4550                 v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4551                 v1->lum_stripe_count =
4552                         cpu_to_le32(lds->lds_dir_def_stripe_count);
4553                 v1->lum_stripe_offset =
4554                         cpu_to_le32(lds->lds_dir_def_stripe_offset);
4555                 v1->lum_hash_type =
4556                         cpu_to_le32(lds->lds_dir_def_hash_type);
4557                 v1->lum_max_inherit =
4558                         lmv_inherit_next(lds->lds_dir_def_max_inherit);
4559                 v1->lum_max_inherit_rr =
4560                         lmv_inherit_rr_next(lds->lds_dir_def_max_inherit_rr);
4561
4562                 info->lti_buf.lb_buf = v1;
4563                 info->lti_buf.lb_len = sizeof(*v1);
4564                 if (declare)
4565                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4566                                                        XATTR_NAME_DEFAULT_LMV,
4567                                                        0, th);
4568                 else
4569                         rc = lod_xattr_set_default_lmv_on_dir(env, dt,
4570                                                   &info->lti_buf,
4571                                                   XATTR_NAME_DEFAULT_LMV, 0,
4572                                                   th);
4573                 if (rc != 0)
4574                         RETURN(rc);
4575         }
4576
4577         /* Transfer default LOV striping from the parent */
4578         if (lds != NULL && lds->lds_def_striping_set &&
4579             lds->lds_def_comp_cnt != 0) {
4580                 struct lov_mds_md *lmm;
4581                 int lmm_size = lod_comp_md_size(lo, true);
4582
4583                 if (info->lti_ea_store_size < lmm_size) {
4584                         rc = lod_ea_store_resize(info, lmm_size);
4585                         if (rc != 0)
4586                                 RETURN(rc);
4587                 }
4588                 lmm = info->lti_ea_store;
4589
4590                 rc = lod_generate_lovea(env, lo, lmm, &lmm_size, true);
4591                 if (rc != 0)
4592                         RETURN(rc);
4593
4594                 info->lti_buf.lb_buf = lmm;
4595                 info->lti_buf.lb_len = lmm_size;
4596
4597                 if (declare)
4598                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4599                                                        XATTR_NAME_LOV, 0, th);
4600                 else
4601                         rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4602                                                       XATTR_NAME_LOV, 0, th);
4603                 if (rc != 0)
4604                         RETURN(rc);
4605         }
4606
4607         /* ldo_def_striping is not allocated, clear after use, in case directory
4608          * layout is changed later.
4609          */
4610         if (!declare)
4611                 lo->ldo_def_striping = NULL;
4612
4613         RETURN(0);
4614 }
4615
/**
 * Declare phase of directory striping creation.
 *
 * Forwards all arguments unchanged to lod_dir_striping_create_internal()
 * with its trailing "declare" argument set to true.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] dt	directory object
 * \param[in] attr	attributes handed through to the internal helper
 * \param[in] lmu	buffer handed through to the internal helper
 * \param[in] dof	object format handed through to the internal helper
 * \param[in] th	transaction handle
 *
 * \retval		whatever lod_dir_striping_create_internal() returns
 */
static int lod_declare_dir_striping_create(const struct lu_env *env,
					   struct dt_object *dt,
					   struct lu_attr *attr,
					   struct lu_buf *lmu,
					   struct dt_object_format *dof,
					   struct thandle *th)
{
	int rc;

	rc = lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
					      true);
	return rc;
}
4626
/**
 * Execution phase of directory striping creation.
 *
 * Forwards to lod_dir_striping_create_internal() with no user buffer (NULL)
 * and with its trailing "declare" argument set to false.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] dt	directory object
 * \param[in] attr	attributes handed through to the internal helper
 * \param[in] dof	object format handed through to the internal helper
 * \param[in] th	transaction handle
 *
 * \retval		whatever lod_dir_striping_create_internal() returns
 */
static int lod_dir_striping_create(const struct lu_env *env,
				   struct dt_object *dt,
				   struct lu_attr *attr,
				   struct dt_object_format *dof,
				   struct thandle *th)
{
	int rc;

	rc = lod_dir_striping_create_internal(env, dt, attr, NULL, dof, th,
					      false);
	return rc;
}
4636
4637 /**
4638  * Make LOV EA for striped object.
4639  *
4640  * Generate striping information and store it in the LOV EA of the given
4641  * object. The caller must ensure nobody else is calling the function
4642  * against the object concurrently. The transaction must be started.
4643  * FLDB service must be running as well; it's used to map FID to the target,
4644  * which is stored in LOV EA.
4645  *
4646  * \param[in] env               execution environment for this thread
4647  * \param[in] lo                LOD object
4648  * \param[in] th                transaction handle
4649  *
4650  * \retval                      0 if LOV EA is stored successfully
4651  * \retval                      negative error number on failure
4652  */
4653 static int lod_generate_and_set_lovea(const struct lu_env *env,
4654                                       struct lod_object *lo,
4655                                       struct thandle *th)
4656 {
4657         struct lod_thread_info  *info = lod_env_info(env);
4658         struct dt_object        *next = dt_object_child(&lo->ldo_obj);
4659         struct lov_mds_md_v1    *lmm;
4660         int                      rc, lmm_size;
4661         ENTRY;
4662
4663         LASSERT(lo);
4664
4665         if (lo->ldo_comp_cnt == 0 && !lo->ldo_is_foreign) {
4666                 lod_striping_free_nolock(env, lo);
4667                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
4668                 RETURN(rc);
4669         }
4670
4671         lmm_size = lod_comp_md_size(lo, false);
4672         if (info->lti_ea_store_size < lmm_size) {
4673                 rc = lod_ea_store_resize(info, lmm_size);
4674                 if (rc)
4675                         RETURN(rc);
4676         }
4677         lmm = info->lti_ea_store;
4678
4679         rc = lod_generate_lovea(env, lo, lmm, &lmm_size, false);
4680         if (rc)
4681                 RETURN(rc);
4682
4683         info->lti_buf.lb_buf = lmm;
4684         info->lti_buf.lb_len = lmm_size;
4685         rc = lod_sub_xattr_set(env, next, &info->lti_buf,
4686                                XATTR_NAME_LOV, 0, th);
4687         RETURN(rc);
4688 }
4689
4690 static __u32 lod_gen_component_id(struct lod_object *lo,
4691                                   int mirror_id, int comp_idx);
4692
4693 /**
4694  * Repeat an existing component
4695  *
4696  * Creates a new layout by replicating an existing component.  Uses striping
4697  * policy from previous component as a template for the striping for the new
4698  * new component.
4699  *
4700  * New component starts with zero length, will be extended (or removed) before
4701  * returning layout to client.
4702  *
4703  * NB: Reallocates layout components array (lo->ldo_comp_entries), invalidating
4704  * any pre-existing pointers to components.  Handle with care.
4705  *
4706  * \param[in] env       execution environment for this thread
4707  * \param[in,out] lo    object to update the layout of
4708  * \param[in] index     index of component to copy
4709  *
4710  * \retval      0 on success
4711  * \retval      negative errno on error
4712  */
4713 static int lod_layout_repeat_comp(const struct lu_env *env,
4714                                   struct lod_object *lo, int index)
4715 {
4716         struct lod_layout_component *lod_comp;
4717         struct lod_layout_component *new_comp = NULL;
4718         struct lod_layout_component *comp_array;
4719         int rc = 0, i, new_cnt = lo->ldo_comp_cnt + 1;
4720         __u16 mirror_id;
4721         int offset = 0;
4722         ENTRY;
4723
4724         lod_comp = &lo->ldo_comp_entries[index];
4725         LASSERT(lod_comp_inited(lod_comp) && lod_comp->llc_id != LCME_ID_INVAL);
4726
4727         CDEBUG(D_LAYOUT, "repeating component %d\n", index);
4728
4729         OBD_ALLOC_PTR_ARRAY(comp_array, new_cnt);
4730         if (comp_array == NULL)
4731                 GOTO(out, rc = -ENOMEM);
4732
4733         for (i = 0; i < lo->ldo_comp_cnt; i++) {
4734                 memcpy(&comp_array[i + offset], &lo->ldo_comp_entries[i],
4735                        sizeof(*comp_array));
4736
4737                 /* Duplicate this component in to the next slot */
4738                 if (i == index) {
4739                         new_comp = &comp_array[i + 1];
4740                         memcpy(&comp_array[i + 1], &lo->ldo_comp_entries[i],
4741                                sizeof(*comp_array));
4742                         /* We must now skip this new component when copying */
4743                         offset = 1;
4744                 }
4745         }
4746
4747         /* Set up copied component */
4748         new_comp->llc_flags &= ~LCME_FL_INIT;
4749         new_comp->llc_stripe = NULL;
4750         new_comp->llc_stripes_allocated = 0;
4751         new_comp->llc_ost_indices = NULL;
4752         new_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
4753         /* for uninstantiated components, layout gen stores default stripe
4754          * offset */
4755         new_comp->llc_layout_gen = lod_comp->llc_stripe_offset;
4756         /* This makes the repeated component zero-length, placed at the end of
4757          * the preceding component */
4758         new_comp->llc_extent.e_start = new_comp->llc_extent.e_end;
4759         new_comp->llc_timestamp = lod_comp->llc_timestamp;
4760         new_comp->llc_pool = NULL;
4761
4762         rc = lod_set_pool(&new_comp->llc_pool, lod_comp->llc_pool);
4763         if (rc)
4764                 GOTO(out, rc);
4765
4766         if (new_comp->llc_ostlist.op_array) {
4767                 __u32 *op_array = NULL;
4768
4769                 OBD_ALLOC(op_array, new_comp->llc_ostlist.op_size);
4770                 if (!op_array)
4771                         GOTO(out, rc = -ENOMEM);
4772                 memcpy(op_array, &new_comp->llc_ostlist.op_array,
4773                        new_comp->llc_ostlist.op_size);
4774                 new_comp->llc_ostlist.op_array = op_array;
4775         }
4776
4777         OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
4778         lo->ldo_comp_entries = comp_array;
4779         lo->ldo_comp_cnt = new_cnt;
4780
4781         /* Generate an id for the new component */
4782         mirror_id = mirror_id_of(new_comp->llc_id);
4783         new_comp->llc_id = LCME_ID_INVAL;
4784         new_comp->llc_id = lod_gen_component_id(lo, mirror_id, index + 1);
4785         if (new_comp->llc_id == LCME_ID_INVAL)
4786                 GOTO(out, rc = -ERANGE);
4787
4788         EXIT;
4789 out:
4790         if (rc)
4791                 OBD_FREE_PTR_ARRAY(comp_array, new_cnt);
4792
4793         return rc;
4794 }
4795
4796 static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
4797 {
4798         ENTRY;
4799
4800         /* clear memory region that will be used for layout change */
4801         memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
4802         info->lti_count = 0;
4803
4804         if (info->lti_comp_size >= comp_cnt)
4805                 RETURN(0);
4806
4807         if (info->lti_comp_size > 0) {
4808                 OBD_FREE_PTR_ARRAY(info->lti_comp_idx, info->lti_comp_size);
4809                 info->lti_comp_size = 0;
4810         }
4811
4812         OBD_ALLOC_PTR_ARRAY(info->lti_comp_idx, comp_cnt);
4813         if (!info->lti_comp_idx)
4814                 RETURN(-ENOMEM);
4815
4816         info->lti_comp_size = comp_cnt;
4817         RETURN(0);
4818 }
4819
4820 /**
4821  * Prepare new layout minus deleted components
4822  *
4823  * Removes components marked for deletion (LCME_ID_INVAL) by copying to a new
4824  * layout and skipping those components.  Removes stripe objects if any exist.
4825  *
4826  * NB:
4827  * Reallocates layout components array (lo->ldo_comp_entries), invalidating
4828  * any pre-existing pointers to components.
4829  *
4830  * Caller is responsible for updating mirror end (ldo_mirror[].lme_end).
4831  *
4832  * \param[in] env       execution environment for this thread
4833  * \param[in,out] lo    object to update the layout of
4834  * \param[in] th        transaction handle for this operation
4835  *
4836  * \retval      # of components deleted
4837  * \retval      negative errno on error
4838  */
4839 static int lod_layout_del_prep_layout(const struct lu_env *env,
4840                                       struct lod_object *lo,
4841                                       struct thandle *th)
4842 {
4843         struct lod_layout_component     *lod_comp;
4844         struct lod_thread_info  *info = lod_env_info(env);
4845         int rc = 0, i, j, deleted = 0;
4846
4847         ENTRY;
4848
4849         LASSERT(lo->ldo_is_composite);
4850         LASSERT(lo->ldo_comp_cnt > 0 && lo->ldo_comp_entries != NULL);
4851
4852         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
4853         if (rc)
4854                 RETURN(rc);
4855
4856         for (i = 0; i < lo->ldo_comp_cnt; i++) {
4857                 lod_comp = &lo->ldo_comp_entries[i];
4858
4859                 if (lod_comp->llc_id != LCME_ID_INVAL) {
4860                         /* Build array of things to keep */
4861                         info->lti_comp_idx[info->lti_count++] = i;
4862                         continue;
4863                 }
4864
4865                 lod_obj_set_pool(lo, i, NULL);
4866                 if (lod_comp->llc_ostlist.op_array) {
4867                         OBD_FREE(lod_comp->llc_ostlist.op_array,
4868                                  lod_comp->llc_ostlist.op_size);
4869                         lod_comp->llc_ostlist.op_array = NULL;
4870                         lod_comp->llc_ostlist.op_size = 0;
4871                 }
4872
4873                 deleted++;
4874                 CDEBUG(D_LAYOUT, "deleting comp %d, left %d\n", i,
4875                        lo->ldo_comp_cnt - deleted);
4876
4877                 /* No striping info for this component */
4878                 if (lod_comp->llc_stripe == NULL)
4879                         continue;
4880
4881                 LASSERT(lod_comp->llc_stripe_count > 0);
4882                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
4883                         struct dt_object *obj = lod_comp->llc_stripe[j];
4884
4885                         if (obj == NULL)
4886                                 continue;
4887
4888                         /* components which are not init have no sub objects
4889                          * to destroy */
4890                         if (lod_comp_inited(lod_comp)) {
4891                                 rc = lod_sub_destroy(env, obj, th);
4892                                 if (rc)
4893                                         GOTO(out, rc);
4894                         }
4895
4896                         lu_object_put(env, &obj->do_lu);
4897                         lod_comp->llc_stripe[j] = NULL;
4898                 }
4899                 OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
4900                                    lod_comp->llc_stripes_allocated);
4901                 lod_comp->llc_stripe = NULL;
4902                 OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
4903                                    lod_comp->llc_stripes_allocated);
4904                 lod_comp->llc_ost_indices = NULL;
4905                 lod_comp->llc_stripes_allocated = 0;
4906         }
4907
4908         /* info->lti_count has the amount of left components */
4909         LASSERTF(info->lti_count >= 0 && info->lti_count < lo->ldo_comp_cnt,
4910                  "left = %d, lo->ldo_comp_cnt %d\n", (int)info->lti_count,
4911                  (int)lo->ldo_comp_cnt);
4912
4913         if (info->lti_count > 0) {
4914                 struct lod_layout_component *comp_array;
4915
4916                 OBD_ALLOC_PTR_ARRAY(comp_array, info->lti_count);
4917                 if (comp_array == NULL)
4918                         GOTO(out, rc = -ENOMEM);
4919
4920                 for (i = 0; i < info->lti_count; i++) {
4921                         memcpy(&comp_array[i],
4922                                &lo->ldo_comp_entries[info->lti_comp_idx[i]],
4923                                sizeof(*comp_array));
4924                 }
4925
4926                 OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
4927                 lo->ldo_comp_entries = comp_array;
4928                 lo->ldo_comp_cnt = info->lti_count;
4929         } else {
4930                 lod_free_comp_entries(lo);
4931         }
4932
4933         EXIT;
4934 out:
4935         return rc ? rc : deleted;
4936 }
4937
4938 /**
4939  * Delete layout component(s)
4940  *
4941  * This function sets up the layout data in the env and does the setattrs
4942  * required to write out the new layout.  The layout itself is modified in
4943  * lod_layout_del_prep_layout.
4944  *
4945  * \param[in] env       execution environment for this thread
4946  * \param[in] dt        object
4947  * \param[in] th        transaction handle
4948  *
4949  * \retval      0 on success
4950  * \retval      negative error number on failure
4951  */
4952 static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
4953                           struct thandle *th)
4954 {
4955         struct lod_object *lo = lod_dt_obj(dt);
4956         struct dt_object *next = dt_object_child(dt);
4957         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
4958         int rc;
4959
4960         LASSERT(lo->ldo_mirror_count == 1);
4961
4962         mutex_lock(&lo->ldo_layout_mutex);
4963
4964         rc = lod_layout_del_prep_layout(env, lo, th);
4965         if (rc < 0)
4966                 GOTO(out, rc);
4967
4968         /* Only do this if we didn't delete all components */
4969         if (lo->ldo_comp_cnt > 0) {
4970                 lo->ldo_mirrors[0].lme_end = lo->ldo_comp_cnt - 1;
4971                 lod_obj_inc_layout_gen(lo);
4972         }
4973
4974         LASSERT(dt_object_exists(dt));
4975         rc = dt_attr_get(env, next, attr);
4976         if (rc)
4977                 GOTO(out, rc);
4978
4979         if (attr->la_size > 0) {
4980                 attr->la_size = 0;
4981                 attr->la_valid = LA_SIZE;
4982                 rc = lod_sub_attr_set(env, next, attr, th);
4983                 if (rc)
4984                         GOTO(out, rc);
4985         }
4986
4987         rc = lod_generate_and_set_lovea(env, lo, th);
4988         EXIT;
4989 out:
4990         if (rc)
4991                 lod_striping_free_nolock(env, lo);
4992
4993         mutex_unlock(&lo->ldo_layout_mutex);
4994
4995         return rc;
4996 }
4997
4998
4999 /**
5000  * Implementation of dt_object_operations::do_xattr_set.
5001  *
5002  * Sets specified extended attribute on the object. Three types of EAs are
5003  * special:
5004  *   LOV EA - stores striping for a regular file or default striping (when set
5005  *            on a directory)
5006  *   LMV EA - stores a marker for the striped directories
5007  *   DMV EA - stores default directory striping
5008  *
5009  * When striping is applied to a non-striped existing object (this is called
5010  * late striping), then LOD notices the caller wants to turn the object into a
5011  * striped one. The stripe objects are created and appropriate EA is set:
5012  * LOV EA storing all the stripes directly or LMV EA storing just a small header
5013  * with striping configuration.
5014  *
5015  * \see dt_object_operations::do_xattr_set() in the API description for details.
5016  */
5017 static int lod_xattr_set(const struct lu_env *env,
5018                          struct dt_object *dt, const struct lu_buf *buf,
5019                          const char *name, int fl, struct thandle *th)
5020 {
5021         struct dt_object *next = dt_object_child(dt);
5022         struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
5023         struct lod_object *lo = lod_dt_obj(dt);
5024         struct lod_obj_stripe_cb_data data = { {0} };
5025         int rc = 0;
5026
5027         ENTRY;
5028
5029         if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5030             !strcmp(name, XATTR_NAME_LMV)) {
5031                 switch (fl) {
5032                 case LU_XATTR_CREATE:
5033                         rc = lod_dir_striping_create(env, dt, NULL, NULL, th);
5034                         break;
5035                 case 0:
5036                 case LU_XATTR_REPLACE:
5037                         rc = lod_dir_layout_set(env, dt, buf, fl, th);
5038                         break;
5039                 default:
5040                         LBUG();
5041                 }
5042
5043                 RETURN(rc);
5044         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5045                    strcmp(name, XATTR_NAME_LOV) == 0) {
5046                 rc = lod_xattr_set_default_lov_on_dir(env, dt, buf, name, fl,
5047                                                       th);
5048                 RETURN(rc);
5049         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5050                    strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
5051                 /* default LMVEA */
5052                 rc = lod_xattr_set_default_lmv_on_dir(env, dt, buf, name, fl,
5053                                                       th);
5054                 RETURN(rc);
5055         } else if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
5056                    (strcmp(name, XATTR_NAME_LOV) == 0 ||
5057                     strcmp(name, XATTR_LUSTRE_LOV) == 0 ||
5058                     allowed_lustre_lov(name))) {
5059                 /* in case of lov EA swap, just set it
5060                  * if not, it is a replay so check striping match what we
5061                  * already have during req replay, declare_xattr_set()
5062                  * defines striping, then create() does the work */
5063                 if (fl & LU_XATTR_REPLACE) {
5064                         /* free stripes, then update disk */
5065                         lod_striping_free(env, lod_dt_obj(dt));
5066
5067                         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
5068                 } else if (fl & LU_XATTR_SPLIT) {
5069                         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
5070                         if (rc)
5071                                 RETURN(rc);
5072
5073                         rc = lod_striping_reload(env, lo, buf, LVF_ALL_STALE);
5074                         if (rc)
5075                                 RETURN(rc);
5076
5077                         if (lo->ldo_mirror_count > 1 &&
5078                             layout_attr->la_valid & LA_LAYOUT_VERSION) {
5079                                 /* mirror split */
5080                                 layout_attr->la_layout_version =
5081                                                 lo->ldo_layout_gen;
5082                                 data.locd_attr = layout_attr;
5083                                 data.locd_declare = false;
5084                                 data.locd_stripe_cb =
5085                                                 lod_obj_stripe_attr_set_cb;
5086                                 rc = lod_obj_for_each_stripe(env, lo, th,
5087                                                              &data);
5088                                 if (rc)
5089                                         RETURN(rc);
5090                         }
5091                 } else if (fl & LU_XATTR_PURGE) {
5092                         rc = lod_layout_purge(env, dt, buf, th);
5093                 } else if (dt_object_remote(dt)) {
5094                         /* This only happens during migration, see
5095                          * mdd_migrate_create(), in which Master MDT will
5096                          * create a remote target object, and only set
5097                          * (migrating) stripe EA on the remote object,
5098                          * and does not need creating each stripes. */
5099                         rc = lod_sub_xattr_set(env, next, buf, name,
5100                                                       fl, th);
5101                 } else if (strcmp(name, XATTR_LUSTRE_LOV".del") == 0) {
5102                         /* delete component(s) */
5103                         LASSERT(lod_dt_obj(dt)->ldo_comp_cached);
5104                         rc = lod_layout_del(env, dt, th);
5105                 } else {
5106                         /*
5107                          * When 'name' is XATTR_LUSTRE_LOV or XATTR_NAME_LOV,
5108                          * it's going to create create file with specified
5109                          * component(s), the striping must have not being
5110                          * cached in this case;
5111                          *
5112                          * Otherwise, it's going to add/change component(s) to
5113                          * an existing file, the striping must have been cached
5114                          * in this case.
5115                          */
5116                         LASSERT(equi(!strcmp(name, XATTR_LUSTRE_LOV) ||
5117                                      !strcmp(name, XATTR_NAME_LOV),
5118                                 !lod_dt_obj(dt)->ldo_comp_cached));
5119
5120                         rc = lod_striped_create(env, dt, NULL, NULL, th);
5121                         if (rc)
5122                                 RETURN(rc);
5123
5124                         if (fl & LU_XATTR_MERGE && lo->ldo_mirror_count > 1 &&
5125                             layout_attr->la_valid & LA_LAYOUT_VERSION) {
5126                                 /* mirror merge exec phase */
5127                                 layout_attr->la_layout_version =
5128                                                 lo->ldo_layout_gen;
5129                                 data.locd_attr = layout_attr;
5130                                 data.locd_declare = false;
5131                                 data.locd_stripe_cb =
5132                                                 lod_obj_stripe_attr_set_cb;
5133                                 rc = lod_obj_for_each_stripe(env, lo, th,
5134                                                              &data);
5135                                 if (rc)
5136                                         RETURN(rc);
5137                         }
5138                 }
5139                 RETURN(rc);
5140         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
5141                 rc = lod_replace_parent_fid(env, dt, buf, th, false);
5142
5143                 RETURN(rc);
5144         }
5145
5146         /* then all other xattr */
5147         rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
5148
5149         RETURN(rc);
5150 }
5151
5152 /**
5153  * Implementation of dt_object_operations::do_declare_xattr_del.
5154  *
5155  * \see dt_object_operations::do_declare_xattr_del() in the API description
5156  * for details.
5157  */
5158 static int lod_declare_xattr_del(const struct lu_env *env,
5159                                  struct dt_object *dt, const char *name,
5160                                  struct thandle *th)
5161 {
5162         struct lod_object *lo = lod_dt_obj(dt);
5163         struct dt_object *next = dt_object_child(dt);
5164         int i;
5165         int rc;
5166         ENTRY;
5167
5168         rc = lod_sub_declare_xattr_del(env, next, name, th);
5169         if (rc != 0)
5170                 RETURN(rc);
5171
5172         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
5173                 RETURN(0);
5174
5175         /* NB: don't delete stripe LMV, because when we do this, normally we
5176          * will remove stripes, besides, if directory LMV is corrupt, this will
5177          * prevent deleting its LMV and fixing it (via LFSCK).
5178          */
5179         if (!strcmp(name, XATTR_NAME_LMV))
5180                 RETURN(0);
5181
5182         rc = lod_striping_load(env, lo);
5183         if (rc != 0)
5184                 RETURN(rc);
5185
5186         if (lo->ldo_dir_stripe_count == 0)
5187                 RETURN(0);
5188
5189         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5190                 struct dt_object *dto = lo->ldo_stripe[i];
5191
5192                 if (!dto)
5193                         continue;
5194
5195                 if (!dt_object_exists(dto))
5196                         continue;
5197
5198                 rc = lod_sub_declare_xattr_del(env, dto, name, th);
5199                 if (rc != 0)
5200                         break;
5201         }
5202
5203         RETURN(rc);
5204 }
5205
5206 /**
5207  * Implementation of dt_object_operations::do_xattr_del.
5208  *
5209  * If EA storing a regular striping is being deleted, then release
5210  * all the references to the stripe objects in core.
5211  *
5212  * \see dt_object_operations::do_xattr_del() in the API description for details.
5213  */
5214 static int lod_xattr_del(const struct lu_env *env, struct dt_object *dt,
5215                          const char *name, struct thandle *th)
5216 {
5217         int rc;
5218
5219         ENTRY;
5220
5221         if (!strcmp(name, XATTR_NAME_LOV) || !strcmp(name, XATTR_NAME_LMV))
5222                 lod_striping_free(env, lod_dt_obj(dt));
5223
5224         rc = lod_xattr_del_internal(env, dt, name, th);
5225
5226         RETURN(rc);
5227 }
5228
/**
 * Implementation of dt_object_operations::do_xattr_list.
 *
 * Passes the request straight to the child object.
 *
 * \see dt_object_operations::do_xattr_list() in the API description
 * for details.
 */
static int lod_xattr_list(const struct lu_env *env,
			  struct dt_object *dt, const struct lu_buf *buf)
{
	struct dt_object *next = dt_object_child(dt);

	return dt_xattr_list(env, next, buf);
}
5240
5241 static inline int lod_object_will_be_striped(int is_reg, const struct lu_fid *fid)
5242 {
5243         return (is_reg && fid_seq(fid) != FID_SEQ_LOCAL_FILE);
5244 }
5245
5246 /**
5247  * Copy OST list from layout provided by user.
5248  *
5249  * \param[in] lod_comp          layout_component to be filled
5250  * \param[in] v3                LOV EA V3 user data
5251  *
5252  * \retval              0 on success
5253  * \retval              negative if failed
5254  */
5255 int lod_comp_copy_ost_lists(struct lod_layout_component *lod_comp,
5256                             struct lov_user_md_v3 *v3)
5257 {
5258         int j;
5259
5260         ENTRY;
5261
5262         if (v3->lmm_stripe_offset == LOV_OFFSET_DEFAULT)
5263                 v3->lmm_stripe_offset = v3->lmm_objects[0].l_ost_idx;
5264
5265         if (lod_comp->llc_ostlist.op_array) {
5266                 if (lod_comp->llc_ostlist.op_size >=
5267                     v3->lmm_stripe_count * sizeof(__u32))  {
5268                         lod_comp->llc_ostlist.op_count =
5269                                         v3->lmm_stripe_count;
5270                         goto skip;
5271                 }
5272                 OBD_FREE(lod_comp->llc_ostlist.op_array,
5273                          lod_comp->llc_ostlist.op_size);
5274         }
5275
5276         /* copy ost list from lmm */
5277         lod_comp->llc_ostlist.op_count = v3->lmm_stripe_count;
5278         lod_comp->llc_ostlist.op_size = v3->lmm_stripe_count * sizeof(__u32);
5279         OBD_ALLOC(lod_comp->llc_ostlist.op_array,
5280                   lod_comp->llc_ostlist.op_size);
5281         if (!lod_comp->llc_ostlist.op_array)
5282                 RETURN(-ENOMEM);
5283 skip:
5284         for (j = 0; j < v3->lmm_stripe_count; j++) {
5285                 lod_comp->llc_ostlist.op_array[j] =
5286                         v3->lmm_objects[j].l_ost_idx;
5287         }
5288
5289         RETURN(0);
5290 }
5291
5292
/**
 * Get default striping.
 *
 * \param[in] env		execution environment
 * \param[in] lo		object
 * \param[out] lds		default striping
 * \param[in] dah		allocation hint, may be NULL; when present it
 *				supplies the append stripe count and append pool
 *
 * \retval		0 on success
 * \retval		negative if failed
 */
5303 static int lod_get_default_lov_striping(const struct lu_env *env,
5304                                         struct lod_object *lo,
5305                                         struct lod_default_striping *lds,
5306                                         struct dt_allocation_hint *dah)
5307 {
5308         struct lod_thread_info *info = lod_env_info(env);
5309         struct lov_user_md_v1 *v1 = NULL;
5310         struct lov_user_md_v3 *v3 = NULL;
5311         struct lov_comp_md_v1 *lcm = NULL;
5312         __u32 magic;
5313         int append_stripe_count = dah != NULL ? dah->dah_append_stripe_count : 0;
5314         const char *append_pool = (dah != NULL &&
5315                                    dah->dah_append_pool != NULL &&
5316                                    dah->dah_append_pool[0] != '\0') ?
5317                                   dah->dah_append_pool : NULL;
5318         __u16 entry_count = 1;
5319         __u16 mirror_count = 0;
5320         bool want_composite = false;
5321         int rc, i, j;
5322
5323         ENTRY;
5324
5325         lds->lds_def_striping_set = 0;
5326
5327         rc = lod_get_lov_ea(env, lo);
5328         if (rc < 0)
5329                 RETURN(rc);
5330
5331         if (rc < (typeof(rc))sizeof(struct lov_user_md))
5332                 RETURN(0);
5333
5334         magic = *(__u32 *)info->lti_ea_store;
5335         if (magic == __swab32(LOV_USER_MAGIC_V1)) {
5336                 lustre_swab_lov_user_md_v1(info->lti_ea_store);
5337         } else if (magic == __swab32(LOV_USER_MAGIC_V3)) {
5338                 lustre_swab_lov_user_md_v3(info->lti_ea_store);
5339         } else if (magic == __swab32(LOV_USER_MAGIC_SPECIFIC)) {
5340                 v3 = (struct lov_user_md_v3 *)info->lti_ea_store;
5341                 lustre_swab_lov_user_md_v3(v3);
5342                 lustre_swab_lov_user_md_objects(v3->lmm_objects,
5343                                                 v3->lmm_stripe_count);
5344         } else if (magic == __swab32(LOV_USER_MAGIC_COMP_V1) ||
5345                    magic == __swab32(LOV_USER_MAGIC_SEL)) {
5346                 lustre_swab_lov_comp_md_v1(info->lti_ea_store);
5347         }
5348
5349         switch (magic) {
5350         case LOV_MAGIC_V1:
5351         case LOV_MAGIC_V3:
5352         case LOV_USER_MAGIC_SPECIFIC:
5353                 v1 = info->lti_ea_store;
5354                 break;
5355         case LOV_MAGIC_COMP_V1:
5356         case LOV_MAGIC_SEL:
5357                 lcm = info->lti_ea_store;
5358                 entry_count = lcm->lcm_entry_count;
5359                 if (entry_count == 0)
5360                         RETURN(-EINVAL);
5361
5362                 mirror_count = lcm->lcm_mirror_count + 1;
5363                 want_composite = true;
5364                 break;
5365         default:
5366                 RETURN(-ENOTSUPP);
5367         }
5368
5369         if (append_stripe_count != 0 || append_pool != NULL) {
5370                 entry_count = 1;
5371                 mirror_count = 0;
5372                 want_composite = false;
5373         }
5374
5375         /* realloc default comp entries if necessary */
5376         rc = lod_def_striping_comp_resize(lds, entry_count);
5377         if (rc < 0)
5378                 RETURN(rc);
5379
5380         lds->lds_def_comp_cnt = entry_count;
5381         lds->lds_def_striping_is_composite = want_composite;
5382         lds->lds_def_mirror_cnt = mirror_count;
5383
5384         for (i = 0; i < entry_count; i++) {
5385                 struct lod_layout_component *llc = &lds->lds_def_comp_entries[i];
5386                 const char *pool;
5387
5388                 /*
5389                  * reset llc values, llc_stripes is always NULL in the
5390                  * default striping template, llc_pool will be reset
5391                  * later below using lod_set_pool().
5392                  *
5393                  * XXX At this point llc_pool may point to valid (!)
5394                  * kmalloced strings from previous RPCs.
5395                  */
5396                 memset(llc, 0, offsetof(typeof(*llc), llc_pool));
5397
5398                 if (lcm != NULL) {
5399                         v1 = (struct lov_user_md *)((char *)lcm +
5400                                                     lcm->lcm_entries[i].lcme_offset);
5401
5402                         if (want_composite) {
5403                                 llc->llc_extent = lcm->lcm_entries[i].lcme_extent;
5404                                 /* We only inherit certain flags from the layout */
5405                                 llc->llc_flags = lcm->lcm_entries[i].lcme_flags &
5406                                         LCME_TEMPLATE_FLAGS;
5407                         }
5408                 }
5409
5410                 CDEBUG(D_LAYOUT, DFID" magic = %#08x, pattern = %#x, stripe_count = %hu, stripe_size = %u, stripe_offset = %hu, append_pool = '%s', append_stripe_count = %d\n",
5411                        PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5412                        v1->lmm_magic,
5413                        v1->lmm_pattern,
5414                        v1->lmm_stripe_count,
5415                        v1->lmm_stripe_size,
5416                        v1->lmm_stripe_offset,
5417                        append_pool ?: "",
5418                        append_stripe_count);
5419
5420                 if (!lov_pattern_supported(v1->lmm_pattern) &&
5421                     !(v1->lmm_pattern & LOV_PATTERN_F_RELEASED)) {
5422                         lod_free_def_comp_entries(lds);
5423                         RETURN(-EINVAL);
5424                 }
5425
5426                 llc->llc_stripe_count = v1->lmm_stripe_count;
5427                 llc->llc_stripe_size = v1->lmm_stripe_size;
5428                 llc->llc_stripe_offset = v1->lmm_stripe_offset;
5429                 llc->llc_pattern = v1->lmm_pattern;
5430
5431                 if (append_stripe_count != 0 || append_pool != NULL)
5432                         llc->llc_pattern = LOV_PATTERN_RAID0;
5433
5434                 if (append_stripe_count != 0)
5435                         llc->llc_stripe_count = append_stripe_count;
5436
5437                 pool = NULL;
5438                 if (append_pool != NULL) {
5439                         pool = append_pool;
5440                 } else if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
5441                         /* XXX: sanity check here */
5442                         v3 = (struct lov_user_md_v3 *)v1;
5443                         if (v3->lmm_pool_name[0] != '\0')
5444                                 pool = v3->lmm_pool_name;
5445                 }
5446
5447                 lod_set_pool(&llc->llc_pool, pool);
5448
5449                 if (append_stripe_count != 0 || append_pool != NULL) {
5450                         /* Ignore specific striping for append. */
5451                 } else if (v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
5452                         v3 = (struct lov_user_md_v3 *)v1;
5453                         rc = lod_comp_copy_ost_lists(llc, v3);
5454                         if (rc)
5455                                 RETURN(rc);
5456                 } else if (llc->llc_ostlist.op_array &&
5457                            llc->llc_ostlist.op_count) {
5458                         for (j = 0; j < llc->llc_ostlist.op_count; j++)
5459                                 llc->llc_ostlist.op_array[j] = -1;
5460                         llc->llc_ostlist.op_count = 0;
5461                 }
5462         }
5463
5464         lds->lds_def_striping_set = 1;
5465         RETURN(rc);
5466 }
5467
5468 /**
5469  * Get default directory striping.
5470  *
5471  * \param[in] env               execution environment
5472  * \param[in] lo                object
5473  * \param[out] lds              default striping
5474  *
5475  * \retval              0 on success
5476  * \retval              negative if failed
5477  */
5478 static int lod_get_default_lmv_striping(const struct lu_env *env,
5479                                         struct lod_object *lo,
5480                                         struct lod_default_striping *lds)
5481 {
5482         struct lmv_user_md *lmu;
5483         int rc;
5484
5485         lds->lds_dir_def_striping_set = 0;
5486
5487         rc = lod_get_default_lmv_ea(env, lo);
5488         if (rc < 0)
5489                 return rc;
5490
5491         if (rc >= (int)sizeof(*lmu)) {
5492                 struct lod_thread_info *info = lod_env_info(env);
5493
5494                 lmu = info->lti_ea_store;
5495
5496                 lds->lds_dir_def_stripe_count =
5497                                 le32_to_cpu(lmu->lum_stripe_count);
5498                 lds->lds_dir_def_stripe_offset =
5499                                 le32_to_cpu(lmu->lum_stripe_offset);
5500                 lds->lds_dir_def_hash_type =
5501                                 le32_to_cpu(lmu->lum_hash_type);
5502                 lds->lds_dir_def_max_inherit = lmu->lum_max_inherit;
5503                 lds->lds_dir_def_max_inherit_rr = lmu->lum_max_inherit_rr;
5504                 lds->lds_dir_def_striping_set = 1;
5505         }
5506
5507         return 0;
5508 }
5509
5510 /**
5511  * Get default striping in the object.
5512  *
5513  * Get object default striping and default directory striping.
5514  *
5515  * \param[in] env               execution environment
5516  * \param[in] lo                object
5517  * \param[out] lds              default striping
5518  *
5519  * \retval              0 on success
5520  * \retval              negative if failed
5521  */
5522 static int lod_get_default_striping(const struct lu_env *env,
5523                                     struct lod_object *lo,
5524                                     struct lod_default_striping *lds)
5525 {
5526         int rc, rc1;
5527
5528         rc = lod_get_default_lov_striping(env, lo, lds, NULL);
5529         if (lds->lds_def_striping_set) {
5530                 struct lod_thread_info *info = lod_env_info(env);
5531                 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5532
5533                 rc = lod_verify_striping(env, d, lo, &info->lti_buf, false);
5534                 if (rc)
5535                         lds->lds_def_striping_set = 0;
5536         }
5537
5538         rc1 = lod_get_default_lmv_striping(env, lo, lds);
5539         if (rc == 0 && rc1 < 0)
5540                 rc = rc1;
5541
5542         return rc;
5543 }
5544
5545 /**
5546  * Apply default striping on object.
5547  *
5548  * If object striping pattern is not set, set to the one in default striping.
5549  * The default striping is from parent or fs.
5550  *
5551  * \param[in] lo                new object
5552  * \param[in] lds               default striping
5553  * \param[in] mode              new object's mode
5554  */
5555 static void lod_striping_from_default(struct lod_object *lo,
5556                                       const struct lod_default_striping *lds,
5557                                       umode_t mode)
5558 {
5559         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5560         int i, rc;
5561
5562         if (lds->lds_def_striping_set && S_ISREG(mode)) {
5563                 struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
5564
5565                 rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
5566                                             lds->lds_def_comp_cnt);
5567                 if (rc != 0)
5568                         return;
5569
5570                 lo->ldo_is_composite = lds->lds_def_striping_is_composite;
5571                 if (lds->lds_def_mirror_cnt > 1)
5572                         lo->ldo_flr_state = LCM_FL_RDONLY;
5573
5574                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
5575                         struct lod_layout_component *obj_comp =
5576                                                 &lo->ldo_comp_entries[i];
5577                         struct lod_layout_component *def_comp =
5578                                                 &lds->lds_def_comp_entries[i];
5579
5580                         CDEBUG(D_LAYOUT,
5581                                "inherit "DFID" file layout from default: flags=%#x size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
5582                                PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5583                                def_comp->llc_flags,
5584                                def_comp->llc_stripe_size,
5585                                def_comp->llc_stripe_count,
5586                                def_comp->llc_stripe_offset,
5587                                def_comp->llc_pattern,
5588                                def_comp->llc_pool ?: "");
5589
5590                         *obj_comp = *def_comp;
5591                         if (def_comp->llc_pool != NULL) {
5592                                 /* pointer was copied from def_comp */
5593                                 obj_comp->llc_pool = NULL;
5594                                 lod_obj_set_pool(lo, i, def_comp->llc_pool);
5595                         }
5596
5597                         /* copy ost list */
5598                         if (def_comp->llc_ostlist.op_array &&
5599                             def_comp->llc_ostlist.op_count) {
5600                                 OBD_ALLOC(obj_comp->llc_ostlist.op_array,
5601                                           obj_comp->llc_ostlist.op_size);
5602                                 if (!obj_comp->llc_ostlist.op_array)
5603                                         return;
5604                                 memcpy(obj_comp->llc_ostlist.op_array,
5605                                        def_comp->llc_ostlist.op_array,
5606                                        obj_comp->llc_ostlist.op_size);
5607                         } else if (def_comp->llc_ostlist.op_array) {
5608                                 obj_comp->llc_ostlist.op_array = NULL;
5609                         }
5610
5611                         /*
5612                          * Don't initialize these fields for plain layout
5613                          * (v1/v3) here, they are inherited in the order of
5614                          * 'parent' -> 'fs default (root)' -> 'global default
5615                          * values for stripe_count & stripe_size'.
5616                          *
5617                          * see lod_ah_init().
5618                          */
5619                         if (!lo->ldo_is_composite)
5620                                 continue;
5621
5622                         lod_adjust_stripe_info(obj_comp, desc, 0);
5623                 }
5624         } else if (lds->lds_dir_def_striping_set && S_ISDIR(mode)) {
5625                 if (lo->ldo_dir_stripe_count == 0)
5626                         lo->ldo_dir_stripe_count =
5627                                 lds->lds_dir_def_stripe_count;
5628                 if (lo->ldo_dir_stripe_offset == -1)
5629                         lo->ldo_dir_stripe_offset =
5630                                 lds->lds_dir_def_stripe_offset;
5631                 if (lo->ldo_dir_hash_type == LMV_HASH_TYPE_UNKNOWN)
5632                         lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
5633
5634                 CDEBUG(D_LAYOUT,
5635                        "inherit "DFID" dir layout from default: count=%hu offset=%u hash_type=%x\n",
5636                        PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5637                        lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
5638                        lo->ldo_dir_hash_type);
5639         }
5640 }
5641
5642 static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root,
5643                                          const char *append_pool)
5644 {
5645         struct lod_layout_component *lod_comp;
5646
5647         if (lo->ldo_comp_cnt == 0)
5648                 return true;
5649
5650         if (lo->ldo_is_composite)
5651                 return false;
5652
5653         lod_comp = &lo->ldo_comp_entries[0];
5654
5655         if (lod_comp->llc_stripe_count <= 0 ||
5656             lod_comp->llc_stripe_size <= 0)
5657                 return true;
5658
5659         if (from_root && (lod_comp->llc_pool == NULL ||
5660                           lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT))
5661                 return true;
5662
5663         if (append_pool && append_pool[0])
5664                 return true;
5665
5666         return false;
5667 }
5668
/**
 * Implementation of dt_object_operations::do_ah_init.
 *
 * This method is used to make a decision on the striping configuration for the
 * object being created. It can be taken from the \a parent object if it exists,
 * or filesystem's default. The resulting configuration (number of stripes,
 * stripe size/offset, pool name, hash_type, etc.) is stored in the object
 * itself and will be used by the methods like ->doo_declare_create().
 *
 * Directories and regular files take separate paths: the directory path
 * settles the directory stripe count/offset/hash type and returns early,
 * while the regular-file path builds the layout components on \a child from
 * the parent default, then the root default, then the device-wide values.
 *
 * \see dt_object_operations::do_ah_init() in the API description for details.
 */
static void lod_ah_init(const struct lu_env *env,
                        struct dt_allocation_hint *ah,
                        struct dt_object *parent,
                        struct dt_object *child,
                        umode_t child_mode)
{
        struct lod_device *d = lu2lod_dev(child->do_lu.lo_dev);
        struct lod_thread_info *info = lod_env_info(env);
        struct lod_default_striping *lds = lod_lds_buf_get(env);
        struct dt_object *nextp = NULL;
        struct dt_object *nextc;
        struct lod_object *lp = NULL;
        struct lod_object *lc;
        struct lov_desc *desc;
        struct lod_layout_component *lod_comp;
        int rc;
        ENTRY;

        LASSERT(child);

        /* an append stripe count of -1 is replaced by the OST target count */
        if (ah->dah_append_stripe_count == -1)
                ah->dah_append_stripe_count =
                        d->lod_ost_descs.ltd_lov_desc.ld_tgt_count;

        if (likely(parent)) {
                nextp = dt_object_child(parent);
                lp = lod_dt_obj(parent);
        }

        nextc = dt_object_child(child);
        lc = lod_dt_obj(child);

        LASSERT(!lod_obj_is_striped(child));
        /* default layout template may have been set on the regular file
         * when this is called from mdd_create_data() */
        if (S_ISREG(child_mode))
                lod_free_comp_entries(lc);

        /* let the next layer initialize the hint for a not-yet-existing
         * child object
         */
        if (!dt_object_exists(nextc))
                nextc->do_ops->do_ah_init(env, ah, nextp, nextc, child_mode);

        if (S_ISDIR(child_mode)) {
                const struct lmv_user_md_v1 *lum1 = ah->dah_eadata;

                /* other default values are 0 */
                lc->ldo_dir_stripe_offset = -1;

                /* no default striping configuration is needed for
                 * foreign dirs
                 */
                if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
                    le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_FOREIGN) {
                        lc->ldo_is_foreign = true;
                        /* keep stripe_count 0 and stripe_offset -1 */
                        CDEBUG(D_INFO, "no default striping for foreign dir\n");
                        RETURN_EXIT;
                }

                /* NOTE(review): the return value is ignored here; only the
                 * lds_*_striping_set flags are consulted below
                 */
                if (likely(lp != NULL))
                        lod_get_default_striping(env, lp, lds);

                /* It should always honour the specified stripes */
                /* Note: old client (< 2.7)might also do lfs mkdir, whose EA
                 * will have old magic. In this case, we should ignore the
                 * stripe count and try to create dir by default stripe.
                 */
                if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
                    (le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC ||
                     le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC)) {
                        lc->ldo_dir_stripe_count =
                                le32_to_cpu(lum1->lum_stripe_count);
                        lc->ldo_dir_stripe_offset =
                                le32_to_cpu(lum1->lum_stripe_offset);
                        lc->ldo_dir_hash_type =
                                le32_to_cpu(lum1->lum_hash_type);
                        CDEBUG(D_INFO,
                               "set dirstripe: count %hu, offset %d, hash %x\n",
                                lc->ldo_dir_stripe_count,
                                (int)lc->ldo_dir_stripe_offset,
                                lc->ldo_dir_hash_type);

                        if (d->lod_mdt_descs.ltd_lmv_desc.ld_active_tgt_count &&
                            lc->ldo_dir_stripe_count < 2 &&
                            lum1->lum_max_inherit != LMV_INHERIT_NONE) {
                                /* when filesystem-wide default LMV is set, dirs
                                 * will be created on MDT by space usage, but if
                                 * dir is created with "lfs mkdir -c 1 ...", its
                                 * subdirs should be kept on the same MDT. To
                                 * guarantee this, set default LMV for such dir.
                                 */
                                lds->lds_dir_def_stripe_count =
                                        le32_to_cpu(lum1->lum_stripe_count);
                                /* if "-1" stripe offset is set, save current
                                 * MDT index in default LMV.
                                 */
                                if (le32_to_cpu(lum1->lum_stripe_offset) ==
                                    LMV_OFFSET_DEFAULT)
                                        lds->lds_dir_def_stripe_offset =
                                                lod2lu_dev(d)->ld_site->ld_seq_site->ss_node_id;
                                else
                                        lds->lds_dir_def_stripe_offset =
                                                le32_to_cpu(lum1->lum_stripe_offset);
                                lds->lds_dir_def_hash_type =
                                        le32_to_cpu(lum1->lum_hash_type);
                                lds->lds_dir_def_max_inherit =
                                        lum1->lum_max_inherit;
                                /* it will be decreased by 1 later in setting */
                                if (lum1->lum_max_inherit >= LMV_INHERIT_END &&
                                    lum1->lum_max_inherit < LMV_INHERIT_MAX)
                                        lds->lds_dir_def_max_inherit++;
                                lds->lds_dir_def_max_inherit_rr =
                                        lum1->lum_max_inherit_rr;
                                lds->lds_dir_def_striping_set = 1;
                                /* don't inherit LOV from ROOT */
                                if (lds->lds_def_striping_set &&
                                    fid_is_root(lod_object_fid(lp)))
                                        lds->lds_def_striping_set = 0;
                                lc->ldo_def_striping = lds;
                        } else if (lds->lds_def_striping_set &&
                                   !fid_is_root(lod_object_fid(lp))) {
                                /* don't inherit default LMV for "lfs mkdir" */
                                lds->lds_dir_def_striping_set = 0;
                                lc->ldo_def_striping = lds;
                        }
                } else {
                        /* inherit default striping except ROOT */
                        if ((lds->lds_def_striping_set ||
                             lds->lds_dir_def_striping_set) &&
                            !fid_is_root(lod_object_fid(lp)))
                                lc->ldo_def_striping = lds;

                        /* transfer defaults LMV to new directory */
                        lod_striping_from_default(lc, lds, child_mode);

                        /* set count 0 to create normal directory */
                        if (lc->ldo_dir_stripe_count == 1)
                                lc->ldo_dir_stripe_count = 0;
                }

                /* shrink the stripe count to max_mdt_stripecount if it is -1
                 * and max_mdt_stripecount is not 0
                 */
                if (lc->ldo_dir_stripe_count == (__u16)(-1) &&
                    d->lod_max_mdt_stripecount)
                        lc->ldo_dir_stripe_count = d->lod_max_mdt_stripecount;

                /* shrink the stripe_count to the available MDT count */
                if (lc->ldo_dir_stripe_count > d->lod_remote_mdt_count + 1 &&
                    !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)) {
                        lc->ldo_dir_stripe_count = d->lod_remote_mdt_count + 1;
                        if (lc->ldo_dir_stripe_count == 1)
                                lc->ldo_dir_stripe_count = 0;
                }

                /* an unknown hash type keeps its known flag bits and takes
                 * the hash pattern from the LMV descriptor
                 */
                if (!lmv_is_known_hash_type(lc->ldo_dir_hash_type))
                        lc->ldo_dir_hash_type =
                                (lc->ldo_dir_hash_type & LMV_HASH_FLAG_KNOWN) |
                                d->lod_mdt_descs.ltd_lmv_desc.ld_pattern;

                CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%u\n",
                       lc->ldo_dir_stripe_count,
                       (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type);

                RETURN_EXIT;
        }

        /* child object is not a directory from here on */

        if (!lod_object_will_be_striped(S_ISREG(child_mode),
                                        lu_object_fid(&child->do_lu)))
                RETURN_EXIT;

        /* If object is going to be striped over OSTs, transfer default
         * striping information to the child, so that we can use it
         * during declaration and creation.
         *
         * Try from the parent first.
         */
        if (likely(lp != NULL)) {
                rc = lod_get_default_lov_striping(env, lp, lds, ah);
                if (rc == 0 && lds->lds_def_striping_set) {
                        rc = lod_verify_striping(env, d, lp, &info->lti_buf,
                                                 false);
                        if (rc == 0)
                                lod_striping_from_default(lc, lds, child_mode);
                }
        }

        /* Initialize lod_device::lod_md_root object reference */
        if (d->lod_md_root == NULL) {
                struct dt_object *root;
                struct lod_object *lroot;

                lu_root_fid(&info->lti_fid);
                root = dt_locate(env, &d->lod_dt_dev, &info->lti_fid);
                if (!IS_ERR(root)) {
                        lroot = lod_dt_obj(root);

                        /* if lod_md_root got set since the check above, put
                         * that reference before storing ours
                         */
                        spin_lock(&d->lod_lock);
                        if (d->lod_md_root != NULL)
                                dt_object_put(env, &d->lod_md_root->ldo_obj);
                        d->lod_md_root = lroot;
                        spin_unlock(&d->lod_lock);
                }
        }

        /* try inherit layout from the root object (fs default) when:
         *  - parent does not have default layout; or
         *  - parent has plain(v1/v3) default layout, and some attributes
         *    are not specified in the default layout;
         */
        if (d->lod_md_root != NULL &&
            lod_need_inherit_more(lc, true, ah->dah_append_pool)) {
                rc = lod_get_default_lov_striping(env, d->lod_md_root, lds,
                                                  ah);
                if (rc || !lds->lds_def_striping_set)
                        goto out;

                rc = lod_verify_striping(env, d, d->lod_md_root, &info->lti_buf,
                                         false);
                if (rc)
                        goto out;

                if (lc->ldo_comp_cnt == 0) {
                        lod_striping_from_default(lc, lds, child_mode);
                } else if (!lds->lds_def_striping_is_composite) {
                        struct lod_layout_component *def_comp;

                        /* fill only the attributes the parent's plain
                         * default left unset
                         */
                        LASSERT(!lc->ldo_is_composite);
                        lod_comp = &lc->ldo_comp_entries[0];
                        def_comp = &lds->lds_def_comp_entries[0];

                        if (lod_comp->llc_stripe_count <= 0)
                                lod_comp->llc_stripe_count =
                                        def_comp->llc_stripe_count;
                        if (lod_comp->llc_stripe_size <= 0)
                                lod_comp->llc_stripe_size =
                                        def_comp->llc_stripe_size;
                        if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT &&
                            (!lod_comp->llc_pool || !lod_comp->llc_pool[0]))
                                lod_comp->llc_stripe_offset =
                                        def_comp->llc_stripe_offset;
                        if (lod_comp->llc_pool == NULL)
                                lod_obj_set_pool(lc, 0, def_comp->llc_pool);
                }
        }
out:
        /*
         * fs default striping may not be explicitly set, or historically set
         * in config log, use them.
         */
        if (lod_need_inherit_more(lc, false, ah->dah_append_pool)) {
                if (lc->ldo_comp_cnt == 0) {
                        rc = lod_alloc_comp_entries(lc, 0, 1);
                        if (rc)
                                /* fail to allocate memory, will create a
                                 * non-striped file. */
                                RETURN_EXIT;
                        lc->ldo_is_composite = 0;
                        lod_comp = &lc->ldo_comp_entries[0];
                        lod_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
                }
                LASSERT(!lc->ldo_is_composite);
                lod_comp = &lc->ldo_comp_entries[0];
                desc = &d->lod_ost_descs.ltd_lov_desc;
                lod_adjust_stripe_info(lod_comp, desc,
                                       ah->dah_append_stripe_count);
                /* a non-empty append pool is applied last */
                if (ah->dah_append_pool && ah->dah_append_pool[0])
                        lod_obj_set_pool(lc, 0, ah->dah_append_pool);
        }

        EXIT;
}
5953
5954 /**
5955  * Size initialization on late striping.
5956  *
5957  * Propagate the size of a truncated object to a deferred striping.
5958  * This function handles a special case when truncate was done on a
5959  * non-striped object and now while the striping is being created
5960  * we can't lose that size, so we have to propagate it to the stripes
5961  * being created.
5962  *
5963  * \param[in] env       execution environment
5964  * \param[in] dt        object
5965  * \param[in] th        transaction handle
5966  *
5967  * \retval              0 on success
5968  * \retval              negative if failed
5969  */
5970 static int lod_declare_init_size(const struct lu_env *env,
5971                                  struct dt_object *dt, struct thandle *th)
5972 {
5973         struct dt_object        *next = dt_object_child(dt);
5974         struct lod_object       *lo = lod_dt_obj(dt);
5975         struct dt_object        **objects = NULL;
5976         struct lu_attr  *attr = &lod_env_info(env)->lti_attr;
5977         uint64_t        size, offs;
5978         int     i, rc, stripe, stripe_count = 0, stripe_size = 0;
5979         struct lu_extent size_ext;
5980         ENTRY;
5981
5982         if (!lod_obj_is_striped(dt))
5983                 RETURN(0);
5984
5985         rc = dt_attr_get(env, next, attr);
5986         LASSERT(attr->la_valid & LA_SIZE);
5987         if (rc)
5988                 RETURN(rc);
5989
5990         size = attr->la_size;
5991         if (size == 0)
5992                 RETURN(0);
5993
5994         size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
5995         for (i = 0; i < lo->ldo_comp_cnt; i++) {
5996                 struct lod_layout_component *lod_comp;
5997                 struct lu_extent *extent;
5998
5999                 lod_comp = &lo->ldo_comp_entries[i];
6000
6001                 if (lod_comp->llc_stripe == NULL)
6002                         continue;
6003
6004                 extent = &lod_comp->llc_extent;
6005                 CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
6006                 if (!lo->ldo_is_composite ||
6007                     lu_extent_is_overlapped(extent, &size_ext)) {
6008                         objects = lod_comp->llc_stripe;
6009                         stripe_count = lod_comp->llc_stripe_count;
6010                         stripe_size = lod_comp->llc_stripe_size;
6011
6012                         /* next mirror */
6013                         if (stripe_count == 0)
6014                                 continue;
6015
6016                         LASSERT(objects != NULL && stripe_size != 0);
6017                         do_div(size, stripe_size);
6018                         stripe = do_div(size, stripe_count);
6019                         LASSERT(objects[stripe] != NULL);
6020
6021                         size = size * stripe_size;
6022                         offs = attr->la_size;
6023                         size += do_div(offs, stripe_size);
6024
6025                         attr->la_valid = LA_SIZE;
6026                         attr->la_size = size;
6027
6028                         rc = lod_sub_declare_attr_set(env, objects[stripe],
6029                                                       attr, th);
6030                 }
6031         }
6032
6033         RETURN(rc);
6034 }
6035
6036 /**
6037  * Declare creation of striped object.
6038  *
6039  * The function declares creation stripes for a regular object. The function
6040  * also declares whether the stripes will be created with non-zero size if
6041  * previously size was set non-zero on the master object. If object \a dt is
6042  * not local, then only fully defined striping can be applied in \a lovea.
6043  * Otherwise \a lovea can be in the form of pattern, see lod_qos_parse_config()
6044  * for the details.
6045  *
6046  * \param[in] env       execution environment
6047  * \param[in] dt        object
6048  * \param[in] attr      attributes the stripes will be created with
6049  * \param[in] lovea     a buffer containing striping description
6050  * \param[in] th        transaction handle
6051  *
6052  * \retval              0 on success
6053  * \retval              negative if failed
6054  */
6055 int lod_declare_striped_create(const struct lu_env *env, struct dt_object *dt,
6056                                struct lu_attr *attr,
6057                                const struct lu_buf *lovea, struct thandle *th)
6058 {
6059         struct lod_thread_info  *info = lod_env_info(env);
6060         struct dt_object        *next = dt_object_child(dt);
6061         struct lod_object       *lo = lod_dt_obj(dt);
6062         int                      rc;
6063         ENTRY;
6064
6065         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_ALLOC_OBDO))
6066                 GOTO(out, rc = -ENOMEM);
6067
6068         if (!dt_object_remote(next)) {
6069                 /* choose OST and generate appropriate objects */
6070                 rc = lod_prepare_create(env, lo, attr, lovea, th);
6071                 if (rc)
6072                         GOTO(out, rc);
6073
6074                 /*
6075                  * declare storage for striping data
6076                  */
6077                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6078         } else {
6079                 /* LOD can not choose OST objects for remote objects, i.e.
6080                  * stripes must be ready before that. Right now, it can only
6081                  * happen during migrate, i.e. migrate process needs to create
6082                  * remote regular file (mdd_migrate_create), then the migrate
6083                  * process will provide stripeEA. */
6084                 LASSERT(lovea != NULL);
6085                 info->lti_buf = *lovea;
6086         }
6087
6088         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
6089                                        XATTR_NAME_LOV, 0, th);
6090         if (rc)
6091                 GOTO(out, rc);
6092
6093         /*
6094          * if striping is created with local object's size > 0,
6095          * we have to propagate this size to specific object
6096          * the case is possible only when local object was created previously
6097          */
6098         if (dt_object_exists(next))
6099                 rc = lod_declare_init_size(env, dt, th);
6100
6101 out:
6102         /* failed to create striping or to set initial size, let's reset
6103          * config so that others don't get confused */
6104         if (rc)
6105                 lod_striping_free(env, lo);
6106
6107         RETURN(rc);
6108 }
6109
6110 /*
6111  * Whether subdirectories under \a dt should be created on MDTs by space QoS
6112  *
6113  * If LMV_HASH_FLAG_SPACE is set on directory default layout, its subdirectories
6114  * should be created on MDT by space QoS.
6115  *
6116  * \param[in] env       execution environment
6117  * \param[in] dev       lu device
6118  * \param[in] dt        object
6119  *
6120  * \retval              1 if directory should create subdir by space usage
6121  * \retval              0 if not
6122  * \retval              -ev if failed
6123  */
6124 static inline int dt_object_qos_mkdir(const struct lu_env *env,
6125                                       struct lu_device *dev,
6126                                       struct dt_object *dt)
6127 {
6128         struct lod_thread_info *info = lod_env_info(env);
6129         struct lu_object *obj;
6130         struct lod_object *lo;
6131         struct lmv_user_md *lmu;
6132         int rc;
6133
6134         obj = lu_object_find_slice(env, dev, lu_object_fid(&dt->do_lu), NULL);
6135         if (IS_ERR(obj))
6136                 return PTR_ERR(obj);
6137
6138         lo = lu2lod_obj(obj);
6139
6140         rc = lod_get_default_lmv_ea(env, lo);
6141         dt_object_put(env, dt);
6142         if (rc <= 0)
6143                 return rc;
6144
6145         if (rc < (int)sizeof(*lmu))
6146                 return -EINVAL;
6147
6148         lmu = info->lti_ea_store;
6149         return le32_to_cpu(lmu->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
6150 }
6151
6152 /**
6153  * Implementation of dt_object_operations::do_declare_create.
6154  *
6155  * The method declares creation of a new object. If the object will be striped,
6156  * then helper functions are called to find FIDs for the stripes, declare
6157  * creation of the stripes and declare initialization of the striping
6158  * information to be stored in the master object.
6159  *
6160  * \see dt_object_operations::do_declare_create() in the API description
6161  * for details.
6162  */
6163 static int lod_declare_create(const struct lu_env *env, struct dt_object *dt,
6164                               struct lu_attr *attr,
6165                               struct dt_allocation_hint *hint,
6166                               struct dt_object_format *dof, struct thandle *th)
6167 {
6168         struct dt_object   *next = dt_object_child(dt);
6169         struct lod_object  *lo = lod_dt_obj(dt);
6170         int                 rc;
6171         ENTRY;
6172
6173         LASSERT(dof);
6174         LASSERT(attr);
6175         LASSERT(th);
6176
6177         /*
6178          * first of all, we declare creation of local object
6179          */
6180         rc = lod_sub_declare_create(env, next, attr, hint, dof, th);
6181         if (rc != 0)
6182                 GOTO(out, rc);
6183
6184         /*
6185          * it's lod_ah_init() that has decided the object will be striped
6186          */
6187         if (dof->dof_type == DFT_REGULAR) {
6188                 /* callers don't want stripes */
6189                 /* XXX: all tricky interactions with ->ah_make_hint() decided
6190                  * to use striping, then ->declare_create() behaving differently
6191                  * should be cleaned */
6192                 if (dof->u.dof_reg.striped != 0)
6193                         rc = lod_declare_striped_create(env, dt, attr,
6194                                                         NULL, th);
6195         } else if (dof->dof_type == DFT_DIR) {
6196                 struct seq_server_site *ss;
6197                 struct lu_buf buf = { NULL };
6198                 struct lu_buf *lmu = NULL;
6199
6200                 ss = lu_site2seq(dt->do_lu.lo_dev->ld_site);
6201
6202                 /* If the parent has default stripeEA, and client
6203                  * did not find it before sending create request,
6204                  * then MDT will return -EREMOTE, and client will
6205                  * retrieve the default stripeEA and re-create the
6206                  * sub directory.
6207                  *
6208                  * Note: if dah_eadata != NULL, it means creating the
6209                  * striped directory with specified stripeEA, then it
6210                  * should ignore the default stripeEA */
6211                 if (hint != NULL && hint->dah_eadata == NULL) {
6212                         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT))
6213                                 GOTO(out, rc = -EREMOTE);
6214
6215                         if (lo->ldo_dir_stripe_offset != LMV_OFFSET_DEFAULT &&
6216                             lo->ldo_dir_stripe_offset != ss->ss_node_id) {
6217                                 struct lod_device *lod;
6218                                 struct lu_tgt_desc *mdt = NULL;
6219                                 bool found_mdt = false;
6220
6221                                 lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6222                                 lod_foreach_mdt(lod, mdt) {
6223                                         if (mdt->ltd_index ==
6224                                                 lo->ldo_dir_stripe_offset) {
6225                                                 found_mdt = true;
6226                                                 break;
6227                                         }
6228                                 }
6229
6230                                 /* If the MDT indicated by stripe_offset can be
6231                                  * found, then tell client to resend the create
6232                                  * request to the correct MDT, otherwise return
6233                                  * error to client */
6234                                 if (found_mdt)
6235                                         GOTO(out, rc = -EREMOTE);
6236                                 else
6237                                         GOTO(out, rc = -EINVAL);
6238                         }
6239                 } else if (hint && hint->dah_eadata) {
6240                         lmu = &buf;
6241                         lmu->lb_buf = (void *)hint->dah_eadata;
6242                         lmu->lb_len = hint->dah_eadata_len;
6243                 }
6244
6245                 rc = lod_declare_dir_striping_create(env, dt, attr, lmu, dof,
6246                                                      th);
6247         }
6248 out:
6249         /* failed to create striping or to set initial size, let's reset
6250          * config so that others don't get confused */
6251         if (rc)
6252                 lod_striping_free(env, lo);
6253         RETURN(rc);
6254 }
6255
6256 /**
6257  * Generate component ID for new created component.
6258  *
6259  * \param[in] lo                LOD object
6260  * \param[in] comp_idx          index of ldo_comp_entries
6261  *
6262  * \retval                      component ID on success
6263  * \retval                      LCME_ID_INVAL on failure
6264  */
6265 static __u32 lod_gen_component_id(struct lod_object *lo,
6266                                   int mirror_id, int comp_idx)
6267 {
6268         struct lod_layout_component *lod_comp;
6269         __u32   id, start, end;
6270         int     i;
6271
6272         LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
6273
6274         lod_obj_inc_layout_gen(lo);
6275         id = lo->ldo_layout_gen;
6276         if (likely(id <= SEQ_ID_MAX))
6277                 RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
6278
6279         /* Layout generation wraps, need to check collisions. */
6280         start = id & SEQ_ID_MASK;
6281         end = SEQ_ID_MAX;
6282 again:
6283         for (id = start; id <= end; id++) {
6284                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6285                         lod_comp = &lo->ldo_comp_entries[i];
6286                         if (pflr_id(mirror_id, id) == lod_comp->llc_id)
6287                                 break;
6288                 }
6289                 /* Found the ununsed ID */
6290                 if (i == lo->ldo_comp_cnt)
6291                         RETURN(pflr_id(mirror_id, id));
6292         }
6293         if (end == LCME_ID_MAX) {
6294                 start = 1;
6295                 end = min(lo->ldo_layout_gen & LCME_ID_MASK,
6296                           (__u32)(LCME_ID_MAX - 1));
6297                 goto again;
6298         }
6299
6300         RETURN(LCME_ID_INVAL);
6301 }
6302
6303 /**
6304  * Creation of a striped regular object.
6305  *
6306  * The function is called to create the stripe objects for a regular
6307  * striped file. This can happen at the initial object creation or
6308  * when the caller asks LOD to do so using ->do_xattr_set() method
6309  * (so called late striping). Notice all the information are already
6310  * prepared in the form of the list of objects (ldo_stripe field).
6311  * This is done during declare phase.
6312  *
6313  * \param[in] env       execution environment
6314  * \param[in] dt        object
6315  * \param[in] attr      attributes the stripes will be created with
6316  * \param[in] dof       format of stripes (see OSD API description)
6317  * \param[in] th        transaction handle
6318  *
6319  * \retval              0 on success
6320  * \retval              negative if failed
6321  */
6322 int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
6323                        struct lu_attr *attr, struct dt_object_format *dof,
6324                        struct thandle *th)
6325 {
6326         struct lod_layout_component     *lod_comp;
6327         struct lod_object       *lo = lod_dt_obj(dt);
6328         __u16   mirror_id;
6329         int     rc = 0, i, j;
6330         ENTRY;
6331
6332         mutex_lock(&lo->ldo_layout_mutex);
6333
6334         LASSERT((lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL) ||
6335                 lo->ldo_is_foreign);
6336
6337         mirror_id = 0; /* non-flr file's mirror_id is 0 */
6338         if (lo->ldo_mirror_count > 1) {
6339                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6340                         lod_comp = &lo->ldo_comp_entries[i];
6341                         if (lod_comp->llc_id != LCME_ID_INVAL &&
6342                             mirror_id_of(lod_comp->llc_id) > mirror_id)
6343                                 mirror_id = mirror_id_of(lod_comp->llc_id);
6344                 }
6345         }
6346
6347         /* create all underlying objects */
6348         for (i = 0; i < lo->ldo_comp_cnt; i++) {
6349                 lod_comp = &lo->ldo_comp_entries[i];
6350
6351                 if (lod_comp->llc_id == LCME_ID_INVAL) {
6352                         /* only the component of FLR layout with more than 1
6353                          * mirror has mirror ID in its component ID.
6354                          */
6355                         if (lod_comp->llc_extent.e_start == 0 &&
6356                             lo->ldo_mirror_count > 1)
6357                                 ++mirror_id;
6358
6359                         lod_comp->llc_id = lod_gen_component_id(lo,
6360                                                                 mirror_id, i);
6361                         if (lod_comp->llc_id == LCME_ID_INVAL)
6362                                 GOTO(out, rc = -ERANGE);
6363                 }
6364
6365                 if (lod_comp_inited(lod_comp))
6366                         continue;
6367
6368                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
6369                         lod_comp_set_init(lod_comp);
6370
6371                 if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
6372                         lod_comp_set_init(lod_comp);
6373
6374                 if (lod_comp->llc_stripe == NULL)
6375                         continue;
6376
6377                 LASSERT(lod_comp->llc_stripe_count);
6378                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6379                         struct dt_object *object = lod_comp->llc_stripe[j];
6380                         LASSERT(object != NULL);
6381                         rc = lod_sub_create(env, object, attr, NULL, dof, th);
6382                         if (rc)
6383                                 GOTO(out, rc);
6384                 }
6385                 lod_comp_set_init(lod_comp);
6386         }
6387
6388         rc = lod_fill_mirrors(lo);
6389         if (rc)
6390                 GOTO(out, rc);
6391
6392         lo->ldo_comp_cached = 1;
6393
6394         rc = lod_generate_and_set_lovea(env, lo, th);
6395         if (rc)
6396                 GOTO(out, rc);
6397
6398         mutex_unlock(&lo->ldo_layout_mutex);
6399
6400         RETURN(0);
6401
6402 out:
6403         lod_striping_free_nolock(env, lo);
6404         mutex_unlock(&lo->ldo_layout_mutex);
6405
6406         RETURN(rc);
6407 }
6408
6409 static inline bool lod_obj_is_dom(struct dt_object *dt)
6410 {
6411         struct lod_object *lo = lod_dt_obj(dt);
6412
6413         if (!dt_object_exists(dt_object_child(dt)))
6414                 return false;
6415
6416         if (S_ISDIR(dt->do_lu.lo_header->loh_attr))
6417                 return false;
6418
6419         if (!lo->ldo_comp_cnt)
6420                 return false;
6421
6422         return (lov_pattern(lo->ldo_comp_entries[0].llc_pattern) ==
6423                 LOV_PATTERN_MDT);
6424 }
6425
6426 /**
6427  * Implementation of dt_object_operations::do_create.
6428  *
6429  * If any of preceeding methods (like ->do_declare_create(),
6430  * ->do_ah_init(), etc) chose to create a striped object,
6431  * then this method will create the master and the stripes.
6432  *
6433  * \see dt_object_operations::do_create() in the API description for details.
6434  */
6435 static int lod_create(const struct lu_env *env, struct dt_object *dt,
6436                       struct lu_attr *attr, struct dt_allocation_hint *hint,
6437                       struct dt_object_format *dof, struct thandle *th)
6438 {
6439         int                 rc;
6440         ENTRY;
6441
6442         /* create local object */
6443         rc = lod_sub_create(env, dt_object_child(dt), attr, hint, dof, th);
6444         if (rc != 0)
6445                 RETURN(rc);
6446
6447         if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
6448             (lod_obj_is_striped(dt) || lod_obj_is_dom(dt)) &&
6449             dof->u.dof_reg.striped != 0) {
6450                 LASSERT(lod_dt_obj(dt)->ldo_comp_cached == 0);
6451                 rc = lod_striped_create(env, dt, attr, dof, th);
6452         }
6453
6454         RETURN(rc);
6455 }
6456
6457 static inline int
6458 lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
6459                           struct dt_object *dt, struct thandle *th,
6460                           int comp_idx, int stripe_idx,
6461                           struct lod_obj_stripe_cb_data *data)
6462 {
6463         if (data->locd_declare)
6464                 return lod_sub_declare_destroy(env, dt, th);
6465
6466         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6467             stripe_idx == cfs_fail_val)
6468                 return lod_sub_destroy(env, dt, th);
6469
6470         return 0;
6471 }
6472
6473 /**
6474  * Implementation of dt_object_operations::do_declare_destroy.
6475  *
6476  * If the object is a striped directory, then the function declares reference
6477  * removal from the master object (this is an index) to the stripes and declares
6478  * destroy of all the stripes. In all the cases, it declares an intention to
6479  * destroy the object itself.
6480  *
6481  * \see dt_object_operations::do_declare_destroy() in the API description
6482  * for details.
6483  */
6484 static int lod_declare_destroy(const struct lu_env *env, struct dt_object *dt,
6485                                struct thandle *th)
6486 {
6487         struct dt_object *next = dt_object_child(dt);
6488         struct lod_object *lo = lod_dt_obj(dt);
6489         struct lod_thread_info *info = lod_env_info(env);
6490         struct dt_object *stripe;
6491         char *stripe_name = info->lti_key;
6492         int rc, i;
6493
6494         ENTRY;
6495
6496         /*
6497          * load striping information, notice we don't do this when object
6498          * is being initialized as we don't need this information till
6499          * few specific cases like destroy, chown
6500          */
6501         rc = lod_striping_load(env, lo);
6502         if (rc)
6503                 RETURN(rc);
6504
6505         /* declare destroy for all underlying objects */
6506         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6507                 rc = next->do_ops->do_index_try(env, next,
6508                                                 &dt_directory_features);
6509                 if (rc != 0)
6510                         RETURN(rc);
6511
6512                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6513                         stripe = lo->ldo_stripe[i];
6514                         if (!stripe)
6515                                 continue;
6516
6517                         rc = lod_sub_declare_ref_del(env, next, th);
6518                         if (rc != 0)
6519                                 RETURN(rc);
6520
6521                         snprintf(stripe_name, sizeof(info->lti_key),
6522                                  DFID":%d",
6523                                  PFID(lu_object_fid(&stripe->do_lu)), i);
6524                         rc = lod_sub_declare_delete(env, next,
6525                                         (const struct dt_key *)stripe_name, th);
6526                         if (rc != 0)
6527                                 RETURN(rc);
6528                 }
6529         }
6530
6531         /*
6532          * we declare destroy for the local object
6533          */
6534         rc = lod_sub_declare_destroy(env, next, th);
6535         if (rc)
6536                 RETURN(rc);
6537
6538         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6539             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6540                 RETURN(0);
6541
6542         if (!lod_obj_is_striped(dt))
6543                 RETURN(0);
6544
6545         /* declare destroy all striped objects */
6546         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6547                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6548                         stripe = lo->ldo_stripe[i];
6549                         if (!stripe)
6550                                 continue;
6551
6552                         if (!dt_object_exists(stripe))
6553                                 continue;
6554
6555                         rc = lod_sub_declare_ref_del(env, stripe, th);
6556                         if (rc != 0)
6557                                 break;
6558
6559                         rc = lod_sub_declare_destroy(env, stripe, th);
6560                         if (rc != 0)
6561                                 break;
6562                 }
6563         } else {
6564                 struct lod_obj_stripe_cb_data data = { { 0 } };
6565
6566                 data.locd_declare = true;
6567                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6568                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6569         }
6570
6571         RETURN(rc);
6572 }
6573
6574 /**
6575  * Implementation of dt_object_operations::do_destroy.
6576  *
6577  * If the object is a striped directory, then the function removes references
6578  * from the master object (this is an index) to the stripes and destroys all
6579  * the stripes. In all the cases, the function destroys the object itself.
6580  *
6581  * \see dt_object_operations::do_destroy() in the API description for details.
6582  */
6583 static int lod_destroy(const struct lu_env *env, struct dt_object *dt,
6584                        struct thandle *th)
6585 {
6586         struct dt_object  *next = dt_object_child(dt);
6587         struct lod_object *lo = lod_dt_obj(dt);
6588         struct lod_thread_info *info = lod_env_info(env);
6589         char *stripe_name = info->lti_key;
6590         struct dt_object *stripe;
6591         unsigned int i;
6592         int rc;
6593
6594         ENTRY;
6595
6596         /* destroy sub-stripe of master object */
6597         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6598                 rc = next->do_ops->do_index_try(env, next,
6599                                                 &dt_directory_features);
6600                 if (rc != 0)
6601                         RETURN(rc);
6602
6603                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6604                         stripe = lo->ldo_stripe[i];
6605                         if (!stripe)
6606                                 continue;
6607
6608                         rc = lod_sub_ref_del(env, next, th);
6609                         if (rc != 0)
6610                                 RETURN(rc);
6611
6612                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
6613                                 PFID(lu_object_fid(&stripe->do_lu)), i);
6614
6615                         CDEBUG(D_INFO, DFID" delete stripe %s "DFID"\n",
6616                                PFID(lu_object_fid(&dt->do_lu)), stripe_name,
6617                                PFID(lu_object_fid(&stripe->do_lu)));
6618
6619                         rc = lod_sub_delete(env, next,
6620                                        (const struct dt_key *)stripe_name, th);
6621                         if (rc != 0)
6622                                 RETURN(rc);
6623                 }
6624         }
6625
6626         rc = lod_sub_destroy(env, next, th);
6627         if (rc != 0)
6628                 RETURN(rc);
6629
6630         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6631             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6632                 RETURN(0);
6633
6634         if (!lod_obj_is_striped(dt))
6635                 RETURN(0);
6636
6637         /* destroy all striped objects */
6638         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6639                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6640                         stripe = lo->ldo_stripe[i];
6641                         if (!stripe)
6642                                 continue;
6643
6644                         if (!dt_object_exists(stripe))
6645                                 continue;
6646
6647                         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6648                             i == cfs_fail_val) {
6649                                 dt_write_lock(env, stripe, DT_TGT_CHILD);
6650                                 rc = lod_sub_ref_del(env, stripe, th);
6651                                 dt_write_unlock(env, stripe);
6652                                 if (rc != 0)
6653                                         break;
6654
6655                                 rc = lod_sub_destroy(env, stripe, th);
6656                                 if (rc != 0)
6657                                         break;
6658                         }
6659                 }
6660         } else {
6661                 struct lod_obj_stripe_cb_data data = { { 0 } };
6662
6663                 data.locd_declare = false;
6664                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6665                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6666         }
6667
6668         RETURN(rc);
6669 }
6670
/**
 * Implementation of dt_object_operations::do_declare_ref_add.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_add() in the API description
 * for details.
 */
static int lod_declare_ref_add(const struct lu_env *env,
			       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_declare_ref_add(env, next, th);
}
6682
/**
 * Implementation of dt_object_operations::do_ref_add.
 *
 * Forwards the reference increment to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_add() in the API description for details.
 */
static int lod_ref_add(const struct lu_env *env,
		       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_ref_add(env, next, th);
}
6693
/**
 * Implementation of dt_object_operations::do_declare_ref_del.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_del() in the API description
 * for details.
 */
static int lod_declare_ref_del(const struct lu_env *env,
			       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_declare_ref_del(env, next, th);
}
6705
/**
 * Implementation of dt_object_operations::do_ref_del
 *
 * Forwards the reference decrement to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_del() in the API description for details.
 */
static int lod_ref_del(const struct lu_env *env,
		       struct dt_object *dt, struct thandle *th)
{
	struct dt_object *next = dt_object_child(dt);

	return lod_sub_ref_del(env, next, th);
}
6716
6717 /**
6718  * Implementation of dt_object_operations::do_object_sync.
6719  *
6720  * \see dt_object_operations::do_object_sync() in the API description
6721  * for details.
6722  */
6723 static int lod_object_sync(const struct lu_env *env, struct dt_object *dt,
6724                            __u64 start, __u64 end)
6725 {
6726         return dt_object_sync(env, dt_object_child(dt), start, end);
6727 }
6728
6729 /**
6730  * Implementation of dt_object_operations::do_object_unlock.
6731  *
6732  * Used to release LDLM lock(s).
6733  *
6734  * \see dt_object_operations::do_object_unlock() in the API description
6735  * for details.
6736  */
6737 static int lod_object_unlock(const struct lu_env *env, struct dt_object *dt,
6738                              struct ldlm_enqueue_info *einfo,
6739                              union ldlm_policy_data *policy)
6740 {
6741         struct lod_object *lo = lod_dt_obj(dt);
6742         struct lustre_handle_array *slave_locks = einfo->ei_cbdata;
6743         int slave_locks_size;
6744         int i;
6745         ENTRY;
6746
6747         if (slave_locks == NULL)
6748                 RETURN(0);
6749
6750         LASSERT(S_ISDIR(dt->do_lu.lo_header->loh_attr));
6751         /* Note: for remote lock for single stripe dir, MDT will cancel
6752          * the lock by lockh directly */
6753         LASSERT(!dt_object_remote(dt_object_child(dt)));
6754
6755         /* locks were unlocked in MDT layer */
6756         for (i = 0; i < slave_locks->ha_count; i++)
6757                 LASSERT(!lustre_handle_is_used(&slave_locks->ha_handles[i]));
6758
6759         /*
6760          * NB, ha_count may not equal to ldo_dir_stripe_count, because dir
6761          * layout may change, e.g., shrink dir layout after migration.
6762          */
6763         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6764                 if (lo->ldo_stripe[i])
6765                         dt_invalidate(env, lo->ldo_stripe[i]);
6766         }
6767
6768         slave_locks_size = offsetof(typeof(*slave_locks),
6769                                     ha_handles[slave_locks->ha_count]);
6770         OBD_FREE(slave_locks, slave_locks_size);
6771         einfo->ei_cbdata = NULL;
6772
6773         RETURN(0);
6774 }
6775
6776 /**
6777  * Implementation of dt_object_operations::do_object_lock.
6778  *
6779  * Used to get LDLM lock on the non-striped and striped objects.
6780  *
6781  * \see dt_object_operations::do_object_lock() in the API description
6782  * for details.
6783  */
6784 static int lod_object_lock(const struct lu_env *env,
6785                            struct dt_object *dt,
6786                            struct lustre_handle *lh,
6787                            struct ldlm_enqueue_info *einfo,
6788                            union ldlm_policy_data *policy)
6789 {
6790         struct lod_object *lo = lod_dt_obj(dt);
6791         int slave_locks_size;
6792         struct lustre_handle_array *slave_locks = NULL;
6793         int i;
6794         int rc;
6795         ENTRY;
6796
6797         /* remote object lock */
6798         if (!einfo->ei_enq_slave) {
6799                 LASSERT(dt_object_remote(dt));
6800                 return dt_object_lock(env, dt_object_child(dt), lh, einfo,
6801                                       policy);
6802         }
6803
6804         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
6805                 RETURN(-ENOTDIR);
6806
6807         rc = lod_striping_load(env, lo);
6808         if (rc != 0)
6809                 RETURN(rc);
6810
6811         /* No stripes */
6812         if (lo->ldo_dir_stripe_count <= 1)
6813                 RETURN(0);
6814
6815         slave_locks_size = offsetof(typeof(*slave_locks),
6816                                     ha_handles[lo->ldo_dir_stripe_count]);
6817         /* Freed in lod_object_unlock */
6818         OBD_ALLOC(slave_locks, slave_locks_size);
6819         if (!slave_locks)
6820                 RETURN(-ENOMEM);
6821         slave_locks->ha_count = lo->ldo_dir_stripe_count;
6822
6823         /* striped directory lock */
6824         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6825                 struct lustre_handle lockh;
6826                 struct ldlm_res_id *res_id;
6827                 struct dt_object *stripe;
6828
6829                 stripe = lo->ldo_stripe[i];
6830                 if (!stripe)
6831                         continue;
6832
6833                 res_id = &lod_env_info(env)->lti_res_id;
6834                 fid_build_reg_res_name(lu_object_fid(&stripe->do_lu), res_id);
6835                 einfo->ei_res_id = res_id;
6836
6837                 if (dt_object_remote(stripe)) {
6838                         set_bit(i, (void *)slave_locks->ha_map);
6839                         rc = dt_object_lock(env, stripe, &lockh, einfo, policy);
6840                 } else {
6841                         struct ldlm_namespace *ns = einfo->ei_namespace;
6842                         ldlm_blocking_callback blocking = einfo->ei_cb_local_bl;
6843                         ldlm_completion_callback completion = einfo->ei_cb_cp;
6844                         __u64 dlmflags = LDLM_FL_ATOMIC_CB;
6845
6846                         if (einfo->ei_mode == LCK_PW ||
6847                             einfo->ei_mode == LCK_EX)
6848                                 dlmflags |= LDLM_FL_COS_INCOMPAT;
6849
6850                         LASSERT(ns != NULL);
6851                         rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_IBITS,
6852                                                     policy, einfo->ei_mode,
6853                                                     &dlmflags, blocking,
6854                                                     completion, NULL,
6855                                                     NULL, 0, LVB_T_NONE,
6856                                                     NULL, &lockh);
6857                 }
6858                 if (rc) {
6859                         while (i--)
6860                                 ldlm_lock_decref_and_cancel(
6861                                                 &slave_locks->ha_handles[i],
6862                                                 einfo->ei_mode);
6863                         OBD_FREE(slave_locks, slave_locks_size);
6864                         RETURN(rc);
6865                 }
6866                 slave_locks->ha_handles[i] = lockh;
6867         }
6868         einfo->ei_cbdata = slave_locks;
6869
6870         RETURN(0);
6871 }
6872
/**
 * Implementation of dt_object_operations::do_invalidate.
 *
 * Invalidation is performed on the child object of \a dt.
 *
 * \see dt_object_operations::do_invalidate() in the API description for details
 */
static int lod_invalidate(const struct lu_env *env, struct dt_object *dt)
{
	struct dt_object *child = dt_object_child(dt);

	return dt_invalidate(env, child);
}
6882
6883 static int lod_declare_instantiate_components(const struct lu_env *env,
6884                                               struct lod_object *lo,
6885                                               struct thandle *th,
6886                                               __u64 reserve)
6887 {
6888         struct lod_thread_info *info = lod_env_info(env);
6889         int i;
6890         int rc = 0;
6891         ENTRY;
6892
6893         LASSERT(info->lti_count < lo->ldo_comp_cnt);
6894
6895         for (i = 0; i < info->lti_count; i++) {
6896                 rc = lod_qos_prep_create(env, lo, NULL, th,
6897                                          info->lti_comp_idx[i], reserve);
6898                 if (rc)
6899                         break;
6900         }
6901
6902         if (!rc) {
6903                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6904                 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
6905                                 &info->lti_buf, XATTR_NAME_LOV, 0, th);
6906         }
6907
6908         RETURN(rc);
6909 }
6910
6911 /**
6912  * Check OSTs for an existing component for further extension
6913  *
6914  * Checks if OSTs are still healthy and not out of space.  Gets free space
6915  * on OSTs (relative to allocation watermark rmb_low) and compares to
6916  * the proposed new_end for this component.
6917  *
6918  * Decides whether or not to extend a component on its current OSTs.
6919  *
6920  * \param[in] env               execution environment for this thread
6921  * \param[in] lo                object we're checking
6922  * \param[in] index             index of this component
6923  * \param[in] extension_size    extension size for this component
6924  * \param[in] extent            layout extent for requested operation
6925  * \param[in] comp_extent       extension component extent
6926  * \param[in] write             if this is write operation
6927  *
6928  * \retval      true - OK to extend on current OSTs
6929  * \retval      false - do not extend on current OSTs
6930  */
6931 static bool lod_sel_osts_allowed(const struct lu_env *env,
6932                                  struct lod_object *lo,
6933                                  int index, __u64 reserve,
6934                                  struct lu_extent *extent,
6935                                  struct lu_extent *comp_extent, int write)
6936 {
6937         struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[index];
6938         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6939         struct lod_thread_info *tinfo = lod_env_info(env);
6940         struct obd_statfs *sfs = &tinfo->lti_osfs;
6941         __u64 available = 0;
6942         bool ret = true;
6943         int i, rc;
6944
6945         ENTRY;
6946
6947         LASSERT(lod_comp->llc_stripe_count != 0);
6948
6949         lod_getref(&lod->lod_ost_descs);
6950         for (i = 0; i < lod_comp->llc_stripe_count; i++) {
6951                 int index = lod_comp->llc_ost_indices[i];
6952                 struct lod_tgt_desc *ost = OST_TGT(lod, index);
6953                 struct obd_statfs_info info = { 0 };
6954                 int j, repeated = 0;
6955
6956                 LASSERT(ost);
6957
6958                 /* Get the number of times this OST repeats in this component.
6959                  * Note: inter-component repeats are not counted as this is
6960                  * considered as a rare case: we try to not repeat OST in other
6961                  * components if possible. */
6962                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6963                         if (index != lod_comp->llc_ost_indices[j])
6964                                 continue;
6965
6966                         /* already handled */
6967                         if (j < i)
6968                                 break;
6969
6970                         repeated++;
6971                 }
6972                 if (j < lod_comp->llc_stripe_count)
6973                         continue;
6974
6975                 if (!test_bit(index, lod->lod_ost_bitmap)) {
6976                         CDEBUG(D_LAYOUT, "ost %d no longer present\n", index);
6977                         ret = false;
6978                         break;
6979                 }
6980
6981                 rc = dt_statfs_info(env, ost->ltd_tgt, sfs, &info);
6982                 if (rc) {
6983                         CDEBUG(D_LAYOUT, "statfs failed for ost %d, error %d\n",
6984                                index, rc);
6985                         ret = false;
6986                         break;
6987                 }
6988
6989                 if (sfs->os_state & OS_STATFS_ENOSPC ||
6990                     sfs->os_state & OS_STATFS_READONLY ||
6991                     sfs->os_state & OS_STATFS_DEGRADED) {
6992                         CDEBUG(D_LAYOUT, "ost %d is not availble for SEL "
6993                                "extension, state %u\n", index, sfs->os_state);
6994                         ret = false;
6995                         break;
6996                 }
6997
6998                 /* In bytes */
6999                 available = sfs->os_bavail * sfs->os_bsize;
7000                 /* 'available' is relative to the allocation threshold */
7001                 available -= (__u64) info.os_reserved_mb_low << 20;
7002
7003                 CDEBUG(D_LAYOUT, "ost %d lowwm: %d highwm: %d, "
7004                        "%llu %% blocks available, %llu %% blocks free\n",
7005                        index, info.os_reserved_mb_low, info.os_reserved_mb_high,
7006                        (100ull * sfs->os_bavail) / sfs->os_blocks,
7007                        (100ull * sfs->os_bfree) / sfs->os_blocks);
7008
7009                 if (reserve * repeated > available) {
7010                         ret = false;
7011                         CDEBUG(D_LAYOUT, "low space on ost %d, available %llu "
7012                                "< extension size %llu repeated %d\n", index,
7013                                available, reserve, repeated);
7014                         break;
7015                 }
7016         }
7017         lod_putref(lod, &lod->lod_ost_descs);
7018
7019         RETURN(ret);
7020 }
7021
7022 /**
7023  * Adjust extents after component removal
7024  *
7025  * When we remove an extension component, we move the start of the next
7026  * component to match the start of the extension component, so no space is left
7027  * without layout.
7028  *
7029  * \param[in] env       execution environment for this thread
7030  * \param[in] lo        object
7031  * \param[in] max_comp  layout component
7032  * \param[in] index     index of this component
7033  *
7034  * \retval              0 on success
7035  * \retval              negative errno on error
7036  */
7037 static void lod_sel_adjust_extents(const struct lu_env *env,
7038                                    struct lod_object *lo,
7039                                    int max_comp, int index)
7040 {
7041         struct lod_layout_component *lod_comp = NULL;
7042         struct lod_layout_component *next = NULL;
7043         struct lod_layout_component *prev = NULL;
7044         __u64 new_start = 0;
7045         __u64 start;
7046         int i;
7047
7048         /* Extension space component */
7049         lod_comp = &lo->ldo_comp_entries[index];
7050         next = &lo->ldo_comp_entries[index + 1];
7051         prev = &lo->ldo_comp_entries[index - 1];
7052
7053         LASSERT(lod_comp != NULL && prev != NULL && next != NULL);
7054         LASSERT(lod_comp->llc_flags & LCME_FL_EXTENSION);
7055
7056         /* Previous is being removed */
7057         if (prev && prev->llc_id == LCME_ID_INVAL)
7058                 new_start = prev->llc_extent.e_start;
7059         else
7060                 new_start = lod_comp->llc_extent.e_start;
7061
7062         for (i = index + 1; i < max_comp; i++) {
7063                 lod_comp = &lo->ldo_comp_entries[i];
7064
7065                 start = lod_comp->llc_extent.e_start;
7066                 lod_comp->llc_extent.e_start = new_start;
7067
7068                 /* We only move zero length extendable components */
7069                 if (!(start == lod_comp->llc_extent.e_end))
7070                         break;
7071
7072                 LASSERT(!(lod_comp->llc_flags & LCME_FL_INIT));
7073
7074                 lod_comp->llc_extent.e_end = new_start;
7075         }
7076 }
7077
7078 /* Calculate the proposed 'new end' for a component we're extending */
7079 static __u64 lod_extension_new_end(__u64 extension_size, __u64 extent_end,
7080                                    __u32 stripe_size, __u64 component_end,
7081                                    __u64 extension_end)
7082 {
7083         __u64 new_end;
7084
7085         LASSERT(extension_size != 0 && stripe_size != 0);
7086
7087         /* Round up to extension size */
7088         if (extent_end == OBD_OBJECT_EOF) {
7089                 new_end = OBD_OBJECT_EOF;
7090         } else {
7091                 /* Add at least extension_size to the previous component_end,
7092                  * covering the req layout extent */
7093                 new_end = max(extent_end - component_end, extension_size);
7094                 new_end = roundup(new_end, extension_size);
7095                 new_end += component_end;
7096
7097                 /* Component end must be min stripe size aligned */
7098                 if (new_end % stripe_size) {
7099                         CDEBUG(D_LAYOUT, "new component end is not aligned "
7100                                "by the stripe size %u: [%llu, %llu) ext size "
7101                                "%llu new end %llu, aligning\n",
7102                                stripe_size, component_end, extent_end,
7103                                extension_size, new_end);
7104                         new_end = roundup(new_end, stripe_size);
7105                 }
7106
7107                 /* Overflow */
7108                 if (new_end < extent_end)
7109                         new_end = OBD_OBJECT_EOF;
7110         }
7111
7112         /* Don't extend past the end of the extension component */
7113         if (new_end > extension_end)
7114                 new_end = extension_end;
7115
7116         return new_end;
7117 }
7118
7119 /**
7120  * Calculate the exact reservation (per-OST extension_size) on the OSTs being
7121  * instantiated. It needs to be calculated in advance and taken into account at
7122  * the instantiation time, because otherwise lod_statfs_and_check() may consider
7123  * an OST as OK, but SEL needs its extension_size to fit the free space and the
7124  * OST may turn out to be low-on-space, thus inappropriate OST may be used and
7125  * ENOSPC occurs.
7126  *
7127  * \param[in] lod_comp          lod component we are checking
7128  *
7129  * \retval      size to reserved on each OST of lod_comp's stripe.
7130  */
7131 static __u64 lod_sel_stripe_reserved(struct lod_layout_component *lod_comp)
7132 {
7133         /* extension_size is file level, so we must divide by stripe count to
7134          * compare it to available space on a single OST */
7135         return  lod_comp->llc_stripe_size * SEL_UNIT_SIZE /
7136                 lod_comp->llc_stripe_count;
7137 }
7138
7139 /* As lod_sel_handler() could be re-entered for the same component several
7140  * times, this is the data for the next call. lod_sel_handler() consumes both
7141  * fields and resets them to 0 on every call. Fields could be changed to
7142  * component indexes when needed, (e.g. if there is no need to instantiate
7143  * all the previous components up to the current position) to tell the caller
 * where to start over from. */
7144 struct sel_data {
7145         int sd_force;	/* non-zero: next call skips lod_sel_osts_allowed() */
7146         int sd_repeat;	/* non-zero: previous comp was just made by repeating */
7147 };
7148
7149 /**
7150  * Process extent updates for a particular layout component
7151  *
7152  * Handle layout updates for a particular extension space component touched by
7153  * a layout update operation.  Core function of self-extending PFL feature.
7154  *
7155  * In general, this function processes exactly *one* stage of an extension
7156  * operation, modifying the layout accordingly, then returns to the caller.
7157  * The caller is responsible for restarting processing with the new layout,
7158  * which may repeatedly return to this function until the extension updates
7159  * are complete.
7160  *
7161  * This function does one of a few things to the layout:
7162  * 1. Extends the component before the current extension space component to
7163  * allow it to accomodate the requested operation (if space/policy permit that
7164  * component to continue on its current OSTs)
7165  *
7166  * 2. If extension of the existing component fails, we do one of two things:
7167  *    a. If there is a component after the extension space, we remove the
7168  *       extension space component, move the start of the next component down
7169  *       accordingly, then notify the caller to restart processing w/the new
7170  *       layout.
7171  *    b. If there is no following component, we try repeating the current
7172  *       component, creating a new component using the current one as a
7173  *       template (keeping its stripe properties but not specific striping),
7174  *       and try assigning striping for this component.  If there is sufficient
7175  *       free space on the OSTs chosen for this component, it is instantiated
7176  *       and i/o continues there.
7177  *
7178  *       If there is not sufficient space on the new OSTs, we remove this new
7179  *       component & extend the current component.
7180  *
7181  * Note further that uninited components followed by extension space can be zero
7182  * length meaning that we will try to extend them before initializing them, and
7183  * if that fails, they will be removed without initialization.
7184  *
7185  * 3. If we extend to/beyond the end of an extension space component, that
7186  * component is exhausted (all of its range has been given to real components),
7187  * so we remove it and restart processing.
7188  *
7189  * \param[in] env               execution environment for this thread
7190  * \param[in,out] lo            object to update the layout of
7191  * \param[in] extent            layout extent for requested operation, update
7192  *                              layout to fit this operation
7193  * \param[in] th                transaction handle for this operation
7194  * \param[in,out] max_comp      the highest comp for the portion of the layout
7195  *                              we are operating on (For FLR, the chosen
7196  *                              replica).  Updated because we may remove
7197  *                              components.
7198  * \param[in] index             index of the extension space component we're
7199  *                              working on
7200  * \param[in] write             if this is write op
7201  * \param[in,out] force         if the extension is to be forced; set here
7202                                 to force it on the 2nd call for the same
7203                                 extension component
7204  *
7205  * \retval      0 on success
7206  * \retval      negative errno on error
7207  */
7208 static int lod_sel_handler(const struct lu_env *env,
7209                           struct lod_object *lo,
7210                           struct lu_extent *extent,
7211                           struct thandle *th, int *max_comp,
7212                           int index, int write,
7213                           struct sel_data *sd)
7214 {
7215         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7216         struct lod_thread_info *info = lod_env_info(env);
7217         struct lod_layout_component *lod_comp;
7218         struct lod_layout_component *prev;
7219         struct lod_layout_component *next = NULL;
7220         __u64 extension_size, reserve;
7221         __u64 new_end = 0;
7222         bool repeated;
7223         int change = 0;
7224         int rc = 0;
7225         ENTRY;
7226
7227         /* First component cannot be extension space */
7228         if (index == 0) {
7229                 CERROR("%s: "DFID" first component cannot be extension space\n",
7230                        lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7231                 RETURN(-EINVAL);
7232         }
7233
7234         lod_comp = &lo->ldo_comp_entries[index];
7235         prev = &lo->ldo_comp_entries[index - 1];
7236         if ((index + 1) < *max_comp)
7237                 next = &lo->ldo_comp_entries[index + 1];
7238
7239         /* extension size uses the stripe size field as KiB */
7240         extension_size = lod_comp->llc_stripe_size * SEL_UNIT_SIZE;
7241
7242         CDEBUG(D_LAYOUT, "prev start %llu, extension start %llu, extension end"
7243                " %llu, extension size %llu\n", prev->llc_extent.e_start,
7244                lod_comp->llc_extent.e_start, lod_comp->llc_extent.e_end,
7245                extension_size);
7246
7247         /* Two extension space components cannot be adjacent & extension space
7248          * components cannot be init */
7249         if ((prev->llc_flags & LCME_FL_EXTENSION) ||
7250             !(ergo(next, !(next->llc_flags & LCME_FL_EXTENSION))) ||
7251              lod_comp_inited(lod_comp)) {
7252                 CERROR("%s: "DFID" invalid extension space components\n",
7253                        lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7254                 RETURN(-EINVAL);
7255         }
7256
7257         reserve = lod_sel_stripe_reserved(lod_comp);
7258
7259         if (!prev->llc_stripe) {
7260                 CDEBUG(D_LAYOUT, "Previous component not inited\n");
7261                 info->lti_count = 1;
7262                 info->lti_comp_idx[0] = index - 1;
7263                 rc = lod_declare_instantiate_components(env, lo, th, reserve);
7264                 /* ENOSPC tells us we can't use this component.  If there is
7265                  * a next or we are repeating, we either spill over (next) or
7266                  * extend the original comp (repeat).  Otherwise, return the
7267                  * error to the user. */
7268                 if (rc == -ENOSPC && (next || sd->sd_repeat))
7269                         rc = 1;
7270                 if (rc < 0)
7271                         RETURN(rc);
7272         }
7273
7274         if (sd->sd_force == 0 && rc == 0)
7275                 rc = !lod_sel_osts_allowed(env, lo, index - 1, reserve, extent,
7276                                            &lod_comp->llc_extent, write);
7277
7278         repeated = !!(sd->sd_repeat);
7279         sd->sd_repeat = 0;
7280         sd->sd_force = 0;
7281
7282         /* Extend previous component */
7283         if (rc == 0) {
7284                 new_end = lod_extension_new_end(extension_size, extent->e_end,
7285                                                 prev->llc_stripe_size,
7286                                                 prev->llc_extent.e_end,
7287                                                 lod_comp->llc_extent.e_end);
7288
7289                 CDEBUG(D_LAYOUT, "new end %llu\n", new_end);
7290                 lod_comp->llc_extent.e_start = new_end;
7291                 prev->llc_extent.e_end = new_end;
7292
7293                 if (prev->llc_extent.e_end == lod_comp->llc_extent.e_end) {
7294                         CDEBUG(D_LAYOUT, "Extension component exhausted\n");
7295                         lod_comp->llc_id = LCME_ID_INVAL;
7296                         change--;
7297                 }
7298         } else {
7299                 /* rc == 1, failed to extend current component */
7300                 LASSERT(rc == 1);
7301                 if (next) {
7302                         /* Normal 'spillover' case - Remove the extension
7303                          * space component & bring down the start of the next
7304                          * component. */
7305                         lod_comp->llc_id = LCME_ID_INVAL;
7306                         change--;
7307                         if (!(prev->llc_flags & LCME_FL_INIT)) {
7308                                 prev->llc_id = LCME_ID_INVAL;
7309                                 change--;
7310                         }
7311                         lod_sel_adjust_extents(env, lo, *max_comp, index);
7312                 } else if (lod_comp_inited(prev)) {
7313                         /* If there is no next, and the previous component is
7314                          * INIT'ed, try repeating the previous component. */
7315                         LASSERT(repeated == 0);
7316                         rc = lod_layout_repeat_comp(env, lo, index - 1);
7317                         if (rc < 0)
7318                                 RETURN(rc);
7319                         change++;
7320                         /* The previous component is a repeated component.
7321                          * Record this so we don't keep trying to repeat it. */
7322                         sd->sd_repeat = 1;
7323                 } else {
7324                         /* If the previous component is not INIT'ed, this may
7325                          * be a component we have just instantiated but failed
7326                          * to extend. Or even a repeated component we failed
7327                          * to prepare a striping for. Do not repeat but instead
7328                          * remove the repeated component & force the extention
7329                          * of the original one */
7330                         sd->sd_force = 1;
7331                         if (repeated) {
7332                                 prev->llc_id = LCME_ID_INVAL;
7333                                 change--;
7334                         }
7335                 }
7336         }
7337
7338         if (change < 0) {
7339                 rc = lod_layout_del_prep_layout(env, lo, NULL);
7340                 if (rc < 0)
7341                         RETURN(rc);
7342                 LASSERTF(-rc == change,
7343                          "number deleted %d != requested %d\n", -rc,
7344                          change);
7345         }
7346         *max_comp = *max_comp + change;
7347
7348         /* lod_del_prep_layout reallocates ldo_comp_entries, so we must
7349          * refresh these pointers before using them */
7350         lod_comp = &lo->ldo_comp_entries[index];
7351         prev = &lo->ldo_comp_entries[index - 1];
7352         CDEBUG(D_LAYOUT, "After extent updates: prev start %llu, current start "
7353                "%llu, current end %llu max_comp %d ldo_comp_cnt %d\n",
7354                prev->llc_extent.e_start, lod_comp->llc_extent.e_start,
7355                lod_comp->llc_extent.e_end, *max_comp, lo->ldo_comp_cnt);
7356
7357         /* Layout changed successfully */
7358         RETURN(0);
7359 }
7360
7361 /**
7362  * Declare layout extent updates
7363  *
7364  * Handles extensions.  Identifies extension components touched by current
7365  * operation and passes them to processing function.
7366  *
7367  * Restarts with updated layouts from the processing function until the current
7368  * operation no longer touches an extension space component.
7369  *
7370  * \param[in] env       execution environment for this thread
7371  * \param[in,out] lo    object to update the layout of
7372  * \param[in] extent    layout extent for requested operation, update layout to
7373  *                      fit this operation
7374  * \param[in] th        transaction handle for this operation
7375  * \param[in] pick      identifies chosen mirror for FLR layouts
7376  * \param[in] write     if this is write op
7377  *
7378  * \retval      1 on layout changed, 0 on no change
7379  * \retval      negative errno on error
7380  */
7381 static int lod_declare_update_extents(const struct lu_env *env,
7382                 struct lod_object *lo, struct lu_extent *extent,
7383                 struct thandle *th, int pick, int write)
7384 {
7385         struct lod_thread_info *info = lod_env_info(env);
7386         struct lod_layout_component *lod_comp;
7387         bool layout_changed = false;
7388         struct sel_data sd = { 0 };
7389         int start_index;
7390         int i = 0;
7391         int max_comp = 0;
7392         int rc = 0, rc2;
7393         int change = 0;
7394         ENTRY;
7395
7396         /* This makes us work on the components of the chosen mirror */
7397         start_index = lo->ldo_mirrors[pick].lme_start;
7398         max_comp = lo->ldo_mirrors[pick].lme_end + 1;
7399         if (lo->ldo_flr_state == LCM_FL_NONE)
7400                 LASSERT(start_index == 0 && max_comp == lo->ldo_comp_cnt);
7401
7402         CDEBUG(D_LAYOUT, "extent->e_start %llu, extent->e_end %llu\n",
7403                extent->e_start, extent->e_end);
7404         for (i = start_index; i < max_comp; i++) {
7405                 lod_comp = &lo->ldo_comp_entries[i];
7406
7407                 /* We've passed all components of interest */
7408                 if (lod_comp->llc_extent.e_start >= extent->e_end)
7409                         break;
7410
7411                 if (lod_comp->llc_flags & LCME_FL_EXTENSION) {
7412                         layout_changed = true;
7413                         rc = lod_sel_handler(env, lo, extent, th, &max_comp,
7414                                              i, write, &sd);
7415                         if (rc < 0)
7416                                 GOTO(out, rc);
7417
7418                         /* Nothing has changed behind the prev one */
7419                         i -= 2;
7420                         continue;
7421                 }
7422         }
7423
7424         /* We may have added or removed components.  If so, we must update the
7425          * start & ends of all the mirrors after the current one, and the end
7426          * of the current mirror. */
7427         change = max_comp - 1 - lo->ldo_mirrors[pick].lme_end;
7428         if (change) {
7429                 lo->ldo_mirrors[pick].lme_end += change;
7430                 for (i = pick + 1; i < lo->ldo_mirror_count; i++) {
7431                         lo->ldo_mirrors[i].lme_start += change;
7432                         lo->ldo_mirrors[i].lme_end += change;
7433                 }
7434         }
7435
7436         EXIT;
7437 out:
7438         /* The amount of components has changed, adjust the lti_comp_idx */
7439         rc2 = lod_layout_data_init(info, lo->ldo_comp_cnt);
7440
7441         return rc < 0 ? rc : rc2 < 0 ? rc2 : layout_changed;
7442 }
7443
7444 /* If striping is already instantiated or INIT'ed DOM? */
7445 static bool lod_is_instantiation_needed(struct lod_layout_component *comp)
7446 {
7447         return !(((lov_pattern(comp->llc_pattern) == LOV_PATTERN_MDT) &&
7448                   lod_comp_inited(comp)) || comp->llc_stripe);
7449 }
7450
7451 /**
7452  * Declare layout update for a non-FLR layout.
7453  *
7454  * \param[in] env       execution environment for this thread
7455  * \param[in,out] lo    object to update the layout of
7456  * \param[in] layout    layout intent for requested operation, "update" is
7457  *                      a process of reacting to this
7458  * \param[in] buf       buffer containing lov ea (see comment on usage inline)
7459  * \param[in] th        transaction handle for this operation
7460  *
7461  * \retval      0 on success
7462  * \retval      negative errno on error
7463  */
7464 static int lod_declare_update_plain(const struct lu_env *env,
7465                 struct lod_object *lo, struct layout_intent *layout,
7466                 const struct lu_buf *buf, struct thandle *th)
7467 {
7468         struct lod_thread_info *info = lod_env_info(env);
7469         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7470         struct lod_layout_component *lod_comp;
7471         struct lov_comp_md_v1 *comp_v1 = NULL;
7472         bool layout_changed = false;
7473         bool replay = false;
7474         int i, rc;
7475         ENTRY;
7476
7477         LASSERT(lo->ldo_flr_state == LCM_FL_NONE);
7478
7479         /*
7480          * In case the client is passing lovea, which only happens during
7481          * the replay of layout intent write RPC for now, we may need to
7482          * parse the lovea and apply new layout configuration.
7483          */
7484         if (buf && buf->lb_len)  {
7485                 struct lov_user_md_v1 *v1 = buf->lb_buf;
7486
7487                 if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
7488                     v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
7489                                               LOV_MAGIC_COMP_V1)) {
7490                         CERROR("%s: the replay buffer of layout extend "
7491                                "(magic %#x) does not contain expected "
7492                                "composite layout.\n",
7493                                lod2obd(d)->obd_name, v1->lmm_magic);
7494                         GOTO(out, rc = -EINVAL);
7495                 }
7496
7497                 rc = lod_use_defined_striping(env, lo, buf);
7498                 if (rc)
7499                         GOTO(out, rc);
7500                 lo->ldo_comp_cached = 1;
7501
7502                 rc = lod_get_lov_ea(env, lo);
7503                 if (rc <= 0)
7504                         GOTO(out, rc);
7505                 /* old on-disk EA is stored in info->lti_buf */
7506                 comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
7507                 replay = true;
7508                 layout_changed = true;
7509
7510                 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
7511                 if (rc)
7512                         GOTO(out, rc);
7513         } else {
7514                 /* non replay path */
7515                 rc = lod_striping_load(env, lo);
7516                 if (rc)
7517                         GOTO(out, rc);
7518         }
7519
7520         /* Make sure defined layout covers the requested write range. */
7521         lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
7522         if (lo->ldo_comp_cnt > 1 &&
7523             lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
7524             lod_comp->llc_extent.e_end < layout->li_extent.e_end) {
7525                 CDEBUG_LIMIT(replay ? D_ERROR : D_LAYOUT,
7526                              "%s: the defined layout [0, %#llx) does not "
7527                              "covers the write range "DEXT"\n",
7528                              lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
7529                              PEXT(&layout->li_extent));
7530                 GOTO(out, rc = -EINVAL);
7531         }
7532
7533         CDEBUG(D_LAYOUT, "%s: "DFID": update components "DEXT"\n",
7534                lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
7535                PEXT(&layout->li_extent));
7536
7537         if (!replay) {
7538                 rc = lod_declare_update_extents(env, lo, &layout->li_extent,
7539                                 th, 0, layout->li_opc == LAYOUT_INTENT_WRITE);
7540                 if (rc < 0)
7541                         GOTO(out, rc);
7542                 else if (rc)
7543                         layout_changed = true;
7544         }
7545
7546         /*
7547          * Iterate ld->ldo_comp_entries, find the component whose extent under
7548          * the write range and not instantianted.
7549          */
7550         for (i = 0; i < lo->ldo_comp_cnt; i++) {
7551                 lod_comp = &lo->ldo_comp_entries[i];
7552
7553                 if (lod_comp->llc_extent.e_start >= layout->li_extent.e_end)
7554                         break;
7555
7556                 if (!replay) {
7557                         /* If striping is instantiated or INIT'ed DOM skip */
7558                         if (!lod_is_instantiation_needed(lod_comp))
7559                                 continue;
7560                 } else {
7561                         /**
7562                          * In replay path, lod_comp is the EA passed by
7563                          * client replay buffer,  comp_v1 is the pre-recovery
7564                          * on-disk EA, we'd sift out those components which
7565                          * were init-ed in the on-disk EA.
7566                          */
7567                         if (le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags) &
7568                             LCME_FL_INIT)
7569                                 continue;
7570                 }
7571                 /*
7572                  * this component hasn't instantiated in normal path, or during
7573                  * replay it needs replay the instantiation.
7574                  */
7575
7576                 /* A released component is being extended */
7577                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
7578                         GOTO(out, rc = -EINVAL);
7579
7580                 LASSERT(info->lti_comp_idx != NULL);
7581                 info->lti_comp_idx[info->lti_count++] = i;
7582                 layout_changed = true;
7583         }
7584
7585         if (!layout_changed)
7586                 RETURN(-EALREADY);
7587
7588         lod_obj_inc_layout_gen(lo);
7589         rc = lod_declare_instantiate_components(env, lo, th, 0);
7590         EXIT;
7591 out:
7592         if (rc)
7593                 lod_striping_free(env, lo);
7594         return rc;
7595 }
7596
7597 static inline int lod_comp_index(struct lod_object *lo,
7598                                  struct lod_layout_component *lod_comp)
7599 {
7600         LASSERT(lod_comp >= lo->ldo_comp_entries &&
7601                 lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
7602
7603         return lod_comp - lo->ldo_comp_entries;
7604 }
7605
7606 /**
7607  * Stale other mirrors by writing extent.
7608  */
7609 static int lod_stale_components(const struct lu_env *env, struct lod_object *lo,
7610                                 int primary, struct lu_extent *extent,
7611                                 struct thandle *th)
7612 {
7613         struct lod_layout_component *pri_comp, *lod_comp;
7614         struct lu_extent pri_extent;
7615         int rc = 0;
7616         int i;
7617         ENTRY;
7618
7619         /* The writing extent decides which components in the primary
7620          * are affected... */
7621         CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
7622
7623 restart:
7624         lod_foreach_mirror_comp(pri_comp, lo, primary) {
7625                 if (!lu_extent_is_overlapped(extent, &pri_comp->llc_extent))
7626                         continue;
7627
7628                 CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
7629                        lod_comp_index(lo, pri_comp),
7630                        PEXT(&pri_comp->llc_extent));
7631
7632                 pri_extent.e_start = pri_comp->llc_extent.e_start;
7633                 pri_extent.e_end = pri_comp->llc_extent.e_end;
7634
7635                 for (i = 0; i < lo->ldo_mirror_count; i++) {
7636                         if (i == primary)
7637                                 continue;
7638                         rc = lod_declare_update_extents(env, lo, &pri_extent,
7639                                                         th, i, 0);
7640                         /* if update_extents changed the layout, it may have
7641                          * reallocated the component array, so start over to
7642                          * avoid using stale pointers */
7643                         if (rc == 1)
7644                                 goto restart;
7645                         if (rc < 0)
7646                                 RETURN(rc);
7647
7648                         /* ... and then stale other components that are
7649                          * overlapping with primary components */
7650                         lod_foreach_mirror_comp(lod_comp, lo, i) {
7651                                 if (!lu_extent_is_overlapped(
7652                                                         &pri_extent,
7653                                                         &lod_comp->llc_extent))
7654                                         continue;
7655
7656                                 CDEBUG(D_LAYOUT, "stale: %u / %u\n",
7657                                       i, lod_comp_index(lo, lod_comp));
7658
7659                                 lod_comp->llc_flags |= LCME_FL_STALE;
7660                                 lo->ldo_mirrors[i].lme_stale = 1;
7661                         }
7662                 }
7663         }
7664
7665         RETURN(rc);
7666 }
7667
7668 /**
7669  * check an OST's availability
7670  * \param[in] env       execution environment
7671  * \param[in] lo        lod object
7672  * \param[in] dt        dt object
7673  * \param[in] index     mirror index
7674  *
7675  * \retval      negative if failed
7676  * \retval      1 if \a dt is available
7677  * \retval      0 if \a dt is not available
7678  */
7679 static inline int lod_check_ost_avail(const struct lu_env *env,
7680                                       struct lod_object *lo,
7681                                       struct dt_object *dt, int index)
7682 {
7683         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7684         struct lod_tgt_desc *ost;
7685         __u32 idx;
7686         int type = LU_SEQ_RANGE_OST;
7687         int rc;
7688
7689         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
7690         if (rc < 0) {
7691                 CERROR("%s: can't locate "DFID":rc = %d\n",
7692                        lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
7693                        rc);
7694                 return rc;
7695         }
7696
7697         ost = OST_TGT(lod, idx);
7698         if (ost->ltd_statfs.os_state &
7699                 (OS_STATFS_READONLY | OS_STATFS_ENOSPC | OS_STATFS_ENOINO |
7700                  OS_STATFS_NOPRECREATE) ||
7701             ost->ltd_active == 0) {
7702                 CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n",
7703                        PFID(lod_object_fid(lo)), index, idx, rc);
7704                 return 0;
7705         }
7706
7707         return 1;
7708 }
7709
7710 /**
7711  * Pick primary mirror for write
7712  * \param[in] env       execution environment
7713  * \param[in] lo        object
7714  * \param[in] extent    write range
7715  */
7716 static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
7717                             struct lu_extent *extent)
7718 {
7719         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7720         unsigned int seq = 0;
7721         struct lod_layout_component *lod_comp;
7722         int i, j, rc;
7723         int picked = -1, second_pick = -1, third_pick = -1;
7724         ENTRY;
7725
7726         if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
7727                 get_random_bytes(&seq, sizeof(seq));
7728                 seq %= lo->ldo_mirror_count;
7729         }
7730
7731         /**
7732          * Pick a mirror as the primary, and check the availability of OSTs.
7733          *
7734          * This algo can be revised later after knowing the topology of
7735          * cluster.
7736          */
7737         lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
7738
7739         rc = lod_fill_mirrors(lo);
7740         if (rc)
7741                 RETURN(rc);
7742
7743         for (i = 0; i < lo->ldo_mirror_count; i++) {
7744                 bool ost_avail = true;
7745                 int index = (i + seq) % lo->ldo_mirror_count;
7746
7747                 if (lo->ldo_mirrors[index].lme_stale) {
7748                         CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
7749                                PFID(lod_object_fid(lo)), index);
7750                         continue;
7751                 }
7752
7753                 /* 2nd pick is for the primary mirror containing unavail OST */
7754                 if (lo->ldo_mirrors[index].lme_prefer && second_pick < 0)
7755                         second_pick = index;
7756
7757                 /* 3rd pick is for non-primary mirror containing unavail OST */
7758                 if (second_pick < 0 && third_pick < 0)
7759                         third_pick = index;
7760
7761                 /**
7762                  * we found a non-primary 1st pick, we'd like to find a
7763                  * potential pirmary mirror.
7764                  */
7765                 if (picked >= 0 && !lo->ldo_mirrors[index].lme_prefer)
7766                         continue;
7767
7768                 /* check the availability of OSTs */
7769                 lod_foreach_mirror_comp(lod_comp, lo, index) {
7770                         if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
7771                                 continue;
7772
7773                         for (j = 0; j < lod_comp->llc_stripe_count; j++) {
7774                                 struct dt_object *dt = lod_comp->llc_stripe[j];
7775
7776                                 rc = lod_check_ost_avail(env, lo, dt, index);
7777                                 if (rc < 0)
7778                                         RETURN(rc);
7779
7780                                 ost_avail = !!rc;
7781                                 if (!ost_avail)
7782                                         break;
7783                         } /* for all dt object in one component */
7784                         if (!ost_avail)
7785                                 break;
7786                 } /* for all components in a mirror */
7787
7788                 /**
7789                  * the OSTs where allocated objects locates in the components
7790                  * of the mirror are available.
7791                  */
7792                 if (!ost_avail)
7793                         continue;
7794
7795                 /* this mirror has all OSTs available */
7796                 picked = index;
7797
7798                 /**
7799                  * primary with all OSTs are available, this is the perfect
7800                  * 1st pick.
7801                  */
7802                 if (lo->ldo_mirrors[index].lme_prefer)
7803                         break;
7804         } /* for all mirrors */
7805
7806         /* failed to pick a sound mirror, lower our expectation */
7807         if (picked < 0)
7808                 picked = second_pick;
7809         if (picked < 0)
7810                 picked = third_pick;
7811         if (picked < 0)
7812                 RETURN(-ENODATA);
7813
7814         RETURN(picked);
7815 }
7816
7817 static int lod_prepare_resync_mirror(const struct lu_env *env,
7818                                      struct lod_object *lo,
7819                                      __u16 mirror_id)
7820 {
7821         struct lod_thread_info *info = lod_env_info(env);
7822         struct lod_layout_component *lod_comp;
7823         bool neg = !!(MIRROR_ID_NEG & mirror_id);
7824         int i;
7825
7826         mirror_id &= ~MIRROR_ID_NEG;
7827
7828         for (i = 0; i < lo->ldo_mirror_count; i++) {
7829                 if ((!neg && lo->ldo_mirrors[i].lme_id != mirror_id) ||
7830                     (neg && lo->ldo_mirrors[i].lme_id == mirror_id))
7831                         continue;
7832
7833                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7834                         if (lod_comp_inited(lod_comp))
7835                                 continue;
7836
7837                         info->lti_comp_idx[info->lti_count++] =
7838                                 lod_comp_index(lo, lod_comp);
7839                 }
7840         }
7841
7842         return 0;
7843 }
7844
7845 /**
7846  * figure out the components should be instantiated for resync.
7847  */
7848 static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
7849                               struct lu_extent *extent)
7850 {
7851         struct lod_thread_info *info = lod_env_info(env);
7852         struct lod_layout_component *lod_comp;
7853         unsigned int need_sync = 0;
7854         int i;
7855
7856         CDEBUG(D_LAYOUT,
7857                DFID": instantiate all stale components in "DEXT"\n",
7858                PFID(lod_object_fid(lo)), PEXT(extent));
7859
7860         /**
7861          * instantiate all components within this extent, even non-stale
7862          * components.
7863          */
7864         for (i = 0; i < lo->ldo_mirror_count; i++) {
7865                 if (!lo->ldo_mirrors[i].lme_stale)
7866                         continue;
7867
7868                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7869                         if (!lu_extent_is_overlapped(extent,
7870                                                 &lod_comp->llc_extent))
7871                                 break;
7872
7873                         need_sync++;
7874
7875                         if (lod_comp_inited(lod_comp))
7876                                 continue;
7877
7878                         CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
7879                                i, lod_comp_index(lo, lod_comp));
7880                         info->lti_comp_idx[info->lti_count++] =
7881                                         lod_comp_index(lo, lod_comp);
7882                 }
7883         }
7884
7885         return need_sync ? 0 : -EALREADY;
7886 }
7887
7888 static int lod_declare_update_rdonly(const struct lu_env *env,
7889                 struct lod_object *lo, struct md_layout_change *mlc,
7890                 struct thandle *th)
7891 {
7892         struct lod_thread_info *info = lod_env_info(env);
7893         struct lu_attr *layout_attr = &info->lti_layout_attr;
7894         struct lod_layout_component *lod_comp;
7895         struct lu_extent extent = { 0 };
7896         int rc;
7897         ENTRY;
7898
7899         LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
7900         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
7901                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
7902         LASSERT(lo->ldo_mirror_count > 0);
7903
7904         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
7905                 struct layout_intent *layout = mlc->mlc_intent;
7906                 int write = layout->li_opc == LAYOUT_INTENT_WRITE;
7907                 int picked;
7908
7909                 extent = layout->li_extent;
7910                 CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
7911                        PFID(lod_object_fid(lo)), PEXT(&extent));
7912
7913                 picked = lod_primary_pick(env, lo, &extent);
7914                 if (picked < 0)
7915                         RETURN(picked);
7916
7917                 CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
7918                        PFID(lod_object_fid(lo)),
7919                        lo->ldo_mirrors[picked].lme_id);
7920
7921                 /* Update extents of primary before staling */
7922                 rc = lod_declare_update_extents(env, lo, &extent, th, picked,
7923                                                 write);
7924                 if (rc < 0)
7925                         GOTO(out, rc);
7926
7927                 if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
7928                         /**
7929                          * trunc transfers [0, size) in the intent extent, we'd
7930                          * stale components overlapping [size, eof).
7931                          */
7932                         extent.e_start = extent.e_end;
7933                         extent.e_end = OBD_OBJECT_EOF;
7934                 }
7935
7936                 /* stale overlapping components from other mirrors */
7937                 rc = lod_stale_components(env, lo, picked, &extent, th);
7938                 if (rc < 0)
7939                         GOTO(out, rc);
7940
7941                 /* restore truncate intent extent */
7942                 if (layout->li_opc == LAYOUT_INTENT_TRUNC)
7943                         extent.e_end = extent.e_start;
7944
7945                 /* instantiate components for the picked mirror, start from 0 */
7946                 extent.e_start = 0;
7947
7948                 lod_foreach_mirror_comp(lod_comp, lo, picked) {
7949                         if (!lu_extent_is_overlapped(&extent,
7950                                                      &lod_comp->llc_extent))
7951                                 break;
7952
7953                         if (!lod_is_instantiation_needed(lod_comp))
7954                                 continue;
7955
7956                         info->lti_comp_idx[info->lti_count++] =
7957                                                 lod_comp_index(lo, lod_comp);
7958                 }
7959
7960                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
7961         } else { /* MD_LAYOUT_RESYNC */
7962                 int i;
7963
7964                 /**
7965                  * could contain multiple non-stale mirrors, so we need to
7966                  * prep uninited all components assuming any non-stale mirror
7967                  * could be picked as the primary mirror.
7968                  */
7969                 if (mlc->mlc_mirror_id == 0) {
7970                         /* normal resync */
7971                         for (i = 0; i < lo->ldo_mirror_count; i++) {
7972                                 if (lo->ldo_mirrors[i].lme_stale)
7973                                         continue;
7974
7975                                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7976                                         if (!lod_comp_inited(lod_comp))
7977                                                 break;
7978
7979                                         if (extent.e_end <
7980                                                 lod_comp->llc_extent.e_end)
7981                                                 extent.e_end =
7982                                                      lod_comp->llc_extent.e_end;
7983                                 }
7984                         }
7985                         rc = lod_prepare_resync(env, lo, &extent);
7986                         if (rc)
7987                                 GOTO(out, rc);
7988                 } else {
7989                         /* mirror write, try to init its all components */
7990                         rc = lod_prepare_resync_mirror(env, lo,
7991                                                        mlc->mlc_mirror_id);
7992                         if (rc)
7993                                 GOTO(out, rc);
7994                 }
7995
7996                 /* change the file state to SYNC_PENDING */
7997                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
7998         }
7999
8000         /* Reset the layout version once it's becoming too large.
8001          * This way it can make sure that the layout version is
8002          * monotonously increased in this writing era. */
8003         lod_obj_inc_layout_gen(lo);
8004         if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
8005                 __u32 layout_version;
8006
8007                 get_random_bytes(&layout_version, sizeof(layout_version));
8008                 lo->ldo_layout_gen = layout_version & 0xffff;
8009         }
8010
8011         rc = lod_declare_instantiate_components(env, lo, th, 0);
8012         if (rc)
8013                 GOTO(out, rc);
8014
8015         layout_attr->la_valid = LA_LAYOUT_VERSION;
8016         layout_attr->la_layout_version = 0; /* set current version */
8017         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8018                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8019         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8020         if (rc)
8021                 GOTO(out, rc);
8022
8023 out:
8024         if (rc)
8025                 lod_striping_free(env, lo);
8026         RETURN(rc);
8027 }
8028
8029 static int lod_declare_update_write_pending(const struct lu_env *env,
8030                 struct lod_object *lo, struct md_layout_change *mlc,
8031                 struct thandle *th)
8032 {
8033         struct lod_thread_info *info = lod_env_info(env);
8034         struct lu_attr *layout_attr = &info->lti_layout_attr;
8035         struct lod_layout_component *lod_comp;
8036         struct lu_extent extent = { 0 };
8037         int primary = -1;
8038         int i;
8039         int rc;
8040         ENTRY;
8041
8042         LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
8043         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
8044                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
8045
8046         /* look for the first preferred mirror */
8047         for (i = 0; i < lo->ldo_mirror_count; i++) {
8048                 if (lo->ldo_mirrors[i].lme_stale)
8049                         continue;
8050                 if (lo->ldo_mirrors[i].lme_prefer == 0)
8051                         continue;
8052
8053                 primary = i;
8054                 break;
8055         }
8056         if (primary < 0) {
8057                 /* no primary, use any in-sync */
8058                 for (i = 0; i < lo->ldo_mirror_count; i++) {
8059                         if (lo->ldo_mirrors[i].lme_stale)
8060                                 continue;
8061                         primary = i;
8062                         break;
8063                 }
8064                 if (primary < 0) {
8065                         CERROR(DFID ": doesn't have a primary mirror\n",
8066                                PFID(lod_object_fid(lo)));
8067                         GOTO(out, rc = -ENODATA);
8068                 }
8069         }
8070
8071         CDEBUG(D_LAYOUT, DFID": found primary %u\n",
8072                PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
8073
8074         LASSERT(!lo->ldo_mirrors[primary].lme_stale);
8075
8076         /* for LAYOUT_WRITE opc, it has to do the following operations:
8077          * 1. stale overlapping componets from stale mirrors;
8078          * 2. instantiate components of the primary mirror;
8079          * 3. transfter layout version to all objects of the primary;
8080          *
8081          * for LAYOUT_RESYNC opc, it will do:
8082          * 1. instantiate components of all stale mirrors;
8083          * 2. transfer layout version to all objects to close write era. */
8084
8085         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8086                 struct layout_intent *layout = mlc->mlc_intent;
8087                 int write = layout->li_opc == LAYOUT_INTENT_WRITE;
8088
8089                 LASSERT(mlc->mlc_intent != NULL);
8090
8091                 extent = mlc->mlc_intent->li_extent;
8092
8093                 CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
8094                        PFID(lod_object_fid(lo)), PEXT(&extent));
8095
8096                 /* 1. Update extents of primary before staling */
8097                 rc = lod_declare_update_extents(env, lo, &extent, th, primary,
8098                                                 write);
8099                 if (rc < 0)
8100                         GOTO(out, rc);
8101
8102                 if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
8103                         /**
8104                          * trunc transfers [0, size) in the intent extent, we'd
8105                          * stale components overlapping [size, eof).
8106                          */
8107                         extent.e_start = extent.e_end;
8108                         extent.e_end = OBD_OBJECT_EOF;
8109                 }
8110
8111                 /* 2. stale overlapping components */
8112                 rc = lod_stale_components(env, lo, primary, &extent, th);
8113                 if (rc < 0)
8114                         GOTO(out, rc);
8115
8116                 /* 3. find the components which need instantiating.
8117                  * instantiate [0, mlc->mlc_intent->e_end) */
8118
8119                 /* restore truncate intent extent */
8120                 if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC)
8121                         extent.e_end = extent.e_start;
8122                 extent.e_start = 0;
8123
8124                 lod_foreach_mirror_comp(lod_comp, lo, primary) {
8125                         if (!lu_extent_is_overlapped(&extent,
8126                                                      &lod_comp->llc_extent))
8127                                 break;
8128
8129                         if (!lod_is_instantiation_needed(lod_comp))
8130                                 continue;
8131
8132                         CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
8133                                primary, lod_comp_index(lo, lod_comp));
8134                         info->lti_comp_idx[info->lti_count++] =
8135                                                 lod_comp_index(lo, lod_comp);
8136                 }
8137         } else { /* MD_LAYOUT_RESYNC */
8138                 if (mlc->mlc_mirror_id == 0) {
8139                         /* normal resync */
8140                         lod_foreach_mirror_comp(lod_comp, lo, primary) {
8141                                 if (!lod_comp_inited(lod_comp))
8142                                         break;
8143
8144                                 extent.e_end = lod_comp->llc_extent.e_end;
8145                         }
8146
8147                         rc = lod_prepare_resync(env, lo, &extent);
8148                         if (rc)
8149                                 GOTO(out, rc);
8150                 } else {
8151                         /* mirror write, try to init its all components */
8152                         rc = lod_prepare_resync_mirror(env, lo,
8153                                                        mlc->mlc_mirror_id);
8154                         if (rc)
8155                                 GOTO(out, rc);
8156                 }
8157
8158                 /* change the file state to SYNC_PENDING */
8159                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
8160         }
8161
8162         rc = lod_declare_instantiate_components(env, lo, th, 0);
8163         if (rc)
8164                 GOTO(out, rc);
8165
8166         /* 3. transfer layout version to OST objects.
8167          * transfer new layout version to OST objects so that stale writes
8168          * can be denied. It also ends an era of writing by setting
8169          * LU_LAYOUT_RESYNC. Normal client can never use this bit to
8170          * send write RPC; only resync RPCs could do it. */
8171         layout_attr->la_valid = LA_LAYOUT_VERSION;
8172         layout_attr->la_layout_version = 0; /* set current version */
8173         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8174                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8175         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8176         if (rc)
8177                 GOTO(out, rc);
8178
8179         lod_obj_inc_layout_gen(lo);
8180 out:
8181         if (rc)
8182                 lod_striping_free(env, lo);
8183         RETURN(rc);
8184 }
8185
8186 static int lod_declare_update_sync_pending(const struct lu_env *env,
8187                 struct lod_object *lo, struct md_layout_change *mlc,
8188                 struct thandle *th)
8189 {
8190         struct lod_thread_info  *info = lod_env_info(env);
8191         struct lu_attr *layout_attr = &info->lti_layout_attr;
8192         unsigned sync_components = 0;
8193         unsigned resync_components = 0;
8194         int i;
8195         int rc;
8196         ENTRY;
8197
8198         LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
8199         LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
8200                 mlc->mlc_opc == MD_LAYOUT_WRITE);
8201
8202         CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
8203                PFID(lod_object_fid(lo)), mlc->mlc_opc);
8204
8205         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8206                 CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
8207                        PFID(lod_object_fid(lo)));
8208
8209                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
8210                 return lod_declare_update_write_pending(env, lo, mlc, th);
8211         }
8212
8213         /* MD_LAYOUT_RESYNC_DONE */
8214
8215         for (i = 0; i < lo->ldo_comp_cnt; i++) {
8216                 struct lod_layout_component *lod_comp;
8217                 int j;
8218
8219                 lod_comp = &lo->ldo_comp_entries[i];
8220
8221                 if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
8222                         sync_components++;
8223                         continue;
8224                 }
8225
8226                 for (j = 0; j < mlc->mlc_resync_count; j++) {
8227                         if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
8228                                 continue;
8229
8230                         mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
8231                         lod_comp->llc_flags &= ~LCME_FL_STALE;
8232                         resync_components++;
8233                         break;
8234                 }
8235         }
8236
8237         /* valid check */
8238         for (i = 0; i < mlc->mlc_resync_count; i++) {
8239                 if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
8240                         continue;
8241
8242                 CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zd) not exist "
8243                        "or already synced\n", PFID(lod_object_fid(lo)),
8244                        mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
8245                 GOTO(out, rc = -EINVAL);
8246         }
8247
8248         if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
8249                 CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
8250                        PFID(lod_object_fid(lo)));
8251
8252                 /* tend to return an error code here to prevent
8253                  * the MDT from setting SoM attribute */
8254                 GOTO(out, rc = -EINVAL);
8255         }
8256
8257         CDEBUG(D_LAYOUT, DFID": synced %u resynced %u/%zu components\n",
8258                PFID(lod_object_fid(lo)),
8259                sync_components, resync_components, mlc->mlc_resync_count);
8260
8261         lo->ldo_flr_state = LCM_FL_RDONLY;
8262         lod_obj_inc_layout_gen(lo);
8263
8264         layout_attr->la_valid = LA_LAYOUT_VERSION;
8265         layout_attr->la_layout_version = 0; /* set current version */
8266         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8267         if (rc)
8268                 GOTO(out, rc);
8269
8270         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
8271         rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8272                                        &info->lti_buf, XATTR_NAME_LOV, 0, th);
8273         EXIT;
8274
8275 out:
8276         if (rc)
8277                 lod_striping_free(env, lo);
8278         RETURN(rc);
8279 }
8280
/*
 * Signature shared by the directory layout-change handlers stored in
 * dir_mlc_declare_ops[] and dir_mlc_ops[].
 */
typedef int (*mlc_handler)(const struct lu_env *env,
                           struct dt_object *dt,
                           const struct md_layout_change *mlc,
                           struct thandle *th);
8284
8285 /**
8286  * Attach stripes after target's for migrating directory. NB, we
8287  * only need to declare this, the actual work is done inside
8288  * lod_xattr_set_lmv().
8289  *
8290  * \param[in] env       execution environment
8291  * \param[in] dt        target object
8292  * \param[in] mlc       layout change data
8293  * \param[in] th        transaction handle
8294  *
8295  * \retval              0 on success
8296  * \retval              negative if failed
8297  */
8298 static int lod_dir_declare_layout_attach(const struct lu_env *env,
8299                                          struct dt_object *dt,
8300                                          const struct md_layout_change *mlc,
8301                                          struct thandle *th)
8302 {
8303         struct lod_thread_info *info = lod_env_info(env);
8304         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
8305         struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
8306         struct lod_object *lo = lod_dt_obj(dt);
8307         struct dt_object *next = dt_object_child(dt);
8308         struct dt_object_format *dof = &info->lti_format;
8309         struct lmv_mds_md_v1 *lmv = mlc->mlc_buf.lb_buf;
8310         struct dt_object **stripes;
8311         __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
8312         struct lu_fid *fid = &info->lti_fid;
8313         struct lod_tgt_desc *tgt;
8314         struct dt_object *dto;
8315         struct dt_device *tgt_dt;
8316         int type = LU_SEQ_RANGE_ANY;
8317         struct dt_insert_rec *rec = &info->lti_dt_rec;
8318         char *stripe_name = info->lti_key;
8319         struct lu_name *sname;
8320         struct linkea_data ldata = { NULL };
8321         struct lu_buf linkea_buf;
8322         __u32 idx;
8323         int i;
8324         int rc;
8325
8326         ENTRY;
8327
8328         if (!lmv_is_sane(lmv))
8329                 RETURN(-EINVAL);
8330
8331         if (!dt_try_as_dir(env, dt))
8332                 return -ENOTDIR;
8333
8334         dof->dof_type = DFT_DIR;
8335
8336         OBD_ALLOC_PTR_ARRAY(stripes, (lo->ldo_dir_stripe_count + stripe_count));
8337         if (!stripes)
8338                 RETURN(-ENOMEM);
8339
8340         for (i = 0; i < lo->ldo_dir_stripe_count; i++)
8341                 stripes[i] = lo->ldo_stripe[i];
8342
8343         rec->rec_type = S_IFDIR;
8344
8345         for (i = 0; i < stripe_count; i++) {
8346                 fid_le_to_cpu(fid,
8347                         &lmv->lmv_stripe_fids[i]);
8348                 if (!fid_is_sane(fid))
8349                         continue;
8350
8351                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
8352                 if (rc)
8353                         GOTO(out, rc);
8354
8355                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
8356                         tgt_dt = lod->lod_child;
8357                 } else {
8358                         tgt = LTD_TGT(ltd, idx);
8359                         if (tgt == NULL)
8360                                 GOTO(out, rc = -ESTALE);
8361                         tgt_dt = tgt->ltd_tgt;
8362                 }
8363
8364                 dto = dt_locate_at(env, tgt_dt, fid,
8365                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
8366                                   NULL);
8367                 if (IS_ERR(dto))
8368                         GOTO(out, rc = PTR_ERR(dto));
8369
8370                 stripes[i + lo->ldo_dir_stripe_count] = dto;
8371
8372                 if (!dt_try_as_dir(env, dto))
8373                         GOTO(out, rc = -ENOTDIR);
8374
8375                 rc = lod_sub_declare_ref_add(env, dto, th);
8376                 if (rc)
8377                         GOTO(out, rc);
8378
8379                 rec->rec_fid = lu_object_fid(&dto->do_lu);
8380                 rc = lod_sub_declare_insert(env, dto,
8381                                             (const struct dt_rec *)rec,
8382                                             (const struct dt_key *)dot, th);
8383                 if (rc)
8384                         GOTO(out, rc);
8385
8386                 rc = lod_sub_declare_insert(env, dto,
8387                                             (const struct dt_rec *)rec,
8388                                             (const struct dt_key *)dotdot, th);
8389                 if (rc)
8390                         GOTO(out, rc);
8391
8392                 rc = lod_sub_declare_xattr_set(env, dto, &mlc->mlc_buf,
8393                                                 XATTR_NAME_LMV, 0, th);
8394                 if (rc)
8395                         GOTO(out, rc);
8396
8397                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
8398                          PFID(lu_object_fid(&dto->do_lu)),
8399                          i + lo->ldo_dir_stripe_count);
8400
8401                 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
8402                 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
8403                                       sname, lu_object_fid(&dt->do_lu));
8404                 if (rc)
8405                         GOTO(out, rc);
8406
8407                 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
8408                 linkea_buf.lb_len = ldata.ld_leh->leh_len;
8409                 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
8410                                                XATTR_NAME_LINK, 0, th);
8411                 if (rc)
8412                         GOTO(out, rc);
8413
8414                 rc = lod_sub_declare_insert(env, next,
8415                                             (const struct dt_rec *)rec,
8416                                             (const struct dt_key *)stripe_name,
8417                                             th);
8418                 if (rc)
8419                         GOTO(out, rc);
8420
8421                 rc = lod_sub_declare_ref_add(env, next, th);
8422                 if (rc)
8423                         GOTO(out, rc);
8424         }
8425
8426         if (lo->ldo_stripe)
8427                 OBD_FREE_PTR_ARRAY(lo->ldo_stripe,
8428                                    lo->ldo_dir_stripes_allocated);
8429         lo->ldo_stripe = stripes;
8430         lo->ldo_is_foreign = 0;
8431         lo->ldo_dir_migrate_offset = lo->ldo_dir_stripe_count;
8432         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_hash_type);
8433         lo->ldo_dir_stripe_count += stripe_count;
8434         lo->ldo_dir_stripes_allocated += stripe_count;
8435
8436         /* plain directory split creates target as a plain directory, while
8437          * after source attached as the first stripe, it becomes a striped
8438          * directory, set correct do_index_ops, otherwise it can't be unlinked.
8439          */
8440         dt->do_index_ops = &lod_striped_index_ops;
8441
8442         RETURN(0);
8443 out:
8444         i = lo->ldo_dir_stripe_count;
8445         while (i < lo->ldo_dir_stripe_count + stripe_count && stripes[i])
8446                 dt_object_put(env, stripes[i++]);
8447
8448         OBD_FREE_PTR_ARRAY(stripes, stripe_count + lo->ldo_dir_stripe_count);
8449         return rc;
8450 }
8451
8452 static int lod_dir_declare_layout_detach(const struct lu_env *env,
8453                                          struct dt_object *dt,
8454                                          const struct md_layout_change *unused,
8455                                          struct thandle *th)
8456 {
8457         struct lod_thread_info *info = lod_env_info(env);
8458         struct lod_object *lo = lod_dt_obj(dt);
8459         struct dt_object *next = dt_object_child(dt);
8460         char *stripe_name = info->lti_key;
8461         struct dt_object *dto;
8462         int i;
8463         int rc = 0;
8464
8465         if (!dt_try_as_dir(env, dt))
8466                 return -ENOTDIR;
8467
8468         if (!lo->ldo_dir_stripe_count)
8469                 return lod_sub_declare_delete(env, next,
8470                                         (const struct dt_key *)dotdot, th);
8471
8472         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8473                 dto = lo->ldo_stripe[i];
8474                 if (!dto)
8475                         continue;
8476
8477                 if (!dt_try_as_dir(env, dto))
8478                         return -ENOTDIR;
8479
8480                 rc = lod_sub_declare_delete(env, dto,
8481                                         (const struct dt_key *)dotdot, th);
8482                 if (rc)
8483                         return rc;
8484
8485                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8486                          PFID(lu_object_fid(&dto->do_lu)), i);
8487
8488                 rc = lod_sub_declare_delete(env, next,
8489                                         (const struct dt_key *)stripe_name, th);
8490                 if (rc)
8491                         return rc;
8492
8493                 rc = lod_sub_declare_ref_del(env, next, th);
8494                 if (rc)
8495                         return rc;
8496         }
8497
8498         return 0;
8499 }
8500
8501 static int dt_dir_is_empty(const struct lu_env *env,
8502                            struct dt_object *obj)
8503 {
8504         struct dt_it *it;
8505         const struct dt_it_ops *iops;
8506         int rc;
8507
8508         ENTRY;
8509
8510         if (!dt_try_as_dir(env, obj))
8511                 RETURN(-ENOTDIR);
8512
8513         iops = &obj->do_index_ops->dio_it;
8514         it = iops->init(env, obj, LUDA_64BITHASH);
8515         if (IS_ERR(it))
8516                 RETURN(PTR_ERR(it));
8517
8518         rc = iops->get(env, it, (const struct dt_key *)"");
8519         if (rc > 0) {
8520                 int i;
8521
8522                 for (rc = 0, i = 0; rc == 0 && i < 3; ++i)
8523                         rc = iops->next(env, it);
8524                 if (!rc)
8525                         rc = -ENOTEMPTY;
8526                 else if (rc == 1)
8527                         rc = 0;
8528         } else if (!rc) {
8529                 /* Huh? Index contains no zero key? */
8530                 rc = -EIO;
8531         }
8532
8533         iops->put(env, it);
8534         iops->fini(env, it);
8535
8536         RETURN(rc);
8537 }
8538
8539 static int lod_dir_declare_layout_shrink(const struct lu_env *env,
8540                                          struct dt_object *dt,
8541                                          const struct md_layout_change *mlc,
8542                                          struct thandle *th)
8543 {
8544         struct lod_thread_info *info = lod_env_info(env);
8545         struct lod_object *lo = lod_dt_obj(dt);
8546         struct dt_object *next = dt_object_child(dt);
8547         struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
8548         char *stripe_name = info->lti_key;
8549         struct lu_buf *lmv_buf = &info->lti_buf;
8550         __u32 final_stripe_count;
8551         struct dt_object *dto;
8552         int i;
8553         int rc;
8554
8555         LASSERT(lmu);
8556
8557         if (!dt_try_as_dir(env, dt))
8558                 return -ENOTDIR;
8559
8560         /* shouldn't be called on plain directory */
8561         LASSERT(lo->ldo_dir_stripe_count);
8562
8563         lmv_buf->lb_buf = &info->lti_lmv.lmv_md_v1;
8564         lmv_buf->lb_len = sizeof(info->lti_lmv.lmv_md_v1);
8565
8566         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
8567         LASSERT(final_stripe_count &&
8568                 final_stripe_count < lo->ldo_dir_stripe_count);
8569
8570         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8571                 dto = lo->ldo_stripe[i];
8572                 if (!dto)
8573                         continue;
8574
8575                 if (i < final_stripe_count) {
8576                         rc = lod_sub_declare_xattr_set(env, dto, lmv_buf,
8577                                                        XATTR_NAME_LMV,
8578                                                        LU_XATTR_REPLACE, th);
8579                         if (rc)
8580                                 return rc;
8581
8582                         continue;
8583                 }
8584
8585                 rc = dt_dir_is_empty(env, dto);
8586                 if (rc < 0)
8587                         return rc;
8588
8589                 rc = lod_sub_declare_ref_del(env, dto, th);
8590                 if (rc)
8591                         return rc;
8592
8593                 rc = lod_sub_declare_destroy(env, dto, th);
8594                 if (rc)
8595                         return rc;
8596
8597                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8598                          PFID(lu_object_fid(&dto->do_lu)), i);
8599
8600                 rc = lod_sub_declare_delete(env, next,
8601                                         (const struct dt_key *)stripe_name, th);
8602                 if (rc)
8603                         return rc;
8604
8605                 rc = lod_sub_declare_ref_del(env, next, th);
8606                 if (rc)
8607                         return rc;
8608         }
8609
8610         rc = lod_sub_declare_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
8611                                        LU_XATTR_REPLACE, th);
8612         return rc;
8613 }
8614
8615 /**
8616  * Allocate stripes for split directory.
8617  *
8618  * \param[in] env       execution environment
8619  * \param[in] dt        target object
8620  * \param[in] mlc       layout change data
8621  * \param[in] th        transaction handle
8622  *
8623  * \retval              0 on success
8624  * \retval              negative if failed
8625  */
8626 static int lod_dir_declare_layout_split(const struct lu_env *env,
8627                                         struct dt_object *dt,
8628                                         const struct md_layout_change *mlc,
8629                                         struct thandle *th)
8630 {
8631         struct lod_thread_info *info = lod_env_info(env);
8632         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
8633         struct lod_object *lo = lod_dt_obj(dt);
8634         struct dt_object_format *dof = &info->lti_format;
8635         struct lmv_user_md_v1 *lum = mlc->mlc_spec->u.sp_ea.eadata;
8636         struct dt_object **stripes;
8637         u32 stripe_count;
8638         u32 saved_count;
8639         int i;
8640         int rc;
8641
8642         ENTRY;
8643
8644         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC);
8645         LASSERT(le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT);
8646
8647         saved_count = lo->ldo_dir_stripes_allocated;
8648         stripe_count = le32_to_cpu(lum->lum_stripe_count);
8649         if (stripe_count <= saved_count)
8650                 RETURN(-EINVAL);
8651
8652         dof->dof_type = DFT_DIR;
8653
8654         OBD_ALLOC(stripes, sizeof(*stripes) * stripe_count);
8655         if (!stripes)
8656                 RETURN(-ENOMEM);
8657
8658         for (i = 0; i < lo->ldo_dir_stripes_allocated; i++)
8659                 stripes[i] = lo->ldo_stripe[i];
8660
8661         lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
8662         rc = lod_mdt_alloc_qos(env, lo, stripes, saved_count, stripe_count);
8663         if (rc == -EAGAIN)
8664                 rc = lod_mdt_alloc_rr(env, lo, stripes, saved_count,
8665                                       stripe_count);
8666         if (rc < 0) {
8667                 OBD_FREE(stripes, sizeof(*stripes) * stripe_count);
8668                 RETURN(rc);
8669         }
8670
8671         LASSERT(rc > saved_count);
8672         OBD_FREE(lo->ldo_stripe,
8673                  sizeof(*stripes) * lo->ldo_dir_stripes_allocated);
8674         lo->ldo_stripe = stripes;
8675         lo->ldo_is_foreign = 0;
8676         lo->ldo_dir_striped = 1;
8677         lo->ldo_dir_stripe_count = rc;
8678         lo->ldo_dir_stripes_allocated = stripe_count;
8679         lo->ldo_dir_split_hash = lo->ldo_dir_hash_type;
8680         lo->ldo_dir_hash_type = le32_to_cpu(lum->lum_hash_type);
8681         if (!lmv_is_known_hash_type(lo->ldo_dir_hash_type))
8682                 lo->ldo_dir_hash_type =
8683                         lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
8684         lo->ldo_dir_hash_type |= LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MIGRATION;
8685         lo->ldo_dir_split_offset = saved_count;
8686         lo->ldo_dir_layout_version++;
8687         lo->ldo_dir_stripe_loaded = 1;
8688
8689         rc = lod_dir_declare_create_stripes(env, dt, mlc->mlc_attr, dof, th);
8690         if (rc)
8691                 lod_striping_free(env, lo);
8692
8693         RETURN(rc);
8694 }
8695
8696 /*
8697  * detach all stripes from dir master object, NB, stripes are not destroyed, but
8698  * deleted from it's parent namespace, this function is called in two places:
8699  * 1. mdd_migrate_mdt() detach stripes from source, and attach them to
8700  *    target.
8701  * 2. mdd_dir_layout_update() detach stripe before turning 1-stripe directory to
8702  *    a plain directory.
8703  *
8704  * \param[in] env       execution environment
8705  * \param[in] dt        target object
8706  * \param[in] mlc       layout change data
8707  * \param[in] th        transaction handle
8708  *
8709  * \retval              0 on success
8710  * \retval              negative if failed
8711  */
8712 static int lod_dir_layout_detach(const struct lu_env *env,
8713                                  struct dt_object *dt,
8714                                  const struct md_layout_change *mlc,
8715                                  struct thandle *th)
8716 {
8717         struct lod_thread_info *info = lod_env_info(env);
8718         struct lod_object *lo = lod_dt_obj(dt);
8719         struct dt_object *next = dt_object_child(dt);
8720         char *stripe_name = info->lti_key;
8721         struct dt_object *dto;
8722         int i;
8723         int rc = 0;
8724
8725         ENTRY;
8726
8727         if (!lo->ldo_dir_stripe_count) {
8728                 /* plain directory delete .. */
8729                 rc = lod_sub_delete(env, next,
8730                                     (const struct dt_key *)dotdot, th);
8731                 RETURN(rc);
8732         }
8733
8734         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8735                 dto = lo->ldo_stripe[i];
8736                 if (!dto)
8737                         continue;
8738
8739                 rc = lod_sub_delete(env, dto,
8740                                     (const struct dt_key *)dotdot, th);
8741                 if (rc)
8742                         break;
8743
8744                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8745                          PFID(lu_object_fid(&dto->do_lu)), i);
8746
8747                 rc = lod_sub_delete(env, next,
8748                                     (const struct dt_key *)stripe_name, th);
8749                 if (rc)
8750                         break;
8751
8752                 rc = lod_sub_ref_del(env, next, th);
8753                 if (rc)
8754                         break;
8755         }
8756
8757         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8758                 dto = lo->ldo_stripe[i];
8759                 if (dto)
8760                         dt_object_put(env, dto);
8761         }
8762         OBD_FREE_PTR_ARRAY(lo->ldo_stripe, lo->ldo_dir_stripes_allocated);
8763         lo->ldo_stripe = NULL;
8764         lo->ldo_dir_stripes_allocated = 0;
8765         lo->ldo_dir_stripe_count = 0;
8766         dt->do_index_ops = &lod_index_ops;
8767
8768         RETURN(rc);
8769 }
8770
8771 static int lod_dir_layout_shrink(const struct lu_env *env,
8772                                  struct dt_object *dt,
8773                                  const struct md_layout_change *mlc,
8774                                  struct thandle *th)
8775 {
8776         struct lod_thread_info *info = lod_env_info(env);
8777         struct lod_object *lo = lod_dt_obj(dt);
8778         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
8779         struct dt_object *next = dt_object_child(dt);
8780         struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
8781         __u32 final_stripe_count;
8782         char *stripe_name = info->lti_key;
8783         struct dt_object *dto;
8784         struct lu_buf *lmv_buf = &info->lti_buf;
8785         struct lmv_mds_md_v1 *lmv = &info->lti_lmv.lmv_md_v1;
8786         u32 mdtidx;
8787         int type = LU_SEQ_RANGE_ANY;
8788         int i;
8789         int rc;
8790
8791         ENTRY;
8792
8793         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
8794
8795         lmv_buf->lb_buf = lmv;
8796         lmv_buf->lb_len = sizeof(*lmv);
8797         lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
8798         lmv->lmv_stripe_count = cpu_to_le32(final_stripe_count);
8799         lmv->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type) &
8800                              cpu_to_le32(LMV_HASH_TYPE_MASK |
8801                                          LMV_HASH_FLAG_FIXED);
8802         lmv->lmv_layout_version =
8803                         cpu_to_le32(lo->ldo_dir_layout_version + 1);
8804         lmv->lmv_migrate_offset = 0;
8805         lmv->lmv_migrate_hash = 0;
8806
8807         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8808                 dto = lo->ldo_stripe[i];
8809                 if (!dto)
8810                         continue;
8811
8812                 if (i < final_stripe_count) {
8813                         rc = lod_fld_lookup(env, lod,
8814                                             lu_object_fid(&dto->do_lu),
8815                                             &mdtidx, &type);
8816                         if (rc)
8817                                 RETURN(rc);
8818
8819                         lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
8820                         rc = lod_sub_xattr_set(env, dto, lmv_buf,
8821                                                XATTR_NAME_LMV,
8822                                                LU_XATTR_REPLACE, th);
8823                         if (rc)
8824                                 RETURN(rc);
8825
8826                         continue;
8827                 }
8828
8829                 dt_write_lock(env, dto, DT_TGT_CHILD);
8830                 rc = lod_sub_ref_del(env, dto, th);
8831                 dt_write_unlock(env, dto);
8832                 if (rc)
8833                         RETURN(rc);
8834
8835                 rc = lod_sub_destroy(env, dto, th);
8836                 if (rc)
8837                         RETURN(rc);
8838
8839                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8840                          PFID(lu_object_fid(&dto->do_lu)), i);
8841
8842                 rc = lod_sub_delete(env, next,
8843                                     (const struct dt_key *)stripe_name, th);
8844                 if (rc)
8845                         RETURN(rc);
8846
8847                 rc = lod_sub_ref_del(env, next, th);
8848                 if (rc)
8849                         RETURN(rc);
8850         }
8851
8852         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &mdtidx,
8853                             &type);
8854         if (rc)
8855                 RETURN(rc);
8856
8857         lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_V1);
8858         lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
8859         rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
8860                                LU_XATTR_REPLACE, th);
8861         if (rc)
8862                 RETURN(rc);
8863
8864         for (i = final_stripe_count; i < lo->ldo_dir_stripe_count; i++) {
8865                 dto = lo->ldo_stripe[i];
8866                 if (dto)
8867                         dt_object_put(env, dto);
8868         }
8869         lo->ldo_dir_stripe_count = final_stripe_count;
8870
8871         RETURN(rc);
8872 }
8873
8874 static mlc_handler dir_mlc_declare_ops[MD_LAYOUT_MAX] = {
8875         [MD_LAYOUT_ATTACH] = lod_dir_declare_layout_attach,
8876         [MD_LAYOUT_DETACH] = lod_dir_declare_layout_detach,
8877         [MD_LAYOUT_SHRINK] = lod_dir_declare_layout_shrink,
8878         [MD_LAYOUT_SPLIT]  = lod_dir_declare_layout_split,
8879 };
8880
8881 static mlc_handler dir_mlc_ops[MD_LAYOUT_MAX] = {
8882         [MD_LAYOUT_DETACH] = lod_dir_layout_detach,
8883         [MD_LAYOUT_SHRINK] = lod_dir_layout_shrink,
8884 };
8885
8886 static int lod_declare_layout_change(const struct lu_env *env,
8887                 struct dt_object *dt, struct md_layout_change *mlc,
8888                 struct thandle *th)
8889 {
8890         struct lod_thread_info  *info = lod_env_info(env);
8891         struct lod_object *lo = lod_dt_obj(dt);
8892         int rc;
8893
8894         ENTRY;
8895
8896         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
8897                 LASSERT(dir_mlc_declare_ops[mlc->mlc_opc]);
8898                 rc = dir_mlc_declare_ops[mlc->mlc_opc](env, dt, mlc, th);
8899                 RETURN(rc);
8900         }
8901
8902         if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
8903             dt_object_remote(dt_object_child(dt)))
8904                 RETURN(-EINVAL);
8905
8906         rc = lod_striping_load(env, lo);
8907         if (rc)
8908                 GOTO(out, rc);
8909
8910         LASSERT(lo->ldo_comp_cnt > 0);
8911
8912         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
8913         if (rc)
8914                 GOTO(out, rc);
8915
8916         switch (lo->ldo_flr_state) {
8917         case LCM_FL_NONE:
8918                 rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
8919                                               &mlc->mlc_buf, th);
8920                 break;
8921         case LCM_FL_RDONLY:
8922                 rc = lod_declare_update_rdonly(env, lo, mlc, th);
8923                 break;
8924         case LCM_FL_WRITE_PENDING:
8925                 rc = lod_declare_update_write_pending(env, lo, mlc, th);
8926                 break;
8927         case LCM_FL_SYNC_PENDING:
8928                 rc = lod_declare_update_sync_pending(env, lo, mlc, th);
8929                 break;
8930         default:
8931                 rc = -ENOTSUPP;
8932                 break;
8933         }
8934 out:
8935         RETURN(rc);
8936 }
8937
8938 /**
8939  * Instantiate layout component objects which covers the intent write offset.
8940  */
8941 static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
8942                              struct md_layout_change *mlc, struct thandle *th)
8943 {
8944         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
8945         struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
8946         struct lod_object *lo = lod_dt_obj(dt);
8947         int rc;
8948
8949         ENTRY;
8950
8951         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
8952                 LASSERT(dir_mlc_ops[mlc->mlc_opc]);
8953                 rc = dir_mlc_ops[mlc->mlc_opc](env, dt, mlc, th);
8954                 RETURN(rc);
8955         }
8956
8957         rc = lod_striped_create(env, dt, attr, NULL, th);
8958         if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
8959                 layout_attr->la_layout_version |= lo->ldo_layout_gen;
8960                 rc = lod_attr_set(env, dt, layout_attr, th);
8961         }
8962
8963         RETURN(rc);
8964 }
8965
8966 const struct dt_object_operations lod_obj_ops = {
8967         .do_read_lock           = lod_read_lock,
8968         .do_write_lock          = lod_write_lock,
8969         .do_read_unlock         = lod_read_unlock,
8970         .do_write_unlock        = lod_write_unlock,
8971         .do_write_locked        = lod_write_locked,
8972         .do_attr_get            = lod_attr_get,
8973         .do_declare_attr_set    = lod_declare_attr_set,
8974         .do_attr_set            = lod_attr_set,
8975         .do_xattr_get           = lod_xattr_get,
8976         .do_declare_xattr_set   = lod_declare_xattr_set,
8977         .do_xattr_set           = lod_xattr_set,
8978         .do_declare_xattr_del   = lod_declare_xattr_del,
8979         .do_xattr_del           = lod_xattr_del,
8980         .do_xattr_list          = lod_xattr_list,
8981         .do_ah_init             = lod_ah_init,
8982         .do_declare_create      = lod_declare_create,
8983         .do_create              = lod_create,
8984         .do_declare_destroy     = lod_declare_destroy,
8985         .do_destroy             = lod_destroy,
8986         .do_index_try           = lod_index_try,
8987         .do_declare_ref_add     = lod_declare_ref_add,
8988         .do_ref_add             = lod_ref_add,
8989         .do_declare_ref_del     = lod_declare_ref_del,
8990         .do_ref_del             = lod_ref_del,
8991         .do_object_sync         = lod_object_sync,
8992         .do_object_lock         = lod_object_lock,
8993         .do_object_unlock       = lod_object_unlock,
8994         .do_invalidate          = lod_invalidate,
8995         .do_declare_layout_change = lod_declare_layout_change,
8996         .do_layout_change       = lod_layout_change,
8997 };
8998
8999 /**
9000  * Implementation of dt_body_operations::dbo_read.
9001  *
9002  * \see dt_body_operations::dbo_read() in the API description for details.
9003  */
9004 static ssize_t lod_read(const struct lu_env *env, struct dt_object *dt,
9005                         struct lu_buf *buf, loff_t *pos)
9006 {
9007         struct dt_object *next = dt_object_child(dt);
9008
9009         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9010                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9011         return next->do_body_ops->dbo_read(env, next, buf, pos);
9012 }
9013
9014 /**
9015  * Implementation of dt_body_operations::dbo_declare_write.
9016  *
9017  * \see dt_body_operations::dbo_declare_write() in the API description
9018  * for details.
9019  */
9020 static ssize_t lod_declare_write(const struct lu_env *env,
9021                                  struct dt_object *dt,
9022                                  const struct lu_buf *buf, loff_t pos,
9023                                  struct thandle *th)
9024 {
9025         return lod_sub_declare_write(env, dt_object_child(dt), buf, pos, th);
9026 }
9027
9028 /**
9029  * Implementation of dt_body_operations::dbo_write.
9030  *
9031  * \see dt_body_operations::dbo_write() in the API description for details.
9032  */
9033 static ssize_t lod_write(const struct lu_env *env, struct dt_object *dt,
9034                          const struct lu_buf *buf, loff_t *pos,
9035                          struct thandle *th)
9036 {
9037         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9038                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9039         return lod_sub_write(env, dt_object_child(dt), buf, pos, th);
9040 }
9041
9042 static int lod_declare_punch(const struct lu_env *env, struct dt_object *dt,
9043                              __u64 start, __u64 end, struct thandle *th)
9044 {
9045         if (dt_object_remote(dt))
9046                 return -ENOTSUPP;
9047
9048         return lod_sub_declare_punch(env, dt_object_child(dt), start, end, th);
9049 }
9050
9051 static int lod_punch(const struct lu_env *env, struct dt_object *dt,
9052                      __u64 start, __u64 end, struct thandle *th)
9053 {
9054         if (dt_object_remote(dt))
9055                 return -ENOTSUPP;
9056
9057         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
9058         return lod_sub_punch(env, dt_object_child(dt), start, end, th);
9059 }
9060
9061 /*
9062  * different type of files use the same body_ops because object may be created
9063  * in OUT, where there is no chance to set correct body_ops for each type, so
9064  * body_ops themselves will check file type inside, see lod_read/write/punch for
9065  * details.
9066  */
9067 static const struct dt_body_operations lod_body_ops = {
9068         .dbo_read               = lod_read,
9069         .dbo_declare_write      = lod_declare_write,
9070         .dbo_write              = lod_write,
9071         .dbo_declare_punch      = lod_declare_punch,
9072         .dbo_punch              = lod_punch,
9073 };
9074
9075 /**
9076  * Implementation of lu_object_operations::loo_object_init.
9077  *
9078  * The function determines the type and the index of the target device using
9079  * sequence of the object's FID. Then passes control down to the
9080  * corresponding device:
9081  *  OSD for the local objects, OSP for remote
9082  *
9083  * \see lu_object_operations::loo_object_init() in the API description
9084  * for details.
9085  */
9086 static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
9087                            const struct lu_object_conf *conf)
9088 {
9089         struct lod_device       *lod    = lu2lod_dev(lo->lo_dev);
9090         struct lu_device        *cdev   = NULL;
9091         struct lu_object        *cobj;
9092         struct lod_tgt_descs    *ltd    = NULL;
9093         struct lod_tgt_desc     *tgt;
9094         u32                      idx    = 0;
9095         int                      type   = LU_SEQ_RANGE_ANY;
9096         int                      rc;
9097         ENTRY;
9098
9099         rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
9100         if (rc != 0)
9101                 RETURN(rc);
9102
9103         if (type == LU_SEQ_RANGE_MDT &&
9104             idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
9105                 cdev = &lod->lod_child->dd_lu_dev;
9106         } else if (type == LU_SEQ_RANGE_MDT) {
9107                 ltd = &lod->lod_mdt_descs;
9108                 lod_getref(ltd);
9109         } else if (type == LU_SEQ_RANGE_OST) {
9110                 ltd = &lod->lod_ost_descs;
9111                 lod_getref(ltd);
9112         } else {
9113                 LBUG();
9114         }
9115
9116         if (ltd != NULL) {
9117                 if (ltd->ltd_tgts_size > idx &&
9118                     test_bit(idx, ltd->ltd_tgt_bitmap)) {
9119                         tgt = LTD_TGT(ltd, idx);
9120
9121                         LASSERT(tgt != NULL);
9122                         LASSERT(tgt->ltd_tgt != NULL);
9123
9124                         cdev = &(tgt->ltd_tgt->dd_lu_dev);
9125                 }
9126                 lod_putref(lod, ltd);
9127         }
9128
9129         if (unlikely(cdev == NULL))
9130                 RETURN(-ENOENT);
9131
9132         cobj = cdev->ld_ops->ldo_object_alloc(env, lo->lo_header, cdev);
9133         if (unlikely(cobj == NULL))
9134                 RETURN(-ENOMEM);
9135
9136         lu2lod_obj(lo)->ldo_obj.do_body_ops = &lod_body_ops;
9137
9138         lu_object_add(lo, cobj);
9139
9140         RETURN(0);
9141 }
9142
9143 /**
9144  *
9145  * Release resources associated with striping.
9146  *
9147  * If the object is striped (regular or directory), then release
9148  * the stripe objects references and free the ldo_stripe array.
9149  *
9150  * \param[in] env       execution environment
9151  * \param[in] lo        object
9152  */
9153 void lod_striping_free_nolock(const struct lu_env *env, struct lod_object *lo)
9154 {
9155         struct lod_layout_component *lod_comp;
9156         __u32 obj_attr = lo->ldo_obj.do_lu.lo_header->loh_attr;
9157         int i, j;
9158
9159         if (unlikely(lo->ldo_is_foreign)) {
9160                 if (S_ISREG(obj_attr)) {
9161                         lod_free_foreign_lov(lo);
9162                         lo->ldo_comp_cached = 0;
9163                 } else if (S_ISDIR(obj_attr)) {
9164                         lod_free_foreign_lmv(lo);
9165                         lo->ldo_dir_stripe_loaded = 0;
9166                 }
9167         } else if (lo->ldo_stripe != NULL) {
9168                 LASSERT(lo->ldo_comp_entries == NULL);
9169                 LASSERT(lo->ldo_dir_stripes_allocated > 0);
9170
9171                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9172                         if (lo->ldo_stripe[i])
9173                                 dt_object_put(env, lo->ldo_stripe[i]);
9174                 }
9175
9176                 j = sizeof(struct dt_object *) * lo->ldo_dir_stripes_allocated;
9177                 OBD_FREE(lo->ldo_stripe, j);
9178                 lo->ldo_stripe = NULL;
9179                 lo->ldo_dir_stripes_allocated = 0;
9180                 lo->ldo_dir_stripe_loaded = 0;
9181                 lo->ldo_dir_stripe_count = 0;
9182                 lo->ldo_obj.do_index_ops = NULL;
9183         } else if (lo->ldo_comp_entries != NULL) {
9184                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
9185                         /* free lod_layout_component::llc_stripe array */
9186                         lod_comp = &lo->ldo_comp_entries[i];
9187
9188                         if (lod_comp->llc_stripe == NULL)
9189                                 continue;
9190                         LASSERT(lod_comp->llc_stripes_allocated != 0);
9191                         for (j = 0; j < lod_comp->llc_stripes_allocated; j++) {
9192                                 if (lod_comp->llc_stripe[j] != NULL)
9193                                         lu_object_put(env,
9194                                                &lod_comp->llc_stripe[j]->do_lu);
9195                         }
9196                         OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
9197                                            lod_comp->llc_stripes_allocated);
9198                         lod_comp->llc_stripe = NULL;
9199                         OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
9200                                            lod_comp->llc_stripes_allocated);
9201                         lod_comp->llc_ost_indices = NULL;
9202                         lod_comp->llc_stripes_allocated = 0;
9203                 }
9204                 lod_free_comp_entries(lo);
9205                 lo->ldo_comp_cached = 0;
9206         }
9207 }
9208
9209 void lod_striping_free(const struct lu_env *env, struct lod_object *lo)
9210 {
9211         mutex_lock(&lo->ldo_layout_mutex);
9212         lod_striping_free_nolock(env, lo);
9213         mutex_unlock(&lo->ldo_layout_mutex);
9214 }
9215
9216 /**
9217  * Implementation of lu_object_operations::loo_object_free.
9218  *
9219  * \see lu_object_operations::loo_object_free() in the API description
9220  * for details.
9221  */
9222 static void lod_object_free(const struct lu_env *env, struct lu_object *o)
9223 {
9224         struct lod_object *lo = lu2lod_obj(o);
9225
9226         /* release all underlying object pinned */
9227         lod_striping_free(env, lo);
9228         lu_object_fini(o);
9229         /* lo doesn't contain a lu_object_header, so we don't need call_rcu */
9230         OBD_SLAB_FREE_PTR(lo, lod_object_kmem);
9231 }
9232
/**
 * Implementation of lu_object_operations::loo_object_release.
 *
 * Currently a no-op.
 *
 * \see lu_object_operations::loo_object_release() in the API description
 * for details.
 */
static void lod_object_release(const struct lu_env *env, struct lu_object *o)
{
	/*
	 * XXX: should everything be released here in case object creation
	 * failed earlier?
	 */
}
9244
9245 /**
9246  * Implementation of lu_object_operations::loo_object_print.
9247  *
9248  * \see lu_object_operations::loo_object_print() in the API description
9249  * for details.
9250  */
9251 static int lod_object_print(const struct lu_env *env, void *cookie,
9252                             lu_printer_t p, const struct lu_object *l)
9253 {
9254         struct lod_object *o = lu2lod_obj((struct lu_object *) l);
9255
9256         return (*p)(env, cookie, LUSTRE_LOD_NAME"-object@%p", o);
9257 }
9258
9259 const struct lu_object_operations lod_lu_obj_ops = {
9260         .loo_object_init        = lod_object_init,
9261         .loo_object_free        = lod_object_free,
9262         .loo_object_release     = lod_object_release,
9263         .loo_object_print       = lod_object_print,
9264 };