Whamcloud - gitweb
LU-17848 osd: purge key_rec() from dt API
[fs/lustre-release.git] / lustre / lod / lod_object.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright  2009 Sun Microsystems, Inc. All rights reserved
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * lustre/lod/lod_object.c
30  *
31  * This file contains implementations of methods for the OSD API
32  * for the Logical Object Device (LOD) layer, which provides a virtual
33  * local OSD object interface to the MDD layer, and abstracts the
34  * addressing of local (OSD) and remote (OSP) objects. The API is
35  * described in the file lustre/include/dt_object.h and in
36  * Documentation/osd-api.txt.
37  *
38  * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_MDS
42
43 #include <linux/random.h>
44
45 #include <obd.h>
46 #include <obd_class.h>
47 #include <obd_support.h>
48
49 #include <lustre_fid.h>
50 #include <lustre_linkea.h>
51 #include <lustre_lmv.h>
52 #include <uapi/linux/lustre/lustre_param.h>
53 #include <lustre_swab.h>
54 #include <uapi/linux/lustre/lustre_ver.h>
55 #include <lprocfs_status.h>
56 #include <md_object.h>
57
58 #include "lod_internal.h"
59
/* Names of the "." and ".." directory entries; lod_striped_lookup()
 * compares lookup keys against them. */
static const char dot[] = ".";
static const char dotdot[] = "..";
62
63 /**
64  * Implementation of dt_index_operations::dio_lookup
65  *
66  * Used with regular (non-striped) objects.
67  *
68  * \see dt_index_operations::dio_lookup() in the API description for details.
69  */
70 static int lod_lookup(const struct lu_env *env, struct dt_object *dt,
71                       struct dt_rec *rec, const struct dt_key *key)
72 {
73         struct dt_object *next = dt_object_child(dt);
74         return next->do_index_ops->dio_lookup(env, next, rec, key);
75 }
76
/**
 * Implementation of dt_index_operations::dio_declare_insert.
 *
 * Used with regular (non-striped) objects: forwards the declaration to
 * lod_sub_declare_insert() on the child object.
 *
 * \param[in] env       execution environment
 * \param[in] dt        LOD object the record will be inserted into
 * \param[in] rec       record to be inserted
 * \param[in] key       key of the record
 * \param[in] th        transaction handle
 *
 * \retval              whatever lod_sub_declare_insert() returns
 *
 * \see dt_index_operations::dio_declare_insert() in the API description
 * for details.
 */
static int lod_declare_insert(const struct lu_env *env, struct dt_object *dt,
                              const struct dt_rec *rec,
                              const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_insert(env, child, rec, key, th);
}
91
/**
 * Implementation of dt_index_operations::dio_insert.
 *
 * Used with regular (non-striped) objects: forwards the insertion to
 * lod_sub_insert() on the child object.
 *
 * \param[in] env       execution environment
 * \param[in] dt        LOD object the record is inserted into
 * \param[in] rec       record to insert
 * \param[in] key       key of the record
 * \param[in] th        transaction handle
 *
 * \retval              whatever lod_sub_insert() returns
 *
 * \see dt_index_operations::dio_insert() in the API description for details.
 */
static int lod_insert(const struct lu_env *env, struct dt_object *dt,
                      const struct dt_rec *rec, const struct dt_key *key,
                      struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_insert(env, child, rec, key, th);
}
105
/**
 * Implementation of dt_index_operations::dio_declare_delete.
 *
 * Used with regular (non-striped) objects: forwards the declaration to
 * lod_sub_declare_delete() on the child object.
 *
 * \param[in] env       execution environment
 * \param[in] dt        LOD object the key will be deleted from
 * \param[in] key       key to be deleted
 * \param[in] th        transaction handle
 *
 * \retval              whatever lod_sub_declare_delete() returns
 *
 * \see dt_index_operations::dio_declare_delete() in the API description
 * for details.
 */
static int lod_declare_delete(const struct lu_env *env, struct dt_object *dt,
                              const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_delete(env, child, key, th);
}
119
/**
 * Implementation of dt_index_operations::dio_delete.
 *
 * Used with regular (non-striped) objects: forwards the deletion to
 * lod_sub_delete() on the child object.
 *
 * \param[in] env       execution environment
 * \param[in] dt        LOD object the key is deleted from
 * \param[in] key       key to delete
 * \param[in] th        transaction handle
 *
 * \retval              whatever lod_sub_delete() returns
 *
 * \see dt_index_operations::dio_delete() in the API description for details.
 */
static int lod_delete(const struct lu_env *env, struct dt_object *dt,
                      const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_delete(env, child, key, th);
}
132
133 /**
134  * Implementation of dt_it_ops::init.
135  *
136  * Used with regular (non-striped) objects.
137  *
138  * \see dt_it_ops::init() in the API description for details.
139  */
140 static struct dt_it *lod_it_init(const struct lu_env *env,
141                                  struct dt_object *dt, __u32 attr)
142 {
143         struct dt_object        *next = dt_object_child(dt);
144         struct lod_it           *it = &lod_env_info(env)->lti_it;
145         struct dt_it            *it_next;
146
147         it_next = next->do_index_ops->dio_it.init(env, next, attr);
148         if (IS_ERR(it_next))
149                 return it_next;
150
151         /* currently we do not use more than one iterator per thread
152          * so we store it in thread info. if at some point we need
153          * more active iterators in a single thread, we can allocate
154          * additional ones */
155         LASSERT(it->lit_obj == NULL);
156
157         it->lit_it = it_next;
158         it->lit_obj = next;
159
160         return (struct dt_it *)it;
161 }
162
/* Assert that a non-striped LOD iterator is in use: both the object and
 * the underlying iterator must be set. \a env is accepted but not used. */
#define LOD_CHECK_IT(env, iter)                                 \
do {                                                            \
        LASSERT((iter)->lit_obj != NULL);                       \
        LASSERT((iter)->lit_it != NULL);                        \
} while (0)
168
169 /**
170  * Implementation of dt_index_operations::dio_it.fini.
171  *
172  * Used with regular (non-striped) objects.
173  *
174  * \see dt_index_operations::dio_it.fini() in the API description for details.
175  */
176 static void lod_it_fini(const struct lu_env *env, struct dt_it *di)
177 {
178         struct lod_it *it = (struct lod_it *)di;
179
180         LOD_CHECK_IT(env, it);
181         it->lit_obj->do_index_ops->dio_it.fini(env, it->lit_it);
182
183         /* the iterator not in use any more */
184         it->lit_obj = NULL;
185         it->lit_it = NULL;
186 }
187
188 /**
189  * Implementation of dt_it_ops::get.
190  *
191  * Used with regular (non-striped) objects.
192  *
193  * \see dt_it_ops::get() in the API description for details.
194  */
195 static int lod_it_get(const struct lu_env *env, struct dt_it *di,
196                       const struct dt_key *key)
197 {
198         const struct lod_it *it = (const struct lod_it *)di;
199
200         LOD_CHECK_IT(env, it);
201         return it->lit_obj->do_index_ops->dio_it.get(env, it->lit_it, key);
202 }
203
204 /**
205  * Implementation of dt_it_ops::put.
206  *
207  * Used with regular (non-striped) objects.
208  *
209  * \see dt_it_ops::put() in the API description for details.
210  */
211 static void lod_it_put(const struct lu_env *env, struct dt_it *di)
212 {
213         struct lod_it *it = (struct lod_it *)di;
214
215         LOD_CHECK_IT(env, it);
216         return it->lit_obj->do_index_ops->dio_it.put(env, it->lit_it);
217 }
218
219 /**
220  * Implementation of dt_it_ops::next.
221  *
222  * Used with regular (non-striped) objects
223  *
224  * \see dt_it_ops::next() in the API description for details.
225  */
226 static int lod_it_next(const struct lu_env *env, struct dt_it *di)
227 {
228         struct lod_it *it = (struct lod_it *)di;
229
230         LOD_CHECK_IT(env, it);
231         return it->lit_obj->do_index_ops->dio_it.next(env, it->lit_it);
232 }
233
234 /**
235  * Implementation of dt_it_ops::key.
236  *
237  * Used with regular (non-striped) objects.
238  *
239  * \see dt_it_ops::key() in the API description for details.
240  */
241 static struct dt_key *lod_it_key(const struct lu_env *env,
242                                  const struct dt_it *di)
243 {
244         const struct lod_it *it = (const struct lod_it *)di;
245
246         LOD_CHECK_IT(env, it);
247         return it->lit_obj->do_index_ops->dio_it.key(env, it->lit_it);
248 }
249
250 /**
251  * Implementation of dt_it_ops::key_size.
252  *
253  * Used with regular (non-striped) objects.
254  *
255  * \see dt_it_ops::key_size() in the API description for details.
256  */
257 static int lod_it_key_size(const struct lu_env *env, const struct dt_it *di)
258 {
259         struct lod_it *it = (struct lod_it *)di;
260
261         LOD_CHECK_IT(env, it);
262         return it->lit_obj->do_index_ops->dio_it.key_size(env, it->lit_it);
263 }
264
265 /**
266  * Implementation of dt_it_ops::rec.
267  *
268  * Used with regular (non-striped) objects.
269  *
270  * \see dt_it_ops::rec() in the API description for details.
271  */
272 static int lod_it_rec(const struct lu_env *env, const struct dt_it *di,
273                       struct dt_rec *rec, __u32 attr)
274 {
275         const struct lod_it *it = (const struct lod_it *)di;
276
277         LOD_CHECK_IT(env, it);
278         return it->lit_obj->do_index_ops->dio_it.rec(env, it->lit_it, rec,
279                                                      attr);
280 }
281
282 /**
283  * Implementation of dt_it_ops::rec_size.
284  *
285  * Used with regular (non-striped) objects.
286  *
287  * \see dt_it_ops::rec_size() in the API description for details.
288  */
289 static int lod_it_rec_size(const struct lu_env *env, const struct dt_it *di,
290                            __u32 attr)
291 {
292         const struct lod_it *it = (const struct lod_it *)di;
293
294         LOD_CHECK_IT(env, it);
295         return it->lit_obj->do_index_ops->dio_it.rec_size(env, it->lit_it,
296                                                           attr);
297 }
298
299 /**
300  * Implementation of dt_it_ops::store.
301  *
302  * Used with regular (non-striped) objects.
303  *
304  * \see dt_it_ops::store() in the API description for details.
305  */
306 static __u64 lod_it_store(const struct lu_env *env, const struct dt_it *di)
307 {
308         const struct lod_it *it = (const struct lod_it *)di;
309
310         LOD_CHECK_IT(env, it);
311         return it->lit_obj->do_index_ops->dio_it.store(env, it->lit_it);
312 }
313
314 /**
315  * Implementation of dt_it_ops::load.
316  *
317  * Used with regular (non-striped) objects.
318  *
319  * \see dt_it_ops::load() in the API description for details.
320  */
321 static int lod_it_load(const struct lu_env *env, const struct dt_it *di,
322                        __u64 hash)
323 {
324         const struct lod_it *it = (const struct lod_it *)di;
325
326         LOD_CHECK_IT(env, it);
327         return it->lit_obj->do_index_ops->dio_it.load(env, it->lit_it, hash);
328 }
329
/* Index operations built from the non-striped methods: every entry is one
 * of the lod_*() / lod_it_*() wrappers that forward to the child object. */
static const struct dt_index_operations lod_index_ops = {
        .dio_lookup             = lod_lookup,
        .dio_declare_insert     = lod_declare_insert,
        .dio_insert             = lod_insert,
        .dio_declare_delete     = lod_declare_delete,
        .dio_delete             = lod_delete,
        .dio_it = {
                .init           = lod_it_init,
                .fini           = lod_it_fini,
                .get            = lod_it_get,
                .put            = lod_it_put,
                .next           = lod_it_next,
                .key            = lod_it_key,
                .key_size       = lod_it_key_size,
                .rec            = lod_it_rec,
                .rec_size       = lod_it_rec_size,
                .store          = lod_it_store,
                .load           = lod_it_load,
        }
};
350
351 /**
352  * Implementation of dt_index_operations::dio_lookup
353  *
354  * Used with striped directories.
355  *
356  * \see dt_index_operations::dio_lookup() in the API description for details.
357  */
358 static int lod_striped_lookup(const struct lu_env *env, struct dt_object *dt,
359                       struct dt_rec *rec, const struct dt_key *key)
360 {
361         struct lod_object *lo = lod_dt_obj(dt);
362         struct dt_object *next;
363         const char *name = (const char *)key;
364
365         LASSERT(lo->ldo_dir_stripe_count > 0);
366
367         if (strcmp(name, dot) == 0) {
368                 struct lu_fid *fid = (struct lu_fid *)rec;
369
370                 *fid = *lod_object_fid(lo);
371                 return 1;
372         }
373
374         if (strcmp(name, dotdot) == 0) {
375                 next = dt_object_child(dt);
376         } else {
377                 int index;
378
379                 index = __lmv_name_to_stripe_index(lo->ldo_dir_hash_type,
380                                                    lo->ldo_dir_stripe_count,
381                                                    lo->ldo_dir_migrate_hash,
382                                                    lo->ldo_dir_migrate_offset,
383                                                    name, strlen(name), true);
384                 if (index < 0)
385                         return index;
386
387                 next = lo->ldo_stripe[index];
388                 if (!next || !dt_object_exists(next))
389                         return -ENODEV;
390         }
391
392         return next->do_index_ops->dio_lookup(env, next, rec, key);
393 }
394
395 /**
396  * Implementation of dt_it_ops::init.
397  *
398  * Used with striped objects. Internally just initializes the iterator
399  * on the first stripe.
400  *
401  * \see dt_it_ops::init() in the API description for details.
402  */
403 static struct dt_it *lod_striped_it_init(const struct lu_env *env,
404                                          struct dt_object *dt, __u32 attr)
405 {
406         struct lod_object *lo = lod_dt_obj(dt);
407         struct dt_object *next;
408         struct lod_it *it = &lod_env_info(env)->lti_it;
409         struct dt_it *it_next;
410         __u16 index = 0;
411
412         LASSERT(lo->ldo_dir_stripe_count > 0);
413
414         do {
415                 next = lo->ldo_stripe[index];
416                 if (next && dt_object_exists(next))
417                         break;
418         } while (++index < lo->ldo_dir_stripe_count);
419
420         /* no valid stripe */
421         if (!next || !dt_object_exists(next))
422                 return ERR_PTR(-ENODEV);
423
424         LASSERT(next->do_index_ops != NULL);
425
426         it_next = next->do_index_ops->dio_it.init(env, next, attr);
427         if (IS_ERR(it_next))
428                 return it_next;
429
430         /* currently we do not use more than one iterator per thread
431          * so we store it in thread info. if at some point we need
432          * more active iterators in a single thread, we can allocate
433          * additional ones */
434         LASSERT(it->lit_obj == NULL);
435
436         it->lit_stripe_index = index;
437         it->lit_attr = attr;
438         it->lit_it = it_next;
439         it->lit_obj = dt;
440
441         return (struct dt_it *)it;
442 }
443
/* Assert that a striped LOD iterator is in use and consistent with its
 * directory: object and underlying iterator are set, the directory has at
 * least one stripe, and the current stripe index is within range.
 * \a env is accepted but not used. */
#define LOD_CHECK_STRIPED_IT(env, iter, dir)                            \
do {                                                                    \
        LASSERT((iter)->lit_obj != NULL);                               \
        LASSERT((iter)->lit_it != NULL);                                \
        LASSERT((dir)->ldo_dir_stripe_count > 0);                       \
        LASSERT((iter)->lit_stripe_index < (dir)->ldo_dir_stripe_count);\
} while (0)
451
452 /**
453  * Implementation of dt_it_ops::fini.
454  *
455  * Used with striped objects.
456  *
457  * \see dt_it_ops::fini() in the API description for details.
458  */
459 static void lod_striped_it_fini(const struct lu_env *env, struct dt_it *di)
460 {
461         struct lod_it           *it = (struct lod_it *)di;
462         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
463         struct dt_object        *next;
464
465         /* If lit_it == NULL, then it means the sub_it has been finished,
466          * which only happens in failure cases, see lod_striped_it_next() */
467         if (it->lit_it != NULL) {
468                 LOD_CHECK_STRIPED_IT(env, it, lo);
469
470                 next = lo->ldo_stripe[it->lit_stripe_index];
471                 if (next) {
472                         LASSERT(next->do_index_ops != NULL);
473                         next->do_index_ops->dio_it.fini(env, it->lit_it);
474                 }
475         }
476
477         /* the iterator not in use any more */
478         it->lit_obj = NULL;
479         it->lit_it = NULL;
480         it->lit_stripe_index = 0;
481 }
482
483 /**
484  * Implementation of dt_it_ops::get.
485  *
486  * Right now it's not used widely, only to reset the iterator to the
487  * initial position. It should be possible to implement a full version
488  * which chooses a correct stripe to be able to position with any key.
489  *
490  * \see dt_it_ops::get() in the API description for details.
491  */
492 static int lod_striped_it_get(const struct lu_env *env, struct dt_it *di,
493                               const struct dt_key *key)
494 {
495         const struct lod_it *it = (const struct lod_it *)di;
496         struct lod_object *lo = lod_dt_obj(it->lit_obj);
497         struct dt_object *next;
498
499         LOD_CHECK_STRIPED_IT(env, it, lo);
500
501         next = lo->ldo_stripe[it->lit_stripe_index];
502         LASSERT(next != NULL);
503         LASSERT(dt_object_exists(next));
504         LASSERT(next->do_index_ops != NULL);
505
506         return next->do_index_ops->dio_it.get(env, it->lit_it, key);
507 }
508
509 /**
510  * Implementation of dt_it_ops::put.
511  *
512  * Used with striped objects.
513  *
514  * \see dt_it_ops::put() in the API description for details.
515  */
516 static void lod_striped_it_put(const struct lu_env *env, struct dt_it *di)
517 {
518         struct lod_it *it = (struct lod_it *)di;
519         struct lod_object *lo = lod_dt_obj(it->lit_obj);
520         struct dt_object *next;
521
522         /*
523          * If lit_it == NULL, then it means the sub_it has been finished,
524          * which only happens in failure cases, see lod_striped_it_next()
525          */
526         if (!it->lit_it)
527                 return;
528
529         LOD_CHECK_STRIPED_IT(env, it, lo);
530
531         next = lo->ldo_stripe[it->lit_stripe_index];
532         LASSERT(next != NULL);
533         LASSERT(next->do_index_ops != NULL);
534
535         return next->do_index_ops->dio_it.put(env, it->lit_it);
536 }
537
538 /**
539  * Implementation of dt_it_ops::next.
540  *
541  * Used with striped objects. When the end of the current stripe is
542  * reached, the method takes the next stripe's iterator.
543  *
544  * \see dt_it_ops::next() in the API description for details.
545  */
546 static int lod_striped_it_next(const struct lu_env *env, struct dt_it *di)
547 {
548         struct lod_it *it = (struct lod_it *)di;
549         struct lod_object *lo = lod_dt_obj(it->lit_obj);
550         struct dt_object *next;
551         struct dt_it *it_next;
552         __u32 index;
553         int rc;
554
555         ENTRY;
556
557         LOD_CHECK_STRIPED_IT(env, it, lo);
558
559         next = lo->ldo_stripe[it->lit_stripe_index];
560         LASSERT(next != NULL);
561         LASSERT(dt_object_exists(next));
562         LASSERT(next->do_index_ops != NULL);
563 again:
564         rc = next->do_index_ops->dio_it.next(env, it->lit_it);
565         if (rc < 0)
566                 RETURN(rc);
567
568         if (rc == 0 && it->lit_stripe_index == 0)
569                 RETURN(rc);
570
571         if (rc == 0 && it->lit_stripe_index > 0) {
572                 struct lu_dirent *ent;
573
574                 ent = (struct lu_dirent *)lod_env_info(env)->lti_key;
575
576                 rc = next->do_index_ops->dio_it.rec(env, it->lit_it,
577                                                     (struct dt_rec *)ent,
578                                                     it->lit_attr);
579                 if (rc != 0)
580                         RETURN(rc);
581
582                 /* skip . and .. for slave stripe */
583                 if ((strncmp(ent->lde_name, ".",
584                              le16_to_cpu(ent->lde_namelen)) == 0 &&
585                      le16_to_cpu(ent->lde_namelen) == 1) ||
586                     (strncmp(ent->lde_name, "..",
587                              le16_to_cpu(ent->lde_namelen)) == 0 &&
588                      le16_to_cpu(ent->lde_namelen) == 2))
589                         goto again;
590
591                 RETURN(rc);
592         }
593
594         next->do_index_ops->dio_it.put(env, it->lit_it);
595         next->do_index_ops->dio_it.fini(env, it->lit_it);
596         it->lit_it = NULL;
597
598         /* go to next stripe */
599         index = it->lit_stripe_index;
600         while (++index < lo->ldo_dir_stripe_count) {
601                 next = lo->ldo_stripe[index];
602                 if (!next)
603                         continue;
604
605                 if (!dt_object_exists(next))
606                         continue;
607
608                 rc = next->do_ops->do_index_try(env, next,
609                                                 &dt_directory_features);
610                 if (rc != 0)
611                         RETURN(rc);
612
613                 LASSERT(next->do_index_ops != NULL);
614
615                 it_next = next->do_index_ops->dio_it.init(env, next,
616                                                           it->lit_attr);
617                 if (IS_ERR(it_next))
618                         RETURN(PTR_ERR(it_next));
619
620                 rc = next->do_index_ops->dio_it.get(env, it_next,
621                                                     (const struct dt_key *)"");
622                 if (rc <= 0)
623                         RETURN(rc == 0 ? -EIO : rc);
624
625                 it->lit_it = it_next;
626                 it->lit_stripe_index = index;
627                 goto again;
628
629         }
630
631         RETURN(1);
632 }
633
634 /**
635  * Implementation of dt_it_ops::key.
636  *
637  * Used with striped objects.
638  *
639  * \see dt_it_ops::key() in the API description for details.
640  */
641 static struct dt_key *lod_striped_it_key(const struct lu_env *env,
642                                          const struct dt_it *di)
643 {
644         const struct lod_it     *it = (const struct lod_it *)di;
645         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
646         struct dt_object        *next;
647
648         LOD_CHECK_STRIPED_IT(env, it, lo);
649
650         next = lo->ldo_stripe[it->lit_stripe_index];
651         LASSERT(next != NULL);
652         LASSERT(next->do_index_ops != NULL);
653
654         return next->do_index_ops->dio_it.key(env, it->lit_it);
655 }
656
657 /**
658  * Implementation of dt_it_ops::key_size.
659  *
660  * Used with striped objects.
661  *
662  * \see dt_it_ops::size() in the API description for details.
663  */
664 static int lod_striped_it_key_size(const struct lu_env *env,
665                                    const struct dt_it *di)
666 {
667         struct lod_it           *it = (struct lod_it *)di;
668         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
669         struct dt_object        *next;
670
671         LOD_CHECK_STRIPED_IT(env, it, lo);
672
673         next = lo->ldo_stripe[it->lit_stripe_index];
674         LASSERT(next != NULL);
675         LASSERT(next->do_index_ops != NULL);
676
677         return next->do_index_ops->dio_it.key_size(env, it->lit_it);
678 }
679
680 /**
681  * Implementation of dt_it_ops::rec.
682  *
683  * Used with striped objects.
684  *
685  * \see dt_it_ops::rec() in the API description for details.
686  */
687 static int lod_striped_it_rec(const struct lu_env *env, const struct dt_it *di,
688                               struct dt_rec *rec, __u32 attr)
689 {
690         const struct lod_it     *it = (const struct lod_it *)di;
691         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
692         struct dt_object        *next;
693
694         LOD_CHECK_STRIPED_IT(env, it, lo);
695
696         next = lo->ldo_stripe[it->lit_stripe_index];
697         LASSERT(next != NULL);
698         LASSERT(next->do_index_ops != NULL);
699
700         return next->do_index_ops->dio_it.rec(env, it->lit_it, rec, attr);
701 }
702
703 /**
704  * Implementation of dt_it_ops::rec_size.
705  *
706  * Used with striped objects.
707  *
708  * \see dt_it_ops::rec_size() in the API description for details.
709  */
710 static int lod_striped_it_rec_size(const struct lu_env *env,
711                                    const struct dt_it *di, __u32 attr)
712 {
713         struct lod_it           *it = (struct lod_it *)di;
714         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
715         struct dt_object        *next;
716
717         LOD_CHECK_STRIPED_IT(env, it, lo);
718
719         next = lo->ldo_stripe[it->lit_stripe_index];
720         LASSERT(next != NULL);
721         LASSERT(next->do_index_ops != NULL);
722
723         return next->do_index_ops->dio_it.rec_size(env, it->lit_it, attr);
724 }
725
726 /**
727  * Implementation of dt_it_ops::store.
728  *
729  * Used with striped objects.
730  *
731  * \see dt_it_ops::store() in the API description for details.
732  */
733 static __u64 lod_striped_it_store(const struct lu_env *env,
734                                   const struct dt_it *di)
735 {
736         const struct lod_it     *it = (const struct lod_it *)di;
737         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
738         struct dt_object        *next;
739
740         LOD_CHECK_STRIPED_IT(env, it, lo);
741
742         next = lo->ldo_stripe[it->lit_stripe_index];
743         LASSERT(next != NULL);
744         LASSERT(next->do_index_ops != NULL);
745
746         return next->do_index_ops->dio_it.store(env, it->lit_it);
747 }
748
749 /**
750  * Implementation of dt_it_ops::load.
751  *
752  * Used with striped objects.
753  *
754  * \see dt_it_ops::load() in the API description for details.
755  */
756 static int lod_striped_it_load(const struct lu_env *env,
757                                const struct dt_it *di, __u64 hash)
758 {
759         const struct lod_it     *it = (const struct lod_it *)di;
760         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
761         struct dt_object        *next;
762
763         LOD_CHECK_STRIPED_IT(env, it, lo);
764
765         next = lo->ldo_stripe[it->lit_stripe_index];
766         LASSERT(next != NULL);
767         LASSERT(next->do_index_ops != NULL);
768
769         return next->do_index_ops->dio_it.load(env, it->lit_it, hash);
770 }
771
/* Index operations built for striped directories. Lookup and the iterator
 * use the stripe-aware lod_striped_*() methods; insert and delete (and
 * their declare counterparts) reuse the non-striped lod_*() methods. */
static const struct dt_index_operations lod_striped_index_ops = {
        .dio_lookup             = lod_striped_lookup,
        .dio_declare_insert     = lod_declare_insert,
        .dio_insert             = lod_insert,
        .dio_declare_delete     = lod_declare_delete,
        .dio_delete             = lod_delete,
        .dio_it = {
                .init           = lod_striped_it_init,
                .fini           = lod_striped_it_fini,
                .get            = lod_striped_it_get,
                .put            = lod_striped_it_put,
                .next           = lod_striped_it_next,
                .key            = lod_striped_it_key,
                .key_size       = lod_striped_it_key_size,
                .rec            = lod_striped_it_rec,
                .rec_size       = lod_striped_it_rec_size,
                .store          = lod_striped_it_store,
                .load           = lod_striped_it_load,
        }
};
792
793 /**
794  * Append the FID for each shard of the striped directory after the
795  * given LMV EA header.
796  *
797  * To simplify striped directory and the consistency verification,
798  * we only store the LMV EA header on disk, for both master object
799  * and slave objects. When someone wants to know the whole LMV EA,
800  * such as client readdir(), we can build the entire LMV EA on the
801  * MDT side (in RAM) via iterating the sub-directory entries that
802  * are contained in the master object of the stripe directory.
803  *
804  * For the master object of the striped directory, the valid name
805  * for each shard is composed of the ${shard_FID}:${shard_idx}.
806  *
807  * There may be holes in the LMV EA if some shards' name entries
808  * are corrupted or lost.
809  *
810  * \param[in] env       pointer to the thread context
811  * \param[in] lo        pointer to the master object of the striped directory
812  * \param[in] buf       pointer to the lu_buf which will hold the LMV EA
813  * \param[in] resize    whether re-allocate the buffer if it is not big enough
814  *
815  * \retval              positive size of the LMV EA
816  * \retval              0 for nothing to be loaded
817  * \retval              negative error number on failure
818  */
819 int lod_load_lmv_shards(const struct lu_env *env, struct lod_object *lo,
820                         struct lu_buf *buf, bool resize)
821 {
822         struct lu_dirent        *ent    =
823                         (struct lu_dirent *)lod_env_info(env)->lti_key;
824         struct lod_device       *lod    = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
825         struct dt_object        *obj    = dt_object_child(&lo->ldo_obj);
826         struct lmv_mds_md_v1    *lmv1   = buf->lb_buf;
827         struct dt_it            *it;
828         const struct dt_it_ops  *iops;
829         __u32                    stripes;
830         __u32                    magic  = le32_to_cpu(lmv1->lmv_magic);
831         size_t                   lmv1_size;
832         int                      rc;
833         ENTRY;
834
835         if (magic != LMV_MAGIC_V1)
836                 RETURN(0);
837
838         stripes = le32_to_cpu(lmv1->lmv_stripe_count);
839         if (stripes < 1)
840                 RETURN(0);
841
842         rc = lmv_mds_md_size(stripes, magic);
843         if (rc < 0)
844                 RETURN(rc);
845         lmv1_size = rc;
846         if (buf->lb_len < lmv1_size) {
847                 struct lu_buf tbuf;
848
849                 if (!resize)
850                         RETURN(-ERANGE);
851
852                 tbuf = *buf;
853                 buf->lb_buf = NULL;
854                 buf->lb_len = 0;
855                 lu_buf_alloc(buf, lmv1_size);
856                 lmv1 = buf->lb_buf;
857                 if (lmv1 == NULL)
858                         RETURN(-ENOMEM);
859
860                 memcpy(buf->lb_buf, tbuf.lb_buf, tbuf.lb_len);
861         }
862
863         if (unlikely(!dt_try_as_dir(env, obj, true)))
864                 RETURN(-ENOTDIR);
865
866         memset(&lmv1->lmv_stripe_fids[0], 0, stripes * sizeof(struct lu_fid));
867         iops = &obj->do_index_ops->dio_it;
868         it = iops->init(env, obj, LUDA_64BITHASH);
869         if (IS_ERR(it))
870                 RETURN(PTR_ERR(it));
871
872         rc = iops->load(env, it, 0);
873         if (rc == 0)
874                 rc = iops->next(env, it);
875         else if (rc > 0)
876                 rc = 0;
877
878         while (rc == 0) {
879                 char             name[FID_LEN + 2] = "";
880                 struct lu_fid    fid;
881                 __u32            index;
882                 int              len;
883
884                 rc = iops->rec(env, it, (struct dt_rec *)ent, LUDA_64BITHASH);
885                 if (rc != 0)
886                         break;
887
888                 rc = -EIO;
889
890                 fid_le_to_cpu(&fid, &ent->lde_fid);
891                 ent->lde_namelen = le16_to_cpu(ent->lde_namelen);
892                 if (ent->lde_name[0] == '.') {
893                         if (ent->lde_namelen == 1)
894                                 goto next;
895
896                         if (ent->lde_namelen == 2 && ent->lde_name[1] == '.')
897                                 goto next;
898                 }
899
900                 len = scnprintf(name, sizeof(name),
901                                 DFID":", PFID(&ent->lde_fid));
902                 /* The ent->lde_name is composed of ${FID}:${index} */
903                 if (ent->lde_namelen < len + 1 ||
904                     memcmp(ent->lde_name, name, len) != 0) {
905                         CDEBUG_LIMIT(lod->lod_lmv_failout ? D_ERROR : D_INFO,
906                                      "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
907                                      lod2obd(lod)->obd_name, ent->lde_namelen,
908                                      ent->lde_name, PFID(&fid),
909                                      PFID(lu_object_fid(&obj->do_lu)),
910                                      lod->lod_lmv_failout ? "failout" : "skip");
911
912                         if (lod->lod_lmv_failout)
913                                 break;
914
915                         goto next;
916                 }
917
918                 index = 0;
919                 do {
920                         if (ent->lde_name[len] < '0' ||
921                             ent->lde_name[len] > '9') {
922                                 CDEBUG_LIMIT(lod->lod_lmv_failout ?
923                                              D_ERROR : D_INFO,
924                                              "%s: invalid shard name %.*s with the FID "DFID" for the striped directory "DFID", %s\n",
925                                              lod2obd(lod)->obd_name,
926                                              ent->lde_namelen,
927                                              ent->lde_name, PFID(&fid),
928                                              PFID(lu_object_fid(&obj->do_lu)),
929                                              lod->lod_lmv_failout ?
930                                              "failout" : "skip");
931
932                                 if (lod->lod_lmv_failout)
933                                         break;
934
935                                 goto next;
936                         }
937
938                         index = index * 10 + ent->lde_name[len++] - '0';
939                 } while (len < ent->lde_namelen);
940
941                 if (len == ent->lde_namelen) {
942                         /* Out of LMV EA range. */
943                         if (index >= stripes) {
944                                 CERROR("%s: the shard %.*s for the striped "
945                                        "directory "DFID" is out of the known "
946                                        "LMV EA range [0 - %u], failout\n",
947                                        lod2obd(lod)->obd_name, ent->lde_namelen,
948                                        ent->lde_name,
949                                        PFID(lu_object_fid(&obj->do_lu)),
950                                        stripes - 1);
951
952                                 break;
953                         }
954
955                         /* The slot has been occupied. */
956                         if (!fid_is_zero(&lmv1->lmv_stripe_fids[index]) &&
957                             !CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME)) {
958                                 struct lu_fid fid0;
959
960                                 fid_le_to_cpu(&fid0,
961                                         &lmv1->lmv_stripe_fids[index]);
962                                 CERROR("%s: both the shard "DFID" and "DFID
963                                        " for the striped directory "DFID
964                                        " claim the same LMV EA slot at the "
965                                        "index %d, failout\n",
966                                        lod2obd(lod)->obd_name,
967                                        PFID(&fid0), PFID(&fid),
968                                        PFID(lu_object_fid(&obj->do_lu)), index);
969
970                                 break;
971                         }
972
973                         /* stored as LE mode */
974                         lmv1->lmv_stripe_fids[index] = ent->lde_fid;
975
976 next:
977                         rc = iops->next(env, it);
978                 }
979         }
980
981         iops->put(env, it);
982         iops->fini(env, it);
983
984         RETURN(rc > 0 ? lmv_mds_md_size(stripes, magic) : rc);
985 }
986
987 /**
988  * Implementation of dt_object_operations::do_index_try.
989  *
990  * \see dt_object_operations::do_index_try() in the API description for details.
991  */
992 static int lod_index_try(const struct lu_env *env, struct dt_object *dt,
993                          const struct dt_index_features *feat)
994 {
995         struct lod_object       *lo = lod_dt_obj(dt);
996         struct dt_object        *next = dt_object_child(dt);
997         int                     rc;
998         ENTRY;
999
1000         LASSERT(next->do_ops);
1001         LASSERT(next->do_ops->do_index_try);
1002
1003         rc = lod_striping_load(env, lo);
1004         if (rc != 0)
1005                 RETURN(rc);
1006
1007         rc = next->do_ops->do_index_try(env, next, feat);
1008         if (rc != 0)
1009                 RETURN(rc);
1010
1011         if (lo->ldo_dir_stripe_count > 0) {
1012                 int i;
1013
1014                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1015                         if (!lo->ldo_stripe[i])
1016                                 continue;
1017                         if (!dt_object_exists(lo->ldo_stripe[i]))
1018                                 continue;
1019                         rc = lo->ldo_stripe[i]->do_ops->do_index_try(env,
1020                                                 lo->ldo_stripe[i], feat);
1021                         if (rc != 0)
1022                                 RETURN(rc);
1023                 }
1024                 dt->do_index_ops = &lod_striped_index_ops;
1025         } else {
1026                 dt->do_index_ops = &lod_index_ops;
1027         }
1028
1029         RETURN(rc);
1030 }
1031
/**
 * Implementation of dt_object_operations::do_read_lock.
 *
 * The request is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_read_lock() in the API description for details.
 */
static void lod_read_lock(const struct lu_env *env, struct dt_object *dt,
                          unsigned role)
{
        struct dt_object *child = dt_object_child(dt);

        dt_read_lock(env, child, role);
}
1042
/**
 * Implementation of dt_object_operations::do_write_lock.
 *
 * The request is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_write_lock() in the API description for
 * details.
 */
static void lod_write_lock(const struct lu_env *env, struct dt_object *dt,
                           unsigned role)
{
        struct dt_object *child = dt_object_child(dt);

        dt_write_lock(env, child, role);
}
1054
/**
 * Implementation of dt_object_operations::do_read_unlock.
 *
 * The request is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_read_unlock() in the API description for
 * details.
 */
static void lod_read_unlock(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        dt_read_unlock(env, child);
}
1065
/**
 * Implementation of dt_object_operations::do_write_unlock.
 *
 * The request is forwarded unchanged to the child object of \a dt.
 *
 * \see dt_object_operations::do_write_unlock() in the API description for
 * details.
 */
static void lod_write_unlock(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        dt_write_unlock(env, child);
}
1076
/**
 * Implementation of dt_object_operations::do_write_locked.
 *
 * Returns whatever dt_write_locked() reports for the child object of \a dt.
 *
 * \see dt_object_operations::do_write_locked() in the API description for
 * details.
 */
static int lod_write_locked(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_write_locked(env, child);
}
1087
/**
 * Implementation of dt_object_operations::do_attr_get.
 *
 * The attributes are read from the child (master) object only.
 *
 * \see dt_object_operations::do_attr_get() in the API description for details.
 */
static int lod_attr_get(const struct lu_env *env,
                        struct dt_object *dt,
                        struct lu_attr *attr)
{
        struct dt_object *master = dt_object_child(dt);

        /* Note: for a striped directory the client merges the attributes
         * of all the sub-stripes itself, see lmv_merge_attr(), and no MDD
         * logic depends on the directory nlink/size/time, so the master
         * inode nlink and size can always be used for now.
         */
        return dt_attr_get(env, master, attr);
}
1103
1104 void lod_adjust_stripe_size(struct lod_layout_component *comp,
1105                             __u32 def_stripe_size)
1106 {
1107         __u64 comp_end = comp->llc_extent.e_end;
1108
1109         /* Choose stripe size if not set. Note that default stripe size can't
1110          * be used as is, because it must be multiplier of given component end.
1111          *  - first check if default stripe size can be used
1112          *  - if not than select the lowest set bit from component end and use
1113          *    that value as stripe size
1114          */
1115         if (!comp->llc_stripe_size) {
1116                 if (comp_end == LUSTRE_EOF || !(comp_end % def_stripe_size))
1117                         comp->llc_stripe_size = def_stripe_size;
1118                 else
1119                         comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1120         } else {
1121                 if (comp_end != LUSTRE_EOF &&
1122                     comp_end & (LOV_MIN_STRIPE_SIZE - 1)) {
1123                         CWARN("Component end %llu is not a multiple of min size %u\n",
1124                               comp_end, LOV_MIN_STRIPE_SIZE);
1125                         comp_end = round_up(comp_end, LOV_MIN_STRIPE_SIZE);
1126                 }
1127                 /* check stripe size is multiplier of comp_end */
1128                 if (comp_end != LUSTRE_EOF &&
1129                     comp_end != comp->llc_extent.e_start &&
1130                     comp_end % comp->llc_stripe_size) {
1131                         /* fix that even for defined stripe size but warn
1132                          * about the problem, that must not happen
1133                          */
1134                         CWARN("Component end %llu is not aligned by the stripe size %u\n",
1135                               comp_end, comp->llc_stripe_size);
1136                         comp->llc_stripe_size = comp_end & ~(comp_end - 1);
1137                 }
1138         }
1139 }
1140
1141 static inline void lod_adjust_stripe_info(struct lod_layout_component *comp,
1142                                           struct lov_desc *desc,
1143                                           int append_stripes)
1144 {
1145         if (!(comp->llc_pattern & LOV_PATTERN_MDT)) {
1146                 if (append_stripes) {
1147                         comp->llc_stripe_count = append_stripes;
1148                 } else if (!comp->llc_stripe_count) {
1149                         comp->llc_stripe_count =
1150                                 desc->ld_default_stripe_count;
1151                 }
1152         }
1153
1154         lod_adjust_stripe_size(comp, desc->ld_default_stripe_size);
1155 }
1156
1157 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
1158                             struct thandle *th,
1159                             struct lod_obj_stripe_cb_data *data)
1160 {
1161         struct lod_layout_component *lod_comp;
1162         int i, j, rc = 0;
1163         ENTRY;
1164
1165         mutex_lock(&lo->ldo_layout_mutex);
1166         for (i = 0; i < lo->ldo_comp_cnt; i++) {
1167                 lod_comp = &lo->ldo_comp_entries[i];
1168
1169                 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN)
1170                         continue;
1171
1172                 if (lod_comp->llc_stripe == NULL)
1173                         continue;
1174
1175                 /* has stripe but not inited yet, this component has been
1176                  * declared to be created, but hasn't created yet.
1177                  */
1178                 if (!lod_comp_inited(lod_comp) && !data->locd_declare)
1179                         continue;
1180
1181                 if (data->locd_comp_skip_cb &&
1182                     data->locd_comp_skip_cb(env, lo, i, data))
1183                         continue;
1184
1185                 if (data->locd_comp_cb) {
1186                         rc = data->locd_comp_cb(env, lo, i, data);
1187                         if (rc)
1188                                 GOTO(unlock, rc);
1189                 }
1190
1191                 /* could used just to do sth about component, not each
1192                  * stripes
1193                  */
1194                 if (!data->locd_stripe_cb)
1195                         continue;
1196
1197                 LASSERT(lod_comp->llc_stripe_count > 0);
1198                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
1199                         struct dt_object *dt = lod_comp->llc_stripe[j];
1200
1201                         if (dt == NULL)
1202                                 continue;
1203                         rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
1204                         if (rc != 0)
1205                                 GOTO(unlock, rc);
1206                 }
1207         }
1208 unlock:
1209         mutex_unlock(&lo->ldo_layout_mutex);
1210         RETURN(rc);
1211 }
1212
1213 static inline int
1214 lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
1215                            struct dt_object *dt, struct thandle *th,
1216                            int comp_idx, int stripe_idx,
1217                            struct lod_obj_stripe_cb_data *data)
1218 {
1219         if (data->locd_declare)
1220                 return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
1221
1222         if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
1223                 CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
1224                        PFID(lu_object_fid(&dt->do_lu)),
1225                        data->locd_attr->la_layout_version, comp_idx);
1226         }
1227
1228         return lod_sub_attr_set(env, dt, data->locd_attr, th);
1229 }
1230
1231 /**
1232  * Implementation of dt_object_operations::do_declare_attr_set.
1233  *
1234  * If the object is striped, then apply the changes to all the stripes.
1235  *
1236  * \see dt_object_operations::do_declare_attr_set() in the API description
1237  * for details.
1238  */
1239 static int lod_declare_attr_set(const struct lu_env *env,
1240                                 struct dt_object *dt,
1241                                 const struct lu_attr *attr,
1242                                 struct thandle *th)
1243 {
1244         struct dt_object  *next = dt_object_child(dt);
1245         struct lod_object *lo = lod_dt_obj(dt);
1246         int                rc, i;
1247         ENTRY;
1248
1249         /*
1250          * declare setattr on the local object
1251          */
1252         rc = lod_sub_declare_attr_set(env, next, attr, th);
1253         if (rc)
1254                 RETURN(rc);
1255
1256         /* osp_declare_attr_set() ignores all attributes other than
1257          * UID, GID, PROJID, and size, and osp_attr_set() ignores all
1258          * but UID, GID and PROJID. Declaration of size attr setting
1259          * happens through lod_declare_init_size(), and not through
1260          * this function. Therefore we need not load striping unless
1261          * ownership is changing.  This should save memory and (we hope)
1262          * speed up rename().
1263          */
1264         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1265                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1266                         RETURN(rc);
1267
1268                 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1269                         RETURN(0);
1270         } else {
1271                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID | LA_MODE |
1272                                         LA_ATIME | LA_MTIME | LA_CTIME |
1273                                         LA_FLAGS)))
1274                         RETURN(rc);
1275         }
1276         /*
1277          * load striping information, notice we don't do this when object
1278          * is being initialized as we don't need this information till
1279          * few specific cases like destroy, chown
1280          */
1281         rc = lod_striping_load(env, lo);
1282         if (rc)
1283                 RETURN(rc);
1284
1285         if (!lod_obj_is_striped(dt))
1286                 RETURN(0);
1287
1288         /*
1289          * if object is striped declare changes on the stripes
1290          */
1291         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1292                 LASSERT(lo->ldo_stripe);
1293                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1294                         if (lo->ldo_stripe[i] == NULL)
1295                                 continue;
1296                         if (!dt_object_exists(lo->ldo_stripe[i]))
1297                                 continue;
1298                         rc = lod_sub_declare_attr_set(env, lo->ldo_stripe[i],
1299                                                       attr, th);
1300                         if (rc != 0)
1301                                 RETURN(rc);
1302                 }
1303         } else {
1304                 struct lod_obj_stripe_cb_data data = { { 0 } };
1305
1306                 data.locd_attr = attr;
1307                 data.locd_declare = true;
1308                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1309                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1310         }
1311
1312         if (rc)
1313                 RETURN(rc);
1314
1315         if (!dt_object_exists(next) || dt_object_remote(next) ||
1316             !S_ISREG(attr->la_mode))
1317                 RETURN(0);
1318
1319         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1320                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
1321                 RETURN(rc);
1322         }
1323
1324         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) ||
1325             CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1326                 struct lod_thread_info *info = lod_env_info(env);
1327                 struct lu_buf *buf = &info->lti_buf;
1328
1329                 buf->lb_buf = info->lti_ea_store;
1330                 buf->lb_len = info->lti_ea_store_size;
1331                 rc = lod_sub_declare_xattr_set(env, next, buf, XATTR_NAME_LOV,
1332                                                LU_XATTR_REPLACE, th);
1333         }
1334
1335         RETURN(rc);
1336 }
1337
1338 /**
1339  * Implementation of dt_object_operations::do_attr_set.
1340  *
1341  * If the object is striped, then apply the changes to all or subset of
1342  * the stripes depending on the object type and specific attributes.
1343  *
1344  * \see dt_object_operations::do_attr_set() in the API description for details.
1345  */
1346 static int lod_attr_set(const struct lu_env *env,
1347                         struct dt_object *dt,
1348                         const struct lu_attr *attr,
1349                         struct thandle *th)
1350 {
1351         struct dt_object        *next = dt_object_child(dt);
1352         struct lod_object       *lo = lod_dt_obj(dt);
1353         int                     rc, i;
1354         ENTRY;
1355
1356         /*
1357          * apply changes to the local object
1358          */
1359         rc = lod_sub_attr_set(env, next, attr, th);
1360         if (rc)
1361                 RETURN(rc);
1362
1363         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1364                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1365                         RETURN(rc);
1366
1367                 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1368                         RETURN(0);
1369         } else {
1370                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_MODE | LA_PROJID |
1371                                         LA_ATIME | LA_MTIME | LA_CTIME |
1372                                         LA_FLAGS)))
1373                         RETURN(rc);
1374         }
1375
1376         /* FIXME: a tricky case in the code path of mdd_layout_change():
1377          * the in-memory striping information has been freed in lod_xattr_set()
1378          * due to layout change. It has to load stripe here again. It only
1379          * changes flags of layout so declare_attr_set() is still accurate */
1380         rc = lod_striping_load(env, lo);
1381         if (rc)
1382                 RETURN(rc);
1383
1384         if (!lod_obj_is_striped(dt))
1385                 RETURN(0);
1386
1387         /*
1388          * if object is striped, apply changes to all the stripes
1389          */
1390         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1391                 LASSERT(lo->ldo_stripe);
1392                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1393                         if (unlikely(lo->ldo_stripe[i] == NULL))
1394                                 continue;
1395
1396                         if ((dt_object_exists(lo->ldo_stripe[i]) == 0))
1397                                 continue;
1398
1399                         rc = lod_sub_attr_set(env, lo->ldo_stripe[i], attr, th);
1400                         if (rc != 0)
1401                                 break;
1402                 }
1403         } else {
1404                 struct lod_obj_stripe_cb_data data = { { 0 } };
1405
1406                 data.locd_attr = attr;
1407                 data.locd_declare = false;
1408                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1409                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1410         }
1411
1412         if (rc)
1413                 RETURN(rc);
1414
1415         if (!dt_object_exists(next) || dt_object_remote(next) ||
1416             !S_ISREG(attr->la_mode))
1417                 RETURN(0);
1418
1419         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1420                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
1421                 RETURN(rc);
1422         }
1423
1424         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE)) {
1425                 struct lod_thread_info *info = lod_env_info(env);
1426                 struct lu_buf *buf = &info->lti_buf;
1427                 struct ost_id *oi = &info->lti_ostid;
1428                 struct lu_fid *fid = &info->lti_fid;
1429                 struct lov_mds_md_v1 *lmm;
1430                 struct lov_ost_data_v1 *objs;
1431                 __u32 magic;
1432
1433                 rc = lod_get_lov_ea(env, lo);
1434                 if (rc <= 0)
1435                         RETURN(rc);
1436
1437                 buf->lb_buf = info->lti_ea_store;
1438                 buf->lb_len = info->lti_ea_store_size;
1439                 lmm = info->lti_ea_store;
1440                 magic = le32_to_cpu(lmm->lmm_magic);
1441                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
1442                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1443                         struct lov_comp_md_entry_v1 *lcme =
1444                                                 &lcm->lcm_entries[0];
1445
1446                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
1447                         magic = le32_to_cpu(lmm->lmm_magic);
1448                 }
1449
1450                 if (magic == LOV_MAGIC_V1)
1451                         objs = &(lmm->lmm_objects[0]);
1452                 else
1453                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1454                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
1455                 ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
1456                 fid->f_oid--;
1457                 fid_to_ostid(fid, oi);
1458                 ostid_cpu_to_le(oi, &objs->l_ost_oi);
1459
1460                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1461                                        LU_XATTR_REPLACE, th);
1462         } else if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1463                 struct lod_thread_info *info = lod_env_info(env);
1464                 struct lu_buf *buf = &info->lti_buf;
1465                 struct lov_comp_md_v1 *lcm;
1466                 struct lov_comp_md_entry_v1 *lcme;
1467
1468                 rc = lod_get_lov_ea(env, lo);
1469                 if (rc <= 0)
1470                         RETURN(rc);
1471
1472                 buf->lb_buf = info->lti_ea_store;
1473                 buf->lb_len = info->lti_ea_store_size;
1474                 lcm = buf->lb_buf;
1475                 if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1 &&
1476                     le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_SEL)
1477                         RETURN(-EINVAL);
1478
1479                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1480                 lcme = &lcm->lcm_entries[0];
1481                 le64_add_cpu(&lcme->lcme_extent.e_start, 1);
1482                 le64_add_cpu(&lcme->lcme_extent.e_end, -1);
1483
1484                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1485                                        LU_XATTR_REPLACE, th);
1486         }
1487
1488         RETURN(rc);
1489 }
1490
1491 /**
1492  * Implementation of dt_object_operations::do_xattr_get.
1493  *
1494  * If LOV EA is requested from the root object and it's not
1495  * found, then return default striping for the filesystem.
1496  *
1497  * \see dt_object_operations::do_xattr_get() in the API description for details.
1498  */
1499 static int lod_xattr_get(const struct lu_env *env, struct dt_object *dt,
1500                          struct lu_buf *buf, const char *name)
1501 {
1502         struct lod_thread_info *info = lod_env_info(env);
1503         struct lod_device *dev = lu2lod_dev(dt->do_lu.lo_dev);
1504         int is_root;
1505         int rc;
1506         ENTRY;
1507
1508         rc = dt_xattr_get(env, dt_object_child(dt), buf, name);
1509         if (strcmp(name, XATTR_NAME_LMV) == 0) {
1510                 struct lmv_mds_md_v1    *lmv1;
1511                 struct lmv_foreign_md   *lfm;
1512                 int                      rc1 = 0;
1513
1514                 if (rc > (typeof(rc))sizeof(*lmv1))
1515                         RETURN(rc);
1516
1517                 /* short (<= sizeof(struct lmv_mds_md_v1)) foreign LMV case */
1518                 /* XXX empty foreign LMV is not allowed */
1519                 if (rc <= offsetof(typeof(*lfm), lfm_value))
1520                         RETURN(rc = rc > 0 ? -EINVAL : rc);
1521
1522                 if (buf->lb_buf == NULL || buf->lb_len == 0) {
1523                         BUILD_BUG_ON(sizeof(*lmv1) > sizeof(info->lti_key));
1524
1525                         /* lti_buf is large enough for *lmv1 or a short
1526                          * (<= sizeof(struct lmv_mds_md_v1)) foreign LMV
1527                          */
1528                         info->lti_buf.lb_buf = info->lti_key;
1529                         info->lti_buf.lb_len = sizeof(*lmv1);
1530                         rc = dt_xattr_get(env, dt_object_child(dt),
1531                                           &info->lti_buf, name);
1532                         if (unlikely(rc <= offsetof(typeof(*lfm),
1533                                                     lfm_value)))
1534                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1535
1536                         lfm = info->lti_buf.lb_buf;
1537                         if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN)
1538                                 RETURN(rc);
1539
1540                         if (unlikely(rc != sizeof(*lmv1)))
1541                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1542
1543                         lmv1 = info->lti_buf.lb_buf;
1544                         /* The on-disk LMV EA only contains header, but the
1545                          * returned LMV EA size should contain the space for
1546                          * the FIDs of all shards of the striped directory. */
1547                         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_V1)
1548                                 rc = lmv_mds_md_size(
1549                                         le32_to_cpu(lmv1->lmv_stripe_count),
1550                                         le32_to_cpu(lmv1->lmv_magic));
1551                 } else {
1552                         lmv1 = buf->lb_buf;
1553                         if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
1554                                 RETURN(rc);
1555
1556                         if (rc != sizeof(*lmv1))
1557                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1558
1559                         rc1 = lod_load_lmv_shards(env, lod_dt_obj(dt),
1560                                                   buf, false);
1561                 }
1562
1563                 RETURN(rc = rc1 != 0 ? rc1 : rc);
1564         }
1565
1566         if ((rc > 0) && buf->lb_buf && strcmp(name, XATTR_NAME_LOV) == 0) {
1567                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
1568
1569                 if (lcm->lcm_magic == cpu_to_le32(LOV_MAGIC_SEL))
1570                         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1571         }
1572
1573         if (rc != -ENODATA || !S_ISDIR(dt->do_lu.lo_header->loh_attr & S_IFMT))
1574                 RETURN(rc);
1575
1576         /*
1577          * XXX: Only used by lfsck
1578          *
1579          * lod returns default striping on the real root of the device
1580          * this is like the root stores default striping for the whole
1581          * filesystem. historically we've been using a different approach
1582          * and store it in the config.
1583          */
1584         dt_root_get(env, dev->lod_child, &info->lti_fid);
1585         is_root = lu_fid_eq(&info->lti_fid, lu_object_fid(&dt->do_lu));
1586
1587         if (is_root && strcmp(XATTR_NAME_LOV, name) == 0) {
1588                 struct lov_user_md *lum = buf->lb_buf;
1589                 struct lov_desc *desc = &dev->lod_ost_descs.ltd_lov_desc;
1590
1591                 if (buf->lb_buf == NULL) {
1592                         rc = sizeof(*lum);
1593                 } else if (buf->lb_len >= sizeof(*lum)) {
1594                         lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
1595                         lmm_oi_set_seq(&lum->lmm_oi, FID_SEQ_LOV_DEFAULT);
1596                         lmm_oi_set_id(&lum->lmm_oi, 0);
1597                         lmm_oi_cpu_to_le(&lum->lmm_oi, &lum->lmm_oi);
1598                         lum->lmm_pattern = cpu_to_le32(desc->ld_pattern);
1599                         lum->lmm_stripe_size = cpu_to_le32(
1600                                                 desc->ld_default_stripe_size);
1601                         lum->lmm_stripe_count = cpu_to_le16(
1602                                                 desc->ld_default_stripe_count);
1603                         lum->lmm_stripe_offset = cpu_to_le16(
1604                                                 desc->ld_default_stripe_offset);
1605                         rc = sizeof(*lum);
1606                 } else {
1607                         rc = -ERANGE;
1608                 }
1609         }
1610
1611         RETURN(rc);
1612 }
1613
1614 /**
1615  * Verify LVM EA.
1616  *
1617  * Checks that the magic of the stripe is sane.
1618  *
1619  * \param[in] lod       lod device
1620  * \param[in] lum       a buffer storing LMV EA to verify
1621  *
1622  * \retval              0 if the EA is sane
1623  * \retval              negative otherwise
1624  */
1625 static int lod_verify_md_striping(struct lod_device *lod,
1626                                   const struct lmv_user_md_v1 *lum)
1627 {
1628         if (unlikely(le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC)) {
1629                 CERROR("%s: invalid lmv_user_md: magic = %x, "
1630                        "stripe_offset = %d, stripe_count = %u: rc = %d\n",
1631                        lod2obd(lod)->obd_name, le32_to_cpu(lum->lum_magic),
1632                        (int)le32_to_cpu(lum->lum_stripe_offset),
1633                        le32_to_cpu(lum->lum_stripe_count), -EINVAL);
1634                 return -EINVAL;
1635         }
1636
1637         return 0;
1638 }
1639
1640 /**
1641  * Initialize LMV EA for a slave.
1642  *
1643  * Initialize slave's LMV EA from the master's LMV EA.
1644  *
1645  * \param[in] master_lmv        a buffer containing master's EA
1646  * \param[out] slave_lmv        a buffer where slave's EA will be stored
1647  *
1648  */
1649 static void lod_prep_slave_lmv_md(struct lmv_mds_md_v1 *slave_lmv,
1650                                   const struct lmv_mds_md_v1 *master_lmv)
1651 {
1652         *slave_lmv = *master_lmv;
1653         slave_lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
1654 }
1655
1656 /**
1657  * Generate LMV EA.
1658  *
1659  * Generate LMV EA from the object passed as \a dt. The object must have
1660  * the stripes created and initialized.
1661  *
1662  * \param[in] env       execution environment
1663  * \param[in] dt        object
1664  * \param[out] lmv_buf  buffer storing generated LMV EA
1665  *
1666  * \retval              0 on success
1667  * \retval              negative if failed
1668  */
1669 static int lod_prep_lmv_md(const struct lu_env *env, struct dt_object *dt,
1670                            struct lu_buf *lmv_buf)
1671 {
1672         struct lod_thread_info  *info = lod_env_info(env);
1673         struct lod_device       *lod = lu2lod_dev(dt->do_lu.lo_dev);
1674         struct lod_object       *lo = lod_dt_obj(dt);
1675         struct lmv_mds_md_v1    *lmm1;
1676         int                     stripe_count;
1677         int                     type = LU_SEQ_RANGE_ANY;
1678         int                     rc;
1679         __u32                   mdtidx;
1680         ENTRY;
1681
1682         LASSERT(lo->ldo_dir_striped != 0);
1683         LASSERT(lo->ldo_dir_stripe_count > 0);
1684         stripe_count = lo->ldo_dir_stripe_count;
1685         /* Only store the LMV EA heahder on the disk. */
1686         if (info->lti_ea_store_size < sizeof(*lmm1)) {
1687                 rc = lod_ea_store_resize(info, sizeof(*lmm1));
1688                 if (rc != 0)
1689                         RETURN(rc);
1690         } else {
1691                 memset(info->lti_ea_store, 0, sizeof(*lmm1));
1692         }
1693
1694         lmm1 = (struct lmv_mds_md_v1 *)info->lti_ea_store;
1695         memset(lmm1, 0, sizeof(*lmm1));
1696         lmm1->lmv_magic = cpu_to_le32(LMV_MAGIC);
1697         lmm1->lmv_stripe_count = cpu_to_le32(stripe_count);
1698         lmm1->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type);
1699         lmm1->lmv_layout_version = cpu_to_le32(lo->ldo_dir_layout_version);
1700         if (lod_is_layout_changing(lo)) {
1701                 lmm1->lmv_migrate_hash = cpu_to_le32(lo->ldo_dir_migrate_hash);
1702                 lmm1->lmv_migrate_offset =
1703                         cpu_to_le32(lo->ldo_dir_migrate_offset);
1704         }
1705         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu),
1706                             &mdtidx, &type);
1707         if (rc != 0)
1708                 RETURN(rc);
1709
1710         lmm1->lmv_master_mdt_index = cpu_to_le32(mdtidx);
1711         lmv_buf->lb_buf = info->lti_ea_store;
1712         lmv_buf->lb_len = sizeof(*lmm1);
1713
1714         RETURN(rc);
1715 }
1716
1717 /**
1718  * Create in-core represenation for a striped directory.
1719  *
1720  * Parse the buffer containing LMV EA and instantiate LU objects
1721  * representing the stripe objects. The pointers to the objects are
1722  * stored in ldo_stripe field of \a lo. This function is used when
1723  * we need to access an already created object (i.e. load from a disk).
1724  *
1725  * \param[in] env       execution environment
1726  * \param[in] lo        lod object
1727  * \param[in] buf       buffer containing LMV EA
1728  *
1729  * \retval              0 on success
1730  * \retval              negative if failed
1731  */
1732 int lod_parse_dir_striping(const struct lu_env *env, struct lod_object *lo,
1733                            const struct lu_buf *buf)
1734 {
1735         struct lod_thread_info  *info = lod_env_info(env);
1736         struct lod_device       *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1737         struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
1738         struct dt_object        **stripe;
1739         union lmv_mds_md        *lmm = buf->lb_buf;
1740         struct lmv_mds_md_v1    *lmv1 = &lmm->lmv_md_v1;
1741         struct lu_fid           *fid = &info->lti_fid;
1742         unsigned int            i;
1743         int                     rc = 0;
1744         ENTRY;
1745
1746         LASSERT(mutex_is_locked(&lo->ldo_layout_mutex));
1747
1748         /* XXX may be useless as not called for foreign LMV ?? */
1749         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_FOREIGN)
1750                 RETURN(0);
1751
1752         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_STRIPE) {
1753                 lo->ldo_dir_slave_stripe = 1;
1754                 RETURN(0);
1755         }
1756
1757         if (!lmv_is_sane(lmv1))
1758                 RETURN(-EINVAL);
1759
1760         LASSERT(lo->ldo_stripe == NULL);
1761         OBD_ALLOC_PTR_ARRAY(stripe, le32_to_cpu(lmv1->lmv_stripe_count));
1762         if (stripe == NULL)
1763                 RETURN(-ENOMEM);
1764
1765         for (i = 0; i < le32_to_cpu(lmv1->lmv_stripe_count); i++) {
1766                 struct dt_device        *tgt_dt;
1767                 struct dt_object        *dto;
1768                 int                     type = LU_SEQ_RANGE_ANY;
1769                 __u32                   idx;
1770
1771                 fid_le_to_cpu(fid, &lmv1->lmv_stripe_fids[i]);
1772                 if (!fid_is_sane(fid)) {
1773                         stripe[i] = NULL;
1774                         continue;
1775                 }
1776
1777                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
1778                 if (rc != 0)
1779                         GOTO(out, rc);
1780
1781                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
1782                         tgt_dt = lod->lod_child;
1783                 } else {
1784                         struct lod_tgt_desc     *tgt;
1785
1786                         tgt = LTD_TGT(ltd, idx);
1787                         if (tgt == NULL)
1788                                 GOTO(out, rc = -ESTALE);
1789                         tgt_dt = tgt->ltd_tgt;
1790                 }
1791
1792                 dto = dt_locate_at(env, tgt_dt, fid,
1793                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1794                                   NULL);
1795                 if (IS_ERR(dto))
1796                         GOTO(out, rc = PTR_ERR(dto));
1797
1798                 stripe[i] = dto;
1799         }
1800 out:
1801         lo->ldo_stripe = stripe;
1802         lo->ldo_is_foreign = 0;
1803         lo->ldo_dir_stripe_count = le32_to_cpu(lmv1->lmv_stripe_count);
1804         lo->ldo_dir_stripes_allocated = le32_to_cpu(lmv1->lmv_stripe_count);
1805         lo->ldo_dir_layout_version = le32_to_cpu(lmv1->lmv_layout_version);
1806         lo->ldo_dir_migrate_offset = le32_to_cpu(lmv1->lmv_migrate_offset);
1807         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv1->lmv_migrate_hash);
1808         lo->ldo_dir_hash_type = le32_to_cpu(lmv1->lmv_hash_type);
1809         if (rc != 0)
1810                 lod_striping_free_nolock(env, lo);
1811
1812         RETURN(rc);
1813 }
1814
1815 /**
1816  * Declare create a striped directory.
1817  *
1818  * Declare creating a striped directory with a given stripe pattern on the
1819  * specified MDTs. A striped directory is represented as a regular directory
1820  * - an index listing all the stripes. The stripes point back to the master
1821  * object with ".." and LinkEA. The master object gets LMV EA which
1822  * identifies it as a striped directory. The function allocates FIDs
1823  * for all stripes.
1824  *
1825  * \param[in] env       execution environment
1826  * \param[in] dt        object
1827  * \param[in] attr      attributes to initialize the objects with
1828  * \param[in] dof       type of objects to be created
1829  * \param[in] th        transaction handle
1830  *
1831  * \retval              0 on success
1832  * \retval              negative if failed
1833  */
1834 static int lod_dir_declare_create_stripes(const struct lu_env *env,
1835                                           struct dt_object *dt,
1836                                           struct lu_attr *attr,
1837                                           struct dt_object_format *dof,
1838                                           struct thandle *th)
1839 {
1840         struct lod_thread_info  *info = lod_env_info(env);
1841         struct lu_buf           lmv_buf;
1842         struct lu_buf           slave_lmv_buf;
1843         struct lmv_mds_md_v1    *lmm;
1844         struct lmv_mds_md_v1    *slave_lmm = NULL;
1845         struct dt_insert_rec    *rec = &info->lti_dt_rec;
1846         struct lod_object       *lo = lod_dt_obj(dt);
1847         int                     rc;
1848         __u32                   i;
1849         ENTRY;
1850
1851         rc = lod_prep_lmv_md(env, dt, &lmv_buf);
1852         if (rc != 0)
1853                 GOTO(out, rc);
1854         lmm = lmv_buf.lb_buf;
1855
1856         OBD_ALLOC_PTR(slave_lmm);
1857         if (slave_lmm == NULL)
1858                 GOTO(out, rc = -ENOMEM);
1859
1860         lod_prep_slave_lmv_md(slave_lmm, lmm);
1861         slave_lmv_buf.lb_buf = slave_lmm;
1862         slave_lmv_buf.lb_len = sizeof(*slave_lmm);
1863
1864         if (!dt_try_as_dir(env, dt_object_child(dt), false))
1865                 GOTO(out, rc = -EINVAL);
1866
1867         rec->rec_type = S_IFDIR;
1868         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1869                 struct dt_object        *dto = lo->ldo_stripe[i];
1870                 char                    *stripe_name = info->lti_key;
1871                 struct lu_name          *sname;
1872                 struct linkea_data       ldata          = { NULL };
1873                 struct lu_buf           linkea_buf;
1874
1875                 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
1876                 if (!dto)
1877                         continue;
1878
1879                 /* directory split skip create for existing stripes */
1880                 if (!(lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
1881                         rc = lod_sub_declare_create(env, dto, attr, NULL, dof,
1882                                                     th);
1883                         if (rc != 0)
1884                                 GOTO(out, rc);
1885
1886                         if (!dt_try_as_dir(env, dto, false))
1887                                 GOTO(out, rc = -EINVAL);
1888
1889                         rc = lod_sub_declare_ref_add(env, dto, th);
1890                         if (rc != 0)
1891                                 GOTO(out, rc);
1892
1893                         rec->rec_fid = lu_object_fid(&dto->do_lu);
1894                         rc = lod_sub_declare_insert(env, dto,
1895                                                     (const struct dt_rec *)rec,
1896                                                     (const struct dt_key *)dot,
1897                                                     th);
1898                         if (rc != 0)
1899                                 GOTO(out, rc);
1900
1901                         /* master stripe FID will be put to .. */
1902                         rec->rec_fid = lu_object_fid(&dt->do_lu);
1903                         rc = lod_sub_declare_insert(env, dto,
1904                                                   (const struct dt_rec *)rec,
1905                                                   (const struct dt_key *)dotdot,
1906                                                   th);
1907                         if (rc != 0)
1908                                 GOTO(out, rc);
1909
1910                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
1911                             cfs_fail_val == i)
1912                                 snprintf(stripe_name, sizeof(info->lti_key),
1913                                          DFID":%u",
1914                                          PFID(lu_object_fid(&dto->do_lu)),
1915                                          i + 1);
1916                         else
1917                                 snprintf(stripe_name, sizeof(info->lti_key),
1918                                          DFID":%u",
1919                                          PFID(lu_object_fid(&dto->do_lu)), i);
1920
1921                         sname = lod_name_get(env, stripe_name,
1922                                              strlen(stripe_name));
1923                         rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
1924                                               sname, lu_object_fid(&dt->do_lu));
1925                         if (rc != 0)
1926                                 GOTO(out, rc);
1927
1928                         linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
1929                         linkea_buf.lb_len = ldata.ld_leh->leh_len;
1930                         rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
1931                                                        XATTR_NAME_LINK, 0, th);
1932                         if (rc != 0)
1933                                 GOTO(out, rc);
1934
1935                         rec->rec_fid = lu_object_fid(&dto->do_lu);
1936                         rc = lod_sub_declare_insert(env, dt_object_child(dt),
1937                                         (const struct dt_rec *)rec,
1938                                         (const struct dt_key *)stripe_name, th);
1939                         if (rc != 0)
1940                                 GOTO(out, rc);
1941
1942                         rc = lod_sub_declare_ref_add(env, dt_object_child(dt),
1943                                                      th);
1944                         if (rc != 0)
1945                                 GOTO(out, rc);
1946                 }
1947
1948                 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
1949                     cfs_fail_val != i) {
1950                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
1951                             cfs_fail_val == i)
1952                                 slave_lmm->lmv_master_mdt_index =
1953                                                         cpu_to_le32(i + 1);
1954                         else
1955                                 slave_lmm->lmv_master_mdt_index =
1956                                                         cpu_to_le32(i);
1957                         rc = lod_sub_declare_xattr_set(env, dto, &slave_lmv_buf,
1958                                                        XATTR_NAME_LMV, 0, th);
1959                         if (rc != 0)
1960                                 GOTO(out, rc);
1961                 }
1962         }
1963
1964         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt),
1965                                        &lmv_buf, XATTR_NAME_LMV, 0, th);
1966         if (rc != 0)
1967                 GOTO(out, rc);
1968 out:
1969         if (slave_lmm != NULL)
1970                 OBD_FREE_PTR(slave_lmm);
1971
1972         RETURN(rc);
1973 }
1974
1975 /**
1976  * Allocate a striping on a predefined set of MDTs.
1977  *
1978  * Allocates new striping using the MDT index range provided by the data from
1979  * the lum_obejcts contained in the lmv_user_md passed to this method if
1980  * \a is_specific is true; or allocates new layout starting from MDT index in
1981  * lo->ldo_dir_stripe_offset. The exact order of MDTs is not important and
1982  * varies depending on MDT status. The number of stripes needed and stripe
1983  * offset are taken from the object. If that number cannot be met, then the
1984  * function returns an error and then it's the caller's responsibility to
1985  * release the stripes allocated. All the internal structures are protected,
1986  * but no concurrent allocation is allowed on the same objects.
1987  *
1988  * \param[in] env               execution environment for this thread
1989  * \param[in] lo                LOD object
1990  * \param[out] stripes          striping created
1991  * \param[out] mdt_indices      MDT indices of striping created
1992  * \param[in] is_specific       true if the MDTs are provided by lum; false if
1993  *                              only the starting MDT index is provided
1994  *
1995  * \retval positive     stripes allocated, including the first stripe allocated
1996  *                      outside
1997  * \retval negative     errno on failure
1998  */
1999 static int lod_mdt_alloc_specific(const struct lu_env *env,
2000                                   struct lod_object *lo,
2001                                   struct dt_object **stripes,
2002                                   __u32 *mdt_indices, bool is_specific)
2003 {
2004         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
2005         struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
2006         struct lu_tgt_desc *tgt = NULL;
2007         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2008         struct dt_device *tgt_dt = NULL;
2009         struct lu_fid fid = { 0 };
2010         struct dt_object *dto;
2011         u32 master_index;
2012         u32 stripe_count = lo->ldo_dir_stripe_count;
2013         int stripe_idx = 1;
2014         int j;
2015         int idx;
2016         int rc;
2017
2018         master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2019         if (!is_specific && stripe_count > 1)
2020                 /* Set the start index for the 2nd stripe allocation */
2021                 mdt_indices[1] = (mdt_indices[0] + 1) %
2022                                         (lod->lod_remote_mdt_count + 1);
2023
2024         for (; stripe_idx < stripe_count; stripe_idx++) {
2025                 /* Try to find next avaible target */
2026                 idx = mdt_indices[stripe_idx];
2027                 for (j = 0; j < lod->lod_remote_mdt_count;
2028                      j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) {
2029                         bool already_allocated = false;
2030                         __u32 k;
2031
2032                         CDEBUG(D_INFO,
2033                                "try idx %d, mdt cnt %u, allocated %u, specific %d count %hu offset %d hash %#X\n",
2034                                idx, lod->lod_remote_mdt_count + 1, stripe_idx,
2035                                is_specific, lo->ldo_dir_stripe_count,
2036                                (int)lo->ldo_dir_stripe_offset,
2037                                lo->ldo_dir_hash_type);
2038
2039                         if (likely(!is_specific &&
2040                                    !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE) &&
2041                                    !(lo->ldo_dir_hash_type &
2042                                      LMV_HASH_FLAG_OVERSTRIPED))) {
2043                                 /* check whether the idx already exists
2044                                  * in current allocated array */
2045                                 for (k = 0; k < stripe_idx; k++) {
2046                                         if (mdt_indices[k] == idx) {
2047                                                 already_allocated = true;
2048                                                 break;
2049                                         }
2050                                 }
2051
2052                                 if (already_allocated)
2053                                         continue;
2054                         }
2055
2056                         /* Sigh, this index is not in the bitmap, let's check
2057                          * next available target */
2058                         if (!test_bit(idx, ltd->ltd_tgt_bitmap) &&
2059                             idx != master_index)
2060                                 continue;
2061
2062                         if (idx == master_index) {
2063                                 /* Allocate the FID locally */
2064                                 tgt_dt = lod->lod_child;
2065                                 rc = dt_fid_alloc(env, tgt_dt, &fid, NULL,
2066                                                   NULL);
2067                                 if (rc < 0)
2068                                         continue;
2069                                 break;
2070                         }
2071
2072                         /* check the status of the OSP */
2073                         tgt = LTD_TGT(ltd, idx);
2074                         if (!tgt)
2075                                 continue;
2076
2077                         tgt_dt = tgt->ltd_tgt;
2078                         if (!tgt->ltd_active)
2079                                 /* this OSP doesn't feel well */
2080                                 continue;
2081
2082                         if (tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE)
2083                                 continue;
2084
2085                         rc = dt_fid_alloc(env, tgt_dt, &fid, NULL, NULL);
2086                         if (rc < 0)
2087                                 continue;
2088
2089                         break;
2090                 }
2091
2092                 /* Can not allocate more stripes */
2093                 if (j == lod->lod_remote_mdt_count) {
2094                         CDEBUG(D_INFO, "%s: require stripes %u only get %d\n",
2095                                lod2obd(lod)->obd_name, stripe_count,
2096                                stripe_idx);
2097                         break;
2098                 }
2099
2100                 CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n",
2101                        idx, stripe_idx, PFID(&fid));
2102                 mdt_indices[stripe_idx] = idx;
2103                 /* Set the start index for next stripe allocation */
2104                 if (!is_specific && stripe_idx < stripe_count - 1) {
2105                         /*
2106                          * for large dir test, put all other slaves on one
2107                          * remote MDT, otherwise we may save too many local
2108                          * slave locks which will exceed RS_MAX_LOCKS.
2109                          */
2110                         if (unlikely(CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)))
2111                                 idx = master_index;
2112                         mdt_indices[stripe_idx + 1] = (idx + 1) %
2113                                            (lod->lod_remote_mdt_count + 1);
2114                 }
2115                 /* tgt_dt and fid must be ready after search avaible OSP
2116                  * in the above loop */
2117                 LASSERT(tgt_dt != NULL);
2118                 LASSERT(fid_is_sane(&fid));
2119
2120                 /* fail a remote stripe FID allocation */
2121                 if (stripe_idx && CFS_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID))
2122                         continue;
2123
2124                 dto = dt_locate_at(env, tgt_dt, &fid,
2125                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
2126                                   &conf);
2127                 if (IS_ERR(dto)) {
2128                         rc = PTR_ERR(dto);
2129                         goto error;
2130                 }
2131
2132                 stripes[stripe_idx] = dto;
2133         }
2134
2135         return stripe_idx;
2136
2137 error:
2138         for (j = 1; j < stripe_idx; j++) {
2139                 LASSERT(stripes[j] != NULL);
2140                 dt_object_put(env, stripes[j]);
2141                 stripes[j] = NULL;
2142         }
2143         return rc;
2144 }
2145
2146 static int lod_prep_md_striped_create(const struct lu_env *env,
2147                                       struct dt_object *dt,
2148                                       struct lu_attr *attr,
2149                                       const struct lmv_user_md_v1 *lum,
2150                                       struct dt_object_format *dof,
2151                                       struct thandle *th)
2152 {
2153         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
2154         struct lod_object *lo = lod_dt_obj(dt);
2155         struct dt_object **stripes;
2156         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2157         struct lu_fid fid = { 0 };
2158         int mdt_count = lod->lod_remote_mdt_count + 1;
2159         __u32 stripe_count;
2160         int i;
2161         int rc = 0;
2162
2163         ENTRY;
2164
2165         /* The lum has been verifed in lod_verify_md_striping */
2166         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
2167                 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
2168
2169         stripe_count = lo->ldo_dir_stripe_count;
2170         /* silently clear OVERSTRIPED flag on single MDT system */
2171         if (mdt_count == 1)
2172                 lo->ldo_dir_hash_type &= ~LMV_HASH_FLAG_OVERSTRIPED;
2173         if (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) {
2174                 /* silently clamp stripe count if MDTs are not specific */
2175                 if (stripe_count > mdt_count * lod->lod_max_stripes_per_mdt) {
2176                         if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC)
2177                                 stripe_count = mdt_count *
2178                                                lod->lod_max_stripes_per_mdt;
2179                         else
2180                                 RETURN(-E2BIG);
2181                 }
2182                 /* clear OVERSTRIPED if not overstriped */
2183                 if (stripe_count <= mdt_count &&
2184                     le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC)
2185                         lo->ldo_dir_hash_type &= ~LMV_HASH_FLAG_OVERSTRIPED;
2186         } else if (stripe_count > mdt_count) {
2187                 RETURN(-E2BIG);
2188         }
2189
2190         OBD_ALLOC_PTR_ARRAY(stripes, stripe_count);
2191         if (!stripes)
2192                 RETURN(-ENOMEM);
2193
2194         /* Allocate the first stripe locally */
2195         rc = dt_fid_alloc(env, lod->lod_child, &fid, NULL, NULL);
2196         if (rc < 0)
2197                 GOTO(out, rc);
2198
2199         stripes[0] = dt_locate_at(env, lod->lod_child, &fid,
2200                                   dt->do_lu.lo_dev->ld_site->ls_top_dev, &conf);
2201         if (IS_ERR(stripes[0]))
2202                 GOTO(out, rc = PTR_ERR(stripes[0]));
2203
2204         if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) {
2205                 lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
2206                 rc = lod_mdt_alloc_qos(env, lo, stripes, 1, stripe_count);
2207                 if (rc == -EAGAIN)
2208                         rc = lod_mdt_alloc_rr(env, lo, stripes, 1,
2209                                               stripe_count);
2210         } else {
2211                 int *idx_array;
2212                 bool is_specific = false;
2213
2214                 OBD_ALLOC_PTR_ARRAY(idx_array, stripe_count);
2215                 if (!idx_array)
2216                         GOTO(out, rc = -ENOMEM);
2217
2218                 if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
2219                         int stripes_per_mdt;
2220                         int mdt;
2221                         bool overstriped = false;
2222
2223                         is_specific = true;
2224
2225                         /* Verify we do not exceed the stripes per MDT limit */
2226                         for (mdt = 0; mdt < mdt_count + 1; mdt++) {
2227                                 stripes_per_mdt = 0;
2228                                 for (i = 0; i < stripe_count; i++) {
2229                                         if (mdt ==
2230                                             le32_to_cpu(lum->lum_objects[i].lum_mds))
2231                                                 stripes_per_mdt++;
2232                                 }
2233                                 if (stripes_per_mdt >
2234                                     lod->lod_max_stripes_per_mdt)
2235                                         GOTO(out_free, rc = -EINVAL);
2236                                 if (stripes_per_mdt > 1)
2237                                         overstriped = true;
2238                         }
2239                         if (!overstriped &&
2240                             (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED))
2241                                 lo->ldo_dir_hash_type &=
2242                                         ~LMV_HASH_FLAG_OVERSTRIPED;
2243                         else if (overstriped &&
2244                                  !(lo->ldo_dir_hash_type &
2245                                    LMV_HASH_FLAG_OVERSTRIPED))
2246                                 GOTO(out_free, rc = -EINVAL);
2247
2248                         for (i = 0; i < stripe_count; i++)
2249                                 idx_array[i] =
2250                                        le32_to_cpu(lum->lum_objects[i].lum_mds);
2251                 }
2252
2253                 /* stripe 0 is local */
2254                 idx_array[0] =
2255                         lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
2256                 rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array,
2257                                             is_specific);
2258 out_free:
2259                 OBD_FREE_PTR_ARRAY(idx_array, stripe_count);
2260         }
2261
2262         if (rc < 0)
2263                 GOTO(out, rc);
2264
2265         LASSERT(rc > 0);
2266
2267         lo->ldo_dir_striped = 1;
2268         lo->ldo_stripe = stripes;
2269         lo->ldo_dir_stripe_count = rc;
2270         lo->ldo_dir_stripes_allocated = stripe_count;
2271         smp_mb();
2272         lo->ldo_dir_stripe_loaded = 1;
2273
2274         rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2275         if (rc < 0)
2276                 lod_striping_free(env, lo);
2277
2278         RETURN(rc);
2279
2280 out:
2281         LASSERT(rc < 0);
2282         if (!IS_ERR_OR_NULL(stripes[0]))
2283                 dt_object_put(env, stripes[0]);
2284         for (i = 1; i < stripe_count; i++)
2285                 LASSERT(!stripes[i]);
2286         OBD_FREE_PTR_ARRAY(stripes, stripe_count);
2287
2288         return rc;
2289 }
2290
2291 /**
2292  *
2293  * Alloc cached foreign LOV
2294  *
2295  * \param[in] lo        object
2296  * \param[in] size      size of foreign LOV
2297  *
2298  * \retval              0 on success
2299  * \retval              negative if failed
2300  */
2301 int lod_alloc_foreign_lov(struct lod_object *lo, size_t size)
2302 {
2303         OBD_ALLOC_LARGE(lo->ldo_foreign_lov, size);
2304         if (lo->ldo_foreign_lov == NULL)
2305                 return -ENOMEM;
2306         lo->ldo_foreign_lov_size = size;
2307         lo->ldo_is_foreign = 1;
2308         return 0;
2309 }
2310
2311 /**
2312  *
2313  * Free cached foreign LOV
2314  *
2315  * \param[in] lo        object
2316  */
2317 void lod_free_foreign_lov(struct lod_object *lo)
2318 {
2319         if (lo->ldo_foreign_lov != NULL)
2320                 OBD_FREE_LARGE(lo->ldo_foreign_lov, lo->ldo_foreign_lov_size);
2321         lo->ldo_foreign_lov = NULL;
2322         lo->ldo_foreign_lov_size = 0;
2323         lo->ldo_is_foreign = 0;
2324 }
2325
2326 /**
2327  *
2328  * Alloc cached foreign LMV
2329  *
2330  * \param[in] lo        object
2331  * \param[in] size      size of foreign LMV
2332  *
2333  * \retval              0 on success
2334  * \retval              negative if failed
2335  */
2336 static int lod_alloc_foreign_lmv(struct lod_object *lo, size_t size)
2337 {
2338         OBD_ALLOC_LARGE(lo->ldo_foreign_lmv, size);
2339         if (lo->ldo_foreign_lmv == NULL)
2340                 return -ENOMEM;
2341         lo->ldo_foreign_lmv_size = size;
2342         lo->ldo_is_foreign = 1;
2343
2344         return 0;
2345 }
2346
2347 static int lod_prep_md_replayed_create(const struct lu_env *env,
2348                                        struct dt_object *dt,
2349                                        struct lu_attr *attr,
2350                                        const struct lu_buf *lmv_buf,
2351                                        struct dt_object_format *dof,
2352                                        struct thandle *th)
2353 {
2354         struct lod_object *lo = lod_dt_obj(dt);
2355         int rc;
2356
2357         ENTRY;
2358
2359         mutex_lock(&lo->ldo_layout_mutex);
2360         rc = lod_parse_dir_striping(env, lo, lmv_buf);
2361         if (rc == 0) {
2362                 lo->ldo_dir_stripe_loaded = 1;
2363                 lo->ldo_dir_striped = 1;
2364                 rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2365         }
2366         mutex_unlock(&lo->ldo_layout_mutex);
2367
2368         RETURN(rc);
2369 }
2370
2371 /**
2372  *
2373  * Free cached foreign LMV
2374  *
2375  * \param[in] lo        object
2376  */
2377 static void lod_free_foreign_lmv(struct lod_object *lo)
2378 {
2379         if (lo->ldo_foreign_lmv != NULL)
2380                 OBD_FREE_LARGE(lo->ldo_foreign_lmv, lo->ldo_foreign_lmv_size);
2381         lo->ldo_foreign_lmv = NULL;
2382         lo->ldo_foreign_lmv_size = 0;
2383         lo->ldo_is_foreign = 0;
2384 }
2385
2386 /**
2387  * Declare create striped md object.
2388  *
2389  * The function declares intention to create a striped directory. This is a
2390  * wrapper for lod_prep_md_striped_create(). The only additional functionality
2391  * is to verify pattern \a lum_buf is good. Check that function for the details.
2392  *
2393  * \param[in] env       execution environment
2394  * \param[in] dt        object
2395  * \param[in] attr      attributes to initialize the objects with
2396  * \param[in] lum_buf   a pattern specifying the number of stripes and
2397  *                      MDT to start from
2398  * \param[in] dof       type of objects to be created
2399  * \param[in] th        transaction handle
2400  *
2401  * \retval              0 on success
2402  * \retval              negative if failed
2403  *
2404  */
2405 static int lod_declare_xattr_set_lmv(const struct lu_env *env,
2406                                      struct dt_object *dt,
2407                                      struct lu_attr *attr,
2408                                      const struct lu_buf *lum_buf,
2409                                      struct dt_object_format *dof,
2410                                      struct thandle *th)
2411 {
2412         struct lod_object *lo = lod_dt_obj(dt);
2413         struct lmv_user_md_v1 *lum = lum_buf->lb_buf;
2414         int rc;
2415
2416         ENTRY;
2417         LASSERT(lum != NULL);
2418
2419         CDEBUG(D_INFO,
2420                "lum magic=%x hash=%x count=%u offset=%d inherit=%u rr=%u\n",
2421                le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_hash_type),
2422                le32_to_cpu(lum->lum_stripe_count),
2423                (int)le32_to_cpu(lum->lum_stripe_offset),
2424                lum->lum_max_inherit, lum->lum_max_inherit_rr);
2425
2426         if (lo->ldo_dir_stripe_count == 0) {
2427                 if (lo->ldo_is_foreign) {
2428                         rc = lod_alloc_foreign_lmv(lo, lum_buf->lb_len);
2429                         if (rc != 0)
2430                                 RETURN(rc);
2431                         memcpy(lo->ldo_foreign_lmv, lum, lum_buf->lb_len);
2432                         lo->ldo_dir_stripe_loaded = 1;
2433                 }
2434                 RETURN(0);
2435         }
2436
2437         /* client replay striped directory creation with LMV, this happens when
2438          * all involved MDTs were rebooted, or MDT recovery was aborted.
2439          */
2440         if (le32_to_cpu(lum->lum_magic) == LMV_MAGIC_V1)
2441                 rc = lod_prep_md_replayed_create(env, dt, attr, lum_buf, dof,
2442                                                  th);
2443         else
2444                 rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
2445         if (rc != 0)
2446                 /* failed to create striping, let's reset
2447                  * config so that others don't get confused */
2448                 lod_striping_free(env, lo);
2449
2450         RETURN(rc);
2451 }
2452
2453 /**
2454  * Set or replace striped directory layout, and LFSCK may set layout on a plain
2455  * directory, so don't check stripe count.
2456  *
2457  * \param[in] env       execution environment
2458  * \param[in] dt        target object
2459  * \param[in] lmv_buf   LMV buf which contains source stripe FIDs
2460  * \param[in] fl        set or replace
2461  * \param[in] th        transaction handle
2462  *
2463  * \retval              0 on success
2464  * \retval              negative if failed
2465  */
2466 static int lod_dir_layout_set(const struct lu_env *env,
2467                               struct dt_object *dt,
2468                               const struct lu_buf *lmv_buf,
2469                               int fl,
2470                               struct thandle *th)
2471 {
2472         struct dt_object *next = dt_object_child(dt);
2473         struct lod_object *lo = lod_dt_obj(dt);
2474         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2475         struct lmv_mds_md_v1 *lmv = lmv_buf->lb_buf;
2476         struct lmv_mds_md_v1 *slave_lmv;
2477         struct lu_buf slave_buf;
2478         int i;
2479         int rc;
2480
2481         ENTRY;
2482
2483         if (!lmv_is_sane2(lmv))
2484                 RETURN(-EINVAL);
2485
2486         /* adjust hash for dir merge, which may not be set in user command */
2487         if (lmv_is_merging(lmv) &&
2488             !(lmv->lmv_migrate_hash & LMV_HASH_TYPE_MASK))
2489                 lmv->lmv_merge_hash |=
2490                         lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern &
2491                         LMV_HASH_TYPE_MASK;
2492
2493         LMV_DEBUG(D_INFO, lmv, "set");
2494
2495         rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV, fl, th);
2496         if (rc)
2497                 RETURN(rc);
2498
2499         /* directory restripe may update stripe LMV directly */
2500         if (!lo->ldo_dir_stripe_count)
2501                 RETURN(0);
2502
2503         lo->ldo_dir_hash_type = le32_to_cpu(lmv->lmv_hash_type);
2504         lo->ldo_dir_migrate_offset = le32_to_cpu(lmv->lmv_migrate_offset);
2505         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_migrate_hash);
2506         lo->ldo_dir_layout_version = le32_to_cpu(lmv->lmv_layout_version);
2507
2508         OBD_ALLOC_PTR(slave_lmv);
2509         if (!slave_lmv)
2510                 RETURN(-ENOMEM);
2511
2512         lod_prep_slave_lmv_md(slave_lmv, lmv);
2513         slave_buf.lb_buf = slave_lmv;
2514         slave_buf.lb_len = sizeof(*slave_lmv);
2515
2516         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2517                 if (!lo->ldo_stripe[i])
2518                         continue;
2519
2520                 if (!dt_object_exists(lo->ldo_stripe[i]))
2521                         continue;
2522
2523                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], &slave_buf,
2524                                        XATTR_NAME_LMV, fl, th);
2525                 if (rc)
2526                         break;
2527         }
2528
2529         OBD_FREE_PTR(slave_lmv);
2530
2531         RETURN(rc);
2532 }
2533
2534 /**
2535  * Implementation of dt_object_operations::do_declare_xattr_set.
2536  *
2537  * Used with regular (non-striped) objects. Basically it
2538  * initializes the striping information and applies the
2539  * change to all the stripes.
2540  *
2541  * \see dt_object_operations::do_declare_xattr_set() in the API description
2542  * for details.
2543  */
2544 static int lod_dir_declare_xattr_set(const struct lu_env *env,
2545                                      struct dt_object *dt,
2546                                      const struct lu_buf *buf,
2547                                      const char *name, int fl,
2548                                      struct thandle *th)
2549 {
2550         struct dt_object        *next = dt_object_child(dt);
2551         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2552         struct lod_object       *lo = lod_dt_obj(dt);
2553         int                     i;
2554         int                     rc;
2555         ENTRY;
2556
2557         if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
2558                 struct lmv_user_md_v1 *lum;
2559
2560                 LASSERT(buf != NULL);
2561                 if (!buf->lb_buf || buf->lb_len < sizeof(*lum))
2562                         RETURN(-EFAULT);
2563
2564                 lum = buf->lb_buf;
2565                 rc = lod_verify_md_striping(d, lum);
2566                 if (rc != 0)
2567                         RETURN(rc);
2568         } else if (strcmp(name, XATTR_NAME_LOV) == 0) {
2569                 rc = lod_verify_striping(env, d, lo, buf, false);
2570                 if (rc != 0)
2571                         RETURN(rc);
2572         }
2573
2574         rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
2575         if (rc != 0)
2576                 RETURN(rc);
2577
2578         /* Note: Do not set LinkEA on sub-stripes, otherwise
2579          * it will confuse the fid2path process(see mdt_path_current()).
2580          * The linkEA between master and sub-stripes is set in
2581          * lod_xattr_set_lmv(). */
2582         if (strcmp(name, XATTR_NAME_LINK) == 0)
2583                 RETURN(0);
2584
2585         /* set xattr to each stripes, if needed */
2586         rc = lod_striping_load(env, lo);
2587         if (rc != 0)
2588                 RETURN(rc);
2589
2590         if (lo->ldo_dir_stripe_count == 0)
2591                 RETURN(0);
2592
2593         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2594                 if (!lo->ldo_stripe[i])
2595                         continue;
2596
2597                 if (!dt_object_exists(lo->ldo_stripe[i]))
2598                         continue;
2599
2600                 rc = lod_sub_declare_xattr_set(env, lo->ldo_stripe[i],
2601                                                buf, name, fl, th);
2602                 if (rc != 0)
2603                         break;
2604         }
2605
2606         RETURN(rc);
2607 }
2608
2609 static int
2610 lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
2611                                      struct lod_object *lo,
2612                                      struct dt_object *dt, struct thandle *th,
2613                                      int comp_idx, int stripe_idx,
2614                                      struct lod_obj_stripe_cb_data *data)
2615 {
2616         struct lod_thread_info *info = lod_env_info(env);
2617         struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
2618         struct filter_fid *ff = &info->lti_ff;
2619         struct lu_buf *buf = &info->lti_buf;
2620         int rc;
2621
2622         buf->lb_buf = ff;
2623         buf->lb_len = sizeof(*ff);
2624         rc = dt_xattr_get(env, dt, buf, XATTR_NAME_FID);
2625         if (rc < 0) {
2626                 if (rc == -ENODATA)
2627                         return 0;
2628                 return rc;
2629         }
2630
2631         /*
2632          * locd_buf is set if it's called by dir migration, which doesn't check
2633          * pfid and comp id.
2634          */
2635         if (data->locd_buf) {
2636                 memset(ff, 0, sizeof(*ff));
2637                 ff->ff_parent = *(struct lu_fid *)data->locd_buf->lb_buf;
2638         } else {
2639                 filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
2640
2641                 if (lu_fid_eq(lod_object_fid(lo), &ff->ff_parent) &&
2642                     ff->ff_layout.ol_comp_id == comp->llc_id)
2643                         return 0;
2644
2645                 memset(ff, 0, sizeof(*ff));
2646                 ff->ff_parent = *lu_object_fid(&lo->ldo_obj.do_lu);
2647         }
2648
2649         /* rewrite filter_fid */
2650         ff->ff_parent.f_ver = stripe_idx;
2651         ff->ff_layout.ol_stripe_size = comp->llc_stripe_size;
2652         ff->ff_layout.ol_stripe_count = comp->llc_stripe_count;
2653         ff->ff_layout.ol_comp_id = comp->llc_id;
2654         ff->ff_layout.ol_comp_start = comp->llc_extent.e_start;
2655         ff->ff_layout.ol_comp_end = comp->llc_extent.e_end;
2656         filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
2657
2658         if (data->locd_declare)
2659                 rc = lod_sub_declare_xattr_set(env, dt, buf, XATTR_NAME_FID,
2660                                                LU_XATTR_REPLACE, th);
2661         else
2662                 rc = lod_sub_xattr_set(env, dt, buf, XATTR_NAME_FID,
2663                                        LU_XATTR_REPLACE, th);
2664
2665         return rc;
2666 }
2667
2668 /**
2669  * Reset parent FID on OST object
2670  *
2671  * Replace parent FID with @dt object FID, which is only called during migration
2672  * to reset the parent FID after the MDT object is migrated to the new MDT, i.e.
2673  * the FID is changed.
2674  *
2675  * \param[in] env execution environment
2676  * \param[in] dt dt_object whose stripes's parent FID will be reset
2677  * \parem[in] th thandle
2678  * \param[in] declare if it is declare
2679  *
2680  * \retval      0 if reset succeeds
2681  * \retval      negative errno if reset fails
2682  */
2683 static int lod_replace_parent_fid(const struct lu_env *env,
2684                                   struct dt_object *dt,
2685                                   const struct lu_buf *buf,
2686                                   struct thandle *th, bool declare)
2687 {
2688         struct lod_object *lo = lod_dt_obj(dt);
2689         struct lod_thread_info  *info = lod_env_info(env);
2690         struct filter_fid *ff;
2691         struct lod_obj_stripe_cb_data data = { { 0 } };
2692         int rc;
2693         ENTRY;
2694
2695         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
2696
2697         /* set xattr to each stripes, if needed */
2698         rc = lod_striping_load(env, lo);
2699         if (rc != 0)
2700                 RETURN(rc);
2701
2702         if (!lod_obj_is_striped(dt))
2703                 RETURN(0);
2704
2705         if (info->lti_ea_store_size < sizeof(*ff)) {
2706                 rc = lod_ea_store_resize(info, sizeof(*ff));
2707                 if (rc != 0)
2708                         RETURN(rc);
2709         }
2710
2711         data.locd_declare = declare;
2712         data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
2713         data.locd_buf = buf;
2714         rc = lod_obj_for_each_stripe(env, lo, th, &data);
2715
2716         RETURN(rc);
2717 }
2718
2719 __u16 lod_comp_entry_stripe_count(struct lod_object *lo, int comp_idx,
2720                                   bool is_dir)
2721 {
2722         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2723         struct lod_layout_component *entry;
2724         enum lod_uses_hint flags = LOD_USES_ASSIGNED_STRIPE;
2725
2726         if (is_dir) {
2727                 entry = &lo->ldo_def_striping->lds_def_comp_entries[comp_idx];
2728                 return entry->llc_ostlist.op_count;
2729         }
2730
2731         entry = &lo->ldo_comp_entries[comp_idx];
2732         if (lod_comp_inited(entry))
2733                 return entry->llc_stripe_count;
2734         if (entry->llc_stripe_count >= LOV_ALL_STRIPES_MIN &&
2735              entry->llc_stripe_count <= LOV_ALL_STRIPES_MAX)
2736                 return lod_get_stripe_count_plain(lod, lo,
2737                                                   entry->llc_stripe_count,
2738                                                   entry->llc_pattern &
2739                                                       LOV_PATTERN_OVERSTRIPING,
2740                                                   &flags);
2741
2742         return lod_get_stripe_count(lod, lo, comp_idx, entry->llc_stripe_count,
2743                                  entry->llc_pattern & LOV_PATTERN_OVERSTRIPING,
2744                                  &flags);
2745 }
2746
2747 static int lod_comp_md_size(struct lod_object *lo, bool is_dir)
2748 {
2749         int magic, size = 0, i;
2750         struct lod_layout_component *comp_entries;
2751         __u16 comp_cnt;
2752         bool is_composite, is_foreign = false;
2753
2754         if (is_dir) {
2755                 comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
2756                 comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
2757                 is_composite =
2758                         lo->ldo_def_striping->lds_def_striping_is_composite;
2759         } else {
2760                 comp_cnt = lo->ldo_comp_cnt;
2761                 comp_entries = lo->ldo_comp_entries;
2762                 is_composite = lo->ldo_is_composite;
2763                 is_foreign = lo->ldo_is_foreign;
2764         }
2765
2766         if (is_foreign)
2767                 return lo->ldo_foreign_lov_size;
2768
2769         LASSERT(comp_cnt != 0 && comp_entries != NULL);
2770         if (is_composite) {
2771                 size = sizeof(struct lov_comp_md_v1) +
2772                        sizeof(struct lov_comp_md_entry_v1) * comp_cnt;
2773                 LASSERT(size % sizeof(__u64) == 0);
2774         }
2775
2776         for (i = 0; i < comp_cnt; i++) {
2777                 __u16 stripe_count;
2778
2779                 if (comp_entries[i].llc_magic == LOV_MAGIC_FOREIGN) {
2780                         size += lov_foreign_md_size(comp_entries[i].llc_length);
2781                 } else {
2782                         magic = comp_entries[i].llc_pool ? LOV_MAGIC_V3 :
2783                                                            LOV_MAGIC_V1;
2784                         stripe_count = lod_comp_entry_stripe_count(lo, i,
2785                                                                    is_dir);
2786                         if (!is_dir && is_composite)
2787                                 lod_comp_shrink_stripe_count(&comp_entries[i],
2788                                                              &stripe_count);
2789                         if (is_dir && comp_entries[i].llc_ostlist.op_count)
2790                                 magic = LOV_MAGIC_SPECIFIC;
2791
2792                         size += lov_user_md_size(stripe_count, magic);
2793                 }
2794                 LASSERT(size % sizeof(__u64) == 0);
2795         }
2796         return size;
2797 }
2798
2799 /**
2800  * Declare component add. The xattr name is XATTR_LUSTRE_LOV.add, and
2801  * the xattr value is binary lov_comp_md_v1 which contains component(s)
2802  * to be added.
2803   *
2804  * \param[in] env       execution environment
2805  * \param[in] dt        dt_object to add components on
2806  * \param[in] buf       buffer contains components to be added
2807  * \parem[in] th        thandle
2808  *
2809  * \retval      0 on success
2810  * \retval      negative errno on failure
2811  */
2812 static int lod_declare_layout_add(const struct lu_env *env,
2813                                   struct dt_object *dt,
2814                                   const struct lu_buf *buf,
2815                                   struct thandle *th)
2816 {
2817         struct lod_thread_info  *info = lod_env_info(env);
2818         struct lod_layout_component *comp_array, *lod_comp, *old_array;
2819         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
2820         struct dt_object *next = dt_object_child(dt);
2821         struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
2822         struct lod_object *lo = lod_dt_obj(dt);
2823         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
2824         __u32 magic;
2825         int i, rc, array_cnt, old_array_cnt;
2826         ENTRY;
2827
2828         LASSERT(lo->ldo_is_composite);
2829
2830         if (lo->ldo_flr_state != LCM_FL_NONE)
2831                 RETURN(-EBUSY);
2832
2833         rc = lod_verify_striping(env, d, lo, buf, false);
2834         if (rc != 0)
2835                 RETURN(rc);
2836
2837         magic = comp_v1->lcm_magic;
2838         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2839                 lustre_swab_lov_comp_md_v1(comp_v1);
2840                 magic = comp_v1->lcm_magic;
2841         }
2842
2843         if (magic != LOV_USER_MAGIC_COMP_V1)
2844                 RETURN(-EINVAL);
2845
2846         mutex_lock(&lo->ldo_layout_mutex);
2847
2848         array_cnt = lo->ldo_comp_cnt + comp_v1->lcm_entry_count;
2849         OBD_ALLOC_PTR_ARRAY(comp_array, array_cnt);
2850         if (comp_array == NULL) {
2851                 mutex_unlock(&lo->ldo_layout_mutex);
2852                 RETURN(-ENOMEM);
2853         }
2854
2855
2856         memcpy(comp_array, lo->ldo_comp_entries,
2857                sizeof(*comp_array) * lo->ldo_comp_cnt);
2858
2859         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2860                 struct lov_user_md_v1 *v1;
2861                 struct lu_extent *ext;
2862
2863                 v1 = (struct lov_user_md *)((char *)comp_v1 +
2864                                 comp_v1->lcm_entries[i].lcme_offset);
2865                 ext = &comp_v1->lcm_entries[i].lcme_extent;
2866
2867                 lod_comp = &comp_array[lo->ldo_comp_cnt + i];
2868                 lod_comp->llc_extent.e_start = ext->e_start;
2869                 lod_comp->llc_extent.e_end = ext->e_end;
2870                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
2871                 lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
2872
2873                 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
2874                 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
2875                 lod_comp->llc_pattern = v1->lmm_pattern;
2876                 /**
2877                  * limit stripe count so that it's less than/equal to
2878                  * extent_size / stripe_size.
2879                  *
2880                  * Note: extension size reused llc_stripe_size field and
2881                  * uninstantiated component could be defined with
2882                  * extent_start == extent_end as extension component will
2883                  * expand it later.
2884                  */
2885                 if (!(lod_comp->llc_flags & LCME_FL_EXTENSION) &&
2886                     (lod_comp_inited(lod_comp) ||
2887                      lod_comp->llc_extent.e_start <
2888                      lod_comp->llc_extent.e_end) &&
2889                      !(lod_comp->llc_stripe_count >= LOV_ALL_STRIPES_MIN &&
2890                        lod_comp->llc_stripe_count <= LOV_ALL_STRIPES_MAX) &&
2891                     ext->e_end != OBD_OBJECT_EOF &&
2892                     (__u64)(lod_comp->llc_stripe_count *
2893                             lod_comp->llc_stripe_size) >
2894                     (ext->e_end - ext->e_start))
2895                         lod_comp->llc_stripe_count =
2896                                 DIV_ROUND_UP(ext->e_end - ext->e_start,
2897                                              lod_comp->llc_stripe_size);
2898                 lod_adjust_stripe_info(lod_comp, desc, 0);
2899
2900                 if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
2901                         struct lov_user_md_v3 *v3 = (typeof(*v3) *) v1;
2902
2903                         if (v3->lmm_pool_name[0] != '\0' &&
2904                             !lov_pool_is_ignored(v3->lmm_pool_name)) {
2905                                 rc = lod_set_pool(&lod_comp->llc_pool,
2906                                                   v3->lmm_pool_name);
2907                                 if (rc)
2908                                         GOTO(error, rc);
2909                         }
2910                 }
2911         }
2912
2913         old_array = lo->ldo_comp_entries;
2914         old_array_cnt = lo->ldo_comp_cnt;
2915
2916         lo->ldo_comp_entries = comp_array;
2917         lo->ldo_comp_cnt = array_cnt;
2918
2919         /* No need to increase layout generation here, it will be increased
2920          * later when generating component ID for the new components */
2921
2922         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
2923         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
2924                                               XATTR_NAME_LOV, 0, th);
2925         if (rc) {
2926                 lo->ldo_comp_entries = old_array;
2927                 lo->ldo_comp_cnt = old_array_cnt;
2928                 GOTO(error, rc);
2929         }
2930
2931         OBD_FREE_PTR_ARRAY(old_array, old_array_cnt);
2932
2933         LASSERT(lo->ldo_mirror_count == 1);
2934         lo->ldo_mirrors[0].lme_end = array_cnt - 1;
2935
2936         mutex_unlock(&lo->ldo_layout_mutex);
2937
2938         RETURN(0);
2939
2940 error:
2941         for (i = lo->ldo_comp_cnt; i < array_cnt; i++) {
2942                 lod_comp = &comp_array[i];
2943                 if (lod_comp->llc_pool != NULL) {
2944                         OBD_FREE(lod_comp->llc_pool,
2945                                  strlen(lod_comp->llc_pool) + 1);
2946                         lod_comp->llc_pool = NULL;
2947                 }
2948         }
2949         OBD_FREE_PTR_ARRAY(comp_array, array_cnt);
2950         mutex_unlock(&lo->ldo_layout_mutex);
2951
2952         RETURN(rc);
2953 }
2954
2955 /**
2956  * lod_last_non_stale_mirror() - Check if a mirror is the last non-stale mirror.
2957  * @mirror_id: Mirror id to be checked.
2958  * @lo:        LOD object.
2959  *
2960  * This function checks if a mirror with specified @mirror_id is the last
2961  * non-stale mirror of a LOD object @lo.
2962  *
2963  * Return: true or false.
2964  */
2965 static inline
2966 bool lod_last_non_stale_mirror(__u16 mirror_id, struct lod_object *lo)
2967 {
2968         struct lod_layout_component *lod_comp;
2969         bool has_stale_flag;
2970         int i;
2971
2972         for (i = 0; i < lo->ldo_mirror_count; i++) {
2973                 if (lo->ldo_mirrors[i].lme_id == mirror_id ||
2974                     lo->ldo_mirrors[i].lme_stale)
2975                         continue;
2976
2977                 has_stale_flag = false;
2978                 lod_foreach_mirror_comp(lod_comp, lo, i) {
2979                         if (lod_comp->llc_flags & LCME_FL_STALE) {
2980                                 has_stale_flag = true;
2981                                 break;
2982                         }
2983                 }
2984                 if (!has_stale_flag)
2985                         return false;
2986         }
2987
2988         return true;
2989 }
2990
2991 /**
2992  * Declare component set. The xattr is name XATTR_LUSTRE_LOV.set.$field,
2993  * the '$field' can only be 'flags' now. The xattr value is binary
2994  * lov_comp_md_v1 which contains the component ID(s) and the value of
2995  * the field to be modified.
2996  * Please update allowed_lustre_lov macro if $field groks more values
2997  * in the future.
2998  *
2999  * \param[in] env       execution environment
3000  * \param[in] dt        dt_object to be modified
3001  * \param[in] op        operation string, like "set.flags"
3002  * \param[in] buf       buffer contains components to be set
3003  * \parem[in] th        thandle
3004  *
3005  * \retval      0 on success
3006  * \retval      negative errno on failure
3007  */
3008 static int lod_declare_layout_set(const struct lu_env *env,
3009                                   struct dt_object *dt,
3010                                   char *op, const struct lu_buf *buf,
3011                                   struct thandle *th)
3012 {
3013         struct lod_layout_component     *lod_comp;
3014         struct lod_thread_info  *info = lod_env_info(env);
3015         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
3016         struct lod_object       *lo = lod_dt_obj(dt);
3017         struct lov_comp_md_v1   *comp_v1 = buf->lb_buf;
3018         __u32   magic;
3019         int     i, j, rc;
3020         bool    changed = false;
3021         ENTRY;
3022
3023         /* Please update allowed_lustre_lov macro if op
3024          * groks more values in the future
3025          */
3026         if (strcmp(op, "set.flags") != 0) {
3027                 CDEBUG(D_LAYOUT, "%s: operation (%s) not supported.\n",
3028                        lod2obd(d)->obd_name, op);
3029                 RETURN(-ENOTSUPP);
3030         }
3031
3032         magic = comp_v1->lcm_magic;
3033         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
3034                 lustre_swab_lov_comp_md_v1(comp_v1);
3035                 magic = comp_v1->lcm_magic;
3036         }
3037
3038         if (magic != LOV_USER_MAGIC_COMP_V1)
3039                 RETURN(-EINVAL);
3040
3041         if (comp_v1->lcm_entry_count == 0) {
3042                 CDEBUG(D_LAYOUT, "%s: entry count is zero.\n",
3043                        lod2obd(d)->obd_name);
3044                 RETURN(-EINVAL);
3045         }
3046
3047         mutex_lock(&lo->ldo_layout_mutex);
3048         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3049                 __u32 id = comp_v1->lcm_entries[i].lcme_id;
3050                 __u32 flags = comp_v1->lcm_entries[i].lcme_flags;
3051                 __u32 mirror_flag = flags & LCME_MIRROR_FLAGS;
3052                 __u16 mirror_id = mirror_id_of(id);
3053                 bool neg = flags & LCME_FL_NEG;
3054
3055                 if (flags & LCME_FL_INIT) {
3056                         if (changed)
3057                                 lod_striping_free_nolock(env, lo);
3058                         mutex_unlock(&lo->ldo_layout_mutex);
3059                         RETURN(-EINVAL);
3060                 }
3061
3062                 flags &= ~(LCME_MIRROR_FLAGS | LCME_FL_NEG);
3063                 for (j = 0; j < lo->ldo_comp_cnt; j++) {
3064                         lod_comp = &lo->ldo_comp_entries[j];
3065
3066                         /* lfs only put one flag in each entry */
3067                         if ((flags && id != lod_comp->llc_id) ||
3068                             (mirror_flag && mirror_id !=
3069                                             mirror_id_of(lod_comp->llc_id)))
3070                                 continue;
3071
3072                         if (neg) {
3073                                 if (flags)
3074                                         lod_comp->llc_flags &= ~flags;
3075                                 if (mirror_flag)
3076                                         lod_comp->llc_flags &= ~mirror_flag;
3077                         } else {
3078                                 if (flags) {
3079                                         if ((flags & LCME_FL_STALE) &&
3080                                             lod_last_non_stale_mirror(mirror_id,
3081                                                                       lo)) {
3082                                                 mutex_unlock(
3083                                                         &lo->ldo_layout_mutex);
3084                                                 RETURN(-EUCLEAN);
3085                                         }
3086                                         lod_comp->llc_flags |= flags;
3087                                 }
3088                                 if (mirror_flag) {
3089                                         lod_comp->llc_flags |= mirror_flag;
3090                                         if (mirror_flag & LCME_FL_NOSYNC)
3091                                                 lod_comp->llc_timestamp =
3092                                                        ktime_get_real_seconds();
3093                                 }
3094                         }
3095                         changed = true;
3096                 }
3097         }
3098         mutex_unlock(&lo->ldo_layout_mutex);
3099
3100         if (!changed) {
3101                 CDEBUG(D_LAYOUT, "%s: requested component(s) not found.\n",
3102                        lod2obd(d)->obd_name);
3103                 RETURN(-EINVAL);
3104         }
3105
3106         lod_obj_inc_layout_gen(lo);
3107
3108         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3109         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), &info->lti_buf,
3110                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3111         RETURN(rc);
3112 }
3113
3114 /**
3115  * Declare component deletion. The xattr name is XATTR_LUSTRE_LOV.del,
3116  * and the xattr value is a unique component ID or a special lcme_id.
3117  *
3118  * \param[in] env       execution environment
3119  * \param[in] dt        dt_object to be operated on
3120  * \param[in] buf       buffer contains component ID or lcme_id
3121  * \parem[in] th        thandle
3122  *
3123  * \retval      0 on success
3124  * \retval      negative errno on failure
3125  */
3126 static int lod_declare_layout_del(const struct lu_env *env,
3127                                   struct dt_object *dt,
3128                                   const struct lu_buf *buf,
3129                                   struct thandle *th)
3130 {
3131         struct lod_thread_info  *info = lod_env_info(env);
3132         struct dt_object *next = dt_object_child(dt);
3133         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3134         struct lod_object *lo = lod_dt_obj(dt);
3135         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
3136         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3137         __u32 magic, id, flags, neg_flags = 0;
3138         int rc, i, j, left;
3139         ENTRY;
3140
3141         LASSERT(lo->ldo_is_composite);
3142
3143         if (lo->ldo_flr_state != LCM_FL_NONE)
3144                 RETURN(-EBUSY);
3145
3146         magic = comp_v1->lcm_magic;
3147         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
3148                 lustre_swab_lov_comp_md_v1(comp_v1);
3149                 magic = comp_v1->lcm_magic;
3150         }
3151
3152         if (magic != LOV_USER_MAGIC_COMP_V1)
3153                 RETURN(-EINVAL);
3154
3155         id = comp_v1->lcm_entries[0].lcme_id;
3156         flags = comp_v1->lcm_entries[0].lcme_flags;
3157
3158         if (id > LCME_ID_MAX || (flags & ~LCME_KNOWN_FLAGS)) {
3159                 CDEBUG(D_LAYOUT, "%s: invalid component id %#x, flags %#x\n",
3160                        lod2obd(d)->obd_name, id, flags);
3161                 RETURN(-EINVAL);
3162         }
3163
3164         if (id != LCME_ID_INVAL && flags != 0) {
3165                 CDEBUG(D_LAYOUT, "%s: specified both id and flags.\n",
3166                        lod2obd(d)->obd_name);
3167                 RETURN(-EINVAL);
3168         }
3169
3170         if (id == LCME_ID_INVAL && !flags) {
3171                 CDEBUG(D_LAYOUT, "%s: no id or flags specified.\n",
3172                        lod2obd(d)->obd_name);
3173                 RETURN(-EINVAL);
3174         }
3175
3176         if (flags & LCME_FL_NEG) {
3177                 neg_flags = flags & ~LCME_FL_NEG;
3178                 flags = 0;
3179         }
3180
3181         mutex_lock(&lo->ldo_layout_mutex);
3182
3183         left = lo->ldo_comp_cnt;
3184         if (left <= 0) {
3185                 mutex_unlock(&lo->ldo_layout_mutex);
3186                 RETURN(-EINVAL);
3187         }
3188
3189         for (i = (lo->ldo_comp_cnt - 1); i >= 0; i--) {
3190                 struct lod_layout_component *lod_comp;
3191
3192                 lod_comp = &lo->ldo_comp_entries[i];
3193
3194                 if (id != LCME_ID_INVAL && id != lod_comp->llc_id)
3195                         continue;
3196                 else if (flags && !(flags & lod_comp->llc_flags))
3197                         continue;
3198                 else if (neg_flags && (neg_flags & lod_comp->llc_flags))
3199                         continue;
3200
3201                 if (left != (i + 1)) {
3202                         CDEBUG(D_LAYOUT, "%s: this deletion will create "
3203                                "a hole.\n", lod2obd(d)->obd_name);
3204                         mutex_unlock(&lo->ldo_layout_mutex);
3205                         RETURN(-EINVAL);
3206                 }
3207                 left--;
3208
3209                 /* Mark the component as deleted */
3210                 lod_comp->llc_id = LCME_ID_INVAL;
3211
3212                 /* Not instantiated component */
3213                 if (lod_comp->llc_stripe == NULL)
3214                         continue;
3215
3216                 LASSERT(lod_comp->llc_stripe_count > 0);
3217                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
3218                         struct dt_object *obj = lod_comp->llc_stripe[j];
3219
3220                         if (obj == NULL)
3221                                 continue;
3222                         rc = lod_sub_declare_destroy(env, obj, th);
3223                         if (rc) {
3224                                 mutex_unlock(&lo->ldo_layout_mutex);
3225                                 RETURN(rc);
3226                         }
3227                 }
3228         }
3229
3230         LASSERTF(left >= 0, "left = %d\n", left);
3231         if (left == lo->ldo_comp_cnt) {
3232                 CDEBUG(D_LAYOUT, "%s: requested component id:%#x not found\n",
3233                        lod2obd(d)->obd_name, id);
3234                 mutex_unlock(&lo->ldo_layout_mutex);
3235                 RETURN(-EINVAL);
3236         }
3237
3238         mutex_unlock(&lo->ldo_layout_mutex);
3239
3240         memset(attr, 0, sizeof(*attr));
3241         attr->la_valid = LA_SIZE;
3242         rc = lod_sub_declare_attr_set(env, next, attr, th);
3243         if (rc)
3244                 RETURN(rc);
3245
3246         if (left > 0) {
3247                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3248                 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
3249                                                XATTR_NAME_LOV, 0, th);
3250         } else {
3251                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
3252         }
3253
3254         RETURN(rc);
3255 }
3256
3257 /**
3258  * Declare layout add/set/del operations issued by special xattr names:
3259  *
3260  * XATTR_LUSTRE_LOV.add         add component(s) to existing file
3261  * XATTR_LUSTRE_LOV.del         delete component(s) from existing file
3262  * XATTR_LUSTRE_LOV.set.$field  set specified field of certain component(s)
3263  *
3264  * \param[in] env       execution environment
3265  * \param[in] dt        object
3266  * \param[in] name      name of xattr
3267  * \param[in] buf       lu_buf contains xattr value
3268  * \param[in] th        transaction handle
3269  *
3270  * \retval              0 on success
3271  * \retval              negative if failed
3272  */
3273 static int lod_declare_modify_layout(const struct lu_env *env,
3274                                      struct dt_object *dt,
3275                                      const char *name,
3276                                      const struct lu_buf *buf,
3277                                      struct thandle *th)
3278 {
3279         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3280         struct lod_object *lo = lod_dt_obj(dt);
3281         char *op;
3282         int rc, len = strlen(XATTR_LUSTRE_LOV);
3283         ENTRY;
3284
3285         LASSERT(dt_object_exists(dt));
3286
3287         if (strlen(name) <= len || name[len] != '.') {
3288                 CDEBUG(D_LAYOUT, "%s: invalid xattr name: %s\n",
3289                        lod2obd(d)->obd_name, name);
3290                 RETURN(-EINVAL);
3291         }
3292         len++;
3293
3294         rc = lod_striping_load(env, lo);
3295         if (rc)
3296                 GOTO(unlock, rc);
3297
3298         /* the layout to be modified must be a composite layout */
3299         if (!lo->ldo_is_composite) {
3300                 CDEBUG(D_LAYOUT, "%s: object "DFID" isn't a composite file.\n",
3301                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3302                 GOTO(unlock, rc = -EINVAL);
3303         }
3304
3305         op = (char *)name + len;
3306         if (strcmp(op, "add") == 0) {
3307                 rc = lod_declare_layout_add(env, dt, buf, th);
3308         } else if (strcmp(op, "del") == 0) {
3309                 rc = lod_declare_layout_del(env, dt, buf, th);
3310         } else if (strncmp(op, "set", strlen("set")) == 0) {
3311                 rc = lod_declare_layout_set(env, dt, op, buf, th);
3312         } else  {
3313                 CDEBUG(D_LAYOUT, "%s: unsupported xattr name:%s\n",
3314                        lod2obd(d)->obd_name, name);
3315                 GOTO(unlock, rc = -ENOTSUPP);
3316         }
3317 unlock:
3318         if (rc)
3319                 lod_striping_free(env, lo);
3320
3321         RETURN(rc);
3322 }
3323
3324 /**
3325  * Convert a plain file lov_mds_md to a composite layout.
3326  *
3327  * \param[in,out] info  the thread info::lti_ea_store buffer contains little
3328  *                      endian plain file layout
3329  *
3330  * \retval              0 on success, <0 on failure
3331  */
3332 static int lod_layout_convert(struct lod_thread_info *info)
3333 {
3334         struct lov_mds_md *lmm = info->lti_ea_store;
3335         struct lov_mds_md *lmm_save;
3336         struct lov_comp_md_v1 *lcm;
3337         struct lov_comp_md_entry_v1 *lcme;
3338         size_t size;
3339         __u32 blob_size;
3340         int rc = 0;
3341         ENTRY;
3342
3343         /* realloc buffer to a composite layout which contains one component */
3344         blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
3345                                     le32_to_cpu(lmm->lmm_magic));
3346         size = sizeof(*lcm) + sizeof(*lcme) + blob_size;
3347
3348         OBD_ALLOC_LARGE(lmm_save, blob_size);
3349         if (!lmm_save)
3350                 GOTO(out, rc = -ENOMEM);
3351
3352         memcpy(lmm_save, lmm, blob_size);
3353
3354         if (info->lti_ea_store_size < size) {
3355                 rc = lod_ea_store_resize(info, size);
3356                 if (rc)
3357                         GOTO(out, rc);
3358         }
3359
3360         lcm = info->lti_ea_store;
3361         memset(lcm, 0, sizeof(*lcm) + sizeof(*lcme));
3362         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
3363         lcm->lcm_size = cpu_to_le32(size);
3364         lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
3365                                                 lmm_save->lmm_layout_gen));
3366         lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
3367         lcm->lcm_entry_count = cpu_to_le16(1);
3368
3369         lcme = &lcm->lcm_entries[0];
3370         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
3371         lcme->lcme_extent.e_start = 0;
3372         lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
3373         lcme->lcme_offset = cpu_to_le32(sizeof(*lcm) + sizeof(*lcme));
3374         lcme->lcme_size = cpu_to_le32(blob_size);
3375
3376         memcpy((char *)lcm + lcme->lcme_offset, (char *)lmm_save, blob_size);
3377
3378         EXIT;
3379 out:
3380         if (lmm_save)
3381                 OBD_FREE_LARGE(lmm_save, blob_size);
3382         return rc;
3383 }
3384
3385 /**
3386  * Merge layouts to form a mirrored file.
3387  */
3388 static int lod_declare_layout_merge(const struct lu_env *env,
3389                                     struct dt_object *dt,
3390                                     const struct lu_buf *mbuf,
3391                                     struct thandle *th)
3392 {
3393         struct lod_thread_info *info = lod_env_info(env);
3394         struct lu_attr *layout_attr = &info->lti_layout_attr;
3395         struct lu_buf *buf = &info->lti_buf;
3396         struct lod_object *lo = lod_dt_obj(dt);
3397         struct lov_comp_md_v1 *lcm;
3398         struct lov_comp_md_v1 *cur_lcm;
3399         struct lov_comp_md_v1 *merge_lcm;
3400         struct lov_comp_md_entry_v1 *lcme;
3401         struct lov_mds_md_v1 *lmm;
3402         size_t size = 0;
3403         size_t offset;
3404         __u16 cur_entry_count;
3405         __u16 merge_entry_count;
3406         __u32 id = 0;
3407         __u16 mirror_id = 0;
3408         __u32 mirror_count;
3409         int rc, i;
3410         bool merge_has_dom;
3411
3412         ENTRY;
3413
3414         merge_lcm = mbuf->lb_buf;
3415         if (mbuf->lb_len < sizeof(*merge_lcm))
3416                 RETURN(-EINVAL);
3417
3418         /* must be an existing layout from disk */
3419         if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
3420                 RETURN(-EINVAL);
3421
3422         merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
3423
3424         /* do not allow to merge two mirrored files */
3425         if (le16_to_cpu(merge_lcm->lcm_mirror_count))
3426                 RETURN(-EBUSY);
3427
3428         /* verify the target buffer */
3429         rc = lod_get_lov_ea(env, lo);
3430         if (rc <= 0)
3431                 RETURN(rc ? : -ENODATA);
3432
3433         cur_lcm = info->lti_ea_store;
3434         switch (le32_to_cpu(cur_lcm->lcm_magic)) {
3435         case LOV_MAGIC_V1:
3436         case LOV_MAGIC_V3:
3437                 rc = lod_layout_convert(info);
3438                 break;
3439         case LOV_MAGIC_COMP_V1:
3440         case LOV_MAGIC_SEL:
3441                 rc = 0;
3442                 break;
3443         default:
3444                 rc = -EINVAL;
3445         }
3446         if (rc)
3447                 RETURN(rc);
3448
3449         /* info->lti_ea_store could be reallocated in lod_layout_convert() */
3450         cur_lcm = info->lti_ea_store;
3451         cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
3452
3453         /* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
3454         mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
3455         if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
3456                 RETURN(-ERANGE);
3457
3458         /* size of new layout */
3459         size = le32_to_cpu(cur_lcm->lcm_size) +
3460                le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
3461
3462         memset(buf, 0, sizeof(*buf));
3463         lu_buf_alloc(buf, size);
3464         if (buf->lb_buf == NULL)
3465                 RETURN(-ENOMEM);
3466
3467         lcm = buf->lb_buf;
3468         memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
3469
3470         offset = sizeof(*lcm) +
3471                  sizeof(*lcme) * (cur_entry_count + merge_entry_count);
3472         for (i = 0; i < cur_entry_count; i++) {
3473                 struct lov_comp_md_entry_v1 *cur_lcme;
3474
3475                 lcme = &lcm->lcm_entries[i];
3476                 cur_lcme = &cur_lcm->lcm_entries[i];
3477
3478                 lcme->lcme_offset = cpu_to_le32(offset);
3479                 memcpy((char *)lcm + offset,
3480                        (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
3481                        le32_to_cpu(lcme->lcme_size));
3482
3483                 offset += le32_to_cpu(lcme->lcme_size);
3484
3485                 if (mirror_count == 1 &&
3486                     mirror_id_of(le32_to_cpu(lcme->lcme_id)) == 0) {
3487                         /* Add mirror from a non-flr file, create new mirror ID.
3488                          * Otherwise, keep existing mirror's component ID, used
3489                          * for mirror extension.
3490                          */
3491                         id = pflr_id(1, i + 1);
3492                         lcme->lcme_id = cpu_to_le32(id);
3493                 }
3494
3495                 id = max(le32_to_cpu(lcme->lcme_id), id);
3496         }
3497
3498         mirror_id = mirror_id_of(id) + 1;
3499
3500         /* check if first entry in new layout is DOM */
3501         lmm = (struct lov_mds_md_v1 *)((char *)merge_lcm +
3502                                         merge_lcm->lcm_entries[0].lcme_offset);
3503         merge_has_dom = lov_pattern(le32_to_cpu(lmm->lmm_pattern)) &
3504                         LOV_PATTERN_MDT;
3505
3506         for (i = 0; i < merge_entry_count; i++) {
3507                 struct lov_comp_md_entry_v1 *merge_lcme;
3508
3509                 merge_lcme = &merge_lcm->lcm_entries[i];
3510                 lcme = &lcm->lcm_entries[cur_entry_count + i];
3511
3512                 *lcme = *merge_lcme;
3513                 lcme->lcme_offset = cpu_to_le32(offset);
3514                 if (merge_has_dom && i == 0)
3515                         lcme->lcme_flags |= cpu_to_le32(LCME_FL_STALE);
3516
3517                 id = pflr_id(mirror_id, i + 1);
3518                 lcme->lcme_id = cpu_to_le32(id);
3519
3520                 memcpy((char *)lcm + offset,
3521                        (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
3522                        le32_to_cpu(lcme->lcme_size));
3523
3524                 offset += le32_to_cpu(lcme->lcme_size);
3525         }
3526
3527         /* fixup layout information */
3528         lcm->lcm_size = cpu_to_le32(size);
3529         lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
3530         lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
3531         if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NONE)
3532                 lcm->lcm_flags = cpu_to_le32(LCM_FL_RDONLY);
3533
3534         rc = lod_striping_reload(env, lo, buf, 0);
3535         if (rc)
3536                 GOTO(out, rc);
3537
3538         lod_obj_inc_layout_gen(lo);
3539         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3540
3541         /* transfer layout version to OST objects. */
3542         if (lo->ldo_mirror_count > 1) {
3543                 struct lod_obj_stripe_cb_data data = { {0} };
3544
3545                 layout_attr->la_valid = LA_LAYOUT_VERSION;
3546                 layout_attr->la_layout_version = 0;
3547                 data.locd_attr = layout_attr;
3548                 data.locd_declare = true;
3549                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3550                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3551                 if (rc)
3552                         GOTO(out, rc);
3553         }
3554
3555         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
3556                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3557
3558 out:
3559         lu_buf_free(buf);
3560         RETURN(rc);
3561 }
3562
3563 /**
3564  * Split layouts, just set the LOVEA with the layout from mbuf.
3565  */
3566 static int lod_declare_layout_split(const struct lu_env *env,
3567                 struct dt_object *dt, const struct lu_buf *mbuf,
3568                 struct thandle *th)
3569 {
3570         struct lod_thread_info *info = lod_env_info(env);
3571         struct lu_attr *layout_attr = &info->lti_layout_attr;
3572         struct lod_object *lo = lod_dt_obj(dt);
3573         struct lov_comp_md_v1 *lcm = mbuf->lb_buf;
3574         int rc;
3575         ENTRY;
3576
3577         rc = lod_striping_reload(env, lo, mbuf, LVF_ALL_STALE);
3578         if (rc)
3579                 RETURN(rc);
3580
3581         lod_obj_inc_layout_gen(lo);
3582         /* fix on-disk layout gen */
3583         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3584
3585         /* transfer layout version to OST objects. */
3586         if (lo->ldo_mirror_count > 1) {
3587                 struct lod_obj_stripe_cb_data data = { {0} };
3588
3589                 layout_attr->la_valid = LA_LAYOUT_VERSION;
3590                 layout_attr->la_layout_version = 0;
3591                 data.locd_attr = layout_attr;
3592                 data.locd_declare = true;
3593                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
3594                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
3595                 if (rc)
3596                         RETURN(rc);
3597         }
3598
3599         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), mbuf,
3600                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3601         RETURN(rc);
3602 }
3603
3604 static int lod_layout_declare_or_purge_mirror(const struct lu_env *env,
3605                         struct dt_object *dt, const struct lu_buf *buf,
3606                         struct thandle *th, bool declare)
3607 {
3608         struct lod_thread_info *info = lod_env_info(env);
3609         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3610         struct lod_object *lo = lod_dt_obj(dt);
3611         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3612         struct lov_comp_md_entry_v1 *entry;
3613         struct lov_mds_md_v1 *lmm;
3614         struct dt_object **sub_objs = NULL;
3615         int rc = 0, i, k, array_count = 0;
3616
3617         ENTRY;
3618
3619         /**
3620          * other ops (like lod_declare_destroy) could destroying sub objects
3621          * as well.
3622          */
3623         mutex_lock(&lo->ldo_layout_mutex);
3624
3625         if (!declare) {
3626                 /* prepare sub-objects array */
3627                 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3628                         entry = &comp_v1->lcm_entries[i];
3629
3630                         if (!(entry->lcme_flags & LCME_FL_INIT))
3631                                 continue;
3632
3633                         lmm = (struct lov_mds_md_v1 *)
3634                                         ((char *)comp_v1 + entry->lcme_offset);
3635                         array_count += lmm->lmm_stripe_count;
3636                 }
3637                 OBD_ALLOC_PTR_ARRAY(sub_objs, array_count);
3638                 if (sub_objs == NULL) {
3639                         mutex_unlock(&lo->ldo_layout_mutex);
3640                         RETURN(-ENOMEM);
3641                 }
3642         }
3643
3644         k = 0;  /* sub_objs index */
3645         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
3646                 struct lov_ost_data_v1 *objs;
3647                 struct lu_object *o, *n;
3648                 struct dt_object *dto;
3649                 struct lu_device *nd;
3650                 struct lov_mds_md_v3 *v3;
3651                 __u32 idx;
3652                 int j;
3653
3654                 entry = &comp_v1->lcm_entries[i];
3655
3656                 if (!(entry->lcme_flags & LCME_FL_INIT))
3657                         continue;
3658
3659                 lmm = (struct lov_mds_md_v1 *)
3660                                 ((char *)comp_v1 + entry->lcme_offset);
3661                 v3 = (struct lov_mds_md_v3 *)lmm;
3662                 if (lmm->lmm_magic == LOV_MAGIC_V3)
3663                         objs = &v3->lmm_objects[0];
3664                 else
3665                         objs = &lmm->lmm_objects[0];
3666
3667                 for (j = 0; j < lmm->lmm_stripe_count; j++) {
3668                         idx = objs[j].l_ost_idx;
3669                         rc = ostid_to_fid(&info->lti_fid, &objs[j].l_ost_oi,
3670                                           idx);
3671                         if (rc)
3672                                 GOTO(out, rc);
3673
3674                         if (!fid_is_sane(&info->lti_fid)) {
3675                                 CERROR("%s: sub-object insane fid "DFID"\n",
3676                                        lod2obd(d)->obd_name,
3677                                        PFID(&info->lti_fid));
3678                                 GOTO(out, rc = -EINVAL);
3679                         }
3680
3681                         lod_getref(&d->lod_ost_descs);
3682
3683                         rc = validate_lod_and_idx(d, idx);
3684                         if (unlikely(rc)) {
3685                                 lod_putref(d, &d->lod_ost_descs);
3686                                 GOTO(out, rc);
3687                         }
3688
3689                         nd = &OST_TGT(d, idx)->ltd_tgt->dd_lu_dev;
3690                         lod_putref(d, &d->lod_ost_descs);
3691
3692                         o = lu_object_find_at(env, nd, &info->lti_fid, NULL);
3693                         if (IS_ERR(o))
3694                                 GOTO(out, rc = PTR_ERR(o));
3695
3696                         n = lu_object_locate(o->lo_header, nd->ld_type);
3697                         if (unlikely(!n)) {
3698                                 lu_object_put(env, n);
3699                                 GOTO(out, rc = -ENOENT);
3700                         }
3701
3702                         dto = container_of(n, struct dt_object, do_lu);
3703
3704                         if (declare) {
3705                                 rc = lod_sub_declare_destroy(env, dto, th);
3706                                 dt_object_put(env, dto);
3707                                 if (rc)
3708                                         GOTO(out, rc);
3709                         } else {
3710                                 /**
3711                                  * collect to-be-destroyed sub objects, the
3712                                  * reference would be released after actual
3713                                  * deletion.
3714                                  */
3715                                 sub_objs[k] = dto;
3716                                 k++;
3717                         }
3718                 } /* for each stripe */
3719         } /* for each component in the mirror */
3720 out:
3721         if (!declare) {
3722                 i = 0;
3723                 if (!rc) {
3724                         /* destroy the sub objects */
3725                         for (; i < k; i++) {
3726                                 rc = lod_sub_destroy(env, sub_objs[i], th);
3727                                 if (rc)
3728                                         break;
3729                                 dt_object_put(env, sub_objs[i]);
3730                         }
3731                 }
3732                 /**
3733                  * if a sub object destroy failed, we'd release sub objects
3734                  * reference get from above sub_objs collection.
3735                  */
3736                 for (; i < k; i++)
3737                         dt_object_put(env, sub_objs[i]);
3738
3739                 OBD_FREE_PTR_ARRAY(sub_objs, array_count);
3740         }
3741         mutex_unlock(&lo->ldo_layout_mutex);
3742
3743         RETURN(rc);
3744 }
3745
3746 /**
3747  * Purge layouts, delete sub objects in the mirror stored in the vic_buf,
3748  * and set the LOVEA with the layout from mbuf.
3749  */
3750 static int lod_declare_layout_purge(const struct lu_env *env,
3751                 struct dt_object *dt, const struct lu_buf *buf,
3752                 struct thandle *th)
3753 {
3754         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3755         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
3756         int rc;
3757
3758         ENTRY;
3759
3760         if (le32_to_cpu(comp_v1->lcm_magic) != LOV_MAGIC_COMP_V1) {
3761                 CERROR("%s: invalid layout magic %#x != %#x\n",
3762                        lod2obd(d)->obd_name, le32_to_cpu(comp_v1->lcm_magic),
3763                        LOV_MAGIC_COMP_V1);
3764                 RETURN(-EINVAL);
3765         }
3766
3767         if (cpu_to_le32(LOV_MAGIC_COMP_V1) != LOV_MAGIC_COMP_V1)
3768                 lustre_swab_lov_comp_md_v1(comp_v1);
3769
3770         /* from now on, @buf contains cpu endian data */
3771
3772         if (comp_v1->lcm_mirror_count != 0) {
3773                 CERROR("%s: can only purge one mirror from "DFID"\n",
3774                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3775                 RETURN(-EINVAL);
3776         }
3777
3778         /* delcare sub objects deletion in the mirror stored in @buf */
3779         rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, true);
3780         RETURN(rc);
3781 }
3782
3783 /* delete sub objects from the mirror stored in @buf */
3784 static int lod_layout_purge(const struct lu_env *env, struct dt_object *dt,
3785                             const struct lu_buf *buf, struct thandle *th)
3786 {
3787         int rc;
3788
3789         ENTRY;
3790         rc = lod_layout_declare_or_purge_mirror(env, dt, buf, th, false);
3791         RETURN(rc);
3792 }
3793
3794 /**
3795  * Implementation of dt_object_operations::do_declare_xattr_set.
3796  *
3797  * \see dt_object_operations::do_declare_xattr_set() in the API description
3798  * for details.
3799  *
3800  * the extension to the API:
3801  *   - declaring LOVEA requests striping creation
3802  *   - LU_XATTR_REPLACE means layout swap
3803  */
3804 static int lod_declare_xattr_set(const struct lu_env *env,
3805                                  struct dt_object *dt,
3806                                  const struct lu_buf *buf,
3807                                  const char *name, int fl,
3808                                  struct thandle *th)
3809 {
3810         struct lod_thread_info *info = lod_env_info(env);
3811         struct dt_object *next = dt_object_child(dt);
3812         struct lu_attr   *attr = &info->lti_attr;
3813         struct lod_object *lo = lod_dt_obj(dt);
3814         __u32             mode;
3815         int               rc;
3816         ENTRY;
3817
3818         mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
3819         if ((S_ISREG(mode) || mode == 0) &&
3820             !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE | LU_XATTR_SPLIT |
3821                     LU_XATTR_PURGE)) &&
3822             (strcmp(name, XATTR_NAME_LOV) == 0 ||
3823              strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
3824                 /*
3825                  * this is a request to create object's striping.
3826                  *
3827                  * allow to declare predefined striping on a new (!mode) object
3828                  * which is supposed to be replay of regular file creation
3829                  * (when LOV setting is declared)
3830                  *
3831                  * LU_XATTR_REPLACE is set to indicate a layout swap
3832                  */
3833                 if (dt_object_exists(dt)) {
3834                         rc = dt_attr_get(env, next, attr);
3835                         if (rc)
3836                                 RETURN(rc);
3837                 } else {
3838                         memset(attr, 0, sizeof(*attr));
3839                         attr->la_valid = LA_TYPE | LA_MODE;
3840                         attr->la_mode = S_IFREG;
3841                 }
3842                 rc = lod_declare_striped_create(env, dt, attr, buf, th);
3843         } else if (fl & LU_XATTR_MERGE) {
3844                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3845                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3846                 rc = lod_declare_layout_merge(env, dt, buf, th);
3847         } else if (fl & LU_XATTR_SPLIT) {
3848                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3849                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3850                 rc = lod_declare_layout_split(env, dt, buf, th);
3851         } else if (fl & LU_XATTR_PURGE) {
3852                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3853                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3854                 rc = lod_declare_layout_purge(env, dt, buf, th);
3855         } else if (S_ISREG(mode) &&
3856                    strlen(name) >= sizeof(XATTR_LUSTRE_LOV) + 3 &&
3857                    allowed_lustre_lov(name)) {
3858                 /*
3859                  * this is a request to modify object's striping.
3860                  * add/set/del component(s).
3861                  */
3862                 if (!dt_object_exists(dt))
3863                         RETURN(-ENOENT);
3864
3865                 rc = lod_declare_modify_layout(env, dt, name, buf, th);
3866         } else if (S_ISDIR(mode)) {
3867                 rc = lod_dir_declare_xattr_set(env, dt, buf, name, fl, th);
3868         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
3869                 rc = lod_replace_parent_fid(env, dt, buf, th, true);
3870         } else {
3871                 rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
3872         }
3873
3874         if (rc == 0 &&
3875             (strcmp(name, XATTR_NAME_LOV) == 0 ||
3876              strcmp(name, XATTR_LUSTRE_LOV) == 0 || allowed_lustre_lov(name)))
3877                 rc = lod_save_layout_gen_intrans(info, lo);
3878
3879         RETURN(rc);
3880 }
3881
3882 /**
3883  * Apply xattr changes to the object.
3884  *
3885  * Applies xattr changes to the object and the stripes if the latter exist.
3886  *
3887  * \param[in] env       execution environment
3888  * \param[in] dt        object
3889  * \param[in] buf       buffer pointing to the new value of xattr
3890  * \param[in] name      name of xattr
3891  * \param[in] fl        flags
3892  * \param[in] th        transaction handle
3893  *
3894  * \retval              0 on success
3895  * \retval              negative if failed
3896  */
3897 static int lod_xattr_set_internal(const struct lu_env *env,
3898                                   struct dt_object *dt,
3899                                   const struct lu_buf *buf,
3900                                   const char *name, int fl,
3901                                   struct thandle *th)
3902 {
3903         struct dt_object        *next = dt_object_child(dt);
3904         struct lod_object       *lo = lod_dt_obj(dt);
3905         int                     rc;
3906         int                     i;
3907         ENTRY;
3908
3909         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
3910         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3911                 RETURN(rc);
3912
3913         /* Note: Do not set LinkEA on sub-stripes, otherwise
3914          * it will confuse the fid2path process(see mdt_path_current()).
3915          * The linkEA between master and sub-stripes is set in
3916          * lod_xattr_set_lmv(). */
3917         if (lo->ldo_dir_stripe_count == 0 || strcmp(name, XATTR_NAME_LINK) == 0)
3918                 RETURN(0);
3919
3920         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3921                 if (!lo->ldo_stripe[i])
3922                         continue;
3923
3924                 if (!dt_object_exists(lo->ldo_stripe[i]))
3925                         continue;
3926
3927                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], buf, name,
3928                                        fl, th);
3929                 if (rc != 0)
3930                         break;
3931         }
3932
3933         RETURN(rc);
3934 }
3935
3936 /**
3937  * Delete an extended attribute.
3938  *
3939  * Deletes specified xattr from the object and the stripes if the latter exist.
3940  *
3941  * \param[in] env       execution environment
3942  * \param[in] dt        object
3943  * \param[in] name      name of xattr
3944  * \param[in] th        transaction handle
3945  *
3946  * \retval              0 on success
3947  * \retval              negative if failed
3948  */
3949 static int lod_xattr_del_internal(const struct lu_env *env,
3950                                   struct dt_object *dt,
3951                                   const char *name, struct thandle *th)
3952 {
3953         struct dt_object *next = dt_object_child(dt);
3954         struct lod_object *lo = lod_dt_obj(dt);
3955         int i;
3956         int rc;
3957
3958         ENTRY;
3959
3960         rc = lod_sub_xattr_del(env, next, name, th);
3961         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3962                 RETURN(rc);
3963
3964         if (lo->ldo_dir_stripe_count == 0)
3965                 RETURN(rc);
3966
3967         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3968                 if (!lo->ldo_stripe[i])
3969                         continue;
3970
3971                 if (!dt_object_exists(lo->ldo_stripe[i]))
3972                         continue;
3973
3974                 rc = lod_sub_xattr_del(env, lo->ldo_stripe[i], name, th);
3975                 if (rc != 0)
3976                         break;
3977         }
3978
3979         RETURN(rc);
3980 }
3981
3982 /**
3983  * Set default striping on a directory.
3984  *
3985  * Sets specified striping on a directory object unless it matches the default
3986  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
3987  * EA. This striping will be used when regular file is being created in this
3988  * directory.
3989  *
3990  * \param[in] env       execution environment
3991  * \param[in] dt        the striped object
3992  * \param[in] buf       buffer with the striping
3993  * \param[in] name      name of EA
3994  * \param[in] fl        xattr flag (see OSD API description)
3995  * \param[in] th        transaction handle
3996  *
3997  * \retval              0 on success
3998  * \retval              negative if failed
3999  */
4000 static int lod_xattr_set_lov_on_dir(const struct lu_env *env,
4001                                     struct dt_object *dt,
4002                                     const struct lu_buf *buf,
4003                                     const char *name, int fl,
4004                                     struct thandle *th)
4005 {
4006         struct lov_user_md_v1   *lum;
4007         struct lov_user_md_v3   *v3 = NULL;
4008         const char              *pool_name = NULL;
4009         int                      rc;
4010         bool                     is_del;
4011         ENTRY;
4012
4013         LASSERT(buf != NULL && buf->lb_buf != NULL);
4014         lum = buf->lb_buf;
4015
4016         switch (lum->lmm_magic) {
4017         case LOV_USER_MAGIC_SPECIFIC:
4018         case LOV_USER_MAGIC_V3:
4019                 v3 = buf->lb_buf;
4020                 if (lov_pool_is_reserved(v3->lmm_pool_name))
4021                         memset(v3->lmm_pool_name, 0, sizeof(v3->lmm_pool_name));
4022                 else if (v3->lmm_pool_name[0] != '\0')
4023                         pool_name = v3->lmm_pool_name;
4024                 fallthrough;
4025         case LOV_USER_MAGIC_V1:
4026                 /* if { size, offset, count } = { 0, -1, 0 } and no pool
4027                  * (i.e. all default values specified) then delete default
4028                  * striping from dir. */
4029                 CDEBUG(D_LAYOUT,
4030                        "set default striping: sz %u # %u offset %d %s %s\n",
4031                        (unsigned)lum->lmm_stripe_size,
4032                        (unsigned)lum->lmm_stripe_count,
4033                        (int)lum->lmm_stripe_offset,
4034                        v3 ? "from" : "", v3 ? v3->lmm_pool_name : "");
4035
4036                 is_del = LOVEA_DELETE_VALUES(lum->lmm_stripe_size,
4037                                              lum->lmm_stripe_count,
4038                                              lum->lmm_stripe_offset,
4039                                              pool_name);
4040                 break;
4041         case LOV_USER_MAGIC_COMP_V1:
4042         {
4043                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lum;
4044                 struct lov_comp_md_entry_v1 *lcme;
4045                 int i, comp_cnt;
4046
4047                 comp_cnt = le16_to_cpu(lcm->lcm_entry_count);
4048                 for (i = 0; i < comp_cnt; i++) {
4049                         lcme = &lcm->lcm_entries[i];
4050                         if (lcme->lcme_flags & cpu_to_le32(LCME_FL_EXTENSION)) {
4051                                 lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_SEL);
4052                                 break;
4053                         }
4054                 }
4055
4056                 is_del = false;
4057                 break;
4058         }
4059         default:
4060                 CERROR("Invalid magic %x\n", lum->lmm_magic);
4061                 RETURN(-EINVAL);
4062         }
4063
4064         if (is_del) {
4065                 rc = lod_xattr_del_internal(env, dt, name, th);
4066                 if (rc == -ENODATA)
4067                         rc = 0;
4068         } else {
4069                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4070         }
4071
4072         RETURN(rc);
4073 }
4074
/*
 * Forward declaration: lod_xattr_set_default_lov_on_dir() below calls this
 * helper, whose definition is elsewhere in this file.
 */
static int
lod_get_default_lov_striping(const struct lu_env *env, struct lod_object *lo,
			     struct lod_default_striping *lds,
			     struct dt_allocation_hint *ah);
4079
4080 /**
4081  * Helper function to convert compound layout to compound layout with
4082  * pool
4083  *
4084  * Copy lcm_entries array of \a src to \a tgt. Replace lov_user_md_v1
4085  * components of \a src with lov_user_md_v3 using \a pool.
4086  *
4087  * \param[in] src       source layout
4088  * \param[in] pool      pool to use in \a tgt
4089  * \param[out] tgt      target layout
4090  */
4091 static void embed_pool_to_comp_v1(const struct lov_comp_md_v1 *src,
4092                                   const char *pool,
4093                                   struct lov_comp_md_v1 *tgt)
4094 {
4095         size_t shift;
4096         struct lov_user_md_v1 *lum;
4097         struct lov_user_md_v3 *lum3;
4098         struct lov_comp_md_entry_v1 *entry;
4099         int i;
4100         __u32 offset;
4101
4102         entry = tgt->lcm_entries;
4103         shift = 0;
4104         for (i = 0; i < le16_to_cpu(src->lcm_entry_count); i++, entry++) {
4105                 *entry = src->lcm_entries[i];
4106                 offset = le32_to_cpu(src->lcm_entries[i].lcme_offset);
4107                 entry->lcme_offset = cpu_to_le32(offset + shift);
4108
4109                 lum = (struct lov_user_md_v1 *)((char *)src + offset);
4110                 lum3 = (struct lov_user_md_v3 *)((char *)tgt + offset + shift);
4111                 *(struct lov_user_md_v1 *)lum3 = *lum;
4112                 if (lum->lmm_pattern & cpu_to_le32(LOV_PATTERN_MDT)) {
4113                         lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
4114                 } else {
4115                         lum3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4116                         entry->lcme_size = cpu_to_le32(sizeof(*lum3));
4117                         strscpy(lum3->lmm_pool_name, pool,
4118                                 sizeof(lum3->lmm_pool_name));
4119                         shift += sizeof(*lum3) - sizeof(*lum);
4120                 }
4121         }
4122 }
4123
4124 /**
4125  * Set default striping on a directory.
4126  *
4127  * Sets specified striping on a directory object unless it matches the default
4128  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4129  * EA. This striping will be used when regular file is being created in this
4130  * directory.
4131  * If current default striping includes a pool but specifed striping
4132  * does not - retain the pool if it exists.
4133  *
4134  * \param[in] env       execution environment
4135  * \param[in] dt        the striped object
4136  * \param[in] buf       buffer with the striping
4137  * \param[in] name      name of EA
4138  * \param[in] fl        xattr flag (see OSD API description)
4139  * \param[in] th        transaction handle
4140  *
4141  * \retval              0 on success
4142  * \retval              negative if failed
4143  */
4144 static int lod_xattr_set_default_lov_on_dir(const struct lu_env *env,
4145                                             struct dt_object *dt,
4146                                             const struct lu_buf *buf,
4147                                             const char *name, int fl,
4148                                             struct thandle *th)
4149 {
4150         struct lod_default_striping     *lds = lod_lds_buf_get(env);
4151         struct lov_user_md_v1           *v1 = buf->lb_buf;
4152         char                             pool[LOV_MAXPOOLNAME + 1];
4153         bool                             is_del;
4154         int                              rc;
4155
4156         ENTRY;
4157
4158         /* get existing striping config */
4159         rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds, NULL);
4160         if (rc)
4161                 RETURN(rc);
4162
4163         memset(pool, 0, sizeof(pool));
4164         if (lds->lds_def_striping_set == 1)
4165                 lod_layout_get_pool(lds->lds_def_comp_entries,
4166                                     lds->lds_def_comp_cnt, pool,
4167                                     sizeof(pool));
4168
4169         is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
4170                                      v1->lmm_stripe_count,
4171                                      v1->lmm_stripe_offset,
4172                                      NULL);
4173
4174         /* Retain the pool name if it is not given */
4175         if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
4176             !is_del) {
4177                 struct lod_thread_info *info = lod_env_info(env);
4178                 struct lov_user_md_v3 *v3  = info->lti_ea_store;
4179
4180                 memset(v3, 0, sizeof(*v3));
4181                 v3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4182                 v3->lmm_pattern = cpu_to_le32(v1->lmm_pattern);
4183                 v3->lmm_stripe_count = cpu_to_le32(v1->lmm_stripe_count);
4184                 v3->lmm_stripe_offset = cpu_to_le32(v1->lmm_stripe_offset);
4185                 v3->lmm_stripe_size = cpu_to_le32(v1->lmm_stripe_size);
4186
4187                 strscpy(v3->lmm_pool_name, pool, sizeof(v3->lmm_pool_name));
4188
4189                 info->lti_buf.lb_buf = v3;
4190                 info->lti_buf.lb_len = sizeof(*v3);
4191                 rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4192                                               name, fl, th);
4193         } else if (v1->lmm_magic == LOV_USER_MAGIC_COMP_V1 &&
4194                    pool[0] != '\0' && !is_del) {
4195                 /*
4196                  * try to retain the pool from default layout if the
4197                  * specified component layout does not provide pool
4198                  * info explicitly
4199                  */
4200                 struct lod_thread_info *info = lod_env_info(env);
4201                 struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
4202                 struct lov_comp_md_v1 *comp_v1p;
4203                 struct lov_user_md_v1 *lum;
4204                 int entry_count;
4205                 int i;
4206                 __u32 offset;
4207                 struct lov_comp_md_entry_v1 *entry;
4208                 int size;
4209
4210                 entry_count = le16_to_cpu(comp_v1->lcm_entry_count);
4211                 size = sizeof(*comp_v1) +
4212                         entry_count * sizeof(comp_v1->lcm_entries[0]);
4213                 entry = comp_v1->lcm_entries;
4214                 for (i = 0; i < entry_count; i++, entry++) {
4215                         offset = le32_to_cpu(entry->lcme_offset);
4216                         lum = (struct lov_user_md_v1 *)((char *)comp_v1 +
4217                                                         offset);
4218                         if (le32_to_cpu(lum->lmm_magic) != LOV_USER_MAGIC_V1)
4219                                 /* the i-th component includes pool info */
4220                                 break;
4221                         if (lum->lmm_pattern & cpu_to_le32(LOV_PATTERN_MDT))
4222                                 size += sizeof(struct lov_user_md_v1);
4223                         else
4224                                 size += sizeof(struct lov_user_md_v3);
4225                 }
4226
4227                 if (i == entry_count) {
4228                         /*
4229                          * re-compose the layout to include the pool for
4230                          * each component
4231                          */
4232                         if (info->lti_ea_store_size < size)
4233                                 rc = lod_ea_store_resize(info, size);
4234
4235                         if (rc == 0) {
4236                                 comp_v1p = info->lti_ea_store;
4237                                 *comp_v1p = *comp_v1;
4238                                 comp_v1p->lcm_size = cpu_to_le32(size);
4239                                 embed_pool_to_comp_v1(comp_v1, pool, comp_v1p);
4240
4241                                 info->lti_buf.lb_buf = comp_v1p;
4242                                 info->lti_buf.lb_len = size;
4243                                 rc = lod_xattr_set_lov_on_dir(env, dt,
4244                                                               &info->lti_buf,
4245                                                               name, fl, th);
4246                         }
4247                 } else {
4248                         rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl,
4249                                                       th);
4250                 }
4251         } else {
4252                 rc = lod_xattr_set_lov_on_dir(env, dt, buf, name, fl, th);
4253         }
4254
4255         if (lds->lds_def_striping_set == 1 && lds->lds_def_comp_entries != NULL)
4256                 lod_free_def_comp_entries(lds);
4257
4258         RETURN(rc);
4259 }
4260
4261 /**
4262  * Set default striping on a directory object.
4263  *
4264  * Sets specified striping on a directory object unless it matches the default
4265  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
4266  * EA. This striping will be used when a new directory is being created in the
4267  * directory.
4268  *
4269  * \param[in] env       execution environment
4270  * \param[in] dt        the striped object
4271  * \param[in] buf       buffer with the striping
4272  * \param[in] name      name of EA
4273  * \param[in] fl        xattr flag (see OSD API description)
4274  * \param[in] th        transaction handle
4275  *
4276  * \retval              0 on success
4277  * \retval              negative if failed
4278  */
4279 static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env,
4280                                             struct dt_object *dt,
4281                                             const struct lu_buf *buf,
4282                                             const char *name, int fl,
4283                                             struct thandle *th)
4284 {
4285         struct lmv_user_md_v1 *lum;
4286         int rc;
4287
4288         ENTRY;
4289
4290         LASSERT(buf != NULL && buf->lb_buf != NULL);
4291         lum = buf->lb_buf;
4292
4293         CDEBUG(D_INFO,
4294                "set default stripe_count # %u stripe_offset %d hash %u\n",
4295               le32_to_cpu(lum->lum_stripe_count),
4296               (int)le32_to_cpu(lum->lum_stripe_offset),
4297               le32_to_cpu(lum->lum_hash_type));
4298
4299         if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)),
4300                                  le32_to_cpu(lum->lum_stripe_offset)) &&
4301             le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) {
4302                 rc = lod_xattr_del_internal(env, dt, name, th);
4303                 if (rc == -ENODATA)
4304                         rc = 0;
4305         } else {
4306                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4307                 if (rc != 0)
4308                         RETURN(rc);
4309         }
4310
4311         RETURN(rc);
4312 }
4313
4314 /**
4315  * Turn directory into a striped directory.
4316  *
4317  * During replay the client sends the striping created before MDT
4318  * failure, then the layer above LOD sends this defined striping
4319  * using ->do_xattr_set(), so LOD uses this method to replay creation
4320  * of the stripes. Notice the original information for the striping
4321  * (#stripes, FIDs, etc) was transferred in declare path.
4322  *
4323  * \param[in] env       execution environment
4324  * \param[in] dt        the striped object
4325  * \param[in] buf       buf lmv_user_md for create, or lmv_mds_md for replay
4326  * \param[in] name      not used currently
4327  * \param[in] fl        xattr flag (see OSD API description)
4328  * \param[in] th        transaction handle
4329  *
4330  * \retval              0 on success
4331  * \retval              negative if failed
4332  */
4333 static int lod_xattr_set_lmv(const struct lu_env *env, struct dt_object *dt,
4334                              const struct lu_buf *buf, const char *name,
4335                              int fl, struct thandle *th)
4336 {
4337         struct lod_object *lo = lod_dt_obj(dt);
4338         struct lod_thread_info *info = lod_env_info(env);
4339         struct lu_attr *attr = &info->lti_attr;
4340         struct dt_object_format *dof = &info->lti_format;
4341         struct lu_buf lmv_buf;
4342         struct lu_buf slave_lmv_buf;
4343         struct lmv_user_md *lum = buf->lb_buf;
4344         struct lmv_mds_md_v1 *lmm;
4345         struct lmv_mds_md_v1 *slave_lmm = NULL;
4346         struct dt_insert_rec *rec = &info->lti_dt_rec;
4347         int i;
4348         int rc;
4349
4350         ENTRY;
4351         /* lum is used to know whether it's replay */
4352         LASSERT(lum);
4353         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
4354                 RETURN(-ENOTDIR);
4355
4356         /* The stripes are supposed to be allocated in declare phase,
4357          * if there are no stripes being allocated, it will skip */
4358         if (lo->ldo_dir_stripe_count == 0) {
4359                 if (lo->ldo_is_foreign) {
4360                         rc = lod_sub_xattr_set(env, dt_object_child(dt), buf,
4361                                                XATTR_NAME_LMV, fl, th);
4362                         if (rc != 0)
4363                                 RETURN(rc);
4364                 }
4365                 RETURN(0);
4366         }
4367
4368         rc = dt_attr_get(env, dt_object_child(dt), attr);
4369         if (rc != 0)
4370                 RETURN(rc);
4371
4372         attr->la_valid &= LA_ATIME | LA_MTIME | LA_CTIME | LA_FLAGS |
4373                           LA_MODE | LA_UID | LA_GID | LA_TYPE | LA_PROJID;
4374         dof->dof_type = DFT_DIR;
4375
4376         rc = lod_prep_lmv_md(env, dt, &lmv_buf);
4377         if (rc != 0)
4378                 RETURN(rc);
4379         lmm = lmv_buf.lb_buf;
4380
4381         OBD_ALLOC_PTR(slave_lmm);
4382         if (slave_lmm == NULL)
4383                 RETURN(-ENOMEM);
4384
4385         lod_prep_slave_lmv_md(slave_lmm, lmm);
4386         slave_lmv_buf.lb_buf = slave_lmm;
4387         slave_lmv_buf.lb_len = sizeof(*slave_lmm);
4388
4389         rec->rec_type = S_IFDIR;
4390         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
4391                 struct dt_object *dto = lo->ldo_stripe[i];
4392                 char *stripe_name = info->lti_key;
4393                 struct lu_name *sname;
4394                 struct linkea_data ldata = { NULL };
4395                 struct lu_buf linkea_buf;
4396                 bool stripe_created = false;
4397
4398                 /* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
4399                 if (!dto)
4400                         continue;
4401
4402                 /* fail a remote stripe creation */
4403                 if (i && CFS_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_CREATE))
4404                         continue;
4405
4406                 /* if it's replay by client request, and stripe exists on remote
4407                  * MDT, it means mkdir was partially executed: stripe was
4408                  * created on remote MDT successfully, but target not in last
4409                  * run.
4410                  */
4411                 if (unlikely((le32_to_cpu(lum->lum_magic) == LMV_MAGIC_V1) &&
4412                              dt_object_exists(dto) && dt_object_remote(dto)))
4413                         stripe_created = true;
4414
4415                 /* don't create stripe if:
4416                  * 1. it's source stripe of migrating directory
4417                  * 2. it's existed stripe of splitting directory
4418                  */
4419                 if ((lod_is_migrating(lo) && i >= lo->ldo_dir_migrate_offset) ||
4420                     (lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
4421                         if (!dt_object_exists(dto))
4422                                 GOTO(out, rc = -EINVAL);
4423                 } else if (!stripe_created) {
4424                         dt_write_lock(env, dto, DT_TGT_CHILD);
4425                         rc = lod_sub_create(env, dto, attr, NULL, dof, th);
4426                         if (rc != 0) {
4427                                 dt_write_unlock(env, dto);
4428                                 GOTO(out, rc);
4429                         }
4430
4431                         rc = lod_sub_ref_add(env, dto, th);
4432                         dt_write_unlock(env, dto);
4433                         if (rc != 0)
4434                                 GOTO(out, rc);
4435
4436                         rec->rec_fid = lu_object_fid(&dto->do_lu);
4437                         rc = lod_sub_insert(env, dto,
4438                                             (const struct dt_rec *)rec,
4439                                             (const struct dt_key *)dot, th);
4440                         if (rc != 0)
4441                                 GOTO(out, rc);
4442                 }
4443
4444                 if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
4445                     cfs_fail_val != i) {
4446                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
4447                             cfs_fail_val == i)
4448                                 slave_lmm->lmv_master_mdt_index =
4449                                                         cpu_to_le32(i + 1);
4450                         else
4451                                 slave_lmm->lmv_master_mdt_index =
4452                                                         cpu_to_le32(i);
4453
4454                         rc = lod_sub_xattr_set(env, dto, &slave_lmv_buf,
4455                                                XATTR_NAME_LMV, 0, th);
4456                         if (rc != 0)
4457                                 GOTO(out, rc);
4458                 }
4459
4460                 /* don't insert stripe if it's existed stripe of splitting
4461                  * directory (this directory is striped).
4462                  * NB, plain directory will insert itself as the first
4463                  * stripe in target.
4464                  */
4465                 if (lod_is_splitting(lo) && lo->ldo_dir_split_offset > 1 &&
4466                     lo->ldo_dir_split_offset > i)
4467                         continue;
4468
4469                 if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
4470                     cfs_fail_val == i)
4471                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
4472                                  PFID(lu_object_fid(&dto->do_lu)), i + 1);
4473                 else
4474                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
4475                                  PFID(lu_object_fid(&dto->do_lu)), i);
4476
4477                 if (!stripe_created) {
4478                         rec->rec_fid = lu_object_fid(&dt->do_lu);
4479                         rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
4480                                             (const struct dt_key *)dotdot, th);
4481                         if (rc != 0)
4482                                 GOTO(out, rc);
4483
4484                         sname = lod_name_get(env, stripe_name,
4485                                              strlen(stripe_name));
4486                         rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
4487                                               sname, lu_object_fid(&dt->do_lu));
4488                         if (rc != 0)
4489                                 GOTO(out, rc);
4490
4491                         linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
4492                         linkea_buf.lb_len = ldata.ld_leh->leh_len;
4493                         rc = lod_sub_xattr_set(env, dto, &linkea_buf,
4494                                                XATTR_NAME_LINK, 0, th);
4495                         if (rc != 0)
4496                                 GOTO(out, rc);
4497                 }
4498
4499                 rec->rec_fid = lu_object_fid(&dto->do_lu);
4500                 rc = lod_sub_insert(env, dt_object_child(dt),
4501                                     (const struct dt_rec *)rec,
4502                                     (const struct dt_key *)stripe_name, th);
4503                 if (rc != 0)
4504                         GOTO(out, rc);
4505
4506                 rc = lod_sub_ref_add(env, dt_object_child(dt), th);
4507                 if (rc != 0)
4508                         GOTO(out, rc);
4509         }
4510
4511         if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MASTER_LMV))
4512                 rc = lod_sub_xattr_set(env, dt_object_child(dt),
4513                                        &lmv_buf, XATTR_NAME_LMV, fl, th);
4514 out:
4515         if (slave_lmm != NULL)
4516                 OBD_FREE_PTR(slave_lmm);
4517
4518         RETURN(rc);
4519 }
4520
4521 /**
4522  * Helper function to declare/execute creation of a striped directory
4523  *
4524  * Called in declare/create object path, prepare striping for a directory
4525  * and prepare defaults data striping for the objects to be created in
4526  * that directory. Notice the function calls "declaration" or "execution"
4527  * methods depending on \a declare param. This is a consequence of the
4528  * current approach while we don't have natural distributed transactions:
4529  * we basically execute non-local updates in the declare phase. So, the
4530  * arguments for the both phases are the same and this is the reason for
4531  * this function to exist.
4532  *
4533  * \param[in] env       execution environment
4534  * \param[in] dt        object
4535  * \param[in] attr      attributes the stripes will be created with
4536  * \param[in] lmu       lmv_user_md if MDT indices are specified
4537  * \param[in] dof       format of stripes (see OSD API description)
4538  * \param[in] th        transaction handle
4539  * \param[in] declare   where to call "declare" or "execute" methods
4540  *
4541  * \retval              0 on success
4542  * \retval              negative if failed
4543  */
4544 static int lod_dir_striping_create_internal(const struct lu_env *env,
4545                                             struct dt_object *dt,
4546                                             struct lu_attr *attr,
4547                                             const struct lu_buf *lmu,
4548                                             struct dt_object_format *dof,
4549                                             struct thandle *th,
4550                                             bool declare)
4551 {
4552         struct lod_thread_info *info = lod_env_info(env);
4553         struct lod_object *lo = lod_dt_obj(dt);
4554         const struct lod_default_striping *lds = lo->ldo_def_striping;
4555         int rc;
4556         ENTRY;
4557
4558         LASSERT(ergo(lds != NULL,
4559                      lds->lds_def_striping_set ||
4560                      lds->lds_dir_def_striping_set));
4561         LASSERT(lmu);
4562
4563         if (!LMVEA_DELETE_VALUES(lo->ldo_dir_stripe_count,
4564                                  lo->ldo_dir_stripe_offset)) {
4565                 if (!lmu->lb_buf) {
4566                         /* mkdir by default LMV */
4567                         struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4568                         int stripe_count = lo->ldo_dir_stripe_count;
4569
4570                         if (info->lti_ea_store_size < sizeof(*v1)) {
4571                                 rc = lod_ea_store_resize(info, sizeof(*v1));
4572                                 if (rc != 0)
4573                                         RETURN(rc);
4574                                 v1 = info->lti_ea_store;
4575                         }
4576
4577                         memset(v1, 0, sizeof(*v1));
4578                         v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4579                         v1->lum_stripe_count = cpu_to_le32(stripe_count);
4580                         v1->lum_stripe_offset =
4581                                         cpu_to_le32(lo->ldo_dir_stripe_offset);
4582
4583                         info->lti_buf.lb_buf = v1;
4584                         info->lti_buf.lb_len = sizeof(*v1);
4585                         lmu = &info->lti_buf;
4586                 }
4587
4588                 if (declare)
4589                         rc = lod_declare_xattr_set_lmv(env, dt, attr, lmu, dof,
4590                                                        th);
4591                 else
4592                         rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
4593                                                th);
4594                 if (rc != 0)
4595                         RETURN(rc);
4596         } else if (lmu->lb_buf) {
4597                 /* foreign LMV EA case */
4598                 if (declare) {
4599                         struct lmv_foreign_md *lfm = lmu->lb_buf;
4600
4601                         if (lfm->lfm_magic == LMV_MAGIC_FOREIGN)
4602                                 rc = lod_declare_xattr_set_lmv(env, dt, attr,
4603                                                                lmu, dof, th);
4604                 } else if (lo->ldo_is_foreign) {
4605                         LASSERT(lo->ldo_foreign_lmv != NULL &&
4606                                 lo->ldo_foreign_lmv_size > 0);
4607                         info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
4608                         info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
4609                         lmu = &info->lti_buf;
4610                         rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
4611                                                th);
4612                 }
4613         }
4614
4615         /* Transfer default LMV striping from the parent */
4616         if (lds != NULL && lds->lds_dir_def_striping_set &&
4617             lds->lds_dir_def_max_inherit != LMV_INHERIT_END &&
4618             lds->lds_dir_def_max_inherit != LMV_INHERIT_NONE &&
4619             !(LMVEA_DELETE_VALUES(lds->lds_dir_def_stripe_count,
4620                                  lds->lds_dir_def_stripe_offset) &&
4621               le32_to_cpu(lds->lds_dir_def_hash_type) !=
4622               LMV_HASH_TYPE_UNKNOWN)) {
4623                 struct lmv_user_md_v1 *v1 = info->lti_ea_store;
4624
4625                 if (info->lti_ea_store_size < sizeof(*v1)) {
4626                         rc = lod_ea_store_resize(info, sizeof(*v1));
4627                         if (rc != 0)
4628                                 RETURN(rc);
4629                         v1 = info->lti_ea_store;
4630                 }
4631
4632                 memset(v1, 0, sizeof(*v1));
4633                 v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
4634                 v1->lum_stripe_count =
4635                         cpu_to_le32(lds->lds_dir_def_stripe_count);
4636                 v1->lum_stripe_offset =
4637                         cpu_to_le32(lds->lds_dir_def_stripe_offset);
4638                 v1->lum_hash_type =
4639                         cpu_to_le32(lds->lds_dir_def_hash_type);
4640                 v1->lum_max_inherit =
4641                         lmv_inherit_next(lds->lds_dir_def_max_inherit);
4642                 v1->lum_max_inherit_rr =
4643                         lmv_inherit_rr_next(lds->lds_dir_def_max_inherit_rr);
4644
4645                 info->lti_buf.lb_buf = v1;
4646                 info->lti_buf.lb_len = sizeof(*v1);
4647                 if (declare)
4648                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4649                                                        XATTR_NAME_DEFAULT_LMV,
4650                                                        0, th);
4651                 else
4652                         rc = lod_xattr_set_default_lmv_on_dir(env, dt,
4653                                                   &info->lti_buf,
4654                                                   XATTR_NAME_DEFAULT_LMV, 0,
4655                                                   th);
4656                 if (rc != 0)
4657                         RETURN(rc);
4658         }
4659
4660         /* Transfer default LOV striping from the parent */
4661         if (lds != NULL && lds->lds_def_striping_set &&
4662             lds->lds_def_comp_cnt != 0) {
4663                 struct lov_mds_md *lmm;
4664                 int lmm_size = lod_comp_md_size(lo, true);
4665
4666                 if (info->lti_ea_store_size < lmm_size) {
4667                         rc = lod_ea_store_resize(info, lmm_size);
4668                         if (rc != 0)
4669                                 RETURN(rc);
4670                 }
4671                 lmm = info->lti_ea_store;
4672
4673                 rc = lod_generate_lovea(env, lo, lmm, &lmm_size, true);
4674                 if (rc != 0)
4675                         RETURN(rc);
4676
4677                 info->lti_buf.lb_buf = lmm;
4678                 info->lti_buf.lb_len = lmm_size;
4679
4680                 if (declare)
4681                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
4682                                                        XATTR_NAME_LOV, 0, th);
4683                 else
4684                         rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4685                                                       XATTR_NAME_LOV, 0, th);
4686                 if (rc != 0)
4687                         RETURN(rc);
4688         }
4689
4690         /* ldo_def_striping is not allocated, clear after use, in case directory
4691          * layout is changed later.
4692          */
4693         if (!declare)
4694                 lo->ldo_def_striping = NULL;
4695
4696         RETURN(0);
4697 }
4698
/**
 * Declare phase of striped directory creation.
 *
 * Forwards all arguments to lod_dir_striping_create_internal() with the
 * "declare" flag set.
 *
 * \retval		0 on success
 * \retval		negative if failed
 */
static int lod_declare_dir_striping_create(const struct lu_env *env,
					   struct dt_object *dt,
					   struct lu_attr *attr,
					   struct lu_buf *lmu,
					   struct dt_object_format *dof,
					   struct thandle *th)
{
	const bool declare = true;

	return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
						declare);
}
4709
/**
 * Execute phase of striped directory creation.
 *
 * Forwards all arguments to lod_dir_striping_create_internal() with the
 * "declare" flag cleared.
 *
 * \retval		0 on success
 * \retval		negative if failed
 */
static int lod_dir_striping_create(const struct lu_env *env,
				   struct dt_object *dt,
				   struct lu_attr *attr,
				   const struct lu_buf *lmu,
				   struct dt_object_format *dof,
				   struct thandle *th)
{
	const bool declare = false;

	return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
						declare);
}
4720
4721 /**
4722  * Make LOV EA for striped object.
4723  *
4724  * Generate striping information and store it in the LOV EA of the given
4725  * object. The caller must ensure nobody else is calling the function
4726  * against the object concurrently. The transaction must be started.
4727  * FLDB service must be running as well; it's used to map FID to the target,
4728  * which is stored in LOV EA.
4729  *
4730  * \param[in] env               execution environment for this thread
4731  * \param[in] lo                LOD object
4732  * \param[in] th                transaction handle
4733  *
4734  * \retval                      0 if LOV EA is stored successfully
4735  * \retval                      negative error number on failure
4736  */
4737 static int lod_generate_and_set_lovea(const struct lu_env *env,
4738                                       struct lod_object *lo,
4739                                       struct thandle *th)
4740 {
4741         struct lod_thread_info  *info = lod_env_info(env);
4742         struct dt_object        *next = dt_object_child(&lo->ldo_obj);
4743         struct lov_mds_md_v1    *lmm;
4744         int                      rc, lmm_size;
4745         ENTRY;
4746
4747         LASSERT(lo);
4748
4749         if (lo->ldo_comp_cnt == 0 && !lo->ldo_is_foreign) {
4750                 lod_striping_free_nolock(env, lo);
4751                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
4752                 RETURN(rc);
4753         }
4754
4755         lmm_size = lod_comp_md_size(lo, false);
4756         if (info->lti_ea_store_size < lmm_size) {
4757                 rc = lod_ea_store_resize(info, lmm_size);
4758                 if (rc)
4759                         RETURN(rc);
4760         }
4761         lmm = info->lti_ea_store;
4762
4763         rc = lod_generate_lovea(env, lo, lmm, &lmm_size, false);
4764         if (rc)
4765                 RETURN(rc);
4766
4767         info->lti_buf.lb_buf = lmm;
4768         info->lti_buf.lb_len = lmm_size;
4769         rc = lod_sub_xattr_set(env, next, &info->lti_buf,
4770                                XATTR_NAME_LOV, 0, th);
4771         RETURN(rc);
4772 }
4773
4774 static __u32 lod_gen_component_id(struct lod_object *lo,
4775                                   int mirror_id, int comp_idx);
4776
4777 /**
4778  * Repeat an existing component
4779  *
4780  * Creates a new layout by replicating an existing component.  Uses striping
4781  * policy from previous component as a template for the striping for the new
4782  * new component.
4783  *
4784  * New component starts with zero length, will be extended (or removed) before
4785  * returning layout to client.
4786  *
4787  * NB: Reallocates layout components array (lo->ldo_comp_entries), invalidating
4788  * any pre-existing pointers to components.  Handle with care.
4789  *
4790  * \param[in] env       execution environment for this thread
4791  * \param[in,out] lo    object to update the layout of
4792  * \param[in] index     index of component to copy
4793  *
4794  * \retval      0 on success
4795  * \retval      negative errno on error
4796  */
4797 static int lod_layout_repeat_comp(const struct lu_env *env,
4798                                   struct lod_object *lo, int index)
4799 {
4800         struct lod_layout_component *lod_comp;
4801         struct lod_layout_component *new_comp = NULL;
4802         struct lod_layout_component *comp_array;
4803         int rc = 0, i, new_cnt = lo->ldo_comp_cnt + 1;
4804         __u16 mirror_id;
4805         int offset = 0;
4806         ENTRY;
4807
4808         lod_comp = &lo->ldo_comp_entries[index];
4809         LASSERT(lod_comp_inited(lod_comp) && lod_comp->llc_id != LCME_ID_INVAL);
4810
4811         CDEBUG(D_LAYOUT, "repeating component %d\n", index);
4812
4813         OBD_ALLOC_PTR_ARRAY(comp_array, new_cnt);
4814         if (comp_array == NULL)
4815                 GOTO(out, rc = -ENOMEM);
4816
4817         for (i = 0; i < lo->ldo_comp_cnt; i++) {
4818                 memcpy(&comp_array[i + offset], &lo->ldo_comp_entries[i],
4819                        sizeof(*comp_array));
4820
4821                 /* Duplicate this component in to the next slot */
4822                 if (i == index) {
4823                         new_comp = &comp_array[i + 1];
4824                         memcpy(&comp_array[i + 1], &lo->ldo_comp_entries[i],
4825                                sizeof(*comp_array));
4826                         /* We must now skip this new component when copying */
4827                         offset = 1;
4828                 }
4829         }
4830
4831         /* Set up copied component */
4832         new_comp->llc_flags &= ~LCME_FL_INIT;
4833         new_comp->llc_stripe = NULL;
4834         new_comp->llc_stripes_allocated = 0;
4835         new_comp->llc_ost_indices = NULL;
4836         new_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
4837         /* for uninstantiated components, layout gen stores default stripe
4838          * offset */
4839         new_comp->llc_layout_gen = lod_comp->llc_stripe_offset;
4840         /* This makes the repeated component zero-length, placed at the end of
4841          * the preceding component */
4842         new_comp->llc_extent.e_start = new_comp->llc_extent.e_end;
4843         new_comp->llc_timestamp = lod_comp->llc_timestamp;
4844         new_comp->llc_pool = NULL;
4845
4846         rc = lod_set_pool(&new_comp->llc_pool, lod_comp->llc_pool);
4847         if (rc)
4848                 GOTO(out, rc);
4849
4850         if (new_comp->llc_ostlist.op_array) {
4851                 __u32 *op_array = NULL;
4852
4853                 OBD_ALLOC(op_array, new_comp->llc_ostlist.op_size);
4854                 if (!op_array)
4855                         GOTO(out, rc = -ENOMEM);
4856                 memcpy(op_array, &new_comp->llc_ostlist.op_array,
4857                        new_comp->llc_ostlist.op_size);
4858                 new_comp->llc_ostlist.op_array = op_array;
4859         }
4860
4861         OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
4862         lo->ldo_comp_entries = comp_array;
4863         lo->ldo_comp_cnt = new_cnt;
4864
4865         /* Generate an id for the new component */
4866         mirror_id = mirror_id_of(new_comp->llc_id);
4867         new_comp->llc_id = LCME_ID_INVAL;
4868         new_comp->llc_id = lod_gen_component_id(lo, mirror_id, index + 1);
4869         if (new_comp->llc_id == LCME_ID_INVAL)
4870                 GOTO(out, rc = -ERANGE);
4871
4872         EXIT;
4873 out:
4874         if (rc)
4875                 OBD_FREE_PTR_ARRAY(comp_array, new_cnt);
4876
4877         return rc;
4878 }
4879
4880 static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
4881 {
4882         ENTRY;
4883
4884         /* clear memory region that will be used for layout change */
4885         memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
4886         info->lti_count = 0;
4887
4888         if (info->lti_comp_size >= comp_cnt)
4889                 RETURN(0);
4890
4891         if (info->lti_comp_size > 0) {
4892                 OBD_FREE_PTR_ARRAY(info->lti_comp_idx, info->lti_comp_size);
4893                 info->lti_comp_size = 0;
4894         }
4895
4896         OBD_ALLOC_PTR_ARRAY(info->lti_comp_idx, comp_cnt);
4897         if (!info->lti_comp_idx)
4898                 RETURN(-ENOMEM);
4899
4900         info->lti_comp_size = comp_cnt;
4901         RETURN(0);
4902 }
4903
4904 /**
4905  * Prepare new layout minus deleted components
4906  *
4907  * Removes components marked for deletion (LCME_ID_INVAL) by copying to a new
4908  * layout and skipping those components.  Removes stripe objects if any exist.
4909  *
4910  * NB:
4911  * Reallocates layout components array (lo->ldo_comp_entries), invalidating
4912  * any pre-existing pointers to components.
4913  *
4914  * Caller is responsible for updating mirror end (ldo_mirror[].lme_end).
4915  *
4916  * \param[in] env       execution environment for this thread
4917  * \param[in,out] lo    object to update the layout of
4918  * \param[in] th        transaction handle for this operation
4919  *
4920  * \retval      # of components deleted
4921  * \retval      negative errno on error
4922  */
4923 static int lod_layout_del_prep_layout(const struct lu_env *env,
4924                                       struct lod_object *lo,
4925                                       struct thandle *th)
4926 {
4927         struct lod_layout_component     *lod_comp;
4928         struct lod_thread_info  *info = lod_env_info(env);
4929         int rc = 0, i, j, deleted = 0;
4930
4931         ENTRY;
4932
4933         LASSERT(lo->ldo_is_composite);
4934         LASSERT(lo->ldo_comp_cnt > 0 && lo->ldo_comp_entries != NULL);
4935
4936         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
4937         if (rc)
4938                 RETURN(rc);
4939
4940         for (i = 0; i < lo->ldo_comp_cnt; i++) {
4941                 lod_comp = &lo->ldo_comp_entries[i];
4942
4943                 if (lod_comp->llc_id != LCME_ID_INVAL) {
4944                         /* Build array of things to keep */
4945                         info->lti_comp_idx[info->lti_count++] = i;
4946                         continue;
4947                 }
4948
4949                 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN)
4950                         continue;
4951
4952                 lod_obj_set_pool(lo, i, NULL);
4953                 if (lod_comp->llc_ostlist.op_array) {
4954                         OBD_FREE(lod_comp->llc_ostlist.op_array,
4955                                  lod_comp->llc_ostlist.op_size);
4956                         lod_comp->llc_ostlist.op_array = NULL;
4957                         lod_comp->llc_ostlist.op_size = 0;
4958                 }
4959
4960                 deleted++;
4961                 CDEBUG(D_LAYOUT, "deleting comp %d, left %d\n", i,
4962                        lo->ldo_comp_cnt - deleted);
4963
4964                 /* No striping info for this component */
4965                 if (lod_comp->llc_stripe == NULL)
4966                         continue;
4967
4968                 LASSERT(lod_comp->llc_stripe_count > 0);
4969                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
4970                         struct dt_object *obj = lod_comp->llc_stripe[j];
4971
4972                         if (obj == NULL)
4973                                 continue;
4974
4975                         /* components which are not init have no sub objects
4976                          * to destroy */
4977                         if (lod_comp_inited(lod_comp)) {
4978                                 rc = lod_sub_destroy(env, obj, th);
4979                                 if (rc)
4980                                         GOTO(out, rc);
4981                         }
4982
4983                         lu_object_put(env, &obj->do_lu);
4984                         lod_comp->llc_stripe[j] = NULL;
4985                 }
4986                 OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
4987                                    lod_comp->llc_stripes_allocated);
4988                 lod_comp->llc_stripe = NULL;
4989                 OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
4990                                    lod_comp->llc_stripes_allocated);
4991                 lod_comp->llc_ost_indices = NULL;
4992                 lod_comp->llc_stripes_allocated = 0;
4993         }
4994
4995         /* info->lti_count has the amount of left components */
4996         LASSERTF(info->lti_count >= 0 && info->lti_count < lo->ldo_comp_cnt,
4997                  "left = %d, lo->ldo_comp_cnt %d\n", (int)info->lti_count,
4998                  (int)lo->ldo_comp_cnt);
4999
5000         if (info->lti_count > 0) {
5001                 struct lod_layout_component *comp_array;
5002
5003                 OBD_ALLOC_PTR_ARRAY(comp_array, info->lti_count);
5004                 if (comp_array == NULL)
5005                         GOTO(out, rc = -ENOMEM);
5006
5007                 for (i = 0; i < info->lti_count; i++) {
5008                         memcpy(&comp_array[i],
5009                                &lo->ldo_comp_entries[info->lti_comp_idx[i]],
5010                                sizeof(*comp_array));
5011                 }
5012
5013                 OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
5014                 lo->ldo_comp_entries = comp_array;
5015                 lo->ldo_comp_cnt = info->lti_count;
5016         } else {
5017                 lod_free_comp_entries(lo);
5018         }
5019
5020         EXIT;
5021 out:
5022         return rc ? rc : deleted;
5023 }
5024
5025 /**
5026  * Delete layout component(s)
5027  *
5028  * This function sets up the layout data in the env and does the setattrs
5029  * required to write out the new layout.  The layout itself is modified in
5030  * lod_layout_del_prep_layout.
5031  *
5032  * \param[in] env       execution environment for this thread
5033  * \param[in] dt        object
5034  * \param[in] th        transaction handle
5035  *
5036  * \retval      0 on success
5037  * \retval      negative error number on failure
5038  */
5039 static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
5040                           struct thandle *th)
5041 {
5042         struct lod_object *lo = lod_dt_obj(dt);
5043         struct dt_object *next = dt_object_child(dt);
5044         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
5045         int rc;
5046
5047         LASSERT(lo->ldo_mirror_count == 1);
5048
5049         mutex_lock(&lo->ldo_layout_mutex);
5050
5051         rc = lod_layout_del_prep_layout(env, lo, th);
5052         if (rc < 0)
5053                 GOTO(out, rc);
5054
5055         /* Only do this if we didn't delete all components */
5056         if (lo->ldo_comp_cnt > 0) {
5057                 lo->ldo_mirrors[0].lme_end = lo->ldo_comp_cnt - 1;
5058                 lod_obj_inc_layout_gen(lo);
5059         }
5060
5061         LASSERT(dt_object_exists(dt));
5062         rc = dt_attr_get(env, next, attr);
5063         if (rc)
5064                 GOTO(out, rc);
5065
5066         if (attr->la_size > 0) {
5067                 attr->la_size = 0;
5068                 attr->la_valid = LA_SIZE;
5069                 rc = lod_sub_attr_set(env, next, attr, th);
5070                 if (rc)
5071                         GOTO(out, rc);
5072         }
5073
5074         rc = lod_generate_and_set_lovea(env, lo, th);
5075         EXIT;
5076 out:
5077         if (rc)
5078                 lod_striping_free_nolock(env, lo);
5079
5080         mutex_unlock(&lo->ldo_layout_mutex);
5081
5082         return rc;
5083 }
5084
5085
5086 /**
5087  * Implementation of dt_object_operations::do_xattr_set.
5088  *
5089  * Sets specified extended attribute on the object. Three types of EAs are
5090  * special:
5091  *   LOV EA - stores striping for a regular file or default striping (when set
5092  *            on a directory)
5093  *   LMV EA - stores a marker for the striped directories
5094  *   DMV EA - stores default directory striping
5095  *
5096  * When striping is applied to a non-striped existing object (this is called
5097  * late striping), then LOD notices the caller wants to turn the object into a
5098  * striped one. The stripe objects are created and appropriate EA is set:
5099  * LOV EA storing all the stripes directly or LMV EA storing just a small header
5100  * with striping configuration.
5101  *
5102  * \see dt_object_operations::do_xattr_set() in the API description for details.
5103  */
5104 static int lod_xattr_set(const struct lu_env *env,
5105                          struct dt_object *dt, const struct lu_buf *buf,
5106                          const char *name, int fl, struct thandle *th)
5107 {
5108         struct lod_thread_info *info = lod_env_info(env);
5109         struct dt_object *next = dt_object_child(dt);
5110         struct lu_attr *layout_attr = &info->lti_layout_attr;
5111         struct lod_object *lo = lod_dt_obj(dt);
5112         struct lod_obj_stripe_cb_data data = { {0} };
5113         int rc = 0;
5114
5115         ENTRY;
5116
5117         if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5118             !strcmp(name, XATTR_NAME_LMV)) {
5119                 switch (fl) {
5120                 case LU_XATTR_CREATE:
5121                         rc = lod_dir_striping_create(env, dt, NULL, buf, NULL,
5122                                                      th);
5123                         break;
5124                 case 0:
5125                 case LU_XATTR_REPLACE:
5126                         rc = lod_dir_layout_set(env, dt, buf, fl, th);
5127                         break;
5128                 default:
5129                         LBUG();
5130                 }
5131
5132                 RETURN(rc);
5133         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5134                    strcmp(name, XATTR_NAME_LOV) == 0) {
5135                 rc = lod_xattr_set_default_lov_on_dir(env, dt, buf, name, fl,
5136                                                       th);
5137                 RETURN(rc);
5138         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
5139                    strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
5140                 /* default LMVEA */
5141                 rc = lod_xattr_set_default_lmv_on_dir(env, dt, buf, name, fl,
5142                                                       th);
5143                 RETURN(rc);
5144         } else if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
5145                    (strcmp(name, XATTR_NAME_LOV) == 0 ||
5146                     strcmp(name, XATTR_LUSTRE_LOV) == 0 ||
5147                     allowed_lustre_lov(name))) {
5148                 /* layout has been changed by others in the transaction */
5149                 rc = lod_check_layout_gen_intrans(info, lo);
5150                 if (rc > 0) {
5151                         CDEBUG(D_LAYOUT,
5152                                "%s: obj "DFID" gen changed from %d to %d in transaction, retry the transaction\n",
5153                                dt->do_lu.lo_dev->ld_obd->obd_name,
5154                                PFID(lu_object_fid(&dt->do_lu)),
5155                                info->lti_gen[rc - 1], lo->ldo_layout_gen);
5156                         RETURN(-EAGAIN);
5157                 }
5158
5159                 /* in case of lov EA swap, just set it
5160                  * if not, it is a replay so check striping match what we
5161                  * already have during req replay, declare_xattr_set()
5162                  * defines striping, then create() does the work */
5163                 if (fl & LU_XATTR_REPLACE) {
5164                         /* free stripes, then update disk */
5165                         lod_striping_free(env, lod_dt_obj(dt));
5166
5167                         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
5168                 } else if (fl & LU_XATTR_SPLIT) {
5169                         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
5170                         if (rc)
5171                                 RETURN(rc);
5172
5173                         rc = lod_striping_reload(env, lo, buf, LVF_ALL_STALE);
5174                         if (rc)
5175                                 RETURN(rc);
5176
5177                         if (lo->ldo_mirror_count > 1 &&
5178                             layout_attr->la_valid & LA_LAYOUT_VERSION) {
5179                                 /* mirror split */
5180                                 layout_attr->la_layout_version =
5181                                                 lo->ldo_layout_gen;
5182                                 data.locd_attr = layout_attr;
5183                                 data.locd_declare = false;
5184                                 data.locd_stripe_cb =
5185                                                 lod_obj_stripe_attr_set_cb;
5186                                 rc = lod_obj_for_each_stripe(env, lo, th,
5187                                                              &data);
5188                                 if (rc)
5189                                         RETURN(rc);
5190                         }
5191                 } else if (fl & LU_XATTR_PURGE) {
5192                         rc = lod_layout_purge(env, dt, buf, th);
5193                 } else if (dt_object_remote(dt)) {
5194                         /* This only happens during migration, see
5195                          * mdd_migrate_create(), in which Master MDT will
5196                          * create a remote target object, and only set
5197                          * (migrating) stripe EA on the remote object,
5198                          * and does not need creating each stripes. */
5199                         rc = lod_sub_xattr_set(env, next, buf, name,
5200                                                       fl, th);
5201                 } else if (strcmp(name, XATTR_LUSTRE_LOV".del") == 0) {
5202                         /* delete component(s) */
5203                         LASSERT(lod_dt_obj(dt)->ldo_comp_cached);
5204                         rc = lod_layout_del(env, dt, th);
5205                 } else {
5206                         /*
5207                          * When 'name' is XATTR_LUSTRE_LOV or XATTR_NAME_LOV,
5208                          * it's going to create file with specified
5209                          * component(s), the striping must have not being
5210                          * cached in this case;
5211                          *
5212                          * Otherwise, it's going to add/change component(s) to
5213                          * an existing file, the striping must have been cached
5214                          * in this case.
5215                          */
5216                         if (!(fl & LU_XATTR_MERGE))
5217                                 LASSERT(equi(!strcmp(name, XATTR_LUSTRE_LOV) ||
5218                                              !strcmp(name, XATTR_NAME_LOV),
5219                                         !lod_dt_obj(dt)->ldo_comp_cached));
5220
5221                         rc = lod_striped_create(env, dt, NULL, NULL, th);
5222                         if (rc)
5223                                 RETURN(rc);
5224
5225                         if (fl & LU_XATTR_MERGE && lo->ldo_mirror_count > 1 &&
5226                             layout_attr->la_valid & LA_LAYOUT_VERSION) {
5227                                 /* mirror merge exec phase */
5228                                 layout_attr->la_layout_version =
5229                                                 lo->ldo_layout_gen;
5230                                 data.locd_attr = layout_attr;
5231                                 data.locd_declare = false;
5232                                 data.locd_stripe_cb =
5233                                                 lod_obj_stripe_attr_set_cb;
5234                                 rc = lod_obj_for_each_stripe(env, lo, th,
5235                                                              &data);
5236                                 if (rc)
5237                                         RETURN(rc);
5238                         }
5239                 }
5240                 RETURN(rc);
5241         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
5242                 rc = lod_replace_parent_fid(env, dt, buf, th, false);
5243
5244                 RETURN(rc);
5245         }
5246
5247         /* then all other xattr */
5248         rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
5249
5250         RETURN(rc);
5251 }
5252
5253 /**
5254  * Implementation of dt_object_operations::do_declare_xattr_del.
5255  *
5256  * \see dt_object_operations::do_declare_xattr_del() in the API description
5257  * for details.
5258  */
5259 static int lod_declare_xattr_del(const struct lu_env *env,
5260                                  struct dt_object *dt, const char *name,
5261                                  struct thandle *th)
5262 {
5263         struct lod_object *lo = lod_dt_obj(dt);
5264         struct dt_object *next = dt_object_child(dt);
5265         int i;
5266         int rc;
5267         ENTRY;
5268
5269         rc = lod_sub_declare_xattr_del(env, next, name, th);
5270         if (rc != 0)
5271                 RETURN(rc);
5272
5273         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
5274                 RETURN(0);
5275
5276         /* NB: don't delete stripe LMV, because when we do this, normally we
5277          * will remove stripes, besides, if directory LMV is corrupt, this will
5278          * prevent deleting its LMV and fixing it (via LFSCK).
5279          */
5280         if (!strcmp(name, XATTR_NAME_LMV))
5281                 RETURN(0);
5282
5283         rc = lod_striping_load(env, lo);
5284         if (rc != 0)
5285                 RETURN(rc);
5286
5287         if (lo->ldo_dir_stripe_count == 0)
5288                 RETURN(0);
5289
5290         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5291                 struct dt_object *dto = lo->ldo_stripe[i];
5292
5293                 if (!dto)
5294                         continue;
5295
5296                 if (!dt_object_exists(dto))
5297                         continue;
5298
5299                 rc = lod_sub_declare_xattr_del(env, dto, name, th);
5300                 if (rc != 0)
5301                         break;
5302         }
5303
5304         RETURN(rc);
5305 }
5306
5307 /**
5308  * Implementation of dt_object_operations::do_xattr_del.
5309  *
5310  * If EA storing a regular striping is being deleted, then release
5311  * all the references to the stripe objects in core.
5312  *
5313  * \see dt_object_operations::do_xattr_del() in the API description for details.
5314  */
5315 static int lod_xattr_del(const struct lu_env *env, struct dt_object *dt,
5316                          const char *name, struct thandle *th)
5317 {
5318         int rc;
5319
5320         ENTRY;
5321
5322         if (!strcmp(name, XATTR_NAME_LOV) || !strcmp(name, XATTR_NAME_LMV))
5323                 lod_striping_free(env, lod_dt_obj(dt));
5324
5325         rc = lod_xattr_del_internal(env, dt, name, th);
5326
5327         RETURN(rc);
5328 }
5329
/**
 * Implementation of dt_object_operations::do_xattr_list.
 *
 * The request is forwarded unchanged to the child object below LOD.
 *
 * \see dt_object_operations::do_xattr_list() in the API description
 * for details.
 */
static int lod_xattr_list(const struct lu_env *env,
			  struct dt_object *dt, const struct lu_buf *buf)
{
	struct dt_object *next = dt_object_child(dt);

	return dt_xattr_list(env, next, buf);
}
5341
5342 static inline int lod_object_will_be_striped(int is_reg, const struct lu_fid *fid)
5343 {
5344         return (is_reg && fid_seq(fid) != FID_SEQ_LOCAL_FILE);
5345 }
5346
5347 /**
5348  * Copy OST list from layout provided by user.
5349  *
5350  * \param[in] lod_comp          layout_component to be filled
5351  * \param[in] v3                LOV EA V3 user data
5352  *
5353  * \retval              0 on success
5354  * \retval              negative if failed
5355  */
5356 int lod_comp_copy_ost_lists(struct lod_layout_component *lod_comp,
5357                             struct lov_user_md_v3 *v3)
5358 {
5359         int j;
5360
5361         ENTRY;
5362
5363         if (v3->lmm_stripe_offset == LOV_OFFSET_DEFAULT)
5364                 v3->lmm_stripe_offset = v3->lmm_objects[0].l_ost_idx;
5365
5366         if (lod_comp->llc_ostlist.op_array) {
5367                 if (lod_comp->llc_ostlist.op_size >=
5368                     v3->lmm_stripe_count * sizeof(__u32))  {
5369                         lod_comp->llc_ostlist.op_count =
5370                                         v3->lmm_stripe_count;
5371                         goto skip;
5372                 }
5373                 OBD_FREE(lod_comp->llc_ostlist.op_array,
5374                          lod_comp->llc_ostlist.op_size);
5375         }
5376
5377         /* copy ost list from lmm */
5378         lod_comp->llc_ostlist.op_count = v3->lmm_stripe_count;
5379         lod_comp->llc_ostlist.op_size = v3->lmm_stripe_count * sizeof(__u32);
5380         OBD_ALLOC(lod_comp->llc_ostlist.op_array,
5381                   lod_comp->llc_ostlist.op_size);
5382         if (!lod_comp->llc_ostlist.op_array)
5383                 RETURN(-ENOMEM);
5384 skip:
5385         for (j = 0; j < v3->lmm_stripe_count; j++) {
5386                 lod_comp->llc_ostlist.op_array[j] =
5387                         v3->lmm_objects[j].l_ost_idx;
5388         }
5389
5390         RETURN(0);
5391 }
5392
5393
5394 /**
5395  * Get default striping.
5396  *
5397  * \param[in] env               execution environment
5398  * \param[in] lo                object
5399  * \param[out] lds              default striping
5400  *
5401  * \retval              0 on success
5402  * \retval              negative if failed
5403  */
5404 static int lod_get_default_lov_striping(const struct lu_env *env,
5405                                         struct lod_object *lo,
5406                                         struct lod_default_striping *lds,
5407                                         struct dt_allocation_hint *dah)
5408 {
5409         struct lod_thread_info *info = lod_env_info(env);
5410         struct lov_user_md_v1 *v1 = NULL;
5411         struct lov_user_md_v3 *v3 = NULL;
5412         struct lov_comp_md_v1 *lcm = NULL;
5413         __u32 magic;
5414         int append_stripe_count = dah != NULL ? dah->dah_append_stripe_count : 0;
5415         const char *append_pool = (dah != NULL &&
5416                                    dah->dah_append_pool != NULL &&
5417                                    dah->dah_append_pool[0] != '\0') ?
5418                                   dah->dah_append_pool : NULL;
5419         __u16 entry_count = 1;
5420         __u16 mirror_count = 0;
5421         bool want_composite = false;
5422         int rc, i, j;
5423
5424         ENTRY;
5425
5426         lds->lds_def_striping_set = 0;
5427
5428         rc = lod_get_lov_ea(env, lo);
5429         if (rc < 0)
5430                 RETURN(rc);
5431
5432         if (rc < (typeof(rc))sizeof(struct lov_user_md))
5433                 RETURN(0);
5434
5435         magic = *(__u32 *)info->lti_ea_store;
5436         if (magic == __swab32(LOV_USER_MAGIC_V1)) {
5437                 lustre_swab_lov_user_md_v1(info->lti_ea_store);
5438         } else if (magic == __swab32(LOV_USER_MAGIC_V3)) {
5439                 lustre_swab_lov_user_md_v3(info->lti_ea_store);
5440         } else if (magic == __swab32(LOV_USER_MAGIC_SPECIFIC)) {
5441                 v3 = (struct lov_user_md_v3 *)info->lti_ea_store;
5442                 lustre_swab_lov_user_md_v3(v3);
5443                 lustre_swab_lov_user_md_objects(v3->lmm_objects,
5444                                                 v3->lmm_stripe_count);
5445         } else if (magic == __swab32(LOV_USER_MAGIC_COMP_V1) ||
5446                    magic == __swab32(LOV_USER_MAGIC_SEL)) {
5447                 lustre_swab_lov_comp_md_v1(info->lti_ea_store);
5448         }
5449
5450         switch (magic) {
5451         case LOV_MAGIC_V1:
5452         case LOV_MAGIC_V3:
5453         case LOV_USER_MAGIC_SPECIFIC:
5454                 v1 = info->lti_ea_store;
5455                 break;
5456         case LOV_MAGIC_COMP_V1:
5457         case LOV_MAGIC_SEL:
5458                 lcm = info->lti_ea_store;
5459                 entry_count = lcm->lcm_entry_count;
5460                 if (entry_count == 0)
5461                         RETURN(-EINVAL);
5462
5463                 mirror_count = lcm->lcm_mirror_count + 1;
5464                 want_composite = true;
5465                 break;
5466         default:
5467                 RETURN(-ENOTSUPP);
5468         }
5469
5470         if (append_stripe_count != 0 || append_pool != NULL) {
5471                 entry_count = 1;
5472                 mirror_count = 0;
5473                 want_composite = false;
5474         }
5475
5476         /* realloc default comp entries if necessary */
5477         rc = lod_def_striping_comp_resize(lds, entry_count);
5478         if (rc < 0)
5479                 RETURN(rc);
5480
5481         lds->lds_def_comp_cnt = entry_count;
5482         lds->lds_def_striping_is_composite = want_composite;
5483         lds->lds_def_mirror_cnt = mirror_count;
5484
5485         for (i = 0; i < entry_count; i++) {
5486                 struct lod_layout_component *llc = &lds->lds_def_comp_entries[i];
5487                 const char *pool;
5488
5489                 /*
5490                  * reset llc values, llc_stripes is always NULL in the
5491                  * default striping template, llc_pool will be reset
5492                  * later below using lod_set_pool().
5493                  *
5494                  * XXX At this point llc_pool may point to valid (!)
5495                  * kmalloced strings from previous RPCs.
5496                  */
5497                 memset(llc, 0, offsetof(typeof(*llc), llc_pool));
5498
5499                 if (lcm != NULL) {
5500                         v1 = (struct lov_user_md *)((char *)lcm +
5501                                                     lcm->lcm_entries[i].lcme_offset);
5502
5503                         if (want_composite) {
5504                                 llc->llc_extent = lcm->lcm_entries[i].lcme_extent;
5505                                 /* We only inherit certain flags from the layout */
5506                                 llc->llc_flags = lcm->lcm_entries[i].lcme_flags &
5507                                         LCME_TEMPLATE_FLAGS;
5508                         }
5509                 }
5510
5511                 CDEBUG(D_LAYOUT, DFID" magic = %#08x, pattern = %#x, stripe_count = %hu, stripe_size = %u, stripe_offset = %hu, append_pool = '%s', append_stripe_count = %d\n",
5512                        PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5513                        v1->lmm_magic,
5514                        v1->lmm_pattern,
5515                        v1->lmm_stripe_count,
5516                        v1->lmm_stripe_size,
5517                        v1->lmm_stripe_offset,
5518                        append_pool ?: "",
5519                        append_stripe_count);
5520
5521                 if (!lov_pattern_supported(v1->lmm_pattern) &&
5522                     !(v1->lmm_pattern & LOV_PATTERN_F_RELEASED)) {
5523                         lod_free_def_comp_entries(lds);
5524                         RETURN(-EINVAL);
5525                 }
5526
5527                 llc->llc_stripe_count = v1->lmm_stripe_count;
5528                 llc->llc_stripe_size = v1->lmm_stripe_size;
5529                 llc->llc_stripe_offset = v1->lmm_stripe_offset;
5530                 llc->llc_pattern = v1->lmm_pattern;
5531
5532                 if (append_stripe_count != 0 || append_pool != NULL)
5533                         llc->llc_pattern = LOV_PATTERN_RAID0;
5534
5535                 if (append_stripe_count != 0)
5536                         llc->llc_stripe_count = append_stripe_count;
5537
5538                 pool = NULL;
5539                 if (append_pool != NULL) {
5540                         pool = append_pool;
5541                 } else if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
5542                         /* XXX: sanity check here */
5543                         v3 = (struct lov_user_md_v3 *)v1;
5544                         if (v3->lmm_pool_name[0] != '\0')
5545                                 pool = v3->lmm_pool_name;
5546                 }
5547
5548                 lod_set_pool(&llc->llc_pool, pool);
5549
5550                 if (v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC &&
5551                     append_stripe_count == 0 &&
5552                     append_pool == NULL) {
5553                         v3 = (struct lov_user_md_v3 *)v1;
5554                         rc = lod_comp_copy_ost_lists(llc, v3);
5555                         if (rc)
5556                                 RETURN(rc);
5557                 } else if (llc->llc_ostlist.op_array &&
5558                            llc->llc_ostlist.op_count) {
5559                         for (j = 0; j < llc->llc_ostlist.op_count; j++)
5560                                 llc->llc_ostlist.op_array[j] = -1;
5561                         llc->llc_ostlist.op_count = 0;
5562                 }
5563         }
5564
5565         lds->lds_def_striping_set = 1;
5566         RETURN(rc);
5567 }
5568
5569 static inline void lod_lum2lds(struct lod_default_striping *lds,
5570                                const struct lmv_user_md *lum)
5571 {
5572         lds->lds_dir_def_stripe_count = le32_to_cpu(lum->lum_stripe_count);
5573         lds->lds_dir_def_stripe_offset = le32_to_cpu(lum->lum_stripe_offset);
5574         lds->lds_dir_def_hash_type = le32_to_cpu(lum->lum_hash_type);
5575         lds->lds_dir_def_max_inherit = lum->lum_max_inherit;
5576         lds->lds_dir_def_max_inherit_rr = lum->lum_max_inherit_rr;
5577         lds->lds_dir_def_striping_set = 1;
5578 }
5579
5580 /**
5581  * Get default directory striping.
5582  *
5583  * \param[in] env               execution environment
5584  * \param[in] lo                object
5585  * \param[out] lds              default striping
5586  *
5587  * \retval              0 on success
5588  * \retval              negative if failed
5589  */
5590 static int lod_get_default_lmv_striping(const struct lu_env *env,
5591                                         struct lod_object *lo,
5592                                         struct lod_default_striping *lds)
5593 {
5594         struct lmv_user_md *lmu;
5595         int rc;
5596
5597         lds->lds_dir_def_striping_set = 0;
5598
5599         rc = lod_get_default_lmv_ea(env, lo);
5600         if (rc < 0)
5601                 return rc;
5602
5603         if (rc >= (int)sizeof(*lmu)) {
5604                 struct lod_thread_info *info = lod_env_info(env);
5605
5606                 lmu = info->lti_ea_store;
5607                 lod_lum2lds(lds, lmu);
5608         }
5609
5610         return 0;
5611 }
5612
5613 /**
5614  * Get default striping in the object.
5615  *
5616  * Get object default striping and default directory striping.
5617  *
5618  * \param[in] env               execution environment
5619  * \param[in] lo                object
5620  * \param[out] lds              default striping
5621  *
5622  * \retval              0 on success
5623  * \retval              negative if failed
5624  */
5625 static int lod_get_default_striping(const struct lu_env *env,
5626                                     struct lod_object *lo,
5627                                     struct dt_allocation_hint *ah,
5628                                     struct lod_default_striping *lds)
5629 {
5630         int rc, rc1;
5631
5632         rc = lod_get_default_lov_striping(env, lo, lds, NULL);
5633         if (lds->lds_def_striping_set) {
5634                 struct lod_thread_info *info = lod_env_info(env);
5635                 struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5636
5637                 rc = lod_verify_striping(env, d, lo, &info->lti_buf, false);
5638                 if (rc)
5639                         lds->lds_def_striping_set = 0;
5640         }
5641
5642         if (ah->dah_eadata_is_dmv) {
5643                 lod_lum2lds(lds, ah->dah_eadata);
5644         } else if (ah->dah_dmv_imp_inherit) {
5645                 lds->lds_dir_def_striping_set = 0;
5646         } else {
5647                 rc1 = lod_get_default_lmv_striping(env, lo, lds);
5648                 if (rc == 0 && rc1 < 0)
5649                         rc = rc1;
5650         }
5651
5652         return rc;
5653 }
5654
5655 /**
5656  * Apply default striping on object.
5657  *
5658  * If object striping pattern is not set, set to the one in default striping.
5659  * The default striping is from parent or fs.
5660  *
5661  * \param[in] lo                new object
5662  * \param[in] lds               default striping
5663  * \param[in] mode              new object's mode
5664  */
5665 static void lod_striping_from_default(struct lod_object *lo,
5666                                       const struct lod_default_striping *lds,
5667                                       umode_t mode)
5668 {
5669         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5670         int i, rc;
5671
5672         if (lds->lds_def_striping_set && S_ISREG(mode)) {
5673                 struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
5674
5675                 rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
5676                                             lds->lds_def_comp_cnt);
5677                 if (rc != 0)
5678                         return;
5679
5680                 lo->ldo_is_composite = lds->lds_def_striping_is_composite;
5681                 if (lds->lds_def_mirror_cnt > 1)
5682                         lo->ldo_flr_state = LCM_FL_RDONLY;
5683
5684                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
5685                         struct lod_layout_component *obj_comp =
5686                                                 &lo->ldo_comp_entries[i];
5687                         struct lod_layout_component *def_comp =
5688                                                 &lds->lds_def_comp_entries[i];
5689
5690                         CDEBUG(D_LAYOUT,
5691                                "inherit "DFID" file layout from default: flags=%#x size=%u nr=%u offset=%u pattern=%#x pool=%s\n",
5692                                PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5693                                def_comp->llc_flags,
5694                                def_comp->llc_stripe_size,
5695                                def_comp->llc_stripe_count,
5696                                def_comp->llc_stripe_offset,
5697                                def_comp->llc_pattern,
5698                                def_comp->llc_pool ?: "");
5699
5700                         *obj_comp = *def_comp;
5701                         if (def_comp->llc_pool != NULL) {
5702                                 /* pointer was copied from def_comp */
5703                                 obj_comp->llc_pool = NULL;
5704                                 lod_obj_set_pool(lo, i, def_comp->llc_pool);
5705                         }
5706
5707                         /* copy ost list */
5708                         if (def_comp->llc_ostlist.op_array &&
5709                             def_comp->llc_ostlist.op_count) {
5710                                 OBD_ALLOC(obj_comp->llc_ostlist.op_array,
5711                                           obj_comp->llc_ostlist.op_size);
5712                                 if (!obj_comp->llc_ostlist.op_array)
5713                                         return;
5714                                 memcpy(obj_comp->llc_ostlist.op_array,
5715                                        def_comp->llc_ostlist.op_array,
5716                                        obj_comp->llc_ostlist.op_size);
5717                         } else if (def_comp->llc_ostlist.op_array) {
5718                                 obj_comp->llc_ostlist.op_array = NULL;
5719                         }
5720
5721                         /*
5722                          * Don't initialize these fields for plain layout
5723                          * (v1/v3) here, they are inherited in the order of
5724                          * 'parent' -> 'fs default (root)' -> 'global default
5725                          * values for stripe_count & stripe_size'.
5726                          *
5727                          * see lod_ah_init().
5728                          */
5729                         if (!lo->ldo_is_composite)
5730                                 continue;
5731
5732                         lod_adjust_stripe_info(obj_comp, desc, 0);
5733                 }
5734         } else if (lds->lds_dir_def_striping_set && S_ISDIR(mode)) {
5735                 if (lo->ldo_dir_stripe_count == 0)
5736                         lo->ldo_dir_stripe_count =
5737                                 lds->lds_dir_def_stripe_count;
5738                 if (lo->ldo_dir_stripe_offset == -1)
5739                         lo->ldo_dir_stripe_offset =
5740                                 lds->lds_dir_def_stripe_offset;
5741                 if (lo->ldo_dir_hash_type == LMV_HASH_TYPE_UNKNOWN)
5742                         lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
5743
5744                 CDEBUG(D_LAYOUT,
5745                        "inherit "DFID" dir layout from default: count=%hu offset=%u hash_type=%x\n",
5746                        PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
5747                        lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
5748                        lo->ldo_dir_hash_type);
5749         }
5750 }
5751
5752 static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root,
5753                                          const char *append_pool)
5754 {
5755         struct lod_layout_component *lod_comp;
5756
5757         if (lo->ldo_comp_cnt == 0)
5758                 return true;
5759
5760         if (lo->ldo_is_composite)
5761                 return false;
5762
5763         lod_comp = &lo->ldo_comp_entries[0];
5764
5765         if (lod_comp->llc_stripe_count <= 0 ||
5766             lod_comp->llc_stripe_size <= 0)
5767                 return true;
5768
5769         if (from_root && (lod_comp->llc_pool == NULL ||
5770                           lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT))
5771                 return true;
5772
5773         if (append_pool && append_pool[0])
5774                 return true;
5775
5776         return false;
5777 }
5778
5779 /**
5780  * Implementation of dt_object_operations::do_ah_init.
5781  *
5782  * This method is used to make a decision on the striping configuration for the
5783  * object being created. It can be taken from the \a parent object if it exists,
5784  * or filesystem's default. The resulting configuration (number of stripes,
5785  * stripe size/offset, pool name, hash_type, etc.) is stored in the object
5786  * itself and will be used by the methods like ->doo_declare_create().
5787  *
5788  * \see dt_object_operations::do_ah_init() in the API description for details.
5789  */
5790 static void lod_ah_init(const struct lu_env *env,
5791                         struct dt_allocation_hint *ah,
5792                         struct dt_object *parent,
5793                         struct dt_object *child,
5794                         umode_t child_mode)
5795 {
5796         struct lod_device *d = lu2lod_dev(child->do_lu.lo_dev);
5797         struct lod_thread_info *info = lod_env_info(env);
5798         struct lod_default_striping *lds = lod_lds_buf_get(env);
5799         struct dt_object *nextp = NULL;
5800         struct dt_object *nextc;
5801         struct lod_object *lp = NULL;
5802         struct lod_object *lc;
5803         struct lov_desc *desc;
5804         struct lod_layout_component *lod_comp;
5805         int rc;
5806         ENTRY;
5807
5808         LASSERT(child);
5809
5810         if (ah->dah_append_stripe_count == -1)
5811                 ah->dah_append_stripe_count =
5812                         d->lod_ost_descs.ltd_lov_desc.ld_tgt_count;
5813
5814         if (likely(parent)) {
5815                 nextp = dt_object_child(parent);
5816                 lp = lod_dt_obj(parent);
5817         }
5818
5819         nextc = dt_object_child(child);
5820         lc = lod_dt_obj(child);
5821
5822         LASSERT(!lod_obj_is_striped(child));
5823         /* default layout template may have been set on the regular file
5824          * when this is called from mdd_create_data() */
5825         if (S_ISREG(child_mode))
5826                 lod_free_comp_entries(lc);
5827
5828         if (!dt_object_exists(nextc))
5829                 nextc->do_ops->do_ah_init(env, ah, nextp, nextc, child_mode);
5830
5831         if (S_ISDIR(child_mode)) {
5832                 const struct lmv_user_md_v1 *lum1 = ah->dah_eadata;
5833                 int max_stripe_count;
5834
5835                 /* other default values are 0 */
5836                 lc->ldo_dir_stripe_offset = LMV_OFFSET_DEFAULT;
5837
5838                 /* no default striping configuration is needed for
5839                  * foreign dirs
5840                  */
5841                 if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
5842                     le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_FOREIGN) {
5843                         lc->ldo_is_foreign = true;
5844                         /* keep stripe_count 0 and stripe_offset -1 */
5845                         CDEBUG(D_INFO, "no default striping for foreign dir\n");
5846                         RETURN_EXIT;
5847                 }
5848
5849                 if (likely(lp != NULL))
5850                         lod_get_default_striping(env, lp, ah, lds);
5851
5852                 /* It should always honour the specified stripes */
5853                 if (ah->dah_eadata && ah->dah_eadata_len &&
5854                     !ah->dah_eadata_is_dmv &&
5855                     (le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC ||
5856                      le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC ||
5857                      le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_V1)) {
5858                         lc->ldo_dir_stripe_count =
5859                                 le32_to_cpu(lum1->lum_stripe_count);
5860                         lc->ldo_dir_stripe_offset =
5861                                 le32_to_cpu(lum1->lum_stripe_offset);
5862                         lc->ldo_dir_hash_type =
5863                                 le32_to_cpu(lum1->lum_hash_type);
5864                         CDEBUG(D_INFO,
5865                                "set dirstripe: count %hu, offset %d, hash %x\n",
5866                                 lc->ldo_dir_stripe_count,
5867                                 (int)lc->ldo_dir_stripe_offset,
5868                                 lc->ldo_dir_hash_type);
5869
5870                         if (d->lod_mdt_descs.ltd_lmv_desc.ld_active_tgt_count &&
5871                             lc->ldo_dir_stripe_count < 2 &&
5872                             lum1->lum_max_inherit != LMV_INHERIT_NONE) {
5873                                 /* when filesystem-wide default LMV is set, dirs
5874                                  * will be created on MDT by space usage, but if
5875                                  * dir is created with "lfs mkdir -c 1 ...", its
5876                                  * subdirs should be kept on the same MDT. To
5877                                  * guarantee this, set default LMV for such dir.
5878                                  */
5879                                 lds->lds_dir_def_stripe_count =
5880                                         le32_to_cpu(lum1->lum_stripe_count);
5881                                 /* if "-1" stripe offset is set, save current
5882                                  * MDT index in default LMV.
5883                                  */
5884                                 if (le32_to_cpu(lum1->lum_stripe_offset) ==
5885                                     LMV_OFFSET_DEFAULT)
5886                                         lds->lds_dir_def_stripe_offset =
5887                                                 lod2lu_dev(d)->ld_site->ld_seq_site->ss_node_id;
5888                                 else
5889                                         lds->lds_dir_def_stripe_offset =
5890                                                 le32_to_cpu(lum1->lum_stripe_offset);
5891                                 lds->lds_dir_def_hash_type =
5892                                         le32_to_cpu(lum1->lum_hash_type);
5893                                 lds->lds_dir_def_max_inherit =
5894                                         lum1->lum_max_inherit;
5895                                 /* it will be decreased by 1 later in setting */
5896                                 if (lum1->lum_max_inherit >= LMV_INHERIT_END &&
5897                                     lum1->lum_max_inherit < LMV_INHERIT_MAX)
5898                                         lds->lds_dir_def_max_inherit++;
5899                                 lds->lds_dir_def_max_inherit_rr =
5900                                         lum1->lum_max_inherit_rr;
5901                                 lds->lds_dir_def_striping_set = 1;
5902                                 /* don't inherit LOV from ROOT */
5903                                 if (lds->lds_def_striping_set &&
5904                                     fid_is_root(lod_object_fid(lp)))
5905                                         lds->lds_def_striping_set = 0;
5906                                 lc->ldo_def_striping = lds;
5907                         } else if (lds->lds_def_striping_set &&
5908                                    !fid_is_root(lod_object_fid(lp))) {
5909                                 /* don't inherit default LMV for "lfs mkdir" */
5910                                 lds->lds_dir_def_striping_set = 0;
5911                                 lc->ldo_def_striping = lds;
5912                         }
5913                 } else {
5914                         /* inherit default striping except ROOT */
5915                         if ((lds->lds_def_striping_set ||
5916                              lds->lds_dir_def_striping_set) &&
5917                             !fid_is_root(lod_object_fid(lp)))
5918                                 lc->ldo_def_striping = lds;
5919
5920                         /* transfer defaults LMV to new directory */
5921                         lod_striping_from_default(lc, lds, child_mode);
5922
5923                         /* set count 0 to create normal directory */
5924                         if (lc->ldo_dir_stripe_count == 1)
5925                                 lc->ldo_dir_stripe_count = 0;
5926
5927                         /* do not save default LMV on server */
5928                         if (ah->dah_dmv_imp_inherit) {
5929                                 lds->lds_dir_def_striping_set = 0;
5930                                 if (!lds->lds_def_striping_set)
5931                                         lc->ldo_def_striping = NULL;
5932                         }
5933                 }
5934
5935                 /* shrink the stripe count to max_mdt_stripecount if it is -1
5936                  * and max_mdt_stripecount is not 0
5937                  */
5938                 if (lc->ldo_dir_stripe_count == (__u16)(-1) &&
5939                     d->lod_max_mdt_stripecount)
5940                         lc->ldo_dir_stripe_count = d->lod_max_mdt_stripecount;
5941
5942                 max_stripe_count = d->lod_remote_mdt_count + 1;
5943                 if (lc->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED)
5944                         max_stripe_count *= d->lod_max_stripes_per_mdt;
5945
5946                 /* shrink the stripe_count to max stripe count */
5947                 if (lc->ldo_dir_stripe_count > max_stripe_count &&
5948                     !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)) {
5949                         lc->ldo_dir_stripe_count = max_stripe_count;
5950                         if (lc->ldo_dir_stripe_count == 1)
5951                                 lc->ldo_dir_stripe_count = 0;
5952                 }
5953
5954                 if (!lmv_is_known_hash_type(lc->ldo_dir_hash_type))
5955                         lc->ldo_dir_hash_type =
5956                                 (lc->ldo_dir_hash_type & LMV_HASH_FLAG_KNOWN) |
5957                                 d->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
5958
5959                 /* make sure all fscrypt metadata stays on same mdt */
5960                 if (child->do_lu.lo_header->loh_attr & LOHA_FSCRYPT_MD) {
5961                         lc->ldo_dir_stripe_count = 0;
5962                         lds->lds_dir_def_stripe_offset =
5963                                 lod2lu_dev(d)->ld_site->ld_seq_site->ss_node_id;
5964                         lds->lds_dir_def_striping_set = 1;
5965                         lc->ldo_def_striping = lds;
5966                 }
5967
5968                 CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%x\n",
5969                        lc->ldo_dir_stripe_count,
5970                        (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type);
5971
5972                 RETURN_EXIT;
5973         }
5974
5975         /* child object regular file*/
5976
5977         if (!lod_object_will_be_striped(S_ISREG(child_mode),
5978                                         lu_object_fid(&child->do_lu)))
5979                 RETURN_EXIT;
5980
5981         /* If object is going to be striped over OSTs, transfer default
5982          * striping information to the child, so that we can use it
5983          * during declaration and creation.
5984          *
5985          * Try from the parent first.
5986          */
5987         if (likely(lp != NULL)) {
5988                 rc = lod_get_default_lov_striping(env, lp, lds, ah);
5989                 if (rc == 0 && lds->lds_def_striping_set) {
5990                         rc = lod_verify_striping(env, d, lp, &info->lti_buf,
5991                                                  false);
5992                         if (rc == 0)
5993                                 lod_striping_from_default(lc, lds, child_mode);
5994                 }
5995         }
5996
5997         /* Initialize lod_device::lod_md_root object reference */
5998         if (d->lod_md_root == NULL) {
5999                 struct dt_object *root;
6000                 struct lod_object *lroot;
6001
6002                 lu_root_fid(&info->lti_fid);
6003                 root = dt_locate(env, &d->lod_dt_dev, &info->lti_fid);
6004                 if (!IS_ERR(root)) {
6005                         lroot = lod_dt_obj(root);
6006
6007                         spin_lock(&d->lod_lock);
6008                         if (d->lod_md_root != NULL)
6009                                 dt_object_put(env, &d->lod_md_root->ldo_obj);
6010                         d->lod_md_root = lroot;
6011                         spin_unlock(&d->lod_lock);
6012                 }
6013         }
6014
6015         /* try inherit layout from the root object (fs default) when:
6016          *  - parent does not have default layout; or
6017          *  - parent has plain(v1/v3) default layout, and some attributes
6018          *    are not specified in the default layout;
6019          */
6020         if (d->lod_md_root != NULL &&
6021             lod_need_inherit_more(lc, true, ah->dah_append_pool)) {
6022                 rc = lod_get_default_lov_striping(env, d->lod_md_root, lds,
6023                                                   ah);
6024                 if (rc || !lds->lds_def_striping_set)
6025                         goto out;
6026
6027                 rc = lod_verify_striping(env, d, d->lod_md_root, &info->lti_buf,
6028                                          false);
6029                 if (rc)
6030                         goto out;
6031
6032                 if (lc->ldo_comp_cnt == 0) {
6033                         lod_striping_from_default(lc, lds, child_mode);
6034                 } else if (!lds->lds_def_striping_is_composite) {
6035                         struct lod_layout_component *def_comp;
6036
6037                         LASSERT(!lc->ldo_is_composite);
6038                         lod_comp = &lc->ldo_comp_entries[0];
6039                         def_comp = &lds->lds_def_comp_entries[0];
6040
6041                         if (lod_comp->llc_stripe_count <= 0)
6042                                 lod_comp->llc_stripe_count =
6043                                         def_comp->llc_stripe_count;
6044                         if (lod_comp->llc_stripe_size <= 0)
6045                                 lod_comp->llc_stripe_size =
6046                                         def_comp->llc_stripe_size;
6047                         if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT &&
6048                             (!lod_comp->llc_pool || !lod_comp->llc_pool[0]))
6049                                 lod_comp->llc_stripe_offset =
6050                                         def_comp->llc_stripe_offset;
6051                         if (lod_comp->llc_pool == NULL)
6052                                 lod_qos_set_pool(lc, 0, def_comp->llc_pool);
6053                 }
6054         }
6055 out:
6056         /*
6057          * fs default striping may not be explicitly set, or historically set
6058          * in config log, use them.
6059          */
6060         if (lod_need_inherit_more(lc, false, ah->dah_append_pool)) {
6061                 if (lc->ldo_comp_cnt == 0) {
6062                         rc = lod_alloc_comp_entries(lc, 0, 1);
6063                         if (rc)
6064                                 /* fail to allocate memory, will create a
6065                                  * non-striped file. */
6066                                 RETURN_EXIT;
6067                         lc->ldo_is_composite = 0;
6068                         lod_comp = &lc->ldo_comp_entries[0];
6069                         lod_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
6070                 }
6071                 LASSERT(!lc->ldo_is_composite);
6072                 lod_comp = &lc->ldo_comp_entries[0];
6073                 desc = &d->lod_ost_descs.ltd_lov_desc;
6074                 lod_adjust_stripe_info(lod_comp, desc,
6075                                        ah->dah_append_stripe_count);
6076                 if (ah->dah_append_pool && ah->dah_append_pool[0])
6077                         lod_qos_set_pool(lc, 0, ah->dah_append_pool);
6078         }
6079
6080         EXIT;
6081 }
6082
6083 /**
6084  * Size initialization on late striping.
6085  *
6086  * Propagate the size of a truncated object to a deferred striping.
6087  * This function handles a special case when truncate was done on a
6088  * non-striped object and now while the striping is being created
6089  * we can't lose that size, so we have to propagate it to the stripes
6090  * being created.
6091  *
6092  * \param[in] env       execution environment
6093  * \param[in] dt        object
6094  * \param[in] th        transaction handle
6095  *
6096  * \retval              0 on success
6097  * \retval              negative if failed
6098  */
6099 static int lod_declare_init_size(const struct lu_env *env,
6100                                  struct dt_object *dt, struct thandle *th)
6101 {
6102         struct dt_object        *next = dt_object_child(dt);
6103         struct lod_object       *lo = lod_dt_obj(dt);
6104         struct dt_object        **objects = NULL;
6105         struct lu_attr  *attr = &lod_env_info(env)->lti_attr;
6106         uint64_t        size, offs;
6107         int     i, rc, stripe, stripe_count = 0, stripe_size = 0;
6108         struct lu_extent size_ext;
6109         ENTRY;
6110
6111         if (!lod_obj_is_striped(dt))
6112                 RETURN(0);
6113
6114         rc = dt_attr_get(env, next, attr);
6115         LASSERT(attr->la_valid & LA_SIZE);
6116         if (rc)
6117                 RETURN(rc);
6118
6119         size = attr->la_size;
6120         if (size == 0)
6121                 RETURN(0);
6122
6123         size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
6124         for (i = 0; i < lo->ldo_comp_cnt; i++) {
6125                 struct lod_layout_component *lod_comp;
6126                 struct lu_extent *extent;
6127
6128                 lod_comp = &lo->ldo_comp_entries[i];
6129
6130                 if (lod_comp->llc_stripe == NULL)
6131                         continue;
6132
6133                 extent = &lod_comp->llc_extent;
6134                 CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
6135                 if (!lo->ldo_is_composite ||
6136                     lu_extent_is_overlapped(extent, &size_ext)) {
6137                         objects = lod_comp->llc_stripe;
6138                         stripe_count = lod_comp->llc_stripe_count;
6139                         stripe_size = lod_comp->llc_stripe_size;
6140
6141                         /* next mirror */
6142                         if (stripe_count == 0)
6143                                 continue;
6144
6145                         LASSERT(objects != NULL && stripe_size != 0);
6146                         do_div(size, stripe_size);
6147                         stripe = do_div(size, stripe_count);
6148                         LASSERT(objects[stripe] != NULL);
6149
6150                         size = size * stripe_size;
6151                         offs = attr->la_size;
6152                         size += do_div(offs, stripe_size);
6153
6154                         attr->la_valid = LA_SIZE;
6155                         attr->la_size = size;
6156
6157                         rc = lod_sub_declare_attr_set(env, objects[stripe],
6158                                                       attr, th);
6159                 }
6160         }
6161
6162         RETURN(rc);
6163 }
6164
6165 /**
6166  * Declare creation of striped object.
6167  *
6168  * The function declares creation stripes for a regular object. The function
6169  * also declares whether the stripes will be created with non-zero size if
6170  * previously size was set non-zero on the master object. If object \a dt is
6171  * not local, then only fully defined striping can be applied in \a lovea.
6172  * Otherwise \a lovea can be in the form of pattern, see lod_qos_parse_config()
6173  * for the details.
6174  *
6175  * \param[in] env       execution environment
6176  * \param[in] dt        object
6177  * \param[in] attr      attributes the stripes will be created with
6178  * \param[in] lovea     a buffer containing striping description
6179  * \param[in] th        transaction handle
6180  *
6181  * \retval              0 on success
6182  * \retval              negative if failed
6183  */
6184 int lod_declare_striped_create(const struct lu_env *env, struct dt_object *dt,
6185                                struct lu_attr *attr,
6186                                const struct lu_buf *lovea, struct thandle *th)
6187 {
6188         struct lod_thread_info  *info = lod_env_info(env);
6189         struct dt_object        *next = dt_object_child(dt);
6190         struct lod_object       *lo = lod_dt_obj(dt);
6191         int                      rc;
6192         ENTRY;
6193
6194         if (CFS_FAIL_CHECK(OBD_FAIL_MDS_ALLOC_OBDO))
6195                 GOTO(out, rc = -ENOMEM);
6196
6197         if (!dt_object_remote(next)) {
6198                 /* choose OST and generate appropriate objects */
6199                 rc = lod_prepare_create(env, lo, attr, lovea, th);
6200                 if (rc)
6201                         GOTO(out, rc);
6202
6203                 /*
6204                  * declare storage for striping data
6205                  */
6206                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6207         } else {
6208                 /* LOD can not choose OST objects for remote objects, i.e.
6209                  * stripes must be ready before that. Right now, it can only
6210                  * happen during migrate, i.e. migrate process needs to create
6211                  * remote regular file (mdd_migrate_create), then the migrate
6212                  * process will provide stripeEA. */
6213                 LASSERT(lovea != NULL);
6214                 info->lti_buf = *lovea;
6215         }
6216
6217         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
6218                                        XATTR_NAME_LOV, 0, th);
6219         if (rc)
6220                 GOTO(out, rc);
6221
6222         /*
6223          * if striping is created with local object's size > 0,
6224          * we have to propagate this size to specific object
6225          * the case is possible only when local object was created previously
6226          */
6227         if (dt_object_exists(next))
6228                 rc = lod_declare_init_size(env, dt, th);
6229
6230 out:
6231         /* failed to create striping or to set initial size, let's reset
6232          * config so that others don't get confused */
6233         if (rc)
6234                 lod_striping_free(env, lo);
6235
6236         RETURN(rc);
6237 }
6238
6239 /*
6240  * Whether subdirectories under \a dt should be created on MDTs by space QoS
6241  *
6242  * If LMV_HASH_FLAG_SPACE is set on directory default layout, its subdirectories
6243  * should be created on MDT by space QoS.
6244  *
6245  * \param[in] env       execution environment
6246  * \param[in] dev       lu device
6247  * \param[in] dt        object
6248  *
6249  * \retval              1 if directory should create subdir by space usage
6250  * \retval              0 if not
6251  * \retval              -ev if failed
6252  */
6253 static inline int dt_object_qos_mkdir(const struct lu_env *env,
6254                                       struct lu_device *dev,
6255                                       struct dt_object *dt)
6256 {
6257         struct lod_thread_info *info = lod_env_info(env);
6258         struct lu_object *obj;
6259         struct lod_object *lo;
6260         struct lmv_user_md *lmu;
6261         int rc;
6262
6263         obj = lu_object_find_slice(env, dev, lu_object_fid(&dt->do_lu), NULL);
6264         if (IS_ERR(obj))
6265                 return PTR_ERR(obj);
6266
6267         lo = lu2lod_obj(obj);
6268
6269         rc = lod_get_default_lmv_ea(env, lo);
6270         dt_object_put(env, dt);
6271         if (rc <= 0)
6272                 return rc;
6273
6274         if (rc < (int)sizeof(*lmu))
6275                 return -EINVAL;
6276
6277         lmu = info->lti_ea_store;
6278         return le32_to_cpu(lmu->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
6279 }
6280
6281 /**
6282  * Implementation of dt_object_operations::do_declare_create.
6283  *
6284  * The method declares creation of a new object. If the object will be striped,
6285  * then helper functions are called to find FIDs for the stripes, declare
6286  * creation of the stripes and declare initialization of the striping
6287  * information to be stored in the master object.
6288  *
6289  * \see dt_object_operations::do_declare_create() in the API description
6290  * for details.
6291  */
6292 static int lod_declare_create(const struct lu_env *env, struct dt_object *dt,
6293                               struct lu_attr *attr,
6294                               struct dt_allocation_hint *hint,
6295                               struct dt_object_format *dof, struct thandle *th)
6296 {
6297         struct dt_object   *next = dt_object_child(dt);
6298         struct lod_object  *lo = lod_dt_obj(dt);
6299         int                 rc;
6300         ENTRY;
6301
6302         LASSERT(dof);
6303         LASSERT(attr);
6304         LASSERT(th);
6305
6306         /*
6307          * first of all, we declare creation of local object
6308          */
6309         rc = lod_sub_declare_create(env, next, attr, hint, dof, th);
6310         if (rc != 0)
6311                 GOTO(out, rc);
6312
6313         /*
6314          * it's lod_ah_init() that has decided the object will be striped
6315          */
6316         if (dof->dof_type == DFT_REGULAR) {
6317                 /* callers don't want stripes */
6318                 /* XXX: all tricky interactions with ->ah_make_hint() decided
6319                  * to use striping, then ->declare_create() behaving differently
6320                  * should be cleaned */
6321                 if (dof->u.dof_reg.striped != 0)
6322                         rc = lod_declare_striped_create(env, dt, attr,
6323                                                         NULL, th);
6324         } else if (dof->dof_type == DFT_DIR) {
6325                 struct seq_server_site *ss;
6326                 struct lu_buf buf = { NULL };
6327
6328                 ss = lu_site2seq(dt->do_lu.lo_dev->ld_site);
6329
6330                 /* If the parent has default stripeEA, and client
6331                  * did not find it before sending create request,
6332                  * then MDT will return -EREMOTE, and client will
6333                  * retrieve the default stripeEA and re-create the
6334                  * sub directory.
6335                  *
6336                  * Note: if dah_eadata != NULL, it means creating the
6337                  * striped directory with specified stripeEA, then it
6338                  * should ignore the default stripeEA */
6339                 if (hint != NULL && hint->dah_eadata == NULL) {
6340                         if (CFS_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT))
6341                                 GOTO(out, rc = -EREMOTE);
6342
6343                         if (lo->ldo_dir_stripe_offset != LMV_OFFSET_DEFAULT &&
6344                             lo->ldo_dir_stripe_offset != ss->ss_node_id) {
6345                                 struct lod_device *lod;
6346                                 struct lu_tgt_desc *mdt = NULL;
6347                                 bool found_mdt = false;
6348
6349                                 lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6350                                 lod_foreach_mdt(lod, mdt) {
6351                                         if (mdt->ltd_index ==
6352                                                 lo->ldo_dir_stripe_offset) {
6353                                                 found_mdt = true;
6354                                                 break;
6355                                         }
6356                                 }
6357
6358                                 /* If the MDT indicated by stripe_offset can be
6359                                  * found, then tell client to resend the create
6360                                  * request to the correct MDT, otherwise return
6361                                  * error to client */
6362                                 if (found_mdt)
6363                                         GOTO(out, rc = -EREMOTE);
6364                                 else
6365                                         GOTO(out, rc = -EINVAL);
6366                         }
6367                 } else if (hint && hint->dah_eadata) {
6368                         buf.lb_buf = (void *)hint->dah_eadata;
6369                         buf.lb_len = hint->dah_eadata_len;
6370                 }
6371
6372                 rc = lod_declare_dir_striping_create(env, dt, attr, &buf, dof,
6373                                                      th);
6374         }
6375 out:
6376         /* failed to create striping or to set initial size, let's reset
6377          * config so that others don't get confused */
6378         if (rc)
6379                 lod_striping_free(env, lo);
6380         RETURN(rc);
6381 }
6382
6383 /**
6384  * Generate component ID for new created component.
6385  *
6386  * \param[in] lo                LOD object
6387  * \param[in] comp_idx          index of ldo_comp_entries
6388  *
6389  * \retval                      component ID on success
6390  * \retval                      LCME_ID_INVAL on failure
6391  */
6392 static __u32 lod_gen_component_id(struct lod_object *lo,
6393                                   int mirror_id, int comp_idx)
6394 {
6395         struct lod_layout_component *lod_comp;
6396         __u32   id, start, end;
6397         int     i;
6398
6399         LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
6400
6401         lod_obj_inc_layout_gen(lo);
6402         id = lo->ldo_layout_gen;
6403         if (likely(id <= SEQ_ID_MAX))
6404                 RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
6405
6406         /* Layout generation wraps, need to check collisions. */
6407         start = id & SEQ_ID_MASK;
6408         end = SEQ_ID_MAX;
6409 again:
6410         for (id = start; id <= end; id++) {
6411                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6412                         lod_comp = &lo->ldo_comp_entries[i];
6413                         if (pflr_id(mirror_id, id) == lod_comp->llc_id)
6414                                 break;
6415                 }
6416                 /* Found the ununsed ID */
6417                 if (i == lo->ldo_comp_cnt)
6418                         RETURN(pflr_id(mirror_id, id));
6419         }
6420
6421         if (end == SEQ_ID_MAX) {
6422                 end = min_t(__u32, start, SEQ_ID_MAX) - 1;
6423                 start = 1;
6424                 goto again;
6425         }
6426
6427         RETURN(LCME_ID_INVAL);
6428 }
6429
6430 /**
6431  * Creation of a striped regular object.
6432  *
6433  * The function is called to create the stripe objects for a regular
6434  * striped file. This can happen at the initial object creation or
6435  * when the caller asks LOD to do so using ->do_xattr_set() method
6436  * (so called late striping). Notice all the information are already
6437  * prepared in the form of the list of objects (ldo_stripe field).
6438  * This is done during declare phase.
6439  *
6440  * \param[in] env       execution environment
6441  * \param[in] dt        object
6442  * \param[in] attr      attributes the stripes will be created with
6443  * \param[in] dof       format of stripes (see OSD API description)
6444  * \param[in] th        transaction handle
6445  *
6446  * \retval              0 on success
6447  * \retval              negative if failed
6448  */
6449 int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
6450                        struct lu_attr *attr, struct dt_object_format *dof,
6451                        struct thandle *th)
6452 {
6453         struct lod_layout_component     *lod_comp;
6454         struct lod_object       *lo = lod_dt_obj(dt);
6455         __u16   mirror_id;
6456         int     rc = 0, i, j;
6457         ENTRY;
6458
6459         mutex_lock(&lo->ldo_layout_mutex);
6460
6461         LASSERT((lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL) ||
6462                 lo->ldo_is_foreign);
6463
6464         mirror_id = 0; /* non-flr file's mirror_id is 0 */
6465         if (lo->ldo_mirror_count > 1) {
6466                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
6467                         lod_comp = &lo->ldo_comp_entries[i];
6468                         if (lod_comp->llc_id != LCME_ID_INVAL &&
6469                             mirror_id_of(lod_comp->llc_id) > mirror_id)
6470                                 mirror_id = mirror_id_of(lod_comp->llc_id);
6471                 }
6472         }
6473
6474         /* create all underlying objects */
6475         for (i = 0; i < lo->ldo_comp_cnt; i++) {
6476                 lod_comp = &lo->ldo_comp_entries[i];
6477
6478                 if (lod_comp->llc_id == LCME_ID_INVAL) {
6479                         /* only the component of FLR layout with more than 1
6480                          * mirror has mirror ID in its component ID.
6481                          */
6482                         if (lod_comp->llc_extent.e_start == 0 &&
6483                             lo->ldo_mirror_count > 1)
6484                                 ++mirror_id;
6485
6486                         lod_comp->llc_id = lod_gen_component_id(lo,
6487                                                                 mirror_id, i);
6488                         if (lod_comp->llc_id == LCME_ID_INVAL)
6489                                 GOTO(out, rc = -ERANGE);
6490                 }
6491
6492                 if (lod_comp_inited(lod_comp))
6493                         continue;
6494
6495                 if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN) {
6496                         lod_comp_set_init(lod_comp);
6497                         continue;
6498                 }
6499
6500                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
6501                         lod_comp_set_init(lod_comp);
6502
6503                 if (lov_pattern(lod_comp->llc_pattern) & LOV_PATTERN_MDT)
6504                         lod_comp_set_init(lod_comp);
6505
6506                 if (lod_comp->llc_stripe == NULL)
6507                         continue;
6508
6509                 LASSERT(lod_comp->llc_stripe_count);
6510                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6511                         struct dt_object *object = lod_comp->llc_stripe[j];
6512                         LASSERT(object != NULL);
6513                         rc = lod_sub_create(env, object, attr, NULL, dof, th);
6514                         if (rc)
6515                                 GOTO(out, rc);
6516                 }
6517                 lod_comp_set_init(lod_comp);
6518         }
6519
6520         rc = lod_fill_mirrors(lo);
6521         if (rc)
6522                 GOTO(out, rc);
6523
6524         lo->ldo_comp_cached = 1;
6525
6526         rc = lod_generate_and_set_lovea(env, lo, th);
6527         if (rc)
6528                 GOTO(out, rc);
6529
6530         mutex_unlock(&lo->ldo_layout_mutex);
6531
6532         RETURN(0);
6533
6534 out:
6535         lod_striping_free_nolock(env, lo);
6536         mutex_unlock(&lo->ldo_layout_mutex);
6537
6538         RETURN(rc);
6539 }
6540
6541 static inline bool lod_obj_is_dom(struct dt_object *dt)
6542 {
6543         struct lod_object *lo = lod_dt_obj(dt);
6544
6545         if (!dt_object_exists(dt_object_child(dt)))
6546                 return false;
6547
6548         if (S_ISDIR(dt->do_lu.lo_header->loh_attr))
6549                 return false;
6550
6551         if (!lo->ldo_comp_cnt)
6552                 return false;
6553
6554         return (lov_pattern(lo->ldo_comp_entries[0].llc_pattern) &
6555                 LOV_PATTERN_MDT);
6556 }
6557
6558 /**
6559  * Implementation of dt_object_operations::do_create.
6560  *
6561  * If any of preceeding methods (like ->do_declare_create(),
6562  * ->do_ah_init(), etc) chose to create a striped object,
6563  * then this method will create the master and the stripes.
6564  *
6565  * \see dt_object_operations::do_create() in the API description for details.
6566  */
6567 static int lod_create(const struct lu_env *env, struct dt_object *dt,
6568                       struct lu_attr *attr, struct dt_allocation_hint *hint,
6569                       struct dt_object_format *dof, struct thandle *th)
6570 {
6571         int                 rc;
6572         ENTRY;
6573
6574         /* create local object */
6575         rc = lod_sub_create(env, dt_object_child(dt), attr, hint, dof, th);
6576         if (rc != 0)
6577                 RETURN(rc);
6578
6579         if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
6580             (lod_obj_is_striped(dt) || lod_obj_is_dom(dt)) &&
6581             dof->u.dof_reg.striped != 0) {
6582                 LASSERT(lod_dt_obj(dt)->ldo_comp_cached == 0);
6583                 rc = lod_striped_create(env, dt, attr, dof, th);
6584         }
6585
6586         RETURN(rc);
6587 }
6588
6589 static inline int
6590 lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
6591                           struct dt_object *dt, struct thandle *th,
6592                           int comp_idx, int stripe_idx,
6593                           struct lod_obj_stripe_cb_data *data)
6594 {
6595         if (data->locd_declare)
6596                 return lod_sub_declare_destroy(env, dt, th);
6597
6598         if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6599             stripe_idx == cfs_fail_val)
6600                 return lod_sub_destroy(env, dt, th);
6601
6602         return 0;
6603 }
6604
6605 /**
6606  * Implementation of dt_object_operations::do_declare_destroy.
6607  *
6608  * If the object is a striped directory, then the function declares reference
6609  * removal from the master object (this is an index) to the stripes and declares
6610  * destroy of all the stripes. In all the cases, it declares an intention to
6611  * destroy the object itself.
6612  *
6613  * \see dt_object_operations::do_declare_destroy() in the API description
6614  * for details.
6615  */
6616 static int lod_declare_destroy(const struct lu_env *env, struct dt_object *dt,
6617                                struct thandle *th)
6618 {
6619         struct dt_object *next = dt_object_child(dt);
6620         struct lod_object *lo = lod_dt_obj(dt);
6621         struct lod_thread_info *info = lod_env_info(env);
6622         struct dt_object *stripe;
6623         char *stripe_name = info->lti_key;
6624         int rc, i;
6625
6626         ENTRY;
6627
6628         /*
6629          * load striping information, notice we don't do this when object
6630          * is being initialized as we don't need this information till
6631          * few specific cases like destroy, chown
6632          */
6633         rc = lod_striping_load(env, lo);
6634         if (rc)
6635                 RETURN(rc);
6636
6637         /* declare destroy for all underlying objects */
6638         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6639                 rc = next->do_ops->do_index_try(env, next,
6640                                                 &dt_directory_features);
6641                 if (rc != 0)
6642                         RETURN(rc);
6643
6644                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6645                         stripe = lo->ldo_stripe[i];
6646                         if (!stripe)
6647                                 continue;
6648
6649                         rc = lod_sub_declare_ref_del(env, next, th);
6650                         if (rc != 0)
6651                                 RETURN(rc);
6652
6653                         snprintf(stripe_name, sizeof(info->lti_key),
6654                                  DFID":%d",
6655                                  PFID(lu_object_fid(&stripe->do_lu)), i);
6656                         rc = lod_sub_declare_delete(env, next,
6657                                         (const struct dt_key *)stripe_name, th);
6658                         if (rc != 0)
6659                                 RETURN(rc);
6660                 }
6661         }
6662
6663         /*
6664          * we declare destroy for the local object
6665          */
6666         rc = lod_sub_declare_destroy(env, next, th);
6667         if (rc)
6668                 RETURN(rc);
6669
6670         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6671             CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6672                 RETURN(0);
6673
6674         if (!lod_obj_is_striped(dt))
6675                 RETURN(0);
6676
6677         /* declare destroy all striped objects */
6678         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6679                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6680                         stripe = lo->ldo_stripe[i];
6681                         if (!stripe)
6682                                 continue;
6683
6684                         if (!dt_object_exists(stripe))
6685                                 continue;
6686
6687                         rc = lod_sub_declare_ref_del(env, stripe, th);
6688                         if (rc != 0)
6689                                 break;
6690
6691                         rc = lod_sub_declare_destroy(env, stripe, th);
6692                         if (rc != 0)
6693                                 break;
6694                 }
6695         } else {
6696                 struct lod_obj_stripe_cb_data data = { { 0 } };
6697
6698                 data.locd_declare = true;
6699                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6700                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6701         }
6702
6703         RETURN(rc);
6704 }
6705
6706 /**
6707  * Implementation of dt_object_operations::do_destroy.
6708  *
6709  * If the object is a striped directory, then the function removes references
6710  * from the master object (this is an index) to the stripes and destroys all
6711  * the stripes. In all the cases, the function destroys the object itself.
6712  *
6713  * \see dt_object_operations::do_destroy() in the API description for details.
6714  */
6715 static int lod_destroy(const struct lu_env *env, struct dt_object *dt,
6716                        struct thandle *th)
6717 {
6718         struct dt_object  *next = dt_object_child(dt);
6719         struct lod_object *lo = lod_dt_obj(dt);
6720         struct lod_thread_info *info = lod_env_info(env);
6721         char *stripe_name = info->lti_key;
6722         struct dt_object *stripe;
6723         unsigned int i;
6724         int rc;
6725
6726         ENTRY;
6727
6728         /* destroy sub-stripe of master object */
6729         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6730                 rc = next->do_ops->do_index_try(env, next,
6731                                                 &dt_directory_features);
6732                 if (rc != 0)
6733                         RETURN(rc);
6734
6735                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6736                         stripe = lo->ldo_stripe[i];
6737                         if (!stripe)
6738                                 continue;
6739
6740                         rc = lod_sub_ref_del(env, next, th);
6741                         if (rc != 0)
6742                                 RETURN(rc);
6743
6744                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
6745                                 PFID(lu_object_fid(&stripe->do_lu)), i);
6746
6747                         CDEBUG(D_INFO, DFID" delete stripe %s "DFID"\n",
6748                                PFID(lu_object_fid(&dt->do_lu)), stripe_name,
6749                                PFID(lu_object_fid(&stripe->do_lu)));
6750
6751                         rc = lod_sub_delete(env, next,
6752                                        (const struct dt_key *)stripe_name, th);
6753                         if (rc != 0)
6754                                 RETURN(rc);
6755                 }
6756         }
6757
6758         rc = lod_sub_destroy(env, next, th);
6759         if (rc != 0)
6760                 RETURN(rc);
6761
6762         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
6763             CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
6764                 RETURN(0);
6765
6766         if (!lod_obj_is_striped(dt))
6767                 RETURN(0);
6768
6769         /* destroy all striped objects */
6770         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
6771                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6772                         stripe = lo->ldo_stripe[i];
6773                         if (!stripe)
6774                                 continue;
6775
6776                         if (!dt_object_exists(stripe))
6777                                 continue;
6778
6779                         if (!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
6780                             i == cfs_fail_val) {
6781                                 dt_write_lock(env, stripe, DT_TGT_CHILD);
6782                                 rc = lod_sub_ref_del(env, stripe, th);
6783                                 dt_write_unlock(env, stripe);
6784                                 if (rc != 0)
6785                                         break;
6786
6787                                 rc = lod_sub_destroy(env, stripe, th);
6788                                 if (rc != 0)
6789                                         break;
6790                         }
6791                 }
6792         } else {
6793                 struct lod_obj_stripe_cb_data data = { { 0 } };
6794
6795                 data.locd_declare = false;
6796                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
6797                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
6798         }
6799
6800         RETURN(rc);
6801 }
6802
/**
 * Implementation of dt_object_operations::do_declare_ref_add.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_add() in the API description
 * for details.
 */
static int lod_declare_ref_add(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_ref_add(env, child, th);
}
6814
/**
 * Implementation of dt_object_operations::do_ref_add.
 *
 * Forwards the reference increment to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_add() in the API description for details.
 */
static int lod_ref_add(const struct lu_env *env,
                       struct dt_object *dt, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_ref_add(env, child, th);
}
6825
/**
 * Implementation of dt_object_operations::do_declare_ref_del.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_del() in the API description
 * for details.
 */
static int lod_declare_ref_del(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_ref_del(env, child, th);
}
6837
/**
 * Implementation of dt_object_operations::do_ref_del
 *
 * Forwards the reference decrement to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_del() in the API description for details.
 */
static int lod_ref_del(const struct lu_env *env,
                       struct dt_object *dt, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_ref_del(env, child, th);
}
6848
6849 /**
6850  * Implementation of dt_object_operations::do_object_sync.
6851  *
6852  * \see dt_object_operations::do_object_sync() in the API description
6853  * for details.
6854  */
6855 static int lod_object_sync(const struct lu_env *env, struct dt_object *dt,
6856                            __u64 start, __u64 end)
6857 {
6858         return dt_object_sync(env, dt_object_child(dt), start, end);
6859 }
6860
6861 /**
6862  * Implementation of dt_object_operations::do_object_unlock.
6863  *
6864  * Used to release LDLM lock(s).
6865  *
6866  * \see dt_object_operations::do_object_unlock() in the API description
6867  * for details.
6868  */
6869 static int lod_object_unlock(const struct lu_env *env, struct dt_object *dt,
6870                              struct ldlm_enqueue_info *einfo,
6871                              union ldlm_policy_data *policy)
6872 {
6873         struct lod_object *lo = lod_dt_obj(dt);
6874         struct lustre_handle_array *slave_locks = einfo->ei_cbdata;
6875         int slave_locks_size;
6876         int i;
6877         ENTRY;
6878
6879         if (slave_locks == NULL)
6880                 RETURN(0);
6881
6882         LASSERT(S_ISDIR(dt->do_lu.lo_header->loh_attr));
6883         /* Note: for remote lock for single stripe dir, MDT will cancel
6884          * the lock by lockh directly */
6885         LASSERT(!dt_object_remote(dt_object_child(dt)));
6886
6887         /* locks were unlocked in MDT layer */
6888         for (i = 0; i < slave_locks->ha_count; i++)
6889                 LASSERT(!lustre_handle_is_used(&slave_locks->ha_handles[i]));
6890
6891         /*
6892          * NB, ha_count may not equal to ldo_dir_stripe_count, because dir
6893          * layout may change, e.g., shrink dir layout after migration.
6894          */
6895         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6896                 if (lo->ldo_stripe[i])
6897                         dt_invalidate(env, lo->ldo_stripe[i]);
6898         }
6899
6900         slave_locks_size = offsetof(typeof(*slave_locks),
6901                                     ha_handles[slave_locks->ha_count]);
6902         OBD_FREE(slave_locks, slave_locks_size);
6903         einfo->ei_cbdata = NULL;
6904
6905         RETURN(0);
6906 }
6907
6908 /**
6909  * Implementation of dt_object_operations::do_object_lock.
6910  *
6911  * Used to get LDLM lock on the non-striped and striped objects.
6912  *
6913  * \see dt_object_operations::do_object_lock() in the API description
6914  * for details.
6915  */
6916 static int lod_object_lock(const struct lu_env *env,
6917                            struct dt_object *dt,
6918                            struct lustre_handle *lh,
6919                            struct ldlm_enqueue_info *einfo,
6920                            union ldlm_policy_data *policy)
6921 {
6922         struct lod_object *lo = lod_dt_obj(dt);
6923         int slave_locks_size;
6924         struct lustre_handle_array *slave_locks = NULL;
6925         int i;
6926         int rc;
6927         ENTRY;
6928
6929         /* remote object lock */
6930         if (!einfo->ei_enq_slave) {
6931                 LASSERT(dt_object_remote(dt));
6932                 return dt_object_lock(env, dt_object_child(dt), lh, einfo,
6933                                       policy);
6934         }
6935
6936         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
6937                 RETURN(-ENOTDIR);
6938
6939         rc = lod_striping_load(env, lo);
6940         if (rc != 0)
6941                 RETURN(rc);
6942
6943         /* No stripes */
6944         if (lo->ldo_dir_stripe_count <= 1)
6945                 RETURN(0);
6946
6947         slave_locks_size = offsetof(typeof(*slave_locks),
6948                                     ha_handles[lo->ldo_dir_stripe_count]);
6949         /* Freed in lod_object_unlock */
6950         OBD_ALLOC(slave_locks, slave_locks_size);
6951         if (!slave_locks)
6952                 RETURN(-ENOMEM);
6953         slave_locks->ha_count = lo->ldo_dir_stripe_count;
6954
6955         /* striped directory lock */
6956         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6957                 struct lustre_handle lockh;
6958                 struct ldlm_res_id *res_id;
6959                 struct dt_object *stripe;
6960
6961                 stripe = lo->ldo_stripe[i];
6962                 if (!stripe)
6963                         continue;
6964
6965                 res_id = &lod_env_info(env)->lti_res_id;
6966                 fid_build_reg_res_name(lu_object_fid(&stripe->do_lu), res_id);
6967                 einfo->ei_res_id = res_id;
6968
6969                 if (dt_object_remote(stripe)) {
6970                         set_bit(i, (void *)slave_locks->ha_map);
6971                         rc = dt_object_lock(env, stripe, &lockh, einfo, policy);
6972                 } else {
6973                         struct ldlm_namespace *ns = einfo->ei_namespace;
6974                         ldlm_blocking_callback blocking = einfo->ei_cb_local_bl;
6975                         ldlm_completion_callback completion = einfo->ei_cb_cp;
6976                         __u64 dlmflags = LDLM_FL_ATOMIC_CB;
6977
6978                         LASSERT(ns != NULL);
6979                         rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_IBITS,
6980                                                     policy, einfo->ei_mode,
6981                                                     &dlmflags, blocking,
6982                                                     completion, NULL,
6983                                                     NULL, 0, LVB_T_NONE,
6984                                                     NULL, &lockh);
6985                 }
6986                 if (rc) {
6987                         while (i--)
6988                                 ldlm_lock_decref_and_cancel(
6989                                                 &slave_locks->ha_handles[i],
6990                                                 einfo->ei_mode);
6991                         OBD_FREE(slave_locks, slave_locks_size);
6992                         RETURN(rc);
6993                 }
6994                 slave_locks->ha_handles[i] = lockh;
6995         }
6996         einfo->ei_cbdata = slave_locks;
6997
6998         RETURN(0);
6999 }
7000
/**
 * Implementation of dt_object_operations::do_invalidate.
 *
 * Invalidates the child object of \a dt.
 *
 * \see dt_object_operations::do_invalidate() in the API description for details
 */
static int lod_invalidate(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_invalidate(env, child);
}
7010
7011 static int lod_declare_instantiate_components(const struct lu_env *env,
7012                                               struct lod_object *lo,
7013                                               struct thandle *th,
7014                                               __u64 reserve)
7015 {
7016         struct lod_thread_info *info = lod_env_info(env);
7017         int i;
7018         int rc = 0;
7019         ENTRY;
7020
7021         LASSERT(info->lti_count < lo->ldo_comp_cnt);
7022
7023         for (i = 0; i < info->lti_count; i++) {
7024                 rc = lod_qos_prep_create(env, lo, NULL, th,
7025                                          info->lti_comp_idx[i], reserve);
7026                 if (rc)
7027                         break;
7028         }
7029
7030         if (!rc) {
7031                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
7032                 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
7033                                 &info->lti_buf, XATTR_NAME_LOV, 0, th);
7034         }
7035
7036         RETURN(rc);
7037 }
7038
7039 /**
7040  * Check OSTs for an existing component for further extension
7041  *
7042  * Checks if OSTs are still healthy and not out of space.  Gets free space
7043  * on OSTs (relative to allocation watermark rmb_low) and compares to
7044  * the proposed new_end for this component.
7045  *
7046  * Decides whether or not to extend a component on its current OSTs.
7047  *
7048  * \param[in] env               execution environment for this thread
7049  * \param[in] lo                object we're checking
7050  * \param[in] index             index of this component
7051  * \param[in] extension_size    extension size for this component
7052  * \param[in] extent            layout extent for requested operation
7053  * \param[in] comp_extent       extension component extent
7054  * \param[in] write             if this is write operation
7055  *
7056  * \retval      true - OK to extend on current OSTs
7057  * \retval      false - do not extend on current OSTs
7058  */
7059 static bool lod_sel_osts_allowed(const struct lu_env *env,
7060                                  struct lod_object *lo,
7061                                  int index, __u64 reserve,
7062                                  struct lu_extent *extent,
7063                                  struct lu_extent *comp_extent, int write)
7064 {
7065         struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[index];
7066         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7067         struct lod_thread_info *tinfo = lod_env_info(env);
7068         struct obd_statfs *sfs = &tinfo->lti_osfs;
7069         __u64 available = 0;
7070         bool ret = true;
7071         int i, rc;
7072
7073         ENTRY;
7074
7075         LASSERT(lod_comp->llc_stripe_count != 0);
7076
7077         lod_getref(&lod->lod_ost_descs);
7078         for (i = 0; i < lod_comp->llc_stripe_count; i++) {
7079                 int index = lod_comp->llc_ost_indices[i];
7080                 struct lod_tgt_desc *ost = OST_TGT(lod, index);
7081                 struct obd_statfs_info info = { 0 };
7082                 int j, repeated = 0;
7083
7084                 LASSERT(ost);
7085
7086                 /* Get the number of times this OST repeats in this component.
7087                  * Note: inter-component repeats are not counted as this is
7088                  * considered as a rare case: we try to not repeat OST in other
7089                  * components if possible. */
7090                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
7091                         if (index != lod_comp->llc_ost_indices[j])
7092                                 continue;
7093
7094                         /* already handled */
7095                         if (j < i)
7096                                 break;
7097
7098                         repeated++;
7099                 }
7100                 if (j < lod_comp->llc_stripe_count)
7101                         continue;
7102
7103                 if (!test_bit(index, lod->lod_ost_bitmap)) {
7104                         CDEBUG(D_LAYOUT, "ost %d no longer present\n", index);
7105                         ret = false;
7106                         break;
7107                 }
7108
7109                 rc = dt_statfs_info(env, ost->ltd_tgt, sfs, &info);
7110                 if (rc) {
7111                         CDEBUG(D_LAYOUT, "statfs failed for ost %d, error %d\n",
7112                                index, rc);
7113                         ret = false;
7114                         break;
7115                 }
7116
7117                 if (sfs->os_state & OS_STATFS_ENOSPC ||
7118                     sfs->os_state & OS_STATFS_READONLY ||
7119                     sfs->os_state & OS_STATFS_NOCREATE ||
7120                     sfs->os_state & OS_STATFS_DEGRADED) {
7121                         CDEBUG(D_LAYOUT,
7122                                "OST%04x unusable for SEL extension, state %x\n",
7123                                index, sfs->os_state);
7124                         ret = false;
7125                         break;
7126                 }
7127
7128                 /* In bytes */
7129                 available = sfs->os_bavail * sfs->os_bsize;
7130                 /* 'available' is relative to the allocation threshold */
7131                 available -= (__u64) info.os_reserved_mb_low << 20;
7132
7133                 CDEBUG(D_LAYOUT, "ost %d lowwm: %d highwm: %d, "
7134                        "%llu %% blocks available, %llu %% blocks free\n",
7135                        index, info.os_reserved_mb_low, info.os_reserved_mb_high,
7136                        (100ull * sfs->os_bavail) / sfs->os_blocks,
7137                        (100ull * sfs->os_bfree) / sfs->os_blocks);
7138
7139                 if (reserve * repeated > available) {
7140                         ret = false;
7141                         CDEBUG(D_LAYOUT, "low space on ost %d, available %llu "
7142                                "< extension size %llu repeated %d\n", index,
7143                                available, reserve, repeated);
7144                         break;
7145                 }
7146         }
7147         lod_putref(lod, &lod->lod_ost_descs);
7148
7149         RETURN(ret);
7150 }
7151
7152 /**
7153  * Adjust extents after component removal
7154  *
7155  * When we remove an extension component, we move the start of the next
7156  * component to match the start of the extension component, so no space is left
7157  * without layout.
7158  *
7159  * \param[in] env       execution environment for this thread
7160  * \param[in] lo        object
7161  * \param[in] max_comp  layout component
7162  * \param[in] index     index of this component
7163  *
7164  * \retval              0 on success
7165  * \retval              negative errno on error
7166  */
7167 static void lod_sel_adjust_extents(const struct lu_env *env,
7168                                    struct lod_object *lo,
7169                                    int max_comp, int index)
7170 {
7171         struct lod_layout_component *lod_comp = NULL;
7172         struct lod_layout_component *next = NULL;
7173         struct lod_layout_component *prev = NULL;
7174         __u64 new_start = 0;
7175         __u64 start;
7176         int i;
7177
7178         /* Extension space component */
7179         lod_comp = &lo->ldo_comp_entries[index];
7180         next = &lo->ldo_comp_entries[index + 1];
7181         prev = &lo->ldo_comp_entries[index - 1];
7182
7183         LASSERT(lod_comp != NULL && prev != NULL && next != NULL);
7184         LASSERT(lod_comp->llc_flags & LCME_FL_EXTENSION);
7185
7186         /* Previous is being removed */
7187         if (prev && prev->llc_id == LCME_ID_INVAL)
7188                 new_start = prev->llc_extent.e_start;
7189         else
7190                 new_start = lod_comp->llc_extent.e_start;
7191
7192         for (i = index + 1; i < max_comp; i++) {
7193                 lod_comp = &lo->ldo_comp_entries[i];
7194
7195                 start = lod_comp->llc_extent.e_start;
7196                 lod_comp->llc_extent.e_start = new_start;
7197
7198                 /* We only move zero length extendable components */
7199                 if (!(start == lod_comp->llc_extent.e_end))
7200                         break;
7201
7202                 LASSERT(!(lod_comp->llc_flags & LCME_FL_INIT));
7203
7204                 lod_comp->llc_extent.e_end = new_start;
7205         }
7206 }
7207
7208 /* Calculate the proposed 'new end' for a component we're extending */
7209 static __u64 lod_extension_new_end(__u64 extension_size, __u64 extent_end,
7210                                    __u32 stripe_size, __u64 component_end,
7211                                    __u64 extension_end)
7212 {
7213         __u64 new_end;
7214
7215         LASSERT(extension_size != 0 && stripe_size != 0);
7216
7217         /* Round up to extension size */
7218         if (extent_end == OBD_OBJECT_EOF) {
7219                 new_end = OBD_OBJECT_EOF;
7220         } else {
7221                 /* Add at least extension_size to the previous component_end,
7222                  * covering the req layout extent */
7223                 new_end = max(extent_end - component_end, extension_size);
7224                 new_end = roundup(new_end, extension_size);
7225                 new_end += component_end;
7226
7227                 /* Component end must be min stripe size aligned */
7228                 if (new_end % stripe_size) {
7229                         CDEBUG(D_LAYOUT, "new component end is not aligned "
7230                                "by the stripe size %u: [%llu, %llu) ext size "
7231                                "%llu new end %llu, aligning\n",
7232                                stripe_size, component_end, extent_end,
7233                                extension_size, new_end);
7234                         new_end = roundup(new_end, stripe_size);
7235                 }
7236
7237                 /* Overflow */
7238                 if (new_end < extent_end)
7239                         new_end = OBD_OBJECT_EOF;
7240         }
7241
7242         /* Don't extend past the end of the extension component */
7243         if (new_end > extension_end)
7244                 new_end = extension_end;
7245
7246         return new_end;
7247 }
7248
7249 /**
7250  * Calculate the exact reservation (per-OST extension_size) on the OSTs being
7251  * instantiated. It needs to be calculated in advance and taken into account at
7252  * the instantiation time, because otherwise lod_statfs_and_check() may consider
7253  * an OST as OK, but SEL needs its extension_size to fit the free space and the
7254  * OST may turn out to be low-on-space, thus inappropriate OST may be used and
7255  * ENOSPC occurs.
7256  *
7257  * \param[in] lod_comp          lod component we are checking
7258  *
7259  * \retval      size to reserved on each OST of lod_comp's stripe.
7260  */
7261 static __u64 lod_sel_stripe_reserved(struct lod_layout_component *lod_comp)
7262 {
7263         /* extension_size is file level, so we must divide by stripe count to
7264          * compare it to available space on a single OST */
7265         return  lod_comp->llc_stripe_size * SEL_UNIT_SIZE /
7266                 lod_comp->llc_stripe_count;
7267 }
7268
/* As lod_sel_handler() could be re-entered for the same component several
 * times, this is the data for the next call. Fields could be changed to
 * component indexes when needed, (e.g. if there is no need to instantiate
 * all the previous components up to the current position) to tell the caller
 * where to start over from.
 *
 * lod_sel_handler() consumes both fields: it reads them and resets them to 0
 * on every call that gets past its initial validation. */
struct sel_data {
        /* non-zero: lod_sel_handler() skips the lod_sel_osts_allowed() check
         * on the component being extended */
        int sd_force;
        /* non-zero: -ENOSPC while instantiating the previous component is
         * treated by lod_sel_handler() as "cannot extend" instead of being
         * returned to the caller */
        int sd_repeat;
};
7278
7279 /**
7280  * Process extent updates for a particular layout component
7281  *
7282  * Handle layout updates for a particular extension space component touched by
7283  * a layout update operation.  Core function of self-extending PFL feature.
7284  *
7285  * In general, this function processes exactly *one* stage of an extension
7286  * operation, modifying the layout accordingly, then returns to the caller.
7287  * The caller is responsible for restarting processing with the new layout,
7288  * which may repeatedly return to this function until the extension updates
7289  * are complete.
7290  *
7291  * This function does one of a few things to the layout:
7292  * 1. Extends the component before the current extension space component to
 * allow it to accommodate the requested operation (if space/policy permit that
7294  * component to continue on its current OSTs)
7295  *
7296  * 2. If extension of the existing component fails, we do one of two things:
7297  *    a. If there is a component after the extension space, we remove the
7298  *       extension space component, move the start of the next component down
7299  *       accordingly, then notify the caller to restart processing w/the new
7300  *       layout.
7301  *    b. If there is no following component, we try repeating the current
7302  *       component, creating a new component using the current one as a
7303  *       template (keeping its stripe properties but not specific striping),
7304  *       and try assigning striping for this component.  If there is sufficient
7305  *       free space on the OSTs chosen for this component, it is instantiated
7306  *       and i/o continues there.
7307  *
7308  *       If there is not sufficient space on the new OSTs, we remove this new
7309  *       component & extend the current component.
7310  *
7311  * Note further that uninited components followed by extension space can be zero
7312  * length meaning that we will try to extend them before initializing them, and
7313  * if that fails, they will be removed without initialization.
7314  *
7315  * 3. If we extend to/beyond the end of an extension space component, that
7316  * component is exhausted (all of its range has been given to real components),
7317  * so we remove it and restart processing.
7318  *
7319  * \param[in] env               execution environment for this thread
7320  * \param[in,out] lo            object to update the layout of
7321  * \param[in] extent            layout extent for requested operation, update
7322  *                              layout to fit this operation
7323  * \param[in] th                transaction handle for this operation
7324  * \param[in,out] max_comp      the highest comp for the portion of the layout
7325  *                              we are operating on (For FLR, the chosen
7326  *                              replica).  Updated because we may remove
7327  *                              components.
7328  * \param[in] index             index of the extension space component we're
7329  *                              working on
7330  * \param[in] write             if this is write op
 * \param[in,out] sd            state carried between calls for the same
 *                              extension component (struct sel_data); a set
 *                              sd_force skips the OST check so the extension
 *                              is forced; sd_force and sd_repeat are both
 *                              reset to 0 by this function
7334  *
7335  * \retval      0 on success
7336  * \retval      negative errno on error
7337  */
7338 static int lod_sel_handler(const struct lu_env *env,
7339                           struct lod_object *lo,
7340                           struct lu_extent *extent,
7341                           struct thandle *th, int *max_comp,
7342                           int index, int write,
7343                           struct sel_data *sd)
7344 {
7345         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7346         struct lod_thread_info *info = lod_env_info(env);
7347         struct lod_layout_component *lod_comp;
7348         struct lod_layout_component *prev;
7349         struct lod_layout_component *next = NULL;
7350         __u64 extension_size, reserve;
7351         __u64 new_end = 0;
7352         bool repeated;
7353         int change = 0;
7354         int rc = 0;
7355         ENTRY;
7356
7357         /* First component cannot be extension space */
7358         if (index == 0) {
7359                 CERROR("%s: "DFID" first component cannot be extension space\n",
7360                        lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7361                 RETURN(-EINVAL);
7362         }
7363
7364         lod_comp = &lo->ldo_comp_entries[index];
7365         prev = &lo->ldo_comp_entries[index - 1];
7366         if ((index + 1) < *max_comp)
7367                 next = &lo->ldo_comp_entries[index + 1];
7368
7369         /* extension size uses the stripe size field as KiB */
7370         extension_size = lod_comp->llc_stripe_size * SEL_UNIT_SIZE;
7371
7372         CDEBUG(D_LAYOUT, "prev start %llu, extension start %llu, extension end"
7373                " %llu, extension size %llu\n", prev->llc_extent.e_start,
7374                lod_comp->llc_extent.e_start, lod_comp->llc_extent.e_end,
7375                extension_size);
7376
7377         /* Two extension space components cannot be adjacent & extension space
7378          * components cannot be init */
7379         if ((prev->llc_flags & LCME_FL_EXTENSION) ||
7380             !(ergo(next, !(next->llc_flags & LCME_FL_EXTENSION))) ||
7381              lod_comp_inited(lod_comp)) {
7382                 CERROR("%s: "DFID" invalid extension space components\n",
7383                        lod2obd(d)->obd_name, PFID(lod_object_fid(lo)));
7384                 RETURN(-EINVAL);
7385         }
7386
7387         reserve = lod_sel_stripe_reserved(lod_comp);
7388
7389         if (!prev->llc_stripe) {
7390                 CDEBUG(D_LAYOUT, "Previous component not inited\n");
7391                 info->lti_count = 1;
7392                 info->lti_comp_idx[0] = index - 1;
7393                 rc = lod_declare_instantiate_components(env, lo, th, reserve);
7394                 /* ENOSPC tells us we can't use this component.  If there is
7395                  * a next or we are repeating, we either spill over (next) or
7396                  * extend the original comp (repeat).  Otherwise, return the
7397                  * error to the user. */
7398                 if (rc == -ENOSPC && (next || sd->sd_repeat))
7399                         rc = 1;
7400                 if (rc < 0)
7401                         RETURN(rc);
7402         }
7403
7404         if (sd->sd_force == 0 && rc == 0)
7405                 rc = !lod_sel_osts_allowed(env, lo, index - 1, reserve, extent,
7406                                            &lod_comp->llc_extent, write);
7407
7408         repeated = !!(sd->sd_repeat);
7409         sd->sd_repeat = 0;
7410         sd->sd_force = 0;
7411
7412         /* Extend previous component */
7413         if (rc == 0) {
7414                 new_end = lod_extension_new_end(extension_size, extent->e_end,
7415                                                 prev->llc_stripe_size,
7416                                                 prev->llc_extent.e_end,
7417                                                 lod_comp->llc_extent.e_end);
7418
7419                 CDEBUG(D_LAYOUT, "new end %llu\n", new_end);
7420                 lod_comp->llc_extent.e_start = new_end;
7421                 prev->llc_extent.e_end = new_end;
7422
7423                 if (prev->llc_extent.e_end == lod_comp->llc_extent.e_end) {
7424                         CDEBUG(D_LAYOUT, "Extension component exhausted\n");
7425                         lod_comp->llc_id = LCME_ID_INVAL;
7426                         change--;
7427                 }
7428         } else {
7429                 /* rc == 1, failed to extend current component */
7430                 LASSERT(rc == 1);
7431                 if (next) {
7432                         /* Normal 'spillover' case - Remove the extension
7433                          * space component & bring down the start of the next
7434                          * component. */
7435                         lod_comp->llc_id = LCME_ID_INVAL;
7436                         change--;
7437                         if (!(prev->llc_flags & LCME_FL_INIT)) {
7438                                 prev->llc_id = LCME_ID_INVAL;
7439                                 change--;
7440                         }
7441                         lod_sel_adjust_extents(env, lo, *max_comp, index);
7442                 } else if (lod_comp_inited(prev)) {
7443                         /* If there is no next, and the previous component is
7444                          * INIT'ed, try repeating the previous component. */
7445                         LASSERT(repeated == 0);
7446                         rc = lod_layout_repeat_comp(env, lo, index - 1);
7447                         if (rc < 0)
7448                                 RETURN(rc);
7449                         change++;
7450                         /* The previous component is a repeated component.
7451                          * Record this so we don't keep trying to repeat it. */
7452                         sd->sd_repeat = 1;
7453                 } else {
7454                         /* If the previous component is not INIT'ed, this may
7455                          * be a component we have just instantiated but failed
7456                          * to extend. Or even a repeated component we failed
7457                          * to prepare a striping for. Do not repeat but instead
7458                          * remove the repeated component & force the extension
7459                          * of the original one */
7460                         sd->sd_force = 1;
7461                         if (repeated) {
7462                                 prev->llc_id = LCME_ID_INVAL;
7463                                 change--;
7464                         }
7465                 }
7466         }
7467
7468         if (change < 0) {
7469                 rc = lod_layout_del_prep_layout(env, lo, NULL);
7470                 if (rc < 0)
7471                         RETURN(rc);
7472                 LASSERTF(-rc == change,
7473                          "number deleted %d != requested %d\n", -rc,
7474                          change);
7475         }
7476         *max_comp = *max_comp + change;
7477
7478         /* lod_del_prep_layout reallocates ldo_comp_entries, so we must
7479          * refresh these pointers before using them */
7480         lod_comp = &lo->ldo_comp_entries[index];
7481         prev = &lo->ldo_comp_entries[index - 1];
7482         CDEBUG(D_LAYOUT, "After extent updates: prev start %llu, current start "
7483                "%llu, current end %llu max_comp %d ldo_comp_cnt %d\n",
7484                prev->llc_extent.e_start, lod_comp->llc_extent.e_start,
7485                lod_comp->llc_extent.e_end, *max_comp, lo->ldo_comp_cnt);
7486
7487         /* Layout changed successfully */
7488         RETURN(0);
7489 }
7490
7491 /**
7492  * Declare layout extent updates
7493  *
7494  * Handles extensions.  Identifies extension components touched by current
7495  * operation and passes them to processing function.
7496  *
7497  * Restarts with updated layouts from the processing function until the current
7498  * operation no longer touches an extension space component.
7499  *
7500  * \param[in] env       execution environment for this thread
7501  * \param[in,out] lo    object to update the layout of
7502  * \param[in] extent    layout extent for requested operation, update layout to
7503  *                      fit this operation
7504  * \param[in] th        transaction handle for this operation
7505  * \param[in] pick      identifies chosen mirror for FLR layouts
7506  * \param[in] write     if this is write op
7507  *
7508  * \retval      1 on layout changed, 0 on no change
7509  * \retval      negative errno on error
7510  */
7511 static int lod_declare_update_extents(const struct lu_env *env,
7512                 struct lod_object *lo, struct lu_extent *extent,
7513                 struct thandle *th, int pick, int write)
7514 {
7515         struct lod_thread_info *info = lod_env_info(env);
7516         struct lod_layout_component *lod_comp;
7517         bool layout_changed = false;
7518         struct sel_data sd = { 0 };
7519         int start_index;
7520         int i = 0;
7521         int max_comp = 0;
7522         int rc = 0, rc2;
7523         int change = 0;
7524         ENTRY;
7525
7526         /* This makes us work on the components of the chosen mirror */
7527         if (lo->ldo_mirrors) {
7528                 start_index = lo->ldo_mirrors[pick].lme_start;
7529                 max_comp = lo->ldo_mirrors[pick].lme_end + 1;
7530         } else {
7531                 start_index = 0;
7532                 max_comp = lo->ldo_comp_cnt;
7533         }
7534         if (lo->ldo_flr_state == LCM_FL_NONE)
7535                 LASSERT(start_index == 0 && max_comp == lo->ldo_comp_cnt);
7536
7537         CDEBUG(D_LAYOUT, "extent->e_start %llu, extent->e_end %llu\n",
7538                extent->e_start, extent->e_end);
7539         for (i = start_index; i < max_comp; i++) {
7540                 lod_comp = &lo->ldo_comp_entries[i];
7541
7542                 /* We've passed all components of interest */
7543                 if (lod_comp->llc_extent.e_start >= extent->e_end)
7544                         break;
7545
7546                 if (lod_comp->llc_flags & LCME_FL_EXTENSION) {
7547                         layout_changed = true;
7548                         rc = lod_sel_handler(env, lo, extent, th, &max_comp,
7549                                              i, write, &sd);
7550                         if (rc < 0)
7551                                 GOTO(out, rc);
7552
7553                         /* Nothing has changed behind the prev one */
7554                         i -= 2;
7555                         continue;
7556                 }
7557         }
7558
7559         /* We may have added or removed components.  If so, we must update the
7560          * start & ends of all the mirrors after the current one, and the end
7561          * of the current mirror. */
7562         if (lo->ldo_mirrors) {
7563                 change = max_comp - 1 - lo->ldo_mirrors[pick].lme_end;
7564                 if (change) {
7565                         lo->ldo_mirrors[pick].lme_end += change;
7566                         for (i = pick + 1; i < lo->ldo_mirror_count; i++) {
7567                                 lo->ldo_mirrors[i].lme_start += change;
7568                                 lo->ldo_mirrors[i].lme_end += change;
7569                         }
7570                 }
7571         }
7572
7573         EXIT;
7574 out:
7575         /* The amount of components has changed, adjust the lti_comp_idx */
7576         rc2 = lod_layout_data_init(info, lo->ldo_comp_cnt);
7577
7578         return rc < 0 ? rc : rc2 < 0 ? rc2 : layout_changed;
7579 }
7580
7581 /* If striping is already instantiated or INIT'ed DOM? */
7582 static bool lod_is_instantiation_needed(struct lod_layout_component *comp)
7583 {
7584         if (comp->llc_magic == LOV_MAGIC_FOREIGN)
7585                 return false;
7586
7587         return !(((lov_pattern(comp->llc_pattern) & LOV_PATTERN_MDT) &&
7588                   lod_comp_inited(comp)) || comp->llc_stripe);
7589 }
7590
7591 /**
7592  * Declare layout update for a non-FLR layout.
 *
 * Loads the striping (from \a buf in the replay case, otherwise via
 * lod_striping_load()), checks that the defined layout covers the requested
 * range, lets lod_declare_update_extents() process extension components
 * (non-replay only), collects into info->lti_comp_idx the components below
 * the end of the requested range that still need instantiation, and declares
 * their instantiation.
7593  *
7594  * \param[in] env       execution environment for this thread
7595  * \param[in,out] lo    object to update the layout of
7596  * \param[in] layout    layout intent for requested operation, "update" is
7597  *                      a process of reacting to this
7598  * \param[in] buf       buffer containing lov ea (see comment on usage inline)
7599  * \param[in] th        transaction handle for this operation
7600  *
7601  * \retval      0 on success
 * \retval      -EALREADY if neither the extents nor any component changed
 * \retval      -EINVAL if the replay buffer is not a composite layout, the
 *              layout does not cover the requested range, or a released
 *              component would have to be instantiated
7602  * \retval      negative errno on error
7603  */
7604 static int lod_declare_update_plain(const struct lu_env *env,
7605                 struct lod_object *lo, struct layout_intent *layout,
7606                 const struct lu_buf *buf, struct thandle *th)
7607 {
7608         struct lod_thread_info *info = lod_env_info(env);
7609         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7610         struct lod_layout_component *lod_comp;
7611         struct lov_comp_md_v1 *comp_v1 = NULL;
7612         bool layout_changed = false;
7613         bool replay = false;
7614         int i, rc;
7615         ENTRY;
7616
7617         LASSERT(lo->ldo_flr_state == LCM_FL_NONE);
7618
7619         /*
7620          * In case the client is passing lovea, which only happens during
7621          * the replay of layout intent write RPC for now, we may need to
7622          * parse the lovea and apply new layout configuration.
7623          */
7624         if (buf && buf->lb_len)  {
7625                 struct lov_user_md_v1 *v1 = buf->lb_buf;
7626
                /* accept the expected magic in either byte order */
7627                 if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
7628                     v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
7629                                               LOV_MAGIC_COMP_V1)) {
7630                         CERROR("%s: the replay buffer of layout extend "
7631                                "(magic %#x) does not contain expected "
7632                                "composite layout.\n",
7633                                lod2obd(d)->obd_name, v1->lmm_magic);
7634                         GOTO(out, rc = -EINVAL);
7635                 }
7636
7637                 rc = lod_use_defined_striping(env, lo, buf);
7638                 if (rc)
7639                         GOTO(out, rc);
7640                 lo->ldo_comp_cached = 1;
7641
                /* NOTE(review): rc == 0 also leaves here, so the function
                 * returns 0 without declaring anything; presumably this
                 * means there is no on-disk EA - confirm against
                 * lod_get_lov_ea() */
7642                 rc = lod_get_lov_ea(env, lo);
7643                 if (rc <= 0)
7644                         GOTO(out, rc);
7645                 /* old on-disk EA is stored in info->lti_buf */
7646                 comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
7647                 replay = true;
7648                 layout_changed = true;
7649
7650                 rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
7651                 if (rc)
7652                         GOTO(out, rc);
7653         } else {
7654                 /* non replay path */
7655                 rc = lod_striping_load(env, lo);
7656                 if (rc)
7657                         GOTO(out, rc);
7658         }
7659
7660         /* Make sure defined layout covers the requested write range. */
7661         lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
7662         if (lo->ldo_comp_cnt > 1 &&
7663             lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
7664             lod_comp->llc_extent.e_end < layout->lai_extent.e_end) {
7665                 CDEBUG_LIMIT(replay ? D_ERROR : D_LAYOUT,
7666                              "%s: the defined layout [0, %#llx) does not "
7667                              "covers the write range "DEXT"\n",
7668                              lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
7669                              PEXT(&layout->lai_extent));
7670                 GOTO(out, rc = -EINVAL);
7671         }
7672
7673         CDEBUG(D_LAYOUT, "%s: "DFID": update components "DEXT"\n",
7674                lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
7675                PEXT(&layout->lai_extent));
7676
        /* extension components are only processed outside of replay */
7677         if (!replay) {
7678                 rc = lod_declare_update_extents(env, lo, &layout->lai_extent,
7679                                 th, 0, layout->lai_opc == LAYOUT_INTENT_WRITE);
7680                 if (rc < 0)
7681                         GOTO(out, rc);
7682                 else if (rc)
7683                         layout_changed = true;
7684         }
7685
7686         /*
7687          * Iterate lo->ldo_comp_entries, find the components whose extent
7688          * starts below the end of the write range and are not instantiated.
7689          */
7690         for (i = 0; i < lo->ldo_comp_cnt; i++) {
7691                 lod_comp = &lo->ldo_comp_entries[i];
7692
7693                 if (lod_comp->llc_extent.e_start >= layout->lai_extent.e_end)
7694                         break;
7695
7696                 if (!replay) {
7697                         /* If striping is instantiated or INIT'ed DOM skip */
7698                         if (!lod_is_instantiation_needed(lod_comp))
7699                                 continue;
7700                 } else {
7701                         /**
7702                          * In replay path, lod_comp is the EA passed by
7703                          * client replay buffer,  comp_v1 is the pre-recovery
7704                          * on-disk EA, we'd sift out those components which
7705                          * were init-ed in the on-disk EA.
7706                          */
7707                         if (le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags) &
7708                             LCME_FL_INIT)
7709                                 continue;
7710                 }
7711                 /*
7712                  * this component hasn't instantiated in normal path, or during
7713                  * replay it needs replay the instantiation.
7714                  */
7715
7716                 /* A released component is being extended */
7717                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
7718                         GOTO(out, rc = -EINVAL);
7719
                /* remember the component index for the instantiation below */
7720                 LASSERT(info->lti_comp_idx != NULL);
7721                 info->lti_comp_idx[info->lti_count++] = i;
7722                 layout_changed = true;
7723         }
7724
        /* This return does not pass through the out: label, so the loaded
         * striping is not freed on -EALREADY. */
7725         if (!layout_changed)
7726                 RETURN(-EALREADY);
7727
7728         lod_obj_inc_layout_gen(lo);
7729         rc = lod_declare_instantiate_components(env, lo, th, 0);
7730         EXIT;
7731 out:
        /* any non-zero rc drops the striping held by the object */
7732         if (rc)
7733                 lod_striping_free(env, lo);
7734         return rc;
7735 }
7736
7737 static inline int lod_comp_index(struct lod_object *lo,
7738                                  struct lod_layout_component *lod_comp)
7739 {
7740         LASSERT(lod_comp >= lo->ldo_comp_entries &&
7741                 lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
7742
7743         return lod_comp - lo->ldo_comp_entries;
7744 }
7745
7746 /**
7747  * Stale other mirrors by writing extent.
7748  */
7749 static int lod_stale_components(const struct lu_env *env, struct lod_object *lo,
7750                                 int primary, struct lu_extent *extent,
7751                                 struct thandle *th)
7752 {
7753         struct lod_layout_component *pri_comp, *lod_comp;
7754         struct lu_extent pri_extent;
7755         int rc = 0;
7756         int i;
7757         ENTRY;
7758
7759         /* The writing extent decides which components in the primary
7760          * are affected... */
7761         CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
7762
7763 restart:
7764         lod_foreach_mirror_comp(pri_comp, lo, primary) {
7765                 if (!lu_extent_is_overlapped(extent, &pri_comp->llc_extent))
7766                         continue;
7767
7768                 CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
7769                        lod_comp_index(lo, pri_comp),
7770                        PEXT(&pri_comp->llc_extent));
7771
7772                 pri_extent.e_start = pri_comp->llc_extent.e_start;
7773                 pri_extent.e_end = pri_comp->llc_extent.e_end;
7774
7775                 for (i = 0; i < lo->ldo_mirror_count; i++) {
7776                         if (i == primary)
7777                                 continue;
7778
7779                         rc = lod_declare_update_extents(env, lo, &pri_extent,
7780                                                         th, i, 0);
7781                         /* if update_extents changed the layout, it may have
7782                          * reallocated the component array, so start over to
7783                          * avoid using stale pointers */
7784                         if (rc == 1)
7785                                 goto restart;
7786                         if (rc < 0)
7787                                 RETURN(rc);
7788
7789                         /* ... and then stale other components that are
7790                          * overlapping with primary components */
7791                         lod_foreach_mirror_comp(lod_comp, lo, i) {
7792                                 if (!lu_extent_is_overlapped(
7793                                                         &pri_extent,
7794                                                         &lod_comp->llc_extent))
7795                                         continue;
7796
7797                                 CDEBUG(D_LAYOUT, "stale: %u / %u\n",
7798                                       i, lod_comp_index(lo, lod_comp));
7799
7800                                 lod_comp->llc_flags |= LCME_FL_STALE;
7801                                 lo->ldo_mirrors[i].lme_stale = 1;
7802                                 if (lod_is_hsm(lod_comp))
7803                                         lod_comp->llc_foreign_flags |= HS_DIRTY;
7804                         }
7805                 }
7806         }
7807
7808         RETURN(rc);
7809 }
7810
7811 /**
7812  * check an OST's availability
7813  * \param[in] env       execution environment
7814  * \param[in] lo        lod object
7815  * \param[in] dt        dt object
7816  * \param[in] index     mirror index
7817  *
7818  * \retval      negative if failed
7819  * \retval      1 if \a dt is available
7820  * \retval      0 if \a dt is not available
7821  */
7822 static inline int lod_check_ost_avail(const struct lu_env *env,
7823                                       struct lod_object *lo,
7824                                       struct dt_object *dt, int index)
7825 {
7826         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7827         struct lod_tgt_desc *ost;
7828         __u32 idx;
7829         int type = LU_SEQ_RANGE_OST;
7830         int rc;
7831
7832         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
7833         if (rc < 0) {
7834                 CERROR("%s: can't locate "DFID":rc = %d\n",
7835                        lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
7836                        rc);
7837                 return rc;
7838         }
7839
7840         ost = OST_TGT(lod, idx);
7841         if (ost->ltd_active == 0) {
7842                 CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail\n",
7843                        PFID(lod_object_fid(lo)), index, idx);
7844                 return 0;
7845         }
7846
7847         return 1;
7848 }
7849
7850 /**
7851  * Pick primary mirror for write
7852  * \param[in] env       execution environment
7853  * \param[in] lo        object
7854  * \param[in] extent    write range
7855  */
7856 static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
7857                             struct lu_extent *extent)
7858 {
7859         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
7860         unsigned int seq = 0;
7861         struct lod_layout_component *lod_comp;
7862         int i, j, rc;
7863         int picked = -1, second_pick = -1, third_pick = -1;
7864         ENTRY;
7865
7866         if (CFS_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
7867                 get_random_bytes(&seq, sizeof(seq));
7868                 seq %= lo->ldo_mirror_count;
7869         }
7870
7871         /**
7872          * Pick a mirror as the primary, and check the availability of OSTs.
7873          *
7874          * This algo can be revised later after knowing the topology of
7875          * cluster.
7876          */
7877         lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
7878
7879         rc = lod_fill_mirrors(lo);
7880         if (rc)
7881                 RETURN(rc);
7882
7883         for (i = 0; i < lo->ldo_mirror_count; i++) {
7884                 bool ost_avail = true;
7885                 int index = (i + seq) % lo->ldo_mirror_count;
7886
7887                 if (lo->ldo_mirrors[index].lme_stale) {
7888                         CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
7889                                PFID(lod_object_fid(lo)), index);
7890                         continue;
7891                 }
7892
7893                 /* 2nd pick is for the primary mirror containing unavail OST */
7894                 if (lo->ldo_mirrors[index].lme_prefer && second_pick < 0)
7895                         second_pick = index;
7896
7897                 /* 3rd pick is for non-primary mirror containing unavail OST */
7898                 if (second_pick < 0 && third_pick < 0)
7899                         third_pick = index;
7900
7901                 /**
7902                  * we found a non-primary 1st pick, we'd like to find a
7903                  * potential pirmary mirror.
7904                  */
7905                 if (picked >= 0 && !lo->ldo_mirrors[index].lme_prefer)
7906                         continue;
7907
7908                 /* check the availability of OSTs */
7909                 lod_foreach_mirror_comp(lod_comp, lo, index) {
7910                         if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
7911                                 continue;
7912
7913                         for (j = 0; j < lod_comp->llc_stripe_count; j++) {
7914                                 struct dt_object *dt = lod_comp->llc_stripe[j];
7915
7916                                 rc = lod_check_ost_avail(env, lo, dt, index);
7917                                 if (rc < 0)
7918                                         RETURN(rc);
7919
7920                                 ost_avail = !!rc;
7921                                 if (!ost_avail)
7922                                         break;
7923                         } /* for all dt object in one component */
7924                         if (!ost_avail)
7925                                 break;
7926                 } /* for all components in a mirror */
7927
7928                 /**
7929                  * the OSTs where allocated objects locates in the components
7930                  * of the mirror are available.
7931                  */
7932                 if (!ost_avail)
7933                         continue;
7934
7935                 /* this mirror has all OSTs available */
7936                 picked = index;
7937
7938                 /**
7939                  * primary with all OSTs are available, this is the perfect
7940                  * 1st pick.
7941                  */
7942                 if (lo->ldo_mirrors[index].lme_prefer)
7943                         break;
7944         } /* for all mirrors */
7945
7946         /* failed to pick a sound mirror, lower our expectation */
7947         if (picked < 0)
7948                 picked = second_pick;
7949         if (picked < 0)
7950                 picked = third_pick;
7951         if (picked < 0)
7952                 RETURN(-ENODATA);
7953
7954         RETURN(picked);
7955 }
7956
7957 static int lod_prepare_resync_mirror(const struct lu_env *env,
7958                                      struct lod_object *lo,
7959                                      __u16 mirror_id)
7960 {
7961         struct lod_thread_info *info = lod_env_info(env);
7962         struct lod_layout_component *lod_comp;
7963         bool neg = !!(MIRROR_ID_NEG & mirror_id);
7964         int i;
7965
7966         mirror_id &= ~MIRROR_ID_NEG;
7967
7968         for (i = 0; i < lo->ldo_mirror_count; i++) {
7969                 if ((!neg && lo->ldo_mirrors[i].lme_id != mirror_id) ||
7970                     (neg && lo->ldo_mirrors[i].lme_id == mirror_id))
7971                         continue;
7972
7973                 lod_foreach_mirror_comp(lod_comp, lo, i) {
7974                         if (lod_comp_inited(lod_comp))
7975                                 continue;
7976
7977                         info->lti_comp_idx[info->lti_count++] =
7978                                 lod_comp_index(lo, lod_comp);
7979                 }
7980         }
7981
7982         return 0;
7983 }
7984
7985 /**
7986  * figure out the components should be instantiated for resync.
7987  */
7988 static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
7989                               struct lu_extent *extent)
7990 {
7991         struct lod_thread_info *info = lod_env_info(env);
7992         struct lod_layout_component *lod_comp;
7993         unsigned int need_sync = 0;
7994         int i;
7995
7996         CDEBUG(D_LAYOUT,
7997                DFID": instantiate all stale components in "DEXT"\n",
7998                PFID(lod_object_fid(lo)), PEXT(extent));
7999
8000         /**
8001          * instantiate all components within this extent, even non-stale
8002          * components.
8003          */
8004         for (i = 0; i < lo->ldo_mirror_count; i++) {
8005                 if (!lo->ldo_mirrors[i].lme_stale)
8006                         continue;
8007
8008                 lod_foreach_mirror_comp(lod_comp, lo, i) {
8009                         if (!lu_extent_is_overlapped(extent,
8010                                                 &lod_comp->llc_extent))
8011                                 break;
8012
8013                         need_sync++;
8014
8015                         if (lod_comp_inited(lod_comp))
8016                                 continue;
8017
8018                         CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
8019                                i, lod_comp_index(lo, lod_comp));
8020                         info->lti_comp_idx[info->lti_count++] =
8021                                         lod_comp_index(lo, lod_comp);
8022                 }
8023         }
8024
8025         return need_sync ? 0 : -EALREADY;
8026 }
8027
8028 static struct lod_layout_component *
8029 lod_locate_comp_hsm(struct lod_object *lo, int *hsm_mirror_id)
8030 {
8031         struct lod_layout_component *lod_comp = NULL;
8032         int i;
8033
8034         if (!lo->ldo_is_composite)
8035                 return NULL;
8036
8037         for (i = 0; i < lo->ldo_mirror_count; i++) {
8038                 /*
8039                  * FIXME: In the current design, there is only one HSM
8040                  * mirror component in range [0, EOF] for a FLR file. This
8041                  * should be fixed to support multiple HSM mirror components
8042                  * with different HSM backend types and partial file ranges
8043                  * in the future.
8044                  */
8045                 if (lo->ldo_mirrors[i].lme_hsm) {
8046                         __u16 start_idx;
8047                         __u16 end_idx;
8048
8049                         if (hsm_mirror_id)
8050                                 *hsm_mirror_id = i;
8051                         start_idx = lo->ldo_mirrors[i].lme_start;
8052                         end_idx = lo->ldo_mirrors[i].lme_end;
8053                         LASSERT(start_idx == end_idx);
8054                         lod_comp = &lo->ldo_comp_entries[start_idx];
8055                         LASSERT(lo->ldo_is_composite && lod_is_hsm(lod_comp) &&
8056                                 lod_comp->llc_extent.e_start == 0 &&
8057                                 lod_comp->llc_extent.e_end == LUSTRE_EOF);
8058                         break;
8059                 }
8060         }
8061
8062         return lod_comp;
8063 }
8064
8065 static int lod_declare_pccro_set(const struct lu_env *env,
8066                                  struct dt_object *dt, struct thandle *th)
8067 {
8068         struct lod_thread_info *info = lod_env_info(env);
8069         struct lu_buf *buf = &info->lti_buf;
8070         struct lod_object *lo = lod_dt_obj(dt);
8071         struct lod_layout_component *lod_comp;
8072         struct lod_layout_component *comp_array;
8073         struct lod_mirror_entry *mirror_array;
8074         __u16 mirror_id;
8075         int hsm_mirror_id;
8076         int mirror_cnt;
8077         int new_cnt;
8078         int rc;
8079         int i;
8080
8081         ENTRY;
8082
8083         rc = lod_striping_load(env, lo);
8084         if (rc)
8085                 RETURN(rc);
8086
8087         if (lo->ldo_flr_state & LCM_FL_PCC_RDONLY)
8088                 RETURN(-EALREADY);
8089
8090         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
8091         if (rc)
8092                 RETURN(rc);
8093
8094         lod_comp = lod_locate_comp_hsm(lo, &hsm_mirror_id);
8095         if (lod_comp) {
8096                 if (lod_comp->llc_foreign_flags & HS_PCCRO) {
8097                         CDEBUG(D_LAYOUT, "bad HSM flags: %#x\n",
8098                                lod_comp->llc_foreign_flags);
8099                         RETURN(-EINVAL);
8100                 }
8101
8102                 lod_obj_inc_layout_gen(lo);
8103                 lod_comp->llc_foreign_flags |= HS_PCCRO;
8104                 lod_comp->llc_foreign_flags &= ~HS_DIRTY;
8105                 lod_comp->llc_flags &= ~LCME_FL_STALE;
8106                 lo->ldo_mirrors[hsm_mirror_id].lme_stale = 0;
8107                 lo->ldo_flr_state |= LCM_FL_PCC_RDONLY;
8108                 buf->lb_len = lod_comp_md_size(lo, false);
8109                 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8110                                                buf, XATTR_NAME_LOV, 0, th);
8111                 RETURN(rc);
8112         }
8113
8114         /*
8115          * Create a new composite layout with only one HSM component.
8116          * Field @lhm_archive_uuid is used to be the identifier within HSM
8117          * backend for the archive copy. In the PCC case with a POSIX archive,
8118          * This can just be the original inode FID. This is important because
8119          * the inode FID may change due to layout swaps or migration to a new
8120          * MDT, and we do not want that to cause problems with finding the copy
8121          * in HSM/PCC.
8122          */
8123         mirror_cnt = lo->ldo_mirror_count + 1;
8124         if (!lo->ldo_is_composite) {
8125                 LASSERT(lo->ldo_mirror_count == 0);
8126                 mirror_cnt++;
8127         }
8128
8129         OBD_ALLOC_PTR_ARRAY(mirror_array, mirror_cnt);
8130         if (mirror_array == NULL)
8131                 RETURN(-ENOMEM);
8132
8133         new_cnt = lo->ldo_comp_cnt + 1;
8134         OBD_ALLOC_PTR_ARRAY(comp_array, new_cnt);
8135         if (comp_array == NULL) {
8136                 OBD_FREE_PTR_ARRAY(mirror_array, mirror_cnt);
8137                 RETURN(-ENOMEM);
8138         }
8139
8140         mirror_id = 0;
8141         for (i = 0; i < lo->ldo_comp_cnt; i++) {
8142                 lod_comp = &lo->ldo_comp_entries[i];
8143
8144                 /*
8145                  * Add mirror from a non-flr file, create new mirror ID.
8146                  * Otherwise, keep existing mirror's component ID, used
8147                  * for mirror extension.
8148                  */
8149                 if (lo->ldo_mirror_count == 0 &&
8150                     mirror_id_of(lod_comp->llc_id) == 0)
8151                         lod_comp->llc_id = pflr_id(1, i + 1);
8152
8153                 if (lod_comp->llc_id != LCME_ID_INVAL &&
8154                     mirror_id_of(lod_comp->llc_id) > mirror_id)
8155                         mirror_id = mirror_id_of(lod_comp->llc_id);
8156
8157                 if (!lo->ldo_is_composite) {
8158                         lod_comp->llc_extent.e_start = 0;
8159                         lod_comp->llc_extent.e_end = LUSTRE_EOF;
8160                         lod_comp_set_init(lod_comp);
8161                 }
8162         }
8163
8164         memcpy(comp_array, lo->ldo_comp_entries,
8165                sizeof(*comp_array) * lo->ldo_comp_cnt);
8166
8167         lod_comp = &comp_array[new_cnt - 1];
8168         lod_comp->llc_magic = LOV_MAGIC_FOREIGN;
8169         lod_comp->llc_extent.e_start = 0;
8170         lod_comp->llc_extent.e_end = LUSTRE_EOF;
8171         lod_comp->llc_length = sizeof(struct lov_hsm_base);
8172         lod_comp->llc_type = LU_FOREIGN_TYPE_PCCRO;
8173         lod_comp->llc_foreign_flags = HS_EXISTS | HS_ARCHIVED | HS_PCCRO;
8174         memset(&lod_comp->llc_hsm, 0, sizeof(lod_comp->llc_hsm));
8175
8176         if (lo->ldo_mirrors)
8177                 OBD_FREE_PTR_ARRAY(lo->ldo_mirrors, lo->ldo_mirror_count);
8178         OBD_FREE_PTR_ARRAY(lo->ldo_comp_entries, lo->ldo_comp_cnt);
8179
8180         /*
8181          * The @ldo_mirror will be refilled by lod_fill_mirrors() when
8182          * call lod_striped_create() for layout change.
8183          */
8184         lo->ldo_mirrors = mirror_array;
8185         lo->ldo_mirror_count = mirror_cnt;
8186         lo->ldo_comp_entries = comp_array;
8187         lo->ldo_comp_cnt = new_cnt;
8188         lo->ldo_is_composite = 1;
8189
8190         ++mirror_id;
8191         lod_comp->llc_id = LCME_ID_INVAL;
8192         lod_comp->llc_id = lod_gen_component_id(lo, mirror_id, new_cnt - 1);
8193
8194         if (lo->ldo_flr_state == LCM_FL_NONE)
8195                 lo->ldo_flr_state = LCM_FL_RDONLY;
8196         lo->ldo_flr_state |= LCM_FL_PCC_RDONLY;
8197         buf->lb_len = lod_comp_md_size(lo, false);
8198         rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8199                                        buf, XATTR_NAME_LOV, 0, th);
8200         if (rc)
8201                 lod_striping_free(env, lo);
8202
8203         RETURN(rc);
8204 }
8205
8206 /*
8207  * TODO: When clear LCM_FL_PCC_RDONLY flag from the layouts, it means the file
8208  * is going to be modified. Currently it needs two RPCs: first one is to clear
8209  * LCM_FL_PCC_RDONLY flag; the second one is to pick primary mirror and mark
8210  * the file as LCM_FL_WRITE_PENDING.
8211  * These two RPCs can be combined in one RPC call.
8212  */
8213 static int lod_declare_pccro_clear(const struct lu_env *env,
8214                                    struct dt_object *dt, struct thandle *th)
8215 {
8216         struct lod_thread_info *info = lod_env_info(env);
8217         struct lod_object *lo = lod_dt_obj(dt);
8218         struct lod_layout_component *lod_comp;
8219         int rc;
8220
8221         ENTRY;
8222
8223         rc = lod_striping_load(env, lo);
8224         if (rc)
8225                 RETURN(rc);
8226
8227         if (!(lo->ldo_flr_state & LCM_FL_PCC_RDONLY))
8228                 RETURN(-EALREADY);
8229
8230         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
8231         if (rc)
8232                 RETURN(rc);
8233
8234         lod_comp = lod_locate_comp_hsm(lo, NULL);
8235         if (lod_comp == NULL) {
8236                 CDEBUG(D_LAYOUT, "Not found any HSM component\n");
8237                 GOTO(out, rc = -EINVAL);
8238         }
8239
8240         lod_comp->llc_foreign_flags &= ~HS_PCCRO;
8241         lo->ldo_flr_state &= ~LCM_FL_PCC_RDONLY;
8242         lod_obj_inc_layout_gen(lo);
8243         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
8244         rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8245                                        &info->lti_buf, XATTR_NAME_LOV, 0, th);
8246 out:
8247         if (rc)
8248                 lod_striping_free(env, lo);
8249
8250         RETURN(rc);
8251 }
8252
8253 static int lod_declare_update_pccro(const struct lu_env *env,
8254                                     struct dt_object *dt,
8255                                     struct md_layout_change *mlc,
8256                                     struct thandle *th)
8257 {
8258         struct layout_intent *intent = mlc->mlc_intent;
8259         int rc;
8260
8261         switch (intent->lai_opc) {
8262         case LAYOUT_INTENT_PCCRO_SET:
8263                 rc = lod_declare_pccro_set(env, dt, th);
8264                 break;
8265         case LAYOUT_INTENT_PCCRO_CLEAR:
8266                 rc = lod_declare_pccro_clear(env, dt, th);
8267                 break;
8268         default:
8269                 rc = -EOPNOTSUPP;
8270                 break;
8271         }
8272
8273         return rc;
8274 }
8275
8276 static int lod_declare_update_rdonly(const struct lu_env *env,
8277                 struct lod_object *lo, struct md_layout_change *mlc,
8278                 struct thandle *th)
8279 {
8280         struct lod_thread_info *info = lod_env_info(env);
8281         struct lu_attr *layout_attr = &info->lti_layout_attr;
8282         struct lod_layout_component *lod_comp;
8283         struct lu_extent extent = { 0 };
8284         int rc;
8285         ENTRY;
8286
8287         LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
8288         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
8289                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
8290         LASSERT(lo->ldo_mirror_count > 0);
8291
8292         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8293                 struct layout_intent *layout = mlc->mlc_intent;
8294                 int write = layout->lai_opc == LAYOUT_INTENT_WRITE;
8295                 int picked;
8296
8297                 extent = layout->lai_extent;
8298                 CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
8299                        PFID(lod_object_fid(lo)), PEXT(&extent));
8300
8301                 picked = lod_primary_pick(env, lo, &extent);
8302                 if (picked < 0)
8303                         RETURN(picked);
8304
8305                 CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
8306                        PFID(lod_object_fid(lo)),
8307                        lo->ldo_mirrors[picked].lme_id);
8308
8309                 /* Update extents of primary before staling */
8310                 rc = lod_declare_update_extents(env, lo, &extent, th, picked,
8311                                                 write);
8312                 if (rc < 0)
8313                         GOTO(out, rc);
8314
8315                 if (layout->lai_opc == LAYOUT_INTENT_TRUNC) {
8316                         /**
8317                          * trunc transfers [0, size) in the intent extent, we'd
8318                          * stale components overlapping [size, eof).
8319                          */
8320                         extent.e_start = extent.e_end;
8321                         extent.e_end = OBD_OBJECT_EOF;
8322                 }
8323
8324                 /* stale overlapping components from other mirrors */
8325                 rc = lod_stale_components(env, lo, picked, &extent, th);
8326                 if (rc < 0)
8327                         GOTO(out, rc);
8328
8329                 /* restore truncate intent extent */
8330                 if (layout->lai_opc == LAYOUT_INTENT_TRUNC)
8331                         extent.e_end = extent.e_start;
8332
8333                 /* instantiate components for the picked mirror, start from 0 */
8334                 extent.e_start = 0;
8335
8336                 lod_foreach_mirror_comp(lod_comp, lo, picked) {
8337                         if (!lu_extent_is_overlapped(&extent,
8338                                                      &lod_comp->llc_extent))
8339                                 break;
8340
8341                         if (!lod_is_instantiation_needed(lod_comp))
8342                                 continue;
8343
8344                         info->lti_comp_idx[info->lti_count++] =
8345                                                 lod_comp_index(lo, lod_comp);
8346                 }
8347
8348                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
8349         } else { /* MD_LAYOUT_RESYNC */
8350                 int i;
8351
8352                 /**
8353                  * could contain multiple non-stale mirrors, so we need to
8354                  * prep uninited all components assuming any non-stale mirror
8355                  * could be picked as the primary mirror.
8356                  */
8357                 if (mlc->mlc_mirror_id == 0) {
8358                         /* normal resync */
8359                         for (i = 0; i < lo->ldo_mirror_count; i++) {
8360                                 if (lo->ldo_mirrors[i].lme_stale)
8361                                         continue;
8362
8363                                 lod_foreach_mirror_comp(lod_comp, lo, i) {
8364                                         if (!lod_comp_inited(lod_comp))
8365                                                 break;
8366
8367                                         if (extent.e_end <
8368                                                 lod_comp->llc_extent.e_end)
8369                                                 extent.e_end =
8370                                                      lod_comp->llc_extent.e_end;
8371                                 }
8372                         }
8373                         rc = lod_prepare_resync(env, lo, &extent);
8374                         if (rc)
8375                                 GOTO(out, rc);
8376                 } else {
8377                         /* mirror write, try to init its all components */
8378                         rc = lod_prepare_resync_mirror(env, lo,
8379                                                        mlc->mlc_mirror_id);
8380                         if (rc)
8381                                 GOTO(out, rc);
8382                 }
8383
8384                 /* change the file state to SYNC_PENDING */
8385                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
8386         }
8387
8388         /* Reset the layout version once it's becoming too large.
8389          * This way it can make sure that the layout version is
8390          * monotonously increased in this writing era. */
8391         lod_obj_inc_layout_gen(lo);
8392
8393         rc = lod_declare_instantiate_components(env, lo, th, 0);
8394         if (rc)
8395                 GOTO(out, rc);
8396
8397         layout_attr->la_valid = LA_LAYOUT_VERSION;
8398         layout_attr->la_layout_version = 0;
8399         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8400                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8401         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8402         if (rc)
8403                 GOTO(out, rc);
8404
8405 out:
8406         if (rc)
8407                 lod_striping_free(env, lo);
8408         RETURN(rc);
8409 }
8410
8411 static int lod_declare_update_write_pending(const struct lu_env *env,
8412                 struct lod_object *lo, struct md_layout_change *mlc,
8413                 struct thandle *th)
8414 {
8415         struct lod_thread_info *info = lod_env_info(env);
8416         struct lu_attr *layout_attr = &info->lti_layout_attr;
8417         struct lod_layout_component *lod_comp;
8418         struct lu_extent extent = { 0 };
8419         int primary = -1;
8420         int i;
8421         int rc;
8422         ENTRY;
8423
8424         LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
8425         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
8426                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
8427
8428         /* look for the first preferred mirror */
8429         for (i = 0; i < lo->ldo_mirror_count; i++) {
8430                 if (lo->ldo_mirrors[i].lme_stale)
8431                         continue;
8432                 if (lo->ldo_mirrors[i].lme_prefer == 0)
8433                         continue;
8434                 if (lo->ldo_mirrors[i].lme_hsm)
8435                         continue;
8436
8437                 primary = i;
8438                 break;
8439         }
8440         if (primary < 0) {
8441                 /* no primary, use any in-sync */
8442                 for (i = 0; i < lo->ldo_mirror_count; i++) {
8443                         if (lo->ldo_mirrors[i].lme_stale)
8444                                 continue;
8445                         primary = i;
8446                         break;
8447                 }
8448                 if (primary < 0) {
8449                         CERROR(DFID ": doesn't have a primary mirror\n",
8450                                PFID(lod_object_fid(lo)));
8451                         GOTO(out, rc = -ENODATA);
8452                 }
8453         }
8454
8455         CDEBUG(D_LAYOUT, DFID": found primary %u\n",
8456                PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
8457
8458         LASSERT(!lo->ldo_mirrors[primary].lme_stale);
8459
8460         /* for LAYOUT_WRITE opc, it has to do the following operations:
8461          * 1. stale overlapping componets from stale mirrors;
8462          * 2. instantiate components of the primary mirror;
8463          * 3. transfter layout version to all objects of the primary;
8464          *
8465          * for LAYOUT_RESYNC opc, it will do:
8466          * 1. instantiate components of all stale mirrors;
8467          * 2. transfer layout version to all objects to close write era. */
8468
8469         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8470                 struct layout_intent *layout = mlc->mlc_intent;
8471                 int write = layout->lai_opc == LAYOUT_INTENT_WRITE;
8472
8473                 LASSERT(layout != NULL);
8474
8475                 extent = layout->lai_extent;
8476
8477                 CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
8478                        PFID(lod_object_fid(lo)), PEXT(&extent));
8479
8480                 /* 1. Update extents of primary before staling */
8481                 rc = lod_declare_update_extents(env, lo, &extent, th, primary,
8482                                                 write);
8483                 if (rc < 0)
8484                         GOTO(out, rc);
8485
8486                 if (layout->lai_opc == LAYOUT_INTENT_TRUNC) {
8487                         /**
8488                          * trunc transfers [0, size) in the intent extent, we'd
8489                          * stale components overlapping [size, eof).
8490                          */
8491                         extent.e_start = extent.e_end;
8492                         extent.e_end = OBD_OBJECT_EOF;
8493                 }
8494
8495                 /* 2. stale overlapping components */
8496                 rc = lod_stale_components(env, lo, primary, &extent, th);
8497                 if (rc < 0)
8498                         GOTO(out, rc);
8499
8500                 /* 3. find the components which need instantiating.
8501                  * instantiate [0, mlc->mlc_intent->e_end) */
8502
8503                 /* restore truncate intent extent */
8504                 if (layout->lai_opc == LAYOUT_INTENT_TRUNC)
8505                         extent.e_end = extent.e_start;
8506                 extent.e_start = 0;
8507
8508                 lod_foreach_mirror_comp(lod_comp, lo, primary) {
8509                         if (!lu_extent_is_overlapped(&extent,
8510                                                      &lod_comp->llc_extent))
8511                                 break;
8512
8513                         if (!lod_is_instantiation_needed(lod_comp))
8514                                 continue;
8515
8516                         CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
8517                                primary, lod_comp_index(lo, lod_comp));
8518                         info->lti_comp_idx[info->lti_count++] =
8519                                                 lod_comp_index(lo, lod_comp);
8520                 }
8521         } else { /* MD_LAYOUT_RESYNC */
8522                 if (mlc->mlc_mirror_id == 0) {
8523                         /* normal resync */
8524                         lod_foreach_mirror_comp(lod_comp, lo, primary) {
8525                                 if (!lod_comp_inited(lod_comp))
8526                                         break;
8527
8528                                 extent.e_end = lod_comp->llc_extent.e_end;
8529                         }
8530
8531                         rc = lod_prepare_resync(env, lo, &extent);
8532                         if (rc)
8533                                 GOTO(out, rc);
8534                 } else {
8535                         /* mirror write, try to init its all components */
8536                         rc = lod_prepare_resync_mirror(env, lo,
8537                                                        mlc->mlc_mirror_id);
8538                         if (rc)
8539                                 GOTO(out, rc);
8540                 }
8541
8542                 /* change the file state to SYNC_PENDING */
8543                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
8544         }
8545
8546         rc = lod_declare_instantiate_components(env, lo, th, 0);
8547         if (rc)
8548                 GOTO(out, rc);
8549
8550         lod_obj_inc_layout_gen(lo);
8551
8552         /* 3. transfer layout version to OST objects.
8553          * transfer new layout version to OST objects so that stale writes
8554          * can be denied. It also ends an era of writing by setting
8555          * LU_LAYOUT_RESYNC. Normal client can never use this bit to
8556          * send write RPC; only resync RPCs could do it. */
8557         layout_attr->la_valid = LA_LAYOUT_VERSION;
8558         layout_attr->la_layout_version = 0;
8559         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
8560                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
8561         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8562         if (rc)
8563                 GOTO(out, rc);
8564 out:
8565         if (rc)
8566                 lod_striping_free(env, lo);
8567         RETURN(rc);
8568 }
8569
8570 static int lod_declare_update_sync_pending(const struct lu_env *env,
8571                 struct lod_object *lo, struct md_layout_change *mlc,
8572                 struct thandle *th)
8573 {
8574         struct lod_thread_info  *info = lod_env_info(env);
8575         struct lu_attr *layout_attr = &info->lti_layout_attr;
8576         unsigned sync_components = 0;
8577         unsigned resync_components = 0;
8578         int i;
8579         int rc;
8580         ENTRY;
8581
8582         LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
8583         LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
8584                 mlc->mlc_opc == MD_LAYOUT_WRITE);
8585
8586         CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
8587                PFID(lod_object_fid(lo)), mlc->mlc_opc);
8588
8589         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
8590                 CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
8591                        PFID(lod_object_fid(lo)));
8592
8593                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
8594                 return lod_declare_update_write_pending(env, lo, mlc, th);
8595         }
8596
8597         /* MD_LAYOUT_RESYNC_DONE */
8598
8599         for (i = 0; i < lo->ldo_comp_cnt; i++) {
8600                 struct lod_layout_component *lod_comp;
8601                 int j;
8602
8603                 lod_comp = &lo->ldo_comp_entries[i];
8604
8605                 if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
8606                         sync_components++;
8607                         continue;
8608                 }
8609
8610                 for (j = 0; j < mlc->mlc_resync_count; j++) {
8611                         if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
8612                                 continue;
8613
8614                         mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
8615                         lod_comp->llc_flags &= ~LCME_FL_STALE;
8616                         resync_components++;
8617                         break;
8618                 }
8619         }
8620
8621         /* valid check */
8622         for (i = 0; i < mlc->mlc_resync_count; i++) {
8623                 if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
8624                         continue;
8625
8626                 CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zd) not exist "
8627                        "or already synced\n", PFID(lod_object_fid(lo)),
8628                        mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
8629                 GOTO(out, rc = -EINVAL);
8630         }
8631
8632         if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
8633                 CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
8634                        PFID(lod_object_fid(lo)));
8635
8636                 /* tend to return an error code here to prevent
8637                  * the MDT from setting SoM attribute */
8638                 GOTO(out, rc = -EINVAL);
8639         }
8640
8641         CDEBUG(D_LAYOUT, DFID": synced %u resynced %u/%zu components\n",
8642                PFID(lod_object_fid(lo)),
8643                sync_components, resync_components, mlc->mlc_resync_count);
8644
8645         lo->ldo_flr_state = LCM_FL_RDONLY;
8646         lod_obj_inc_layout_gen(lo);
8647
8648         layout_attr->la_valid = LA_LAYOUT_VERSION;
8649         layout_attr->la_layout_version = 0;
8650         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
8651         if (rc)
8652                 GOTO(out, rc);
8653
8654         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
8655         rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
8656                                        &info->lti_buf, XATTR_NAME_LOV, 0, th);
8657         EXIT;
8658
8659 out:
8660         if (rc)
8661                 lod_striping_free(env, lo);
8662         RETURN(rc);
8663 }
8664
/*
 * Signature of a directory layout-change declare handler: it receives the
 * environment, the target object, the layout change data and the
 * transaction handle, and returns 0 or a negative errno.
 */
typedef int (*mlc_handler)(const struct lu_env *env,
			   struct dt_object *dt,
			   const struct md_layout_change *mlc,
			   struct thandle *th);
8668
8669 /**
8670  * Attach stripes after target's for migrating directory. NB, we
8671  * only need to declare this, the actual work is done inside
8672  * lod_xattr_set_lmv().
8673  *
8674  * \param[in] env       execution environment
8675  * \param[in] dt        target object
8676  * \param[in] mlc       layout change data
8677  * \param[in] th        transaction handle
8678  *
8679  * \retval              0 on success
8680  * \retval              negative if failed
8681  */
8682 static int lod_dir_declare_layout_attach(const struct lu_env *env,
8683                                          struct dt_object *dt,
8684                                          const struct md_layout_change *mlc,
8685                                          struct thandle *th)
8686 {
8687         struct lod_thread_info *info = lod_env_info(env);
8688         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
8689         struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
8690         struct lod_object *lo = lod_dt_obj(dt);
8691         struct dt_object *next = dt_object_child(dt);
8692         struct dt_object_format *dof = &info->lti_format;
8693         struct lmv_mds_md_v1 *lmv = mlc->mlc_buf.lb_buf;
8694         struct dt_object **stripes;
8695         __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
8696         struct lu_fid *fid = &info->lti_fid;
8697         struct lod_tgt_desc *tgt;
8698         struct dt_object *dto;
8699         struct dt_device *tgt_dt;
8700         int type = LU_SEQ_RANGE_ANY;
8701         struct dt_insert_rec *rec = &info->lti_dt_rec;
8702         char *stripe_name = info->lti_key;
8703         struct lu_name *sname;
8704         struct linkea_data ldata = { NULL };
8705         struct lu_buf linkea_buf;
8706         __u32 idx;
8707         int i;
8708         int rc;
8709
8710         ENTRY;
8711
8712         if (!lmv_is_sane(lmv))
8713                 RETURN(-EINVAL);
8714
8715         if (!dt_try_as_dir(env, dt, false))
8716                 RETURN(-ENOTDIR);
8717
8718         dof->dof_type = DFT_DIR;
8719
8720         OBD_ALLOC_PTR_ARRAY(stripes, (lo->ldo_dir_stripe_count + stripe_count));
8721         if (!stripes)
8722                 RETURN(-ENOMEM);
8723
8724         for (i = 0; i < lo->ldo_dir_stripe_count; i++)
8725                 stripes[i] = lo->ldo_stripe[i];
8726
8727         rec->rec_type = S_IFDIR;
8728
8729         for (i = 0; i < stripe_count; i++) {
8730                 fid_le_to_cpu(fid,
8731                         &lmv->lmv_stripe_fids[i]);
8732                 if (!fid_is_sane(fid))
8733                         continue;
8734
8735                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
8736                 if (rc)
8737                         GOTO(out, rc);
8738
8739                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
8740                         tgt_dt = lod->lod_child;
8741                 } else {
8742                         tgt = LTD_TGT(ltd, idx);
8743                         if (tgt == NULL)
8744                                 GOTO(out, rc = -ESTALE);
8745                         tgt_dt = tgt->ltd_tgt;
8746                 }
8747
8748                 dto = dt_locate_at(env, tgt_dt, fid,
8749                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
8750                                   NULL);
8751                 if (IS_ERR(dto))
8752                         GOTO(out, rc = PTR_ERR(dto));
8753
8754                 stripes[i + lo->ldo_dir_stripe_count] = dto;
8755
8756                 if (!dt_try_as_dir(env, dto, true))
8757                         GOTO(out, rc = -ENOTDIR);
8758
8759                 rc = lod_sub_declare_ref_add(env, dto, th);
8760                 if (rc)
8761                         GOTO(out, rc);
8762
8763                 rec->rec_fid = lu_object_fid(&dto->do_lu);
8764                 rc = lod_sub_declare_insert(env, dto,
8765                                             (const struct dt_rec *)rec,
8766                                             (const struct dt_key *)dot, th);
8767                 if (rc)
8768                         GOTO(out, rc);
8769
8770                 rc = lod_sub_declare_insert(env, dto,
8771                                             (const struct dt_rec *)rec,
8772                                             (const struct dt_key *)dotdot, th);
8773                 if (rc)
8774                         GOTO(out, rc);
8775
8776                 rc = lod_sub_declare_xattr_set(env, dto, &mlc->mlc_buf,
8777                                                 XATTR_NAME_LMV, 0, th);
8778                 if (rc)
8779                         GOTO(out, rc);
8780
8781                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
8782                          PFID(lu_object_fid(&dto->do_lu)),
8783                          i + lo->ldo_dir_stripe_count);
8784
8785                 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
8786                 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
8787                                       sname, lu_object_fid(&dt->do_lu));
8788                 if (rc)
8789                         GOTO(out, rc);
8790
8791                 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
8792                 linkea_buf.lb_len = ldata.ld_leh->leh_len;
8793                 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
8794                                                XATTR_NAME_LINK, 0, th);
8795                 if (rc)
8796                         GOTO(out, rc);
8797
8798                 rc = lod_sub_declare_insert(env, next,
8799                                             (const struct dt_rec *)rec,
8800                                             (const struct dt_key *)stripe_name,
8801                                             th);
8802                 if (rc)
8803                         GOTO(out, rc);
8804
8805                 rc = lod_sub_declare_ref_add(env, next, th);
8806                 if (rc)
8807                         GOTO(out, rc);
8808         }
8809
8810         if (lo->ldo_stripe)
8811                 OBD_FREE_PTR_ARRAY(lo->ldo_stripe,
8812                                    lo->ldo_dir_stripes_allocated);
8813         lo->ldo_stripe = stripes;
8814         lo->ldo_is_foreign = 0;
8815         lo->ldo_dir_migrate_offset = lo->ldo_dir_stripe_count;
8816         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_hash_type);
8817         lo->ldo_dir_stripe_count += stripe_count;
8818         lo->ldo_dir_layout_version++;
8819         lo->ldo_dir_stripes_allocated += stripe_count;
8820
8821         /* plain directory split creates target as a plain directory, while
8822          * after source attached as the first stripe, it becomes a striped
8823          * directory, set correct do_index_ops, otherwise it can't be unlinked.
8824          */
8825         dt->do_index_ops = &lod_striped_index_ops;
8826
8827         RETURN(0);
8828 out:
8829         i = lo->ldo_dir_stripe_count;
8830         while (i < lo->ldo_dir_stripe_count + stripe_count && stripes[i])
8831                 dt_object_put(env, stripes[i++]);
8832
8833         OBD_FREE_PTR_ARRAY(stripes, stripe_count + lo->ldo_dir_stripe_count);
8834         return rc;
8835 }
8836
8837 static int lod_dir_declare_layout_detach(const struct lu_env *env,
8838                                          struct dt_object *dt,
8839                                          const struct md_layout_change *unused,
8840                                          struct thandle *th)
8841 {
8842         struct lod_thread_info *info = lod_env_info(env);
8843         struct lod_object *lo = lod_dt_obj(dt);
8844         struct dt_object *next = dt_object_child(dt);
8845         char *stripe_name = info->lti_key;
8846         struct dt_object *dto;
8847         int i;
8848         int rc = 0;
8849
8850         if (!dt_try_as_dir(env, dt, true))
8851                 return -ENOTDIR;
8852
8853         if (!lo->ldo_dir_stripe_count)
8854                 return lod_sub_declare_delete(env, next,
8855                                         (const struct dt_key *)dotdot, th);
8856
8857         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8858                 dto = lo->ldo_stripe[i];
8859                 if (!dto)
8860                         continue;
8861
8862                 if (!dt_try_as_dir(env, dto, true))
8863                         return -ENOTDIR;
8864
8865                 rc = lod_sub_declare_delete(env, dto,
8866                                         (const struct dt_key *)dotdot, th);
8867                 if (rc)
8868                         return rc;
8869
8870                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8871                          PFID(lu_object_fid(&dto->do_lu)), i);
8872
8873                 rc = lod_sub_declare_delete(env, next,
8874                                         (const struct dt_key *)stripe_name, th);
8875                 if (rc)
8876                         return rc;
8877
8878                 rc = lod_sub_declare_ref_del(env, next, th);
8879                 if (rc)
8880                         return rc;
8881         }
8882
8883         return 0;
8884 }
8885
8886 static int dt_dir_is_empty(const struct lu_env *env,
8887                            struct dt_object *obj)
8888 {
8889         struct dt_it *it;
8890         const struct dt_it_ops *iops;
8891         int rc;
8892
8893         ENTRY;
8894
8895         if (!dt_try_as_dir(env, obj, true))
8896                 RETURN(-ENOTDIR);
8897
8898         iops = &obj->do_index_ops->dio_it;
8899         it = iops->init(env, obj, LUDA_64BITHASH);
8900         if (IS_ERR(it))
8901                 RETURN(PTR_ERR(it));
8902
8903         rc = iops->get(env, it, (const struct dt_key *)"");
8904         if (rc > 0) {
8905                 int i;
8906
8907                 for (rc = 0, i = 0; rc == 0 && i < 3; ++i)
8908                         rc = iops->next(env, it);
8909                 if (!rc)
8910                         rc = -ENOTEMPTY;
8911                 else if (rc == 1)
8912                         rc = 0;
8913         } else if (!rc) {
8914                 /* Huh? Index contains no zero key? */
8915                 rc = -EIO;
8916         }
8917
8918         iops->put(env, it);
8919         iops->fini(env, it);
8920
8921         RETURN(rc);
8922 }
8923
8924 static int lod_dir_declare_layout_shrink(const struct lu_env *env,
8925                                          struct dt_object *dt,
8926                                          const struct md_layout_change *mlc,
8927                                          struct thandle *th)
8928 {
8929         struct lod_thread_info *info = lod_env_info(env);
8930         struct lod_object *lo = lod_dt_obj(dt);
8931         struct dt_object *next = dt_object_child(dt);
8932         struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
8933         char *stripe_name = info->lti_key;
8934         struct lu_buf *lmv_buf = &info->lti_buf;
8935         __u32 final_stripe_count;
8936         struct dt_object *dto;
8937         int i;
8938         int rc;
8939
8940         LASSERT(lmu);
8941
8942         if (!dt_try_as_dir(env, dt, true))
8943                 return -ENOTDIR;
8944
8945         /* shouldn't be called on plain directory */
8946         LASSERT(lo->ldo_dir_stripe_count);
8947
8948         lmv_buf->lb_buf = &info->lti_lmv.lmv_md_v1;
8949         lmv_buf->lb_len = sizeof(info->lti_lmv.lmv_md_v1);
8950
8951         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
8952         LASSERT(final_stripe_count &&
8953                 final_stripe_count < lo->ldo_dir_stripe_count);
8954
8955         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
8956                 dto = lo->ldo_stripe[i];
8957                 if (!dto)
8958                         continue;
8959
8960                 if (i < final_stripe_count) {
8961                         rc = lod_sub_declare_xattr_set(env, dto, lmv_buf,
8962                                                        XATTR_NAME_LMV,
8963                                                        LU_XATTR_REPLACE, th);
8964                         if (rc)
8965                                 return rc;
8966
8967                         continue;
8968                 }
8969
8970                 rc = dt_dir_is_empty(env, dto);
8971                 if (rc < 0)
8972                         return rc;
8973
8974                 rc = lod_sub_declare_ref_del(env, dto, th);
8975                 if (rc)
8976                         return rc;
8977
8978                 rc = lod_sub_declare_destroy(env, dto, th);
8979                 if (rc)
8980                         return rc;
8981
8982                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
8983                          PFID(lu_object_fid(&dto->do_lu)), i);
8984
8985                 rc = lod_sub_declare_delete(env, next,
8986                                         (const struct dt_key *)stripe_name, th);
8987                 if (rc)
8988                         return rc;
8989
8990                 rc = lod_sub_declare_ref_del(env, next, th);
8991                 if (rc)
8992                         return rc;
8993         }
8994
8995         rc = lod_sub_declare_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
8996                                        LU_XATTR_REPLACE, th);
8997         return rc;
8998 }
8999
9000 /**
9001  * Allocate stripes for split directory.
9002  *
9003  * \param[in] env       execution environment
9004  * \param[in] dt        target object
9005  * \param[in] mlc       layout change data
9006  * \param[in] th        transaction handle
9007  *
9008  * \retval              0 on success
9009  * \retval              negative if failed
9010  */
9011 static int lod_dir_declare_layout_split(const struct lu_env *env,
9012                                         struct dt_object *dt,
9013                                         const struct md_layout_change *mlc,
9014                                         struct thandle *th)
9015 {
9016         struct lod_thread_info *info = lod_env_info(env);
9017         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
9018         struct lod_object *lo = lod_dt_obj(dt);
9019         struct dt_object_format *dof = &info->lti_format;
9020         struct lmv_user_md_v1 *lum = mlc->mlc_spec->u.sp_ea.eadata;
9021         struct dt_object **stripes;
9022         int mdt_count = lod->lod_remote_mdt_count + 1;
9023         u32 stripe_count;
9024         u32 saved_count;
9025         int i;
9026         int rc;
9027
9028         ENTRY;
9029
9030         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC);
9031         LASSERT(le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT);
9032
9033         saved_count = lo->ldo_dir_stripes_allocated;
9034         stripe_count = le32_to_cpu(lum->lum_stripe_count);
9035
9036         /* if the split target is overstriped, we need to put that flag in the
9037          * current layout so it can allocate the larger number of stripes
9038          *
9039          * Note we need to pick up any hash *flags* which affect allocation
9040          * *before* allocation, so they're used in allocating the directory,
9041          * rather than after when we finalize directory setup (at the end of
9042          * this function).
9043          */
9044         if (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_FLAG_OVERSTRIPED) {
9045                 /* silently clamp stripe count if it exceeds limit */
9046                 if (stripe_count > mdt_count * lod->lod_max_stripes_per_mdt)
9047                         stripe_count = mdt_count * lod->lod_max_stripes_per_mdt;
9048                 if (stripe_count > mdt_count)
9049                         lo->ldo_dir_hash_type |= LMV_HASH_FLAG_OVERSTRIPED;
9050         } else if (stripe_count > mdt_count) {
9051                 RETURN(-E2BIG);
9052         }
9053
9054         if (stripe_count <= saved_count)
9055                 RETURN(-EINVAL);
9056
9057         dof->dof_type = DFT_DIR;
9058
9059         OBD_ALLOC(stripes, sizeof(*stripes) * stripe_count);
9060         if (!stripes)
9061                 RETURN(-ENOMEM);
9062
9063         for (i = 0; i < lo->ldo_dir_stripes_allocated; i++)
9064                 stripes[i] = lo->ldo_stripe[i];
9065
9066         lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
9067
9068         rc = lod_mdt_alloc_qos(env, lo, stripes, saved_count, stripe_count);
9069         if (rc == -EAGAIN)
9070                 rc = lod_mdt_alloc_rr(env, lo, stripes, saved_count,
9071                                       stripe_count);
9072         if (rc < 0) {
9073                 OBD_FREE(stripes, sizeof(*stripes) * stripe_count);
9074                 RETURN(rc);
9075         }
9076
9077         LASSERT(rc > saved_count);
9078         OBD_FREE(lo->ldo_stripe,
9079                  sizeof(*stripes) * lo->ldo_dir_stripes_allocated);
9080         lo->ldo_stripe = stripes;
9081         lo->ldo_is_foreign = 0;
9082         lo->ldo_dir_striped = 1;
9083         lo->ldo_dir_stripe_count = rc;
9084         lo->ldo_dir_stripes_allocated = stripe_count;
9085         lo->ldo_dir_split_hash = lo->ldo_dir_hash_type;
9086         lo->ldo_dir_hash_type = le32_to_cpu(lum->lum_hash_type);
9087         if (!lmv_is_known_hash_type(lo->ldo_dir_hash_type))
9088                 lo->ldo_dir_hash_type =
9089                         lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
9090         lo->ldo_dir_hash_type |= LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MIGRATION;
9091         lo->ldo_dir_split_offset = saved_count;
9092         lo->ldo_dir_layout_version++;
9093         lo->ldo_dir_stripe_loaded = 1;
9094
9095         rc = lod_dir_declare_create_stripes(env, dt, mlc->mlc_attr, dof, th);
9096         if (rc)
9097                 lod_striping_free(env, lo);
9098
9099         RETURN(rc);
9100 }
9101
9102 /*
9103  * detach all stripes from dir master object, NB, stripes are not destroyed, but
9104  * deleted from it's parent namespace, this function is called in two places:
9105  * 1. mdd_migrate_mdt() detach stripes from source, and attach them to
9106  *    target.
9107  * 2. mdd_dir_layout_update() detach stripe before turning 1-stripe directory to
9108  *    a plain directory.
9109  *
9110  * \param[in] env       execution environment
9111  * \param[in] dt        target object
9112  * \param[in] mlc       layout change data
9113  * \param[in] th        transaction handle
9114  *
9115  * \retval              0 on success
9116  * \retval              negative if failed
9117  */
9118 static int lod_dir_layout_detach(const struct lu_env *env,
9119                                  struct dt_object *dt,
9120                                  const struct md_layout_change *mlc,
9121                                  struct thandle *th)
9122 {
9123         struct lod_thread_info *info = lod_env_info(env);
9124         struct lod_object *lo = lod_dt_obj(dt);
9125         struct dt_object *next = dt_object_child(dt);
9126         char *stripe_name = info->lti_key;
9127         struct dt_object *dto;
9128         int i;
9129         int rc = 0;
9130
9131         ENTRY;
9132
9133         if (!lo->ldo_dir_stripe_count) {
9134                 /* plain directory delete .. */
9135                 rc = lod_sub_delete(env, next,
9136                                     (const struct dt_key *)dotdot, th);
9137                 RETURN(rc);
9138         }
9139
9140         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9141                 dto = lo->ldo_stripe[i];
9142                 if (!dto)
9143                         continue;
9144
9145                 rc = lod_sub_delete(env, dto,
9146                                     (const struct dt_key *)dotdot, th);
9147                 if (rc)
9148                         break;
9149
9150                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
9151                          PFID(lu_object_fid(&dto->do_lu)), i);
9152
9153                 rc = lod_sub_delete(env, next,
9154                                     (const struct dt_key *)stripe_name, th);
9155                 if (rc)
9156                         break;
9157
9158                 rc = lod_sub_ref_del(env, next, th);
9159                 if (rc)
9160                         break;
9161         }
9162
9163         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9164                 dto = lo->ldo_stripe[i];
9165                 if (dto)
9166                         dt_object_put(env, dto);
9167         }
9168         OBD_FREE_PTR_ARRAY(lo->ldo_stripe, lo->ldo_dir_stripes_allocated);
9169         lo->ldo_stripe = NULL;
9170         lo->ldo_dir_stripes_allocated = 0;
9171         lo->ldo_dir_stripe_count = 0;
9172         dt->do_index_ops = &lod_index_ops;
9173
9174         RETURN(rc);
9175 }
9176
9177 static int lod_dir_layout_shrink(const struct lu_env *env,
9178                                  struct dt_object *dt,
9179                                  const struct md_layout_change *mlc,
9180                                  struct thandle *th)
9181 {
9182         struct lod_thread_info *info = lod_env_info(env);
9183         struct lod_object *lo = lod_dt_obj(dt);
9184         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
9185         struct dt_object *next = dt_object_child(dt);
9186         struct lmv_user_md *lmu = mlc->mlc_buf.lb_buf;
9187         __u32 final_stripe_count;
9188         char *stripe_name = info->lti_key;
9189         struct dt_object *dto;
9190         struct lu_buf *lmv_buf = &info->lti_buf;
9191         struct lmv_mds_md_v1 *lmv = &info->lti_lmv.lmv_md_v1;
9192         u32 mdtidx;
9193         int type = LU_SEQ_RANGE_ANY;
9194         int i;
9195         int rc;
9196
9197         ENTRY;
9198
9199         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
9200
9201         lmv_buf->lb_buf = lmv;
9202         lmv_buf->lb_len = sizeof(*lmv);
9203         lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
9204         lmv->lmv_stripe_count = cpu_to_le32(final_stripe_count);
9205         lmv->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type) &
9206                              cpu_to_le32(LMV_HASH_TYPE_MASK |
9207                                          LMV_HASH_FLAG_FIXED);
9208         lmv->lmv_layout_version =
9209                         cpu_to_le32(lo->ldo_dir_layout_version + 1);
9210         lmv->lmv_migrate_offset = 0;
9211         lmv->lmv_migrate_hash = 0;
9212
9213         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9214                 dto = lo->ldo_stripe[i];
9215                 if (!dto)
9216                         continue;
9217
9218                 if (i < final_stripe_count) {
9219                         rc = lod_fld_lookup(env, lod,
9220                                             lu_object_fid(&dto->do_lu),
9221                                             &mdtidx, &type);
9222                         if (rc)
9223                                 RETURN(rc);
9224
9225                         lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
9226                         rc = lod_sub_xattr_set(env, dto, lmv_buf,
9227                                                XATTR_NAME_LMV,
9228                                                LU_XATTR_REPLACE, th);
9229                         if (rc)
9230                                 RETURN(rc);
9231
9232                         continue;
9233                 }
9234
9235                 dt_write_lock(env, dto, DT_TGT_CHILD);
9236                 rc = lod_sub_ref_del(env, dto, th);
9237                 dt_write_unlock(env, dto);
9238                 if (rc)
9239                         RETURN(rc);
9240
9241                 rc = lod_sub_destroy(env, dto, th);
9242                 if (rc)
9243                         RETURN(rc);
9244
9245                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
9246                          PFID(lu_object_fid(&dto->do_lu)), i);
9247
9248                 rc = lod_sub_delete(env, next,
9249                                     (const struct dt_key *)stripe_name, th);
9250                 if (rc)
9251                         RETURN(rc);
9252
9253                 rc = lod_sub_ref_del(env, next, th);
9254                 if (rc)
9255                         RETURN(rc);
9256         }
9257
9258         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &mdtidx,
9259                             &type);
9260         if (rc)
9261                 RETURN(rc);
9262
9263         lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_V1);
9264         lmv->lmv_master_mdt_index = cpu_to_le32(mdtidx);
9265         rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV,
9266                                LU_XATTR_REPLACE, th);
9267         if (rc)
9268                 RETURN(rc);
9269
9270         for (i = final_stripe_count; i < lo->ldo_dir_stripe_count; i++) {
9271                 dto = lo->ldo_stripe[i];
9272                 if (dto)
9273                         dt_object_put(env, dto);
9274         }
9275         lo->ldo_dir_stripe_count = final_stripe_count;
9276
9277         RETURN(rc);
9278 }
9279
9280 static mlc_handler dir_mlc_declare_ops[MD_LAYOUT_MAX] = {
9281         [MD_LAYOUT_ATTACH] = lod_dir_declare_layout_attach,
9282         [MD_LAYOUT_DETACH] = lod_dir_declare_layout_detach,
9283         [MD_LAYOUT_SHRINK] = lod_dir_declare_layout_shrink,
9284         [MD_LAYOUT_SPLIT]  = lod_dir_declare_layout_split,
9285 };
9286
9287 static mlc_handler dir_mlc_ops[MD_LAYOUT_MAX] = {
9288         [MD_LAYOUT_DETACH] = lod_dir_layout_detach,
9289         [MD_LAYOUT_SHRINK] = lod_dir_layout_shrink,
9290 };
9291
9292 static int lod_declare_layout_change(const struct lu_env *env,
9293                 struct dt_object *dt, struct md_layout_change *mlc,
9294                 struct thandle *th)
9295 {
9296         struct lod_thread_info  *info = lod_env_info(env);
9297         struct lod_object *lo = lod_dt_obj(dt);
9298         int rc;
9299
9300         ENTRY;
9301
9302         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
9303                 LASSERT(dir_mlc_declare_ops[mlc->mlc_opc]);
9304                 rc = dir_mlc_declare_ops[mlc->mlc_opc](env, dt, mlc, th);
9305                 RETURN(rc);
9306         }
9307
9308         if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
9309             dt_object_remote(dt_object_child(dt)))
9310                 RETURN(-EINVAL);
9311
9312         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
9313                 struct layout_intent *intent = mlc->mlc_intent;
9314
9315                 if (intent->lai_opc == LAYOUT_INTENT_PCCRO_SET ||
9316                     intent->lai_opc == LAYOUT_INTENT_PCCRO_CLEAR) {
9317                         if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
9318                                 RETURN(-EINVAL);
9319
9320                         rc = lod_declare_update_pccro(env, dt, mlc, th);
9321                         RETURN(rc);
9322                 }
9323         }
9324
9325         rc = lod_striping_load(env, lo);
9326         if (rc)
9327                 GOTO(out, rc);
9328
9329         LASSERT(lo->ldo_comp_cnt > 0);
9330
9331         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
9332         if (rc)
9333                 GOTO(out, rc);
9334
9335         switch (lo->ldo_flr_state) {
9336         case LCM_FL_NONE:
9337                 rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
9338                                               &mlc->mlc_buf, th);
9339                 break;
9340         case LCM_FL_RDONLY:
9341                 rc = lod_declare_update_rdonly(env, lo, mlc, th);
9342                 break;
9343         case LCM_FL_WRITE_PENDING:
9344                 rc = lod_declare_update_write_pending(env, lo, mlc, th);
9345                 break;
9346         case LCM_FL_SYNC_PENDING:
9347                 rc = lod_declare_update_sync_pending(env, lo, mlc, th);
9348                 break;
9349         default:
9350                 rc = -ENOTSUPP;
9351                 break;
9352         }
9353         if (rc == 0)
9354                 rc = lod_save_layout_gen_intrans(info, lo);
9355
9356 out:
9357         RETURN(rc);
9358 }
9359
9360 /**
9361  * Instantiate layout component objects which covers the intent write offset.
9362  */
9363 static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
9364                              struct md_layout_change *mlc, struct thandle *th)
9365 {
9366         struct lod_thread_info *info = lod_env_info(env);
9367         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
9368         struct lu_attr *layout_attr = &info->lti_layout_attr;
9369         struct lod_object *lo = lod_dt_obj(dt);
9370         int rc;
9371
9372         ENTRY;
9373
9374         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
9375                 LASSERT(dir_mlc_ops[mlc->mlc_opc]);
9376                 rc = dir_mlc_ops[mlc->mlc_opc](env, dt, mlc, th);
9377                 RETURN(rc);
9378         }
9379
9380         rc = lod_check_layout_gen_intrans(info, lo);
9381         if (rc > 0) {
9382                 CDEBUG(D_LAYOUT,
9383                        "%s: obj "DFID" gen changed from %d to %d in transaction, retry the transaction \n",
9384                        dt->do_lu.lo_dev->ld_obd->obd_name,
9385                        PFID(lu_object_fid(&dt->do_lu)),
9386                        info->lti_gen[rc - 1], lo->ldo_layout_gen);
9387                 RETURN(-EAGAIN);
9388         }
9389
9390         rc = lod_striped_create(env, dt, attr, NULL, th);
9391         if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
9392                 layout_attr->la_layout_version |= lo->ldo_layout_gen;
9393                 rc = lod_attr_set(env, dt, layout_attr, th);
9394         }
9395
9396         RETURN(rc);
9397 }
9398
9399 const struct dt_object_operations lod_obj_ops = {
9400         .do_read_lock           = lod_read_lock,
9401         .do_write_lock          = lod_write_lock,
9402         .do_read_unlock         = lod_read_unlock,
9403         .do_write_unlock        = lod_write_unlock,
9404         .do_write_locked        = lod_write_locked,
9405         .do_attr_get            = lod_attr_get,
9406         .do_declare_attr_set    = lod_declare_attr_set,
9407         .do_attr_set            = lod_attr_set,
9408         .do_xattr_get           = lod_xattr_get,
9409         .do_declare_xattr_set   = lod_declare_xattr_set,
9410         .do_xattr_set           = lod_xattr_set,
9411         .do_declare_xattr_del   = lod_declare_xattr_del,
9412         .do_xattr_del           = lod_xattr_del,
9413         .do_xattr_list          = lod_xattr_list,
9414         .do_ah_init             = lod_ah_init,
9415         .do_declare_create      = lod_declare_create,
9416         .do_create              = lod_create,
9417         .do_declare_destroy     = lod_declare_destroy,
9418         .do_destroy             = lod_destroy,
9419         .do_index_try           = lod_index_try,
9420         .do_declare_ref_add     = lod_declare_ref_add,
9421         .do_ref_add             = lod_ref_add,
9422         .do_declare_ref_del     = lod_declare_ref_del,
9423         .do_ref_del             = lod_ref_del,
9424         .do_object_sync         = lod_object_sync,
9425         .do_object_lock         = lod_object_lock,
9426         .do_object_unlock       = lod_object_unlock,
9427         .do_invalidate          = lod_invalidate,
9428         .do_declare_layout_change = lod_declare_layout_change,
9429         .do_layout_change       = lod_layout_change,
9430 };
9431
9432 /**
9433  * Implementation of dt_body_operations::dbo_read.
9434  *
9435  * \see dt_body_operations::dbo_read() in the API description for details.
9436  */
9437 static ssize_t lod_read(const struct lu_env *env, struct dt_object *dt,
9438                         struct lu_buf *buf, loff_t *pos)
9439 {
9440         struct dt_object *next = dt_object_child(dt);
9441
9442         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9443                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9444         return next->do_body_ops->dbo_read(env, next, buf, pos);
9445 }
9446
9447 /**
9448  * Implementation of dt_body_operations::dbo_declare_write.
9449  *
9450  * \see dt_body_operations::dbo_declare_write() in the API description
9451  * for details.
9452  */
9453 static ssize_t lod_declare_write(const struct lu_env *env,
9454                                  struct dt_object *dt,
9455                                  const struct lu_buf *buf, loff_t pos,
9456                                  struct thandle *th)
9457 {
9458         return lod_sub_declare_write(env, dt_object_child(dt), buf, pos, th);
9459 }
9460
9461 /**
9462  * Implementation of dt_body_operations::dbo_write.
9463  *
9464  * \see dt_body_operations::dbo_write() in the API description for details.
9465  */
9466 static ssize_t lod_write(const struct lu_env *env, struct dt_object *dt,
9467                          const struct lu_buf *buf, loff_t *pos,
9468                          struct thandle *th)
9469 {
9470         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
9471                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
9472         return lod_sub_write(env, dt_object_child(dt), buf, pos, th);
9473 }
9474
9475 static int lod_declare_punch(const struct lu_env *env, struct dt_object *dt,
9476                              __u64 start, __u64 end, struct thandle *th)
9477 {
9478         if (dt_object_remote(dt))
9479                 return -ENOTSUPP;
9480
9481         return lod_sub_declare_punch(env, dt_object_child(dt), start, end, th);
9482 }
9483
9484 static int lod_punch(const struct lu_env *env, struct dt_object *dt,
9485                      __u64 start, __u64 end, struct thandle *th)
9486 {
9487         if (dt_object_remote(dt))
9488                 return -ENOTSUPP;
9489
9490         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
9491         return lod_sub_punch(env, dt_object_child(dt), start, end, th);
9492 }
9493
9494 /*
9495  * different type of files use the same body_ops because object may be created
9496  * in OUT, where there is no chance to set correct body_ops for each type, so
9497  * body_ops themselves will check file type inside, see lod_read/write/punch for
9498  * details.
9499  */
9500 static const struct dt_body_operations lod_body_ops = {
9501         .dbo_read               = lod_read,
9502         .dbo_declare_write      = lod_declare_write,
9503         .dbo_write              = lod_write,
9504         .dbo_declare_punch      = lod_declare_punch,
9505         .dbo_punch              = lod_punch,
9506 };
9507
9508 /**
9509  * Implementation of lu_object_operations::loo_object_init.
9510  *
9511  * The function determines the type and the index of the target device using
9512  * sequence of the object's FID. Then passes control down to the
9513  * corresponding device:
9514  *  OSD for the local objects, OSP for remote
9515  *
9516  * \see lu_object_operations::loo_object_init() in the API description
9517  * for details.
9518  */
9519 static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
9520                            const struct lu_object_conf *conf)
9521 {
9522         struct lod_device       *lod    = lu2lod_dev(lo->lo_dev);
9523         struct lu_device        *cdev   = NULL;
9524         struct lu_object        *cobj;
9525         struct lod_tgt_descs    *ltd    = NULL;
9526         struct lod_tgt_desc     *tgt;
9527         u32                      idx    = 0;
9528         int                      type   = LU_SEQ_RANGE_ANY;
9529         int                      rc;
9530         ENTRY;
9531
9532         rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
9533         if (rc != 0)
9534                 RETURN(rc);
9535
9536         if (type == LU_SEQ_RANGE_MDT &&
9537             idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
9538                 cdev = &lod->lod_child->dd_lu_dev;
9539         } else if (type == LU_SEQ_RANGE_MDT) {
9540                 ltd = &lod->lod_mdt_descs;
9541                 lod_getref(ltd);
9542         } else if (type == LU_SEQ_RANGE_OST) {
9543                 ltd = &lod->lod_ost_descs;
9544                 lod_getref(ltd);
9545         } else {
9546                 LBUG();
9547         }
9548
9549         if (ltd != NULL) {
9550                 if (ltd->ltd_tgts_size > idx &&
9551                     test_bit(idx, ltd->ltd_tgt_bitmap)) {
9552                         tgt = LTD_TGT(ltd, idx);
9553
9554                         LASSERT(tgt != NULL);
9555                         LASSERT(tgt->ltd_tgt != NULL);
9556
9557                         cdev = &(tgt->ltd_tgt->dd_lu_dev);
9558                 }
9559                 lod_putref(lod, ltd);
9560         }
9561
9562         if (unlikely(cdev == NULL))
9563                 RETURN(-ENOENT);
9564
9565         cobj = cdev->ld_ops->ldo_object_alloc(env, lo->lo_header, cdev);
9566         if (unlikely(cobj == NULL))
9567                 RETURN(-ENOMEM);
9568
9569         lu2lod_obj(lo)->ldo_obj.do_body_ops = &lod_body_ops;
9570
9571         lu_object_add(lo, cobj);
9572
9573         RETURN(0);
9574 }
9575
9576 /**
9577  *
9578  * Release resources associated with striping.
9579  *
9580  * If the object is striped (regular or directory), then release
9581  * the stripe objects references and free the ldo_stripe array.
9582  *
9583  * \param[in] env       execution environment
9584  * \param[in] lo        object
9585  */
9586 void lod_striping_free_nolock(const struct lu_env *env, struct lod_object *lo)
9587 {
9588         struct lod_layout_component *lod_comp;
9589         __u32 obj_attr = lo->ldo_obj.do_lu.lo_header->loh_attr;
9590         int i, j;
9591
9592         if (unlikely(lo->ldo_is_foreign)) {
9593                 if (S_ISREG(obj_attr)) {
9594                         lod_free_foreign_lov(lo);
9595                         lo->ldo_comp_cached = 0;
9596                 } else if (S_ISDIR(obj_attr)) {
9597                         lod_free_foreign_lmv(lo);
9598                         lo->ldo_dir_stripe_loaded = 0;
9599                 }
9600         } else if (lo->ldo_stripe != NULL) {
9601                 LASSERT(lo->ldo_comp_entries == NULL);
9602                 LASSERT(lo->ldo_dir_stripes_allocated > 0);
9603
9604                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
9605                         if (lo->ldo_stripe[i])
9606                                 dt_object_put(env, lo->ldo_stripe[i]);
9607                 }
9608
9609                 j = sizeof(struct dt_object *) * lo->ldo_dir_stripes_allocated;
9610                 OBD_FREE(lo->ldo_stripe, j);
9611                 lo->ldo_stripe = NULL;
9612                 lo->ldo_dir_stripes_allocated = 0;
9613                 lo->ldo_dir_stripe_loaded = 0;
9614                 lo->ldo_dir_stripe_count = 0;
9615                 lo->ldo_obj.do_index_ops = NULL;
9616         } else if (lo->ldo_comp_entries != NULL) {
9617                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
9618                         /* free lod_layout_component::llc_stripe array */
9619                         lod_comp = &lo->ldo_comp_entries[i];
9620
9621                         /* HSM layout component */
9622                         if (lod_comp->llc_magic == LOV_MAGIC_FOREIGN)
9623                                 continue;
9624                         if (lod_comp->llc_stripe == NULL)
9625                                 continue;
9626                         LASSERT(lod_comp->llc_stripes_allocated != 0);
9627                         for (j = 0; j < lod_comp->llc_stripes_allocated; j++) {
9628                                 if (lod_comp->llc_stripe[j] != NULL)
9629                                         lu_object_put(env,
9630                                                &lod_comp->llc_stripe[j]->do_lu);
9631                         }
9632                         OBD_FREE_PTR_ARRAY(lod_comp->llc_stripe,
9633                                            lod_comp->llc_stripes_allocated);
9634                         lod_comp->llc_stripe = NULL;
9635                         OBD_FREE_PTR_ARRAY(lod_comp->llc_ost_indices,
9636                                            lod_comp->llc_stripes_allocated);
9637                         lod_comp->llc_ost_indices = NULL;
9638                         lod_comp->llc_stripes_allocated = 0;
9639                 }
9640                 lod_free_comp_entries(lo);
9641                 lo->ldo_comp_cached = 0;
9642         }
9643 }
9644
9645 void lod_striping_free(const struct lu_env *env, struct lod_object *lo)
9646 {
9647         mutex_lock(&lo->ldo_layout_mutex);
9648         lod_striping_free_nolock(env, lo);
9649         mutex_unlock(&lo->ldo_layout_mutex);
9650 }
9651
9652 /**
9653  * Implementation of lu_object_operations::loo_object_free.
9654  *
9655  * \see lu_object_operations::loo_object_free() in the API description
9656  * for details.
9657  */
9658 static void lod_object_free(const struct lu_env *env, struct lu_object *o)
9659 {
9660         struct lod_object *lo = lu2lod_obj(o);
9661
9662         /* release all underlying object pinned */
9663         lod_striping_free(env, lo);
9664         lu_object_fini(o);
9665         /* lo doesn't contain a lu_object_header, so we don't need call_rcu */
9666         OBD_SLAB_FREE_PTR(lo, lod_object_kmem);
9667 }
9668
/**
 * Implementation of lu_object_operations::loo_object_release.
 *
 * Currently does nothing.
 *
 * \see lu_object_operations::loo_object_release() in the API description
 * for details.
 */
static void lod_object_release(const struct lu_env *env, struct lu_object *o)
{
        /*
         * XXX: shouldn't everything be released here in case object
         * creation failed before?
         */
}
9680
9681 /**
9682  * Implementation of lu_object_operations::loo_object_print.
9683  *
9684  * \see lu_object_operations::loo_object_print() in the API description
9685  * for details.
9686  */
9687 static int lod_object_print(const struct lu_env *env, void *cookie,
9688                             lu_printer_t p, const struct lu_object *l)
9689 {
9690         struct lod_object *o = lu2lod_obj((struct lu_object *) l);
9691
9692         return (*p)(env, cookie, LUSTRE_LOD_NAME"-object@%p", o);
9693 }
9694
9695 const struct lu_object_operations lod_lu_obj_ops = {
9696         .loo_object_init        = lod_object_init,
9697         .loo_object_free        = lod_object_free,
9698         .loo_object_release     = lod_object_release,
9699         .loo_object_print       = lod_object_print,
9700 };