Whamcloud - gitweb
LU-11376 lmv: new foreign LMV format
[fs/lustre-release.git] / lustre / lod / lod_object.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright  2009 Sun Microsystems, Inc. All rights reserved
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * lustre/lod/lod_object.c
30  *
31  * This file contains implementations of methods for the OSD API
32  * for the Logical Object Device (LOD) layer, which provides a virtual
33  * local OSD object interface to the MDD layer, and abstracts the
34  * addressing of local (OSD) and remote (OSP) objects. The API is
35  * described in the file lustre/include/dt_object.h and in
36  * Documentation/osd-api.txt.
37  *
38  * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_MDS
42
43 #include <linux/random.h>
44
45 #include <obd.h>
46 #include <obd_class.h>
47 #include <obd_support.h>
48
49 #include <lustre_fid.h>
50 #include <lustre_linkea.h>
51 #include <lustre_lmv.h>
52 #include <uapi/linux/lustre/lustre_param.h>
53 #include <lustre_swab.h>
54 #include <uapi/linux/lustre/lustre_ver.h>
55 #include <lprocfs_status.h>
56 #include <md_object.h>
57
58 #include "lod_internal.h"
59
/* The "." and ".." entry names. */
static const char dot[]    = ".";
static const char dotdot[] = "..";
62
63 /**
64  * Implementation of dt_index_operations::dio_lookup
65  *
66  * Used with regular (non-striped) objects.
67  *
68  * \see dt_index_operations::dio_lookup() in the API description for details.
69  */
70 static int lod_lookup(const struct lu_env *env, struct dt_object *dt,
71                       struct dt_rec *rec, const struct dt_key *key)
72 {
73         struct dt_object *next = dt_object_child(dt);
74         return next->do_index_ops->dio_lookup(env, next, rec, key);
75 }
76
/**
 * Implementation of dt_index_operations::dio_declare_insert.
 *
 * Used with regular (non-striped) objects: the declaration is passed to
 * lod_sub_declare_insert() for the child object.
 *
 * \see dt_index_operations::dio_declare_insert() in the API description
 * for details.
 */
static int lod_declare_insert(const struct lu_env *env, struct dt_object *dt,
                              const struct dt_rec *rec,
                              const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_insert(env, child, rec, key, th);
}
91
/**
 * Implementation of dt_index_operations::dio_insert.
 *
 * Used with regular (non-striped) objects: the insertion is passed to
 * lod_sub_insert() for the child object.
 *
 * \see dt_index_operations::dio_insert() in the API description for details.
 */
static int lod_insert(const struct lu_env *env, struct dt_object *dt,
                      const struct dt_rec *rec, const struct dt_key *key,
                      struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_insert(env, child, rec, key, th);
}
105
/**
 * Implementation of dt_index_operations::dio_declare_delete.
 *
 * Used with regular (non-striped) objects: the declaration is passed to
 * lod_sub_declare_delete() for the child object.
 *
 * \see dt_index_operations::dio_declare_delete() in the API description
 * for details.
 */
static int lod_declare_delete(const struct lu_env *env, struct dt_object *dt,
                              const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_declare_delete(env, child, key, th);
}
119
/**
 * Implementation of dt_index_operations::dio_delete.
 *
 * Used with regular (non-striped) objects: the deletion is passed to
 * lod_sub_delete() for the child object.
 *
 * \see dt_index_operations::dio_delete() in the API description for details.
 */
static int lod_delete(const struct lu_env *env, struct dt_object *dt,
                      const struct dt_key *key, struct thandle *th)
{
        struct dt_object *child = dt_object_child(dt);

        return lod_sub_delete(env, child, key, th);
}
132
133 /**
134  * Implementation of dt_it_ops::init.
135  *
136  * Used with regular (non-striped) objects.
137  *
138  * \see dt_it_ops::init() in the API description for details.
139  */
140 static struct dt_it *lod_it_init(const struct lu_env *env,
141                                  struct dt_object *dt, __u32 attr)
142 {
143         struct dt_object        *next = dt_object_child(dt);
144         struct lod_it           *it = &lod_env_info(env)->lti_it;
145         struct dt_it            *it_next;
146
147         it_next = next->do_index_ops->dio_it.init(env, next, attr);
148         if (IS_ERR(it_next))
149                 return it_next;
150
151         /* currently we do not use more than one iterator per thread
152          * so we store it in thread info. if at some point we need
153          * more active iterators in a single thread, we can allocate
154          * additional ones */
155         LASSERT(it->lit_obj == NULL);
156
157         it->lit_it = it_next;
158         it->lit_obj = next;
159
160         return (struct dt_it *)it;
161 }
162
/*
 * Assert that a lod_it is in use: both the object it iterates over and
 * the underlying iterator must be set. The env argument is not used.
 */
#define LOD_CHECK_IT(env, it)                   \
do {                                            \
        LASSERT((it)->lit_obj != NULL);         \
        LASSERT((it)->lit_it != NULL);          \
} while (0)
168
169 /**
170  * Implementation of dt_index_operations::dio_it.fini.
171  *
172  * Used with regular (non-striped) objects.
173  *
174  * \see dt_index_operations::dio_it.fini() in the API description for details.
175  */
176 static void lod_it_fini(const struct lu_env *env, struct dt_it *di)
177 {
178         struct lod_it *it = (struct lod_it *)di;
179
180         LOD_CHECK_IT(env, it);
181         it->lit_obj->do_index_ops->dio_it.fini(env, it->lit_it);
182
183         /* the iterator not in use any more */
184         it->lit_obj = NULL;
185         it->lit_it = NULL;
186 }
187
188 /**
189  * Implementation of dt_it_ops::get.
190  *
191  * Used with regular (non-striped) objects.
192  *
193  * \see dt_it_ops::get() in the API description for details.
194  */
195 static int lod_it_get(const struct lu_env *env, struct dt_it *di,
196                       const struct dt_key *key)
197 {
198         const struct lod_it *it = (const struct lod_it *)di;
199
200         LOD_CHECK_IT(env, it);
201         return it->lit_obj->do_index_ops->dio_it.get(env, it->lit_it, key);
202 }
203
204 /**
205  * Implementation of dt_it_ops::put.
206  *
207  * Used with regular (non-striped) objects.
208  *
209  * \see dt_it_ops::put() in the API description for details.
210  */
211 static void lod_it_put(const struct lu_env *env, struct dt_it *di)
212 {
213         struct lod_it *it = (struct lod_it *)di;
214
215         LOD_CHECK_IT(env, it);
216         return it->lit_obj->do_index_ops->dio_it.put(env, it->lit_it);
217 }
218
219 /**
220  * Implementation of dt_it_ops::next.
221  *
222  * Used with regular (non-striped) objects
223  *
224  * \see dt_it_ops::next() in the API description for details.
225  */
226 static int lod_it_next(const struct lu_env *env, struct dt_it *di)
227 {
228         struct lod_it *it = (struct lod_it *)di;
229
230         LOD_CHECK_IT(env, it);
231         return it->lit_obj->do_index_ops->dio_it.next(env, it->lit_it);
232 }
233
234 /**
235  * Implementation of dt_it_ops::key.
236  *
237  * Used with regular (non-striped) objects.
238  *
239  * \see dt_it_ops::key() in the API description for details.
240  */
241 static struct dt_key *lod_it_key(const struct lu_env *env,
242                                  const struct dt_it *di)
243 {
244         const struct lod_it *it = (const struct lod_it *)di;
245
246         LOD_CHECK_IT(env, it);
247         return it->lit_obj->do_index_ops->dio_it.key(env, it->lit_it);
248 }
249
250 /**
251  * Implementation of dt_it_ops::key_size.
252  *
253  * Used with regular (non-striped) objects.
254  *
255  * \see dt_it_ops::key_size() in the API description for details.
256  */
257 static int lod_it_key_size(const struct lu_env *env, const struct dt_it *di)
258 {
259         struct lod_it *it = (struct lod_it *)di;
260
261         LOD_CHECK_IT(env, it);
262         return it->lit_obj->do_index_ops->dio_it.key_size(env, it->lit_it);
263 }
264
265 /**
266  * Implementation of dt_it_ops::rec.
267  *
268  * Used with regular (non-striped) objects.
269  *
270  * \see dt_it_ops::rec() in the API description for details.
271  */
272 static int lod_it_rec(const struct lu_env *env, const struct dt_it *di,
273                       struct dt_rec *rec, __u32 attr)
274 {
275         const struct lod_it *it = (const struct lod_it *)di;
276
277         LOD_CHECK_IT(env, it);
278         return it->lit_obj->do_index_ops->dio_it.rec(env, it->lit_it, rec,
279                                                      attr);
280 }
281
282 /**
283  * Implementation of dt_it_ops::rec_size.
284  *
285  * Used with regular (non-striped) objects.
286  *
287  * \see dt_it_ops::rec_size() in the API description for details.
288  */
289 static int lod_it_rec_size(const struct lu_env *env, const struct dt_it *di,
290                            __u32 attr)
291 {
292         const struct lod_it *it = (const struct lod_it *)di;
293
294         LOD_CHECK_IT(env, it);
295         return it->lit_obj->do_index_ops->dio_it.rec_size(env, it->lit_it,
296                                                           attr);
297 }
298
299 /**
300  * Implementation of dt_it_ops::store.
301  *
302  * Used with regular (non-striped) objects.
303  *
304  * \see dt_it_ops::store() in the API description for details.
305  */
306 static __u64 lod_it_store(const struct lu_env *env, const struct dt_it *di)
307 {
308         const struct lod_it *it = (const struct lod_it *)di;
309
310         LOD_CHECK_IT(env, it);
311         return it->lit_obj->do_index_ops->dio_it.store(env, it->lit_it);
312 }
313
314 /**
315  * Implementation of dt_it_ops::load.
316  *
317  * Used with regular (non-striped) objects.
318  *
319  * \see dt_it_ops::load() in the API description for details.
320  */
321 static int lod_it_load(const struct lu_env *env, const struct dt_it *di,
322                        __u64 hash)
323 {
324         const struct lod_it *it = (const struct lod_it *)di;
325
326         LOD_CHECK_IT(env, it);
327         return it->lit_obj->do_index_ops->dio_it.load(env, it->lit_it, hash);
328 }
329
330 /**
331  * Implementation of dt_it_ops::key_rec.
332  *
333  * Used with regular (non-striped) objects.
334  *
335  * \see dt_it_ops::rec() in the API description for details.
336  */
337 static int lod_it_key_rec(const struct lu_env *env, const struct dt_it *di,
338                           void *key_rec)
339 {
340         const struct lod_it *it = (const struct lod_it *)di;
341
342         LOD_CHECK_IT(env, it);
343         return it->lit_obj->do_index_ops->dio_it.key_rec(env, it->lit_it,
344                                                          key_rec);
345 }
346
347 static struct dt_index_operations lod_index_ops = {
348         .dio_lookup             = lod_lookup,
349         .dio_declare_insert     = lod_declare_insert,
350         .dio_insert             = lod_insert,
351         .dio_declare_delete     = lod_declare_delete,
352         .dio_delete             = lod_delete,
353         .dio_it = {
354                 .init           = lod_it_init,
355                 .fini           = lod_it_fini,
356                 .get            = lod_it_get,
357                 .put            = lod_it_put,
358                 .next           = lod_it_next,
359                 .key            = lod_it_key,
360                 .key_size       = lod_it_key_size,
361                 .rec            = lod_it_rec,
362                 .rec_size       = lod_it_rec_size,
363                 .store          = lod_it_store,
364                 .load           = lod_it_load,
365                 .key_rec        = lod_it_key_rec,
366         }
367 };
368
369 /**
370  * Implementation of dt_it_ops::init.
371  *
372  * Used with striped objects. Internally just initializes the iterator
373  * on the first stripe.
374  *
375  * \see dt_it_ops::init() in the API description for details.
376  */
377 static struct dt_it *lod_striped_it_init(const struct lu_env *env,
378                                          struct dt_object *dt, __u32 attr)
379 {
380         struct lod_object       *lo = lod_dt_obj(dt);
381         struct dt_object        *next;
382         struct lod_it           *it = &lod_env_info(env)->lti_it;
383         struct dt_it            *it_next;
384         ENTRY;
385
386         LASSERT(lo->ldo_dir_stripe_count > 0);
387         next = lo->ldo_stripe[0];
388         LASSERT(next != NULL);
389         LASSERT(next->do_index_ops != NULL);
390
391         it_next = next->do_index_ops->dio_it.init(env, next, attr);
392         if (IS_ERR(it_next))
393                 return it_next;
394
395         /* currently we do not use more than one iterator per thread
396          * so we store it in thread info. if at some point we need
397          * more active iterators in a single thread, we can allocate
398          * additional ones */
399         LASSERT(it->lit_obj == NULL);
400
401         it->lit_stripe_index = 0;
402         it->lit_attr = attr;
403         it->lit_it = it_next;
404         it->lit_obj = dt;
405
406         return (struct dt_it *)it;
407 }
408
/*
 * Assert that a striped iterator is in use and consistent with its
 * object: the object and the underlying iterator are set, the directory
 * has at least one stripe, and the current stripe index is within the
 * stripe count. The env argument is not used.
 */
#define LOD_CHECK_STRIPED_IT(env, it, lo)                             \
do {                                                                  \
        LASSERT((it)->lit_obj != NULL);                               \
        LASSERT((it)->lit_it != NULL);                                \
        LASSERT((lo)->ldo_dir_stripe_count > 0);                      \
        LASSERT((it)->lit_stripe_index < (lo)->ldo_dir_stripe_count); \
} while (0)
416
417 /**
418  * Implementation of dt_it_ops::fini.
419  *
420  * Used with striped objects.
421  *
422  * \see dt_it_ops::fini() in the API description for details.
423  */
424 static void lod_striped_it_fini(const struct lu_env *env, struct dt_it *di)
425 {
426         struct lod_it           *it = (struct lod_it *)di;
427         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
428         struct dt_object        *next;
429
430         /* If lit_it == NULL, then it means the sub_it has been finished,
431          * which only happens in failure cases, see lod_striped_it_next() */
432         if (it->lit_it != NULL) {
433                 LOD_CHECK_STRIPED_IT(env, it, lo);
434
435                 next = lo->ldo_stripe[it->lit_stripe_index];
436                 LASSERT(next != NULL);
437                 LASSERT(next->do_index_ops != NULL);
438
439                 next->do_index_ops->dio_it.fini(env, it->lit_it);
440         }
441
442         /* the iterator not in use any more */
443         it->lit_obj = NULL;
444         it->lit_it = NULL;
445         it->lit_stripe_index = 0;
446 }
447
448 /**
449  * Implementation of dt_it_ops::get.
450  *
451  * Right now it's not used widely, only to reset the iterator to the
452  * initial position. It should be possible to implement a full version
453  * which chooses a correct stripe to be able to position with any key.
454  *
455  * \see dt_it_ops::get() in the API description for details.
456  */
457 static int lod_striped_it_get(const struct lu_env *env, struct dt_it *di,
458                               const struct dt_key *key)
459 {
460         const struct lod_it     *it = (const struct lod_it *)di;
461         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
462         struct dt_object        *next;
463         ENTRY;
464
465         LOD_CHECK_STRIPED_IT(env, it, lo);
466
467         next = lo->ldo_stripe[it->lit_stripe_index];
468         LASSERT(next != NULL);
469         LASSERT(next->do_index_ops != NULL);
470
471         return next->do_index_ops->dio_it.get(env, it->lit_it, key);
472 }
473
474 /**
475  * Implementation of dt_it_ops::put.
476  *
477  * Used with striped objects.
478  *
479  * \see dt_it_ops::put() in the API description for details.
480  */
481 static void lod_striped_it_put(const struct lu_env *env, struct dt_it *di)
482 {
483         struct lod_it           *it = (struct lod_it *)di;
484         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
485         struct dt_object        *next;
486
487         LOD_CHECK_STRIPED_IT(env, it, lo);
488
489         next = lo->ldo_stripe[it->lit_stripe_index];
490         LASSERT(next != NULL);
491         LASSERT(next->do_index_ops != NULL);
492
493         return next->do_index_ops->dio_it.put(env, it->lit_it);
494 }
495
496 /**
497  * Implementation of dt_it_ops::next.
498  *
499  * Used with striped objects. When the end of the current stripe is
500  * reached, the method takes the next stripe's iterator.
501  *
502  * \see dt_it_ops::next() in the API description for details.
503  */
504 static int lod_striped_it_next(const struct lu_env *env, struct dt_it *di)
505 {
506         struct lod_it           *it = (struct lod_it *)di;
507         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
508         struct dt_object        *next;
509         struct dt_it            *it_next;
510         int                     rc;
511         ENTRY;
512
513         LOD_CHECK_STRIPED_IT(env, it, lo);
514
515         next = lo->ldo_stripe[it->lit_stripe_index];
516         LASSERT(next != NULL);
517         LASSERT(next->do_index_ops != NULL);
518 again:
519         rc = next->do_index_ops->dio_it.next(env, it->lit_it);
520         if (rc < 0)
521                 RETURN(rc);
522
523         if (rc == 0 && it->lit_stripe_index == 0)
524                 RETURN(rc);
525
526         if (rc == 0 && it->lit_stripe_index > 0) {
527                 struct lu_dirent *ent;
528
529                 ent = (struct lu_dirent *)lod_env_info(env)->lti_key;
530
531                 rc = next->do_index_ops->dio_it.rec(env, it->lit_it,
532                                                     (struct dt_rec *)ent,
533                                                     it->lit_attr);
534                 if (rc != 0)
535                         RETURN(rc);
536
537                 /* skip . and .. for slave stripe */
538                 if ((strncmp(ent->lde_name, ".",
539                              le16_to_cpu(ent->lde_namelen)) == 0 &&
540                      le16_to_cpu(ent->lde_namelen) == 1) ||
541                     (strncmp(ent->lde_name, "..",
542                              le16_to_cpu(ent->lde_namelen)) == 0 &&
543                      le16_to_cpu(ent->lde_namelen) == 2))
544                         goto again;
545
546                 RETURN(rc);
547         }
548
549         /* go to next stripe */
550         if (it->lit_stripe_index + 1 >= lo->ldo_dir_stripe_count)
551                 RETURN(1);
552
553         it->lit_stripe_index++;
554
555         next->do_index_ops->dio_it.put(env, it->lit_it);
556         next->do_index_ops->dio_it.fini(env, it->lit_it);
557         it->lit_it = NULL;
558
559         next = lo->ldo_stripe[it->lit_stripe_index];
560         LASSERT(next != NULL);
561         rc = next->do_ops->do_index_try(env, next, &dt_directory_features);
562         if (rc != 0)
563                 RETURN(rc);
564
565         LASSERT(next->do_index_ops != NULL);
566
567         it_next = next->do_index_ops->dio_it.init(env, next, it->lit_attr);
568         if (!IS_ERR(it_next)) {
569                 it->lit_it = it_next;
570                 goto again;
571         } else {
572                 rc = PTR_ERR(it_next);
573         }
574
575         RETURN(rc);
576 }
577
578 /**
579  * Implementation of dt_it_ops::key.
580  *
581  * Used with striped objects.
582  *
583  * \see dt_it_ops::key() in the API description for details.
584  */
585 static struct dt_key *lod_striped_it_key(const struct lu_env *env,
586                                          const struct dt_it *di)
587 {
588         const struct lod_it     *it = (const struct lod_it *)di;
589         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
590         struct dt_object        *next;
591
592         LOD_CHECK_STRIPED_IT(env, it, lo);
593
594         next = lo->ldo_stripe[it->lit_stripe_index];
595         LASSERT(next != NULL);
596         LASSERT(next->do_index_ops != NULL);
597
598         return next->do_index_ops->dio_it.key(env, it->lit_it);
599 }
600
601 /**
602  * Implementation of dt_it_ops::key_size.
603  *
604  * Used with striped objects.
605  *
606  * \see dt_it_ops::size() in the API description for details.
607  */
608 static int lod_striped_it_key_size(const struct lu_env *env,
609                                    const struct dt_it *di)
610 {
611         struct lod_it           *it = (struct lod_it *)di;
612         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
613         struct dt_object        *next;
614
615         LOD_CHECK_STRIPED_IT(env, it, lo);
616
617         next = lo->ldo_stripe[it->lit_stripe_index];
618         LASSERT(next != NULL);
619         LASSERT(next->do_index_ops != NULL);
620
621         return next->do_index_ops->dio_it.key_size(env, it->lit_it);
622 }
623
624 /**
625  * Implementation of dt_it_ops::rec.
626  *
627  * Used with striped objects.
628  *
629  * \see dt_it_ops::rec() in the API description for details.
630  */
631 static int lod_striped_it_rec(const struct lu_env *env, const struct dt_it *di,
632                               struct dt_rec *rec, __u32 attr)
633 {
634         const struct lod_it     *it = (const struct lod_it *)di;
635         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
636         struct dt_object        *next;
637
638         LOD_CHECK_STRIPED_IT(env, it, lo);
639
640         next = lo->ldo_stripe[it->lit_stripe_index];
641         LASSERT(next != NULL);
642         LASSERT(next->do_index_ops != NULL);
643
644         return next->do_index_ops->dio_it.rec(env, it->lit_it, rec, attr);
645 }
646
647 /**
648  * Implementation of dt_it_ops::rec_size.
649  *
650  * Used with striped objects.
651  *
652  * \see dt_it_ops::rec_size() in the API description for details.
653  */
654 static int lod_striped_it_rec_size(const struct lu_env *env,
655                                    const struct dt_it *di, __u32 attr)
656 {
657         struct lod_it           *it = (struct lod_it *)di;
658         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
659         struct dt_object        *next;
660
661         LOD_CHECK_STRIPED_IT(env, it, lo);
662
663         next = lo->ldo_stripe[it->lit_stripe_index];
664         LASSERT(next != NULL);
665         LASSERT(next->do_index_ops != NULL);
666
667         return next->do_index_ops->dio_it.rec_size(env, it->lit_it, attr);
668 }
669
670 /**
671  * Implementation of dt_it_ops::store.
672  *
673  * Used with striped objects.
674  *
675  * \see dt_it_ops::store() in the API description for details.
676  */
677 static __u64 lod_striped_it_store(const struct lu_env *env,
678                                   const struct dt_it *di)
679 {
680         const struct lod_it     *it = (const struct lod_it *)di;
681         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
682         struct dt_object        *next;
683
684         LOD_CHECK_STRIPED_IT(env, it, lo);
685
686         next = lo->ldo_stripe[it->lit_stripe_index];
687         LASSERT(next != NULL);
688         LASSERT(next->do_index_ops != NULL);
689
690         return next->do_index_ops->dio_it.store(env, it->lit_it);
691 }
692
693 /**
694  * Implementation of dt_it_ops::load.
695  *
696  * Used with striped objects.
697  *
698  * \see dt_it_ops::load() in the API description for details.
699  */
700 static int lod_striped_it_load(const struct lu_env *env,
701                                const struct dt_it *di, __u64 hash)
702 {
703         const struct lod_it     *it = (const struct lod_it *)di;
704         struct lod_object       *lo = lod_dt_obj(it->lit_obj);
705         struct dt_object        *next;
706
707         LOD_CHECK_STRIPED_IT(env, it, lo);
708
709         next = lo->ldo_stripe[it->lit_stripe_index];
710         LASSERT(next != NULL);
711         LASSERT(next->do_index_ops != NULL);
712
713         return next->do_index_ops->dio_it.load(env, it->lit_it, hash);
714 }
715
716 static struct dt_index_operations lod_striped_index_ops = {
717         .dio_lookup             = lod_lookup,
718         .dio_declare_insert     = lod_declare_insert,
719         .dio_insert             = lod_insert,
720         .dio_declare_delete     = lod_declare_delete,
721         .dio_delete             = lod_delete,
722         .dio_it = {
723                 .init           = lod_striped_it_init,
724                 .fini           = lod_striped_it_fini,
725                 .get            = lod_striped_it_get,
726                 .put            = lod_striped_it_put,
727                 .next           = lod_striped_it_next,
728                 .key            = lod_striped_it_key,
729                 .key_size       = lod_striped_it_key_size,
730                 .rec            = lod_striped_it_rec,
731                 .rec_size       = lod_striped_it_rec_size,
732                 .store          = lod_striped_it_store,
733                 .load           = lod_striped_it_load,
734         }
735 };
736
737 /**
738  * Append the FID for each shard of the striped directory after the
739  * given LMV EA header.
740  *
741  * To simplify striped directory and the consistency verification,
742  * we only store the LMV EA header on disk, for both master object
743  * and slave objects. When someone wants to know the whole LMV EA,
744  * such as client readdir(), we can build the entire LMV EA on the
745  * MDT side (in RAM) via iterating the sub-directory entries that
746  * are contained in the master object of the stripe directory.
747  *
748  * For the master object of the striped directory, the valid name
749  * for each shard is composed of the ${shard_FID}:${shard_idx}.
750  *
751  * There may be holes in the LMV EA if some shards' name entries
752  * are corrupted or lost.
753  *
754  * \param[in] env       pointer to the thread context
755  * \param[in] lo        pointer to the master object of the striped directory
756  * \param[in] buf       pointer to the lu_buf which will hold the LMV EA
757  * \param[in] resize    whether re-allocate the buffer if it is not big enough
758  *
759  * \retval              positive size of the LMV EA
760  * \retval              0 for nothing to be loaded
761  * \retval              negative error number on failure
762  */
763 int lod_load_lmv_shards(const struct lu_env *env, struct lod_object *lo,
764                         struct lu_buf *buf, bool resize)
765 {
766         struct lu_dirent        *ent    =
767                         (struct lu_dirent *)lod_env_info(env)->lti_key;
768         struct lod_device       *lod    = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
769         struct dt_object        *obj    = dt_object_child(&lo->ldo_obj);
770         struct lmv_mds_md_v1    *lmv1   = buf->lb_buf;
771         struct dt_it            *it;
772         const struct dt_it_ops  *iops;
773         __u32                    stripes;
774         __u32                    magic  = le32_to_cpu(lmv1->lmv_magic);
775         size_t                   lmv1_size;
776         int                      rc;
777         ENTRY;
778
779         if (magic != LMV_MAGIC_V1)
780                 RETURN(0);
781
782         stripes = le32_to_cpu(lmv1->lmv_stripe_count);
783         if (stripes < 1)
784                 RETURN(0);
785
786         rc = lmv_mds_md_size(stripes, magic);
787         if (rc < 0)
788                 RETURN(rc);
789         lmv1_size = rc;
790         if (buf->lb_len < lmv1_size) {
791                 struct lu_buf tbuf;
792
793                 if (!resize)
794                         RETURN(-ERANGE);
795
796                 tbuf = *buf;
797                 buf->lb_buf = NULL;
798                 buf->lb_len = 0;
799                 lu_buf_alloc(buf, lmv1_size);
800                 lmv1 = buf->lb_buf;
801                 if (lmv1 == NULL)
802                         RETURN(-ENOMEM);
803
804                 memcpy(buf->lb_buf, tbuf.lb_buf, tbuf.lb_len);
805         }
806
807         if (unlikely(!dt_try_as_dir(env, obj)))
808                 RETURN(-ENOTDIR);
809
810         memset(&lmv1->lmv_stripe_fids[0], 0, stripes * sizeof(struct lu_fid));
811         iops = &obj->do_index_ops->dio_it;
812         it = iops->init(env, obj, LUDA_64BITHASH);
813         if (IS_ERR(it))
814                 RETURN(PTR_ERR(it));
815
816         rc = iops->load(env, it, 0);
817         if (rc == 0)
818                 rc = iops->next(env, it);
819         else if (rc > 0)
820                 rc = 0;
821
822         while (rc == 0) {
823                 char             name[FID_LEN + 2] = "";
824                 struct lu_fid    fid;
825                 __u32            index;
826                 int              len;
827
828                 rc = iops->rec(env, it, (struct dt_rec *)ent, LUDA_64BITHASH);
829                 if (rc != 0)
830                         break;
831
832                 rc = -EIO;
833
834                 fid_le_to_cpu(&fid, &ent->lde_fid);
835                 ent->lde_namelen = le16_to_cpu(ent->lde_namelen);
836                 if (ent->lde_name[0] == '.') {
837                         if (ent->lde_namelen == 1)
838                                 goto next;
839
840                         if (ent->lde_namelen == 2 && ent->lde_name[1] == '.')
841                                 goto next;
842                 }
843
844                 len = snprintf(name, sizeof(name),
845                                DFID":", PFID(&ent->lde_fid));
846                 /* The ent->lde_name is composed of ${FID}:${index} */
847                 if (ent->lde_namelen < len + 1 ||
848                     memcmp(ent->lde_name, name, len) != 0) {
849                         CDEBUG(lod->lod_lmv_failout ? D_ERROR : D_INFO,
850                                "%s: invalid shard name %.*s with the FID "DFID
851                                " for the striped directory "DFID", %s\n",
852                                lod2obd(lod)->obd_name, ent->lde_namelen,
853                                ent->lde_name, PFID(&fid),
854                                PFID(lu_object_fid(&obj->do_lu)),
855                                lod->lod_lmv_failout ? "failout" : "skip");
856
857                         if (lod->lod_lmv_failout)
858                                 break;
859
860                         goto next;
861                 }
862
863                 index = 0;
864                 do {
865                         if (ent->lde_name[len] < '0' ||
866                             ent->lde_name[len] > '9') {
867                                 CDEBUG(lod->lod_lmv_failout ? D_ERROR : D_INFO,
868                                        "%s: invalid shard name %.*s with the "
869                                        "FID "DFID" for the striped directory "
870                                        DFID", %s\n",
871                                        lod2obd(lod)->obd_name, ent->lde_namelen,
872                                        ent->lde_name, PFID(&fid),
873                                        PFID(lu_object_fid(&obj->do_lu)),
874                                        lod->lod_lmv_failout ?
875                                        "failout" : "skip");
876
877                                 if (lod->lod_lmv_failout)
878                                         break;
879
880                                 goto next;
881                         }
882
883                         index = index * 10 + ent->lde_name[len++] - '0';
884                 } while (len < ent->lde_namelen);
885
886                 if (len == ent->lde_namelen) {
887                         /* Out of LMV EA range. */
888                         if (index >= stripes) {
889                                 CERROR("%s: the shard %.*s for the striped "
890                                        "directory "DFID" is out of the known "
891                                        "LMV EA range [0 - %u], failout\n",
892                                        lod2obd(lod)->obd_name, ent->lde_namelen,
893                                        ent->lde_name,
894                                        PFID(lu_object_fid(&obj->do_lu)),
895                                        stripes - 1);
896
897                                 break;
898                         }
899
900                         /* The slot has been occupied. */
901                         if (!fid_is_zero(&lmv1->lmv_stripe_fids[index])) {
902                                 struct lu_fid fid0;
903
904                                 fid_le_to_cpu(&fid0,
905                                         &lmv1->lmv_stripe_fids[index]);
906                                 CERROR("%s: both the shard "DFID" and "DFID
907                                        " for the striped directory "DFID
908                                        " claim the same LMV EA slot at the "
909                                        "index %d, failout\n",
910                                        lod2obd(lod)->obd_name,
911                                        PFID(&fid0), PFID(&fid),
912                                        PFID(lu_object_fid(&obj->do_lu)), index);
913
914                                 break;
915                         }
916
917                         /* stored as LE mode */
918                         lmv1->lmv_stripe_fids[index] = ent->lde_fid;
919
920 next:
921                         rc = iops->next(env, it);
922                 }
923         }
924
925         iops->put(env, it);
926         iops->fini(env, it);
927
928         RETURN(rc > 0 ? lmv_mds_md_size(stripes, magic) : rc);
929 }
930
931 /**
932  * Implementation of dt_object_operations::do_index_try.
933  *
934  * \see dt_object_operations::do_index_try() in the API description for details.
935  */
936 static int lod_index_try(const struct lu_env *env, struct dt_object *dt,
937                          const struct dt_index_features *feat)
938 {
939         struct lod_object       *lo = lod_dt_obj(dt);
940         struct dt_object        *next = dt_object_child(dt);
941         int                     rc;
942         ENTRY;
943
944         LASSERT(next->do_ops);
945         LASSERT(next->do_ops->do_index_try);
946
947         rc = lod_striping_load(env, lo);
948         if (rc != 0)
949                 RETURN(rc);
950
951         rc = next->do_ops->do_index_try(env, next, feat);
952         if (rc != 0)
953                 RETURN(rc);
954
955         if (lo->ldo_dir_stripe_count > 0) {
956                 int i;
957
958                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
959                         if (dt_object_exists(lo->ldo_stripe[i]) == 0)
960                                 continue;
961                         rc = lo->ldo_stripe[i]->do_ops->do_index_try(env,
962                                                 lo->ldo_stripe[i], feat);
963                         if (rc != 0)
964                                 RETURN(rc);
965                 }
966                 dt->do_index_ops = &lod_striped_index_ops;
967         } else {
968                 dt->do_index_ops = &lod_index_ops;
969         }
970
971         RETURN(rc);
972 }
973
/**
 * Implementation of dt_object_operations::do_read_lock.
 *
 * The request is forwarded, with the same \a role, to the child object.
 *
 * \see dt_object_operations::do_read_lock() in the API description for details.
 */
static void lod_read_lock(const struct lu_env *env, struct dt_object *dt,
                          unsigned role)
{
        struct dt_object *child = dt_object_child(dt);

        dt_read_lock(env, child, role);
}
984
/**
 * Implementation of dt_object_operations::do_write_lock.
 *
 * The request is forwarded, with the same \a role, to the child object.
 *
 * \see dt_object_operations::do_write_lock() in the API description for
 * details.
 */
static void lod_write_lock(const struct lu_env *env, struct dt_object *dt,
                           unsigned role)
{
        struct dt_object *child = dt_object_child(dt);

        dt_write_lock(env, child, role);
}
996
/**
 * Implementation of dt_object_operations::do_read_unlock.
 *
 * The request is forwarded to the child object.
 *
 * \see dt_object_operations::do_read_unlock() in the API description for
 * details.
 */
static void lod_read_unlock(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        dt_read_unlock(env, child);
}
1007
/**
 * Implementation of dt_object_operations::do_write_unlock.
 *
 * The request is forwarded to the child object.
 *
 * \see dt_object_operations::do_write_unlock() in the API description for
 * details.
 */
static void lod_write_unlock(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        dt_write_unlock(env, child);
}
1018
/**
 * Implementation of dt_object_operations::do_write_locked.
 *
 * The query is forwarded to the child object and its answer is returned
 * unchanged.
 *
 * \see dt_object_operations::do_write_locked() in the API description for
 * details.
 */
static int lod_write_locked(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_write_locked(env, child);
}
1029
/**
 * Implementation of dt_object_operations::do_attr_get.
 *
 * The attributes are always read from the child (master) object.
 *
 * \see dt_object_operations::do_attr_get() in the API description for details.
 */
static int lod_attr_get(const struct lu_env *env,
                        struct dt_object *dt,
                        struct lu_attr *attr)
{
        struct dt_object *child = dt_object_child(dt);

        /* Note: for a striped directory the client merges the attributes
         * of all the sub-stripes itself, see lmv_merge_attr(), and no MDD
         * logic depends on directory nlink/size/time, so the master inode
         * nlink and size can always be used for now. */
        return dt_attr_get(env, child, attr);
}
1045
1046 static inline void lod_adjust_stripe_info(struct lod_layout_component *comp,
1047                                           struct lov_desc *desc)
1048 {
1049         if (comp->llc_pattern != LOV_PATTERN_MDT) {
1050                 if (!comp->llc_stripe_count)
1051                         comp->llc_stripe_count =
1052                                 desc->ld_default_stripe_count;
1053         }
1054         if (comp->llc_stripe_size <= 0)
1055                 comp->llc_stripe_size = desc->ld_default_stripe_size;
1056 }
1057
1058 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
1059                             struct thandle *th,
1060                             struct lod_obj_stripe_cb_data *data)
1061 {
1062         struct lod_layout_component *lod_comp;
1063         int i, j, rc;
1064         ENTRY;
1065
1066         LASSERT(lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL);
1067         for (i = 0; i < lo->ldo_comp_cnt; i++) {
1068                 lod_comp = &lo->ldo_comp_entries[i];
1069
1070                 if (lod_comp->llc_stripe == NULL)
1071                         continue;
1072
1073                 /* has stripe but not inited yet, this component has been
1074                  * declared to be created, but hasn't created yet.
1075                  */
1076                 if (!lod_comp_inited(lod_comp))
1077                         continue;
1078
1079                 if (data->locd_comp_skip_cb &&
1080                     data->locd_comp_skip_cb(env, lo, i, data))
1081                         continue;
1082
1083                 if (data->locd_comp_cb) {
1084                         rc = data->locd_comp_cb(env, lo, i, data);
1085                         if (rc)
1086                                 RETURN(rc);
1087                 }
1088
1089                 /* could used just to do sth about component, not each
1090                  * stripes
1091                  */
1092                 if (!data->locd_stripe_cb)
1093                         continue;
1094
1095                 LASSERT(lod_comp->llc_stripe_count > 0);
1096                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
1097                         struct dt_object *dt = lod_comp->llc_stripe[j];
1098
1099                         if (dt == NULL)
1100                                 continue;
1101                         rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
1102                         if (rc != 0)
1103                                 RETURN(rc);
1104                 }
1105         }
1106         RETURN(0);
1107 }
1108
1109 static bool lod_obj_attr_set_comp_skip_cb(const struct lu_env *env,
1110                 struct lod_object *lo, int comp_idx,
1111                 struct lod_obj_stripe_cb_data *data)
1112 {
1113         struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
1114         bool skipped = false;
1115
1116         if (!(data->locd_attr->la_valid & LA_LAYOUT_VERSION))
1117                 return skipped;
1118
1119         switch (lo->ldo_flr_state) {
1120         case LCM_FL_WRITE_PENDING: {
1121                 int i;
1122
1123                 /* skip stale components */
1124                 if (lod_comp->llc_flags & LCME_FL_STALE) {
1125                         skipped = true;
1126                         break;
1127                 }
1128
1129                 /* skip valid and overlapping components, therefore any
1130                  * attempts to write overlapped components will never succeed
1131                  * because client will get EINPROGRESS. */
1132                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
1133                         if (i == comp_idx)
1134                                 continue;
1135
1136                         if (lo->ldo_comp_entries[i].llc_flags & LCME_FL_STALE)
1137                                 continue;
1138
1139                         if (lu_extent_is_overlapped(&lod_comp->llc_extent,
1140                                         &lo->ldo_comp_entries[i].llc_extent)) {
1141                                 skipped = true;
1142                                 break;
1143                         }
1144                 }
1145                 break;
1146         }
1147         default:
1148                 LASSERTF(0, "impossible: %d\n", lo->ldo_flr_state);
1149         case LCM_FL_SYNC_PENDING:
1150                 break;
1151         }
1152
1153         CDEBUG(D_LAYOUT, DFID": %s to set component %x to version: %u\n",
1154                PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
1155                skipped ? "skipped" : "chose", lod_comp->llc_id,
1156                data->locd_attr->la_layout_version);
1157
1158         return skipped;
1159 }
1160
1161 static inline int
1162 lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
1163                            struct dt_object *dt, struct thandle *th,
1164                            int comp_idx, int stripe_idx,
1165                            struct lod_obj_stripe_cb_data *data)
1166 {
1167         if (data->locd_declare)
1168                 return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
1169
1170         if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
1171                 CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
1172                        PFID(lu_object_fid(&dt->do_lu)),
1173                        data->locd_attr->la_layout_version, comp_idx);
1174         }
1175
1176         return lod_sub_attr_set(env, dt, data->locd_attr, th);
1177 }
1178
1179 /**
1180  * Implementation of dt_object_operations::do_declare_attr_set.
1181  *
1182  * If the object is striped, then apply the changes to all the stripes.
1183  *
1184  * \see dt_object_operations::do_declare_attr_set() in the API description
1185  * for details.
1186  */
1187 static int lod_declare_attr_set(const struct lu_env *env,
1188                                 struct dt_object *dt,
1189                                 const struct lu_attr *attr,
1190                                 struct thandle *th)
1191 {
1192         struct dt_object  *next = dt_object_child(dt);
1193         struct lod_object *lo = lod_dt_obj(dt);
1194         int                rc, i;
1195         ENTRY;
1196
1197         /*
1198          * declare setattr on the local object
1199          */
1200         rc = lod_sub_declare_attr_set(env, next, attr, th);
1201         if (rc)
1202                 RETURN(rc);
1203
1204         /* osp_declare_attr_set() ignores all attributes other than
1205          * UID, GID, PROJID, and size, and osp_attr_set() ignores all
1206          * but UID, GID and PROJID. Declaration of size attr setting
1207          * happens through lod_declare_init_size(), and not through
1208          * this function. Therefore we need not load striping unless
1209          * ownership is changing.  This should save memory and (we hope)
1210          * speed up rename().
1211          */
1212         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1213                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1214                         RETURN(rc);
1215
1216                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1217                         RETURN(0);
1218         } else {
1219                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID | LA_MODE |
1220                                         LA_ATIME | LA_MTIME | LA_CTIME |
1221                                         LA_FLAGS)))
1222                         RETURN(rc);
1223         }
1224         /*
1225          * load striping information, notice we don't do this when object
1226          * is being initialized as we don't need this information till
1227          * few specific cases like destroy, chown
1228          */
1229         rc = lod_striping_load(env, lo);
1230         if (rc)
1231                 RETURN(rc);
1232
1233         if (!lod_obj_is_striped(dt))
1234                 RETURN(0);
1235
1236         /*
1237          * if object is striped declare changes on the stripes
1238          */
1239         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1240                 LASSERT(lo->ldo_stripe);
1241                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1242                         if (lo->ldo_stripe[i] == NULL)
1243                                 continue;
1244                         rc = lod_sub_declare_attr_set(env, lo->ldo_stripe[i],
1245                                                       attr, th);
1246                         if (rc != 0)
1247                                 RETURN(rc);
1248                 }
1249         } else {
1250                 struct lod_obj_stripe_cb_data data = { { 0 } };
1251
1252                 data.locd_attr = attr;
1253                 data.locd_declare = true;
1254                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1255                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1256         }
1257
1258         if (rc)
1259                 RETURN(rc);
1260
1261         if (!dt_object_exists(next) || dt_object_remote(next) ||
1262             !S_ISREG(attr->la_mode))
1263                 RETURN(0);
1264
1265         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1266                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
1267                 RETURN(rc);
1268         }
1269
1270         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) ||
1271             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1272                 struct lod_thread_info *info = lod_env_info(env);
1273                 struct lu_buf *buf = &info->lti_buf;
1274
1275                 buf->lb_buf = info->lti_ea_store;
1276                 buf->lb_len = info->lti_ea_store_size;
1277                 rc = lod_sub_declare_xattr_set(env, next, buf, XATTR_NAME_LOV,
1278                                                LU_XATTR_REPLACE, th);
1279         }
1280
1281         RETURN(rc);
1282 }
1283
1284 /**
1285  * Implementation of dt_object_operations::do_attr_set.
1286  *
1287  * If the object is striped, then apply the changes to all or subset of
1288  * the stripes depending on the object type and specific attributes.
1289  *
1290  * \see dt_object_operations::do_attr_set() in the API description for details.
1291  */
1292 static int lod_attr_set(const struct lu_env *env,
1293                         struct dt_object *dt,
1294                         const struct lu_attr *attr,
1295                         struct thandle *th)
1296 {
1297         struct dt_object        *next = dt_object_child(dt);
1298         struct lod_object       *lo = lod_dt_obj(dt);
1299         int                     rc, i;
1300         ENTRY;
1301
1302         /*
1303          * apply changes to the local object
1304          */
1305         rc = lod_sub_attr_set(env, next, attr, th);
1306         if (rc)
1307                 RETURN(rc);
1308
1309         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1310                 if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
1311                         RETURN(rc);
1312
1313                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
1314                         RETURN(0);
1315         } else {
1316                 if (!(attr->la_valid & (LA_UID | LA_GID | LA_MODE | LA_PROJID |
1317                                         LA_ATIME | LA_MTIME | LA_CTIME |
1318                                         LA_FLAGS)))
1319                         RETURN(rc);
1320         }
1321
1322         /* FIXME: a tricky case in the code path of mdd_layout_change():
1323          * the in-memory striping information has been freed in lod_xattr_set()
1324          * due to layout change. It has to load stripe here again. It only
1325          * changes flags of layout so declare_attr_set() is still accurate */
1326         rc = lod_striping_load(env, lo);
1327         if (rc)
1328                 RETURN(rc);
1329
1330         if (!lod_obj_is_striped(dt))
1331                 RETURN(0);
1332
1333         /*
1334          * if object is striped, apply changes to all the stripes
1335          */
1336         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
1337                 LASSERT(lo->ldo_stripe);
1338                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1339                         if (unlikely(lo->ldo_stripe[i] == NULL))
1340                                 continue;
1341
1342                         if ((dt_object_exists(lo->ldo_stripe[i]) == 0))
1343                                 continue;
1344
1345                         rc = lod_sub_attr_set(env, lo->ldo_stripe[i], attr, th);
1346                         if (rc != 0)
1347                                 break;
1348                 }
1349         } else {
1350                 struct lod_obj_stripe_cb_data data = { { 0 } };
1351
1352                 data.locd_attr = attr;
1353                 data.locd_declare = false;
1354                 data.locd_comp_skip_cb = lod_obj_attr_set_comp_skip_cb;
1355                 data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
1356                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
1357         }
1358
1359         if (rc)
1360                 RETURN(rc);
1361
1362         if (!dt_object_exists(next) || dt_object_remote(next) ||
1363             !S_ISREG(attr->la_mode))
1364                 RETURN(0);
1365
1366         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_STRIPE)) {
1367                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
1368                 RETURN(rc);
1369         }
1370
1371         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE)) {
1372                 struct lod_thread_info *info = lod_env_info(env);
1373                 struct lu_buf *buf = &info->lti_buf;
1374                 struct ost_id *oi = &info->lti_ostid;
1375                 struct lu_fid *fid = &info->lti_fid;
1376                 struct lov_mds_md_v1 *lmm;
1377                 struct lov_ost_data_v1 *objs;
1378                 __u32 magic;
1379
1380                 rc = lod_get_lov_ea(env, lo);
1381                 if (rc <= 0)
1382                         RETURN(rc);
1383
1384                 buf->lb_buf = info->lti_ea_store;
1385                 buf->lb_len = info->lti_ea_store_size;
1386                 lmm = info->lti_ea_store;
1387                 magic = le32_to_cpu(lmm->lmm_magic);
1388                 if (magic == LOV_MAGIC_COMP_V1) {
1389                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1390                         struct lov_comp_md_entry_v1 *lcme =
1391                                                 &lcm->lcm_entries[0];
1392
1393                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
1394                         magic = le32_to_cpu(lmm->lmm_magic);
1395                 }
1396
1397                 if (magic == LOV_MAGIC_V1)
1398                         objs = &(lmm->lmm_objects[0]);
1399                 else
1400                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1401                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
1402                 ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
1403                 fid->f_oid--;
1404                 fid_to_ostid(fid, oi);
1405                 ostid_cpu_to_le(oi, &objs->l_ost_oi);
1406
1407                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1408                                        LU_XATTR_REPLACE, th);
1409         } else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PFL_RANGE)) {
1410                 struct lod_thread_info *info = lod_env_info(env);
1411                 struct lu_buf *buf = &info->lti_buf;
1412                 struct lov_comp_md_v1 *lcm;
1413                 struct lov_comp_md_entry_v1 *lcme;
1414
1415                 rc = lod_get_lov_ea(env, lo);
1416                 if (rc <= 0)
1417                         RETURN(rc);
1418
1419                 buf->lb_buf = info->lti_ea_store;
1420                 buf->lb_len = info->lti_ea_store_size;
1421                 lcm = buf->lb_buf;
1422                 if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
1423                         RETURN(-EINVAL);
1424
1425                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1426                 lcme = &lcm->lcm_entries[0];
1427                 le64_add_cpu(&lcme->lcme_extent.e_start, 1);
1428                 le64_add_cpu(&lcme->lcme_extent.e_end, -1);
1429
1430                 rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LOV,
1431                                        LU_XATTR_REPLACE, th);
1432         }
1433
1434         RETURN(rc);
1435 }
1436
1437 /**
1438  * Implementation of dt_object_operations::do_xattr_get.
1439  *
1440  * If LOV EA is requested from the root object and it's not
1441  * found, then return default striping for the filesystem.
1442  *
1443  * \see dt_object_operations::do_xattr_get() in the API description for details.
1444  */
1445 static int lod_xattr_get(const struct lu_env *env, struct dt_object *dt,
1446                          struct lu_buf *buf, const char *name)
1447 {
1448         struct lod_thread_info *info = lod_env_info(env);
1449         struct lod_device *dev = lu2lod_dev(dt->do_lu.lo_dev);
1450         int is_root;
1451         int rc;
1452         ENTRY;
1453
1454         rc = dt_xattr_get(env, dt_object_child(dt), buf, name);
1455         if (strcmp(name, XATTR_NAME_LMV) == 0) {
1456                 struct lmv_mds_md_v1    *lmv1;
1457                 struct lmv_foreign_md   *lfm;
1458                 int                      rc1 = 0;
1459
1460                 if (rc > (typeof(rc))sizeof(*lmv1))
1461                         RETURN(rc);
1462
1463                 /* short (<= sizeof(struct lmv_mds_md_v1)) foreign LMV case */
1464                 /* XXX empty foreign LMV is not allowed */
1465                 if (rc <= offsetof(typeof(*lfm), lfm_value))
1466                         RETURN(rc = rc > 0 ? -EINVAL : rc);
1467
1468                 if (buf->lb_buf == NULL || buf->lb_len == 0) {
1469                         CLASSERT(sizeof(*lmv1) <= sizeof(info->lti_key));
1470
1471                         /* lti_buf is large enough for *lmv1 or a short
1472                          * (<= sizeof(struct lmv_mds_md_v1)) foreign LMV
1473                          */
1474                         info->lti_buf.lb_buf = info->lti_key;
1475                         info->lti_buf.lb_len = sizeof(*lmv1);
1476                         rc = dt_xattr_get(env, dt_object_child(dt),
1477                                           &info->lti_buf, name);
1478                         if (unlikely(rc <= offsetof(typeof(*lfm),
1479                                                     lfm_value)))
1480                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1481
1482                         lfm = info->lti_buf.lb_buf;
1483                         if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN)
1484                                 RETURN(rc);
1485
1486                         if (unlikely(rc != sizeof(*lmv1)))
1487                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1488
1489                         lmv1 = info->lti_buf.lb_buf;
1490                         /* The on-disk LMV EA only contains header, but the
1491                          * returned LMV EA size should contain the space for
1492                          * the FIDs of all shards of the striped directory. */
1493                         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_V1)
1494                                 rc = lmv_mds_md_size(
1495                                         le32_to_cpu(lmv1->lmv_stripe_count),
1496                                         LMV_MAGIC_V1);
1497                 } else {
1498                         lfm = buf->lb_buf;
1499                         if (le32_to_cpu(lfm->lfm_magic) == LMV_MAGIC_FOREIGN)
1500                                 RETURN(rc);
1501
1502                         if (rc != sizeof(*lmv1))
1503                                 RETURN(rc = rc > 0 ? -EINVAL : rc);
1504
1505                         rc1 = lod_load_lmv_shards(env, lod_dt_obj(dt),
1506                                                   buf, false);
1507                 }
1508
1509                 RETURN(rc = rc1 != 0 ? rc1 : rc);
1510         }
1511
1512         if (rc != -ENODATA || !S_ISDIR(dt->do_lu.lo_header->loh_attr & S_IFMT))
1513                 RETURN(rc);
1514
1515         /*
1516          * XXX: Only used by lfsck
1517          *
1518          * lod returns default striping on the real root of the device
1519          * this is like the root stores default striping for the whole
1520          * filesystem. historically we've been using a different approach
1521          * and store it in the config.
1522          */
1523         dt_root_get(env, dev->lod_child, &info->lti_fid);
1524         is_root = lu_fid_eq(&info->lti_fid, lu_object_fid(&dt->do_lu));
1525
1526         if (is_root && strcmp(XATTR_NAME_LOV, name) == 0) {
1527                 struct lov_user_md *lum = buf->lb_buf;
1528                 struct lov_desc    *desc = &dev->lod_desc;
1529
1530                 if (buf->lb_buf == NULL) {
1531                         rc = sizeof(*lum);
1532                 } else if (buf->lb_len >= sizeof(*lum)) {
1533                         lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1);
1534                         lmm_oi_set_seq(&lum->lmm_oi, FID_SEQ_LOV_DEFAULT);
1535                         lmm_oi_set_id(&lum->lmm_oi, 0);
1536                         lmm_oi_cpu_to_le(&lum->lmm_oi, &lum->lmm_oi);
1537                         lum->lmm_pattern = cpu_to_le32(desc->ld_pattern);
1538                         lum->lmm_stripe_size = cpu_to_le32(
1539                                                 desc->ld_default_stripe_size);
1540                         lum->lmm_stripe_count = cpu_to_le16(
1541                                                 desc->ld_default_stripe_count);
1542                         lum->lmm_stripe_offset = cpu_to_le16(
1543                                                 desc->ld_default_stripe_offset);
1544                         rc = sizeof(*lum);
1545                 } else {
1546                         rc = -ERANGE;
1547                 }
1548         }
1549
1550         RETURN(rc);
1551 }
1552
1553 /**
1554  * Verify LVM EA.
1555  *
1556  * Checks that the magic of the stripe is sane.
1557  *
1558  * \param[in] lod       lod device
1559  * \param[in] lum       a buffer storing LMV EA to verify
1560  *
1561  * \retval              0 if the EA is sane
1562  * \retval              negative otherwise
1563  */
1564 static int lod_verify_md_striping(struct lod_device *lod,
1565                                   const struct lmv_user_md_v1 *lum)
1566 {
1567         if (unlikely(le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC)) {
1568                 CERROR("%s: invalid lmv_user_md: magic = %x, "
1569                        "stripe_offset = %d, stripe_count = %u: rc = %d\n",
1570                        lod2obd(lod)->obd_name, le32_to_cpu(lum->lum_magic),
1571                        (int)le32_to_cpu(lum->lum_stripe_offset),
1572                        le32_to_cpu(lum->lum_stripe_count), -EINVAL);
1573                 return -EINVAL;
1574         }
1575
1576         return 0;
1577 }
1578
1579 /**
1580  * Initialize LMV EA for a slave.
1581  *
1582  * Initialize slave's LMV EA from the master's LMV EA.
1583  *
1584  * \param[in] master_lmv        a buffer containing master's EA
1585  * \param[out] slave_lmv        a buffer where slave's EA will be stored
1586  *
1587  */
1588 static void lod_prep_slave_lmv_md(struct lmv_mds_md_v1 *slave_lmv,
1589                                   const struct lmv_mds_md_v1 *master_lmv)
1590 {
1591         *slave_lmv = *master_lmv;
1592         slave_lmv->lmv_magic = cpu_to_le32(LMV_MAGIC_STRIPE);
1593 }
1594
1595 /**
1596  * Generate LMV EA.
1597  *
1598  * Generate LMV EA from the object passed as \a dt. The object must have
1599  * the stripes created and initialized.
1600  *
1601  * \param[in] env       execution environment
1602  * \param[in] dt        object
1603  * \param[out] lmv_buf  buffer storing generated LMV EA
1604  *
1605  * \retval              0 on success
1606  * \retval              negative if failed
1607  */
1608 static int lod_prep_lmv_md(const struct lu_env *env, struct dt_object *dt,
1609                            struct lu_buf *lmv_buf)
1610 {
1611         struct lod_thread_info  *info = lod_env_info(env);
1612         struct lod_device       *lod = lu2lod_dev(dt->do_lu.lo_dev);
1613         struct lod_object       *lo = lod_dt_obj(dt);
1614         struct lmv_mds_md_v1    *lmm1;
1615         int                     stripe_count;
1616         int                     type = LU_SEQ_RANGE_ANY;
1617         int                     rc;
1618         __u32                   mdtidx;
1619         ENTRY;
1620
1621         LASSERT(lo->ldo_dir_striped != 0);
1622         LASSERT(lo->ldo_dir_stripe_count > 0);
1623         stripe_count = lo->ldo_dir_stripe_count;
1624         /* Only store the LMV EA heahder on the disk. */
1625         if (info->lti_ea_store_size < sizeof(*lmm1)) {
1626                 rc = lod_ea_store_resize(info, sizeof(*lmm1));
1627                 if (rc != 0)
1628                         RETURN(rc);
1629         } else {
1630                 memset(info->lti_ea_store, 0, sizeof(*lmm1));
1631         }
1632
1633         lmm1 = (struct lmv_mds_md_v1 *)info->lti_ea_store;
1634         memset(lmm1, 0, sizeof(*lmm1));
1635         lmm1->lmv_magic = cpu_to_le32(LMV_MAGIC);
1636         lmm1->lmv_stripe_count = cpu_to_le32(stripe_count);
1637         lmm1->lmv_hash_type = cpu_to_le32(lo->ldo_dir_hash_type);
1638         if (lo->ldo_dir_hash_type & LMV_HASH_FLAG_MIGRATION) {
1639                 lmm1->lmv_migrate_hash = cpu_to_le32(lo->ldo_dir_migrate_hash);
1640                 lmm1->lmv_migrate_offset =
1641                         cpu_to_le32(lo->ldo_dir_migrate_offset);
1642         }
1643         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu),
1644                             &mdtidx, &type);
1645         if (rc != 0)
1646                 RETURN(rc);
1647
1648         lmm1->lmv_master_mdt_index = cpu_to_le32(mdtidx);
1649         lmv_buf->lb_buf = info->lti_ea_store;
1650         lmv_buf->lb_len = sizeof(*lmm1);
1651
1652         RETURN(rc);
1653 }
1654
1655 /**
1656  * Create in-core represenation for a striped directory.
1657  *
1658  * Parse the buffer containing LMV EA and instantiate LU objects
1659  * representing the stripe objects. The pointers to the objects are
1660  * stored in ldo_stripe field of \a lo. This function is used when
1661  * we need to access an already created object (i.e. load from a disk).
1662  *
1663  * \param[in] env       execution environment
1664  * \param[in] lo        lod object
1665  * \param[in] buf       buffer containing LMV EA
1666  *
1667  * \retval              0 on success
1668  * \retval              negative if failed
1669  */
1670 int lod_parse_dir_striping(const struct lu_env *env, struct lod_object *lo,
1671                            const struct lu_buf *buf)
1672 {
1673         struct lod_thread_info  *info = lod_env_info(env);
1674         struct lod_device       *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1675         struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
1676         struct dt_object        **stripe;
1677         union lmv_mds_md        *lmm = buf->lb_buf;
1678         struct lmv_mds_md_v1    *lmv1 = &lmm->lmv_md_v1;
1679         struct lu_fid           *fid = &info->lti_fid;
1680         unsigned int            i;
1681         int                     rc = 0;
1682         ENTRY;
1683
1684         LASSERT(mutex_is_locked(&lo->ldo_layout_mutex));
1685
1686         /* XXX may be useless as not called for foreign LMV ?? */
1687         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_FOREIGN)
1688                 RETURN(0);
1689
1690         if (le32_to_cpu(lmv1->lmv_magic) == LMV_MAGIC_STRIPE) {
1691                 lo->ldo_dir_slave_stripe = 1;
1692                 RETURN(0);
1693         }
1694
1695         if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
1696                 RETURN(-EINVAL);
1697
1698         if (le32_to_cpu(lmv1->lmv_stripe_count) < 1)
1699                 RETURN(0);
1700
1701         LASSERT(lo->ldo_stripe == NULL);
1702         OBD_ALLOC(stripe, sizeof(stripe[0]) *
1703                   (le32_to_cpu(lmv1->lmv_stripe_count)));
1704         if (stripe == NULL)
1705                 RETURN(-ENOMEM);
1706
1707         for (i = 0; i < le32_to_cpu(lmv1->lmv_stripe_count); i++) {
1708                 struct dt_device        *tgt_dt;
1709                 struct dt_object        *dto;
1710                 int                     type = LU_SEQ_RANGE_ANY;
1711                 __u32                   idx;
1712
1713                 fid_le_to_cpu(fid, &lmv1->lmv_stripe_fids[i]);
1714                 if (!fid_is_sane(fid))
1715                         GOTO(out, rc = -ESTALE);
1716
1717                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
1718                 if (rc != 0)
1719                         GOTO(out, rc);
1720
1721                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
1722                         tgt_dt = lod->lod_child;
1723                 } else {
1724                         struct lod_tgt_desc     *tgt;
1725
1726                         tgt = LTD_TGT(ltd, idx);
1727                         if (tgt == NULL)
1728                                 GOTO(out, rc = -ESTALE);
1729                         tgt_dt = tgt->ltd_tgt;
1730                 }
1731
1732                 dto = dt_locate_at(env, tgt_dt, fid,
1733                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1734                                   NULL);
1735                 if (IS_ERR(dto))
1736                         GOTO(out, rc = PTR_ERR(dto));
1737
1738                 stripe[i] = dto;
1739         }
1740 out:
1741         lo->ldo_stripe = stripe;
1742         lo->ldo_dir_stripe_count = le32_to_cpu(lmv1->lmv_stripe_count);
1743         lo->ldo_dir_stripes_allocated = le32_to_cpu(lmv1->lmv_stripe_count);
1744         if (rc != 0)
1745                 lod_striping_free_nolock(env, lo);
1746
1747         RETURN(rc);
1748 }
1749
1750 /**
1751  * Declare create a striped directory.
1752  *
1753  * Declare creating a striped directory with a given stripe pattern on the
1754  * specified MDTs. A striped directory is represented as a regular directory
1755  * - an index listing all the stripes. The stripes point back to the master
1756  * object with ".." and LinkEA. The master object gets LMV EA which
1757  * identifies it as a striped directory. The function allocates FIDs
1758  * for all stripes.
1759  *
1760  * \param[in] env       execution environment
1761  * \param[in] dt        object
1762  * \param[in] attr      attributes to initialize the objects with
1763  * \param[in] dof       type of objects to be created
1764  * \param[in] th        transaction handle
1765  *
1766  * \retval              0 on success
1767  * \retval              negative if failed
1768  */
1769 static int lod_dir_declare_create_stripes(const struct lu_env *env,
1770                                           struct dt_object *dt,
1771                                           struct lu_attr *attr,
1772                                           struct dt_object_format *dof,
1773                                           struct thandle *th)
1774 {
1775         struct lod_thread_info  *info = lod_env_info(env);
1776         struct lu_buf           lmv_buf;
1777         struct lu_buf           slave_lmv_buf;
1778         struct lmv_mds_md_v1    *lmm;
1779         struct lmv_mds_md_v1    *slave_lmm = NULL;
1780         struct dt_insert_rec    *rec = &info->lti_dt_rec;
1781         struct lod_object       *lo = lod_dt_obj(dt);
1782         int                     rc;
1783         __u32                   i;
1784         ENTRY;
1785
1786         rc = lod_prep_lmv_md(env, dt, &lmv_buf);
1787         if (rc != 0)
1788                 GOTO(out, rc);
1789         lmm = lmv_buf.lb_buf;
1790
1791         OBD_ALLOC_PTR(slave_lmm);
1792         if (slave_lmm == NULL)
1793                 GOTO(out, rc = -ENOMEM);
1794
1795         lod_prep_slave_lmv_md(slave_lmm, lmm);
1796         slave_lmv_buf.lb_buf = slave_lmm;
1797         slave_lmv_buf.lb_len = sizeof(*slave_lmm);
1798
1799         if (!dt_try_as_dir(env, dt_object_child(dt)))
1800                 GOTO(out, rc = -EINVAL);
1801
1802         rec->rec_type = S_IFDIR;
1803         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
1804                 struct dt_object        *dto = lo->ldo_stripe[i];
1805                 char                    *stripe_name = info->lti_key;
1806                 struct lu_name          *sname;
1807                 struct linkea_data       ldata          = { NULL };
1808                 struct lu_buf           linkea_buf;
1809
1810                 rc = lod_sub_declare_create(env, dto, attr, NULL, dof, th);
1811                 if (rc != 0)
1812                         GOTO(out, rc);
1813
1814                 if (!dt_try_as_dir(env, dto))
1815                         GOTO(out, rc = -EINVAL);
1816
1817                 rc = lod_sub_declare_ref_add(env, dto, th);
1818                 if (rc != 0)
1819                         GOTO(out, rc);
1820
1821                 rec->rec_fid = lu_object_fid(&dto->do_lu);
1822                 rc = lod_sub_declare_insert(env, dto,
1823                                             (const struct dt_rec *)rec,
1824                                             (const struct dt_key *)dot, th);
1825                 if (rc != 0)
1826                         GOTO(out, rc);
1827
1828                 /* master stripe FID will be put to .. */
1829                 rec->rec_fid = lu_object_fid(&dt->do_lu);
1830                 rc = lod_sub_declare_insert(env, dto,
1831                                             (const struct dt_rec *)rec,
1832                                             (const struct dt_key *)dotdot, th);
1833                 if (rc != 0)
1834                         GOTO(out, rc);
1835
1836                 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
1837                     cfs_fail_val != i) {
1838                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
1839                             cfs_fail_val == i)
1840                                 slave_lmm->lmv_master_mdt_index =
1841                                                         cpu_to_le32(i + 1);
1842                         else
1843                                 slave_lmm->lmv_master_mdt_index =
1844                                                         cpu_to_le32(i);
1845                         rc = lod_sub_declare_xattr_set(env, dto, &slave_lmv_buf,
1846                                                        XATTR_NAME_LMV, 0, th);
1847                         if (rc != 0)
1848                                 GOTO(out, rc);
1849                 }
1850
1851                 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
1852                     cfs_fail_val == i)
1853                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
1854                                 PFID(lu_object_fid(&dto->do_lu)), i + 1);
1855                 else
1856                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
1857                                 PFID(lu_object_fid(&dto->do_lu)), i);
1858
1859                 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
1860                 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
1861                                       sname, lu_object_fid(&dt->do_lu));
1862                 if (rc != 0)
1863                         GOTO(out, rc);
1864
1865                 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
1866                 linkea_buf.lb_len = ldata.ld_leh->leh_len;
1867                 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
1868                                                XATTR_NAME_LINK, 0, th);
1869                 if (rc != 0)
1870                         GOTO(out, rc);
1871
1872                 rec->rec_fid = lu_object_fid(&dto->do_lu);
1873                 rc = lod_sub_declare_insert(env, dt_object_child(dt),
1874                                             (const struct dt_rec *)rec,
1875                                             (const struct dt_key *)stripe_name,
1876                                             th);
1877                 if (rc != 0)
1878                         GOTO(out, rc);
1879
1880                 rc = lod_sub_declare_ref_add(env, dt_object_child(dt), th);
1881                 if (rc != 0)
1882                         GOTO(out, rc);
1883         }
1884
1885         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt),
1886                                        &lmv_buf, XATTR_NAME_LMV, 0, th);
1887         if (rc != 0)
1888                 GOTO(out, rc);
1889 out:
1890         if (slave_lmm != NULL)
1891                 OBD_FREE_PTR(slave_lmm);
1892
1893         RETURN(rc);
1894 }
1895
1896 static int lod_prep_md_striped_create(const struct lu_env *env,
1897                                       struct dt_object *dt,
1898                                       struct lu_attr *attr,
1899                                       const struct lmv_user_md_v1 *lum,
1900                                       struct dt_object_format *dof,
1901                                       struct thandle *th)
1902 {
1903         struct lod_thread_info  *info = lod_env_info(env);
1904         struct lod_device       *lod = lu2lod_dev(dt->do_lu.lo_dev);
1905         struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
1906         struct lod_object       *lo = lod_dt_obj(dt);
1907         struct dt_object        **stripe;
1908         __u32                   stripe_count;
1909         int                     *idx_array;
1910         __u32                   master_index;
1911         int                     rc = 0;
1912         __u32                   i;
1913         __u32                   j;
1914         bool                    is_specific = false;
1915         ENTRY;
1916
1917         /* The lum has been verifed in lod_verify_md_striping */
1918         LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
1919                 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
1920
1921         stripe_count = lo->ldo_dir_stripe_count;
1922
1923         OBD_ALLOC(idx_array, sizeof(idx_array[0]) * stripe_count);
1924         if (idx_array == NULL)
1925                 RETURN(-ENOMEM);
1926
1927         OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_count);
1928         if (stripe == NULL)
1929                 GOTO(out_free, rc = -ENOMEM);
1930
1931         /* Start index must be the master MDT */
1932         master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
1933         idx_array[0] = master_index;
1934         if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
1935                 is_specific = true;
1936                 for (i = 1; i < stripe_count; i++)
1937                         idx_array[i] = le32_to_cpu(lum->lum_objects[i].lum_mds);
1938         }
1939
1940         for (i = 0; i < stripe_count; i++) {
1941                 struct lod_tgt_desc     *tgt = NULL;
1942                 struct dt_object        *dto;
1943                 struct lu_fid           fid = { 0 };
1944                 int                     idx;
1945                 struct lu_object_conf   conf = { 0 };
1946                 struct dt_device        *tgt_dt = NULL;
1947
1948                 /* Try to find next avaible target */
1949                 idx = idx_array[i];
1950                 for (j = 0; j < lod->lod_remote_mdt_count;
1951                      j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) {
1952                         bool already_allocated = false;
1953                         __u32 k;
1954
1955                         CDEBUG(D_INFO, "try idx %d, mdt cnt %u, allocated %u\n",
1956                                idx, lod->lod_remote_mdt_count + 1, i);
1957
1958                         if (likely(!is_specific &&
1959                                    !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) {
1960                                 /* check whether the idx already exists
1961                                  * in current allocated array */
1962                                 for (k = 0; k < i; k++) {
1963                                         if (idx_array[k] == idx) {
1964                                                 already_allocated = true;
1965                                                 break;
1966                                         }
1967                                 }
1968
1969                                 if (already_allocated)
1970                                         continue;
1971                         }
1972
1973                         /* Sigh, this index is not in the bitmap, let's check
1974                          * next available target */
1975                         if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx) &&
1976                             idx != master_index)
1977                                 continue;
1978
1979                         if (idx == master_index) {
1980                                 /* Allocate the FID locally */
1981                                 rc = obd_fid_alloc(env, lod->lod_child_exp,
1982                                                    &fid, NULL);
1983                                 if (rc < 0)
1984                                         GOTO(out_put, rc);
1985                                 tgt_dt = lod->lod_child;
1986                                 break;
1987                         }
1988
1989                         /* check the status of the OSP */
1990                         tgt = LTD_TGT(ltd, idx);
1991                         if (tgt == NULL)
1992                                 continue;
1993
1994                         tgt_dt = tgt->ltd_tgt;
1995                         rc = dt_statfs(env, tgt_dt, &info->lti_osfs);
1996                         if (rc) {
1997                                 /* this OSP doesn't feel well */
1998                                 rc = 0;
1999                                 continue;
2000                         }
2001
2002                         rc = obd_fid_alloc(env, tgt->ltd_exp, &fid, NULL);
2003                         if (rc < 0) {
2004                                 rc = 0;
2005                                 continue;
2006                         }
2007
2008                         break;
2009                 }
2010
2011                 /* Can not allocate more stripes */
2012                 if (j == lod->lod_remote_mdt_count) {
2013                         CDEBUG(D_INFO, "%s: require stripes %u only get %d\n",
2014                                lod2obd(lod)->obd_name, stripe_count, i);
2015                         break;
2016                 }
2017
2018                 CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n",
2019                        idx, i, PFID(&fid));
2020                 idx_array[i] = idx;
2021                 /* Set the start index for next stripe allocation */
2022                 if (!is_specific && i < stripe_count - 1) {
2023                         /*
2024                          * for large dir test, put all other slaves on one
2025                          * remote MDT, otherwise we may save too many local
2026                          * slave locks which will exceed RS_MAX_LOCKS.
2027                          */
2028                         if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)))
2029                                 idx = master_index;
2030                         idx_array[i + 1] = (idx + 1) %
2031                                            (lod->lod_remote_mdt_count + 1);
2032                 }
2033                 /* tgt_dt and fid must be ready after search avaible OSP
2034                  * in the above loop */
2035                 LASSERT(tgt_dt != NULL);
2036                 LASSERT(fid_is_sane(&fid));
2037                 conf.loc_flags = LOC_F_NEW;
2038                 dto = dt_locate_at(env, tgt_dt, &fid,
2039                                    dt->do_lu.lo_dev->ld_site->ls_top_dev,
2040                                    &conf);
2041                 if (IS_ERR(dto))
2042                         GOTO(out_put, rc = PTR_ERR(dto));
2043                 stripe[i] = dto;
2044         }
2045
2046         lo->ldo_dir_striped = 1;
2047         lo->ldo_stripe = stripe;
2048         lo->ldo_dir_stripe_count = i;
2049         lo->ldo_dir_stripes_allocated = stripe_count;
2050         smp_mb();
2051         lo->ldo_dir_stripe_loaded = 1;
2052
2053         if (lo->ldo_dir_stripe_count == 0)
2054                 GOTO(out_put, rc = -ENOSPC);
2055
2056         rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
2057         if (rc != 0)
2058                 GOTO(out_put, rc);
2059
2060 out_put:
2061         if (rc < 0) {
2062                 for (i = 0; i < stripe_count; i++)
2063                         if (stripe[i] != NULL)
2064                                 dt_object_put(env, stripe[i]);
2065                 OBD_FREE(stripe, sizeof(stripe[0]) * stripe_count);
2066                 lo->ldo_dir_stripe_count = 0;
2067                 lo->ldo_dir_stripes_allocated = 0;
2068                 lo->ldo_stripe = NULL;
2069         }
2070
2071 out_free:
2072         OBD_FREE(idx_array, sizeof(idx_array[0]) * stripe_count);
2073
2074         RETURN(rc);
2075 }
2076
2077 /**
2078  *
2079  * Alloc cached foreign LMV
2080  *
2081  * \param[in] lo        object
2082  * \param[in] size      size of foreign LMV
2083  *
2084  * \retval              0 on success
2085  * \retval              negative if failed
2086  */
2087 int lod_alloc_foreign_lmv(struct lod_object *lo, size_t size)
2088 {
2089         OBD_ALLOC_LARGE(lo->ldo_foreign_lmv, size);
2090         if (lo->ldo_foreign_lmv == NULL)
2091                 return -ENOMEM;
2092         lo->ldo_foreign_lmv_size = size;
2093         lo->ldo_dir_is_foreign = 1;
2094
2095         return 0;
2096 }
2097
2098 /**
2099  * Declare create striped md object.
2100  *
2101  * The function declares intention to create a striped directory. This is a
2102  * wrapper for lod_prep_md_striped_create(). The only additional functionality
2103  * is to verify pattern \a lum_buf is good. Check that function for the details.
2104  *
2105  * \param[in] env       execution environment
2106  * \param[in] dt        object
2107  * \param[in] attr      attributes to initialize the objects with
2108  * \param[in] lum_buf   a pattern specifying the number of stripes and
2109  *                      MDT to start from
2110  * \param[in] dof       type of objects to be created
2111  * \param[in] th        transaction handle
2112  *
2113  * \retval              0 on success
2114  * \retval              negative if failed
2115  *
2116  */
2117 static int lod_declare_xattr_set_lmv(const struct lu_env *env,
2118                                      struct dt_object *dt,
2119                                      struct lu_attr *attr,
2120                                      const struct lu_buf *lum_buf,
2121                                      struct dt_object_format *dof,
2122                                      struct thandle *th)
2123 {
2124         struct lod_object       *lo = lod_dt_obj(dt);
2125         struct lmv_user_md_v1   *lum = lum_buf->lb_buf;
2126         int                     rc;
2127         ENTRY;
2128
2129         LASSERT(lum != NULL);
2130
2131         CDEBUG(D_INFO, "lum magic = %x count = %u offset = %d\n",
2132                le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_stripe_count),
2133                (int)le32_to_cpu(lum->lum_stripe_offset));
2134
2135         if (lo->ldo_dir_stripe_count == 0) {
2136                 if (lo->ldo_dir_is_foreign) {
2137                         rc = lod_alloc_foreign_lmv(lo, lum_buf->lb_len);
2138                         if (rc != 0)
2139                                 GOTO(out, rc);
2140                         memcpy(lo->ldo_foreign_lmv, lum, lum_buf->lb_len);
2141                         lo->ldo_dir_stripe_loaded = 1;
2142                 }
2143                 GOTO(out, rc = 0);
2144         }
2145
2146         /* prepare dir striped objects */
2147         rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
2148         if (rc != 0) {
2149                 /* failed to create striping, let's reset
2150                  * config so that others don't get confused */
2151                 lod_striping_free(env, lo);
2152                 GOTO(out, rc);
2153         }
2154 out:
2155         RETURN(rc);
2156 }
2157
2158 /**
2159  * Append source stripes after target stripes for migrating directory. NB, we
2160  * only need to declare this, the append is done inside lod_xattr_set_lmv().
2161  *
2162  * \param[in] env       execution environment
2163  * \param[in] dt        target object
2164  * \param[in] buf       LMV buf which contains source stripe fids
2165  * \param[in] th        transaction handle
2166  *
2167  * \retval              0 on success
2168  * \retval              negative if failed
2169  */
2170 static int lod_dir_declare_layout_add(const struct lu_env *env,
2171                                       struct dt_object *dt,
2172                                       const struct lu_buf *buf,
2173                                       struct thandle *th)
2174 {
2175         struct lod_thread_info *info = lod_env_info(env);
2176         struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
2177         struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
2178         struct lod_object *lo = lod_dt_obj(dt);
2179         struct dt_object *next = dt_object_child(dt);
2180         struct dt_object_format *dof = &info->lti_format;
2181         struct lmv_mds_md_v1 *lmv = buf->lb_buf;
2182         struct dt_object **stripe;
2183         __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
2184         struct lu_fid *fid = &info->lti_fid;
2185         struct lod_tgt_desc *tgt;
2186         struct dt_object *dto;
2187         struct dt_device *tgt_dt;
2188         int type = LU_SEQ_RANGE_ANY;
2189         struct dt_insert_rec *rec = &info->lti_dt_rec;
2190         char *stripe_name = info->lti_key;
2191         struct lu_name *sname;
2192         struct linkea_data ldata = { NULL };
2193         struct lu_buf linkea_buf;
2194         __u32 idx;
2195         int i;
2196         int rc;
2197
2198         ENTRY;
2199
2200         if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
2201                 RETURN(-EINVAL);
2202
2203         if (stripe_count == 0)
2204                 RETURN(-EINVAL);
2205
2206         dof->dof_type = DFT_DIR;
2207
2208         OBD_ALLOC(stripe,
2209                   sizeof(*stripe) * (lo->ldo_dir_stripe_count + stripe_count));
2210         if (stripe == NULL)
2211                 RETURN(-ENOMEM);
2212
2213         for (i = 0; i < lo->ldo_dir_stripe_count; i++)
2214                 stripe[i] = lo->ldo_stripe[i];
2215
2216         for (i = 0; i < stripe_count; i++) {
2217                 fid_le_to_cpu(fid,
2218                         &lmv->lmv_stripe_fids[i]);
2219                 if (!fid_is_sane(fid))
2220                         GOTO(out, rc = -ESTALE);
2221
2222                 rc = lod_fld_lookup(env, lod, fid, &idx, &type);
2223                 if (rc)
2224                         GOTO(out, rc);
2225
2226                 if (idx == lod2lu_dev(lod)->ld_site->ld_seq_site->ss_node_id) {
2227                         tgt_dt = lod->lod_child;
2228                 } else {
2229                         tgt = LTD_TGT(ltd, idx);
2230                         if (tgt == NULL)
2231                                 GOTO(out, rc = -ESTALE);
2232                         tgt_dt = tgt->ltd_tgt;
2233                 }
2234
2235                 dto = dt_locate_at(env, tgt_dt, fid,
2236                                   lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
2237                                   NULL);
2238                 if (IS_ERR(dto))
2239                         GOTO(out, rc = PTR_ERR(dto));
2240
2241                 stripe[i + lo->ldo_dir_stripe_count] = dto;
2242
2243                 if (!dt_try_as_dir(env, dto))
2244                         GOTO(out, rc = -ENOTDIR);
2245
2246                 rc = lod_sub_declare_ref_add(env, dto, th);
2247                 if (rc)
2248                         GOTO(out, rc);
2249
2250                 rc = lod_sub_declare_insert(env, dto,
2251                                             (const struct dt_rec *)rec,
2252                                             (const struct dt_key *)dot, th);
2253                 if (rc)
2254                         GOTO(out, rc);
2255
2256                 rc = lod_sub_declare_insert(env, dto,
2257                                             (const struct dt_rec *)rec,
2258                                             (const struct dt_key *)dotdot, th);
2259                 if (rc)
2260                         GOTO(out, rc);
2261
2262                 rc = lod_sub_declare_xattr_set(env, dto, buf,
2263                                                 XATTR_NAME_LMV, 0, th);
2264                 if (rc)
2265                         GOTO(out, rc);
2266
2267                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%u",
2268                          PFID(lu_object_fid(&dto->do_lu)),
2269                          i + lo->ldo_dir_stripe_count);
2270
2271                 sname = lod_name_get(env, stripe_name, strlen(stripe_name));
2272                 rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
2273                                       sname, lu_object_fid(&dt->do_lu));
2274                 if (rc)
2275                         GOTO(out, rc);
2276
2277                 linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
2278                 linkea_buf.lb_len = ldata.ld_leh->leh_len;
2279                 rc = lod_sub_declare_xattr_set(env, dto, &linkea_buf,
2280                                                XATTR_NAME_LINK, 0, th);
2281                 if (rc)
2282                         GOTO(out, rc);
2283
2284                 rc = lod_sub_declare_insert(env, next,
2285                                             (const struct dt_rec *)rec,
2286                                             (const struct dt_key *)stripe_name,
2287                                             th);
2288                 if (rc)
2289                         GOTO(out, rc);
2290
2291                 rc = lod_sub_declare_ref_add(env, next, th);
2292                 if (rc)
2293                         GOTO(out, rc);
2294         }
2295
2296         if (lo->ldo_stripe)
2297                 OBD_FREE(lo->ldo_stripe,
2298                          sizeof(*stripe) * lo->ldo_dir_stripes_allocated);
2299         lo->ldo_stripe = stripe;
2300         lo->ldo_dir_migrate_offset = lo->ldo_dir_stripe_count;
2301         lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_hash_type);
2302         lo->ldo_dir_stripe_count += stripe_count;
2303         lo->ldo_dir_stripes_allocated += stripe_count;
2304         lo->ldo_dir_hash_type |= LMV_HASH_FLAG_MIGRATION;
2305
2306         RETURN(0);
2307 out:
2308         i = lo->ldo_dir_stripe_count;
2309         while (i < lo->ldo_dir_stripe_count + stripe_count && stripe[i])
2310                 dt_object_put(env, stripe[i++]);
2311
2312         OBD_FREE(stripe,
2313                  sizeof(*stripe) * (stripe_count + lo->ldo_dir_stripe_count));
2314         RETURN(rc);
2315 }
2316
2317 static int lod_dir_declare_layout_delete(const struct lu_env *env,
2318                                          struct dt_object *dt,
2319                                          const struct lu_buf *buf,
2320                                          struct thandle *th)
2321 {
2322         struct lod_thread_info *info = lod_env_info(env);
2323         struct lod_object *lo = lod_dt_obj(dt);
2324         struct dt_object *next = dt_object_child(dt);
2325         struct lmv_user_md *lmu = buf->lb_buf;
2326         __u32 final_stripe_count;
2327         char *stripe_name = info->lti_key;
2328         struct dt_object *dto;
2329         int i;
2330         int rc = 0;
2331
2332         if (!lmu)
2333                 return -EINVAL;
2334
2335         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
2336         if (final_stripe_count >= lo->ldo_dir_stripe_count)
2337                 return -EINVAL;
2338
2339         for (i = final_stripe_count; i < lo->ldo_dir_stripe_count; i++) {
2340                 dto = lo->ldo_stripe[i];
2341                 LASSERT(dto);
2342
2343                 if (!dt_try_as_dir(env, dto))
2344                         return -ENOTDIR;
2345
2346                 rc = lod_sub_declare_delete(env, dto,
2347                                             (const struct dt_key *)dot, th);
2348                 if (rc)
2349                         return rc;
2350
2351                 rc = lod_sub_declare_ref_del(env, dto, th);
2352                 if (rc)
2353                         return rc;
2354
2355                 rc = lod_sub_declare_delete(env, dto,
2356                                         (const struct dt_key *)dotdot, th);
2357                 if (rc)
2358                         return rc;
2359
2360                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
2361                          PFID(lu_object_fid(&dto->do_lu)), i);
2362
2363                 rc = lod_sub_declare_delete(env, next,
2364                                         (const struct dt_key *)stripe_name, th);
2365                 if (rc)
2366                         return rc;
2367
2368                 rc = lod_sub_declare_ref_del(env, next, th);
2369                 if (rc)
2370                         return rc;
2371         }
2372
2373         return 0;
2374 }
2375
2376 /*
2377  * delete stripes from dir master object, the lum_stripe_count in argument is
2378  * the final stripe count, the stripes after that will be deleted, NB, they
2379  * are not destroyed, but deleted from it's parent namespace, this function
2380  * will be called in two places:
2381  * 1. mdd_migrate_create() delete stripes from source, and append them to
2382  *    target.
2383  * 2. mdd_dir_layout_shrink() delete stripes from source, and destroy them.
2384  */
2385 static int lod_dir_layout_delete(const struct lu_env *env,
2386                                  struct dt_object *dt,
2387                                  const struct lu_buf *buf,
2388                                  struct thandle *th)
2389 {
2390         struct lod_thread_info *info = lod_env_info(env);
2391         struct lod_object *lo = lod_dt_obj(dt);
2392         struct dt_object *next = dt_object_child(dt);
2393         struct lmv_user_md *lmu = buf->lb_buf;
2394         __u32 final_stripe_count;
2395         char *stripe_name = info->lti_key;
2396         struct dt_object *dto;
2397         int i;
2398         int rc = 0;
2399
2400         ENTRY;
2401
2402         if (!lmu)
2403                 RETURN(-EINVAL);
2404
2405         final_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
2406         if (final_stripe_count >= lo->ldo_dir_stripe_count)
2407                 RETURN(-EINVAL);
2408
2409         for (i = final_stripe_count; i < lo->ldo_dir_stripe_count; i++) {
2410                 dto = lo->ldo_stripe[i];
2411                 LASSERT(dto);
2412
2413                 rc = lod_sub_delete(env, dto,
2414                                     (const struct dt_key *)dotdot, th);
2415                 if (rc)
2416                         break;
2417
2418                 snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
2419                          PFID(lu_object_fid(&dto->do_lu)), i);
2420
2421                 rc = lod_sub_delete(env, next,
2422                                     (const struct dt_key *)stripe_name, th);
2423                 if (rc)
2424                         break;
2425
2426                 rc = lod_sub_ref_del(env, next, th);
2427                 if (rc)
2428                         break;
2429         }
2430
2431         lod_striping_free(env, lod_dt_obj(dt));
2432
2433         RETURN(rc);
2434 }
2435
2436 /**
2437  * Implementation of dt_object_operations::do_declare_xattr_set.
2438  *
2439  * Used with regular (non-striped) objects. Basically it
2440  * initializes the striping information and applies the
2441  * change to all the stripes.
2442  *
2443  * \see dt_object_operations::do_declare_xattr_set() in the API description
2444  * for details.
2445  */
2446 static int lod_dir_declare_xattr_set(const struct lu_env *env,
2447                                      struct dt_object *dt,
2448                                      const struct lu_buf *buf,
2449                                      const char *name, int fl,
2450                                      struct thandle *th)
2451 {
2452         struct dt_object        *next = dt_object_child(dt);
2453         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2454         struct lod_object       *lo = lod_dt_obj(dt);
2455         int                     i;
2456         int                     rc;
2457         ENTRY;
2458
2459         if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
2460                 struct lmv_user_md_v1 *lum;
2461
2462                 LASSERT(buf != NULL && buf->lb_buf != NULL);
2463                 lum = buf->lb_buf;
2464                 rc = lod_verify_md_striping(d, lum);
2465                 if (rc != 0)
2466                         RETURN(rc);
2467         } else if (strcmp(name, XATTR_NAME_LOV) == 0) {
2468                 rc = lod_verify_striping(d, lo, buf, false);
2469                 if (rc != 0)
2470                         RETURN(rc);
2471         }
2472
2473         rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
2474         if (rc != 0)
2475                 RETURN(rc);
2476
2477         /* Note: Do not set LinkEA on sub-stripes, otherwise
2478          * it will confuse the fid2path process(see mdt_path_current()).
2479          * The linkEA between master and sub-stripes is set in
2480          * lod_xattr_set_lmv(). */
2481         if (strcmp(name, XATTR_NAME_LINK) == 0)
2482                 RETURN(0);
2483
2484         /* set xattr to each stripes, if needed */
2485         rc = lod_striping_load(env, lo);
2486         if (rc != 0)
2487                 RETURN(rc);
2488
2489         if (lo->ldo_dir_stripe_count == 0)
2490                 RETURN(0);
2491
2492         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
2493                 LASSERT(lo->ldo_stripe[i]);
2494
2495                 rc = lod_sub_declare_xattr_set(env, lo->ldo_stripe[i],
2496                                                buf, name, fl, th);
2497                 if (rc != 0)
2498                         break;
2499         }
2500
2501         RETURN(rc);
2502 }
2503
2504 static int
2505 lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
2506                                      struct lod_object *lo,
2507                                      struct dt_object *dt, struct thandle *th,
2508                                      int comp_idx, int stripe_idx,
2509                                      struct lod_obj_stripe_cb_data *data)
2510 {
2511         struct lod_thread_info *info = lod_env_info(env);
2512         struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
2513         struct filter_fid *ff = &info->lti_ff;
2514         struct lu_buf *buf = &info->lti_buf;
2515         int rc;
2516
2517         buf->lb_buf = ff;
2518         buf->lb_len = sizeof(*ff);
2519         rc = dt_xattr_get(env, dt, buf, XATTR_NAME_FID);
2520         if (rc < 0) {
2521                 if (rc == -ENODATA)
2522                         return 0;
2523                 return rc;
2524         }
2525
2526         /*
2527          * locd_buf is set if it's called by dir migration, which doesn't check
2528          * pfid and comp id.
2529          */
2530         if (data->locd_buf) {
2531                 memset(ff, 0, sizeof(*ff));
2532                 ff->ff_parent = *(struct lu_fid *)data->locd_buf->lb_buf;
2533         } else {
2534                 filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
2535
2536                 if (lu_fid_eq(lod_object_fid(lo), &ff->ff_parent) &&
2537                     ff->ff_layout.ol_comp_id == comp->llc_id)
2538                         return 0;
2539
2540                 memset(ff, 0, sizeof(*ff));
2541                 ff->ff_parent = *lu_object_fid(&lo->ldo_obj.do_lu);
2542         }
2543
2544         /* rewrite filter_fid */
2545         ff->ff_parent.f_ver = stripe_idx;
2546         ff->ff_layout.ol_stripe_size = comp->llc_stripe_size;
2547         ff->ff_layout.ol_stripe_count = comp->llc_stripe_count;
2548         ff->ff_layout.ol_comp_id = comp->llc_id;
2549         ff->ff_layout.ol_comp_start = comp->llc_extent.e_start;
2550         ff->ff_layout.ol_comp_end = comp->llc_extent.e_end;
2551         filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
2552
2553         if (data->locd_declare)
2554                 rc = lod_sub_declare_xattr_set(env, dt, buf, XATTR_NAME_FID,
2555                                                LU_XATTR_REPLACE, th);
2556         else
2557                 rc = lod_sub_xattr_set(env, dt, buf, XATTR_NAME_FID,
2558                                        LU_XATTR_REPLACE, th);
2559
2560         return rc;
2561 }
2562
2563 /**
2564  * Reset parent FID on OST object
2565  *
2566  * Replace parent FID with @dt object FID, which is only called during migration
2567  * to reset the parent FID after the MDT object is migrated to the new MDT, i.e.
2568  * the FID is changed.
2569  *
2570  * \param[in] env execution environment
2571  * \param[in] dt dt_object whose stripes's parent FID will be reset
2572  * \parem[in] th thandle
2573  * \param[in] declare if it is declare
2574  *
2575  * \retval      0 if reset succeeds
2576  * \retval      negative errno if reset fails
2577  */
2578 static int lod_replace_parent_fid(const struct lu_env *env,
2579                                   struct dt_object *dt,
2580                                   const struct lu_buf *buf,
2581                                   struct thandle *th, bool declare)
2582 {
2583         struct lod_object *lo = lod_dt_obj(dt);
2584         struct lod_thread_info  *info = lod_env_info(env);
2585         struct filter_fid *ff;
2586         struct lod_obj_stripe_cb_data data = { { 0 } };
2587         int rc;
2588         ENTRY;
2589
2590         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
2591
2592         /* set xattr to each stripes, if needed */
2593         rc = lod_striping_load(env, lo);
2594         if (rc != 0)
2595                 RETURN(rc);
2596
2597         if (!lod_obj_is_striped(dt))
2598                 RETURN(0);
2599
2600         if (info->lti_ea_store_size < sizeof(*ff)) {
2601                 rc = lod_ea_store_resize(info, sizeof(*ff));
2602                 if (rc != 0)
2603                         RETURN(rc);
2604         }
2605
2606         data.locd_declare = declare;
2607         data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
2608         data.locd_buf = buf;
2609         rc = lod_obj_for_each_stripe(env, lo, th, &data);
2610
2611         RETURN(rc);
2612 }
2613
2614 inline __u16 lod_comp_entry_stripe_count(struct lod_object *lo,
2615                                          struct lod_layout_component *entry,
2616                                          bool is_dir)
2617 {
2618         struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2619
2620         if (is_dir)
2621                 return  0;
2622         else if (lod_comp_inited(entry))
2623                 return entry->llc_stripe_count;
2624         else if ((__u16)-1 == entry->llc_stripe_count)
2625                 return lod->lod_desc.ld_tgt_count;
2626         else
2627                 return lod_get_stripe_count(lod, lo, entry->llc_stripe_count);
2628 }
2629
2630 static int lod_comp_md_size(struct lod_object *lo, bool is_dir)
2631 {
2632         int magic, size = 0, i;
2633         struct lod_layout_component *comp_entries;
2634         __u16 comp_cnt;
2635         bool is_composite, is_foreign = false;
2636
2637         if (is_dir) {
2638                 comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
2639                 comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
2640                 is_composite =
2641                         lo->ldo_def_striping->lds_def_striping_is_composite;
2642         } else {
2643                 comp_cnt = lo->ldo_comp_cnt;
2644                 comp_entries = lo->ldo_comp_entries;
2645                 is_composite = lo->ldo_is_composite;
2646                 is_foreign = lo->ldo_is_foreign;
2647         }
2648
2649         if (is_foreign)
2650                 return lo->ldo_foreign_lov_size;
2651
2652         LASSERT(comp_cnt != 0 && comp_entries != NULL);
2653         if (is_composite) {
2654                 size = sizeof(struct lov_comp_md_v1) +
2655                        sizeof(struct lov_comp_md_entry_v1) * comp_cnt;
2656                 LASSERT(size % sizeof(__u64) == 0);
2657         }
2658
2659         for (i = 0; i < comp_cnt; i++) {
2660                 __u16 stripe_count;
2661
2662                 magic = comp_entries[i].llc_pool ? LOV_MAGIC_V3 : LOV_MAGIC_V1;
2663                 stripe_count = lod_comp_entry_stripe_count(lo, &comp_entries[i],
2664                                                            is_dir);
2665                 if (!is_dir && is_composite)
2666                         lod_comp_shrink_stripe_count(&comp_entries[i],
2667                                                      &stripe_count);
2668
2669                 size += lov_user_md_size(stripe_count, magic);
2670                 LASSERT(size % sizeof(__u64) == 0);
2671         }
2672         return size;
2673 }
2674
2675 /**
2676  * Declare component add. The xattr name is XATTR_LUSTRE_LOV.add, and
2677  * the xattr value is binary lov_comp_md_v1 which contains component(s)
2678  * to be added.
2679   *
2680  * \param[in] env       execution environment
2681  * \param[in] dt        dt_object to add components on
2682  * \param[in] buf       buffer contains components to be added
2683  * \parem[in] th        thandle
2684  *
2685  * \retval      0 on success
2686  * \retval      negative errno on failure
2687  */
2688 static int lod_declare_layout_add(const struct lu_env *env,
2689                                   struct dt_object *dt,
2690                                   const struct lu_buf *buf,
2691                                   struct thandle *th)
2692 {
2693         struct lod_thread_info  *info = lod_env_info(env);
2694         struct lod_layout_component *comp_array, *lod_comp, *old_array;
2695         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2696         struct dt_object *next = dt_object_child(dt);
2697         struct lov_desc         *desc = &d->lod_desc;
2698         struct lod_object       *lo = lod_dt_obj(dt);
2699         struct lov_user_md_v3   *v3;
2700         struct lov_comp_md_v1   *comp_v1 = buf->lb_buf;
2701         __u32   magic;
2702         int     i, rc, array_cnt, old_array_cnt;
2703         ENTRY;
2704
2705         LASSERT(lo->ldo_is_composite);
2706
2707         if (lo->ldo_flr_state != LCM_FL_NONE)
2708                 RETURN(-EBUSY);
2709
2710         rc = lod_verify_striping(d, lo, buf, false);
2711         if (rc != 0)
2712                 RETURN(rc);
2713
2714         magic = comp_v1->lcm_magic;
2715         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2716                 lustre_swab_lov_comp_md_v1(comp_v1);
2717                 magic = comp_v1->lcm_magic;
2718         }
2719
2720         if (magic != LOV_USER_MAGIC_COMP_V1)
2721                 RETURN(-EINVAL);
2722
2723         array_cnt = lo->ldo_comp_cnt + comp_v1->lcm_entry_count;
2724         OBD_ALLOC(comp_array, sizeof(*comp_array) * array_cnt);
2725         if (comp_array == NULL)
2726                 RETURN(-ENOMEM);
2727
2728         memcpy(comp_array, lo->ldo_comp_entries,
2729                sizeof(*comp_array) * lo->ldo_comp_cnt);
2730
2731         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2732                 struct lov_user_md_v1 *v1;
2733                 struct lu_extent *ext;
2734
2735                 v1 = (struct lov_user_md *)((char *)comp_v1 +
2736                                 comp_v1->lcm_entries[i].lcme_offset);
2737                 ext = &comp_v1->lcm_entries[i].lcme_extent;
2738
2739                 lod_comp = &comp_array[lo->ldo_comp_cnt + i];
2740                 lod_comp->llc_extent.e_start = ext->e_start;
2741                 lod_comp->llc_extent.e_end = ext->e_end;
2742                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
2743                 lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
2744
2745                 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
2746                 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
2747                 lod_adjust_stripe_info(lod_comp, desc);
2748
2749                 if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
2750                         v3 = (struct lov_user_md_v3 *) v1;
2751                         if (v3->lmm_pool_name[0] != '\0') {
2752                                 rc = lod_set_pool(&lod_comp->llc_pool,
2753                                                   v3->lmm_pool_name);
2754                                 if (rc)
2755                                         GOTO(error, rc);
2756                         }
2757                 }
2758         }
2759
2760         old_array = lo->ldo_comp_entries;
2761         old_array_cnt = lo->ldo_comp_cnt;
2762
2763         lo->ldo_comp_entries = comp_array;
2764         lo->ldo_comp_cnt = array_cnt;
2765
2766         /* No need to increase layout generation here, it will be increased
2767          * later when generating component ID for the new components */
2768
2769         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
2770         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
2771                                               XATTR_NAME_LOV, 0, th);
2772         if (rc) {
2773                 lo->ldo_comp_entries = old_array;
2774                 lo->ldo_comp_cnt = old_array_cnt;
2775                 GOTO(error, rc);
2776         }
2777
2778         OBD_FREE(old_array, sizeof(*lod_comp) * old_array_cnt);
2779
2780         LASSERT(lo->ldo_mirror_count == 1);
2781         lo->ldo_mirrors[0].lme_end = array_cnt - 1;
2782
2783         RETURN(0);
2784
2785 error:
2786         for (i = lo->ldo_comp_cnt; i < array_cnt; i++) {
2787                 lod_comp = &comp_array[i];
2788                 if (lod_comp->llc_pool != NULL) {
2789                         OBD_FREE(lod_comp->llc_pool,
2790                                  strlen(lod_comp->llc_pool) + 1);
2791                         lod_comp->llc_pool = NULL;
2792                 }
2793         }
2794         OBD_FREE(comp_array, sizeof(*comp_array) * array_cnt);
2795         RETURN(rc);
2796 }
2797
2798 /**
2799  * Declare component set. The xattr is name XATTR_LUSTRE_LOV.set.$field,
2800  * the '$field' can only be 'flags' now. The xattr value is binary
2801  * lov_comp_md_v1 which contains the component ID(s) and the value of
2802  * the field to be modified.
2803  *
2804  * \param[in] env       execution environment
2805  * \param[in] dt        dt_object to be modified
2806  * \param[in] op        operation string, like "set.flags"
2807  * \param[in] buf       buffer contains components to be set
2808  * \parem[in] th        thandle
2809  *
2810  * \retval      0 on success
2811  * \retval      negative errno on failure
2812  */
2813 static int lod_declare_layout_set(const struct lu_env *env,
2814                                   struct dt_object *dt,
2815                                   char *op, const struct lu_buf *buf,
2816                                   struct thandle *th)
2817 {
2818         struct lod_layout_component     *lod_comp;
2819         struct lod_thread_info  *info = lod_env_info(env);
2820         struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
2821         struct lod_object       *lo = lod_dt_obj(dt);
2822         struct lov_comp_md_v1   *comp_v1 = buf->lb_buf;
2823         __u32   magic;
2824         int     i, j, rc;
2825         bool    changed = false;
2826         ENTRY;
2827
2828         if (strcmp(op, "set.flags") != 0) {
2829                 CDEBUG(D_LAYOUT, "%s: operation (%s) not supported.\n",
2830                        lod2obd(d)->obd_name, op);
2831                 RETURN(-ENOTSUPP);
2832         }
2833
2834         magic = comp_v1->lcm_magic;
2835         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2836                 lustre_swab_lov_comp_md_v1(comp_v1);
2837                 magic = comp_v1->lcm_magic;
2838         }
2839
2840         if (magic != LOV_USER_MAGIC_COMP_V1)
2841                 RETURN(-EINVAL);
2842
2843         if (comp_v1->lcm_entry_count == 0) {
2844                 CDEBUG(D_LAYOUT, "%s: entry count is zero.\n",
2845                        lod2obd(d)->obd_name);
2846                 RETURN(-EINVAL);
2847         }
2848
2849         for (i = 0; i < comp_v1->lcm_entry_count; i++) {
2850                 __u32 id = comp_v1->lcm_entries[i].lcme_id;
2851                 __u32 flags = comp_v1->lcm_entries[i].lcme_flags;
2852                 __u32 mirror_flag = flags & LCME_MIRROR_FLAGS;
2853                 bool neg = flags & LCME_FL_NEG;
2854
2855                 if (flags & LCME_FL_INIT) {
2856                         if (changed)
2857                                 lod_striping_free(env, lo);
2858                         RETURN(-EINVAL);
2859                 }
2860
2861                 flags &= ~(LCME_MIRROR_FLAGS | LCME_FL_NEG);
2862                 for (j = 0; j < lo->ldo_comp_cnt; j++) {
2863                         lod_comp = &lo->ldo_comp_entries[j];
2864
2865                         /* lfs only put one flag in each entry */
2866                         if ((flags && id != lod_comp->llc_id) ||
2867                             (mirror_flag && mirror_id_of(id) !=
2868                                             mirror_id_of(lod_comp->llc_id)))
2869                                 continue;
2870
2871                         if (neg) {
2872                                 if (flags)
2873                                         lod_comp->llc_flags &= ~flags;
2874                                 if (mirror_flag)
2875                                         lod_comp->llc_flags &= ~mirror_flag;
2876                         } else {
2877                                 if (flags)
2878                                         lod_comp->llc_flags |= flags;
2879                                 if (mirror_flag) {
2880                                         lod_comp->llc_flags |= mirror_flag;
2881                                         if (mirror_flag & LCME_FL_NOSYNC)
2882                                                 lod_comp->llc_timestamp =
2883                                                        ktime_get_real_seconds();
2884                                 }
2885                         }
2886                         changed = true;
2887                 }
2888         }
2889
2890         if (!changed) {
2891                 CDEBUG(D_LAYOUT, "%s: requested component(s) not found.\n",
2892                        lod2obd(d)->obd_name);
2893                 RETURN(-EINVAL);
2894         }
2895
2896         lod_obj_inc_layout_gen(lo);
2897
2898         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
2899         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), &info->lti_buf,
2900                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
2901         RETURN(rc);
2902 }
2903
2904 /**
2905  * Declare component deletion. The xattr name is XATTR_LUSTRE_LOV.del,
2906  * and the xattr value is a unique component ID or a special lcme_id.
2907  *
2908  * \param[in] env       execution environment
2909  * \param[in] dt        dt_object to be operated on
2910  * \param[in] buf       buffer contains component ID or lcme_id
2911  * \parem[in] th        thandle
2912  *
2913  * \retval      0 on success
2914  * \retval      negative errno on failure
2915  */
2916 static int lod_declare_layout_del(const struct lu_env *env,
2917                                   struct dt_object *dt,
2918                                   const struct lu_buf *buf,
2919                                   struct thandle *th)
2920 {
2921         struct lod_thread_info  *info = lod_env_info(env);
2922         struct dt_object *next = dt_object_child(dt);
2923         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
2924         struct lod_object *lo = lod_dt_obj(dt);
2925         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
2926         struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
2927         __u32 magic, id, flags, neg_flags = 0;
2928         int rc, i, j, left;
2929         ENTRY;
2930
2931         LASSERT(lo->ldo_is_composite);
2932
2933         if (lo->ldo_flr_state != LCM_FL_NONE)
2934                 RETURN(-EBUSY);
2935
2936         magic = comp_v1->lcm_magic;
2937         if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
2938                 lustre_swab_lov_comp_md_v1(comp_v1);
2939                 magic = comp_v1->lcm_magic;
2940         }
2941
2942         if (magic != LOV_USER_MAGIC_COMP_V1)
2943                 RETURN(-EINVAL);
2944
2945         id = comp_v1->lcm_entries[0].lcme_id;
2946         flags = comp_v1->lcm_entries[0].lcme_flags;
2947
2948         if (id > LCME_ID_MAX || (flags & ~LCME_KNOWN_FLAGS)) {
2949                 CDEBUG(D_LAYOUT, "%s: invalid component id %#x, flags %#x\n",
2950                        lod2obd(d)->obd_name, id, flags);
2951                 RETURN(-EINVAL);
2952         }
2953
2954         if (id != LCME_ID_INVAL && flags != 0) {
2955                 CDEBUG(D_LAYOUT, "%s: specified both id and flags.\n",
2956                        lod2obd(d)->obd_name);
2957                 RETURN(-EINVAL);
2958         }
2959
2960         if (id == LCME_ID_INVAL && !flags) {
2961                 CDEBUG(D_LAYOUT, "%s: no id or flags specified.\n",
2962                        lod2obd(d)->obd_name);
2963                 RETURN(-EINVAL);
2964         }
2965
2966         if (flags & LCME_FL_NEG) {
2967                 neg_flags = flags & ~LCME_FL_NEG;
2968                 flags = 0;
2969         }
2970
2971         left = lo->ldo_comp_cnt;
2972         if (left <= 0)
2973                 RETURN(-EINVAL);
2974
2975         for (i = (lo->ldo_comp_cnt - 1); i >= 0; i--) {
2976                 struct lod_layout_component *lod_comp;
2977
2978                 lod_comp = &lo->ldo_comp_entries[i];
2979
2980                 if (id != LCME_ID_INVAL && id != lod_comp->llc_id)
2981                         continue;
2982                 else if (flags && !(flags & lod_comp->llc_flags))
2983                         continue;
2984                 else if (neg_flags && (neg_flags & lod_comp->llc_flags))
2985                         continue;
2986
2987                 if (left != (i + 1)) {
2988                         CDEBUG(D_LAYOUT, "%s: this deletion will create "
2989                                "a hole.\n", lod2obd(d)->obd_name);
2990                         RETURN(-EINVAL);
2991                 }
2992                 left--;
2993
2994                 /* Mark the component as deleted */
2995                 lod_comp->llc_id = LCME_ID_INVAL;
2996
2997                 /* Not instantiated component */
2998                 if (lod_comp->llc_stripe == NULL)
2999                         continue;
3000
3001                 LASSERT(lod_comp->llc_stripe_count > 0);
3002                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
3003                         struct dt_object *obj = lod_comp->llc_stripe[j];
3004
3005                         if (obj == NULL)
3006                                 continue;
3007                         rc = lod_sub_declare_destroy(env, obj, th);
3008                         if (rc)
3009                                 RETURN(rc);
3010                 }
3011         }
3012
3013         LASSERTF(left >= 0, "left = %d\n", left);
3014         if (left == lo->ldo_comp_cnt) {
3015                 CDEBUG(D_LAYOUT, "%s: requested component id:%#x not found\n",
3016                        lod2obd(d)->obd_name, id);
3017                 RETURN(-EINVAL);
3018         }
3019
3020         memset(attr, 0, sizeof(*attr));
3021         attr->la_valid = LA_SIZE;
3022         rc = lod_sub_declare_attr_set(env, next, attr, th);
3023         if (rc)
3024                 RETURN(rc);
3025
3026         if (left > 0) {
3027                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
3028                 rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
3029                                                XATTR_NAME_LOV, 0, th);
3030         } else {
3031                 rc = lod_sub_declare_xattr_del(env, next, XATTR_NAME_LOV, th);
3032         }
3033
3034         RETURN(rc);
3035 }
3036
3037 /**
3038  * Declare layout add/set/del operations issued by special xattr names:
3039  *
3040  * XATTR_LUSTRE_LOV.add         add component(s) to existing file
3041  * XATTR_LUSTRE_LOV.del         delete component(s) from existing file
3042  * XATTR_LUSTRE_LOV.set.$field  set specified field of certain component(s)
3043  *
3044  * \param[in] env       execution environment
3045  * \param[in] dt        object
3046  * \param[in] name      name of xattr
3047  * \param[in] buf       lu_buf contains xattr value
3048  * \param[in] th        transaction handle
3049  *
3050  * \retval              0 on success
3051  * \retval              negative if failed
3052  */
3053 static int lod_declare_modify_layout(const struct lu_env *env,
3054                                      struct dt_object *dt,
3055                                      const char *name,
3056                                      const struct lu_buf *buf,
3057                                      struct thandle *th)
3058 {
3059         struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
3060         struct lod_object *lo = lod_dt_obj(dt);
3061         char *op;
3062         int rc, len = strlen(XATTR_LUSTRE_LOV);
3063         ENTRY;
3064
3065         LASSERT(dt_object_exists(dt));
3066
3067         if (strlen(name) <= len || name[len] != '.') {
3068                 CDEBUG(D_LAYOUT, "%s: invalid xattr name: %s\n",
3069                        lod2obd(d)->obd_name, name);
3070                 RETURN(-EINVAL);
3071         }
3072         len++;
3073
3074         rc = lod_striping_load(env, lo);
3075         if (rc)
3076                 GOTO(unlock, rc);
3077
3078         /* the layout to be modified must be a composite layout */
3079         if (!lo->ldo_is_composite) {
3080                 CDEBUG(D_LAYOUT, "%s: object "DFID" isn't a composite file.\n",
3081                        lod2obd(d)->obd_name, PFID(lu_object_fid(&dt->do_lu)));
3082                 GOTO(unlock, rc = -EINVAL);
3083         }
3084
3085         op = (char *)name + len;
3086         if (strcmp(op, "add") == 0) {
3087                 rc = lod_declare_layout_add(env, dt, buf, th);
3088         } else if (strcmp(op, "del") == 0) {
3089                 rc = lod_declare_layout_del(env, dt, buf, th);
3090         } else if (strncmp(op, "set", strlen("set")) == 0) {
3091                 rc = lod_declare_layout_set(env, dt, op, buf, th);
3092         } else  {
3093                 CDEBUG(D_LAYOUT, "%s: unsupported xattr name:%s\n",
3094                        lod2obd(d)->obd_name, name);
3095                 GOTO(unlock, rc = -ENOTSUPP);
3096         }
3097 unlock:
3098         if (rc)
3099                 lod_striping_free(env, lo);
3100
3101         RETURN(rc);
3102 }
3103
3104 /**
3105  * Convert a plain file lov_mds_md to a composite layout.
3106  *
3107  * \param[in,out] info  the thread info::lti_ea_store buffer contains little
3108  *                      endian plain file layout
3109  *
3110  * \retval              0 on success, <0 on failure
3111  */
3112 static int lod_layout_convert(struct lod_thread_info *info)
3113 {
3114         struct lov_mds_md *lmm = info->lti_ea_store;
3115         struct lov_mds_md *lmm_save;
3116         struct lov_comp_md_v1 *lcm;
3117         struct lov_comp_md_entry_v1 *lcme;
3118         size_t size;
3119         __u32 blob_size;
3120         int rc = 0;
3121         ENTRY;
3122
3123         /* realloc buffer to a composite layout which contains one component */
3124         blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
3125                                     le32_to_cpu(lmm->lmm_magic));
3126         size = sizeof(*lcm) + sizeof(*lcme) + blob_size;
3127
3128         OBD_ALLOC_LARGE(lmm_save, blob_size);
3129         if (!lmm_save)
3130                 GOTO(out, rc = -ENOMEM);
3131
3132         memcpy(lmm_save, lmm, blob_size);
3133
3134         if (info->lti_ea_store_size < size) {
3135                 rc = lod_ea_store_resize(info, size);
3136                 if (rc)
3137                         GOTO(out, rc);
3138         }
3139
3140         lcm = info->lti_ea_store;
3141         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
3142         lcm->lcm_size = cpu_to_le32(size);
3143         lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
3144                                                 lmm_save->lmm_layout_gen));
3145         lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
3146         lcm->lcm_entry_count = cpu_to_le16(1);
3147         lcm->lcm_mirror_count = 0;
3148
3149         lcme = &lcm->lcm_entries[0];
3150         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
3151         lcme->lcme_extent.e_start = 0;
3152         lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
3153         lcme->lcme_offset = cpu_to_le32(sizeof(*lcm) + sizeof(*lcme));
3154         lcme->lcme_size = cpu_to_le32(blob_size);
3155
3156         memcpy((char *)lcm + lcme->lcme_offset, (char *)lmm_save, blob_size);
3157
3158         EXIT;
3159 out:
3160         if (lmm_save)
3161                 OBD_FREE_LARGE(lmm_save, blob_size);
3162         return rc;
3163 }
3164
3165 /**
3166  * Merge layouts to form a mirrored file.
3167  */
3168 static int lod_declare_layout_merge(const struct lu_env *env,
3169                 struct dt_object *dt, const struct lu_buf *mbuf,
3170                 struct thandle *th)
3171 {
3172         struct lod_thread_info  *info = lod_env_info(env);
3173         struct lu_buf           *buf = &info->lti_buf;
3174         struct lod_object       *lo = lod_dt_obj(dt);
3175         struct lov_comp_md_v1   *lcm;
3176         struct lov_comp_md_v1   *cur_lcm;
3177         struct lov_comp_md_v1   *merge_lcm;
3178         struct lov_comp_md_entry_v1     *lcme;
3179         size_t size = 0;
3180         size_t offset;
3181         __u16 cur_entry_count;
3182         __u16 merge_entry_count;
3183         __u32 id = 0;
3184         __u16 mirror_id = 0;
3185         __u32 mirror_count;
3186         int     rc, i;
3187         ENTRY;
3188
3189         merge_lcm = mbuf->lb_buf;
3190         if (mbuf->lb_len < sizeof(*merge_lcm))
3191                 RETURN(-EINVAL);
3192
3193         /* must be an existing layout from disk */
3194         if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
3195                 RETURN(-EINVAL);
3196
3197         merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
3198
3199         /* do not allow to merge two mirrored files */
3200         if (le16_to_cpu(merge_lcm->lcm_mirror_count))
3201                 RETURN(-EBUSY);
3202
3203         /* verify the target buffer */
3204         rc = lod_get_lov_ea(env, lo);
3205         if (rc <= 0)
3206                 RETURN(rc ? : -ENODATA);
3207
3208         cur_lcm = info->lti_ea_store;
3209         switch (le32_to_cpu(cur_lcm->lcm_magic)) {
3210         case LOV_MAGIC_V1:
3211         case LOV_MAGIC_V3:
3212                 rc = lod_layout_convert(info);
3213                 break;
3214         case LOV_MAGIC_COMP_V1:
3215                 rc = 0;
3216                 break;
3217         default:
3218                 rc = -EINVAL;
3219         }
3220         if (rc)
3221                 RETURN(rc);
3222
3223         /* info->lti_ea_store could be reallocated in lod_layout_convert() */
3224         cur_lcm = info->lti_ea_store;
3225         cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
3226
3227         /* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
3228         mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
3229         if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
3230                 RETURN(-ERANGE);
3231
3232         /* size of new layout */
3233         size = le32_to_cpu(cur_lcm->lcm_size) +
3234                le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
3235
3236         memset(buf, 0, sizeof(*buf));
3237         lu_buf_alloc(buf, size);
3238         if (buf->lb_buf == NULL)
3239                 RETURN(-ENOMEM);
3240
3241         lcm = buf->lb_buf;
3242         memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
3243
3244         offset = sizeof(*lcm) +
3245                  sizeof(*lcme) * (cur_entry_count + merge_entry_count);
3246         for (i = 0; i < cur_entry_count; i++) {
3247                 struct lov_comp_md_entry_v1 *cur_lcme;
3248
3249                 lcme = &lcm->lcm_entries[i];
3250                 cur_lcme = &cur_lcm->lcm_entries[i];
3251
3252                 lcme->lcme_offset = cpu_to_le32(offset);
3253                 memcpy((char *)lcm + offset,
3254                        (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
3255                        le32_to_cpu(lcme->lcme_size));
3256
3257                 offset += le32_to_cpu(lcme->lcme_size);
3258
3259                 if (mirror_count == 1 &&
3260                     mirror_id_of(le32_to_cpu(lcme->lcme_id)) == 0) {
3261                         /* Add mirror from a non-flr file, create new mirror ID.
3262                          * Otherwise, keep existing mirror's component ID, used
3263                          * for mirror extension.
3264                          */
3265                         id = pflr_id(1, i + 1);
3266                         lcme->lcme_id = cpu_to_le32(id);
3267                 }
3268
3269                 id = MAX(le32_to_cpu(lcme->lcme_id), id);
3270         }
3271
3272         mirror_id = mirror_id_of(id) + 1;
3273         for (i = 0; i < merge_entry_count; i++) {
3274                 struct lov_comp_md_entry_v1 *merge_lcme;
3275
3276                 merge_lcme = &merge_lcm->lcm_entries[i];
3277                 lcme = &lcm->lcm_entries[cur_entry_count + i];
3278
3279                 *lcme = *merge_lcme;
3280                 lcme->lcme_offset = cpu_to_le32(offset);
3281
3282                 id = pflr_id(mirror_id, i + 1);
3283                 lcme->lcme_id = cpu_to_le32(id);
3284
3285                 memcpy((char *)lcm + offset,
3286                        (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
3287                        le32_to_cpu(lcme->lcme_size));
3288
3289                 offset += le32_to_cpu(lcme->lcme_size);
3290         }
3291
3292         /* fixup layout information */
3293         lod_obj_inc_layout_gen(lo);
3294         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3295         lcm->lcm_size = cpu_to_le32(size);
3296         lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
3297         lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
3298         if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NONE)
3299                 lcm->lcm_flags = cpu_to_le32(LCM_FL_RDONLY);
3300
3301         rc = lod_striping_reload(env, lo, buf);
3302         if (rc)
3303                 GOTO(out, rc);
3304
3305         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
3306                                         XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3307
3308 out:
3309         lu_buf_free(buf);
3310         RETURN(rc);
3311 }
3312
3313 /**
3314  * Split layouts, just set the LOVEA with the layout from mbuf.
3315  */
3316 static int lod_declare_layout_split(const struct lu_env *env,
3317                 struct dt_object *dt, const struct lu_buf *mbuf,
3318                 struct thandle *th)
3319 {
3320         struct lod_object *lo = lod_dt_obj(dt);
3321         struct lov_comp_md_v1 *lcm = mbuf->lb_buf;
3322         int rc;
3323         ENTRY;
3324
3325         lod_obj_inc_layout_gen(lo);
3326         lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
3327
3328         rc = lod_striping_reload(env, lo, mbuf);
3329         if (rc)
3330                 RETURN(rc);
3331
3332         rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), mbuf,
3333                                        XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
3334         RETURN(rc);
3335 }
3336
3337 /**
3338  * Implementation of dt_object_operations::do_declare_xattr_set.
3339  *
3340  * \see dt_object_operations::do_declare_xattr_set() in the API description
3341  * for details.
3342  *
3343  * the extension to the API:
3344  *   - declaring LOVEA requests striping creation
3345  *   - LU_XATTR_REPLACE means layout swap
3346  */
3347 static int lod_declare_xattr_set(const struct lu_env *env,
3348                                  struct dt_object *dt,
3349                                  const struct lu_buf *buf,
3350                                  const char *name, int fl,
3351                                  struct thandle *th)
3352 {
3353         struct dt_object *next = dt_object_child(dt);
3354         struct lu_attr   *attr = &lod_env_info(env)->lti_attr;
3355         __u32             mode;
3356         int               rc;
3357         ENTRY;
3358
3359         mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
3360         if ((S_ISREG(mode) || mode == 0) &&
3361             !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE | LU_XATTR_SPLIT)) &&
3362             (strcmp(name, XATTR_NAME_LOV) == 0 ||
3363              strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
3364                 /*
3365                  * this is a request to create object's striping.
3366                  *
3367                  * allow to declare predefined striping on a new (!mode) object
3368                  * which is supposed to be replay of regular file creation
3369                  * (when LOV setting is declared)
3370                  *
3371                  * LU_XATTR_REPLACE is set to indicate a layout swap
3372                  */
3373                 if (dt_object_exists(dt)) {
3374                         rc = dt_attr_get(env, next, attr);
3375                         if (rc)
3376                                 RETURN(rc);
3377                 } else {
3378                         memset(attr, 0, sizeof(*attr));
3379                         attr->la_valid = LA_TYPE | LA_MODE;
3380                         attr->la_mode = S_IFREG;
3381                 }
3382                 rc = lod_declare_striped_create(env, dt, attr, buf, th);
3383         } else if (fl & LU_XATTR_MERGE) {
3384                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3385                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3386                 rc = lod_declare_layout_merge(env, dt, buf, th);
3387         } else if (fl & LU_XATTR_SPLIT) {
3388                 LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
3389                         strcmp(name, XATTR_LUSTRE_LOV) == 0);
3390                 rc = lod_declare_layout_split(env, dt, buf, th);
3391         } else if (S_ISREG(mode) &&
3392                    strlen(name) > strlen(XATTR_LUSTRE_LOV) + 1 &&
3393                    strncmp(name, XATTR_LUSTRE_LOV,
3394                            strlen(XATTR_LUSTRE_LOV)) == 0) {
3395                 /*
3396                  * this is a request to modify object's striping.
3397                  * add/set/del component(s).
3398                  */
3399                 if (!dt_object_exists(dt))
3400                         RETURN(-ENOENT);
3401
3402                 rc = lod_declare_modify_layout(env, dt, name, buf, th);
3403         } else if (strncmp(name, XATTR_NAME_LMV, strlen(XATTR_NAME_LMV)) == 0 &&
3404                    strlen(name) > strlen(XATTR_NAME_LMV) + 1) {
3405                 const char *op = name + strlen(XATTR_NAME_LMV) + 1;
3406
3407                 rc = -ENOTSUPP;
3408                 if (strcmp(op, "add") == 0)
3409                         rc = lod_dir_declare_layout_add(env, dt, buf, th);
3410                 else if (strcmp(op, "del") == 0)
3411                         rc = lod_dir_declare_layout_delete(env, dt, buf, th);
3412                 else if (strcmp(op, "set") == 0)
3413                         rc = lod_sub_declare_xattr_set(env, next, buf,
3414                                                        XATTR_NAME_LMV, fl, th);
3415
3416                 RETURN(rc);
3417         } else if (S_ISDIR(mode)) {
3418                 rc = lod_dir_declare_xattr_set(env, dt, buf, name, fl, th);
3419         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
3420                 rc = lod_replace_parent_fid(env, dt, buf, th, true);
3421         } else {
3422                 rc = lod_sub_declare_xattr_set(env, next, buf, name, fl, th);
3423         }
3424
3425         RETURN(rc);
3426 }
3427
3428 /**
3429  * Apply xattr changes to the object.
3430  *
3431  * Applies xattr changes to the object and the stripes if the latter exist.
3432  *
3433  * \param[in] env       execution environment
3434  * \param[in] dt        object
3435  * \param[in] buf       buffer pointing to the new value of xattr
3436  * \param[in] name      name of xattr
3437  * \param[in] fl        flags
3438  * \param[in] th        transaction handle
3439  *
3440  * \retval              0 on success
3441  * \retval              negative if failed
3442  */
3443 static int lod_xattr_set_internal(const struct lu_env *env,
3444                                   struct dt_object *dt,
3445                                   const struct lu_buf *buf,
3446                                   const char *name, int fl,
3447                                   struct thandle *th)
3448 {
3449         struct dt_object        *next = dt_object_child(dt);
3450         struct lod_object       *lo = lod_dt_obj(dt);
3451         int                     rc;
3452         int                     i;
3453         ENTRY;
3454
3455         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
3456         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3457                 RETURN(rc);
3458
3459         /* Note: Do not set LinkEA on sub-stripes, otherwise
3460          * it will confuse the fid2path process(see mdt_path_current()).
3461          * The linkEA between master and sub-stripes is set in
3462          * lod_xattr_set_lmv(). */
3463         if (lo->ldo_dir_stripe_count == 0 || strcmp(name, XATTR_NAME_LINK) == 0)
3464                 RETURN(0);
3465
3466         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3467                 LASSERT(lo->ldo_stripe[i]);
3468
3469                 rc = lod_sub_xattr_set(env, lo->ldo_stripe[i], buf, name,
3470                                        fl, th);
3471                 if (rc != 0)
3472                         break;
3473         }
3474
3475         RETURN(rc);
3476 }
3477
3478 /**
3479  * Delete an extended attribute.
3480  *
3481  * Deletes specified xattr from the object and the stripes if the latter exist.
3482  *
3483  * \param[in] env       execution environment
3484  * \param[in] dt        object
3485  * \param[in] name      name of xattr
3486  * \param[in] th        transaction handle
3487  *
3488  * \retval              0 on success
3489  * \retval              negative if failed
3490  */
3491 static int lod_xattr_del_internal(const struct lu_env *env,
3492                                   struct dt_object *dt,
3493                                   const char *name, struct thandle *th)
3494 {
3495         struct dt_object        *next = dt_object_child(dt);
3496         struct lod_object       *lo = lod_dt_obj(dt);
3497         int                     rc;
3498         int                     i;
3499         ENTRY;
3500
3501         rc = lod_sub_xattr_del(env, next, name, th);
3502         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
3503                 RETURN(rc);
3504
3505         if (lo->ldo_dir_stripe_count == 0)
3506                 RETURN(rc);
3507
3508         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
3509                 LASSERT(lo->ldo_stripe[i]);
3510
3511                 rc = lod_sub_xattr_del(env, lo->ldo_stripe[i], name, th);
3512                 if (rc != 0)
3513                         break;
3514         }
3515
3516         RETURN(rc);
3517 }
3518
3519 /**
3520  * Set default striping on a directory.
3521  *
3522  * Sets specified striping on a directory object unless it matches the default
3523  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
3524  * EA. This striping will be used when regular file is being created in this
3525  * directory.
3526  *
3527  * \param[in] env       execution environment
3528  * \param[in] dt        the striped object
3529  * \param[in] buf       buffer with the striping
3530  * \param[in] name      name of EA
3531  * \param[in] fl        xattr flag (see OSD API description)
3532  * \param[in] th        transaction handle
3533  *
3534  * \retval              0 on success
3535  * \retval              negative if failed
3536  */
3537 static int lod_xattr_set_lov_on_dir(const struct lu_env *env,
3538                                     struct dt_object *dt,
3539                                     const struct lu_buf *buf,
3540                                     const char *name, int fl,
3541                                     struct thandle *th)
3542 {
3543         struct lov_user_md_v1   *lum;
3544         struct lov_user_md_v3   *v3 = NULL;
3545         const char              *pool_name = NULL;
3546         int                      rc;
3547         bool                     is_del;
3548         ENTRY;
3549
3550         LASSERT(buf != NULL && buf->lb_buf != NULL);
3551         lum = buf->lb_buf;
3552
3553         switch (lum->lmm_magic) {
3554         case LOV_USER_MAGIC_SPECIFIC:
3555         case LOV_USER_MAGIC_V3:
3556                 v3 = buf->lb_buf;
3557                 if (v3->lmm_pool_name[0] != '\0')
3558                         pool_name = v3->lmm_pool_name;
3559                 /* fall through */
3560         case LOV_USER_MAGIC_V1:
3561                 /* if { size, offset, count } = { 0, -1, 0 } and no pool
3562                  * (i.e. all default values specified) then delete default
3563                  * striping from dir. */
3564                 CDEBUG(D_LAYOUT,
3565                        "set default striping: sz %u # %u offset %d %s %s\n",
3566                        (unsigned)lum->lmm_stripe_size,
3567                        (unsigned)lum->lmm_stripe_count,
3568                        (int)lum->lmm_stripe_offset,
3569                        v3 ? "from" : "", v3 ? v3->lmm_pool_name : "");
3570
3571                 is_del = LOVEA_DELETE_VALUES(lum->lmm_stripe_size,
3572                                              lum->lmm_stripe_count,
3573                                              lum->lmm_stripe_offset,
3574                                              pool_name);
3575                 break;
3576         case LOV_USER_MAGIC_COMP_V1:
3577                 is_del = false;
3578                 break;
3579         default:
3580                 CERROR("Invalid magic %x\n", lum->lmm_magic);
3581                 RETURN(-EINVAL);
3582         }
3583
3584         if (is_del) {
3585                 rc = lod_xattr_del_internal(env, dt, name, th);
3586                 if (rc == -ENODATA)
3587                         rc = 0;
3588         } else {
3589                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
3590         }
3591
3592         RETURN(rc);
3593 }
3594
3595 /**
3596  * Set default striping on a directory object.
3597  *
3598  * Sets specified striping on a directory object unless it matches the default
3599  * striping (LOVEA_DELETE_VALUES() macro). In the latter case remove existing
3600  * EA. This striping will be used when a new directory is being created in the
3601  * directory.
3602  *
3603  * \param[in] env       execution environment
3604  * \param[in] dt        the striped object
3605  * \param[in] buf       buffer with the striping
3606  * \param[in] name      name of EA
3607  * \param[in] fl        xattr flag (see OSD API description)
3608  * \param[in] th        transaction handle
3609  *
3610  * \retval              0 on success
3611  * \retval              negative if failed
3612  */
3613 static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env,
3614                                             struct dt_object *dt,
3615                                             const struct lu_buf *buf,
3616                                             const char *name, int fl,
3617                                             struct thandle *th)
3618 {
3619         struct lmv_user_md_v1   *lum;
3620         int                      rc;
3621         ENTRY;
3622
3623         LASSERT(buf != NULL && buf->lb_buf != NULL);
3624         lum = buf->lb_buf;
3625
3626         CDEBUG(D_OTHER, "set default stripe_count # %u stripe_offset %d\n",
3627               le32_to_cpu(lum->lum_stripe_count),
3628               (int)le32_to_cpu(lum->lum_stripe_offset));
3629
3630         if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)),
3631                                  le32_to_cpu(lum->lum_stripe_offset)) &&
3632                                 le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) {
3633                 rc = lod_xattr_del_internal(env, dt, name, th);
3634                 if (rc == -ENODATA)
3635                         rc = 0;
3636         } else {
3637                 rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
3638                 if (rc != 0)
3639                         RETURN(rc);
3640         }
3641
3642         RETURN(rc);
3643 }
3644
/**
 * Turn directory into a striped directory.
 *
 * During replay the client sends the striping created before MDT
 * failure, then the layer above LOD sends this defined striping
 * using ->do_xattr_set(), so LOD uses this method to replay creation
 * of the stripes. Notice the original information for the striping
 * (#stripes, FIDs, etc) was transferred in declare path.
 *
 * For every stripe the function creates the stripe object (unless it is a
 * source stripe of a migrating directory), inserts its `dot` and `dotdot`
 * entries, sets the slave LMV EA and the linkEA on it, and inserts the
 * stripe into the master object under the name "<stripe FID>:<index>".
 * Finally the master LMV EA is set on the master object.
 *
 * \param[in] env       execution environment
 * \param[in] dt        the striped object
 * \param[in] buf       only used when the directory has no stripes and is
 *                      marked foreign: it is then stored as XATTR_NAME_LMV
 *                      on the master object
 * \param[in] name      not used currently
 * \param[in] fl        xattr flag (see OSD API description)
 * \param[in] th        transaction handle
 *
 * \retval              0 on success
 * \retval              -ENOTDIR if \a dt is not a directory
 * \retval              negative if failed
 */
static int lod_xattr_set_lmv(const struct lu_env *env, struct dt_object *dt,
                             const struct lu_buf *buf, const char *name,
                             int fl, struct thandle *th)
{
        struct lod_object       *lo = lod_dt_obj(dt);
        struct lod_thread_info  *info = lod_env_info(env);
        struct lu_attr          *attr = &info->lti_attr;
        struct dt_object_format *dof = &info->lti_format;
        struct lu_buf           lmv_buf;
        struct lu_buf           slave_lmv_buf;
        struct lmv_mds_md_v1    *lmm;
        struct lmv_mds_md_v1    *slave_lmm = NULL;
        struct dt_insert_rec    *rec = &info->lti_dt_rec;
        int                     i;
        int                     rc;
        ENTRY;

        if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
                RETURN(-ENOTDIR);

        /* The stripes are supposed to be allocated in declare phase,
         * if there are no stripes being allocated, it will skip */
        if (lo->ldo_dir_stripe_count == 0) {
                /* a foreign directory has no stripes, only the LMV EA
                 * supplied by the caller is stored on the master object */
                if (lo->ldo_dir_is_foreign) {
                        rc = lod_sub_xattr_set(env, dt_object_child(dt), buf,
                                               XATTR_NAME_LMV, fl, th);
                        if (rc != 0)
                                RETURN(rc);
                }
                RETURN(0);
        }

        /* the stripes are created with the attributes of the master object */
        rc = dt_attr_get(env, dt_object_child(dt), attr);
        if (rc != 0)
                RETURN(rc);

        attr->la_valid = LA_ATIME | LA_MTIME | LA_CTIME |
                         LA_MODE | LA_UID | LA_GID | LA_TYPE | LA_PROJID;
        dof->dof_type = DFT_DIR;

        /* master LMV EA, set on the master object after the loop below */
        rc = lod_prep_lmv_md(env, dt, &lmv_buf);
        if (rc != 0)
                RETURN(rc);
        lmm = lmv_buf.lb_buf;

        OBD_ALLOC_PTR(slave_lmm);
        if (slave_lmm == NULL)
                RETURN(-ENOMEM);

        /* slave LMV EA, derived from the master one and shared by all the
         * stripes; only lmv_master_mdt_index is updated per stripe */
        lod_prep_slave_lmv_md(slave_lmm, lmm);
        slave_lmv_buf.lb_buf = slave_lmm;
        slave_lmv_buf.lb_len = sizeof(*slave_lmm);

        /* rec is reused for every directory entry inserted below */
        rec->rec_type = S_IFDIR;
        for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
                struct dt_object *dto = lo->ldo_stripe[i];
                char *stripe_name = info->lti_key;
                struct lu_name *sname;
                struct linkea_data ldata = { NULL };
                struct lu_buf linkea_buf;

                /* if it's source stripe of migrating directory, don't create */
                if (!((lo->ldo_dir_hash_type & LMV_HASH_FLAG_MIGRATION) &&
                      i >= lo->ldo_dir_migrate_offset)) {
                        dt_write_lock(env, dto, MOR_TGT_CHILD);
                        rc = lod_sub_create(env, dto, attr, NULL, dof, th);
                        if (rc != 0) {
                                dt_write_unlock(env, dto);
                                GOTO(out, rc);
                        }

                        rc = lod_sub_ref_add(env, dto, th);
                        dt_write_unlock(env, dto);
                        if (rc != 0)
                                GOTO(out, rc);

                        /* the `dot` entry of the stripe refers to itself */
                        rec->rec_fid = lu_object_fid(&dto->do_lu);
                        rc = lod_sub_insert(env, dto,
                                            (const struct dt_rec *)rec,
                                            (const struct dt_key *)dot, th);
                        if (rc != 0)
                                GOTO(out, rc);
                }

                /* the `dotdot` entry of the stripe refers to the master */
                rec->rec_fid = lu_object_fid(&dt->do_lu);
                rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
                                    (const struct dt_key *)dotdot, th);
                if (rc != 0)
                        GOTO(out, rc);

                /* fault injection: OBD_FAIL_LFSCK_LOST_SLAVE_LMV skips the
                 * slave LMV EA of stripe cfs_fail_val, and
                 * OBD_FAIL_LFSCK_BAD_SLAVE_LMV stores index i + 1 instead
                 * of i for that stripe */
                if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SLAVE_LMV) ||
                    cfs_fail_val != i) {
                        if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_LMV) &&
                            cfs_fail_val == i)
                                slave_lmm->lmv_master_mdt_index =
                                                        cpu_to_le32(i + 1);
                        else
                                slave_lmm->lmv_master_mdt_index =
                                                        cpu_to_le32(i);

                        rc = lod_sub_xattr_set(env, dto, &slave_lmv_buf,
                                               XATTR_NAME_LMV, fl, th);
                        if (rc != 0)
                                GOTO(out, rc);
                }

                /* the stripe is named "<stripe FID>:<index>";
                 * OBD_FAIL_LFSCK_BAD_SLAVE_NAME makes the index of stripe
                 * cfs_fail_val off by one */
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
                    cfs_fail_val == i)
                        snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
                                 PFID(lu_object_fid(&dto->do_lu)), i + 1);
                else
                        snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
                                 PFID(lu_object_fid(&dto->do_lu)), i);

                /* linkEA of the stripe: its name under the master object */
                sname = lod_name_get(env, stripe_name, strlen(stripe_name));
                rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
                                      sname, lu_object_fid(&dt->do_lu));
                if (rc != 0)
                        GOTO(out, rc);

                linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
                linkea_buf.lb_len = ldata.ld_leh->leh_len;
                rc = lod_sub_xattr_set(env, dto, &linkea_buf,
                                       XATTR_NAME_LINK, 0, th);
                if (rc != 0)
                        GOTO(out, rc);

                /* insert the stripe into the master object under its name
                 * and take a reference on the master object for it */
                rec->rec_fid = lu_object_fid(&dto->do_lu);
                rc = lod_sub_insert(env, dt_object_child(dt),
                                    (const struct dt_rec *)rec,
                                    (const struct dt_key *)stripe_name, th);
                if (rc != 0)
                        GOTO(out, rc);

                rc = lod_sub_ref_add(env, dt_object_child(dt), th);
                if (rc != 0)
                        GOTO(out, rc);
        }

        /* fault injection: OBD_FAIL_LFSCK_LOST_MASTER_LMV skips setting the
         * master LMV EA */
        if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MASTER_LMV))
                rc = lod_sub_xattr_set(env, dt_object_child(dt),
                                       &lmv_buf, XATTR_NAME_LMV, fl, th);
out:
        if (slave_lmm != NULL)
                OBD_FREE_PTR(slave_lmm);

        RETURN(rc);
}
3812
3813 /**
3814  * Helper function to declare/execute creation of a striped directory
3815  *
3816  * Called in declare/create object path, prepare striping for a directory
3817  * and prepare defaults data striping for the objects to be created in
3818  * that directory. Notice the function calls "declaration" or "execution"
3819  * methods depending on \a declare param. This is a consequence of the
3820  * current approach while we don't have natural distributed transactions:
3821  * we basically execute non-local updates in the declare phase. So, the
3822  * arguments for the both phases are the same and this is the reason for
3823  * this function to exist.
3824  *
3825  * \param[in] env       execution environment
3826  * \param[in] dt        object
3827  * \param[in] attr      attributes the stripes will be created with
3828  * \param[in] lmu       lmv_user_md if MDT indices are specified
3829  * \param[in] dof       format of stripes (see OSD API description)
3830  * \param[in] th        transaction handle
 * \param[in] declare   whether to call "declare" or "execute" methods
3832  *
3833  * \retval              0 on success
3834  * \retval              negative if failed
3835  */
3836 static int lod_dir_striping_create_internal(const struct lu_env *env,
3837                                             struct dt_object *dt,
3838                                             struct lu_attr *attr,
3839                                             const struct lu_buf *lmu,
3840                                             struct dt_object_format *dof,
3841                                             struct thandle *th,
3842                                             bool declare)
3843 {
3844         struct lod_thread_info *info = lod_env_info(env);
3845         struct lod_object *lo = lod_dt_obj(dt);
3846         const struct lod_default_striping *lds = lo->ldo_def_striping;
3847         int rc;
3848         ENTRY;
3849
3850         LASSERT(ergo(lds != NULL,
3851                      lds->lds_def_striping_set ||
3852                      lds->lds_dir_def_striping_set));
3853
3854         if (!LMVEA_DELETE_VALUES(lo->ldo_dir_stripe_count,
3855                                  lo->ldo_dir_stripe_offset)) {
3856                 if (!lmu) {
3857                         struct lmv_user_md_v1 *v1 = info->lti_ea_store;
3858                         int stripe_count = lo->ldo_dir_stripe_count;
3859
3860                         if (info->lti_ea_store_size < sizeof(*v1)) {
3861                                 rc = lod_ea_store_resize(info, sizeof(*v1));
3862                                 if (rc != 0)
3863                                         RETURN(rc);
3864                                 v1 = info->lti_ea_store;
3865                         }
3866
3867                         memset(v1, 0, sizeof(*v1));
3868                         v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
3869                         v1->lum_stripe_count = cpu_to_le32(stripe_count);
3870                         v1->lum_stripe_offset =
3871                                         cpu_to_le32(lo->ldo_dir_stripe_offset);
3872
3873                         info->lti_buf.lb_buf = v1;
3874                         info->lti_buf.lb_len = sizeof(*v1);
3875                         lmu = &info->lti_buf;
3876                 }
3877
3878                 if (declare)
3879                         rc = lod_declare_xattr_set_lmv(env, dt, attr, lmu, dof,
3880                                                        th);
3881                 else
3882                         rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
3883                                                th);
3884                 if (rc != 0)
3885                         RETURN(rc);
3886         } else {
3887                 /* foreign LMV EA case */
3888                 if (lmu) {
3889                         struct lmv_foreign_md *lfm = lmu->lb_buf;
3890
3891                         if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) {
3892                                 rc = lod_declare_xattr_set_lmv(env, dt, attr,
3893                                                                lmu, dof, th);
3894                         }
3895                 } else {
3896                         if (lo->ldo_dir_is_foreign) {
3897                                 LASSERT(lo->ldo_foreign_lmv != NULL &&
3898                                         lo->ldo_foreign_lmv_size > 0);
3899                                 info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
3900                                 info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
3901                                 lmu = &info->lti_buf;
3902                                 rc = lod_xattr_set_lmv(env, dt, lmu,
3903                                                        XATTR_NAME_LMV, 0, th);
3904                         }
3905                 }
3906         }
3907
3908         /* Transfer default LMV striping from the parent */
3909         if (lds != NULL && lds->lds_dir_def_striping_set &&
3910             !LMVEA_DELETE_VALUES(lds->lds_dir_def_stripe_count,
3911                                  lds->lds_dir_def_stripe_offset)) {
3912                 struct lmv_user_md_v1 *v1 = info->lti_ea_store;
3913
3914                 if (info->lti_ea_store_size < sizeof(*v1)) {
3915                         rc = lod_ea_store_resize(info, sizeof(*v1));
3916                         if (rc != 0)
3917                                 RETURN(rc);
3918                         v1 = info->lti_ea_store;
3919                 }
3920
3921                 memset(v1, 0, sizeof(*v1));
3922                 v1->lum_magic = cpu_to_le32(LMV_USER_MAGIC);
3923                 v1->lum_stripe_count =
3924                         cpu_to_le32(lds->lds_dir_def_stripe_count);
3925                 v1->lum_stripe_offset =
3926                         cpu_to_le32(lds->lds_dir_def_stripe_offset);
3927                 v1->lum_hash_type =
3928                         cpu_to_le32(lds->lds_dir_def_hash_type);
3929
3930                 info->lti_buf.lb_buf = v1;
3931                 info->lti_buf.lb_len = sizeof(*v1);
3932                 if (declare)
3933                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
3934                                                        XATTR_NAME_DEFAULT_LMV,
3935                                                        0, th);
3936                 else
3937                         rc = lod_xattr_set_default_lmv_on_dir(env, dt,
3938                                                   &info->lti_buf,
3939                                                   XATTR_NAME_DEFAULT_LMV, 0,
3940                                                   th);
3941                 if (rc != 0)
3942                         RETURN(rc);
3943         }
3944
3945         /* Transfer default LOV striping from the parent */
3946         if (lds != NULL && lds->lds_def_striping_set &&
3947             lds->lds_def_comp_cnt != 0) {
3948                 struct lov_mds_md *lmm;
3949                 int lmm_size = lod_comp_md_size(lo, true);
3950
3951                 if (info->lti_ea_store_size < lmm_size) {
3952                         rc = lod_ea_store_resize(info, lmm_size);
3953                         if (rc != 0)
3954                                 RETURN(rc);
3955                 }
3956                 lmm = info->lti_ea_store;
3957
3958                 rc = lod_generate_lovea(env, lo, lmm, &lmm_size, true);
3959                 if (rc != 0)
3960                         RETURN(rc);
3961
3962                 info->lti_buf.lb_buf = lmm;
3963                 info->lti_buf.lb_len = lmm_size;
3964
3965                 if (declare)
3966                         rc = lod_dir_declare_xattr_set(env, dt, &info->lti_buf,
3967                                                        XATTR_NAME_LOV, 0, th);
3968                 else
3969                         rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
3970                                                       XATTR_NAME_LOV, 0, th);
3971                 if (rc != 0)
3972                         RETURN(rc);
3973         }
3974
3975         RETURN(0);
3976 }
3977
3978 static int lod_declare_dir_striping_create(const struct lu_env *env,
3979                                            struct dt_object *dt,
3980                                            struct lu_attr *attr,
3981                                            struct lu_buf *lmu,
3982                                            struct dt_object_format *dof,
3983                                            struct thandle *th)
3984 {
3985         return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
3986                                                 true);
3987 }
3988
3989 static int lod_dir_striping_create(const struct lu_env *env,
3990                                    struct dt_object *dt,
3991                                    struct lu_attr *attr,
3992                                    struct dt_object_format *dof,
3993                                    struct thandle *th)
3994 {
3995         return lod_dir_striping_create_internal(env, dt, attr, NULL, dof, th,
3996                                                 false);
3997 }
3998
3999 /**
4000  * Make LOV EA for striped object.
4001  *
4002  * Generate striping information and store it in the LOV EA of the given
4003  * object. The caller must ensure nobody else is calling the function
4004  * against the object concurrently. The transaction must be started.
4005  * FLDB service must be running as well; it's used to map FID to the target,
4006  * which is stored in LOV EA.
4007  *
4008  * \param[in] env               execution environment for this thread
4009  * \param[in] lo                LOD object
4010  * \param[in] th                transaction handle
4011  *
4012  * \retval                      0 if LOV EA is stored successfully
4013  * \retval                      negative error number on failure
4014  */
4015 static int lod_generate_and_set_lovea(const struct lu_env *env,
4016                                       struct lod_object *lo,
4017                                       struct thandle *th)
4018 {
4019         struct lod_thread_info  *info = lod_env_info(env);
4020         struct dt_object        *next = dt_object_child(&lo->ldo_obj);
4021         struct lov_mds_md_v1    *lmm;
4022         int                      rc, lmm_size;
4023         ENTRY;
4024
4025         LASSERT(lo);
4026
4027         if (lo->ldo_comp_cnt == 0 && !lo->ldo_is_foreign) {
4028                 lod_striping_free(env, lo);
4029                 rc = lod_sub_xattr_del(env, next, XATTR_NAME_LOV, th);
4030                 RETURN(rc);
4031         }
4032
4033         lmm_size = lod_comp_md_size(lo, false);
4034         if (info->lti_ea_store_size < lmm_size) {
4035                 rc = lod_ea_store_resize(info, lmm_size);
4036                 if (rc)
4037                         RETURN(rc);
4038         }
4039         lmm = info->lti_ea_store;
4040
4041         rc = lod_generate_lovea(env, lo, lmm, &lmm_size, false);
4042         if (rc)
4043                 RETURN(rc);
4044
4045         info->lti_buf.lb_buf = lmm;
4046         info->lti_buf.lb_len = lmm_size;
4047         rc = lod_sub_xattr_set(env, next, &info->lti_buf,
4048                                XATTR_NAME_LOV, 0, th);
4049         RETURN(rc);
4050 }
4051
/**
 * Delete layout component(s)
 *
 * Walks the component array from the last entry towards the first and
 * removes every trailing entry whose llc_id is LCME_ID_INVAL, stopping at
 * the first entry with any other id. For each removed entry the stripe
 * objects are destroyed and the per-component arrays are freed. The
 * component array is then shrunk (or freed entirely), the size attribute of
 * the underlying object is reset to 0 if it was non-zero, and the LOV EA is
 * regenerated. On any failure the in-memory striping is released.
 *
 * \param[in] env       execution environment for this thread
 * \param[in] dt        object
 * \param[in] th        transaction handle
 *
 * \retval      0 on success
 * \retval      negative error number on failure
 */
static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
                          struct thandle *th)
{
        struct lod_layout_component     *lod_comp;
        struct lod_object       *lo = lod_dt_obj(dt);
        struct dt_object        *next = dt_object_child(dt);
        struct lu_attr  *attr = &lod_env_info(env)->lti_attr;
        int     rc, i, j, left;

        LASSERT(lo->ldo_is_composite);
        LASSERT(lo->ldo_comp_cnt > 0 && lo->ldo_comp_entries != NULL);

        /* 'left' ends up as the number of components that survive */
        left = lo->ldo_comp_cnt;
        for (i = (lo->ldo_comp_cnt - 1); i >= 0; i--) {
                lod_comp = &lo->ldo_comp_entries[i];

                /* only the trailing run of LCME_ID_INVAL entries is removed */
                if (lod_comp->llc_id != LCME_ID_INVAL)
                        break;
                left--;

                /* Not instantiated component */
                /*
                 * NOTE(review): this 'continue' also skips the pool and OST
                 * list cleanup at the bottom of the loop body for this
                 * entry - confirm that is intended.
                 */
                if (lod_comp->llc_stripe == NULL)
                        continue;

                LASSERT(lod_comp->llc_stripe_count > 0);
                /* destroy each stripe object and drop our reference to it */
                for (j = 0; j < lod_comp->llc_stripe_count; j++) {
                        struct dt_object *obj = lod_comp->llc_stripe[j];

                        if (obj == NULL)
                                continue;
                        rc = lod_sub_destroy(env, obj, th);
                        if (rc)
                                GOTO(out, rc);

                        lu_object_put(env, &obj->do_lu);
                        lod_comp->llc_stripe[j] = NULL;
                }
                /* both arrays are sized by llc_stripes_allocated */
                OBD_FREE(lod_comp->llc_stripe, sizeof(struct dt_object *) *
                                        lod_comp->llc_stripes_allocated);
                lod_comp->llc_stripe = NULL;
                OBD_FREE(lod_comp->llc_ost_indices,
                         sizeof(__u32) * lod_comp->llc_stripes_allocated);
                lod_comp->llc_ost_indices = NULL;
                lod_comp->llc_stripes_allocated = 0;
                lod_obj_set_pool(lo, i, NULL);
                if (lod_comp->llc_ostlist.op_array) {
                        OBD_FREE(lod_comp->llc_ostlist.op_array,
                                 lod_comp->llc_ostlist.op_size);
                        lod_comp->llc_ostlist.op_array = NULL;
                        lod_comp->llc_ostlist.op_size = 0;
                }
        }

        /* at least one component must have been selected for removal */
        LASSERTF(left >= 0 && left < lo->ldo_comp_cnt, "left = %d\n", left);
        if (left > 0) {
                struct lod_layout_component     *comp_array;

                /* shrink: copy the surviving head into a smaller array */
                OBD_ALLOC(comp_array, sizeof(*comp_array) * left);
                if (comp_array == NULL)
                        GOTO(out, rc = -ENOMEM);

                memcpy(&comp_array[0], &lo->ldo_comp_entries[0],
                       sizeof(*comp_array) * left);

                OBD_FREE(lo->ldo_comp_entries,
                         sizeof(*comp_array) * lo->ldo_comp_cnt);
                lo->ldo_comp_entries = comp_array;
                lo->ldo_comp_cnt = left;

                /* the single mirror now ends at the last surviving entry */
                LASSERT(lo->ldo_mirror_count == 1);
                lo->ldo_mirrors[0].lme_end = left - 1;
                lod_obj_inc_layout_gen(lo);
        } else {
                /* every component was removed */
                lod_free_comp_entries(lo);
        }

        LASSERT(dt_object_exists(dt));
        rc = dt_attr_get(env, next, attr);
        if (rc)
                GOTO(out, rc);

        /* reset a non-zero size stored on the underlying object */
        if (attr->la_size > 0) {
                attr->la_size = 0;
                attr->la_valid = LA_SIZE;
                rc = lod_sub_attr_set(env, next, attr, th);
                if (rc)
                        GOTO(out, rc);
        }

        /* write the updated layout (or delete the EA if nothing is left) */
        rc = lod_generate_and_set_lovea(env, lo, th);
        EXIT;
out:
        /* on failure do not keep a half-updated layout cached */
        if (rc)
                lod_striping_free(env, lo);
        return rc;
}
4158
4159
/*
 * Forward declaration: lod_xattr_set() below calls this helper, whose
 * definition appears later in this file.
 */
static int lod_get_default_lov_striping(const struct lu_env *env,
                                        struct lod_object *lo,
                                        struct lod_default_striping *lds);
4163 /**
4164  * Implementation of dt_object_operations::do_xattr_set.
4165  *
4166  * Sets specified extended attribute on the object. Three types of EAs are
4167  * special:
4168  *   LOV EA - stores striping for a regular file or default striping (when set
4169  *            on a directory)
4170  *   LMV EA - stores a marker for the striped directories
4171  *   DMV EA - stores default directory striping
4172  *
4173  * When striping is applied to a non-striped existing object (this is called
4174  * late striping), then LOD notices the caller wants to turn the object into a
4175  * striped one. The stripe objects are created and appropriate EA is set:
4176  * LOV EA storing all the stripes directly or LMV EA storing just a small header
4177  * with striping configuration.
4178  *
4179  * \see dt_object_operations::do_xattr_set() in the API description for details.
4180  */
4181 static int lod_xattr_set(const struct lu_env *env,
4182                          struct dt_object *dt, const struct lu_buf *buf,
4183                          const char *name, int fl, struct thandle *th)
4184 {
4185         struct dt_object        *next = dt_object_child(dt);
4186         int                      rc;
4187         ENTRY;
4188
4189         if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
4190             strcmp(name, XATTR_NAME_LMV) == 0) {
4191                 rc = lod_dir_striping_create(env, dt, NULL, NULL, th);
4192                 RETURN(rc);
4193         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
4194                    strncmp(name, XATTR_NAME_LMV, strlen(XATTR_NAME_LMV)) == 0 &&
4195                    strlen(name) > strlen(XATTR_NAME_LMV) + 1) {
4196                 const char *op = name + strlen(XATTR_NAME_LMV) + 1;
4197
4198                 rc = -ENOTSUPP;
4199                 /*
4200                  * XATTR_NAME_LMV".add" is never called, but only declared,
4201                  * because lod_xattr_set_lmv() will do the addition.
4202                  */
4203                 if (strcmp(op, "del") == 0)
4204                         rc = lod_dir_layout_delete(env, dt, buf, th);
4205                 else if (strcmp(op, "set") == 0)
4206                         rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LMV,
4207                                                fl, th);
4208
4209                 RETURN(rc);
4210         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
4211             strcmp(name, XATTR_NAME_LOV) == 0) {
4212                 struct lod_default_striping *lds = lod_lds_buf_get(env);
4213                 struct lov_user_md_v1 *v1 = buf->lb_buf;
4214                 char pool[LOV_MAXPOOLNAME + 1];
4215                 bool is_del;
4216
4217                 /* get existing striping config */
4218                 rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds);
4219                 if (rc)
4220                         RETURN(rc);
4221
4222                 memset(pool, 0, sizeof(pool));
4223                 if (lds->lds_def_striping_set == 1)
4224                         lod_layout_get_pool(lds->lds_def_comp_entries,
4225                                             lds->lds_def_comp_cnt, pool,
4226                                             sizeof(pool));
4227
4228                 is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
4229                                              v1->lmm_stripe_count,
4230                                              v1->lmm_stripe_offset,
4231                                              NULL);
4232
4233                 /* Retain the pool name if it is not given */
4234                 if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
4235                         !is_del) {
4236                         struct lod_thread_info *info = lod_env_info(env);
4237                         struct lov_user_md_v3 *v3  = info->lti_ea_store;
4238
4239                         memset(v3, 0, sizeof(*v3));
4240                         v3->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V3);
4241                         v3->lmm_pattern = cpu_to_le32(v1->lmm_pattern);
4242                         v3->lmm_stripe_count =
4243                                         cpu_to_le32(v1->lmm_stripe_count);
4244                         v3->lmm_stripe_offset =
4245                                         cpu_to_le32(v1->lmm_stripe_offset);
4246                         v3->lmm_stripe_size = cpu_to_le32(v1->lmm_stripe_size);
4247
4248                         strlcpy(v3->lmm_pool_name, pool,
4249                                 sizeof(v3->lmm_pool_name));
4250
4251                         info->lti_buf.lb_buf = v3;
4252                         info->lti_buf.lb_len = sizeof(*v3);
4253                         rc = lod_xattr_set_lov_on_dir(env, dt, &info->lti_buf,
4254                                                       name, fl, th);
4255                 } else {
4256                         rc = lod_xattr_set_lov_on_dir(env, dt, buf, name,
4257                                                       fl, th);
4258                 }
4259
4260                 if (lds->lds_def_striping_set == 1 &&
4261                     lds->lds_def_comp_entries != NULL)
4262                         lod_free_def_comp_entries(lds);
4263
4264                 RETURN(rc);
4265         } else if (S_ISDIR(dt->do_lu.lo_header->loh_attr) &&
4266                    strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
4267                 /* default LMVEA */
4268                 rc = lod_xattr_set_default_lmv_on_dir(env, dt, buf, name, fl,
4269                                                       th);
4270                 RETURN(rc);
4271         } else if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
4272                    (!strcmp(name, XATTR_NAME_LOV) ||
4273                     !strncmp(name, XATTR_LUSTRE_LOV,
4274                              strlen(XATTR_LUSTRE_LOV)))) {
4275                 /* in case of lov EA swap, just set it
4276                  * if not, it is a replay so check striping match what we
4277                  * already have during req replay, declare_xattr_set()
4278                  * defines striping, then create() does the work */
4279                 if (fl & LU_XATTR_REPLACE) {
4280                         /* free stripes, then update disk */
4281                         lod_striping_free(env, lod_dt_obj(dt));
4282
4283                         rc = lod_sub_xattr_set(env, next, buf, name, fl, th);
4284                 } else if (dt_object_remote(dt)) {
4285                         /* This only happens during migration, see
4286                          * mdd_migrate_create(), in which Master MDT will
4287                          * create a remote target object, and only set
4288                          * (migrating) stripe EA on the remote object,
4289                          * and does not need creating each stripes. */
4290                         rc = lod_sub_xattr_set(env, next, buf, name,
4291                                                       fl, th);
4292                 } else if (strcmp(name, XATTR_LUSTRE_LOV".del") == 0) {
4293                         /* delete component(s) */
4294                         LASSERT(lod_dt_obj(dt)->ldo_comp_cached);
4295                         rc = lod_layout_del(env, dt, th);
4296                 } else {
4297                         /*
4298                          * When 'name' is XATTR_LUSTRE_LOV or XATTR_NAME_LOV,
4299                          * it's going to create create file with specified
4300                          * component(s), the striping must have not being
4301                          * cached in this case;
4302                          *
4303                          * Otherwise, it's going to add/change component(s) to
4304                          * an existing file, the striping must have been cached
4305                          * in this case.
4306                          */
4307                         LASSERT(equi(!strcmp(name, XATTR_LUSTRE_LOV) ||
4308                                      !strcmp(name, XATTR_NAME_LOV),
4309                                 !lod_dt_obj(dt)->ldo_comp_cached));
4310
4311                         rc = lod_striped_create(env, dt, NULL, NULL, th);
4312                 }
4313                 RETURN(rc);
4314         } else if (strcmp(name, XATTR_NAME_FID) == 0) {
4315                 rc = lod_replace_parent_fid(env, dt, buf, th, false);
4316
4317                 RETURN(rc);
4318         }
4319
4320         /* then all other xattr */
4321         rc = lod_xattr_set_internal(env, dt, buf, name, fl, th);
4322
4323         RETURN(rc);
4324 }
4325
4326 /**
4327  * Implementation of dt_object_operations::do_declare_xattr_del.
4328  *
4329  * \see dt_object_operations::do_declare_xattr_del() in the API description
4330  * for details.
4331  */
4332 static int lod_declare_xattr_del(const struct lu_env *env,
4333                                  struct dt_object *dt, const char *name,
4334                                  struct thandle *th)
4335 {
4336         struct lod_object *lo = lod_dt_obj(dt);
4337         struct dt_object *next = dt_object_child(dt);
4338         int i;
4339         int rc;
4340         ENTRY;
4341
4342         rc = lod_sub_declare_xattr_del(env, next, name, th);
4343         if (rc != 0)
4344                 RETURN(rc);
4345
4346         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
4347                 RETURN(0);
4348
4349         /* set xattr to each stripes, if needed */
4350         rc = lod_striping_load(env, lo);
4351         if (rc != 0)
4352                 RETURN(rc);
4353
4354         if (lo->ldo_dir_stripe_count == 0)
4355                 RETURN(0);
4356
4357         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
4358                 struct dt_object *dto = lo->ldo_stripe[i];
4359
4360                 LASSERT(dto);
4361                 rc = lod_sub_declare_xattr_del(env, dto, name, th);
4362                 if (rc != 0)
4363                         break;
4364         }
4365
4366         RETURN(rc);
4367 }
4368
4369 /**
4370  * Implementation of dt_object_operations::do_xattr_del.
4371  *
4372  * If EA storing a regular striping is being deleted, then release
4373  * all the references to the stripe objects in core.
4374  *
4375  * \see dt_object_operations::do_xattr_del() in the API description for details.
4376  */
4377 static int lod_xattr_del(const struct lu_env *env, struct dt_object *dt,
4378                          const char *name, struct thandle *th)
4379 {
4380         struct dt_object        *next = dt_object_child(dt);
4381         struct lod_object       *lo = lod_dt_obj(dt);
4382         int                     rc;
4383         int                     i;
4384         ENTRY;
4385
4386         if (!strcmp(name, XATTR_NAME_LOV) || !strcmp(name, XATTR_NAME_LMV))
4387                 lod_striping_free(env, lod_dt_obj(dt));
4388
4389         rc = lod_sub_xattr_del(env, next, name, th);
4390         if (rc != 0 || !S_ISDIR(dt->do_lu.lo_header->loh_attr))
4391                 RETURN(rc);
4392
4393         if (lo->ldo_dir_stripe_count == 0)
4394                 RETURN(0);
4395
4396         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
4397                 struct dt_object *dto = lo->ldo_stripe[i];
4398
4399                 LASSERT(dto);
4400
4401                 rc = lod_sub_xattr_del(env, dto, name, th);
4402                 if (rc != 0)
4403                         break;
4404         }
4405
4406         RETURN(rc);
4407 }
4408
/**
 * Implementation of dt_object_operations::do_xattr_list.
 *
 * Forwards the request unchanged to the child object.
 *
 * \see dt_object_operations::do_xattr_list() in the API description
 * for details.
 */
static int lod_xattr_list(const struct lu_env *env,
                          struct dt_object *dt, const struct lu_buf *buf)
{
        struct dt_object *child = dt_object_child(dt);

        return dt_xattr_list(env, child, buf);
}
4420
4421 static inline int lod_object_will_be_striped(int is_reg, const struct lu_fid *fid)
4422 {
4423         return (is_reg && fid_seq(fid) != FID_SEQ_LOCAL_FILE);
4424 }
4425
4426 /**
4427  * Copy OST list from layout provided by user.
4428  *
4429  * \param[in] lod_comp          layout_component to be filled
4430  * \param[in] v3                LOV EA V3 user data
4431  *
4432  * \retval              0 on success
4433  * \retval              negative if failed
4434  */
4435 int lod_comp_copy_ost_lists(struct lod_layout_component *lod_comp,
4436                             struct lov_user_md_v3 *v3)
4437 {
4438         int j;
4439
4440         ENTRY;
4441
4442         if (v3->lmm_stripe_offset == LOV_OFFSET_DEFAULT)
4443                 v3->lmm_stripe_offset = v3->lmm_objects[0].l_ost_idx;
4444
4445         if (lod_comp->llc_ostlist.op_array) {
4446                 if (lod_comp->llc_ostlist.op_size >=
4447                     v3->lmm_stripe_count * sizeof(__u32))  {
4448                         lod_comp->llc_ostlist.op_count =
4449                                         v3->lmm_stripe_count;
4450                         goto skip;
4451                 }
4452                 OBD_FREE(lod_comp->llc_ostlist.op_array,
4453                          lod_comp->llc_ostlist.op_size);
4454         }
4455
4456         /* copy ost list from lmm */
4457         lod_comp->llc_ostlist.op_count = v3->lmm_stripe_count;
4458         lod_comp->llc_ostlist.op_size = v3->lmm_stripe_count * sizeof(__u32);
4459         OBD_ALLOC(lod_comp->llc_ostlist.op_array,
4460                   lod_comp->llc_ostlist.op_size);
4461         if (!lod_comp->llc_ostlist.op_array)
4462                 RETURN(-ENOMEM);
4463 skip:
4464         for (j = 0; j < v3->lmm_stripe_count; j++) {
4465                 lod_comp->llc_ostlist.op_array[j] =
4466                         v3->lmm_objects[j].l_ost_idx;
4467         }
4468
4469         RETURN(0);
4470 }
4471
4472
/**
 * Get default striping.
 *
 * Loads the LOV EA of \a lo via lod_get_lov_ea() and converts it into the
 * default striping template \a lds. V1, V3, "specific" and composite
 * layouts are accepted; a layout stored in the opposite byte order is
 * swabbed in place first. \a lds->lds_def_striping_set is cleared on entry
 * and only set to 1 once the whole layout has been parsed, so it stays 0
 * when the EA is too small to hold a struct lov_user_md and on every
 * error return.
 *
 * \param[in] env               execution environment
 * \param[in] lo                object
 * \param[out] lds              default striping
 *
 * \retval              0 on success
 * \retval              negative if failed
 */
static int lod_get_default_lov_striping(const struct lu_env *env,
                                        struct lod_object *lo,
                                        struct lod_default_striping *lds)
{
        struct lod_thread_info *info = lod_env_info(env);
        struct lov_user_md_v1 *v1 = NULL;
        struct lov_user_md_v3 *v3 = NULL;
        struct lov_comp_md_v1 *comp_v1 = NULL;
        __u16   comp_cnt;
        __u16   mirror_cnt;
        bool    composite;
        int     rc, i, j;
        ENTRY;

        lds->lds_def_striping_set = 0;

        /*
         * presumably returns the EA size and leaves the EA in
         * info->lti_ea_store - verify against lod_get_lov_ea()
         */
        rc = lod_get_lov_ea(env, lo);
        if (rc < 0)
                RETURN(rc);

        /* too small to be a layout: report "no default striping" */
        if (rc < (typeof(rc))sizeof(struct lov_user_md))
                RETURN(0);

        /* swab in place if the magic matches a byte-swapped known magic */
        v1 = info->lti_ea_store;
        if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
                lustre_swab_lov_user_md_v1(v1);
        } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
                v3 = (struct lov_user_md_v3 *)v1;
                lustre_swab_lov_user_md_v3(v3);
        } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_SPECIFIC)) {
                v3 = (struct lov_user_md_v3 *)v1;
                lustre_swab_lov_user_md_v3(v3);
                lustre_swab_lov_user_md_objects(v3->lmm_objects,
                                                v3->lmm_stripe_count);
        } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
                comp_v1 = (struct lov_comp_md_v1 *)v1;
                lustre_swab_lov_comp_md_v1(comp_v1);
        }

        /* anything but the four supported magics is rejected */
        if (v1->lmm_magic != LOV_MAGIC_V3 && v1->lmm_magic != LOV_MAGIC_V1 &&
            v1->lmm_magic != LOV_MAGIC_COMP_V1 &&
            v1->lmm_magic != LOV_USER_MAGIC_SPECIFIC)
                RETURN(-ENOTSUPP);

        if (v1->lmm_magic == LOV_MAGIC_COMP_V1) {
                comp_v1 = (struct lov_comp_md_v1 *)v1;
                comp_cnt = comp_v1->lcm_entry_count;
                if (comp_cnt == 0)
                        RETURN(-EINVAL);
                mirror_cnt = comp_v1->lcm_mirror_count + 1;
                composite = true;
        } else {
                /* a plain layout is handled as a single component */
                comp_cnt = 1;
                mirror_cnt = 0;
                composite = false;
        }

        /* realloc default comp entries if necessary */
        rc = lod_def_striping_comp_resize(lds, comp_cnt);
        if (rc < 0)
                RETURN(rc);

        lds->lds_def_comp_cnt = comp_cnt;
        lds->lds_def_striping_is_composite = composite;
        lds->lds_def_mirror_cnt = mirror_cnt;

        for (i = 0; i < comp_cnt; i++) {
                struct lod_layout_component *lod_comp;
                char *pool;

                lod_comp = &lds->lds_def_comp_entries[i];
                /*
                 * reset lod_comp values, llc_stripes is always NULL in
                 * the default striping template, llc_pool will be reset
                 * later below.
                 */
                memset(lod_comp, 0, offsetof(typeof(*lod_comp), llc_pool));

                if (composite) {
                        /* point v1 at the layout embedded in entry i */
                        v1 = (struct lov_user_md *)((char *)comp_v1 +
                                        comp_v1->lcm_entries[i].lcme_offset);
                        lod_comp->llc_extent =
                                        comp_v1->lcm_entries[i].lcme_extent;
                        /* We only inherit certain flags from the layout */
                        lod_comp->llc_flags =
                                        comp_v1->lcm_entries[i].lcme_flags &
                                        LCME_TEMPLATE_FLAGS;
                }

                /* only RAID0, MDT or an unset (0) pattern is accepted */
                if (v1->lmm_pattern != LOV_PATTERN_RAID0 &&
                    v1->lmm_pattern != LOV_PATTERN_MDT &&
                    v1->lmm_pattern != 0) {
                        lod_free_def_comp_entries(lds);
                        RETURN(-EINVAL);
                }

                CDEBUG(D_LAYOUT, DFID" stripe_count=%d stripe_size=%d "
                       "stripe_offset=%d\n",
                       PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
                       (int)v1->lmm_stripe_count, (int)v1->lmm_stripe_size,
                       (int)v1->lmm_stripe_offset);

                lod_comp->llc_stripe_count = v1->lmm_stripe_count;
                lod_comp->llc_stripe_size = v1->lmm_stripe_size;
                lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
                lod_comp->llc_pattern = v1->lmm_pattern;

                /* a pool name is only taken from a V3 layout */
                pool = NULL;
                if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
                        /* XXX: sanity check here */
                        v3 = (struct lov_user_md_v3 *) v1;
                        if (v3->lmm_pool_name[0] != '\0')
                                pool = v3->lmm_pool_name;
                }
                lod_set_def_pool(lds, i, pool);
                if (v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
                        v3 = (struct lov_user_md_v3 *)v1;
                        rc = lod_comp_copy_ost_lists(lod_comp, v3);
                        /*
                         * NOTE(review): unlike the pattern check above, this
                         * error path returns without calling
                         * lod_free_def_comp_entries() - confirm intended.
                         */
                        if (rc)
                                RETURN(rc);
                } else if (lod_comp->llc_ostlist.op_array &&
                           lod_comp->llc_ostlist.op_count) {
                        /* no explicit OST list: invalidate a leftover one */
                        for (j = 0; j < lod_comp->llc_ostlist.op_count; j++)
                                lod_comp->llc_ostlist.op_array[j] = -1;
                        lod_comp->llc_ostlist.op_count = 0;
                }
        }

        lds->lds_def_striping_set = 1;
        /*
         * NOTE(review): rc here is whatever the last call above returned
         * (lod_def_striping_comp_resize() or lod_comp_copy_ost_lists());
         * presumably 0 - confirm the resize helper never returns > 0.
         */
        RETURN(rc);
}
4614
4615 /**
4616  * Get default directory striping.
4617  *
4618  * \param[in] env               execution environment
4619  * \param[in] lo                object
4620  * \param[out] lds              default striping
4621  *
4622  * \retval              0 on success
4623  * \retval              negative if failed
4624  */
4625 static int lod_get_default_lmv_striping(const struct lu_env *env,
4626                                         struct lod_object *lo,
4627                                         struct lod_default_striping *lds)
4628 {
4629         struct lod_thread_info  *info = lod_env_info(env);
4630         struct lmv_user_md_v1   *v1 = NULL;
4631         int                      rc;
4632         ENTRY;
4633
4634         lds->lds_dir_def_striping_set = 0;
4635         rc = lod_get_default_lmv_ea(env, lo);
4636         if (rc < 0)
4637                 RETURN(rc);
4638
4639         if (rc < (typeof(rc))sizeof(struct lmv_user_md))
4640                 RETURN(0);
4641
4642         v1 = info->lti_ea_store;
4643
4644         lds->lds_dir_def_stripe_count = le32_to_cpu(v1->lum_stripe_count);
4645         lds->lds_dir_def_stripe_offset = le32_to_cpu(v1->lum_stripe_offset);
4646         lds->lds_dir_def_hash_type = le32_to_cpu(v1->lum_hash_type);
4647         lds->lds_dir_def_striping_set = 1;
4648
4649         RETURN(0);
4650 }
4651
/**
 * Get default striping in the object.
 *
 * Get object default striping and default directory striping. Both lookups
 * are always performed; the result of the file (LOV) lookup is returned
 * unless it is 0, in which case a failure of the directory (LMV) lookup is
 * reported instead.
 *
 * \param[in] env		execution environment
 * \param[in] lo		object
 * \param[out] lds		default striping
 *
 * \retval		0 on success
 * \retval		negative if failed
 */
static int lod_get_default_striping(const struct lu_env *env,
				    struct lod_object *lo,
				    struct lod_default_striping *lds)
{
	int lov_rc = lod_get_default_lov_striping(env, lo, lds);
	int lmv_rc = lod_get_default_lmv_striping(env, lo, lds);

	if (lov_rc != 0)
		return lov_rc;

	return lmv_rc < 0 ? lmv_rc : 0;
}
4677
4678 /**
4679  * Apply default striping on object.
4680  *
4681  * If object striping pattern is not set, set to the one in default striping.
4682  * The default striping is from parent or fs.
4683  *
4684  * \param[in] lo                new object
4685  * \param[in] lds               default striping
4686  * \param[in] mode              new object's mode
4687  */
4688 static void lod_striping_from_default(struct lod_object *lo,
4689                                       const struct lod_default_striping *lds,
4690                                       umode_t mode)
4691 {
4692         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
4693         struct lov_desc *desc = &d->lod_desc;
4694         int i, rc;
4695
4696         if (lds->lds_def_striping_set && S_ISREG(mode)) {
4697                 rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
4698                                             lds->lds_def_comp_cnt);
4699                 if (rc != 0)
4700                         return;
4701
4702                 lo->ldo_is_composite = lds->lds_def_striping_is_composite;
4703                 if (lds->lds_def_mirror_cnt > 1)
4704                         lo->ldo_flr_state = LCM_FL_RDONLY;
4705
4706                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
4707                         struct lod_layout_component *obj_comp =
4708                                                 &lo->ldo_comp_entries[i];
4709                         struct lod_layout_component *def_comp =
4710                                                 &lds->lds_def_comp_entries[i];
4711
4712                         CDEBUG(D_LAYOUT, "Inherit from default: flags=%#x "
4713                                "size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
4714                                def_comp->llc_flags,
4715                                def_comp->llc_stripe_size,
4716                                def_comp->llc_stripe_count,
4717                                def_comp->llc_stripe_offset,
4718                                def_comp->llc_pattern,
4719                                def_comp->llc_pool ?: "");
4720
4721                         *obj_comp = *def_comp;
4722                         if (def_comp->llc_pool != NULL) {
4723                                 /* pointer was copied from def_comp */
4724                                 obj_comp->llc_pool = NULL;
4725                                 lod_obj_set_pool(lo, i, def_comp->llc_pool);
4726                         }
4727
4728                         /* copy ost list */
4729                         if (def_comp->llc_ostlist.op_array &&
4730                             def_comp->llc_ostlist.op_count) {
4731                                 OBD_ALLOC(obj_comp->llc_ostlist.op_array,
4732                                           obj_comp->llc_ostlist.op_size);
4733                                 if (!obj_comp->llc_ostlist.op_array)
4734                                         return;
4735                                 memcpy(obj_comp->llc_ostlist.op_array,
4736                                        def_comp->llc_ostlist.op_array,
4737                                        obj_comp->llc_ostlist.op_size);
4738                         } else if (def_comp->llc_ostlist.op_array) {
4739                                 obj_comp->llc_ostlist.op_array = NULL;
4740                         }
4741
4742                         /*
4743                          * Don't initialize these fields for plain layout
4744                          * (v1/v3) here, they are inherited in the order of
4745                          * 'parent' -> 'fs default (root)' -> 'global default
4746                          * values for stripe_count & stripe_size'.
4747                          *
4748                          * see lod_ah_init().
4749                          */
4750                         if (!lo->ldo_is_composite)
4751                                 continue;
4752
4753                         lod_adjust_stripe_info(obj_comp, desc);
4754                 }
4755         } else if (lds->lds_dir_def_striping_set && S_ISDIR(mode)) {
4756                 if (lo->ldo_dir_stripe_count == 0)
4757                         lo->ldo_dir_stripe_count =
4758                                 lds->lds_dir_def_stripe_count;
4759                 if (lo->ldo_dir_stripe_offset == -1)
4760                         lo->ldo_dir_stripe_offset =
4761                                 lds->lds_dir_def_stripe_offset;
4762                 if (lo->ldo_dir_hash_type == 0)
4763                         lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
4764
4765                 CDEBUG(D_LAYOUT, "striping from default dir: count:%hu, "
4766                        "offset:%u, hash_type:%u\n",
4767                        lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
4768                        lo->ldo_dir_hash_type);
4769         }
4770 }
4771
4772 static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root)
4773 {
4774         struct lod_layout_component *lod_comp;
4775
4776         if (lo->ldo_comp_cnt == 0)
4777                 return true;
4778
4779         if (lo->ldo_is_composite)
4780                 return false;
4781
4782         lod_comp = &lo->ldo_comp_entries[0];
4783
4784         if (lod_comp->llc_stripe_count <= 0 ||
4785             lod_comp->llc_stripe_size <= 0)
4786                 return true;
4787
4788         if (from_root && (lod_comp->llc_pool == NULL ||
4789                           lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT))
4790                 return true;
4791
4792         return false;
4793 }
4794
4795 /**
4796  * Implementation of dt_object_operations::do_ah_init.
4797  *
4798  * This method is used to make a decision on the striping configuration for the
4799  * object being created. It can be taken from the \a parent object if it exists,
4800  * or filesystem's default. The resulting configuration (number of stripes,
4801  * stripe size/offset, pool name, etc) is stored in the object itself and will
4802  * be used by the methods like ->doo_declare_create().
4803  *
      * For a directory the explicitly supplied LMV EA in \a ah (if any) takes
      * precedence over the default directory striping of the parent; a foreign
      * LMV only marks the child as foreign. For other objects that will be
      * striped, the layout is inherited in the order parent -> root object ->
      * values adjusted from the device's lod_desc.
      *
      * \param[in] env		execution environment
      * \param[in] ah		allocation hint; dah_eadata/dah_eadata_len are
      *				read for directories
      * \param[in] parent	parent object, may be NULL
      * \param[in] child	object being created, must not be NULL
      * \param[in] child_mode	mode of the object being created
      *
4804  * \see dt_object_operations::do_ah_init() in the API description for details.
4805  */
4806 static void lod_ah_init(const struct lu_env *env,
4807                         struct dt_allocation_hint *ah,
4808                         struct dt_object *parent,
4809                         struct dt_object *child,
4810                         umode_t child_mode)
4811 {
4812         struct lod_device *d = lu2lod_dev(child->do_lu.lo_dev);
4813         struct lod_thread_info *info = lod_env_info(env);
4814         struct lod_default_striping *lds = lod_lds_buf_get(env);
4815         struct dt_object *nextp = NULL;
4816         struct dt_object *nextc;
4817         struct lod_object *lp = NULL;
4818         struct lod_object *lc;
4819         struct lov_desc *desc;
4820         struct lod_layout_component *lod_comp;
4821         int rc;
4822         ENTRY;
4823
4824         LASSERT(child);
4825
4826         if (likely(parent)) {
4827                 nextp = dt_object_child(parent);
4828                 lp = lod_dt_obj(parent);
4829         }
4830
4831         nextc = dt_object_child(child);
4832         lc = lod_dt_obj(child);
4833
4834         LASSERT(!lod_obj_is_striped(child));
4835         /* default layout template may have been set on the regular file
4836          * when this is called from mdd_create_data() */
4837         if (S_ISREG(child_mode))
4838                 lod_free_comp_entries(lc);
4839
             /* forward the hint initialization to the underlying object when
              * that object does not exist yet */
4840         if (!dt_object_exists(nextc))
4841                 nextc->do_ops->do_ah_init(env, ah, nextp, nextc, child_mode);
4842
4843         if (S_ISDIR(child_mode)) {
4844                 const struct lmv_user_md_v1 *lum1 = ah->dah_eadata;
4845
4846                 /* other default values are 0 */
4847                 lc->ldo_dir_stripe_offset = -1;
4848
4849                 /* no default striping configuration is needed for
4850                  * foreign dirs
4851                  */
4852                 if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
4853                     le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_FOREIGN) {
4854                         lc->ldo_dir_is_foreign = true;
4855                         /* keep stripe_count 0 and stripe_offset -1 */
4856                         CDEBUG(D_INFO, "no default striping for foreign dir\n");
4857                         RETURN_EXIT;
4858                 }
4859
4860                 /*
4861                  * If parent object is not root directory,
4862                  * then get default striping from parent object.
4863                  */
4864                 if (likely(lp != NULL) && !fid_is_root(lod_object_fid(lp)))
4865                         lod_get_default_striping(env, lp, lds);
4866
4867                 /* set child default striping info, default value is NULL */
4868                 if (lds->lds_def_striping_set || lds->lds_dir_def_striping_set)
4869                         lc->ldo_def_striping = lds;
4870
4871                 /* It should always honour the specified stripes */
4872                 /* Note: old client (< 2.7) might also do lfs mkdir, whose EA
4873                  * will have old magic. In this case, we should ignore the
4874                  * stripe count and try to create dir by default stripe.
4875                  */
4876                 if (ah->dah_eadata != NULL && ah->dah_eadata_len != 0 &&
4877                     (le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC ||
4878                      le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC)) {
4879                         lc->ldo_dir_stripe_count =
4880                                 le32_to_cpu(lum1->lum_stripe_count);
4881                         lc->ldo_dir_stripe_offset =
4882                                 le32_to_cpu(lum1->lum_stripe_offset);
4883                         lc->ldo_dir_hash_type =
4884                                 le32_to_cpu(lum1->lum_hash_type);
4885                         CDEBUG(D_INFO,
4886                                "set dirstripe: count %hu, offset %d, hash %u\n",
4887                                 lc->ldo_dir_stripe_count,
4888                                 (int)lc->ldo_dir_stripe_offset,
4889                                 lc->ldo_dir_hash_type);
4890                 } else {
4891                         /* transfer defaults LMV to new directory */
4892                         lod_striping_from_default(lc, lds, child_mode);
4893
4894                         /* set count 0 to create normal directory */
4895                         if (lc->ldo_dir_stripe_count == 1)
4896                                 lc->ldo_dir_stripe_count = 0;
4897                 }
4898
4899                 /* shrink the stripe_count to the available MDT count
                      * (remote MDTs plus one), unless the OBD_FAIL_LARGE_STRIPE
                      * fail check is triggered */
4900                 if (lc->ldo_dir_stripe_count > d->lod_remote_mdt_count + 1 &&
4901                     !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)) {
4902                         lc->ldo_dir_stripe_count = d->lod_remote_mdt_count + 1;
4903                         if (lc->ldo_dir_stripe_count == 1)
4904                                 lc->ldo_dir_stripe_count = 0;
4905                 }
4906
4907                 CDEBUG(D_INFO, "final dir stripe [%hu %d %u]\n",
4908                        lc->ldo_dir_stripe_count,
4909                        (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type);
4910
4911                 RETURN_EXIT;
4912         }
4913
4914         /* child object is not a directory (e.g. a regular file) */
4915
4916         if (!lod_object_will_be_striped(S_ISREG(child_mode),
4917                                         lu_object_fid(&child->do_lu)))
4918                 RETURN_EXIT;
4919
4920         /* If object is going to be striped over OSTs, transfer default
4921          * striping information to the child, so that we can use it
4922          * during declaration and creation.
4923          *
4924          * Try from the parent first.
4925          */
4926         if (likely(lp != NULL)) {
4927                 rc = lod_get_default_lov_striping(env, lp, lds);
4928                 if (rc == 0)
4929                         lod_striping_from_default(lc, lds, child_mode);
4930         }
4931
4932         /* Initialize lod_device::lod_md_root object reference */
4933         if (d->lod_md_root == NULL) {
4934                 struct dt_object *root;
4935                 struct lod_object *lroot;
4936
4937                 lu_root_fid(&info->lti_fid);
4938                 root = dt_locate(env, &d->lod_dt_dev, &info->lti_fid);
4939                 if (!IS_ERR(root)) {
4940                         lroot = lod_dt_obj(root);
4941
                             /* lod_md_root may have been set since the
                              * unlocked check above; in that case the
                              * previously stored reference is put before
                              * being replaced */
4942                         spin_lock(&d->lod_lock);
4943                         if (d->lod_md_root != NULL)
4944                                 dt_object_put(env, &d->lod_md_root->ldo_obj);
4945                         d->lod_md_root = lroot;
4946                         spin_unlock(&d->lod_lock);
4947                 }
4948         }
4949
4950         /* try inherit layout from the root object (fs default) when:
4951          *  - parent does not have default layout; or
4952          *  - parent has plain(v1/v3) default layout, and some attributes
4953          *    are not specified in the default layout;
4954          */
4955         if (d->lod_md_root != NULL && lod_need_inherit_more(lc, true)) {
4956                 rc = lod_get_default_lov_striping(env, d->lod_md_root, lds);
4957                 if (rc)
4958                         goto out;
4959                 if (lc->ldo_comp_cnt == 0) {
4960                         lod_striping_from_default(lc, lds, child_mode);
4961                 } else if (!lds->lds_def_striping_is_composite) {
4962                         struct lod_layout_component *def_comp;
4963
4964                         LASSERT(!lc->ldo_is_composite);
4965                         lod_comp = &lc->ldo_comp_entries[0];
4966                         def_comp = &lds->lds_def_comp_entries[0];
4967
                             /* only attributes left unspecified by the
                              * parent are taken from the root default */
4968                         if (lod_comp->llc_stripe_count <= 0)
4969                                 lod_comp->llc_stripe_count =
4970                                         def_comp->llc_stripe_count;
4971                         if (lod_comp->llc_stripe_size <= 0)
4972                                 lod_comp->llc_stripe_size =
4973                                         def_comp->llc_stripe_size;
4974                         if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT &&
4975                             (!lod_comp->llc_pool || !lod_comp->llc_pool[0]))
4976                                 lod_comp->llc_stripe_offset =
4977                                         def_comp->llc_stripe_offset;
4978                         if (lod_comp->llc_pool == NULL)
4979                                 lod_obj_set_pool(lc, 0, def_comp->llc_pool);
4980                 }
4981         }
4982 out:
4983         /*
4984          * fs default striping may not be explicitly set, or historically set
4985          * in config log, use them.
4986          */
4987         if (lod_need_inherit_more(lc, false)) {
4988                 if (lc->ldo_comp_cnt == 0) {
4989                         rc = lod_alloc_comp_entries(lc, 0, 1);
4990                         if (rc)
4991                                 /* fail to allocate memory, will create a
4992                                  * non-striped file. */
4993                                 RETURN_EXIT;
4994                         lc->ldo_is_composite = 0;
4995                         lod_comp = &lc->ldo_comp_entries[0];
4996                         lod_comp->llc_stripe_offset = LOV_OFFSET_DEFAULT;
4997                 }
4998                 LASSERT(!lc->ldo_is_composite);
4999                 lod_comp = &lc->ldo_comp_entries[0];
5000                 desc = &d->lod_desc;
5001                 lod_adjust_stripe_info(lod_comp, desc);
5002         }
5003
5004         EXIT;
5005 }
5006
5007 #define ll_do_div64(aaa,bbb)    do_div((aaa), (bbb))
5008 /**
5009  * Size initialization on late striping.
5010  *
5011  * Propagate the size of a truncated object to a deferred striping.
5012  * This function handles a special case when truncate was done on a
5013  * non-striped object and now while the striping is being created
5014  * we can't lose that size, so we have to propagate it to the stripes
5015  * being created.
5016  *
5017  * \param[in] env       execution environment
5018  * \param[in] dt        object
5019  * \param[in] th        transaction handle
5020  *
5021  * \retval              0 on success
5022  * \retval              negative if failed
5023  */
5024 static int lod_declare_init_size(const struct lu_env *env,
5025                                  struct dt_object *dt, struct thandle *th)
5026 {
5027         struct dt_object        *next = dt_object_child(dt);
5028         struct lod_object       *lo = lod_dt_obj(dt);
5029         struct dt_object        **objects = NULL;
5030         struct lu_attr  *attr = &lod_env_info(env)->lti_attr;
5031         uint64_t        size, offs;
5032         int     i, rc, stripe, stripe_count = 0, stripe_size = 0;
5033         struct lu_extent size_ext;
5034         ENTRY;
5035
5036         if (!lod_obj_is_striped(dt))
5037                 RETURN(0);
5038
5039         rc = dt_attr_get(env, next, attr);
5040         LASSERT(attr->la_valid & LA_SIZE);
5041         if (rc)
5042                 RETURN(rc);
5043
5044         size = attr->la_size;
5045         if (size == 0)
5046                 RETURN(0);
5047
5048         size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
5049         for (i = 0; i < lo->ldo_comp_cnt; i++) {
5050                 struct lod_layout_component *lod_comp;
5051                 struct lu_extent *extent;
5052
5053                 lod_comp = &lo->ldo_comp_entries[i];
5054
5055                 if (lod_comp->llc_stripe == NULL)
5056                         continue;
5057
5058                 extent = &lod_comp->llc_extent;
5059                 CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
5060                 if (!lo->ldo_is_composite ||
5061                     lu_extent_is_overlapped(extent, &size_ext)) {
5062                         objects = lod_comp->llc_stripe;
5063                         stripe_count = lod_comp->llc_stripe_count;
5064                         stripe_size = lod_comp->llc_stripe_size;
5065
5066                         /* next mirror */
5067                         if (stripe_count == 0)
5068                                 continue;
5069
5070                         LASSERT(objects != NULL && stripe_size != 0);
5071                         /* ll_do_div64(a, b) returns a % b, and a = a / b */
5072                         ll_do_div64(size, (__u64)stripe_size);
5073                         stripe = ll_do_div64(size, (__u64)stripe_count);
5074                         LASSERT(objects[stripe] != NULL);
5075
5076                         size = size * stripe_size;
5077                         offs = attr->la_size;
5078                         size += ll_do_div64(offs, stripe_size);
5079
5080                         attr->la_valid = LA_SIZE;
5081                         attr->la_size = size;
5082
5083                         rc = lod_sub_declare_attr_set(env, objects[stripe],
5084                                                       attr, th);
5085                 }
5086         }
5087
5088         RETURN(rc);
5089 }
5090
5091 /**
5092  * Declare creation of striped object.
5093  *
5094  * The function declares creation stripes for a regular object. The function
5095  * also declares whether the stripes will be created with non-zero size if
5096  * previously size was set non-zero on the master object. If object \a dt is
5097  * not local, then only fully defined striping can be applied in \a lovea.
5098  * Otherwise \a lovea can be in the form of pattern, see lod_qos_parse_config()
5099  * for the details.
5100  *
5101  * \param[in] env       execution environment
5102  * \param[in] dt        object
5103  * \param[in] attr      attributes the stripes will be created with
5104  * \param[in] lovea     a buffer containing striping description
5105  * \param[in] th        transaction handle
5106  *
5107  * \retval              0 on success
5108  * \retval              negative if failed
5109  */
5110 int lod_declare_striped_create(const struct lu_env *env, struct dt_object *dt,
5111                                struct lu_attr *attr,
5112                                const struct lu_buf *lovea, struct thandle *th)
5113 {
5114         struct lod_thread_info  *info = lod_env_info(env);
5115         struct dt_object        *next = dt_object_child(dt);
5116         struct lod_object       *lo = lod_dt_obj(dt);
5117         int                      rc;
5118         ENTRY;
5119
5120         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_ALLOC_OBDO))
5121                 GOTO(out, rc = -ENOMEM);
5122
5123         if (!dt_object_remote(next)) {
5124                 /* choose OST and generate appropriate objects */
5125                 rc = lod_prepare_create(env, lo, attr, lovea, th);
5126                 if (rc)
5127                         GOTO(out, rc);
5128
5129                 /*
5130                  * declare storage for striping data
5131                  */
5132                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
5133         } else {
5134                 /* LOD can not choose OST objects for remote objects, i.e.
5135                  * stripes must be ready before that. Right now, it can only
5136                  * happen during migrate, i.e. migrate process needs to create
5137                  * remote regular file (mdd_migrate_create), then the migrate
5138                  * process will provide stripeEA. */
5139                 LASSERT(lovea != NULL);
5140                 info->lti_buf = *lovea;
5141         }
5142
5143         rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
5144                                        XATTR_NAME_LOV, 0, th);
5145         if (rc)
5146                 GOTO(out, rc);
5147
5148         /*
5149          * if striping is created with local object's size > 0,
5150          * we have to propagate this size to specific object
5151          * the case is possible only when local object was created previously
5152          */
5153         if (dt_object_exists(next))
5154                 rc = lod_declare_init_size(env, dt, th);
5155
5156 out:
5157         /* failed to create striping or to set initial size, let's reset
5158          * config so that others don't get confused */
5159         if (rc)
5160                 lod_striping_free(env, lo);
5161
5162         RETURN(rc);
5163 }
5164
5165 /**
5166  * Implementation of dt_object_operations::do_declare_create.
5167  *
5168  * The method declares creation of a new object. If the object will be striped,
5169  * then helper functions are called to find FIDs for the stripes, declare
5170  * creation of the stripes and declare initialization of the striping
5171  * information to be stored in the master object.
5172  *
5173  * \see dt_object_operations::do_declare_create() in the API description
5174  * for details.
5175  */
5176 static int lod_declare_create(const struct lu_env *env, struct dt_object *dt,
5177                               struct lu_attr *attr,
5178                               struct dt_allocation_hint *hint,
5179                               struct dt_object_format *dof, struct thandle *th)
5180 {
5181         struct dt_object   *next = dt_object_child(dt);
5182         struct lod_object  *lo = lod_dt_obj(dt);
5183         int                 rc;
5184         ENTRY;
5185
5186         LASSERT(dof);
5187         LASSERT(attr);
5188         LASSERT(th);
5189
5190         /*
5191          * first of all, we declare creation of local object
5192          */
5193         rc = lod_sub_declare_create(env, next, attr, hint, dof, th);
5194         if (rc != 0)
5195                 GOTO(out, rc);
5196
5197         /*
5198          * it's lod_ah_init() that has decided the object will be striped
5199          */
5200         if (dof->dof_type == DFT_REGULAR) {
5201                 /* callers don't want stripes */
5202                 /* XXX: all tricky interactions with ->ah_make_hint() decided
5203                  * to use striping, then ->declare_create() behaving differently
5204                  * should be cleaned */
5205                 if (dof->u.dof_reg.striped != 0)
5206                         rc = lod_declare_striped_create(env, dt, attr,
5207                                                         NULL, th);
5208         } else if (dof->dof_type == DFT_DIR) {
5209                 struct seq_server_site *ss;
5210                 struct lu_buf buf = { NULL };
5211                 struct lu_buf *lmu = NULL;
5212
5213                 ss = lu_site2seq(dt->do_lu.lo_dev->ld_site);
5214
5215                 /* If the parent has default stripeEA, and client
5216                  * did not find it before sending create request,
5217                  * then MDT will return -EREMOTE, and client will
5218                  * retrieve the default stripeEA and re-create the
5219                  * sub directory.
5220                  *
5221                  * Note: if dah_eadata != NULL, it means creating the
5222                  * striped directory with specified stripeEA, then it
5223                  * should ignore the default stripeEA */
5224                 if (hint != NULL && hint->dah_eadata == NULL) {
5225                         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT))
5226                                 GOTO(out, rc = -EREMOTE);
5227
5228                         if (lo->ldo_dir_stripe_offset == -1) {
5229                                 /* child and parent should be in the same MDT */
5230                                 if (hint->dah_parent != NULL &&
5231                                     dt_object_remote(hint->dah_parent))
5232                                         GOTO(out, rc = -EREMOTE);
5233                         } else if (lo->ldo_dir_stripe_offset !=
5234                                    ss->ss_node_id) {
5235                                 struct lod_device *lod;
5236                                 struct lod_tgt_descs *ltd;
5237                                 struct lod_tgt_desc *tgt = NULL;
5238                                 bool found_mdt = false;
5239                                 int i;
5240
5241                                 lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5242                                 ltd = &lod->lod_mdt_descs;
5243                                 cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) {
5244                                         tgt = LTD_TGT(ltd, i);
5245                                         if (tgt->ltd_index ==
5246                                                 lo->ldo_dir_stripe_offset) {
5247                                                 found_mdt = true;
5248                                                 break;
5249                                         }
5250                                 }
5251
5252                                 /* If the MDT indicated by stripe_offset can be
5253                                  * found, then tell client to resend the create
5254                                  * request to the correct MDT, otherwise return
5255                                  * error to client */
5256                                 if (found_mdt)
5257                                         GOTO(out, rc = -EREMOTE);
5258                                 else
5259                                         GOTO(out, rc = -EINVAL);
5260                         }
5261                 } else if (hint && hint->dah_eadata) {
5262                         lmu = &buf;
5263                         lmu->lb_buf = (void *)hint->dah_eadata;
5264                         lmu->lb_len = hint->dah_eadata_len;
5265                 }
5266
5267                 rc = lod_declare_dir_striping_create(env, dt, attr, lmu, dof,
5268                                                      th);
5269         }
5270 out:
5271         /* failed to create striping or to set initial size, let's reset
5272          * config so that others don't get confused */
5273         if (rc)
5274                 lod_striping_free(env, lo);
5275         RETURN(rc);
5276 }
5277
5278 /**
5279  * Generate component ID for new created component.
5280  *
5281  * \param[in] lo                LOD object
5282  * \param[in] comp_idx          index of ldo_comp_entries
5283  *
5284  * \retval                      component ID on success
5285  * \retval                      LCME_ID_INVAL on failure
5286  */
5287 static __u32 lod_gen_component_id(struct lod_object *lo,
5288                                   int mirror_id, int comp_idx)
5289 {
5290         struct lod_layout_component *lod_comp;
5291         __u32   id, start, end;
5292         int     i;
5293
5294         LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
5295
5296         lod_obj_inc_layout_gen(lo);
5297         id = lo->ldo_layout_gen;
5298         if (likely(id <= SEQ_ID_MAX))
5299                 RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
5300
5301         /* Layout generation wraps, need to check collisions. */
5302         start = id & SEQ_ID_MASK;
5303         end = SEQ_ID_MAX;
5304 again:
5305         for (id = start; id <= end; id++) {
5306                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
5307                         lod_comp = &lo->ldo_comp_entries[i];
5308                         if (pflr_id(mirror_id, id) == lod_comp->llc_id)
5309                                 break;
5310                 }
5311                 /* Found the ununsed ID */
5312                 if (i == lo->ldo_comp_cnt)
5313                         RETURN(pflr_id(mirror_id, id));
5314         }
5315         if (end == LCME_ID_MAX) {
5316                 start = 1;
5317                 end = min(lo->ldo_layout_gen & LCME_ID_MASK,
5318                           (__u32)(LCME_ID_MAX - 1));
5319                 goto again;
5320         }
5321
5322         RETURN(LCME_ID_INVAL);
5323 }
5324
5325 /**
5326  * Creation of a striped regular object.
5327  *
5328  * The function is called to create the stripe objects for a regular
5329  * striped file. This can happen at the initial object creation or
5330  * when the caller asks LOD to do so using ->do_xattr_set() method
5331  * (so called late striping). Notice all the information are already
5332  * prepared in the form of the list of objects (ldo_stripe field).
5333  * This is done during declare phase.
5334  *
5335  * \param[in] env       execution environment
5336  * \param[in] dt        object
5337  * \param[in] attr      attributes the stripes will be created with
5338  * \param[in] dof       format of stripes (see OSD API description)
5339  * \param[in] th        transaction handle
5340  *
5341  * \retval              0 on success
5342  * \retval              negative if failed
5343  */
5344 int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
5345                        struct lu_attr *attr, struct dt_object_format *dof,
5346                        struct thandle *th)
5347 {
5348         struct lod_layout_component     *lod_comp;
5349         struct lod_object       *lo = lod_dt_obj(dt);
5350         __u16   mirror_id;
5351         int     rc = 0, i, j;
5352         ENTRY;
5353
5354         LASSERT((lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL) ||
5355                 lo->ldo_is_foreign);
5356
5357         mirror_id = 0; /* non-flr file's mirror_id is 0 */
5358         if (lo->ldo_mirror_count > 1) {
5359                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
5360                         lod_comp = &lo->ldo_comp_entries[i];
5361                         if (lod_comp->llc_id != LCME_ID_INVAL &&
5362                             mirror_id_of(lod_comp->llc_id) > mirror_id)
5363                                 mirror_id = mirror_id_of(lod_comp->llc_id);
5364                 }
5365         }
5366
5367         /* create all underlying objects */
5368         for (i = 0; i < lo->ldo_comp_cnt; i++) {
5369                 lod_comp = &lo->ldo_comp_entries[i];
5370
5371                 if (lod_comp->llc_id == LCME_ID_INVAL) {
5372                         /* only the component of FLR layout with more than 1
5373                          * mirror has mirror ID in its component ID.
5374                          */
5375                         if (lod_comp->llc_extent.e_start == 0 &&
5376                             lo->ldo_mirror_count > 1)
5377                                 ++mirror_id;
5378
5379                         lod_comp->llc_id = lod_gen_component_id(lo,
5380                                                                 mirror_id, i);
5381                         if (lod_comp->llc_id == LCME_ID_INVAL)
5382                                 GOTO(out, rc = -ERANGE);
5383                 }
5384
5385                 if (lod_comp_inited(lod_comp))
5386                         continue;
5387
5388                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
5389                         lod_comp_set_init(lod_comp);
5390
5391                 if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
5392                         lod_comp_set_init(lod_comp);
5393
5394                 if (lod_comp->llc_stripe == NULL)
5395                         continue;
5396
5397                 LASSERT(lod_comp->llc_stripe_count);
5398                 for (j = 0; j < lod_comp->llc_stripe_count; j++) {
5399                         struct dt_object *object = lod_comp->llc_stripe[j];
5400                         LASSERT(object != NULL);
5401                         rc = lod_sub_create(env, object, attr, NULL, dof, th);
5402                         if (rc)
5403                                 GOTO(out, rc);
5404                 }
5405                 lod_comp_set_init(lod_comp);
5406         }
5407
5408         rc = lod_fill_mirrors(lo);
5409         if (rc)
5410                 GOTO(out, rc);
5411
5412         rc = lod_generate_and_set_lovea(env, lo, th);
5413         if (rc)
5414                 GOTO(out, rc);
5415
5416         lo->ldo_comp_cached = 1;
5417         RETURN(0);
5418
5419 out:
5420         lod_striping_free(env, lo);
5421         RETURN(rc);
5422 }
5423
5424 static inline bool lod_obj_is_dom(struct dt_object *dt)
5425 {
5426         struct lod_object *lo = lod_dt_obj(dt);
5427
5428         if (!dt_object_exists(dt_object_child(dt)))
5429                 return false;
5430
5431         if (S_ISDIR(dt->do_lu.lo_header->loh_attr))
5432                 return false;
5433
5434         if (!lo->ldo_comp_cnt)
5435                 return false;
5436
5437         return (lov_pattern(lo->ldo_comp_entries[0].llc_pattern) ==
5438                 LOV_PATTERN_MDT);
5439 }
5440
5441 /**
5442  * Implementation of dt_object_operations::do_create.
5443  *
5444  * If any of preceeding methods (like ->do_declare_create(),
5445  * ->do_ah_init(), etc) chose to create a striped object,
5446  * then this method will create the master and the stripes.
5447  *
5448  * \see dt_object_operations::do_create() in the API description for details.
5449  */
5450 static int lod_create(const struct lu_env *env, struct dt_object *dt,
5451                       struct lu_attr *attr, struct dt_allocation_hint *hint,
5452                       struct dt_object_format *dof, struct thandle *th)
5453 {
5454         int                 rc;
5455         ENTRY;
5456
5457         /* create local object */
5458         rc = lod_sub_create(env, dt_object_child(dt), attr, hint, dof, th);
5459         if (rc != 0)
5460                 RETURN(rc);
5461
5462         if (S_ISREG(dt->do_lu.lo_header->loh_attr) &&
5463             (lod_obj_is_striped(dt) || lod_obj_is_dom(dt)) &&
5464             dof->u.dof_reg.striped != 0) {
5465                 LASSERT(lod_dt_obj(dt)->ldo_comp_cached == 0);
5466                 rc = lod_striped_create(env, dt, attr, dof, th);
5467         }
5468
5469         RETURN(rc);
5470 }
5471
5472 static inline int
5473 lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
5474                           struct dt_object *dt, struct thandle *th,
5475                           int comp_idx, int stripe_idx,
5476                           struct lod_obj_stripe_cb_data *data)
5477 {
5478         if (data->locd_declare)
5479                 return lod_sub_declare_destroy(env, dt, th);
5480         else if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
5481                  stripe_idx == cfs_fail_val)
5482                 return lod_sub_destroy(env, dt, th);
5483         else
5484                 return 0;
5485 }
5486
5487 /**
5488  * Implementation of dt_object_operations::do_declare_destroy.
5489  *
5490  * If the object is a striped directory, then the function declares reference
5491  * removal from the master object (this is an index) to the stripes and declares
5492  * destroy of all the stripes. In all the cases, it declares an intention to
5493  * destroy the object itself.
5494  *
5495  * \see dt_object_operations::do_declare_destroy() in the API description
5496  * for details.
5497  */
5498 static int lod_declare_destroy(const struct lu_env *env, struct dt_object *dt,
5499                                struct thandle *th)
5500 {
5501         struct dt_object   *next = dt_object_child(dt);
5502         struct lod_object  *lo = lod_dt_obj(dt);
5503         struct lod_thread_info *info = lod_env_info(env);
5504         char               *stripe_name = info->lti_key;
5505         int                 rc, i;
5506         ENTRY;
5507
5508         /*
5509          * load striping information, notice we don't do this when object
5510          * is being initialized as we don't need this information till
5511          * few specific cases like destroy, chown
5512          */
5513         rc = lod_striping_load(env, lo);
5514         if (rc)
5515                 RETURN(rc);
5516
5517         /* declare destroy for all underlying objects */
5518         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
5519                 rc = next->do_ops->do_index_try(env, next,
5520                                                 &dt_directory_features);
5521                 if (rc != 0)
5522                         RETURN(rc);
5523
5524                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5525                         rc = lod_sub_declare_ref_del(env, next, th);
5526                         if (rc != 0)
5527                                 RETURN(rc);
5528
5529                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
5530                                 PFID(lu_object_fid(&lo->ldo_stripe[i]->do_lu)),
5531                                 i);
5532                         rc = lod_sub_declare_delete(env, next,
5533                                         (const struct dt_key *)stripe_name, th);
5534                         if (rc != 0)
5535                                 RETURN(rc);
5536                 }
5537         }
5538
5539         /*
5540          * we declare destroy for the local object
5541          */
5542         rc = lod_sub_declare_destroy(env, next, th);
5543         if (rc)
5544                 RETURN(rc);
5545
5546         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
5547             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
5548                 RETURN(0);
5549
5550         if (!lod_obj_is_striped(dt))
5551                 RETURN(0);
5552
5553         /* declare destroy all striped objects */
5554         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
5555                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5556                         if (lo->ldo_stripe[i] == NULL)
5557                                 continue;
5558
5559                         rc = lod_sub_declare_ref_del(env, lo->ldo_stripe[i],
5560                                                      th);
5561
5562                         rc = lod_sub_declare_destroy(env, lo->ldo_stripe[i],
5563                                                      th);
5564                         if (rc != 0)
5565                                 break;
5566                 }
5567         } else {
5568                 struct lod_obj_stripe_cb_data data = { { 0 } };
5569
5570                 data.locd_declare = true;
5571                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
5572                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
5573         }
5574
5575         RETURN(rc);
5576 }
5577
5578 /**
5579  * Implementation of dt_object_operations::do_destroy.
5580  *
5581  * If the object is a striped directory, then the function removes references
5582  * from the master object (this is an index) to the stripes and destroys all
5583  * the stripes. In all the cases, the function destroys the object itself.
5584  *
5585  * \see dt_object_operations::do_destroy() in the API description for details.
5586  */
5587 static int lod_destroy(const struct lu_env *env, struct dt_object *dt,
5588                        struct thandle *th)
5589 {
5590         struct dt_object  *next = dt_object_child(dt);
5591         struct lod_object *lo = lod_dt_obj(dt);
5592         struct lod_thread_info *info = lod_env_info(env);
5593         char               *stripe_name = info->lti_key;
5594         unsigned int       i;
5595         int                rc;
5596         ENTRY;
5597
5598         /* destroy sub-stripe of master object */
5599         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
5600                 rc = next->do_ops->do_index_try(env, next,
5601                                                 &dt_directory_features);
5602                 if (rc != 0)
5603                         RETURN(rc);
5604
5605                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5606                         rc = lod_sub_ref_del(env, next, th);
5607                         if (rc != 0)
5608                                 RETURN(rc);
5609
5610                         snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
5611                                 PFID(lu_object_fid(&lo->ldo_stripe[i]->do_lu)),
5612                                 i);
5613
5614                         CDEBUG(D_INFO, DFID" delete stripe %s "DFID"\n",
5615                                PFID(lu_object_fid(&dt->do_lu)), stripe_name,
5616                                PFID(lu_object_fid(&lo->ldo_stripe[i]->do_lu)));
5617
5618                         rc = lod_sub_delete(env, next,
5619                                        (const struct dt_key *)stripe_name, th);
5620                         if (rc != 0)
5621                                 RETURN(rc);
5622                 }
5623         }
5624
5625         rc = lod_sub_destroy(env, next, th);
5626         if (rc != 0)
5627                 RETURN(rc);
5628
5629         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
5630             OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
5631                 RETURN(0);
5632
5633         if (!lod_obj_is_striped(dt))
5634                 RETURN(0);
5635
5636         /* destroy all striped objects */
5637         if (S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
5638                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5639                         if (lo->ldo_stripe[i] == NULL)
5640                                 continue;
5641                         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_SPEOBJ) ||
5642                             i == cfs_fail_val) {
5643                                 dt_write_lock(env, lo->ldo_stripe[i],
5644                                               MOR_TGT_CHILD);
5645                                 rc = lod_sub_ref_del(env, lo->ldo_stripe[i],
5646                                                      th);
5647                                 dt_write_unlock(env, lo->ldo_stripe[i]);
5648                                 if (rc != 0)
5649                                         break;
5650
5651                                 rc = lod_sub_destroy(env, lo->ldo_stripe[i],
5652                                                      th);
5653                                 if (rc != 0)
5654                                         break;
5655                         }
5656                 }
5657         } else {
5658                 struct lod_obj_stripe_cb_data data = { { 0 } };
5659
5660                 data.locd_declare = false;
5661                 data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
5662                 rc = lod_obj_for_each_stripe(env, lo, th, &data);
5663         }
5664
5665         RETURN(rc);
5666 }
5667
/**
 * Implementation of dt_object_operations::do_declare_ref_add.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_add() in the API description
 * for details.
 */
static int lod_declare_ref_add(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_declare_ref_add(env, next, th);
}
5679
/**
 * Implementation of dt_object_operations::do_ref_add.
 *
 * Forwards the operation to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_add() in the API description for details.
 */
static int lod_ref_add(const struct lu_env *env,
                       struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_ref_add(env, next, th);
}
5690
/**
 * Implementation of dt_object_operations::do_declare_ref_del.
 *
 * Forwards the declaration to the child object of \a dt.
 *
 * \see dt_object_operations::do_declare_ref_del() in the API description
 * for details.
 */
static int lod_declare_ref_del(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_declare_ref_del(env, next, th);
}
5702
/**
 * Implementation of dt_object_operations::do_ref_del
 *
 * Forwards the operation to the child object of \a dt.
 *
 * \see dt_object_operations::do_ref_del() in the API description for details.
 */
static int lod_ref_del(const struct lu_env *env,
                       struct dt_object *dt, struct thandle *th)
{
        struct dt_object *next = dt_object_child(dt);

        return lod_sub_ref_del(env, next, th);
}
5713
5714 /**
5715  * Implementation of dt_object_operations::do_object_sync.
5716  *
5717  * \see dt_object_operations::do_object_sync() in the API description
5718  * for details.
5719  */
5720 static int lod_object_sync(const struct lu_env *env, struct dt_object *dt,
5721                            __u64 start, __u64 end)
5722 {
5723         return dt_object_sync(env, dt_object_child(dt), start, end);
5724 }
5725
5726 /**
5727  * Implementation of dt_object_operations::do_object_unlock.
5728  *
5729  * Used to release LDLM lock(s).
5730  *
5731  * \see dt_object_operations::do_object_unlock() in the API description
5732  * for details.
5733  */
5734 static int lod_object_unlock(const struct lu_env *env, struct dt_object *dt,
5735                              struct ldlm_enqueue_info *einfo,
5736                              union ldlm_policy_data *policy)
5737 {
5738         struct lod_object *lo = lod_dt_obj(dt);
5739         struct lustre_handle_array *slave_locks = einfo->ei_cbdata;
5740         int slave_locks_size;
5741         int i;
5742         ENTRY;
5743
5744         if (slave_locks == NULL)
5745                 RETURN(0);
5746
5747         LASSERT(S_ISDIR(dt->do_lu.lo_header->loh_attr));
5748         /* Note: for remote lock for single stripe dir, MDT will cancel
5749          * the lock by lockh directly */
5750         LASSERT(!dt_object_remote(dt_object_child(dt)));
5751
5752         /* locks were unlocked in MDT layer */
5753         for (i = 0; i < slave_locks->ha_count; i++)
5754                 LASSERT(!lustre_handle_is_used(&slave_locks->ha_handles[i]));
5755
5756         /*
5757          * NB, ha_count may not equal to ldo_dir_stripe_count, because dir
5758          * layout may change, e.g., shrink dir layout after migration.
5759          */
5760         for (i = 0; i < lo->ldo_dir_stripe_count; i++)
5761                 dt_invalidate(env, lo->ldo_stripe[i]);
5762
5763         slave_locks_size = offsetof(typeof(*slave_locks),
5764                                     ha_handles[slave_locks->ha_count]);
5765         OBD_FREE(slave_locks, slave_locks_size);
5766         einfo->ei_cbdata = NULL;
5767
5768         RETURN(0);
5769 }
5770
5771 /**
5772  * Implementation of dt_object_operations::do_object_lock.
5773  *
5774  * Used to get LDLM lock on the non-striped and striped objects.
5775  *
5776  * \see dt_object_operations::do_object_lock() in the API description
5777  * for details.
5778  */
5779 static int lod_object_lock(const struct lu_env *env,
5780                            struct dt_object *dt,
5781                            struct lustre_handle *lh,
5782                            struct ldlm_enqueue_info *einfo,
5783                            union ldlm_policy_data *policy)
5784 {
5785         struct lod_object *lo = lod_dt_obj(dt);
5786         int slave_locks_size;
5787         struct lustre_handle_array *slave_locks = NULL;
5788         int i;
5789         int rc;
5790         ENTRY;
5791
5792         /* remote object lock */
5793         if (!einfo->ei_enq_slave) {
5794                 LASSERT(dt_object_remote(dt));
5795                 return dt_object_lock(env, dt_object_child(dt), lh, einfo,
5796                                       policy);
5797         }
5798
5799         if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
5800                 RETURN(-ENOTDIR);
5801
5802         rc = lod_striping_load(env, lo);
5803         if (rc != 0)
5804                 RETURN(rc);
5805
5806         /* No stripes */
5807         if (lo->ldo_dir_stripe_count <= 1)
5808                 RETURN(0);
5809
5810         slave_locks_size = offsetof(typeof(*slave_locks),
5811                                     ha_handles[lo->ldo_dir_stripe_count]);
5812         /* Freed in lod_object_unlock */
5813         OBD_ALLOC(slave_locks, slave_locks_size);
5814         if (!slave_locks)
5815                 RETURN(-ENOMEM);
5816         slave_locks->ha_count = lo->ldo_dir_stripe_count;
5817
5818         /* striped directory lock */
5819         for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
5820                 struct lustre_handle lockh;
5821                 struct ldlm_res_id *res_id;
5822
5823                 res_id = &lod_env_info(env)->lti_res_id;
5824                 fid_build_reg_res_name(lu_object_fid(&lo->ldo_stripe[i]->do_lu),
5825                                        res_id);
5826                 einfo->ei_res_id = res_id;
5827
5828                 LASSERT(lo->ldo_stripe[i] != NULL);
5829                 if (dt_object_remote(lo->ldo_stripe[i])) {
5830                         set_bit(i, (void *)slave_locks->ha_map);
5831                         rc = dt_object_lock(env, lo->ldo_stripe[i], &lockh,
5832                                             einfo, policy);
5833                 } else {
5834                         struct ldlm_namespace *ns = einfo->ei_namespace;
5835                         ldlm_blocking_callback blocking = einfo->ei_cb_local_bl;
5836                         ldlm_completion_callback completion = einfo->ei_cb_cp;
5837                         __u64 dlmflags = LDLM_FL_ATOMIC_CB;
5838
5839                         if (einfo->ei_mode == LCK_PW ||
5840                             einfo->ei_mode == LCK_EX)
5841                                 dlmflags |= LDLM_FL_COS_INCOMPAT;
5842
5843                         LASSERT(ns != NULL);
5844                         rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_IBITS,
5845                                                     policy, einfo->ei_mode,
5846                                                     &dlmflags, blocking,
5847                                                     completion, NULL,
5848                                                     NULL, 0, LVB_T_NONE,
5849                                                     NULL, &lockh);
5850                 }
5851                 if (rc) {
5852                         while (i--)
5853                                 ldlm_lock_decref_and_cancel(
5854                                                 &slave_locks->ha_handles[i],
5855                                                 einfo->ei_mode);
5856                         OBD_FREE(slave_locks, slave_locks_size);
5857                         RETURN(rc);
5858                 }
5859                 slave_locks->ha_handles[i] = lockh;
5860         }
5861         einfo->ei_cbdata = slave_locks;
5862
5863         RETURN(0);
5864 }
5865
/**
 * Implementation of dt_object_operations::do_invalidate.
 *
 * Forwards the request to the child object of \a dt.
 *
 * \see dt_object_operations::do_invalidate() in the API description for details
 */
static int lod_invalidate(const struct lu_env *env, struct dt_object *dt)
{
        struct dt_object *next = dt_object_child(dt);

        return dt_invalidate(env, next);
}
5875
5876 static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
5877 {
5878         ENTRY;
5879
5880         /* clear memory region that will be used for layout change */
5881         memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
5882         info->lti_count = 0;
5883
5884         if (info->lti_comp_size >= comp_cnt)
5885                 RETURN(0);
5886
5887         if (info->lti_comp_size > 0) {
5888                 OBD_FREE(info->lti_comp_idx,
5889                          info->lti_comp_size * sizeof(__u32));
5890                 info->lti_comp_size = 0;
5891         }
5892
5893         OBD_ALLOC(info->lti_comp_idx, comp_cnt * sizeof(__u32));
5894         if (!info->lti_comp_idx)
5895                 RETURN(-ENOMEM);
5896
5897         info->lti_comp_size = comp_cnt;
5898         RETURN(0);
5899 }
5900
5901 static int lod_declare_instantiate_components(const struct lu_env *env,
5902                 struct lod_object *lo, struct thandle *th)
5903 {
5904         struct lod_thread_info *info = lod_env_info(env);
5905         int i;
5906         int rc = 0;
5907         ENTRY;
5908
5909         LASSERT(info->lti_count < lo->ldo_comp_cnt);
5910
5911         for (i = 0; i < info->lti_count; i++) {
5912                 rc = lod_qos_prep_create(env, lo, NULL, th,
5913                                          info->lti_comp_idx[i]);
5914                 if (rc)
5915                         break;
5916         }
5917
5918         if (!rc) {
5919                 info->lti_buf.lb_len = lod_comp_md_size(lo, false);
5920                 rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
5921                                 &info->lti_buf, XATTR_NAME_LOV, 0, th);
5922         }
5923
5924         RETURN(rc);
5925 }
5926
5927 static int lod_declare_update_plain(const struct lu_env *env,
5928                 struct lod_object *lo, struct layout_intent *layout,
5929                 const struct lu_buf *buf, struct thandle *th)
5930 {
5931         struct lod_thread_info *info = lod_env_info(env);
5932         struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
5933         struct lod_layout_component *lod_comp;
5934         struct lov_comp_md_v1 *comp_v1 = NULL;
5935         bool replay = false;
5936         int i, rc;
5937         ENTRY;
5938
5939         LASSERT(lo->ldo_flr_state == LCM_FL_NONE);
5940
5941         /*
5942          * In case the client is passing lovea, which only happens during
5943          * the replay of layout intent write RPC for now, we may need to
5944          * parse the lovea and apply new layout configuration.
5945          */
5946         if (buf && buf->lb_len)  {
5947                 struct lov_user_md_v1 *v1 = buf->lb_buf;
5948
5949                 if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
5950                     v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
5951                                               LOV_MAGIC_COMP_V1)) {
5952                         CERROR("%s: the replay buffer of layout extend "
5953                                "(magic %#x) does not contain expected "
5954                                "composite layout.\n",
5955                                lod2obd(d)->obd_name, v1->lmm_magic);
5956                         GOTO(out, rc = -EINVAL);
5957                 }
5958
5959                 rc = lod_use_defined_striping(env, lo, buf);
5960                 if (rc)
5961                         GOTO(out, rc);
5962                 lo->ldo_comp_cached = 1;
5963
5964                 rc = lod_get_lov_ea(env, lo);
5965                 if (rc <= 0)
5966                         GOTO(out, rc);
5967                 /* old on-disk EA is stored in info->lti_buf */
5968                 comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
5969                 replay = true;
5970         } else {
5971                 /* non replay path */
5972                 rc = lod_striping_load(env, lo);
5973                 if (rc)
5974                         GOTO(out, rc);
5975         }
5976
5977         /* Make sure defined layout covers the requested write range. */
5978         lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
5979         if (lo->ldo_comp_cnt > 1 &&
5980             lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
5981             lod_comp->llc_extent.e_end < layout->li_extent.e_end) {
5982                 CDEBUG(replay ? D_ERROR : D_LAYOUT,
5983                        "%s: the defined layout [0, %#llx) does not covers "
5984                        "the write range "DEXT"\n",
5985                        lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
5986                        PEXT(&layout->li_extent));
5987                 GOTO(out, rc = -EINVAL);
5988         }
5989
5990         CDEBUG(D_LAYOUT, "%s: "DFID": instantiate components "DEXT"\n",
5991                lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
5992                PEXT(&layout->li_extent));
5993
5994         /*
5995          * Iterate ld->ldo_comp_entries, find the component whose extent under
5996          * the write range and not instantianted.
5997          */
5998         for (i = 0; i < lo->ldo_comp_cnt; i++) {
5999                 lod_comp = &lo->ldo_comp_entries[i];
6000
6001                 if (lod_comp->llc_extent.e_start >= layout->li_extent.e_end)
6002                         break;
6003
6004                 if (!replay) {
6005                         if (lod_comp_inited(lod_comp))
6006                                 continue;
6007                 } else {
6008                         /**
6009                          * In replay path, lod_comp is the EA passed by
6010                          * client replay buffer,  comp_v1 is the pre-recovery
6011                          * on-disk EA, we'd sift out those components which
6012                          * were init-ed in the on-disk EA.
6013                          */
6014                         if (le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags) &
6015                             LCME_FL_INIT)
6016                                 continue;
6017                 }
6018                 /*
6019                  * this component hasn't instantiated in normal path, or during
6020                  * replay it needs replay the instantiation.
6021                  */
6022
6023                 /* A released component is being extended */
6024                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
6025                         GOTO(out, rc = -EINVAL);
6026
6027                 LASSERT(info->lti_comp_idx != NULL);
6028                 info->lti_comp_idx[info->lti_count++] = i;
6029         }
6030
6031         if (info->lti_count == 0)
6032                 RETURN(-EALREADY);
6033
6034         lod_obj_inc_layout_gen(lo);
6035         rc = lod_declare_instantiate_components(env, lo, th);
6036 out:
6037         if (rc)
6038                 lod_striping_free(env, lo);
6039         RETURN(rc);
6040 }
6041
6042 static inline int lod_comp_index(struct lod_object *lo,
6043                                  struct lod_layout_component *lod_comp)
6044 {
6045         LASSERT(lod_comp >= lo->ldo_comp_entries &&
6046                 lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
6047
6048         return lod_comp - lo->ldo_comp_entries;
6049 }
6050
6051 /**
6052  * Stale other mirrors by writing extent.
6053  */
6054 static void lod_stale_components(struct lod_object *lo, int primary,
6055                                  struct lu_extent *extent)
6056 {
6057         struct lod_layout_component *pri_comp, *lod_comp;
6058         int i;
6059
6060         /* The writing extent decides which components in the primary
6061          * are affected... */
6062         CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
6063         lod_foreach_mirror_comp(pri_comp, lo, primary) {
6064                 if (!lu_extent_is_overlapped(extent, &pri_comp->llc_extent))
6065                         continue;
6066
6067                 CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
6068                        lod_comp_index(lo, pri_comp),
6069                        PEXT(&pri_comp->llc_extent));
6070
6071                 for (i = 0; i < lo->ldo_mirror_count; i++) {
6072                         if (i == primary)
6073                                 continue;
6074
6075                         /* ... and then stale other components that are
6076                          * overlapping with primary components */
6077                         lod_foreach_mirror_comp(lod_comp, lo, i) {
6078                                 if (!lu_extent_is_overlapped(
6079                                                         &pri_comp->llc_extent,
6080                                                         &lod_comp->llc_extent))
6081                                         continue;
6082
6083                                 CDEBUG(D_LAYOUT, "stale: %u / %u\n",
6084                                       i, lod_comp_index(lo, lod_comp));
6085
6086                                 lod_comp->llc_flags |= LCME_FL_STALE;
6087                                 lo->ldo_mirrors[i].lme_stale = 1;
6088                         }
6089                 }
6090         }
6091 }
6092
6093 /**
6094  * check an OST's availability
6095  * \param[in] env       execution environment
6096  * \param[in] lo        lod object
6097  * \param[in] dt        dt object
6098  * \param[in] index     mirror index
6099  *
6100  * \retval      negative if failed
6101  * \retval      1 if \a dt is available
6102  * \retval      0 if \a dt is not available
6103  */
6104 static inline int lod_check_ost_avail(const struct lu_env *env,
6105                                       struct lod_object *lo,
6106                                       struct dt_object *dt, int index)
6107 {
6108         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6109         struct lod_tgt_desc *ost;
6110         __u32 idx;
6111         int type = LU_SEQ_RANGE_OST;
6112         int rc;
6113
6114         rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
6115         if (rc < 0) {
6116                 CERROR("%s: can't locate "DFID":rc = %d\n",
6117                        lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
6118                        rc);
6119                 return rc;
6120         }
6121
6122         ost = OST_TGT(lod, idx);
6123         if (ost->ltd_statfs.os_state &
6124                 (OS_STATE_READONLY | OS_STATE_ENOSPC | OS_STATE_ENOINO |
6125                  OS_STATE_NOPRECREATE) ||
6126             ost->ltd_active == 0) {
6127                 CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n",
6128                        PFID(lod_object_fid(lo)), index, idx, rc);
6129                 return 0;
6130         }
6131
6132         return 1;
6133 }
6134
6135 /**
6136  * Pick primary mirror for write
6137  * \param[in] env       execution environment
6138  * \param[in] lo        object
6139  * \param[in] extent    write range
6140  */
6141 static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
6142                             struct lu_extent *extent)
6143 {
6144         struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
6145         unsigned int seq = 0;
6146         struct lod_layout_component *lod_comp;
6147         int i, j, rc;
6148         int picked = -1, second_pick = -1, third_pick = -1;
6149         ENTRY;
6150
6151         if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
6152                 get_random_bytes(&seq, sizeof(seq));
6153                 seq %= lo->ldo_mirror_count;
6154         }
6155
6156         /**
6157          * Pick a mirror as the primary, and check the availability of OSTs.
6158          *
6159          * This algo can be revised later after knowing the topology of
6160          * cluster.
6161          */
6162         lod_qos_statfs_update(env, lod);
6163         for (i = 0; i < lo->ldo_mirror_count; i++) {
6164                 bool ost_avail = true;
6165                 int index = (i + seq) % lo->ldo_mirror_count;
6166
6167                 if (lo->ldo_mirrors[index].lme_stale) {
6168                         CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
6169                                PFID(lod_object_fid(lo)), index);
6170                         continue;
6171                 }
6172
6173                 /* 2nd pick is for the primary mirror containing unavail OST */
6174                 if (lo->ldo_mirrors[index].lme_primary && second_pick < 0)
6175                         second_pick = index;
6176
6177                 /* 3rd pick is for non-primary mirror containing unavail OST */
6178                 if (second_pick < 0 && third_pick < 0)
6179                         third_pick = index;
6180
6181                 /**
6182                  * we found a non-primary 1st pick, we'd like to find a
6183                  * potential pirmary mirror.
6184                  */
6185                 if (picked >= 0 && !lo->ldo_mirrors[index].lme_primary)
6186                         continue;
6187
6188                 /* check the availability of OSTs */
6189                 lod_foreach_mirror_comp(lod_comp, lo, index) {
6190                         if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
6191                                 continue;
6192
6193                         for (j = 0; j < lod_comp->llc_stripe_count; j++) {
6194                                 struct dt_object *dt = lod_comp->llc_stripe[j];
6195
6196                                 rc = lod_check_ost_avail(env, lo, dt, index);
6197                                 if (rc < 0)
6198                                         RETURN(rc);
6199
6200                                 ost_avail = !!rc;
6201                                 if (!ost_avail)
6202                                         break;
6203                         } /* for all dt object in one component */
6204                         if (!ost_avail)
6205                                 break;
6206                 } /* for all components in a mirror */
6207
6208                 /**
6209                  * the OSTs where allocated objects locates in the components
6210                  * of the mirror are available.
6211                  */
6212                 if (!ost_avail)
6213                         continue;
6214
6215                 /* this mirror has all OSTs available */
6216                 picked = index;
6217
6218                 /**
6219                  * primary with all OSTs are available, this is the perfect
6220                  * 1st pick.
6221                  */
6222                 if (lo->ldo_mirrors[index].lme_primary)
6223                         break;
6224         } /* for all mirrors */
6225
6226         /* failed to pick a sound mirror, lower our expectation */
6227         if (picked < 0)
6228                 picked = second_pick;
6229         if (picked < 0)
6230                 picked = third_pick;
6231         if (picked < 0)
6232                 RETURN(-ENODATA);
6233
6234         RETURN(picked);
6235 }
6236
6237 static int lod_prepare_resync_mirror(const struct lu_env *env,
6238                                      struct lod_object *lo,
6239                                      __u16 mirror_id)
6240 {
6241         struct lod_thread_info *info = lod_env_info(env);
6242         struct lod_layout_component *lod_comp;
6243         bool neg = !!(MIRROR_ID_NEG & mirror_id);
6244         int i;
6245
6246         mirror_id &= ~MIRROR_ID_NEG;
6247
6248         for (i = 0; i < lo->ldo_mirror_count; i++) {
6249                 if ((!neg && lo->ldo_mirrors[i].lme_id != mirror_id) ||
6250                     (neg && lo->ldo_mirrors[i].lme_id == mirror_id))
6251                         continue;
6252
6253                 lod_foreach_mirror_comp(lod_comp, lo, i) {
6254                         if (lod_comp_inited(lod_comp))
6255                                 continue;
6256
6257                         info->lti_comp_idx[info->lti_count++] =
6258                                 lod_comp_index(lo, lod_comp);
6259                 }
6260         }
6261
6262         return 0;
6263 }
6264
6265 /**
6266  * figure out the components should be instantiated for resync.
6267  */
6268 static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
6269                               struct lu_extent *extent)
6270 {
6271         struct lod_thread_info *info = lod_env_info(env);
6272         struct lod_layout_component *lod_comp;
6273         unsigned int need_sync = 0;
6274         int i;
6275
6276         CDEBUG(D_LAYOUT,
6277                DFID": instantiate all stale components in "DEXT"\n",
6278                PFID(lod_object_fid(lo)), PEXT(extent));
6279
6280         /**
6281          * instantiate all components within this extent, even non-stale
6282          * components.
6283          */
6284         for (i = 0; i < lo->ldo_mirror_count; i++) {
6285                 if (!lo->ldo_mirrors[i].lme_stale)
6286                         continue;
6287
6288                 lod_foreach_mirror_comp(lod_comp, lo, i) {
6289                         if (!lu_extent_is_overlapped(extent,
6290                                                 &lod_comp->llc_extent))
6291                                 break;
6292
6293                         need_sync++;
6294
6295                         if (lod_comp_inited(lod_comp))
6296                                 continue;
6297
6298                         CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
6299                                i, lod_comp_index(lo, lod_comp));
6300                         info->lti_comp_idx[info->lti_count++] =
6301                                         lod_comp_index(lo, lod_comp);
6302                 }
6303         }
6304
6305         return need_sync ? 0 : -EALREADY;
6306 }
6307
6308 static int lod_declare_update_rdonly(const struct lu_env *env,
6309                 struct lod_object *lo, struct md_layout_change *mlc,
6310                 struct thandle *th)
6311 {
6312         struct lod_thread_info *info = lod_env_info(env);
6313         struct lu_attr *layout_attr = &info->lti_layout_attr;
6314         struct lod_layout_component *lod_comp;
6315         struct lu_extent extent = { 0 };
6316         int rc;
6317         ENTRY;
6318
6319         LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
6320         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
6321                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
6322         LASSERT(lo->ldo_mirror_count > 0);
6323
6324         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
6325                 struct layout_intent *layout = mlc->mlc_intent;
6326                 int picked;
6327
6328                 extent = layout->li_extent;
6329                 CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
6330                        PFID(lod_object_fid(lo)), PEXT(&extent));
6331
6332                 picked = lod_primary_pick(env, lo, &extent);
6333                 if (picked < 0)
6334                         RETURN(picked);
6335
6336                 CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
6337                        PFID(lod_object_fid(lo)),
6338                        lo->ldo_mirrors[picked].lme_id);
6339
6340                 if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
6341                         /**
6342                          * trunc transfers [0, size) in the intent extent, we'd
6343                          * stale components overlapping [size, eof).
6344                          */
6345                         extent.e_start = extent.e_end;
6346                         extent.e_end = OBD_OBJECT_EOF;
6347                 }
6348
6349                 /* stale overlapping components from other mirrors */
6350                 lod_stale_components(lo, picked, &extent);
6351
6352                 /* restore truncate intent extent */
6353                 if (layout->li_opc == LAYOUT_INTENT_TRUNC)
6354                         extent.e_end = extent.e_start;
6355
6356                 /* instantiate components for the picked mirror, start from 0 */
6357                 extent.e_start = 0;
6358
6359                 lod_foreach_mirror_comp(lod_comp, lo, picked) {
6360                         if (!lu_extent_is_overlapped(&extent,
6361                                                      &lod_comp->llc_extent))
6362                                 break;
6363
6364                         if (lod_comp_inited(lod_comp))
6365                                 continue;
6366
6367                         info->lti_comp_idx[info->lti_count++] =
6368                                                 lod_comp_index(lo, lod_comp);
6369                 }
6370
6371                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
6372         } else { /* MD_LAYOUT_RESYNC */
6373                 int i;
6374
6375                 /**
6376                  * could contain multiple non-stale mirrors, so we need to
6377                  * prep uninited all components assuming any non-stale mirror
6378                  * could be picked as the primary mirror.
6379                  */
6380                 if (mlc->mlc_mirror_id == 0) {
6381                         /* normal resync */
6382                         for (i = 0; i < lo->ldo_mirror_count; i++) {
6383                                 if (lo->ldo_mirrors[i].lme_stale)
6384                                         continue;
6385
6386                                 lod_foreach_mirror_comp(lod_comp, lo, i) {
6387                                         if (!lod_comp_inited(lod_comp))
6388                                                 break;
6389
6390                                         if (extent.e_end <
6391                                                 lod_comp->llc_extent.e_end)
6392                                                 extent.e_end =
6393                                                      lod_comp->llc_extent.e_end;
6394                                 }
6395                         }
6396                         rc = lod_prepare_resync(env, lo, &extent);
6397                         if (rc)
6398                                 GOTO(out, rc);
6399                 } else {
6400                         /* mirror write, try to init its all components */
6401                         rc = lod_prepare_resync_mirror(env, lo,
6402                                                        mlc->mlc_mirror_id);
6403                         if (rc)
6404                                 GOTO(out, rc);
6405                 }
6406
6407                 /* change the file state to SYNC_PENDING */
6408                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
6409         }
6410
6411         /* Reset the layout version once it's becoming too large.
6412          * This way it can make sure that the layout version is
6413          * monotonously increased in this writing era. */
6414         lod_obj_inc_layout_gen(lo);
6415         if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
6416                 __u32 layout_version;
6417
6418                 cfs_get_random_bytes(&layout_version, sizeof(layout_version));
6419                 lo->ldo_layout_gen = layout_version & 0xffff;
6420         }
6421
6422         rc = lod_declare_instantiate_components(env, lo, th);
6423         if (rc)
6424                 GOTO(out, rc);
6425
6426         layout_attr->la_valid = LA_LAYOUT_VERSION;
6427         layout_attr->la_layout_version = 0; /* set current version */
6428         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
6429                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
6430         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
6431         if (rc)
6432                 GOTO(out, rc);
6433
6434 out:
6435         if (rc)
6436                 lod_striping_free(env, lo);
6437         RETURN(rc);
6438 }
6439
6440 static int lod_declare_update_write_pending(const struct lu_env *env,
6441                 struct lod_object *lo, struct md_layout_change *mlc,
6442                 struct thandle *th)
6443 {
6444         struct lod_thread_info *info = lod_env_info(env);
6445         struct lu_attr *layout_attr = &info->lti_layout_attr;
6446         struct lod_layout_component *lod_comp;
6447         struct lu_extent extent = { 0 };
6448         int primary = -1;
6449         int i;
6450         int rc;
6451         ENTRY;
6452
6453         LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
6454         LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
6455                 mlc->mlc_opc == MD_LAYOUT_RESYNC);
6456
6457         /* look for the primary mirror */
6458         for (i = 0; i < lo->ldo_mirror_count; i++) {
6459                 if (lo->ldo_mirrors[i].lme_stale)
6460                         continue;
6461
6462                 LASSERTF(primary < 0, DFID " has multiple primary: %u / %u",
6463                          PFID(lod_object_fid(lo)),
6464                          lo->ldo_mirrors[i].lme_id,
6465                          lo->ldo_mirrors[primary].lme_id);
6466
6467                 primary = i;
6468         }
6469         if (primary < 0) {
6470                 CERROR(DFID ": doesn't have a primary mirror\n",
6471                        PFID(lod_object_fid(lo)));
6472                 GOTO(out, rc = -ENODATA);
6473         }
6474
6475         CDEBUG(D_LAYOUT, DFID": found primary %u\n",
6476                PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
6477
6478         LASSERT(!lo->ldo_mirrors[primary].lme_stale);
6479
6480         /* for LAYOUT_WRITE opc, it has to do the following operations:
6481          * 1. stale overlapping componets from stale mirrors;
6482          * 2. instantiate components of the primary mirror;
6483          * 3. transfter layout version to all objects of the primary;
6484          *
6485          * for LAYOUT_RESYNC opc, it will do:
6486          * 1. instantiate components of all stale mirrors;
6487          * 2. transfer layout version to all objects to close write era. */
6488
6489         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
6490                 LASSERT(mlc->mlc_intent != NULL);
6491
6492                 extent = mlc->mlc_intent->li_extent;
6493
6494                 CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
6495                        PFID(lod_object_fid(lo)), PEXT(&extent));
6496
6497                 if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
6498                         /**
6499                          * trunc transfers [0, size) in the intent extent, we'd
6500                          * stale components overlapping [size, eof).
6501                          */
6502                         extent.e_start = extent.e_end;
6503                         extent.e_end = OBD_OBJECT_EOF;
6504                 }
6505                 /* 1. stale overlapping components */
6506                 lod_stale_components(lo, primary, &extent);
6507
6508                 /* 2. find out the components need instantiating.
6509                  * instantiate [0, mlc->mlc_intent->e_end) */
6510
6511                 /* restore truncate intent extent */
6512                 if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC)
6513                         extent.e_end = extent.e_start;
6514                 extent.e_start = 0;
6515
6516                 lod_foreach_mirror_comp(lod_comp, lo, primary) {
6517                         if (!lu_extent_is_overlapped(&extent,
6518                                                      &lod_comp->llc_extent))
6519                                 break;
6520
6521                         if (lod_comp_inited(lod_comp))
6522                                 continue;
6523
6524                         CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
6525                                primary, lod_comp_index(lo, lod_comp));
6526                         info->lti_comp_idx[info->lti_count++] =
6527                                                 lod_comp_index(lo, lod_comp);
6528                 }
6529         } else { /* MD_LAYOUT_RESYNC */
6530                 if (mlc->mlc_mirror_id == 0) {
6531                         /* normal resync */
6532                         lod_foreach_mirror_comp(lod_comp, lo, primary) {
6533                                 if (!lod_comp_inited(lod_comp))
6534                                         break;
6535
6536                                 extent.e_end = lod_comp->llc_extent.e_end;
6537                         }
6538
6539                         rc = lod_prepare_resync(env, lo, &extent);
6540                         if (rc)
6541                                 GOTO(out, rc);
6542                 } else {
6543                         /* mirror write, try to init its all components */
6544                         rc = lod_prepare_resync_mirror(env, lo,
6545                                                        mlc->mlc_mirror_id);
6546                         if (rc)
6547                                 GOTO(out, rc);
6548                 }
6549
6550                 /* change the file state to SYNC_PENDING */
6551                 lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
6552         }
6553
6554         rc = lod_declare_instantiate_components(env, lo, th);
6555         if (rc)
6556                 GOTO(out, rc);
6557
6558         /* 3. transfer layout version to OST objects.
6559          * transfer new layout version to OST objects so that stale writes
6560          * can be denied. It also ends an era of writing by setting
6561          * LU_LAYOUT_RESYNC. Normal client can never use this bit to
6562          * send write RPC; only resync RPCs could do it. */
6563         layout_attr->la_valid = LA_LAYOUT_VERSION;
6564         layout_attr->la_layout_version = 0; /* set current version */
6565         if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
6566                 layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
6567         rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
6568         if (rc)
6569                 GOTO(out, rc);
6570
6571         lod_obj_inc_layout_gen(lo);
6572 out:
6573         if (rc)
6574                 lod_striping_free(env, lo);
6575         RETURN(rc);
6576 }
6577
6578 static int lod_declare_update_sync_pending(const struct lu_env *env,
6579                 struct lod_object *lo, struct md_layout_change *mlc,
6580                 struct thandle *th)
6581 {
6582         struct lod_thread_info  *info = lod_env_info(env);
6583         unsigned sync_components = 0;
6584         unsigned resync_components = 0;
6585         int i;
6586         int rc;
6587         ENTRY;
6588
6589         LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
6590         LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
6591                 mlc->mlc_opc == MD_LAYOUT_WRITE);
6592
6593         CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
6594                PFID(lod_object_fid(lo)), mlc->mlc_opc);
6595
6596         if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
6597                 CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
6598                        PFID(lod_object_fid(lo)));
6599
6600                 lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
6601                 return lod_declare_update_write_pending(env, lo, mlc, th);
6602         }
6603
6604         /* MD_LAYOUT_RESYNC_DONE */
6605
6606         for (i = 0; i < lo->ldo_comp_cnt; i++) {
6607                 struct lod_layout_component *lod_comp;
6608                 int j;
6609
6610                 lod_comp = &lo->ldo_comp_entries[i];
6611
6612                 if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
6613                         sync_components++;
6614                         continue;
6615                 }
6616
6617                 for (j = 0; j < mlc->mlc_resync_count; j++) {
6618                         if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
6619                                 continue;
6620
6621                         mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
6622                         lod_comp->llc_flags &= ~LCME_FL_STALE;
6623                         resync_components++;
6624                         break;
6625                 }
6626         }
6627
6628         /* valid check */
6629         for (i = 0; i < mlc->mlc_resync_count; i++) {
6630                 if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
6631                         continue;
6632
6633                 CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zd) not exist "
6634                        "or already synced\n", PFID(lod_object_fid(lo)),
6635                        mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
6636                 GOTO(out, rc = -EINVAL);
6637         }
6638
6639         if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
6640                 CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
6641                        PFID(lod_object_fid(lo)));
6642
6643                 /* tend to return an error code here to prevent
6644                  * the MDT from setting SoM attribute */
6645                 GOTO(out, rc = -EINVAL);
6646         }
6647
6648         CDEBUG(D_LAYOUT, DFID": resynced %u/%zu components\n",
6649                PFID(lod_object_fid(lo)),
6650                resync_components, mlc->mlc_resync_count);
6651
6652         lo->ldo_flr_state = LCM_FL_RDONLY;
6653         lod_obj_inc_layout_gen(lo);
6654
6655         info->lti_buf.lb_len = lod_comp_md_size(lo, false);
6656         rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
6657                                        &info->lti_buf, XATTR_NAME_LOV, 0, th);
6658         EXIT;
6659
6660 out:
6661         if (rc)
6662                 lod_striping_free(env, lo);
6663         RETURN(rc);
6664 }
6665
6666 static int lod_declare_layout_change(const struct lu_env *env,
6667                 struct dt_object *dt, struct md_layout_change *mlc,
6668                 struct thandle *th)
6669 {
6670         struct lod_thread_info  *info = lod_env_info(env);
6671         struct lod_object *lo = lod_dt_obj(dt);
6672         int rc;
6673         ENTRY;
6674
6675         if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
6676             dt_object_remote(dt_object_child(dt)))
6677                 RETURN(-EINVAL);
6678
6679         rc = lod_striping_load(env, lo);
6680         if (rc)
6681                 GOTO(out, rc);
6682
6683         LASSERT(lo->ldo_comp_cnt > 0);
6684
6685         rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
6686         if (rc)
6687                 GOTO(out, rc);
6688
6689         switch (lo->ldo_flr_state) {
6690         case LCM_FL_NONE:
6691                 rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
6692                                               &mlc->mlc_buf, th);
6693                 break;
6694         case LCM_FL_RDONLY:
6695                 rc = lod_declare_update_rdonly(env, lo, mlc, th);
6696                 break;
6697         case LCM_FL_WRITE_PENDING:
6698                 rc = lod_declare_update_write_pending(env, lo, mlc, th);
6699                 break;
6700         case LCM_FL_SYNC_PENDING:
6701                 rc = lod_declare_update_sync_pending(env, lo, mlc, th);
6702                 break;
6703         default:
6704                 rc = -ENOTSUPP;
6705                 break;
6706         }
6707 out:
6708         RETURN(rc);
6709 }
6710
6711 /**
6712  * Instantiate layout component objects which covers the intent write offset.
6713  */
6714 static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
6715                              struct md_layout_change *mlc, struct thandle *th)
6716 {
6717         struct lu_attr *attr = &lod_env_info(env)->lti_attr;
6718         struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
6719         struct lod_object *lo = lod_dt_obj(dt);
6720         int rc;
6721
6722         rc = lod_striped_create(env, dt, attr, NULL, th);
6723         if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
6724                 layout_attr->la_layout_version |= lo->ldo_layout_gen;
6725                 rc = lod_attr_set(env, dt, layout_attr, th);
6726         }
6727
6728         return rc;
6729 }
6730
6731 struct dt_object_operations lod_obj_ops = {
6732         .do_read_lock           = lod_read_lock,
6733         .do_write_lock          = lod_write_lock,
6734         .do_read_unlock         = lod_read_unlock,
6735         .do_write_unlock        = lod_write_unlock,
6736         .do_write_locked        = lod_write_locked,
6737         .do_attr_get            = lod_attr_get,
6738         .do_declare_attr_set    = lod_declare_attr_set,
6739         .do_attr_set            = lod_attr_set,
6740         .do_xattr_get           = lod_xattr_get,
6741         .do_declare_xattr_set   = lod_declare_xattr_set,
6742         .do_xattr_set           = lod_xattr_set,
6743         .do_declare_xattr_del   = lod_declare_xattr_del,
6744         .do_xattr_del           = lod_xattr_del,
6745         .do_xattr_list          = lod_xattr_list,
6746         .do_ah_init             = lod_ah_init,
6747         .do_declare_create      = lod_declare_create,
6748         .do_create              = lod_create,
6749         .do_declare_destroy     = lod_declare_destroy,
6750         .do_destroy             = lod_destroy,
6751         .do_index_try           = lod_index_try,
6752         .do_declare_ref_add     = lod_declare_ref_add,
6753         .do_ref_add             = lod_ref_add,
6754         .do_declare_ref_del     = lod_declare_ref_del,
6755         .do_ref_del             = lod_ref_del,
6756         .do_object_sync         = lod_object_sync,
6757         .do_object_lock         = lod_object_lock,
6758         .do_object_unlock       = lod_object_unlock,
6759         .do_invalidate          = lod_invalidate,
6760         .do_declare_layout_change = lod_declare_layout_change,
6761         .do_layout_change       = lod_layout_change,
6762 };
6763
6764 /**
6765  * Implementation of dt_body_operations::dbo_read.
6766  *
6767  * \see dt_body_operations::dbo_read() in the API description for details.
6768  */
6769 static ssize_t lod_read(const struct lu_env *env, struct dt_object *dt,
6770                         struct lu_buf *buf, loff_t *pos)
6771 {
6772         struct dt_object *next = dt_object_child(dt);
6773
6774         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
6775                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
6776         return next->do_body_ops->dbo_read(env, next, buf, pos);
6777 }
6778
6779 /**
6780  * Implementation of dt_body_operations::dbo_declare_write.
6781  *
6782  * \see dt_body_operations::dbo_declare_write() in the API description
6783  * for details.
6784  */
6785 static ssize_t lod_declare_write(const struct lu_env *env,
6786                                  struct dt_object *dt,
6787                                  const struct lu_buf *buf, loff_t pos,
6788                                  struct thandle *th)
6789 {
6790         return lod_sub_declare_write(env, dt_object_child(dt), buf, pos, th);
6791 }
6792
6793 /**
6794  * Implementation of dt_body_operations::dbo_write.
6795  *
6796  * \see dt_body_operations::dbo_write() in the API description for details.
6797  */
6798 static ssize_t lod_write(const struct lu_env *env, struct dt_object *dt,
6799                          const struct lu_buf *buf, loff_t *pos,
6800                          struct thandle *th)
6801 {
6802         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr) ||
6803                 S_ISLNK(dt->do_lu.lo_header->loh_attr));
6804         return lod_sub_write(env, dt_object_child(dt), buf, pos, th);
6805 }
6806
6807 static int lod_declare_punch(const struct lu_env *env, struct dt_object *dt,
6808                              __u64 start, __u64 end, struct thandle *th)
6809 {
6810         if (dt_object_remote(dt))
6811                 return -ENOTSUPP;
6812
6813         return lod_sub_declare_punch(env, dt_object_child(dt), start, end, th);
6814 }
6815
6816 static int lod_punch(const struct lu_env *env, struct dt_object *dt,
6817                      __u64 start, __u64 end, struct thandle *th)
6818 {
6819         if (dt_object_remote(dt))
6820                 return -ENOTSUPP;
6821
6822         LASSERT(S_ISREG(dt->do_lu.lo_header->loh_attr));
6823         return lod_sub_punch(env, dt_object_child(dt), start, end, th);
6824 }
6825
6826 /*
6827  * different type of files use the same body_ops because object may be created
6828  * in OUT, where there is no chance to set correct body_ops for each type, so
6829  * body_ops themselves will check file type inside, see lod_read/write/punch for
6830  * details.
6831  */
6832 const struct dt_body_operations lod_body_ops = {
6833         .dbo_read               = lod_read,
6834         .dbo_declare_write      = lod_declare_write,
6835         .dbo_write              = lod_write,
6836         .dbo_declare_punch      = lod_declare_punch,
6837         .dbo_punch              = lod_punch,
6838 };
6839
6840 /**
6841  * Implementation of lu_object_operations::loo_object_init.
6842  *
6843  * The function determines the type and the index of the target device using
6844  * sequence of the object's FID. Then passes control down to the
6845  * corresponding device:
6846  *  OSD for the local objects, OSP for remote
6847  *
6848  * \see lu_object_operations::loo_object_init() in the API description
6849  * for details.
6850  */
6851 static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
6852                            const struct lu_object_conf *conf)
6853 {
6854         struct lod_device       *lod    = lu2lod_dev(lo->lo_dev);
6855         struct lu_device        *cdev   = NULL;
6856         struct lu_object        *cobj;
6857         struct lod_tgt_descs    *ltd    = NULL;
6858         struct lod_tgt_desc     *tgt;
6859         u32                      idx    = 0;
6860         int                      type   = LU_SEQ_RANGE_ANY;
6861         int                      rc;
6862         ENTRY;
6863
6864         rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
6865         if (rc != 0) {
6866                 /* Note: Sometimes, it will Return EAGAIN here, see
6867                  * ptrlpc_import_delay_req(), which might confuse
6868                  * lu_object_find_at() and make it wait there incorrectly.
6869                  * so we convert it to EIO here.*/
6870                 if (rc == -EAGAIN)
6871                         rc = -EIO;
6872
6873                 RETURN(rc);
6874         }
6875
6876         if (type == LU_SEQ_RANGE_MDT &&
6877             idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
6878                 cdev = &lod->lod_child->dd_lu_dev;
6879         } else if (type == LU_SEQ_RANGE_MDT) {
6880                 ltd = &lod->lod_mdt_descs;
6881                 lod_getref(ltd);
6882         } else if (type == LU_SEQ_RANGE_OST) {
6883                 ltd = &lod->lod_ost_descs;
6884                 lod_getref(ltd);
6885         } else {
6886                 LBUG();
6887         }
6888
6889         if (ltd != NULL) {
6890                 if (ltd->ltd_tgts_size > idx &&
6891                     cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx)) {
6892                         tgt = LTD_TGT(ltd, idx);
6893
6894                         LASSERT(tgt != NULL);
6895                         LASSERT(tgt->ltd_tgt != NULL);
6896
6897                         cdev = &(tgt->ltd_tgt->dd_lu_dev);
6898                 }
6899                 lod_putref(lod, ltd);
6900         }
6901
6902         if (unlikely(cdev == NULL))
6903                 RETURN(-ENOENT);
6904
6905         cobj = cdev->ld_ops->ldo_object_alloc(env, lo->lo_header, cdev);
6906         if (unlikely(cobj == NULL))
6907                 RETURN(-ENOMEM);
6908
6909         lu2lod_obj(lo)->ldo_obj.do_body_ops = &lod_body_ops;
6910
6911         lu_object_add(lo, cobj);
6912
6913         RETURN(0);
6914 }
6915
6916 /**
6917  *
6918  * Alloc cached foreign LOV
6919  *
6920  * \param[in] lo        object
6921  * \param[in] size      size of foreign LOV
6922  *
6923  * \retval              0 on success
6924  * \retval              negative if failed
6925  */
6926 int lod_alloc_foreign_lov(struct lod_object *lo, size_t size)
6927 {
6928         OBD_ALLOC_LARGE(lo->ldo_foreign_lov, size);
6929         if (lo->ldo_foreign_lov == NULL)
6930                 return -ENOMEM;
6931         lo->ldo_foreign_lov_size = size;
6932         lo->ldo_is_foreign = 1;
6933         return 0;
6934 }
6935
6936 /**
6937  *
6938  * Free cached foreign LOV
6939  *
6940  * \param[in] lo        object
6941  */
6942 void lod_free_foreign_lov(struct lod_object *lo)
6943 {
6944         if (lo->ldo_foreign_lov != NULL)
6945                 OBD_FREE_LARGE(lo->ldo_foreign_lov, lo->ldo_foreign_lov_size);
6946         lo->ldo_foreign_lov = NULL;
6947         lo->ldo_foreign_lov_size = 0;
6948         lo->ldo_is_foreign = 0;
6949 }
6950
6951 /**
6952  *
6953  * Free cached foreign LMV
6954  *
6955  * \param[in] lo        object
6956  */
6957 void lod_free_foreign_lmv(struct lod_object *lo)
6958 {
6959         if (lo->ldo_foreign_lmv != NULL)
6960                 OBD_FREE_LARGE(lo->ldo_foreign_lmv, lo->ldo_foreign_lmv_size);
6961         lo->ldo_foreign_lmv = NULL;
6962         lo->ldo_foreign_lmv_size = 0;
6963         lo->ldo_dir_is_foreign = 0;
6964 }
6965
6966 /**
6967  *
6968  * Release resources associated with striping.
6969  *
6970  * If the object is striped (regular or directory), then release
6971  * the stripe objects references and free the ldo_stripe array.
6972  *
6973  * \param[in] env       execution environment
6974  * \param[in] lo        object
6975  */
6976 void lod_striping_free_nolock(const struct lu_env *env, struct lod_object *lo)
6977 {
6978         struct lod_layout_component *lod_comp;
6979         int i, j;
6980
6981         if (unlikely(lo->ldo_is_foreign)) {
6982                 lod_free_foreign_lov(lo);
6983                 lo->ldo_comp_cached = 0;
6984         } else if (unlikely(lo->ldo_dir_is_foreign)) {
6985                 lod_free_foreign_lmv(lo);
6986                 lo->ldo_dir_stripe_loaded = 0;
6987         } else if (lo->ldo_stripe != NULL) {
6988                 LASSERT(lo->ldo_comp_entries == NULL);
6989                 LASSERT(lo->ldo_dir_stripes_allocated > 0);
6990
6991                 for (i = 0; i < lo->ldo_dir_stripe_count; i++) {
6992                         if (lo->ldo_stripe[i])
6993                                 dt_object_put(env, lo->ldo_stripe[i]);
6994                 }
6995
6996                 j = sizeof(struct dt_object *) * lo->ldo_dir_stripes_allocated;
6997                 OBD_FREE(lo->ldo_stripe, j);
6998                 lo->ldo_stripe = NULL;
6999                 lo->ldo_dir_stripes_allocated = 0;
7000                 lo->ldo_dir_stripe_loaded = 0;
7001                 lo->ldo_dir_stripe_count = 0;
7002         } else if (lo->ldo_comp_entries != NULL) {
7003                 for (i = 0; i < lo->ldo_comp_cnt; i++) {
7004                         /* free lod_layout_component::llc_stripe array */
7005                         lod_comp = &lo->ldo_comp_entries[i];
7006
7007                         if (lod_comp->llc_stripe == NULL)
7008                                 continue;
7009                         LASSERT(lod_comp->llc_stripes_allocated != 0);
7010                         for (j = 0; j < lod_comp->llc_stripes_allocated; j++) {
7011                                 if (lod_comp->llc_stripe[j] != NULL)
7012                                         lu_object_put(env,
7013                                                &lod_comp->llc_stripe[j]->do_lu);
7014                         }
7015                         OBD_FREE(lod_comp->llc_stripe,
7016                                  sizeof(struct dt_object *) *
7017                                  lod_comp->llc_stripes_allocated);
7018                         lod_comp->llc_stripe = NULL;
7019                         OBD_FREE(lod_comp->llc_ost_indices,
7020                                  sizeof(__u32) *
7021                                  lod_comp->llc_stripes_allocated);
7022                         lod_comp->llc_ost_indices = NULL;
7023                         lod_comp->llc_stripes_allocated = 0;
7024                 }
7025                 lod_free_comp_entries(lo);
7026                 lo->ldo_comp_cached = 0;
7027         }
7028 }
7029
7030 void lod_striping_free(const struct lu_env *env, struct lod_object *lo)
7031 {
7032         mutex_lock(&lo->ldo_layout_mutex);
7033         lod_striping_free_nolock(env, lo);
7034         mutex_unlock(&lo->ldo_layout_mutex);
7035 }
7036
7037 /**
7038  * Implementation of lu_object_operations::loo_object_free.
7039  *
7040  * \see lu_object_operations::loo_object_free() in the API description
7041  * for details.
7042  */
7043 static void lod_object_free(const struct lu_env *env, struct lu_object *o)
7044 {
7045         struct lod_object *lo = lu2lod_obj(o);
7046
7047         /* release all underlying object pinned */
7048         lod_striping_free(env, lo);
7049         lu_object_fini(o);
7050         OBD_SLAB_FREE_PTR(lo, lod_object_kmem);
7051 }
7052
/**
 * Implementation of lu_object_operations::loo_object_release.
 *
 * Currently a no-op.
 *
 * \see lu_object_operations::loo_object_release() in the API description
 * for details.
 */
static void lod_object_release(const struct lu_env *env, struct lu_object *o)
{
	/*
	 * XXX: should everything be released here when object creation
	 * failed earlier?
	 */
}
7064
7065 /**
7066  * Implementation of lu_object_operations::loo_object_print.
7067  *
7068  * \see lu_object_operations::loo_object_print() in the API description
7069  * for details.
7070  */
7071 static int lod_object_print(const struct lu_env *env, void *cookie,
7072                             lu_printer_t p, const struct lu_object *l)
7073 {
7074         struct lod_object *o = lu2lod_obj((struct lu_object *) l);
7075
7076         return (*p)(env, cookie, LUSTRE_LOD_NAME"-object@%p", o);
7077 }
7078
7079 struct lu_object_operations lod_lu_obj_ops = {
7080         .loo_object_init        = lod_object_init,
7081         .loo_object_free        = lod_object_free,
7082         .loo_object_release     = lod_object_release,
7083         .loo_object_print       = lod_object_print,
7084 };