Whamcloud - gitweb
e866566f5d22d1c8169b657dec00ac4ba8965e92
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2017, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lu_object.h>
40 #include <dt_object.h>
41 #include <lustre_fid.h>
42 #include <lustre_lib.h>
43 #include <lustre_net.h>
44 #include <md_object.h>
45 #include <obd_class.h>
46
47 #include "lfsck_internal.h"
48
/*
 * Magic numbers of the layout LFSCK, one distinct value per version V1..V4.
 * NOTE(review): presumably these identify revisions of the stored
 * struct lfsck_layout (its ll_magic field) -- confirm against the
 * load/store code, which is not visible here.
 */
49 #define LFSCK_LAYOUT_MAGIC_V1           0xB173AE14
50 #define LFSCK_LAYOUT_MAGIC_V2           0xB1734D76
51 #define LFSCK_LAYOUT_MAGIC_V3           0xB17371B9
52 #define LFSCK_LAYOUT_MAGIC_V4           0xB1732FED
53
/* The magic currently in use is simply an alias for the V4 value. */
54 #define LFSCK_LAYOUT_MAGIC              LFSCK_LAYOUT_MAGIC_V4
55
56 struct lfsck_layout_seq {
57         struct list_head         lls_list;
58         __u64                    lls_seq;
59         __u64                    lls_lastid;
60         __u64                    lls_lastid_known;
61         struct dt_object        *lls_lastid_obj;
62         unsigned int             lls_dirty:1;
63 };
64
65 struct lfsck_layout_slave_target {
66         /* link into lfsck_layout_slave_data::llsd_master_list. */
67         struct list_head        llst_list;
68         /* The position for next record in the rbtree for iteration. */
69         struct lu_fid           llst_fid;
70         /* Dummy hash for iteration against the rbtree. */
71         __u64                   llst_hash;
72         __u64                   llst_gen;
73         atomic_t                llst_ref;
74         __u32                   llst_index;
75         /* How many times we have failed to get the master status. */
76         int                     llst_failures;
77 };
78
79 struct lfsck_layout_slave_data {
80         /* list for lfsck_layout_seq */
81         struct list_head         llsd_seq_list;
82
83         /* list for the masters involve layout verification. */
84         struct list_head         llsd_master_list;
85         spinlock_t               llsd_lock;
86         __u64                    llsd_touch_gen;
87         struct dt_object        *llsd_rb_obj;
88         struct rb_root           llsd_rb_root;
89         rwlock_t                 llsd_rb_lock;
90         unsigned int             llsd_rbtree_valid:1;
91 };
92
/* Bundle of pointers carried along with an asynchronous slave request. */
struct lfsck_layout_slave_async_args {
        /* Export the request is associated with. */
        struct obd_export *llsaa_exp;
        /* LFSCK component the request belongs to. */
        struct lfsck_component *llsaa_com;
        /* Slave target descriptor the request is about. */
        struct lfsck_layout_slave_target *llsaa_llst;
};
98
99 static inline bool lfsck_comp_extent_aligned(__u64 size)
100 {
101          return (size & (LOV_MIN_STRIPE_SIZE - 1)) == 0;
102 }
103
104 static inline void
105 lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst)
106 {
107         if (atomic_dec_and_test(&llst->llst_ref)) {
108                 LASSERT(list_empty(&llst->llst_list));
109
110                 OBD_FREE_PTR(llst);
111         }
112 }
113
114 static inline int
115 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
116 {
117         struct lfsck_layout_slave_target *llst;
118         struct lfsck_layout_slave_target *tmp;
119         int                               rc   = 0;
120
121         OBD_ALLOC_PTR(llst);
122         if (llst == NULL)
123                 return -ENOMEM;
124
125         INIT_LIST_HEAD(&llst->llst_list);
126         llst->llst_gen = 0;
127         llst->llst_index = index;
128         atomic_set(&llst->llst_ref, 1);
129
130         spin_lock(&llsd->llsd_lock);
131         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
132                 if (tmp->llst_index == index) {
133                         rc = -EALREADY;
134                         break;
135                 }
136         }
137         if (rc == 0)
138                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
139         spin_unlock(&llsd->llsd_lock);
140
141         if (rc != 0)
142                 OBD_FREE_PTR(llst);
143
144         return rc;
145 }
146
147 static inline void
148 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
149                       struct lfsck_layout_slave_target *llst)
150 {
151         bool del = false;
152
153         spin_lock(&llsd->llsd_lock);
154         if (!list_empty(&llst->llst_list)) {
155                 list_del_init(&llst->llst_list);
156                 del = true;
157         }
158         spin_unlock(&llsd->llsd_lock);
159
160         if (del)
161                 lfsck_layout_llst_put(llst);
162 }
163
164 static inline struct lfsck_layout_slave_target *
165 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
166                                __u32 index, bool unlink)
167 {
168         struct lfsck_layout_slave_target *llst;
169
170         spin_lock(&llsd->llsd_lock);
171         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
172                 if (llst->llst_index == index) {
173                         if (unlink)
174                                 list_del_init(&llst->llst_list);
175                         else
176                                 atomic_inc(&llst->llst_ref);
177                         spin_unlock(&llsd->llsd_lock);
178
179                         return llst;
180                 }
181         }
182         spin_unlock(&llsd->llsd_lock);
183
184         return NULL;
185 }
186
187 static struct lfsck_layout_req *
188 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
189                                 struct dt_object *child, __u32 comp_id,
190                                 __u32 ost_idx, __u32 lov_idx)
191 {
192         struct lfsck_layout_req *llr;
193
194         OBD_ALLOC_PTR(llr);
195         if (llr == NULL)
196                 return ERR_PTR(-ENOMEM);
197
198         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
199         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
200         llr->llr_child = child;
201         llr->llr_comp_id = comp_id;
202         llr->llr_ost_idx = ost_idx;
203         llr->llr_lov_idx = lov_idx;
204
205         return llr;
206 }
207
208 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
209                                             struct lfsck_assistant_req *lar)
210 {
211         struct lfsck_layout_req *llr =
212                         container_of0(lar, struct lfsck_layout_req, llr_lar);
213
214         lfsck_object_put(env, llr->llr_child);
215         lfsck_assistant_object_put(env, lar->lar_parent);
216         OBD_FREE_PTR(llr);
217 }
218
219 static int
220 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
221                                                struct ptlrpc_request *req,
222                                                void *args, int rc)
223 {
224         if (rc == 0) {
225                 struct lfsck_async_interpret_args *laia = args;
226                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
227
228                 ltd->ltd_synced_failures = 1;
229                 atomic_dec(laia->laia_count);
230         }
231
232         return 0;
233 }
234
235 /**
236  * Notify remote LFSCK instances about former failures.
237  *
238  * The local LFSCK instance has recorded which OSTs have ever failed to respond
239  * some LFSCK verification requests (maybe because of network issues or the OST
240  * itself trouble). During the respond gap, the OST may missed some OST-objects
241  * verification, then the OST cannot know whether related OST-objects have been
242  * referenced by related MDT-objects or not, then in the second-stage scanning,
243  * these OST-objects will be regarded as orphan, if the OST-object contains bad
244  * parent FID for back reference, then it will misguide the LFSCK to make wrong
245  * fixing for the fake orphan.
246  *
247  * To avoid above trouble, when layout LFSCK finishes the first-stage scanning,
248  * it will scan the bitmap for the ever failed OSTs, and notify them that they
249  * have ever missed some OST-object verification and should skip the handling
250  * for orphan OST-objects on all MDTs that are in the layout LFSCK.
251  *
252  * \param[in] env       pointer to the thread context
253  * \param[in] com       pointer to the lfsck component
254  * \param[in] lr        pointer to the lfsck request
255  */
256 static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
257                                                  struct lfsck_component *com,
258                                                  struct lfsck_request *lr)
259 {
260         struct lfsck_async_interpret_args *laia  =
261                                 &lfsck_env_info(env)->lti_laia2;
262         struct lfsck_assistant_data       *lad   = com->lc_data;
263         struct lfsck_layout               *lo    = com->lc_file_ram;
264         struct lfsck_instance             *lfsck = com->lc_lfsck;
265         struct lfsck_tgt_descs            *ltds  = &lfsck->li_ost_descs;
266         struct lfsck_tgt_desc             *ltd;
267         struct ptlrpc_request_set         *set;
268         atomic_t                           count;
269         __u32                              idx;
270         int                                rc    = 0;
271         ENTRY;
272
273         if (!lad->lad_incomplete)
274                 RETURN_EXIT;
275
276         /* If the MDT has ever failed to verfiy some OST-objects,
277          * then sync failures with them firstly. */
278         lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;
279
280         atomic_set(&count, 0);
281         memset(laia, 0, sizeof(*laia));
282         laia->laia_count = &count;
283         set = ptlrpc_prep_set();
284         if (set == NULL)
285                 GOTO(out, rc = -ENOMEM);
286
287         down_read(&ltds->ltd_rw_sem);
288         cfs_foreach_bit(lad->lad_bitmap, idx) {
289                 ltd = lfsck_ltd2tgt(ltds, idx);
290                 if (unlikely(!ltd))
291                         continue;
292
293                 laia->laia_ltd = ltd;
294                 rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
295                                 lfsck_layout_assistant_sync_failures_interpret,
296                                 laia, LFSCK_NOTIFY);
297                 if (rc != 0) {
298                         CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to "
299                                "notify target %x for %s phase1 done: "
300                                "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
301                                ltd->ltd_index, lad->lad_name, rc);
302
303                         break;
304                 }
305
306                 atomic_inc(&count);
307         }
308         up_read(&ltds->ltd_rw_sem);
309
310         if (rc == 0 && atomic_read(&count) > 0)
311                 rc = ptlrpc_set_wait(env, set);
312
313         ptlrpc_set_destroy(set);
314
315         if (rc == 0 && atomic_read(&count) > 0)
316                 rc = -EINVAL;
317
318         GOTO(out, rc);
319
320 out:
321         if (rc != 0)
322                 /* If failed to sync failures with the OSTs, then have to
323                  * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
324                  * subsequent orphan OST-object handling. */
325                 lo->ll_flags |= LF_INCOMPLETE;
326
327         lr->lr_flags2 = lo->ll_flags;
328 }
329
330 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
331                                            struct lov_mds_md_v1 *lmm,
332                                            __u64 start, __u32 comp_id)
333 {
334         __u32 magic;
335         __u32 pattern;
336
337         magic = le32_to_cpu(lmm->lmm_magic);
338         /* If magic crashed, keep it there. Sometime later, during OST-object
339          * orphan handling, if some OST-object(s) back-point to it, it can be
340          * verified and repaired. */
341         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
342                 int rc;
343
344                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
345                         rc = -EOPNOTSUPP;
346                 else
347                         rc = -EINVAL;
348
349                 CDEBUG(D_LFSCK, "%s LOV EA magic %u for the file "DFID"\n",
350                        rc == -EINVAL ? "Unknown" : "Unsupported",
351                        magic, PFID(lfsck_dto2fid(obj)));
352
353                 return rc;
354         }
355
356         pattern = le32_to_cpu(lmm->lmm_pattern);
357
358 #if 0
359         /* XXX: DoM file verification will be supportted via LU-11081. */
360         if (lov_pattern(pattern) == LOV_PATTERN_MDT) {
361                 if (start != 0) {
362                         CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not "
363                                "the first component in the mirror %x/%llu\n",
364                                PFID(lfsck_dto2fid(obj)), comp_id, start);
365
366                         return -EINVAL;
367                 }
368         }
369 #endif
370
371         if (lov_pattern(pattern) != LOV_PATTERN_RAID0) {
372                 CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file "
373                        DFID" in the component %x\n",
374                        pattern, PFID(lfsck_dto2fid(obj)), comp_id);
375
376                 return -EOPNOTSUPP;
377         }
378
379         return 0;
380 }
381
382 static int lfsck_layout_verify_header(struct dt_object *obj,
383                                       struct lov_mds_md_v1 *lmm)
384 {
385         int rc = 0;
386
387         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1) {
388                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
389                 int i;
390                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
391
392                 if (unlikely(count == 0)) {
393                         CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid "
394                                "components count 0\n",
395                                PFID(lfsck_dto2fid(obj)));
396
397                         return -EINVAL;
398                 }
399
400                 for (i = 0; i < count && !rc; i++) {
401                         struct lov_comp_md_entry_v1 *lcme =
402                                                 &lcm->lcm_entries[i];
403                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
404                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
405                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
406
407                         if (unlikely(comp_id == LCME_ID_INVAL ||
408                                      comp_id > LCME_ID_MAX)) {
409                                 CDEBUG(D_LFSCK, "found invalid FPL ID %u "
410                                        "for the file "DFID" at idx %d\n",
411                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
412
413                                 return -EINVAL;
414                         }
415
416                         if (unlikely(start >= end ||
417                                      !lfsck_comp_extent_aligned(start) ||
418                                      (!lfsck_comp_extent_aligned(end) &&
419                                       end != LUSTRE_EOF))) {
420                                 CDEBUG(D_LFSCK, "found invalid FPL extent "
421                                        "range [%llu - %llu) for the file "
422                                        DFID" at idx %d\n",
423                                        start, end, PFID(lfsck_dto2fid(obj)), i);
424
425                                 return -EINVAL;
426                         }
427
428                         rc = lfsck_layout_verify_header_v1v3(obj,
429                                         (struct lov_mds_md_v1 *)((char *)lmm +
430                                         le32_to_cpu(lcme->lcme_offset)), start,
431                                         comp_id);
432                 }
433         } else {
434                 rc = lfsck_layout_verify_header_v1v3(obj, lmm, 1, 0);
435         }
436
437         return rc;
438 }
439
440 static int lfsck_layout_get_lovea(const struct lu_env *env,
441                                   struct dt_object *obj, struct lu_buf *buf)
442 {
443         int rc;
444         int rc1;
445
446 again:
447         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
448         if (rc == -ERANGE) {
449                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
450                 if (rc <= 0)
451                         return !rc ? -ENODATA : rc;
452
453                 lu_buf_realloc(buf, rc);
454                 if (buf->lb_buf == NULL)
455                         return -ENOMEM;
456
457                 goto again;
458         }
459
460         if (rc <= 0)
461                 return !rc ? -ENODATA : rc;
462
463         if (unlikely(buf->lb_buf == NULL)) {
464                 lu_buf_alloc(buf, rc);
465                 if (buf->lb_buf == NULL)
466                         return -ENOMEM;
467
468                 goto again;
469         }
470
471         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf);
472
473         return rc1 ? rc1 : rc;
474 }
475
/* Every rbtree node carries bitmaps of this many bytes (one page). */
#define LFSCK_RBTREE_BITMAP_SIZE        PAGE_SIZE
/* Number of bits in such a bitmap, i.e. object IDs covered by one node. */
#define LFSCK_RBTREE_BITMAP_WIDTH       (LFSCK_RBTREE_BITMAP_SIZE * 8)
/* Mask extracting the bit index inside a node from an object ID. */
#define LFSCK_RBTREE_BITMAP_MASK        (LFSCK_RBTREE_BITMAP_WIDTH - 1)
479
480 struct lfsck_rbtree_node {
481         struct rb_node   lrn_node;
482         __u64            lrn_seq;
483         __u32            lrn_first_oid;
484         atomic_t         lrn_known_count;
485         atomic_t         lrn_accessed_count;
486         void            *lrn_known_bitmap;
487         void            *lrn_accessed_bitmap;
488 };
489
490 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
491                                    __u64 seq, __u32 oid)
492 {
493         if (seq < lrn->lrn_seq)
494                 return -1;
495
496         if (seq > lrn->lrn_seq)
497                 return 1;
498
499         if (oid < lrn->lrn_first_oid)
500                 return -1;
501
502         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
503                 return 1;
504
505         return 0;
506 }
507
508 /* The caller should hold llsd->llsd_rb_lock. */
509 static struct lfsck_rbtree_node *
510 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
511                     const struct lu_fid *fid, bool *exact)
512 {
513         struct rb_node           *node  = llsd->llsd_rb_root.rb_node;
514         struct rb_node           *prev  = NULL;
515         struct lfsck_rbtree_node *lrn   = NULL;
516         int                       rc    = 0;
517
518         if (exact != NULL)
519                 *exact = true;
520
521         while (node != NULL) {
522                 prev = node;
523                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
524                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
525                 if (rc < 0)
526                         node = node->rb_left;
527                 else if (rc > 0)
528                         node = node->rb_right;
529                 else
530                         return lrn;
531         }
532
533         if (exact == NULL)
534                 return NULL;
535
536         /* If there is no exactly matched one, then to the next valid one. */
537         *exact = false;
538
539         /* The rbtree is empty. */
540         if (rc == 0)
541                 return NULL;
542
543         if (rc < 0)
544                 return lrn;
545
546         node = rb_next(prev);
547
548         /* The end of the rbtree. */
549         if (node == NULL)
550                 return NULL;
551
552         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
553
554         return lrn;
555 }
556
557 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
558                                                   const struct lu_fid *fid)
559 {
560         struct lfsck_rbtree_node *lrn;
561
562         OBD_ALLOC_PTR(lrn);
563         if (lrn == NULL)
564                 return ERR_PTR(-ENOMEM);
565
566         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
567         if (lrn->lrn_known_bitmap == NULL) {
568                 OBD_FREE_PTR(lrn);
569
570                 return ERR_PTR(-ENOMEM);
571         }
572
573         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
574         if (lrn->lrn_accessed_bitmap == NULL) {
575                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
576                 OBD_FREE_PTR(lrn);
577
578                 return ERR_PTR(-ENOMEM);
579         }
580
581         RB_CLEAR_NODE(&lrn->lrn_node);
582         lrn->lrn_seq = fid_seq(fid);
583         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
584         atomic_set(&lrn->lrn_known_count, 0);
585         atomic_set(&lrn->lrn_accessed_count, 0);
586
587         return lrn;
588 }
589
590 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
591 {
592         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
593         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
594         OBD_FREE_PTR(lrn);
595 }
596
597 /* The caller should hold lock. */
598 static struct lfsck_rbtree_node *
599 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
600                     struct lfsck_rbtree_node *lrn)
601 {
602         struct rb_node           **pos    = &llsd->llsd_rb_root.rb_node;
603         struct rb_node            *parent = NULL;
604         struct lfsck_rbtree_node  *tmp;
605         int                        rc;
606
607         while (*pos != NULL) {
608                 parent = *pos;
609                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
610                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
611                 if (rc < 0)
612                         pos = &(*pos)->rb_left;
613                 else if (rc > 0)
614                         pos = &(*pos)->rb_right;
615                 else
616                         return tmp;
617         }
618
619         rb_link_node(&lrn->lrn_node, parent, pos);
620         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
621
622         return lrn;
623 }
624
625 extern const struct dt_index_operations lfsck_orphan_index_ops;
626
627 static int lfsck_rbtree_setup(const struct lu_env *env,
628                               struct lfsck_component *com)
629 {
630         struct lu_fid                   *fid    = &lfsck_env_info(env)->lti_fid;
631         struct lfsck_instance           *lfsck  = com->lc_lfsck;
632         struct dt_device                *dev    = lfsck->li_bottom;
633         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
634         struct dt_object                *obj;
635
636         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
637         fid->f_oid = lfsck_dev_idx(lfsck);
638         fid->f_ver = 0;
639         obj = dt_locate(env, dev, fid);
640         if (IS_ERR(obj))
641                 RETURN(PTR_ERR(obj));
642
643         /* Generate an in-RAM object to stand for the layout rbtree.
644          * Scanning the layout rbtree will be via the iteration over
645          * the object. In the future, the rbtree may be written onto
646          * disk with the object.
647          *
648          * Mark the object to be as exist. */
649         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
650         obj->do_index_ops = &lfsck_orphan_index_ops;
651         llsd->llsd_rb_obj = obj;
652         llsd->llsd_rbtree_valid = 1;
653         dev->dd_record_fid_accessed = 1;
654
655         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
656                lfsck_lfsck2name(lfsck));
657
658         return 0;
659 }
660
661 static void lfsck_rbtree_cleanup(const struct lu_env *env,
662                                  struct lfsck_component *com)
663 {
664         struct lfsck_instance           *lfsck = com->lc_lfsck;
665         struct lfsck_layout_slave_data  *llsd  = com->lc_data;
666         struct rb_node                  *node  = rb_first(&llsd->llsd_rb_root);
667         struct rb_node                  *next;
668         struct lfsck_rbtree_node        *lrn;
669
670         lfsck->li_bottom->dd_record_fid_accessed = 0;
671         /* Invalid the rbtree, then no others will use it. */
672         write_lock(&llsd->llsd_rb_lock);
673         llsd->llsd_rbtree_valid = 0;
674         write_unlock(&llsd->llsd_rb_lock);
675
676         while (node != NULL) {
677                 next = rb_next(node);
678                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
679                 rb_erase(node, &llsd->llsd_rb_root);
680                 lfsck_rbtree_free(lrn);
681                 node = next;
682         }
683
684         if (llsd->llsd_rb_obj != NULL) {
685                 lfsck_object_put(env, llsd->llsd_rb_obj);
686                 llsd->llsd_rb_obj = NULL;
687         }
688
689         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
690                lfsck_lfsck2name(lfsck));
691 }
692
693 static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
694                                        struct lfsck_component *com,
695                                        const struct lu_fid *fid,
696                                        bool accessed)
697 {
698         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
699         struct lfsck_rbtree_node        *lrn;
700         bool                             insert = false;
701         int                              idx;
702         int                              rc     = 0;
703         ENTRY;
704
705         if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
706                 RETURN_EXIT;
707
708         if (!fid_is_idif(fid) && !fid_is_norm(fid))
709                 RETURN_EXIT;
710
711         read_lock(&llsd->llsd_rb_lock);
712         if (!llsd->llsd_rbtree_valid)
713                 GOTO(unlock, rc = 0);
714
715         lrn = lfsck_rbtree_search(llsd, fid, NULL);
716         if (lrn == NULL) {
717                 struct lfsck_rbtree_node *tmp;
718
719                 LASSERT(!insert);
720
721                 read_unlock(&llsd->llsd_rb_lock);
722                 tmp = lfsck_rbtree_new(env, fid);
723                 if (IS_ERR(tmp))
724                         GOTO(out, rc = PTR_ERR(tmp));
725
726                 insert = true;
727                 write_lock(&llsd->llsd_rb_lock);
728                 if (!llsd->llsd_rbtree_valid) {
729                         lfsck_rbtree_free(tmp);
730                         GOTO(unlock, rc = 0);
731                 }
732
733                 lrn = lfsck_rbtree_insert(llsd, tmp);
734                 if (lrn != tmp)
735                         lfsck_rbtree_free(tmp);
736         }
737
738         idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
739         /* Any accessed object must be a known object. */
740         if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
741                 atomic_inc(&lrn->lrn_known_count);
742         if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
743                 atomic_inc(&lrn->lrn_accessed_count);
744
745         GOTO(unlock, rc = 0);
746
747 unlock:
748         if (insert)
749                 write_unlock(&llsd->llsd_rb_lock);
750         else
751                 read_unlock(&llsd->llsd_rb_lock);
752 out:
753         if (rc != 0 && accessed) {
754                 struct lfsck_layout *lo = com->lc_file_ram;
755
756                 CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing "
757                        "bitmap, and will cause incorrect LFSCK OST-object "
758                        "handling, so disable it to cancel orphan handling "
759                        "for related device. rc = %d\n",
760                        lfsck_lfsck2name(com->lc_lfsck), rc);
761
762                 lo->ll_flags |= LF_INCOMPLETE;
763                 lfsck_rbtree_cleanup(env, com);
764         }
765 }
766
767 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
768                                   const struct lfsck_layout_dangling_key *src)
769 {
770         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
771         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
772         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
773 }
774
775 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
776                                   const struct lfsck_layout_dangling_key *src)
777 {
778         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
779         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
780         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
781 }
782
783 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
784                                   const struct lfsck_layout_dangling_key *src)
785 {
786         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
787         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
788         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
789 }
790
791 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
792                                   const struct lfsck_layout_dangling_key *src)
793 {
794         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
795         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
796         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
797 }
798
799 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
800                                    const struct lfsck_layout *src)
801 {
802         int i;
803
804         des->ll_magic = le32_to_cpu(src->ll_magic);
805         des->ll_status = le32_to_cpu(src->ll_status);
806         des->ll_flags = le32_to_cpu(src->ll_flags);
807         des->ll_success_count = le32_to_cpu(src->ll_success_count);
808         des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1);
809         des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2);
810         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
811         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
812         des->ll_time_last_checkpoint =
813                                 le64_to_cpu(src->ll_time_last_checkpoint);
814         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
815         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
816         des->ll_pos_first_inconsistent =
817                         le64_to_cpu(src->ll_pos_first_inconsistent);
818         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
819         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
820         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
821         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
822         for (i = 0; i < LLIT_MAX; i++)
823                 des->ll_objs_repaired[i] =
824                                 le64_to_cpu(src->ll_objs_repaired[i]);
825         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
826         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
827         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
828                        &src->ll_lldk_latest_scanned_phase2);
829 }
830
831 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
832                                    const struct lfsck_layout *src)
833 {
834         int i;
835
836         des->ll_magic = cpu_to_le32(src->ll_magic);
837         des->ll_status = cpu_to_le32(src->ll_status);
838         des->ll_flags = cpu_to_le32(src->ll_flags);
839         des->ll_success_count = cpu_to_le32(src->ll_success_count);
840         des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1);
841         des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2);
842         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
843         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
844         des->ll_time_last_checkpoint =
845                                 cpu_to_le64(src->ll_time_last_checkpoint);
846         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
847         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
848         des->ll_pos_first_inconsistent =
849                         cpu_to_le64(src->ll_pos_first_inconsistent);
850         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
851         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
852         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
853         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
854         for (i = 0; i < LLIT_MAX; i++)
855                 des->ll_objs_repaired[i] =
856                                 cpu_to_le64(src->ll_objs_repaired[i]);
857         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
858         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
859         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
860                        &src->ll_lldk_latest_scanned_phase2);
861 }
862
863 /**
864  * Load the OST bitmap from the lfsck_layout trace file.
865  *
866  * \param[in] env       pointer to the thread context
867  * \param[in] com       pointer to the lfsck component
868  *
869  * \retval              0 for success
870  * \retval              negative error number on failure or data corruption
871  */
872 static int lfsck_layout_load_bitmap(const struct lu_env *env,
873                                     struct lfsck_component *com)
874 {
875         struct dt_object                *obj    = com->lc_obj;
876         struct lfsck_assistant_data     *lad    = com->lc_data;
877         struct lfsck_layout             *lo     = com->lc_file_ram;
878         struct cfs_bitmap                       *bitmap = lad->lad_bitmap;
879         loff_t                           pos    = com->lc_file_size;
880         ssize_t                          size;
881         __u32                            nbits;
882         int                              rc;
883         ENTRY;
884
885         if (com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size >
886             lo->ll_bitmap_size)
887                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size;
888         else
889                 nbits = lo->ll_bitmap_size;
890
891         if (unlikely(nbits < BITS_PER_LONG))
892                 nbits = BITS_PER_LONG;
893
894         if (nbits > bitmap->size) {
895                 __u32 new_bits = bitmap->size;
896                 struct cfs_bitmap *new_bitmap;
897
898                 while (new_bits < nbits)
899                         new_bits <<= 1;
900
901                 new_bitmap = CFS_ALLOCATE_BITMAP(new_bits);
902                 if (new_bitmap == NULL)
903                         RETURN(-ENOMEM);
904
905                 lad->lad_bitmap = new_bitmap;
906                 CFS_FREE_BITMAP(bitmap);
907                 bitmap = new_bitmap;
908         }
909
910         if (lo->ll_bitmap_size == 0) {
911                 lad->lad_incomplete = 0;
912                 CFS_RESET_BITMAP(bitmap);
913
914                 RETURN(0);
915         }
916
917         size = (lo->ll_bitmap_size + 7) >> 3;
918         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap->data, size), &pos);
919         if (rc != size)
920                 RETURN(rc >= 0 ? -EINVAL : rc);
921
922         if (cfs_bitmap_check_empty(bitmap))
923                 lad->lad_incomplete = 0;
924         else
925                 lad->lad_incomplete = 1;
926
927         RETURN(0);
928 }
929
930 /**
931  * Load the layout LFSCK trace file from disk.
932  *
933  * The layout LFSCK trace file records the layout LFSCK status information
934  * and other statistics, such as how many objects have been scanned, and how
935  * many objects have been repaired, and etc. It also contains the bitmap for
936  * failed OSTs during the layout LFSCK. All these information will be loaded
937  * from disk to RAM when the layout LFSCK component setup.
938  *
939  * \param[in] env       pointer to the thread context
940  * \param[in] com       pointer to the lfsck component
941  *
942  * \retval              positive number for file data corruption, the caller
943  *                      should reset the layout LFSCK trace file
944  * \retval              0 for success
945  * \retval              negative error number on failure
946  */
947 static int lfsck_layout_load(const struct lu_env *env,
948                              struct lfsck_component *com)
949 {
950         struct lfsck_layout             *lo     = com->lc_file_ram;
951         ssize_t                          size   = com->lc_file_size;
952         loff_t                           pos    = 0;
953         int                              rc;
954
955         rc = dt_read(env, com->lc_obj,
956                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
957         if (rc == 0) {
958                 return -ENOENT;
959         } else if (rc < 0) {
960                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
961                        lfsck_lfsck2name(com->lc_lfsck), rc);
962                 return rc;
963         } else if (rc != size) {
964                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
965                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
966                 return 1;
967         }
968
969         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
970         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
971                 CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, "
972                        "to be reset\n", lfsck_lfsck2name(com->lc_lfsck),
973                        lo->ll_magic, LFSCK_LAYOUT_MAGIC);
974                 return 1;
975         }
976
977         return 0;
978 }
979
980 /**
981  * Store the layout LFSCK trace file on disk.
982  *
983  * The layout LFSCK trace file records the layout LFSCK status information
984  * and other statistics, such as how many objects have been scanned, and how
985  * many objects have been repaired, and etc. It also contains the bitmap for
986  * failed OSTs during the layout LFSCK. All these information will be synced
987  * from RAM to disk periodically.
988  *
989  * \param[in] env       pointer to the thread context
990  * \param[in] com       pointer to the lfsck component
991  *
992  * \retval              0 for success
993  * \retval              negative error number on failure
994  */
995 static int lfsck_layout_store(const struct lu_env *env,
996                               struct lfsck_component *com)
997 {
998         struct dt_object        *obj    = com->lc_obj;
999         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1000         struct lfsck_layout     *lo_ram = com->lc_file_ram;
1001         struct lfsck_layout     *lo     = com->lc_file_disk;
1002         struct thandle          *th;
1003         struct dt_device        *dev    = lfsck_obj2dev(obj);
1004         struct cfs_bitmap       *bitmap = NULL;
1005         loff_t                   pos;
1006         ssize_t                  size   = com->lc_file_size;
1007         __u32                    nbits  = 0;
1008         int                      rc;
1009         ENTRY;
1010
1011         if (lfsck->li_master) {
1012                 struct lfsck_assistant_data *lad = com->lc_data;
1013
1014                 bitmap = lad->lad_bitmap;
1015                 nbits = bitmap->size;
1016
1017                 LASSERT(nbits > 0);
1018                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1019         }
1020
1021         lo_ram->ll_bitmap_size = nbits;
1022         lfsck_layout_cpu_to_le(lo, lo_ram);
1023         th = dt_trans_create(env, dev);
1024         if (IS_ERR(th))
1025                 GOTO(log, rc = PTR_ERR(th));
1026
1027         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1028                                      (loff_t)0, th);
1029         if (rc != 0)
1030                 GOTO(out, rc);
1031
1032         if (bitmap != NULL) {
1033                 rc = dt_declare_record_write(env, obj,
1034                                 lfsck_buf_get(env, bitmap->data, nbits >> 3),
1035                                 (loff_t)size, th);
1036                 if (rc != 0)
1037                         GOTO(out, rc);
1038         }
1039
1040         rc = dt_trans_start_local(env, dev, th);
1041         if (rc != 0)
1042                 GOTO(out, rc);
1043
1044         pos = 0;
1045         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1046         if (rc != 0)
1047                 GOTO(out, rc);
1048
1049         if (bitmap != NULL) {
1050                 pos = size;
1051                 rc = dt_record_write(env, obj,
1052                                 lfsck_buf_get(env, bitmap->data, nbits >> 3),
1053                                 &pos, th);
1054         }
1055
1056         GOTO(out, rc);
1057
1058 out:
1059         dt_trans_stop(env, dev, th);
1060
1061 log:
1062         if (rc != 0)
1063                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1064                        lfsck_lfsck2name(lfsck), rc);
1065
1066         return rc;
1067 }
1068
1069 static int lfsck_layout_init(const struct lu_env *env,
1070                              struct lfsck_component *com)
1071 {
1072         struct lfsck_layout *lo = com->lc_file_ram;
1073         int rc;
1074
1075         memset(lo, 0, com->lc_file_size);
1076         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1077         lo->ll_status = LS_INIT;
1078         down_write(&com->lc_sem);
1079         rc = lfsck_layout_store(env, com);
1080         if (rc == 0 && com->lc_lfsck->li_master)
1081                 rc = lfsck_load_sub_trace_files(env, com,
1082                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1083         up_write(&com->lc_sem);
1084
1085         return rc;
1086 }
1087
1088 static int fid_is_for_ostobj(const struct lu_env *env,
1089                              struct lfsck_instance *lfsck,
1090                              struct dt_object *obj, const struct lu_fid *fid)
1091 {
1092         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1093         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1094         struct lustre_ost_attrs *loa;
1095         int                      rc;
1096
1097         fld_range_set_any(range);
1098         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1099         if (rc == 0) {
1100                 if (fld_range_is_ost(range))
1101                         return 1;
1102
1103                 return 0;
1104         }
1105
1106         loa = &lfsck_env_info(env)->lti_loa;
1107         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1108                           XATTR_NAME_LMA);
1109         if (rc >= sizeof(struct lustre_mdt_attrs)) {
1110                 lustre_lma_swab(&loa->loa_lma);
1111
1112                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1113         }
1114
1115         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1116
1117         return rc > 0;
1118 }
1119
1120 static struct lfsck_layout_seq *
1121 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1122 {
1123         struct lfsck_layout_seq *lls;
1124
1125         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1126                 if (lls->lls_seq == seq)
1127                         return lls;
1128
1129                 if (lls->lls_seq > seq)
1130                         return NULL;
1131         }
1132
1133         return NULL;
1134 }
1135
1136 static void
1137 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1138                         struct lfsck_layout_seq *lls)
1139 {
1140         struct lfsck_layout_seq *tmp;
1141         struct list_head        *pos = &llsd->llsd_seq_list;
1142
1143         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1144                 if (lls->lls_seq < tmp->lls_seq) {
1145                         pos = &tmp->lls_list;
1146                         break;
1147                 }
1148         }
1149         list_add_tail(&lls->lls_list, pos);
1150 }
1151
1152 static int
1153 lfsck_layout_lastid_create(const struct lu_env *env,
1154                            struct lfsck_instance *lfsck,
1155                            struct dt_object *obj)
1156 {
1157         struct lfsck_thread_info *info   = lfsck_env_info(env);
1158         struct lu_attr           *la     = &info->lti_la;
1159         struct dt_object_format  *dof    = &info->lti_dof;
1160         struct lfsck_bookmark    *bk     = &lfsck->li_bookmark_ram;
1161         struct dt_device         *dt     = lfsck_obj2dev(obj);
1162         struct thandle           *th;
1163         __u64                     lastid = 0;
1164         loff_t                    pos    = 0;
1165         int                       rc;
1166         ENTRY;
1167
1168         if (bk->lb_param & LPF_DRYRUN)
1169                 return 0;
1170
1171         memset(la, 0, sizeof(*la));
1172         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1173         la->la_valid = LA_MODE | LA_UID | LA_GID;
1174         memset(dof, 0, sizeof(*dof));
1175         dof->dof_type = dt_mode_to_dft(S_IFREG);
1176
1177         th = dt_trans_create(env, dt);
1178         if (IS_ERR(th))
1179                 GOTO(log, rc = PTR_ERR(th));
1180
1181         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1182         if (rc != 0)
1183                 GOTO(stop, rc);
1184
1185         rc = dt_declare_record_write(env, obj,
1186                                      lfsck_buf_get(env, &lastid,
1187                                                    sizeof(lastid)),
1188                                      pos, th);
1189         if (rc != 0)
1190                 GOTO(stop, rc);
1191
1192         rc = dt_trans_start_local(env, dt, th);
1193         if (rc != 0)
1194                 GOTO(stop, rc);
1195
1196         dt_write_lock(env, obj, 0);
1197         if (likely(dt_object_exists(obj) == 0)) {
1198                 rc = dt_create(env, obj, la, NULL, dof, th);
1199                 if (rc == 0)
1200                         rc = dt_record_write(env, obj,
1201                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1202                                 &pos, th);
1203         }
1204         dt_write_unlock(env, obj);
1205
1206         GOTO(stop, rc);
1207
1208 stop:
1209         dt_trans_stop(env, dt, th);
1210
1211 log:
1212         CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for <seq> "
1213                "%#llx: rc = %d\n",
1214                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1215
1216         return rc;
1217 }
1218
1219 static int
1220 lfsck_layout_lastid_reload(const struct lu_env *env,
1221                            struct lfsck_component *com,
1222                            struct lfsck_layout_seq *lls)
1223 {
1224         __u64   lastid;
1225         loff_t  pos     = 0;
1226         int     rc;
1227
1228         dt_read_lock(env, lls->lls_lastid_obj, 0);
1229         rc = dt_record_read(env, lls->lls_lastid_obj,
1230                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1231         dt_read_unlock(env, lls->lls_lastid_obj);
1232         if (unlikely(rc != 0))
1233                 return rc;
1234
1235         lastid = le64_to_cpu(lastid);
1236         if (lastid < lls->lls_lastid_known) {
1237                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1238                 struct lfsck_layout     *lo     = com->lc_file_ram;
1239
1240                 lls->lls_lastid = lls->lls_lastid_known;
1241                 lls->lls_dirty = 1;
1242                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1243                         LASSERT(lfsck->li_out_notify != NULL);
1244
1245                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1246                                              LE_LASTID_REBUILDING);
1247                         lo->ll_flags |= LF_CRASHED_LASTID;
1248
1249                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
1250                                "LAST_ID file (1) for the sequence %#llx"
1251                                ", old value %llu, known value %llu\n",
1252                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1253                                lastid, lls->lls_lastid);
1254                 }
1255         } else if (lastid >= lls->lls_lastid) {
1256                 lls->lls_lastid = lastid;
1257                 lls->lls_dirty = 0;
1258         }
1259
1260         return 0;
1261 }
1262
1263 static int
1264 lfsck_layout_lastid_store(const struct lu_env *env,
1265                           struct lfsck_component *com)
1266 {
1267         struct lfsck_instance           *lfsck  = com->lc_lfsck;
1268         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
1269         struct dt_device                *dt     = lfsck->li_bottom;
1270         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
1271         struct lfsck_layout_seq         *lls;
1272         struct thandle                  *th;
1273         __u64                            lastid;
1274         int                              rc     = 0;
1275         int                              rc1    = 0;
1276
1277         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1278                 loff_t pos = 0;
1279
1280                 if (!lls->lls_dirty)
1281                         continue;
1282
1283                 CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for "
1284                        "<seq> %#llx as <oid> %llu\n",
1285                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1286
1287                 if (bk->lb_param & LPF_DRYRUN) {
1288                         lls->lls_dirty = 0;
1289                         continue;
1290                 }
1291
1292                 th = dt_trans_create(env, dt);
1293                 if (IS_ERR(th)) {
1294                         rc1 = PTR_ERR(th);
1295                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1296                                "the LAST_ID for <seq> %#llx(1): rc = %d\n",
1297                                lfsck_lfsck2name(com->lc_lfsck),
1298                                lls->lls_seq, rc1);
1299                         continue;
1300                 }
1301
1302                 lastid = cpu_to_le64(lls->lls_lastid);
1303                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1304                                              lfsck_buf_get(env, &lastid,
1305                                                            sizeof(lastid)),
1306                                              pos, th);
1307                 if (rc != 0)
1308                         goto stop;
1309
1310                 rc = dt_trans_start_local(env, dt, th);
1311                 if (rc != 0)
1312                         goto stop;
1313
1314                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1315                 rc = dt_record_write(env, lls->lls_lastid_obj,
1316                                      lfsck_buf_get(env, &lastid,
1317                                      sizeof(lastid)), &pos, th);
1318                 dt_write_unlock(env, lls->lls_lastid_obj);
1319                 if (rc == 0)
1320                         lls->lls_dirty = 0;
1321
1322 stop:
1323                 dt_trans_stop(env, dt, th);
1324                 if (rc != 0) {
1325                         rc1 = rc;
1326                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1327                                "the LAST_ID for <seq> %#llx(2): rc = %d\n",
1328                                lfsck_lfsck2name(com->lc_lfsck),
1329                                lls->lls_seq, rc1);
1330                 }
1331         }
1332
1333         return rc1;
1334 }
1335
1336 static int
1337 lfsck_layout_lastid_load(const struct lu_env *env,
1338                          struct lfsck_component *com,
1339                          struct lfsck_layout_seq *lls)
1340 {
1341         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1342         struct lfsck_layout     *lo     = com->lc_file_ram;
1343         struct lu_fid           *fid    = &lfsck_env_info(env)->lti_fid;
1344         struct dt_object        *obj;
1345         loff_t                   pos    = 0;
1346         int                      rc;
1347         ENTRY;
1348
1349         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1350         obj = dt_locate(env, lfsck->li_bottom, fid);
1351         if (IS_ERR(obj))
1352                 RETURN(PTR_ERR(obj));
1353
1354         /* LAST_ID crashed, to be rebuilt */
1355         if (dt_object_exists(obj) == 0) {
1356                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1357                         LASSERT(lfsck->li_out_notify != NULL);
1358
1359                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1360                                              LE_LASTID_REBUILDING);
1361                         lo->ll_flags |= LF_CRASHED_LASTID;
1362
1363                         CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the "
1364                                "LAST_ID file for sequence %#llx\n",
1365                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1366
1367                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1368                             cfs_fail_val > 0) {
1369                                 struct l_wait_info lwi = LWI_TIMEOUT(
1370                                                 cfs_time_seconds(cfs_fail_val),
1371                                                 NULL, NULL);
1372
1373                                 /* Some others may changed the cfs_fail_val
1374                                  * as zero after above check, re-check it for
1375                                  * sure to avoid falling into wait for ever. */
1376                                 if (likely(lwi.lwi_timeout > 0)) {
1377                                         struct ptlrpc_thread *thread =
1378                                                 &lfsck->li_thread;
1379
1380                                         up_write(&com->lc_sem);
1381                                         l_wait_event(thread->t_ctl_waitq,
1382                                                      !thread_is_running(thread),
1383                                                      &lwi);
1384                                         down_write(&com->lc_sem);
1385                                 }
1386                         }
1387                 }
1388
1389                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1390         } else {
1391                 dt_read_lock(env, obj, 0);
1392                 rc = dt_read(env, obj,
1393                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1394                         &pos);
1395                 dt_read_unlock(env, obj);
1396                 if (rc != 0 && rc != sizeof(__u64))
1397                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1398
1399                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1400                         LASSERT(lfsck->li_out_notify != NULL);
1401
1402                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1403                                              LE_LASTID_REBUILDING);
1404                         lo->ll_flags |= LF_CRASHED_LASTID;
1405
1406                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid "
1407                                "LAST_ID file for the sequence %#llx"
1408                                ": rc = %d\n",
1409                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1410                 }
1411
1412                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1413                 rc = 0;
1414         }
1415
1416         GOTO(out, rc);
1417
1418 out:
1419         if (rc != 0)
1420                 lfsck_object_put(env, obj);
1421         else
1422                 lls->lls_lastid_obj = obj;
1423
1424         return rc;
1425 }
1426
1427 static void lfsck_layout_record_failure(const struct lu_env *env,
1428                                         struct lfsck_instance *lfsck,
1429                                         struct lfsck_layout *lo)
1430 {
1431         __u64 cookie;
1432
1433         lo->ll_objs_failed_phase1++;
1434         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1435                                                         lfsck->li_di_oit);
1436         if (lo->ll_pos_first_inconsistent == 0 ||
1437             lo->ll_pos_first_inconsistent < cookie) {
1438                 lo->ll_pos_first_inconsistent = cookie;
1439
1440                 CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired "
1441                        "inconsistency at the pos [%llu]\n",
1442                        lfsck_lfsck2name(lfsck),
1443                        lo->ll_pos_first_inconsistent);
1444         }
1445 }
1446
1447 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1448                                            struct lfsck_component *com,
1449                                            int rc)
1450 {
1451         struct lfsck_instance   *lfsck = com->lc_lfsck;
1452         struct lfsck_layout     *lo    = com->lc_file_ram;
1453
1454         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1455                lfsck_lfsck2name(lfsck), rc);
1456
1457         down_write(&com->lc_sem);
1458         lo->ll_run_time_phase2 += ktime_get_seconds() -
1459                                   com->lc_time_last_checkpoint;
1460         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
1461         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1462
1463         if (rc > 0) {
1464                 if (lo->ll_flags & LF_INCOMPLETE) {
1465                         lo->ll_status = LS_PARTIAL;
1466                 } else {
1467                         if (lfsck->li_master) {
1468                                 struct lfsck_assistant_data *lad = com->lc_data;
1469
1470                                 if (lad->lad_incomplete)
1471                                         lo->ll_status = LS_PARTIAL;
1472                                 else
1473                                         lo->ll_status = LS_COMPLETED;
1474                         } else {
1475                                 lo->ll_status = LS_COMPLETED;
1476                         }
1477                 }
1478                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1479                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1480                         lo->ll_flags &= ~LF_INCONSISTENT;
1481                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1482                 lo->ll_success_count++;
1483         } else if (rc == 0) {
1484                 if (lfsck->li_status != 0)
1485                         lo->ll_status = lfsck->li_status;
1486                 else
1487                         lo->ll_status = LS_STOPPED;
1488         } else {
1489                 lo->ll_status = LS_FAILED;
1490         }
1491
1492         rc = lfsck_layout_store(env, com);
1493         up_write(&com->lc_sem);
1494
1495         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1496                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1497
1498         return rc;
1499 }
1500
1501 static int lfsck_layout_trans_stop(const struct lu_env *env,
1502                                    struct dt_device *dev,
1503                                    struct thandle *handle, int result)
1504 {
1505         int rc;
1506
1507         /* XXX: If there is something worng or it needs to repair nothing,
1508          *      then notify the lower to stop the modification. Currently,
1509          *      we use th_result for such purpose, that may be replaced by
1510          *      some rollback mechanism in the future. */
1511         handle->th_result = result;
1512         rc = dt_trans_stop(env, dev, handle);
1513         if (result != 0)
1514                 return result > 0 ? 0 : result;
1515
1516         return rc == 0 ? 1 : rc;
1517 }
1518
1519 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1520                                          struct lfsck_component *com,
1521                                          const struct lu_fid *pfid,
1522                                          const struct lu_fid *cfid,
1523                                          __u32 comp_id, __u32 ea_off,
1524                                          __u32 ost_idx)
1525 {
1526         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1527         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1528         struct dt_device *dev;
1529         struct dt_object *obj;
1530         struct thandle *th = NULL;
1531         int idx;
1532         int rc = 0;
1533         ENTRY;
1534
1535         idx = lfsck_sub_trace_file_fid2idx(pfid);
1536         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1537         dev = lfsck_obj2dev(obj);
1538
1539         fid_cpu_to_be(&key->lldk_fid, pfid);
1540         key->lldk_comp_id = cpu_to_be32(comp_id);
1541         key->lldk_ea_off = cpu_to_be32(ea_off);
1542
1543         fid_cpu_to_be(rec, cfid);
1544         rec->f_ver = cpu_to_be32(ost_idx);
1545
1546         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1547
1548         th = dt_trans_create(env, dev);
1549         if (IS_ERR(th))
1550                 GOTO(unlock, rc = PTR_ERR(th));
1551
1552         rc = dt_declare_insert(env, obj,
1553                                (const struct dt_rec *)rec,
1554                                (const struct dt_key *)key, th);
1555         if (rc)
1556                 GOTO(unlock, rc);
1557
1558         rc = dt_trans_start_local(env, dev, th);
1559         if (rc)
1560                 GOTO(unlock, rc);
1561
1562         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1563                        (const struct dt_key *)key, th, 1);
1564
1565         GOTO(unlock, rc);
1566
1567 unlock:
1568         if (th && !IS_ERR(th))
1569                 dt_trans_stop(env, dev, th);
1570
1571         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1572
1573         CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, "
1574                "ea_off = %u, ost_idx = %u, into the trace file for further "
1575                "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
1576                PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc);
1577
1578         return rc;
1579 }
1580
1581 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1582                                          struct lfsck_component *com,
1583                                          const struct lu_fid *fid,
1584                                          __u32 comp_id, __u32 ea_off)
1585 {
1586         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1587         struct dt_device *dev;
1588         struct dt_object *obj;
1589         struct thandle *th = NULL;
1590         int idx;
1591         int rc = 0;
1592         ENTRY;
1593
1594         idx = lfsck_sub_trace_file_fid2idx(fid);
1595         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1596         dev = lfsck_obj2dev(obj);
1597
1598         fid_cpu_to_be(&key->lldk_fid, fid);
1599         key->lldk_comp_id = cpu_to_be32(comp_id);
1600         key->lldk_ea_off = cpu_to_be32(ea_off);
1601
1602         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1603
1604         th = dt_trans_create(env, dev);
1605         if (IS_ERR(th))
1606                 GOTO(unlock, rc = PTR_ERR(th));
1607
1608         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1609         if (rc)
1610                 GOTO(unlock, rc);
1611
1612         rc = dt_trans_start_local(env, dev, th);
1613         if (rc)
1614                 GOTO(unlock, rc);
1615
1616         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1617
1618         GOTO(unlock, rc);
1619
1620 unlock:
1621         if (th && !IS_ERR(th))
1622                 dt_trans_stop(env, dev, th);
1623
1624         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1625
1626         CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID
1627                ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1628                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1629
1630         return rc;
1631 }
1632
1633 /**
1634  * Get the system default stripe size.
1635  *
1636  * \param[in] env       pointer to the thread context
1637  * \param[in] lfsck     pointer to the lfsck instance
1638  * \param[out] size     pointer to the default stripe size
1639  *
1640  * \retval              0 for success
1641  * \retval              negative error number on failure
1642  */
1643 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1644                                            struct lfsck_instance *lfsck,
1645                                            __u32 *size)
1646 {
1647         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1648         struct dt_object        *root;
1649         int                      rc;
1650
1651         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1652         if (IS_ERR(root))
1653                 return PTR_ERR(root);
1654
1655         /* Get the default stripe size via xattr_get on the backend root. */
1656         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1657                           XATTR_NAME_LOV);
1658         if (rc > 0) {
1659                 /* The lum->lmm_stripe_size is LE mode. The *size also
1660                  * should be LE mode. So it is unnecessary to convert. */
1661                 *size = lum->lmm_stripe_size;
1662                 rc = 0;
1663         } else if (unlikely(rc == 0)) {
1664                 rc = -EINVAL;
1665         }
1666
1667         lfsck_object_put(env, root);
1668
1669         return rc;
1670 }
1671
1672 /**
1673  * \retval       +1: repaired
1674  * \retval        0: did nothing
1675  * \retval      -ve: on error
1676  */
1677 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1678                                      struct lfsck_instance *lfsck,
1679                                      struct thandle *handle,
1680                                      struct dt_object *parent,
1681                                      const struct lu_fid *cfid,
1682                                      struct lu_buf *buf,
1683                                      struct lov_mds_md_v1 *lmm,
1684                                      struct lov_ost_data_v1 *slot,
1685                                      int fl, __u32 ost_idx, int size)
1686 {
1687         struct ost_id           *oi     = &lfsck_env_info(env)->lti_oi;
1688         struct lu_buf            ea_buf;
1689         int                      rc;
1690         __u32                    magic;
1691         __u32                    pattern;
1692         __u16                    count;
1693         ENTRY;
1694
1695         magic = le32_to_cpu(lmm->lmm_magic);
1696         pattern = le32_to_cpu(lmm->lmm_pattern);
1697         count = le16_to_cpu(lmm->lmm_stripe_count);
1698
1699         fid_to_ostid(cfid, oi);
1700         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1701         slot->l_ost_gen = cpu_to_le32(0);
1702         slot->l_ost_idx = cpu_to_le32(ost_idx);
1703
1704         if (pattern & LOV_PATTERN_F_HOLE) {
1705                 struct lov_ost_data_v1 *objs;
1706                 int                     i;
1707
1708                 if (magic == LOV_MAGIC_V1)
1709                         objs = &lmm->lmm_objects[0];
1710                 else
1711                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1712                 for (i = 0; i < count; i++, objs++) {
1713                         if (lovea_slot_is_dummy(objs))
1714                                 break;
1715                 }
1716
1717                 /* If the @slot is the last dummy slot to be refilled,
1718                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */
1719                 if (i == count) {
1720                         lmm->lmm_pattern =
1721                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1722
1723                         CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID
1724                                ": parent "DFID"\n", lfsck_lfsck2name(lfsck),
1725                                PFID(cfid), PFID(lfsck_dto2fid(parent)));
1726                 }
1727         }
1728
1729         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1730         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1731         if (rc == 0)
1732                 rc = 1;
1733
1734         RETURN(rc);
1735 }
1736
1737 static struct lov_ost_data_v1 *
1738 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1739                             const struct lu_fid *pfid,
1740                             __u32 stripe_size, __u32 ea_off,
1741                             __u32 pattern, __u16 count)
1742 {
1743         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1744         lmm->lmm_pattern = cpu_to_le32(pattern);
1745         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1746         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1747         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1748         lmm->lmm_stripe_count = cpu_to_le16(count);
1749         lmm->lmm_layout_gen = cpu_to_le16(1);
1750         memset(&lmm->lmm_objects[0], 0,
1751                sizeof(struct lov_ost_data_v1) * count);
1752
1753         return &lmm->lmm_objects[ea_off];
1754 }
1755
1756 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1757                                      struct lfsck_instance *lfsck,
1758                                      struct ost_layout *ol,
1759                                      struct dt_object *parent,
1760                                      struct lu_buf *buf, __u32 ea_off,
1761                                      struct lov_mds_md_v1 **lmm,
1762                                      struct lov_ost_data_v1 **objs)
1763 {
1764         int size;
1765         __u32 stripe_size = ol->ol_stripe_size;
1766         __u32 pattern = LOV_PATTERN_RAID0;
1767         __u16 count;
1768
1769         if (ol->ol_stripe_count != 0)
1770                 count = ol->ol_stripe_count;
1771         else
1772                 count = ea_off + 1;
1773
1774         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1775         LASSERTF(buf->lb_len >= size,
1776                  "buffer len %d is less than real size %d\n",
1777                  (int)buf->lb_len, size);
1778
1779         if (stripe_size == 0) {
1780                 int rc;
1781
1782                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1783                 if (rc)
1784                         return rc;
1785         }
1786
1787         *lmm = buf->lb_buf;
1788         if (ol->ol_stripe_count > 1 ||
1789             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1790                 pattern |= LOV_PATTERN_F_HOLE;
1791                 memset(&(*lmm)->lmm_objects[0], 0,
1792                        count * sizeof(struct lov_ost_data_v1));
1793         }
1794
1795         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1796                                 stripe_size, ea_off, pattern, count);
1797
1798         return size;
1799 }
1800
1801 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1802                                        struct lu_orphan_rec_v3 *rec,
1803                                        struct dt_object *parent,
1804                                        struct lu_buf *buf, __u32 ea_off,
1805                                        struct lov_mds_md_v1 **lmm,
1806                                        struct lov_ost_data_v1 **objs)
1807 {
1808         struct ost_layout *ol = &rec->lor_layout;
1809         struct lov_comp_md_v1 *lcm;
1810         struct lov_comp_md_entry_v1 *lcme;
1811         __u32 pattern = LOV_PATTERN_RAID0;
1812         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1813         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1814         int size = offset + lcme_size;
1815
1816         LASSERTF(buf->lb_len >= size,
1817                  "buffer len %d is less than real size %d\n",
1818                  (int)buf->lb_len, size);
1819
1820         lcm = buf->lb_buf;
1821         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1822         lcm->lcm_size = cpu_to_le32(size);
1823         if (rec->lor_range) {
1824                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1825                                                   rec->lor_range);
1826                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1827         } else if (rec->lor_layout_version) {
1828                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1829                                                   rec->lor_range);
1830                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1831         } else {
1832                 lcm->lcm_layout_gen = cpu_to_le32(1);
1833                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1834         }
1835         lcm->lcm_entry_count = cpu_to_le16(1);
1836         /* Currently, we do not know how many mirrors will be, set it as zero
1837          * at the beginning. It will be updated when more mirrors are found. */
1838         lcm->lcm_mirror_count = 0;
1839
1840         lcme = &lcm->lcm_entries[0];
1841         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1842         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1843         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1844         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1845         lcme->lcme_offset = cpu_to_le32(offset);
1846         lcme->lcme_size = cpu_to_le32(lcme_size);
1847         lcme->lcme_layout_gen = lcm->lcm_layout_gen;
1848         if (ol->ol_stripe_count > 1)
1849                 pattern |= LOV_PATTERN_F_HOLE;
1850
1851         *lmm = buf->lb_buf + offset;
1852         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1853                                             ol->ol_stripe_size, ea_off,
1854                                             pattern, ol->ol_stripe_count);
1855
1856         return size;
1857 }
1858
1859 static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm,
1860                                     struct lov_comp_md_entry_v1 *lcme,
1861                                     __u32 version, __u32 range)
1862 {
1863         struct lov_comp_md_entry_v1 *tmp;
1864         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
1865         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
1866         __u32 gen = version + range;
1867         __u32 tmp_gen;
1868         int i;
1869         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1870         __u16 flags = le16_to_cpu(lcm->lcm_flags);
1871
1872         if (!gen)
1873                 gen = 1;
1874         lcme->lcme_layout_gen = cpu_to_le32(gen);
1875         if (le32_to_cpu(lcm->lcm_layout_gen) < gen)
1876                 lcm->lcm_layout_gen = cpu_to_le32(gen);
1877
1878         if (range)
1879                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1880         else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0)
1881                 lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
1882
1883         for (i = 0; i < count; i++) {
1884                 tmp = &lcm->lcm_entries[i];
1885                 if (le64_to_cpu(tmp->lcme_extent.e_end) <= start)
1886                         continue;
1887
1888                 if (le64_to_cpu(tmp->lcme_extent.e_start) >= end)
1889                         continue;
1890
1891                 if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE)
1892                         continue;
1893
1894                 tmp_gen = le32_to_cpu(tmp->lcme_layout_gen);
1895                 /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag,
1896                  * then it should be the latest version of all mirrors. */
1897                 if (tmp_gen == 0 || tmp_gen > gen) {
1898                         lcme->lcme_flags = cpu_to_le32(
1899                                 le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE);
1900                         break;
1901                 }
1902
1903                 if (tmp_gen < gen)
1904                         tmp->lcme_flags = cpu_to_le32(
1905                                 le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE);
1906         }
1907 }
1908
1909 static int lfsck_layout_add_comp(const struct lu_env *env,
1910                                  struct lfsck_instance *lfsck,
1911                                  struct thandle *handle,
1912                                  struct lu_orphan_rec_v3 *rec,
1913                                  struct dt_object *parent,
1914                                  const struct lu_fid *cfid,
1915                                  struct lu_buf *buf, __u32 ost_idx,
1916                                  __u32 ea_off, int pos, bool new_mirror)
1917 {
1918         struct ost_layout *ol = &rec->lor_layout;
1919         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1920         struct lov_comp_md_entry_v1 *lcme;
1921         struct lov_mds_md_v1 *lmm;
1922         struct lov_ost_data_v1 *objs;
1923         int added = sizeof(*lcme) +
1924                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1925         int size = le32_to_cpu(lcm->lcm_size) + added;
1926         int rc;
1927         int i;
1928         __u32 offset;
1929         __u32 pattern = LOV_PATTERN_RAID0;
1930         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1931         ENTRY;
1932
1933         lu_buf_check_and_grow(buf, size);
1934         /* set the lcm again because lu_buf_check_and_grow() may
1935          * have reallocated the buf. */
1936         lcm = buf->lb_buf;
1937         lcm->lcm_size = cpu_to_le32(size);
1938         lcm->lcm_entry_count = cpu_to_le16(count + 1);
1939         if (new_mirror)
1940                 le16_add_cpu(&lcm->lcm_mirror_count, 1);
1941
1942         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
1943          *    with distance of 'added'. */
1944         if (pos < count) {
1945                 size = 0;
1946                 for (i = pos; i < count; i++) {
1947                         lcme = &lcm->lcm_entries[i];
1948                         size += le32_to_cpu(lcme->lcme_size);
1949                 }
1950
1951                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
1952                 memmove(buf->lb_buf + offset + added,
1953                         buf->lb_buf + offset, size);
1954         }
1955
1956         size = 0;
1957         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
1958          *    of 'sizeof(struct lov_comp_md_entry_v1)' */
1959         if (pos > 0) {
1960                 for (i = 0; i < pos; i++) {
1961                         lcme = &lcm->lcm_entries[i];
1962                         size += le32_to_cpu(lcme->lcme_size);
1963                 }
1964
1965                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
1966                 memmove(buf->lb_buf + offset + sizeof(*lcme),
1967                         buf->lb_buf + offset, size);
1968         }
1969
1970         /* 3. Recalculate the enter offset for the component [pos, count-1] */
1971         for (i = count - 1; i >= pos; i--) {
1972                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
1973                 lcm->lcm_entries[i + 1].lcme_offset =
1974                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
1975                                                 lcme_offset) + added);
1976         }
1977
1978         /* 4. Recalculate the enter offset for the component [0, pos) */
1979         for (i = 0; i < pos; i++) {
1980                 lcm->lcm_entries[i].lcme_offset =
1981                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
1982                                                 lcme_offset) + sizeof(*lcme));
1983         }
1984
1985         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
1986         /* 4. Insert the new component header (entry) at the slot 'pos'. */
1987         lcme = &lcm->lcm_entries[pos];
1988         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1989         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1990         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1991         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1992         lcme->lcme_offset = cpu_to_le32(offset);
1993         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
1994                                                       LOV_MAGIC_V1));
1995
1996         if (ol->ol_stripe_count > 1)
1997                 pattern |= LOV_PATTERN_F_HOLE;
1998
1999         lmm = buf->lb_buf + offset;
2000         /* 5. Insert teh new component body at the 'offset'. */
2001         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
2002                                            ol->ol_stripe_size, ea_off,
2003                                            pattern, ol->ol_stripe_count);
2004
2005         /* 6. Update mirror related flags and version. */
2006         lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version,
2007                                 rec->lor_range);
2008
2009         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2010                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
2011                                        le32_to_cpu(lcm->lcm_size));
2012
2013         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for "
2014                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2015                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2016                "comp_end %llu, layout version %u, range %u, "
2017                "%s LOV EA hole: rc = %d\n",
2018                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2019                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2020                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2021                rec->lor_layout_version, rec->lor_range,
2022                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2023                "with" : "without", rc);
2024
2025         RETURN(rc);
2026 }
2027
2028 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
2029                                           struct lfsck_instance *lfsck,
2030                                           struct thandle *handle,
2031                                           struct ost_layout *ol,
2032                                           struct dt_object *parent,
2033                                           const struct lu_fid *cfid,
2034                                           struct lu_buf *buf, __u32 ost_idx,
2035                                           __u32 ea_off)
2036 {
2037         struct lov_mds_md_v1 *lmm = buf->lb_buf;
2038         struct lov_ost_data_v1 *objs;
2039         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
2040         __u32 magic = le32_to_cpu(lmm->lmm_magic);
2041         int size;
2042         int gap;
2043         int rc;
2044         ENTRY;
2045
2046         /* The original LOVEA maybe re-generated via old filter_fid, at
2047          * that time, we do not know the stripe count and stripe size. */
2048         if (ol->ol_stripe_count > count)
2049                 count = ol->ol_stripe_count;
2050         if (ol->ol_stripe_size != 0 &&
2051             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
2052                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
2053
2054         if (magic == LOV_MAGIC_V1)
2055                 objs = &lmm->lmm_objects[count];
2056         else
2057                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
2058
2059         gap = ea_off - count;
2060         if (gap >= 0)
2061                 count = ea_off + 1;
2062
2063         size = lov_mds_md_size(count, magic);
2064         LASSERTF(buf->lb_len >= size,
2065                  "buffer len %d is less than real size %d\n",
2066                  (int)buf->lb_len, size);
2067
2068         if (gap > 0) {
2069                 memset(objs, 0, gap * sizeof(*objs));
2070                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
2071         }
2072
2073         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2074         lmm->lmm_stripe_count = cpu_to_le16(count);
2075         objs += gap;
2076
2077         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2078                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
2079
2080         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for "
2081                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2082                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2083                "comp_end %llu, %s LOV EA hole: rc = %d\n",
2084                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2085                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2086                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2087                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2088                "with" : "without", rc);
2089
2090         RETURN(rc);
2091 }
2092
2093 /**
2094  * \retval       +1: repaired
2095  * \retval        0: did nothing
2096  * \retval      -ve: on error
2097  */
2098 static int lfsck_layout_update_lovea(const struct lu_env *env,
2099                                      struct lfsck_instance *lfsck,
2100                                      struct thandle *handle,
2101                                      struct lu_orphan_rec_v3 *rec,
2102                                      struct dt_object *parent,
2103                                      const struct lu_fid *cfid,
2104                                      struct lu_buf *buf, int fl,
2105                                      __u32 ost_idx, __u32 ea_off)
2106 {
2107         struct ost_layout *ol = &rec->lor_layout;
2108         struct lov_mds_md_v1 *lmm = NULL;
2109         struct lov_ost_data_v1 *objs = NULL;
2110         int rc = 0;
2111         ENTRY;
2112
2113         if (ol->ol_comp_id != 0)
2114                 rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off,
2115                                                  &lmm, &objs);
2116         else
2117                 rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout,
2118                                                parent, buf, ea_off, &lmm,
2119                                                &objs);
2120         if (rc > 0)
2121                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2122                                                buf, lmm, objs, fl, ost_idx, rc);
2123
2124         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for "
2125                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2126                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2127                "comp_end %llu, layout version %u, range %u, fl %d, "
2128                "%s LOV EA hole: rc = %d\n",
2129                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2130                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2131                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2132                rec->lor_layout_version, rec->lor_range, fl,
2133                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2134                "with" : "without", rc);
2135
2136         RETURN(rc);
2137 }
2138
2139 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2140                                       struct dt_object *child,
2141                                       const struct lu_fid *pfid,
2142                                       const struct ost_layout *ol, __u32 offset,
2143                                       __u32 version, __u32 range)
2144 {
2145         struct dt_device        *dev    = lfsck_obj2dev(child);
2146         struct filter_fid       *ff     = &lfsck_env_info(env)->lti_ff;
2147         struct thandle          *handle;
2148         struct lu_buf            buf    = { NULL };
2149         int                      rc;
2150
2151         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2152         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2153         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2154          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2155          * parent MDT-object's layout EA. */
2156         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2157         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2158         ff->ff_layout_version = cpu_to_le32(version);
2159         ff->ff_range = cpu_to_le32(range);
2160         lfsck_buf_init(&buf, ff, sizeof(*ff));
2161
2162         handle = dt_trans_create(env, dev);
2163         if (IS_ERR(handle))
2164                 RETURN(PTR_ERR(handle));
2165
2166         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2167         if (rc != 0)
2168                 GOTO(stop, rc);
2169
2170         rc = dt_trans_start_local(env, dev, handle);
2171         if (rc != 0)
2172                 GOTO(stop, rc);
2173
2174         rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2175
2176         GOTO(stop, rc);
2177
2178 stop:
2179         dt_trans_stop(env, dev, handle);
2180
2181         return rc;
2182 }
2183
2184 /**
2185  * \retval       +1: repaired
2186  * \retval        0: did nothing
2187  * \retval      -ve: on error
2188  */
2189 static int lfsck_layout_update_pfid(const struct lu_env *env,
2190                                     struct lfsck_component *com,
2191                                     struct dt_object *parent,
2192                                     struct lu_fid *cfid,
2193                                     struct dt_device *cdev,
2194                                     struct lu_orphan_rec_v3 *rec, __u32 ea_off)
2195 {
2196         struct dt_object        *child;
2197         int                      rc     = 0;
2198         ENTRY;
2199
2200         child = lfsck_object_find_by_dev(env, cdev, cfid);
2201         if (IS_ERR(child))
2202                 RETURN(PTR_ERR(child));
2203
2204         rc = __lfsck_layout_update_pfid(env, child,
2205                                         lu_object_fid(&parent->do_lu),
2206                                         &rec->lor_layout, ea_off,
2207                                         rec->lor_layout_version,
2208                                         rec->lor_range);
2209         lfsck_object_put(env, child);
2210
2211         RETURN(rc == 0 ? 1 : rc);
2212 }
2213
2214 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2215 {
2216         if (ol->ol_comp_id != 0)
2217                 return sizeof(struct lov_comp_md_v1) +
2218                        sizeof(struct lov_comp_md_entry_v1) +
2219                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2220
2221         if (ol->ol_stripe_count != 0)
2222                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2223
2224         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2225 }
2226
2227 /**
2228  * This function will create the MDT-object with the given (partial) LOV EA.
2229  *
2230  * Under some data corruption cases, the MDT-object of the file may be lost,
2231  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2232  * re-create the MDT-object with the orphan OST-object(s) information.
2233  *
2234  * On the other hand, the LFSCK may have created some OST-object to repair a
2235  * dangling LOV EA reference, but as the LFSCK proceeds, it may find that
2236  * the old OST-object is still there and should replace the newly created OST
2237  * object. Unfortunately, someone may have modified such newly created object.
2238  * To keep the data (both new and old), the LFSCK will create an MDT-object
2239  * with a new FID to reference the original OST-object.
2240  *
2241  * \param[in] env       pointer to the thread context
2242  * \param[in] com       pointer to the lfsck component
2243  * \param[in] ltd       pointer to target device descriptor
2244  * \param[in] rec       pointer to the record for the orphan OST-object
2245  * \param[in] cfid      pointer to FID for the orphan OST-object
2246  * \param[in] infix     additional information, such as the FID for original
2247  *                      MDT-object and the stripe offset in the LOV EA
2248  * \param[in] type      the type for describing why the orphan MDT-object is
2249  *                      created. The rules are as following:
2250  *
2251  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2252  *                      same slot in the layout EA. Then the LFSCK will create
2253  *                      new MDT-object(s) to hold the conflict OST-object(s).
2254  *
2255  *  type "N":           The orphan OST-object does not know which one was the
2256  *                      real parent MDT-object, so the LFSCK uses new FID for
2257  *                      its parent MDT-object.
2258  *
2259  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2260  *                      but does not know the position (the file name) in the
2261  *                      layout.
2262  *
2263  *  type "D":           The MDT-object is a directory, it may know its parent
2264  *                      but because there is no valid linkEA, the LFSCK cannot
2265  *                      know where to put it back to the namespace.
2266  *  type "O":           The MDT-object has no linkEA, and there is no name
2267  *                      entry that references the MDT-object.
2268  *
2269  *  type "P":           The orphan object to be created was a parent directory
2270  *                      of some MDT-object whose linkEA shows that the @orphan
2271  *                      object is missing.
2272  *
2273  * The orphan name will be like:
2274  * ${FID}-${infix}-${type}-${conflict_version}
2275  *
2276  * \param[in] ea_off    the stripe offset in the LOV EA
2277  *
2278  * \retval              positive on repaired something
2279  * \retval              0 if needs to repair nothing
2280  * \retval              negative error number on failure
2281  */
2282 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2283                                         struct lfsck_component *com,
2284                                         struct lfsck_tgt_desc *ltd,
2285                                         struct lu_orphan_rec_v3 *rec,
2286                                         struct lu_fid *cfid,
2287                                         const char *infix,
2288                                         const char *type,
2289                                         __u32 ea_off)
2290 {
2291         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2292         struct dt_insert_rec            *dtrec  = &info->lti_dt_rec;
2293         char                            *name   = info->lti_key;
2294         struct lu_attr                  *la     = &info->lti_la2;
2295         struct dt_object_format         *dof    = &info->lti_dof;
2296         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2297         struct lu_fid                   *pfid   = &rec->lor_rec.lor_fid;
2298         struct lu_fid                   *tfid   = &info->lti_fid3;
2299         struct dt_device                *dev    = lfsck->li_bottom;
2300         struct dt_object                *lpf    = lfsck->li_lpf_obj;
2301         struct dt_object                *pobj   = NULL;
2302         struct dt_object                *cobj   = NULL;
2303         struct thandle                  *th     = NULL;
2304         struct lu_buf                   *ea_buf = &info->lti_big_buf;
2305         struct lu_buf                    lov_buf;
2306         struct lfsck_lock_handle        *llh    = &info->lti_llh;
2307         struct linkea_data               ldata  = { NULL };
2308         struct lu_buf                    linkea_buf;
2309         const struct lu_name            *pname;
2310         int                              size   = 0;
2311         int                              idx    = 0;
2312         int                              rc     = 0;
2313         ENTRY;
2314
2315         if (unlikely(lpf == NULL))
2316                 GOTO(log, rc = -ENXIO);
2317
2318         /* We use two separated transactions to repair the inconsistency.
2319          *
2320          * 1) create the MDT-object locally.
2321          * 2) update the OST-object's PFID EA if necessary.
2322          *
2323          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2324          * updated when the layout LFSCK run next time.
2325          *
2326          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2327          * when the layout LFSCK run next time. */
2328
2329         if (fid_is_zero(pfid)) {
2330                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2331                 if (rc != 0)
2332                         GOTO(log, rc);
2333
2334                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2335                 if (IS_ERR(cobj))
2336                         GOTO(log, rc = PTR_ERR(cobj));
2337         }
2338
2339         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2340         if (IS_ERR(pobj))
2341                 GOTO(log, rc = PTR_ERR(pobj));
2342
2343         LASSERT(infix != NULL);
2344         LASSERT(type != NULL);
2345
2346         memset(la, 0, sizeof(*la));
2347         la->la_uid = rec->lor_rec.lor_uid;
2348         la->la_gid = rec->lor_rec.lor_gid;
2349         la->la_mode = S_IFREG | S_IRUSR;
2350         la->la_valid = LA_MODE | LA_UID | LA_GID;
2351
2352         memset(dof, 0, sizeof(*dof));
2353         dof->dof_type = dt_mode_to_dft(S_IFREG);
2354         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2355          * the stripe(s). The LFSCK will specify the LOV EA via
2356          * lfsck_layout_update_lovea(). */
2357
2358         size = lfsck_lovea_size(&rec->lor_layout, ea_off);
2359         if (ea_buf->lb_len < size) {
2360                 lu_buf_realloc(ea_buf, size);
2361                 if (ea_buf->lb_buf == NULL)
2362                         GOTO(log, rc = -ENOMEM);
2363         }
2364
2365 again:
2366         do {
2367                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2368                          type, idx++);
2369                 rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
2370                                (const struct dt_key *)name);
2371                 if (rc != 0 && rc != -ENOENT)
2372                         GOTO(log, rc);
2373         } while (rc == 0);
2374
2375         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2376                         MDS_INODELOCK_UPDATE, LCK_PW);
2377         if (rc != 0)
2378                 GOTO(log, rc);
2379
2380         /* Re-check whether the name conflict with othrs after taken
2381          * the ldlm lock. */
2382         rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
2383                        (const struct dt_key *)name);
2384         if (unlikely(rc == 0)) {
2385                 lfsck_unlock(llh);
2386                 goto again;
2387         }
2388
2389         if (rc != -ENOENT)
2390                 GOTO(unlock, rc);
2391
2392         pname = lfsck_name_get_const(env, name, strlen(name));
2393         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2394                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2395         if (rc != 0)
2396                 GOTO(unlock, rc);
2397
2398         /* The 1st transaction. */
2399         th = dt_trans_create(env, dev);
2400         if (IS_ERR(th))
2401                 GOTO(unlock, rc = PTR_ERR(th));
2402
2403         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2404         if (rc != 0)
2405                 GOTO(stop, rc);
2406
2407         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2408         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2409                                   LU_XATTR_CREATE, th);
2410         if (rc != 0)
2411                 GOTO(stop, rc);
2412
2413         dtrec->rec_fid = pfid;
2414         dtrec->rec_type = S_IFREG;
2415         rc = dt_declare_insert(env, lpf,
2416                                (const struct dt_rec *)dtrec,
2417                                (const struct dt_key *)name, th);
2418         if (rc != 0)
2419                 GOTO(stop, rc);
2420
2421         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2422                        ldata.ld_leh->leh_len);
2423         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2424                                   XATTR_NAME_LINK, 0, th);
2425         if (rc != 0)
2426                 GOTO(stop, rc);
2427
2428         rc = dt_trans_start_local(env, dev, th);
2429         if (rc != 0)
2430                 GOTO(stop, rc);
2431
2432         dt_write_lock(env, pobj, 0);
2433         rc = dt_create(env, pobj, la, NULL, dof, th);
2434         if (rc == 0)
2435                 rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid,
2436                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2437         dt_write_unlock(env, pobj);
2438         if (rc < 0)
2439                 GOTO(stop, rc);
2440
2441         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2442                        (const struct dt_key *)name, th, 1);
2443         if (rc != 0)
2444                 GOTO(stop, rc);
2445
2446         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2447         if (rc == 0 && cobj != NULL) {
2448                 dt_trans_stop(env, dev, th);
2449                 th = NULL;
2450
2451                 /* The 2nd transaction. */
2452                 rc = __lfsck_layout_update_pfid(env, cobj, pfid,
2453                                                 &rec->lor_layout, ea_off,
2454                                                 rec->lor_layout_version,
2455                                                 rec->lor_range);
2456         }
2457
2458         GOTO(stop, rc);
2459
2460 stop:
2461         if (th != NULL)
2462                 dt_trans_stop(env, dev, th);
2463
2464 unlock:
2465         lfsck_unlock(llh);
2466
2467 log:
2468         if (cobj != NULL && !IS_ERR(cobj))
2469                 lfsck_object_put(env, cobj);
2470         if (pobj != NULL && !IS_ERR(pobj))
2471                 lfsck_object_put(env, pobj);
2472
2473         if (rc < 0)
2474                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to "
2475                        "recreate the lost MDT-object: parent "DFID
2476                        ", child "DFID", OST-index %u, stripe-index %u, "
2477                        "infix %s, type %s: rc = %d\n",
2478                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2479                        ltd->ltd_index, ea_off, infix, type, rc);
2480
2481         return rc >= 0 ? 1 : rc;
2482 }
2483
2484 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2485                                                    struct lfsck_component *com,
2486                                                    const struct lu_fid *fid,
2487                                                    __u32 index)
2488 {
2489         struct lfsck_thread_info *info  = lfsck_env_info(env);
2490         struct lfsck_request     *lr    = &info->lti_lr;
2491         struct lfsck_instance    *lfsck = com->lc_lfsck;
2492         struct lfsck_tgt_desc    *ltd;
2493         struct ptlrpc_request    *req;
2494         struct lfsck_request     *tmp;
2495         struct obd_export        *exp;
2496         int                       rc    = 0;
2497         ENTRY;
2498
2499         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2500         if (unlikely(ltd == NULL))
2501                 RETURN(-ENXIO);
2502
2503         exp = ltd->ltd_exp;
2504         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2505                 GOTO(put, rc = -EOPNOTSUPP);
2506
2507         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2508         if (req == NULL)
2509                 GOTO(put, rc = -ENOMEM);
2510
2511         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2512         if (rc != 0) {
2513                 ptlrpc_request_free(req);
2514
2515                 GOTO(put, rc);
2516         }
2517
2518         memset(lr, 0, sizeof(*lr));
2519         lr->lr_event = LE_CONDITIONAL_DESTROY;
2520         lr->lr_active = LFSCK_TYPE_LAYOUT;
2521         lr->lr_fid = *fid;
2522
2523         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2524         *tmp = *lr;
2525         ptlrpc_request_set_replen(req);
2526
2527         rc = ptlrpc_queue_wait(req);
2528         ptlrpc_req_finished(req);
2529
2530         GOTO(put, rc);
2531
2532 put:
2533         lfsck_tgt_put(ltd);
2534
2535         return rc;
2536 }
2537
2538 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2539                                                   struct lfsck_component *com,
2540                                                   struct lfsck_request *lr)
2541 {
2542         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2543         struct lu_attr                  *la     = &info->lti_la;
2544         union ldlm_policy_data          *policy = &info->lti_policy;
2545         struct ldlm_res_id              *resid  = &info->lti_resid;
2546         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2547         struct dt_device                *dev    = lfsck->li_bottom;
2548         struct lu_fid                   *fid    = &lr->lr_fid;
2549         struct dt_object                *obj;
2550         struct thandle                  *th     = NULL;
2551         struct lustre_handle             lh     = { 0 };
2552         __u64                            flags  = 0;
2553         int                              rc     = 0;
2554         ENTRY;
2555
2556         obj = lfsck_object_find_by_dev(env, dev, fid);
2557         if (IS_ERR(obj))
2558                 RETURN(PTR_ERR(obj));
2559
2560         dt_read_lock(env, obj, 0);
2561         if (dt_object_exists(obj) == 0 ||
2562             lfsck_is_dead_obj(obj)) {
2563                 dt_read_unlock(env, obj);
2564
2565                 GOTO(put, rc = -ENOENT);
2566         }
2567
2568         /* Get obj's attr without lock firstly. */
2569         rc = dt_attr_get(env, obj, la);
2570         dt_read_unlock(env, obj);
2571         if (rc != 0)
2572                 GOTO(put, rc);
2573
2574         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2575                 GOTO(put, rc = -ETXTBSY);
2576
2577         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2578         LASSERT(lfsck->li_namespace != NULL);
2579
2580         memset(policy, 0, sizeof(*policy));
2581         policy->l_extent.end = OBD_OBJECT_EOF;
2582         ost_fid_build_resid(fid, resid);
2583         rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid,
2584                                     LDLM_EXTENT, policy, LCK_EX, &flags,
2585                                     ldlm_blocking_ast, ldlm_completion_ast,
2586                                     NULL, NULL, 0, LVB_T_NONE, NULL, &lh);
2587         if (rc != ELDLM_OK)
2588                 GOTO(put, rc = -EIO);
2589
2590         dt_write_lock(env, obj, 0);
2591         /* Get obj's attr within lock again. */
2592         rc = dt_attr_get(env, obj, la);
2593         if (rc != 0)
2594                 GOTO(unlock, rc);
2595
2596         if (la->la_ctime != 0)
2597                 GOTO(unlock, rc = -ETXTBSY);
2598
2599         th = dt_trans_create(env, dev);
2600         if (IS_ERR(th))
2601                 GOTO(unlock, rc = PTR_ERR(th));
2602
2603         rc = dt_declare_ref_del(env, obj, th);
2604         if (rc != 0)
2605                 GOTO(stop, rc);
2606
2607         rc = dt_declare_destroy(env, obj, th);
2608         if (rc != 0)
2609                 GOTO(stop, rc);
2610
2611         rc = dt_trans_start_local(env, dev, th);
2612         if (rc != 0)
2613                 GOTO(stop, rc);
2614
2615         rc = dt_ref_del(env, obj, th);
2616         if (rc != 0)
2617                 GOTO(stop, rc);
2618
2619         rc = dt_destroy(env, obj, th);
2620         if (rc == 0)
2621                 CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty "
2622                        "OST-object "DFID" that was created for reparing "
2623                        "dangling referenced case. But the original missing "
2624                        "OST-object is found now.\n",
2625                        lfsck_lfsck2name(lfsck), PFID(fid));
2626
2627         GOTO(stop, rc);
2628
2629 stop:
2630         dt_trans_stop(env, dev, th);
2631
2632 unlock:
2633         dt_write_unlock(env, obj);
2634         ldlm_lock_decref(&lh, LCK_EX);
2635
2636 put:
2637         lfsck_object_put(env, obj);
2638
2639         return rc;
2640 }
2641
2642 /**
2643  * Some OST-object has occupied the specified layout EA slot.
2644  * Such OST-object may be generated by the LFSCK when repair
2645  * dangling referenced MDT-object, which can be indicated by
2646  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2647  * is true and such OST-object has not been modified yet, we
2648  * will replace it with the orphan OST-object; otherwise the
2649  * LFSCK will create new MDT-object to reference the orphan.
2650  *
2651  * \retval       +1: repaired
2652  * \retval        0: did nothing
2653  * \retval      -ve: on error
2654  */
2655 static int lfsck_layout_conflict_create(const struct lu_env *env,
2656                                         struct lfsck_component *com,
2657                                         struct lfsck_tgt_desc *ltd,
2658                                         struct lu_orphan_rec_v3 *rec,
2659                                         struct dt_object *parent,
2660                                         struct lu_fid *cfid,
2661                                         struct lu_buf *ea_buf,
2662                                         struct lov_mds_md_v1 *lmm,
2663                                         struct lov_ost_data_v1 *slot,
2664                                         __u32 ea_off, int lovea_size)
2665 {
2666         struct lfsck_thread_info *info          = lfsck_env_info(env);
2667         struct lu_fid            *cfid2         = &info->lti_fid2;
2668         struct ost_id            *oi            = &info->lti_oi;
2669         struct dt_device         *dev           = lfsck_obj2dev(parent);
2670         struct thandle           *th            = NULL;
2671         struct lustre_handle      lh            = { 0 };
2672         __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
2673         int                       rc            = 0;
2674         ENTRY;
2675
2676         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2677                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2678                         RETURN(0);
2679         }
2680
2681         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2682         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2683         if (rc != 0)
2684                 GOTO(out, rc);
2685
2686         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2687                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2688                               LCK_EX);
2689         if (rc != 0)
2690                 GOTO(out, rc);
2691
2692         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2693
2694         /* If the conflict OST-obejct is not created for fixing dangling
2695          * referenced MDT-object in former LFSCK check/repair, or it has
2696          * been modified by others, then we cannot destroy it. Re-create
2697          * a new MDT-object for the orphan OST-object. */
2698         if (rc == -ETXTBSY) {
2699                 /* No need the layout lock on the original parent. */
2700                 lfsck_ibits_unlock(&lh, LCK_EX);
2701
2702                 fid_zero(&rec->lor_rec.lor_fid);
2703                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2704                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2705                          ea_off);
2706                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2707                                                 info->lti_tmpbuf, "C", ea_off);
2708
2709                 RETURN(rc);
2710         }
2711
2712         if (rc != 0 && rc != -ENOENT)
2713                 GOTO(unlock, rc);
2714
2715         th = dt_trans_create(env, dev);
2716         if (IS_ERR(th))
2717                 GOTO(unlock, rc = PTR_ERR(th));
2718
2719         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2720                                   LU_XATTR_REPLACE, th);
2721         if (rc != 0)
2722                 GOTO(stop, rc);
2723
2724         rc = dt_trans_start_local(env, dev, th);
2725         if (rc != 0)
2726                 GOTO(stop, rc);
2727
2728         dt_write_lock(env, parent, 0);
2729         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2730         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2731                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2732                                        ltd->ltd_index, lovea_size);
2733         dt_write_unlock(env, parent);
2734
2735         GOTO(stop, rc);
2736
2737 stop:
2738         dt_trans_stop(env, dev, th);
2739
2740 unlock:
2741         lfsck_ibits_unlock(&lh, LCK_EX);
2742
2743 out:
2744         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict "
2745                "OST-object "DFID" on the OST %x with the orphan "DFID" on "
2746                "the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2747                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2748                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2749                ea_off, rc);
2750
2751         return rc >= 0 ? 1 : rc;
2752 }
2753
2754 /**
2755  * \retval       +1: repaired
2756  * \retval        0: did nothing
2757  * \retval      -ve: on error
2758  */
2759 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2760                                        struct lfsck_component *com,
2761                                        struct lfsck_tgt_desc *ltd,
2762                                        struct lu_orphan_rec_v3 *rec,
2763                                        struct dt_object *parent,
2764                                        struct lu_fid *cfid,
2765                                        __u32 ost_idx, __u32 ea_off)
2766 {
2767         struct lfsck_thread_info *info          = lfsck_env_info(env);
2768         struct lu_buf            *buf           = &info->lti_big_buf;
2769         struct lu_fid            *fid           = &info->lti_fid2;
2770         struct ost_id            *oi            = &info->lti_oi;
2771         struct lfsck_instance    *lfsck         = com->lc_lfsck;
2772         struct dt_device         *dt            = lfsck_obj2dev(parent);
2773         struct lfsck_bookmark    *bk            = &lfsck->li_bookmark_ram;
2774         struct ost_layout        *ol            = &rec->lor_layout;
2775         struct lov_comp_md_v1    *lcm           = NULL;
2776         struct lov_comp_md_entry_v1 *lcme       = NULL;
2777         struct thandle           *handle        = NULL;
2778         size_t                    lovea_size;
2779         struct lov_mds_md_v1     *lmm;
2780         struct lov_ost_data_v1   *objs;
2781         struct lustre_handle      lh            = { 0 };
2782         __u32                     magic;
2783         __u32 flags = 0;
2784         int                       fl            = 0;
2785         int                       rc            = 0;
2786         int                       rc1;
2787         int                       i;
2788         int pos = 0;
2789         __u16 count;
2790         bool locked = false;
2791         bool new_mirror = true;
2792         ENTRY;
2793
2794         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2795                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2796                               LCK_EX);
2797         if (rc != 0) {
2798                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate "
2799                        "LOV EA for "DFID": parent "DFID", OST-index %u, "
2800                        "stripe-index %u, comp_id %u, comp_start %llu, "
2801                        "comp_end %llu, layout version %u, range %u: rc = %d\n",
2802                        lfsck_lfsck2name(lfsck), PFID(cfid),
2803                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2804                        ol->ol_comp_id, ol->ol_comp_start,
2805                        ol->ol_comp_end, rec->lor_layout_version,
2806                        rec->lor_range, rc);
2807
2808                 RETURN(rc);
2809         }
2810
2811 again:
2812         if (locked) {
2813                 dt_write_unlock(env, parent);
2814                 locked = false;
2815         }
2816
2817         if (handle != NULL) {
2818                 dt_trans_stop(env, dt, handle);
2819                 handle = NULL;
2820         }
2821
2822         if (rc < 0)
2823                 GOTO(unlock_layout, rc);
2824
2825         lovea_size = rc;
2826         if (buf->lb_len < lovea_size) {
2827                 lu_buf_realloc(buf, lovea_size);
2828                 if (buf->lb_buf == NULL)
2829                         GOTO(unlock_layout, rc = -ENOMEM);
2830         }
2831
2832         if (!(bk->lb_param & LPF_DRYRUN)) {
2833                 handle = dt_trans_create(env, dt);
2834                 if (IS_ERR(handle))
2835                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2836
2837                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2838                                           fl, handle);
2839                 if (rc != 0)
2840                         GOTO(stop, rc);
2841
2842                 rc = dt_trans_start_local(env, dt, handle);
2843                 if (rc != 0)
2844                         GOTO(stop, rc);
2845         }
2846
2847         dt_write_lock(env, parent, 0);
2848         locked = true;
2849         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2850         if (rc == -ERANGE) {
2851                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2852                 LASSERT(rc != 0);
2853                 goto again;
2854         } else if (rc == -ENODATA || rc == 0) {
2855                 lovea_size = lfsck_lovea_size(ol, ea_off);
2856                 /* If the declared is not big enough, re-try. */
2857                 if (buf->lb_len < lovea_size) {
2858                         rc = lovea_size;
2859                         goto again;
2860                 }
2861                 fl = LU_XATTR_CREATE;
2862         } else if (rc < 0) {
2863                 GOTO(unlock_parent, rc);
2864         } else if (unlikely(buf->lb_len == 0)) {
2865                 goto again;
2866         } else {
2867                 fl = LU_XATTR_REPLACE;
2868                 lovea_size = rc;
2869         }
2870
2871         if (fl == LU_XATTR_CREATE) {
2872                 if (bk->lb_param & LPF_DRYRUN)
2873                         GOTO(unlock_parent, rc = 1);
2874
2875                 LASSERT(buf->lb_len >= lovea_size);
2876
2877                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2878                                                cfid, buf, fl, ost_idx, ea_off);
2879
2880                 GOTO(unlock_parent, rc);
2881         }
2882
2883         lmm = buf->lb_buf;
2884         rc1 = lfsck_layout_verify_header(parent, lmm);
2885
2886         /* If the LOV EA crashed, the rebuild it. */
2887         if (rc1 == -EINVAL) {
2888                 if (bk->lb_param & LPF_DRYRUN)
2889                         GOTO(unlock_parent, rc = 1);
2890
2891                 LASSERT(buf->lb_len >= lovea_size);
2892
2893                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2894                                                cfid, buf, fl, ost_idx, ea_off);
2895
2896                 GOTO(unlock_parent, rc);
2897         }
2898
2899         /* For other unknown magic/pattern, keep the current LOV EA. */
2900         if (rc1 == -EOPNOTSUPP)
2901                 GOTO(unlock_parent, rc1 = 0);
2902
2903         if (rc1)
2904                 GOTO(unlock_parent, rc = rc1);
2905
2906         magic = le32_to_cpu(lmm->lmm_magic);
2907         if (magic == LOV_MAGIC_COMP_V1) {
2908                 __u64 start;
2909                 __u64 end;
2910                 __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id);
2911                 __u16 mirror_id1;
2912
2913                 lcm = buf->lb_buf;
2914                 count = le16_to_cpu(lcm->lcm_entry_count);
2915                 for (i = 0; i < count; pos = ++i) {
2916                         lcme = &lcm->lcm_entries[i];
2917                         start = le64_to_cpu(lcme->lcme_extent.e_start);
2918                         end = le64_to_cpu(lcme->lcme_extent.e_end);
2919                         mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id));
2920
2921                         if (mirror_id0 > mirror_id1)
2922                                 continue;
2923
2924                         if (mirror_id0 < mirror_id1)
2925                                 break;
2926
2927                         new_mirror = false;
2928                         if (end <= ol->ol_comp_start)
2929                                 continue;
2930
2931                         if (start >= ol->ol_comp_end)
2932                                 break;
2933
2934                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
2935                         magic = le32_to_cpu(lmm->lmm_magic);
2936                         flags = le32_to_cpu(lcme->lcme_flags);
2937                         goto further;
2938                 }
2939
2940                 rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent,
2941                                 cfid, buf, ost_idx, ea_off, pos, new_mirror);
2942
2943                 GOTO(unlock_parent, rc);
2944         }
2945
2946 further:
2947         count = le16_to_cpu(lmm->lmm_stripe_count);
2948         if (count == 0)
2949                 GOTO(unlock_parent, rc = -EINVAL);
2950         LASSERT(count > 0);
2951
2952         /* Exceed the current end of MDT-object layout EA. Then extend it. */
2953         if (count <= ea_off) {
2954                 if (bk->lb_param & LPF_DRYRUN)
2955                         GOTO(unlock_parent, rc = 1);
2956
2957                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
2958                 /* If the declared is not big enough, re-try. */
2959                 if (buf->lb_len < lovea_size) {
2960                         rc = lovea_size;
2961                         goto again;
2962                 }
2963
2964                 if (lcm) {
2965                         LASSERT(lcme);
2966
2967                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
2968                         lfsck_layout_update_lcm(lcm, lcme,
2969                                                 rec->lor_layout_version,
2970                                                 rec->lor_range);
2971                 }
2972
2973                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
2974                                         parent, cfid, buf, ost_idx, ea_off);
2975
2976                 GOTO(unlock_parent, rc);
2977         }
2978
2979         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
2980
2981         if (magic == LOV_MAGIC_V1) {
2982                 objs = &lmm->lmm_objects[0];
2983         } else {
2984                 LASSERT(magic == LOV_MAGIC_V3);
2985                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
2986         }
2987
2988         for (i = 0; i < count; i++, objs++) {
2989                 /* The MDT-object was created via lfsck_layout_recover_create()
2990                  * by others before, and we fill the dummy layout EA. */
2991                 if ((lcme && !(flags & LCME_FL_INIT)) ||
2992                      lovea_slot_is_dummy(objs)) {
2993                         if (i != ea_off)
2994                                 continue;
2995
2996                         if (bk->lb_param & LPF_DRYRUN)
2997                                 GOTO(unlock_parent, rc = 1);
2998
2999                         lmm->lmm_layout_gen =
3000                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3001                         if (lcme) {
3002                                 LASSERT(lcm);
3003
3004                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
3005                                         ol->ol_stripe_size ||
3006                                     le16_to_cpu(lmm->lmm_stripe_count) !=
3007                                         ol->ol_stripe_count ||
3008                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
3009                                         ol->ol_comp_start ||
3010                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
3011                                         ol->ol_comp_end) {
3012                                         CDEBUG(D_LFSCK, "%s: found invalid "
3013                                         "component for "DFID ": parent "DFID
3014                                         ", stripe-index %u, stripe_size %u, "
3015                                         "stripe_count %u, comp_id %u, "
3016                                         "comp_start %llu, comp_end %llu, "
3017                                         "cur_stripe_size %u, "
3018                                         "cur_stripe_count %u, "
3019                                         "cur_comp_start %llu, "
3020                                         "cur_comp_end %llu\n",
3021                                         lfsck_lfsck2name(lfsck), PFID(cfid),
3022                                         PFID(lfsck_dto2fid(parent)), ea_off,
3023                                         ol->ol_stripe_size,
3024                                         ol->ol_stripe_count, ol->ol_comp_id,
3025                                         ol->ol_comp_start, ol->ol_comp_end,
3026                                         le32_to_cpu(lmm->lmm_stripe_size),
3027                                         le16_to_cpu(lmm->lmm_stripe_count),
3028                                         le64_to_cpu(lcme->lcme_extent.e_start),
3029                                         le64_to_cpu(lcme->lcme_extent.e_end));
3030
3031                                         GOTO(unlock_parent, rc = -EINVAL);
3032                                 }
3033
3034                                 lovea_size = le32_to_cpu(lcm->lcm_size);
3035                                 lcme->lcme_flags = cpu_to_le32(flags |
3036                                                                LCME_FL_INIT);
3037                                 lfsck_layout_update_lcm(lcm, lcme,
3038                                                         rec->lor_layout_version,
3039                                                         rec->lor_range);
3040                         }
3041
3042                         LASSERTF(buf->lb_len >= lovea_size,
3043                                  "buffer len %d is less than real size %d\n",
3044                                  (int)buf->lb_len, (int)lovea_size);
3045
3046                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
3047                                                 parent, cfid, buf, lmm, objs,
3048                                                 fl, ost_idx, lovea_size);
3049
3050                         CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill "
3051                                "dummy layout slot for "DFID": parent "DFID
3052                                ", OST-index %u, stripe-index %u: rc = %d\n",
3053                                lfsck_lfsck2name(lfsck), PFID(cfid),
3054                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
3055
3056                         GOTO(unlock_parent, rc);
3057                 }
3058
3059                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3060                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
3061                 if (rc != 0) {
3062                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
3063                                "invalid layout EA at the slot %d, index %u\n",
3064                                lfsck_lfsck2name(lfsck),
3065                                PFID(lfsck_dto2fid(parent)), i,
3066                                le32_to_cpu(objs->l_ost_idx));
3067
3068                         GOTO(unlock_parent, rc);
3069                 }
3070
3071                 /* It should be rare case, the slot is there, but the LFSCK
3072                  * does not handle it during the first-phase cycle scanning. */
3073                 if (unlikely(lu_fid_eq(fid, cfid))) {
3074                         if (i == ea_off) {
3075                                 GOTO(unlock_parent, rc = 0);
3076                         } else {
3077                                 /* Rare case that the OST-object index
3078                                  * does not match the parent MDT-object
3079                                  * layout EA. We trust the later one. */
3080                                 if (bk->lb_param & LPF_DRYRUN)
3081                                         GOTO(unlock_parent, rc = 1);
3082
3083                                 dt_write_unlock(env, parent);
3084                                 if (handle != NULL)
3085                                         dt_trans_stop(env, dt, handle);
3086                                 lfsck_ibits_unlock(&lh, LCK_EX);
3087                                 rc = lfsck_layout_update_pfid(env, com, parent,
3088                                                         cfid, ltd->ltd_tgt,
3089                                                         rec, i);
3090
3091                                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant "
3092                                        "updated OST-object's pfid for "DFID
3093                                        ": parent "DFID", OST-index %u, "
3094                                        "stripe-index %u: rc = %d\n",
3095                                        lfsck_lfsck2name(lfsck), PFID(cfid),
3096                                        PFID(lfsck_dto2fid(parent)),
3097                                        ltd->ltd_index, i, rc);
3098
3099                                 RETURN(rc);
3100                         }
3101                 }
3102         }
3103
3104         /* The MDT-object exists, but related layout EA slot is occupied
3105          * by others. */
3106         if (bk->lb_param & LPF_DRYRUN)
3107                 GOTO(unlock_parent, rc = 1);
3108
3109         dt_write_unlock(env, parent);
3110         if (handle != NULL)
3111                 dt_trans_stop(env, dt, handle);
3112         lfsck_ibits_unlock(&lh, LCK_EX);
3113         if (magic == LOV_MAGIC_V1)
3114                 objs = &lmm->lmm_objects[ea_off];
3115         else
3116                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3117         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3118                                           buf, lmm, objs, ea_off, lovea_size);
3119
3120         RETURN(rc);
3121
3122 unlock_parent:
3123         if (locked)
3124                 dt_write_unlock(env, parent);
3125
3126 stop:
3127         if (handle != NULL)
3128                 dt_trans_stop(env, dt, handle);
3129
3130 unlock_layout:
3131         lfsck_ibits_unlock(&lh, LCK_EX);
3132
3133         return rc;
3134 }
3135
3136 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3137                                         struct lfsck_component *com,
3138                                         struct lfsck_tgt_desc *ltd,
3139                                         struct lu_orphan_rec_v3 *rec,
3140                                         struct lu_fid *cfid)
3141 {
3142         struct lfsck_layout     *lo     = com->lc_file_ram;
3143         struct lu_fid           *pfid   = &rec->lor_rec.lor_fid;
3144         struct dt_object        *parent = NULL;
3145         __u32                    ea_off = pfid->f_stripe_idx;
3146         int                      rc     = 0;
3147         ENTRY;
3148
3149         if (!fid_is_sane(cfid))
3150                 GOTO(out, rc = -EINVAL);
3151
3152         pfid->f_ver = 0;
3153         if (fid_is_zero(pfid)) {
3154                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3155                                                   "", "N", ea_off);
3156                 GOTO(out, rc);
3157         }
3158
3159         if (!fid_is_sane(pfid))
3160                 GOTO(out, rc = -EINVAL);
3161
3162         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3163         if (IS_ERR(parent))
3164                 GOTO(out, rc = PTR_ERR(parent));
3165
3166         if (unlikely(dt_object_remote(parent) != 0))
3167                 GOTO(put, rc = -EXDEV);
3168
3169         if (dt_object_exists(parent) == 0) {
3170                 lfsck_object_put(env, parent);
3171                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3172                                                   "", "R", ea_off);
3173                 GOTO(out, rc);
3174         }
3175
3176         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3177                 GOTO(put, rc = -EISDIR);
3178
3179         /* The orphan OST-object claims to be the parent's stripe, then
3180          * related dangling record in the trace file is meaningless. */
3181         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3182                                            rec->lor_layout.ol_comp_id, ea_off);
3183         if (rc && rc != -ENOENT)
3184                 GOTO(put, rc);
3185
3186         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3187                                          ltd->ltd_index, ea_off);
3188
3189         GOTO(put, rc);
3190
3191 put:
3192         if (rc <= 0)
3193                 lfsck_object_put(env, parent);
3194         else
3195                 /* The layout EA is changed, need to be reloaded next time. */
3196                 dt_object_put_nocache(env, parent);
3197
3198 out:
3199         down_write(&com->lc_sem);
3200         com->lc_new_scanned++;
3201         com->lc_new_checked++;
3202         if (rc > 0) {
3203                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3204                 rc = 0;
3205         } else if (rc < 0) {
3206                 lo->ll_objs_failed_phase2++;
3207         }
3208         up_write(&com->lc_sem);
3209
3210         return rc;
3211 }
3212
3213 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3214                                     struct lfsck_component *com,
3215                                     struct lfsck_tgt_desc *ltd)
3216 {
3217         struct lfsck_assistant_data     *lad    = com->lc_data;
3218         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3219         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
3220         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3221         struct lu_fid                   *fid    = &info->lti_fid;
3222         struct dt_object                *obj;
3223         const struct dt_it_ops          *iops;
3224         struct dt_it                    *di;
3225         int                              rc     = 0;
3226         ENTRY;
3227
3228         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan "
3229                "scanning for OST%04x\n",
3230                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3231
3232         if (cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) {
3233                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan "
3234                        "scanning for OST%04x\n",
3235                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3236
3237                 RETURN(0);
3238         }
3239
3240         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3241         fid->f_oid = fid->f_ver = 0;
3242
3243         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3244         if (unlikely(IS_ERR(obj)))
3245                 GOTO(log, rc = PTR_ERR(obj));
3246
3247         rc = obj->do_ops->do_index_try(env, obj,
3248                                        &dt_lfsck_layout_orphan_features);
3249         if (rc != 0)
3250                 GOTO(put, rc);
3251
3252         iops = &obj->do_index_ops->dio_it;
3253         di = iops->init(env, obj, 0);
3254         if (IS_ERR(di))
3255                 GOTO(put, rc = PTR_ERR(di));
3256
3257         rc = iops->load(env, di, 0);
3258         if (rc == -ESRCH) {
3259                 /* -ESRCH means that the orphan OST-objects rbtree has been
3260                  * cleanup because of the OSS server restart or other errors. */
3261                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3262                 GOTO(fini, rc);
3263         }
3264
3265         if (rc == 0)
3266                 rc = iops->next(env, di);
3267         else if (rc > 0)
3268                 rc = 0;
3269
3270         if (rc < 0)
3271                 GOTO(fini, rc);
3272
3273         if (rc > 0)
3274                 GOTO(fini, rc = 0);
3275
3276         do {
3277                 struct dt_key           *key;
3278                 struct lu_orphan_rec_v3 *rec = &info->lti_rec;
3279
3280                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3281                     unlikely(!thread_is_running(&lfsck->li_thread)))
3282                         break;
3283
3284                 key = iops->key(env, di);
3285                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3286                 /* Remote target OST may be runnning old LFSCK */
3287                 memset(rec, 0, sizeof(*rec));
3288                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3289                 if (rc == 0)
3290                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3291                                         &com->lc_fid_latest_scanned_phase2);
3292                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3293                         GOTO(fini, rc);
3294
3295                 lfsck_control_speed_by_self(com);
3296                 do {
3297                         rc = iops->next(env, di);
3298                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3299         } while (rc == 0);
3300
3301         GOTO(fini, rc);
3302
3303 fini:
3304         iops->put(env, di);
3305         iops->fini(env, di);
3306 put:
3307         lfsck_object_put(env, obj);
3308
3309 log:
3310         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan "
3311                "scanning for OST%04x: rc = %d\n",
3312                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3313
3314         return rc > 0 ? 0 : rc;
3315 }
3316
3317 static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff,
3318                             __u32 comp_id)
3319 {
3320         struct ost_layout *ol = &ff->ff_layout;
3321         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3322         int rc = 0;
3323         ENTRY;
3324
3325         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3326                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3327                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3328                 ol->ol_comp_start = 0;
3329                 ol->ol_comp_end = 0;
3330                 ol->ol_comp_id = 0;
3331                 ff->ff_layout_version = 0;
3332                 ff->ff_range = 0;
3333         } else if (magic == LOV_MAGIC_COMP_V1) {
3334                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3335                 struct lov_comp_md_entry_v1 *lcme = NULL;
3336                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3337                 int i;
3338
3339                 for (i = 0; i < count; i++) {
3340                         lcme = &lcm->lcm_entries[i];
3341                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3342                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3343                                         LCME_FL_INIT);
3344
3345                                 break;
3346                         }
3347                 }
3348
3349                 /* The comp has been removed, do nothing. */
3350                 if (i == count)
3351                         GOTO(out, rc = 1);
3352
3353                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3354                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3355                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3356                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3357                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3358                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3359                 ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen);
3360                 ff->ff_range = 0;
3361         } else {
3362                 GOTO(out, rc = -EINVAL);
3363         }
3364
3365         EXIT;
3366
3367 out:
3368         return rc;
3369 }
3370
3371 /**
3372  * Repair the MDT-object with dangling LOV EA reference.
3373  *
3374  * we need to repair the inconsistency according to the users' requirement:
3375  *
3376  * 1) Keep the inconsistency there and report the inconsistency case,
3377  *    then give the chance to the application to find related issues,
3378  *    and the users can make the decision about how to handle it with
3379  *    more human knownledge. (by default)
3380  *
3381  * 2) Re-create the missing OST-object with the FID/owner information.
3382  *
3383  * \param[in] env       pointer to the thread context
3384  * \param[in] com       the layout LFSCK component
3385  * \param[in] parent    the MDT-object with dangling LOV EA reference
3386  * \param[in] child     the OST-object to be created
3387  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3388  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3389  * \param[in] ost_idx   the index of OST on which the OST-object resides
3390  *
3391  * \retval              +1 for repair successfully
3392  * \retval              0 for did nothing
3393  * \retval              negative error number on failure
3394  */
3395 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3396                                           struct lfsck_component *com,
3397                                           struct dt_object *parent,
3398                                           struct dt_object *child,
3399                                           __u32 comp_id, __u32 ea_off,
3400                                           __u32 ost_idx, bool log)
3401 {
3402         struct lfsck_thread_info *info = lfsck_env_info(env);
3403         struct filter_fid *ff = &info->lti_ff;
3404         struct dt_object_format *dof = &info->lti_dof;
3405         struct lu_attr *la = &info->lti_la;
3406         struct lfsck_instance *lfsck = com->lc_lfsck;
3407         struct dt_device *dev = lfsck_obj2dev(child);
3408         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3409         const struct lu_fid *cfid = lfsck_dto2fid(child);
3410         struct lu_buf *tbuf = &info->lti_big_buf;
3411         struct thandle *handle;
3412         struct lu_buf *buf;
3413         struct lustre_handle lh = { 0 };
3414         int rc;
3415         ENTRY;
3416
3417         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3418                 GOTO(log, rc = 1);
3419
3420         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3421                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3422                               LCK_EX);
3423         if (rc != 0)
3424                 GOTO(log, rc);
3425
3426         rc = dt_attr_get(env, parent, la);
3427         if (rc != 0)
3428                 GOTO(unlock1, rc);
3429
3430         la->la_mode = S_IFREG | 0666;
3431         la->la_atime = la->la_mtime = la->la_ctime = 0;
3432         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3433                        LA_ATIME | LA_MTIME | LA_CTIME;
3434         memset(dof, 0, sizeof(*dof));
3435         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3436         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3437         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3438          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3439          * parent MDT-object's layout EA. */
3440         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3441
3442         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3443         if (unlikely(rc == -ENODATA))
3444                 rc = 0;
3445         if (rc <= 0)
3446                 GOTO(unlock1, rc);
3447
3448         rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id);
3449         if (rc)
3450                 GOTO(unlock1, rc);
3451
3452         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3453         handle = dt_trans_create(env, dev);
3454         if (IS_ERR(handle))
3455                 GOTO(unlock1, rc = PTR_ERR(handle));
3456
3457         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3458         if (rc != 0)
3459                 GOTO(stop, rc);
3460
3461         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3462                                   LU_XATTR_CREATE, handle);
3463         if (rc != 0)
3464                 GOTO(stop, rc);
3465
3466         rc = dt_trans_start_local(env, dev, handle);
3467         if (rc != 0)
3468                 GOTO(stop, rc);
3469
3470         dt_read_lock(env, parent, 0);
3471         if (unlikely(lfsck_is_dead_obj(parent)))
3472                 GOTO(unlock2, rc = 0);
3473
3474         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3475                 struct ost_id *oi = &info->lti_oi;
3476                 struct lu_fid *tfid = &info->lti_fid2;
3477                 struct lu_buf *lovea = &info->lti_big_buf;
3478                 struct lov_mds_md_v1 *lmm;
3479                 struct lov_ost_data_v1 *objs;
3480                 __u32 magic;
3481                 int count;
3482                 int idx2;
3483
3484                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3485                 if (unlikely(rc == -ENODATA))
3486                         rc = 0;
3487                 if (rc <= 0)
3488                         GOTO(unlock2, rc);
3489
3490                 lmm = lovea->lb_buf;
3491                 magic = le32_to_cpu(lmm->lmm_magic);
3492                 if (magic == LOV_MAGIC_COMP_V1) {
3493                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3494                         struct lov_comp_md_entry_v1 *lcme;
3495                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3496                         int i;
3497
3498                         for (i = 0; i < count; i++) {
3499                                 lcme = &lcm->lcm_entries[i];
3500                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3501                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3502                                                 LCME_FL_INIT);
3503
3504                                         lmm = lovea->lb_buf +
3505                                                 le32_to_cpu(lcme->lcme_offset);
3506                                         magic = le32_to_cpu(lmm->lmm_magic);
3507                                         goto check;
3508                                 }
3509                         }
3510
3511                         /* Someone removed the component, do nothing. */
3512                         GOTO(unlock2, rc = 0);
3513                 }
3514
3515 check:
3516                 count = le16_to_cpu(lmm->lmm_stripe_count);
3517                 /* Someone changed the LOV EA, do nothing. */
3518                 if (count <= ea_off)
3519                         GOTO(unlock2, rc = 0);
3520
3521                 if (magic == LOV_MAGIC_V1) {
3522                         objs = &lmm->lmm_objects[ea_off];
3523                 } else {
3524                         LASSERT(magic == LOV_MAGIC_V3);
3525
3526                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3527                                                         lmm_objects[ea_off];
3528                 }
3529
3530                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3531                 idx2 = le32_to_cpu(objs->l_ost_idx);
3532                 rc = ostid_to_fid(tfid, oi, idx2);
3533                 /* Someone changed the LOV EA, do nothing. */
3534                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3535                         GOTO(unlock2, rc);
3536         }
3537
3538         rc = dt_create(env, child, la, NULL, dof, handle);
3539         if (rc != 0)
3540                 GOTO(unlock2, rc);
3541
3542         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3543                           handle);
3544
3545         GOTO(unlock2, rc);
3546
3547 unlock2:
3548         dt_read_unlock(env, parent);
3549
3550 stop:
3551         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3552
3553 unlock1:
3554         lfsck_ibits_unlock(&lh, LCK_EX);
3555
3556 log:
3557         if (rc && log)
3558                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3559                        "dangling reference for: parent "DFID", child "
3560                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: "
3561                        "rc = %d\n",
3562                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3563                        comp_id, ea_off, ost_idx,
3564                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3565                                 "Create the lost OST-object as required" :
3566                                 "Keep the MDT-object there by default", rc);
3567
3568         return rc;
3569 }
3570
3571 /**
3572  * Repair the MDT-object with dangling LOV EA reference.
3573  *
3574  * Prepare parameters and call __lfsck_layout_repair_dangling()
3575  * to repair the dangling LOV EA reference.
3576  *
3577  * \param[in] env       pointer to the thread context
3578  * \param[in] com       the layout LFSCK component
3579  * \param[in] pfid      the MDT-object's FID
3580  * \param[in] cfid      the FID for the OST-object to be created
3581  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3582  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3583  * \param[in] ost_idx   the index of OST on which the OST-object resides
3584  *
3585  * \retval              +1 for repair successfully
3586  * \retval              0 for did nothing
3587  * \retval              negative error number on failure
3588  */
3589 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3590                                         struct lfsck_component *com,
3591                                         const struct lu_fid *pfid,
3592                                         const struct lu_fid *cfid,
3593                                         __u32 comp_id, __u32 ea_off,
3594                                         __u32 ost_idx)
3595 {
3596         struct lfsck_instance *lfsck = com->lc_lfsck;
3597         struct dt_object *parent = NULL;
3598         struct dt_object *child = NULL;
3599         struct lfsck_tgt_desc *ltd;
3600         int rc;
3601         ENTRY;
3602
3603         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3604         if (IS_ERR(parent))
3605                 GOTO(log, rc = PTR_ERR(parent));
3606
3607         /* The MDT-object has been removed. */
3608         if (dt_object_exists(parent) == 0)
3609                 GOTO(log, rc = 0);
3610
3611         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3612         if (unlikely(ltd == NULL))
3613                 GOTO(log, rc = -ENODEV);
3614
3615         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3616         if (IS_ERR(child))
3617                 GOTO(log, rc = PTR_ERR(child));
3618
3619         /* The OST-object has been created. */
3620         if (unlikely(dt_object_exists(child) != 0))
3621                 GOTO(log, rc = 0);
3622
3623         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3624                                             comp_id, ea_off, ost_idx, false);
3625
3626         GOTO(log, rc);
3627
3628 log:
3629         if (child != NULL && !IS_ERR(child))
3630                 lfsck_object_put(env, child);
3631
3632         if (parent != NULL && !IS_ERR(parent))
3633                 lfsck_object_put(env, parent);
3634
3635         if (rc)
3636                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3637                        "dangling reference for: parent "DFID", child "
3638                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3639                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3640                        comp_id, ea_off, ost_idx,
3641                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3642                                 "Create the lost OST-object as required" :
3643                                 "Keep the MDT-object there by default", rc);
3644
3645         return rc;
3646 }
3647
3648 /* If the OST-object does not recognize the MDT-object as its parent, and
3649  * there is no other MDT-object claims as its parent, then just trust the
3650  * given MDT-object as its parent. So update the OST-object filter_fid. */
3651 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3652                                               struct lfsck_component *com,
3653                                               struct dt_object *parent,
3654                                               struct lfsck_layout_req *llr,
3655                                               struct lu_attr *la)
3656 {
3657         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3658         struct filter_fid               *ff     = &info->lti_ff;
3659         struct dt_object                *child  = llr->llr_child;
3660         struct dt_device                *dev    = lfsck_obj2dev(child);
3661         const struct lu_fid             *tfid   = lu_object_fid(&parent->do_lu);
3662         struct lu_buf                   *tbuf   = &info->lti_big_buf;
3663         struct thandle                  *handle;
3664         struct lu_buf                   *buf;
3665         struct lustre_handle             lh     = { 0 };
3666         int                              rc;
3667         ENTRY;
3668
3669         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3670                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3671                               LCK_EX);
3672         if (rc != 0)
3673                 GOTO(log, rc);
3674
3675         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3676         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3677         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3678          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3679          * parent MDT-object's layout EA. */
3680         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3681
3682         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3683         if (unlikely(rc == -ENODATA))
3684                 rc = 0;
3685         if (rc <= 0)
3686                 GOTO(unlock1, rc);
3687
3688         rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id);
3689         if (rc)
3690                 GOTO(unlock1, rc);
3691
3692         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3693
3694         handle = dt_trans_create(env, dev);
3695         if (IS_ERR(handle))
3696                 GOTO(unlock1, rc = PTR_ERR(handle));
3697
3698         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3699         if (rc != 0)
3700                 GOTO(stop, rc);
3701
3702         rc = dt_attr_get(env, parent, la);
3703         if (rc != 0)
3704                 GOTO(stop, rc);
3705
3706         la->la_valid = LA_UID | LA_GID;
3707         rc = dt_declare_attr_set(env, child, la, handle);
3708         if (rc != 0)
3709                 GOTO(stop, rc);
3710
3711         rc = dt_trans_start_local(env, dev, handle);
3712         if (rc != 0)
3713                 GOTO(stop, rc);
3714
3715         dt_write_lock(env, parent, 0);
3716         if (unlikely(lfsck_is_dead_obj(parent)))
3717                 GOTO(unlock2, rc = 1);
3718
3719         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3720         if (rc != 0)
3721                 GOTO(unlock2, rc);
3722
3723         /* Get the latest parent's owner. */
3724         rc = dt_attr_get(env, parent, la);
3725         if (rc != 0)
3726                 GOTO(unlock2, rc);
3727
3728         la->la_valid = LA_UID | LA_GID;
3729         rc = dt_attr_set(env, child, la, handle);
3730
3731         GOTO(unlock2, rc);
3732
3733 unlock2:
3734         dt_write_unlock(env, parent);
3735
3736 stop:
3737         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3738
3739 unlock1:
3740         lfsck_ibits_unlock(&lh, LCK_EX);
3741
3742 log:
3743         if (rc)
3744                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3745                        "unmatched MDT-OST pair for: parent "DFID
3746                        ", child "DFID", comp_id %u, OST-index %u, "
3747                        "stripe-index %u, owner %u/%u: rc = %d\n",
3748                        lfsck_lfsck2name(com->lc_lfsck),
3749                        PFID(lfsck_dto2fid(parent)),
3750                        PFID(lfsck_dto2fid(child)),
3751                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3752                        la->la_uid, la->la_gid, rc);
3753
3754         return rc;
3755 }
3756
3757 /* If there are more than one MDT-objects claim as the OST-object's parent,
3758  * and the OST-object only recognizes one of them, then we need to generate
3759  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). */
3760 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3761                                                    struct lfsck_component *com,
3762                                                    struct dt_object *parent,
3763                                                    struct lfsck_layout_req *llr,
3764                                                    struct lu_attr *la)
3765 {
3766         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3767         struct dt_allocation_hint       *hint   = &info->lti_hint;
3768         struct dt_object_format         *dof    = &info->lti_dof;
3769         struct ost_id                   *oi     = &info->lti_oi;
3770         struct lu_buf                   *buf    = &info->lti_big_buf;
3771         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3772         struct dt_device                *dev;
3773         struct lu_device                *d      =
3774                                 &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3775         struct lu_object                *o;
3776         struct lu_object                *n;
3777         struct dt_object                *child  = NULL;
3778         struct thandle                  *handle = NULL;
3779         struct lov_mds_md_v1            *lmm;
3780         struct lov_ost_data_v1          *objs;
3781         const struct lu_fid             *pfid   = lfsck_dto2fid(parent);
3782         struct lu_fid                    tfid;
3783         struct lustre_handle             lh     = { 0 };
3784         __u32                            magic;
3785         __u32                            index;
3786         int                              rc;
3787         ENTRY;
3788
3789         /* We use two separated transactions to repair the inconsistency.
3790          *
3791          * 1) create the child (OST-object).
3792          * 2) update the parent LOV EA according to the child's FID.
3793          *
3794          * If 1) succeed, but 2) failed or aborted, then such OST-object will be
3795          * handled as orphan when the layout LFSCK run next time.
3796          *
3797          * If 1) failed, but 2) succeed, then such OST-object will be re-created
3798          * as dangling referened case when the layout LFSCK run next time. */
3799
3800         /* The 1st transaction. */
3801         o = lu_object_anon(env, d, NULL);
3802         if (IS_ERR(o))
3803                 GOTO(log, rc = PTR_ERR(o));
3804
3805         n = lu_object_locate(o->lo_header, d->ld_type);
3806         if (unlikely(n == NULL)) {
3807                 lu_object_put_nocache(env, o);
3808
3809                 GOTO(log, rc = -EINVAL);
3810         }
3811
3812         child = container_of(n, struct dt_object, do_lu);
3813         memset(hint, 0, sizeof(*hint));
3814         rc = dt_attr_get(env, parent, la);
3815         if (rc != 0)
3816                 GOTO(log, rc);
3817
3818         la->la_valid = LA_UID | LA_GID;
3819         memset(dof, 0, sizeof(*dof));
3820
3821         dev = lfsck_obj2dev(child);
3822         handle = dt_trans_create(env, dev);
3823         if (IS_ERR(handle))
3824                 GOTO(log, rc = PTR_ERR(handle));
3825
3826         rc = dt_declare_create(env, child, la, hint, dof, handle);
3827         if (rc != 0)
3828                 GOTO(stop, rc);
3829
3830         rc = dt_trans_start_local(env, dev, handle);
3831         if (rc != 0)
3832                 GOTO(stop, rc);
3833
3834         rc = dt_create(env, child, la, hint, dof, handle);
3835         dt_trans_stop(env, dev, handle);
3836         handle = NULL;
3837         if (rc != 0)
3838                 GOTO(log, rc);
3839
3840         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3841                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3842                               LCK_EX);
3843         if (rc != 0)
3844                 GOTO(log, rc);
3845
3846         /* The 2nd transaction. */
3847
3848         /* XXX: Generally, we should use bottom device (OSD) to update parent
3849          *      LOV EA. But because the LOD-object still references the wrong
3850          *      OSP-object that should be detached after the parent's LOV EA
3851          *      refreshed. Unfortunately, there is no suitable API for that.
3852          *      So we have to make the LOD to re-load the OSP-object(s) via
3853          *      replacing the LOV EA against the LOD-object.
3854          *
3855          *      Once the DNE2 patches have been landed, we can replace the
3856          *      LOD device with the OSD device. LU-6230. */
3857
3858         dev = lfsck->li_next;
3859         parent = lfsck_object_locate(dev, parent);
3860         if (IS_ERR(parent))
3861                 GOTO(log, rc = PTR_ERR(parent));
3862
3863         handle = dt_trans_create(env, dev);
3864         if (IS_ERR(handle))
3865                 GOTO(log, rc = PTR_ERR(handle));
3866
3867         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3868                                   LU_XATTR_REPLACE, handle);
3869         if (rc != 0)
3870                 GOTO(stop, rc);
3871
3872         rc = dt_trans_start_local(env, dev, handle);
3873         if (rc != 0)
3874                 GOTO(stop, rc);
3875
3876         dt_write_lock(env, parent, 0);
3877         if (unlikely(lfsck_is_dead_obj(parent)))
3878                 GOTO(unlock, rc = 0);
3879
3880         rc = lfsck_layout_get_lovea(env, parent, buf);
3881         if (unlikely(rc == -ENODATA))
3882                 rc = 0;
3883         if (rc <= 0)
3884                 GOTO(unlock, rc);
3885
3886         lmm = buf->lb_buf;
3887         magic = le32_to_cpu(lmm->lmm_magic);
3888         if (magic == LOV_MAGIC_COMP_V1) {
3889                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
3890                 struct lov_comp_md_entry_v1 *lcme;
3891                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3892                 int i;
3893
3894                 LASSERT(llr->llr_comp_id != 0);
3895
3896                 for (i = 0; i < count; i++) {
3897                         lcme = &lcm->lcm_entries[i];
3898                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
3899                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3900                                         LCME_FL_INIT);
3901
3902                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
3903                                 lmm = buf->lb_buf +
3904                                         le32_to_cpu(lcme->lcme_offset);
3905                                 magic = le32_to_cpu(lmm->lmm_magic);
3906                                 goto set;
3907                         }
3908                 }
3909
3910                 GOTO(unlock, rc = 0);
3911         }
3912
3913 set:
3914         if (magic == LOV_MAGIC_V1) {
3915                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
3916         } else {
3917                 LASSERT(magic == LOV_MAGIC_V3);
3918                 objs =
3919                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
3920         }
3921
3922         ostid_le_to_cpu(&objs->l_ost_oi, oi);
3923         index = le32_to_cpu(objs->l_ost_idx);
3924         rc = ostid_to_fid(&tfid, oi, index);
3925         /* Someone changed layout during the LFSCK, no need to repair then. */
3926         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
3927                 GOTO(unlock, rc = 0);
3928
3929         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3930         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
3931         ostid_cpu_to_le(oi, &objs->l_ost_oi);
3932         objs->l_ost_gen = cpu_to_le32(0);
3933         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
3934         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3935                           LU_XATTR_REPLACE, handle);
3936
3937         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
3938
3939 unlock:
3940         dt_write_unlock(env, parent);
3941
3942 stop:
3943         if (handle != NULL)
3944                 dt_trans_stop(env, dev, handle);
3945
3946 log:
3947         lfsck_ibits_unlock(&lh, LCK_EX);
3948         if (child != NULL)
3949                 lfsck_object_put(env, child);
3950
3951         if (rc)
3952                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3953                        "multiple references for: parent "DFID", comp_id %u, "
3954                        "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
3955                        lfsck_lfsck2name(lfsck), PFID(pfid),
3956                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3957                        la->la_uid, la->la_gid, rc);
3958
3959         return rc;
3960 }
3961
3962 /* If the MDT-object and the OST-object have different owner information,
3963  * then trust the MDT-object, because the normal chown/chgrp handle order
3964  * is from MDT to OST, and it is possible that some chown/chgrp operation
3965  * is partly done. */
3966 static int lfsck_layout_repair_owner(const struct lu_env *env,
3967                                      struct lfsck_component *com,
3968                                      struct dt_object *parent,
3969                                      struct lfsck_layout_req *llr,
3970                                      struct lu_attr *pla,
3971                                      const struct lu_attr *cla)
3972 {
3973         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3974         struct lu_attr                  *tla    = &info->lti_la2;
3975         struct dt_object                *child  = llr->llr_child;
3976         struct dt_device                *dev    = lfsck_obj2dev(child);
3977         struct thandle                  *handle;
3978         int                              rc;
3979         ENTRY;
3980
3981         tla->la_uid = pla->la_uid;
3982         tla->la_gid = pla->la_gid;
3983         tla->la_valid = LA_UID | LA_GID;
3984         handle = dt_trans_create(env, dev);
3985         if (IS_ERR(handle))
3986                 GOTO(log, rc = PTR_ERR(handle));
3987
3988         rc = dt_declare_attr_set(env, child, tla, handle);
3989         if (rc != 0)
3990                 GOTO(stop, rc);
3991
3992         rc = dt_trans_start_local(env, dev, handle);
3993         if (rc != 0)
3994                 GOTO(stop, rc);
3995
3996         /* Use the dt_object lock to serialize with destroy and attr_set. */
3997         dt_read_lock(env, parent, 0);
3998         if (unlikely(lfsck_is_dead_obj(parent)))
3999                 GOTO(unlock, rc = 1);
4000
4001         /* Get the latest parent's owner. */
4002         rc = dt_attr_get(env, parent, pla);
4003         if (rc != 0)
4004                 GOTO(unlock, rc);
4005
4006         /* Some others chown/chgrp during the LFSCK, needs to do nothing. */
4007         if (unlikely(tla->la_uid != pla->la_uid ||
4008                      tla->la_gid != pla->la_gid))
4009                 rc = 1;
4010         else
4011                 rc = dt_attr_set(env, child, tla, handle);
4012
4013         GOTO(unlock, rc);
4014
4015 unlock:
4016         dt_read_unlock(env, parent);
4017
4018 stop:
4019         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4020
4021 log:
4022         if (rc != 0)
4023                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4024                        "inconsistent file owner for: parent "DFID", child "DFID
4025                        ", OST-index %u, stripe-index %u, old owner %u/%u, "
4026                        "new owner %u/%u: rc = %d\n",
4027                        lfsck_lfsck2name(com->lc_lfsck),
4028                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4029                        llr->llr_ost_idx, llr->llr_lov_idx,
4030                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
4031
4032         return rc;
4033 }
4034
4035 /* Check whether the OST-object correctly back points to the
4036  * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). */
4037 static int lfsck_layout_check_parent(const struct lu_env *env,
4038                                      struct lfsck_component *com,
4039                                      struct lfsck_assistant_object *lso,
4040                                      struct filter_fid *ff,
4041                                      const struct lu_fid *cfid,
4042                                      const struct lu_attr *cla,
4043                                      struct lfsck_layout_req *llr)
4044 {
4045         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4046         struct lu_buf                   *buf    = &info->lti_big_buf;
4047         struct lu_fid                   *pfid   = &info->lti_fid;
4048         struct dt_object                *tobj;
4049         struct lov_mds_md_v1            *lmm;
4050         struct lov_ost_data_v1          *objs;
4051         struct lustre_handle             lh     = { 0 };
4052         int                              rc;
4053         int                              i;
4054         __u32                            magic;
4055         __u32                            idx;
4056         __u16                            count;
4057         ENTRY;
4058
4059         *pfid = ff->ff_parent;
4060         idx = pfid->f_stripe_idx;
4061         pfid->f_ver = 0;
4062
4063         if (unlikely(!fid_is_sane(pfid)))
4064                 RETURN(LLIT_UNMATCHED_PAIR);
4065
4066         if (lu_fid_eq(pfid, &lso->lso_fid)) {
4067                 if (likely(llr->llr_lov_idx == idx))
4068                         RETURN(0);
4069
4070                 RETURN(LLIT_UNMATCHED_PAIR);
4071         }
4072
4073         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4074         if (IS_ERR(tobj))
4075                 RETURN(PTR_ERR(tobj));
4076
4077         if (dt_object_exists(tobj) == 0 || lfsck_is_dead_obj(tobj) ||
4078             !S_ISREG(lfsck_object_type(tobj)))
4079                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4080
4081         /* Load the tobj's layout EA, in spite of it is a local MDT-object or
4082          * remote one on another MDT. Then check whether the given OST-object
4083          * is in such layout. If yes, it is multiple referenced, otherwise it
4084          * is unmatched referenced case. */
4085         rc = lfsck_layout_get_lovea(env, tobj, buf);
4086         if (rc == 0 || rc == -ENODATA || rc == -ENOENT)
4087                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4088
4089         if (unlikely(rc == -EOPNOTSUPP))
4090                 GOTO(out, rc = LLIT_NONE);
4091
4092         if (rc < 0)
4093                 GOTO(out, rc);
4094
4095         lmm = buf->lb_buf;
4096         magic = le32_to_cpu(lmm->lmm_magic);
4097         if (magic == LOV_MAGIC_COMP_V1) {
4098                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4099                 struct lov_comp_md_entry_v1 *lcme;
4100
4101                 if (ff->ff_layout.ol_comp_id == 0)
4102                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4103
4104                 count = le16_to_cpu(lcm->lcm_entry_count);
4105                 for (i = 0; i < count; i++) {
4106                         lcme = &lcm->lcm_entries[i];
4107                         if (le32_to_cpu(lcme->lcme_id) ==
4108                             ff->ff_layout.ol_comp_id) {
4109                                 lmm = buf->lb_buf +
4110                                         le32_to_cpu(lcme->lcme_offset);
4111                                 magic = le32_to_cpu(lmm->lmm_magic);
4112                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4113                                       LCME_FL_INIT))
4114                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4115
4116                                 goto further;
4117                         }
4118                 }
4119
4120                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4121         }
4122
4123 further:
4124         if (magic == LOV_MAGIC_V1) {
4125                 objs = &lmm->lmm_objects[0];
4126         } else {
4127                 LASSERT(magic == LOV_MAGIC_V3);
4128                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4129         }
4130
4131         count = le16_to_cpu(lmm->lmm_stripe_count);
4132         for (i = 0; i < count; i++, objs++) {
4133                 struct lu_fid           *tfid   = &info->lti_fid2;
4134                 struct ost_id           *oi     = &info->lti_oi;
4135                 __u32                    idx2;
4136
4137                 if (lovea_slot_is_dummy(objs))
4138                         continue;
4139
4140                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4141                 idx2 = le32_to_cpu(objs->l_ost_idx);
4142                 rc = ostid_to_fid(tfid, oi, idx2);
4143                 if (rc != 0) {
4144                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
4145                                "invalid layout EA at the slot %d, index %u\n",
4146                                lfsck_lfsck2name(com->lc_lfsck),
4147                                PFID(pfid), i, idx2);
4148
4149                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4150                 }
4151
4152                 if (lu_fid_eq(cfid, tfid)) {
4153                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4154                                               MDS_INODELOCK_UPDATE |
4155                                               MDS_INODELOCK_LAYOUT |
4156                                               MDS_INODELOCK_XATTR,
4157                                               LCK_EX);
4158                         if (rc != 0)
4159                                 GOTO(out, rc);
4160
4161                         dt_read_lock(env, tobj, 0);
4162
4163                         /* For local MDT-object, re-check existence
4164                          * after taken the lock. */
4165                         if (!dt_object_remote(tobj)) {
4166                                 if (dt_object_exists(tobj) == 0 ||
4167                                     lfsck_is_dead_obj(tobj))
4168                                         rc = LLIT_UNMATCHED_PAIR;
4169                                 else
4170                                         rc = LLIT_MULTIPLE_REFERENCED;
4171
4172                                 GOTO(unlock, rc);
4173                         }
4174
4175                         /* For migration case, the new MDT-object and old
4176                          * MDT-object may reference the same OST-object at
4177                          * some migration internal time.
4178                          *
4179                          * For remote MDT-object, the local MDT may not know
4180                          * whether it has been removed or not.  Try checking
4181                          * for a non-existent xattr to check if this object
4182                          * has been been removed or not. */
4183                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4184                                           XATTR_NAME_DUMMY);
4185                         if (unlikely(rc == -ENOENT || rc >= 0))
4186                                 rc = LLIT_UNMATCHED_PAIR;
4187                         else if (rc == -ENODATA)
4188                                 rc = LLIT_MULTIPLE_REFERENCED;
4189
4190                         GOTO(unlock, rc);
4191                 }
4192         }
4193
4194         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4195
4196 unlock:
4197         if (lustre_handle_is_used(&lh)) {
4198                 dt_read_unlock(env, tobj);
4199                 lfsck_ibits_unlock(&lh, LCK_EX);
4200         }
4201
4202 out:
4203         lfsck_object_put(env, tobj);
4204
4205         return rc;
4206 }
4207
4208 static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
4209                                              struct lfsck_component *com,
4210                                              struct lfsck_assistant_req *lar)
4211 {
4212         struct lfsck_layout_req              *llr    =
4213                         container_of0(lar, struct lfsck_layout_req, llr_lar);
4214         struct lfsck_assistant_object        *lso    = lar->lar_parent;
4215         struct lfsck_layout                  *lo     = com->lc_file_ram;
4216         struct lfsck_thread_info             *info   = lfsck_env_info(env);
4217         struct filter_fid                    *ff     = &info->lti_ff;
4218         struct lu_buf buf = { .lb_buf = ff,
4219                               .lb_len = sizeof(*ff) };
4220         struct dt_object                     *parent = NULL;
4221         struct dt_object                     *child  = llr->llr_child;
4222         struct lu_attr                       *pla    = &lso->lso_attr;
4223         struct lu_attr                       *cla    = &info->lti_la;
4224         struct lfsck_instance                *lfsck  = com->lc_lfsck;
4225         struct lfsck_bookmark                *bk     = &lfsck->li_bookmark_ram;
4226         enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
4227         int                                   rc;
4228         ENTRY;
4229
4230         if (lso->lso_dead)
4231                 RETURN(0);
4232
4233         CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
4234
4235         rc = dt_attr_get(env, child, cla);
4236         if (rc == -ENOENT) {
4237                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4238                 if (IS_ERR(parent)) {
4239                         rc = PTR_ERR(parent);
4240
4241                         RETURN(rc == -ENOENT ? 0 : rc);
4242                 }
4243
4244                 type = LLIT_DANGLING;
4245                 goto repair;
4246         }
4247
4248         if (rc != 0)
4249                 GOTO(out, rc);
4250
4251         lfsck_buf_init(&buf, ff, sizeof(*ff));
4252         rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
4253         if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
4254                 type = LLIT_UNMATCHED_PAIR;
4255                 goto repair;
4256         }
4257
4258         if (rc < 0 && rc != -ENODATA)
4259                 GOTO(out, rc);
4260
4261         if (rc == 0 || rc == -ENODATA)
4262                 GOTO(check_owner, rc = 0);
4263
4264         filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
4265         rc = lfsck_layout_check_parent(env, com, lso, ff,
4266                                        lu_object_fid(&child->do_lu), cla, llr);
4267         if (rc > 0) {
4268                 type = rc;
4269                 goto repair;
4270         }
4271
4272         if (rc < 0)
4273                 GOTO(out, rc);
4274
4275 check_owner:
4276         /* Someone may has changed the owner after the parent attr pre-loaded.
4277          * It can be handled later inside the lfsck_layout_repair_owner(). */
4278         if (unlikely(cla->la_uid != pla->la_uid ||
4279                      cla->la_gid != pla->la_gid)) {
4280                 type = LLIT_INCONSISTENT_OWNER;
4281                 goto repair;
4282         }
4283
4284 repair:
4285         if (type == LLIT_NONE)
4286                 GOTO(out, rc = 0);
4287
4288         if (bk->lb_param & LPF_DRYRUN)
4289                 GOTO(out, rc = 1);
4290
4291         if (parent == NULL) {
4292                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4293                 if (IS_ERR(parent)) {
4294                         rc = PTR_ERR(parent);
4295
4296                         if (rc == -ENOENT)
4297                                 RETURN(0);
4298
4299                         GOTO(out, rc);
4300                 }
4301         }
4302
4303         switch (type) {
4304         case LLIT_DANGLING:
4305                 if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
4306                         rc = lfsck_layout_ins_dangling_rec(env, com,
4307                                 lfsck_dto2fid(parent), lfsck_dto2fid(child),
4308                                 llr->llr_comp_id, llr->llr_lov_idx,
4309                                 llr->llr_ost_idx);
4310                 else
4311                         rc = __lfsck_layout_repair_dangling(env, com, parent,
4312                                                             llr->llr_child,
4313                                                             llr->llr_comp_id,
4314                                                             llr->llr_lov_idx,
4315                                                             llr->llr_ost_idx,
4316                                                             true);
4317                 break;
4318         case LLIT_UNMATCHED_PAIR:
4319                 rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
4320                                                         llr, pla);
4321                 break;
4322         case LLIT_MULTIPLE_REFERENCED:
4323                 rc = lfsck_layout_repair_multiple_references(env, com, parent,
4324                                                              llr, pla);
4325                 break;
4326         case LLIT_INCONSISTENT_OWNER:
4327                 rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
4328                 break;
4329         default:
4330                 rc = 0;
4331                 break;
4332         }
4333
4334         GOTO(out, rc);
4335
4336 out:
4337         down_write(&com->lc_sem);
4338         if (rc < 0) {
4339                 struct lfsck_assistant_data *lad = com->lc_data;
4340
4341                 if (unlikely(lad->lad_exit)) {
4342                         rc = 0;
4343                 } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
4344                            rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
4345                            rc == -EHOSTUNREACH) {
4346                         /* If cannot touch the target server,
4347                          * mark the LFSCK as INCOMPLETE. */
4348                         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to "
4349                                "talk with OST %x: rc = %d\n",
4350                                lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
4351                         lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
4352                         lo->ll_objs_skipped++;
4353                         rc = 0;
4354                 } else {
4355                         lfsck_layout_record_failure(env, lfsck, lo);
4356                 }
4357         } else if (rc > 0 && (type != LLIT_DANGLING ||
4358                               !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
4359                 LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
4360                          "unknown type = %d\n", type);
4361
4362                 lo->ll_objs_repaired[type - 1]++;
4363                 if (bk->lb_param & LPF_DRYRUN &&
4364                     unlikely(lo->ll_pos_first_inconsistent == 0))
4365                         lo->ll_pos_first_inconsistent =
4366                         lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
4367                                                         lfsck->li_di_oit);
4368         }
4369         up_write(&com->lc_sem);
4370
4371         if (parent != NULL && !IS_ERR(parent))
4372                 lfsck_object_put(env, parent);
4373
4374         return rc;
4375 }
4376
4377 static int
4378 lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
4379                                         struct lfsck_component *com,
4380                                         struct dt_object *obj, bool first)
4381 {
4382         struct lfsck_instance *lfsck = com->lc_lfsck;
4383         struct ptlrpc_thread *thread = &lfsck->li_thread;
4384         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4385         struct lfsck_layout *lo = com->lc_file_ram;
4386         const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
4387         struct dt_it *di;
4388         struct dt_key *key;
4389         struct lfsck_layout_dangling_key *parent =
4390                                         &lfsck_env_info(env)->lti_lldk;
4391         struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
4392         __u32 ost_idx;
4393         int rc;
4394         ENTRY;
4395
4396         di = iops->init(env, obj, 0);
4397         if (IS_ERR(di))
4398                 RETURN(PTR_ERR(di));
4399
4400         if (first)
4401                 lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
4402         else
4403                 memset(parent, 0, sizeof(*parent));
4404         rc = iops->get(env, di, (const struct dt_key *)parent);
4405         if (rc < 0)
4406                 GOTO(fini, rc);
4407
4408         if (first) {
4409                 /* The start one either has been processed or does not exist,
4410                  * skip it. */
4411                 rc = iops->next(env, di);
4412                 if (rc != 0)
4413                         GOTO(put, rc);
4414         }
4415
4416         do {
4417                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
4418                     unlikely(!thread_is_running(thread)))
4419                         GOTO(put, rc = 0);
4420
4421                 key = iops->key(env, di);
4422                 if (IS_ERR(key)) {
4423                         rc = PTR_ERR(key);
4424                         if (rc == -ENOENT)
4425                                 GOTO(put, rc = 1);
4426
4427                         goto checkpoint;
4428                 }
4429
4430                 lldk_be_to_cpu(parent,
4431                                 (const struct lfsck_layout_dangling_key *)key);
4432                 if (!fid_is_sane(&parent->lldk_fid)) {
4433                         rc = 0;
4434                         goto checkpoint;
4435                 }
4436
4437                 rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
4438                 if (rc == 0) {
4439                         fid_be_to_cpu(cfid, cfid);
4440                         ost_idx = cfid->f_ver;
4441                         cfid->f_ver = 0;
4442                         if (!fid_is_sane(cfid)) {
4443                                 rc = 0;
4444                                 goto checkpoint;
4445                         }
4446
4447                         rc = lfsck_layout_repair_dangling(env, com,
4448                                         &parent->lldk_fid, cfid,
4449                                         parent->lldk_comp_id,
4450                                         parent->lldk_ea_off, ost_idx);
4451                 }
4452
4453 checkpoint:
4454                 down_write(&com->lc_sem);
4455                 com->lc_new_checked++;
4456                 com->lc_new_scanned++;
4457                 if (rc >= 0)
4458                         lo->ll_lldk_latest_scanned_phase2 = *parent;
4459
4460                 if (rc > 0)
4461                         lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
4462                 else if (rc < 0)
4463                         lo->ll_objs_failed_phase2++;
4464                 up_write(&com->lc_sem);
4465
4466                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
4467                         GOTO(put, rc);
4468
4469                 if (unlikely(com->lc_time_next_checkpoint <=
4470                              ktime_get_seconds()) &&
4471                     com->lc_new_checked != 0) {
4472                         down_write(&com->lc_sem);
4473                         lo->ll_run_time_phase2 += ktime_get_seconds() -
4474                                                   com->lc_time_last_checkpoint;
4475                         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
4476                         lo->ll_objs_checked_phase2 += com->lc_new_checked;
4477                         com->lc_new_checked = 0;
4478                         lfsck_layout_store(env, com);
4479                         up_write(&com->lc_sem);
4480
4481                         com->lc_time_last_checkpoint = ktime_get_seconds();
4482                         com->lc_time_next_checkpoint =
4483                                 com->lc_time_last_checkpoint +
4484                                 LFSCK_CHECKPOINT_INTERVAL;
4485                 }
4486
4487                 lfsck_control_speed_by_self(com);
4488                 if (unlikely(!thread_is_running(thread)))
4489                         GOTO(put, rc = 0);
4490
4491                 rc = iops->next(env, di);
4492         } while (rc == 0);
4493
4494         GOTO(put, rc);
4495
4496 put:
4497         iops->put(env, di);
4498
4499 fini:
4500         iops->fini(env, di);
4501
4502         return rc;
4503 }
4504
4505 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4506                                              struct lfsck_component *com)
4507 {
4508         struct lfsck_assistant_data     *lad    = com->lc_data;
4509         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4510         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
4511         struct lfsck_tgt_descs          *ltds   = &lfsck->li_ost_descs;
4512         struct lfsck_tgt_desc           *ltd;
4513         int                              rc     = 0;
4514         ENTRY;
4515
4516         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4517                lfsck_lfsck2name(lfsck));
4518
4519         spin_lock(&ltds->ltd_lock);
4520         while (!list_empty(&lad->lad_ost_phase2_list)) {
4521                 ltd = list_entry(lad->lad_ost_phase2_list.next,
4522                                  struct lfsck_tgt_desc,
4523                                  ltd_layout_phase_list);
4524                 list_del_init(&ltd->ltd_layout_phase_list);
4525                 if (bk->lb_param & LPF_OST_ORPHAN) {
4526                         spin_unlock(&ltds->ltd_lock);
4527                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4528                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4529                                 RETURN(rc);
4530
4531                         if (unlikely(lad->lad_exit ||
4532                                      !thread_is_running(&lfsck->li_thread)))
4533                                 RETURN(0);
4534                         spin_lock(&ltds->ltd_lock);
4535                 }
4536         }
4537
4538         if (list_empty(&lad->lad_ost_phase1_list))
4539                 rc = 1;
4540         else
4541                 rc = 0;
4542         spin_unlock(&ltds->ltd_lock);
4543
4544         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4545                 struct lfsck_layout *lo = com->lc_file_ram;
4546                 int i;
4547
4548                 com->lc_new_checked = 0;
4549                 com->lc_new_scanned = 0;
4550                 com->lc_time_last_checkpoint = ktime_get_seconds();
4551                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4552                                                LFSCK_CHECKPOINT_INTERVAL;
4553
4554                 i = lfsck_sub_trace_file_fid2idx(
4555                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4556                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4557                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4558                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4559                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4560                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4561
4562                 CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop "
4563                        "at the No. %d trace file: rc = %d\n",
4564                        lfsck_lfsck2name(lfsck), i, rc);
4565         }
4566
4567         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4568                lfsck_lfsck2name(lfsck), rc);
4569
4570         RETURN(rc);
4571 }
4572
4573 static int
4574 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4575                                    struct ptlrpc_request *req,
4576                                    void *args, int rc)
4577 {
4578         struct lfsck_layout_slave_async_args *llsaa = args;
4579         struct obd_export                    *exp   = llsaa->llsaa_exp;
4580         struct lfsck_component               *com   = llsaa->llsaa_com;
4581         struct lfsck_layout_slave_target     *llst  = llsaa->llsaa_llst;
4582         struct lfsck_layout_slave_data       *llsd  = com->lc_data;
4583         struct lfsck_reply                   *lr    = NULL;
4584         bool                                  done  = false;
4585
4586         if (rc != 0) {
4587                 /* It is probably caused by network trouble, or target crash,
4588                  * it will try several times (depends on the obd_timeout, and
4589                  * will not less than 3 times). But to make the LFSCK can go
4590                  * ahead, we should not try for ever. After some try but still
4591                  * hit failure, it will assume that the target exit the LFSCK
4592                  * prcoessing and stop try. */
4593                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4594                         int max_try = max_t(int, obd_timeout / 30, 3);
4595
4596                         if (++(llst->llst_failures) > max_try)
4597                                 done = true;
4598                 } else {
4599                         done = true;
4600                 }
4601         } else {
4602                 llst->llst_failures = 0;
4603                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4604                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4605                     lr->lr_status != LS_SCANNING_PHASE2)
4606                         done = true;
4607         }
4608
4609         if (done) {
4610                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
4611                        "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
4612                        llst->llst_index, lr != NULL ? lr->lr_status : rc,
4613                        llst->llst_failures);
4614
4615                 lfsck_layout_llst_del(llsd, llst);
4616         }
4617
4618         lfsck_layout_llst_put(llst);
4619         lfsck_component_put(env, com);
4620         class_export_put(exp);
4621
4622         return 0;
4623 }
4624
4625 static int lfsck_layout_async_query(const struct lu_env *env,
4626                                     struct lfsck_component *com,
4627                                     struct obd_export *exp,
4628                                     struct lfsck_layout_slave_target *llst,
4629                                     struct lfsck_request *lr,
4630                                     struct ptlrpc_request_set *set)
4631 {
4632         struct lfsck_layout_slave_async_args *llsaa;
4633         struct ptlrpc_request                *req;
4634         struct lfsck_request                 *tmp;
4635         int                                   rc;
4636         ENTRY;
4637
4638         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4639         if (req == NULL)
4640                 RETURN(-ENOMEM);
4641
4642         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4643         if (rc != 0) {
4644                 ptlrpc_request_free(req);
4645                 RETURN(rc);
4646         }
4647
4648         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4649         *tmp = *lr;
4650         ptlrpc_request_set_replen(req);
4651
4652         llsaa = ptlrpc_req_async_args(req);
4653         llsaa->llsaa_exp = exp;
4654         llsaa->llsaa_com = lfsck_component_get(com);
4655         llsaa->llsaa_llst = llst;
4656         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4657         req->rq_allow_intr = 1;
4658         req->rq_no_delay = 1;
4659         ptlrpc_set_add_req(set, req);
4660
4661         RETURN(0);
4662 }
4663
4664 static int lfsck_layout_async_notify(const struct lu_env *env,
4665                                      struct obd_export *exp,
4666                                      struct lfsck_request *lr,
4667                                      struct ptlrpc_request_set *set)
4668 {
4669         struct ptlrpc_request   *req;
4670         struct lfsck_request    *tmp;
4671         int                      rc;
4672         ENTRY;
4673
4674         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4675         if (req == NULL)
4676                 RETURN(-ENOMEM);
4677
4678         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4679         if (rc != 0) {
4680                 ptlrpc_request_free(req);
4681                 RETURN(rc);
4682         }
4683
4684         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4685         *tmp = *lr;
4686         ptlrpc_request_set_replen(req);
4687         req->rq_allow_intr = 1;
4688         req->rq_no_delay = 1;
4689         ptlrpc_set_add_req(set, req);
4690
4691         RETURN(0);
4692 }
4693
4694 static int
4695 lfsck_layout_slave_query_master(const struct lu_env *env,
4696                                 struct lfsck_component *com)
4697 {
4698         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4699         struct lfsck_instance            *lfsck = com->lc_lfsck;
4700         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4701         struct lfsck_layout_slave_target *llst;
4702         struct obd_export                *exp;
4703         struct ptlrpc_request_set        *set;
4704         int                               rc    = 0;
4705         int                               rc1   = 0;
4706         ENTRY;
4707
4708         set = ptlrpc_prep_set();
4709         if (set == NULL)
4710                 GOTO(log, rc = -ENOMEM);
4711
4712         memset(lr, 0, sizeof(*lr));
4713         lr->lr_event = LE_QUERY;
4714         lr->lr_active = LFSCK_TYPE_LAYOUT;
4715
4716         llsd->llsd_touch_gen++;
4717         spin_lock(&llsd->llsd_lock);
4718         while (!list_empty(&llsd->llsd_master_list)) {
4719                 llst = list_entry(llsd->llsd_master_list.next,
4720                                   struct lfsck_layout_slave_target,
4721                                   llst_list);
4722                 if (llst->llst_gen == llsd->llsd_touch_gen)
4723                         break;
4724
4725                 llst->llst_gen = llsd->llsd_touch_gen;
4726                 list_move_tail(&llst->llst_list,
4727                                &llsd->llsd_master_list);
4728                 atomic_inc(&llst->llst_ref);
4729                 spin_unlock(&llsd->llsd_lock);
4730
4731                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4732                                                llst->llst_index);
4733                 if (exp == NULL) {
4734                         lfsck_layout_llst_del(llsd, llst);
4735                         lfsck_layout_llst_put(llst);
4736                         spin_lock(&llsd->llsd_lock);
4737                         continue;
4738                 }
4739
4740                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
4741                 if (rc != 0) {
4742                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4743                                "query %s for layout: rc = %d\n",
4744                                lfsck_lfsck2name(lfsck),
4745                                exp->exp_obd->obd_name, rc);
4746
4747                         rc1 = rc;
4748                         lfsck_layout_llst_put(llst);
4749                         class_export_put(exp);
4750                 }
4751                 spin_lock(&llsd->llsd_lock);
4752         }
4753         spin_unlock(&llsd->llsd_lock);
4754
4755         rc = ptlrpc_set_wait(env, set);
4756         ptlrpc_set_destroy(set);
4757
4758         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
4759
4760 log:
4761         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
4762                lfsck_lfsck2name(com->lc_lfsck), rc);
4763
4764         return rc;
4765 }
4766
4767 static void
4768 lfsck_layout_slave_notify_master(const struct lu_env *env,
4769                                  struct lfsck_component *com,
4770                                  enum lfsck_events event, int result)
4771 {
4772         struct lfsck_layout              *lo    = com->lc_file_ram;
4773         struct lfsck_instance            *lfsck = com->lc_lfsck;
4774         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4775         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4776         struct lfsck_layout_slave_target *llst;
4777         struct obd_export                *exp;
4778         struct ptlrpc_request_set        *set;
4779         int                               rc;
4780         ENTRY;
4781
4782         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
4783                lfsck_lfsck2name(com->lc_lfsck));
4784
4785         set = ptlrpc_prep_set();
4786         if (set == NULL)
4787                 RETURN_EXIT;
4788
4789         memset(lr, 0, sizeof(*lr));
4790         lr->lr_event = event;
4791         lr->lr_flags = LEF_FROM_OST;
4792         lr->lr_status = result;
4793         lr->lr_index = lfsck_dev_idx(lfsck);
4794         lr->lr_active = LFSCK_TYPE_LAYOUT;
4795         lr->lr_flags2 = lo->ll_flags;
4796         llsd->llsd_touch_gen++;
4797         spin_lock(&llsd->llsd_lock);
4798         while (!list_empty(&llsd->llsd_master_list)) {
4799                 llst = list_entry(llsd->llsd_master_list.next,
4800                                   struct lfsck_layout_slave_target,
4801                                   llst_list);
4802                 if (llst->llst_gen == llsd->llsd_touch_gen)
4803                         break;
4804
4805                 llst->llst_gen = llsd->llsd_touch_gen;
4806                 list_move_tail(&llst->llst_list,
4807                                &llsd->llsd_master_list);
4808                 atomic_inc(&llst->llst_ref);
4809                 spin_unlock(&llsd->llsd_lock);
4810
4811                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4812                                                llst->llst_index);
4813                 if (exp == NULL) {
4814                         lfsck_layout_llst_del(llsd, llst);
4815                         lfsck_layout_llst_put(llst);
4816                         spin_lock(&llsd->llsd_lock);
4817                         continue;
4818                 }
4819
4820                 rc = lfsck_layout_async_notify(env, exp, lr, set);
4821                 if (rc != 0)
4822                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4823                                "notify %s for layout: rc = %d\n",
4824                                lfsck_lfsck2name(lfsck),
4825                                exp->exp_obd->obd_name, rc);
4826
4827                 lfsck_layout_llst_put(llst);
4828                 class_export_put(exp);
4829                 spin_lock(&llsd->llsd_lock);
4830         }
4831         spin_unlock(&llsd->llsd_lock);
4832
4833         ptlrpc_set_wait(env, set);
4834         ptlrpc_set_destroy(set);
4835
4836         RETURN_EXIT;
4837 }
4838
4839 /*
4840  * \ret -ENODATA: unrecognized stripe
4841  * \ret = 0     : recognized stripe
4842  * \ret < 0     : other failures
4843  */
4844 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
4845                                            struct lfsck_component *com,
4846                                            struct lu_fid *cfid,
4847                                            struct lu_fid *pfid, __u32 comp_id)
4848 {
4849         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4850         struct lu_buf                   *buf    = &info->lti_big_buf;
4851         struct ost_id                   *oi     = &info->lti_oi;
4852         struct dt_object                *obj;
4853         struct lov_mds_md_v1            *lmm;
4854         struct lov_ost_data_v1          *objs;
4855         __u32                            idx    = pfid->f_stripe_idx;
4856         __u32                            magic;
4857         int                              rc     = 0;
4858         int                              i;
4859         __u16                            count;
4860         ENTRY;
4861
4862         pfid->f_ver = 0;
4863         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4864         if (IS_ERR(obj))
4865                 RETURN(PTR_ERR(obj));
4866
4867         dt_read_lock(env, obj, 0);
4868         if (unlikely(dt_object_exists(obj) == 0 ||
4869                      lfsck_is_dead_obj(obj)))
4870                 GOTO(unlock, rc = -ENOENT);
4871
4872         if (!S_ISREG(lfsck_object_type(obj)))
4873                 GOTO(unlock, rc = -ENODATA);
4874
4875         rc = lfsck_layout_get_lovea(env, obj, buf);
4876         if (rc < 0)
4877                 GOTO(unlock, rc);
4878
4879         lmm = buf->lb_buf;
4880         magic = le32_to_cpu(lmm->lmm_magic);
4881         if (magic == LOV_MAGIC_COMP_V1) {
4882                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4883                 struct lov_comp_md_entry_v1 *lcme;
4884
4885                 if (comp_id == 0)
4886                         GOTO(unlock, rc = -ENODATA);
4887
4888                 count = le16_to_cpu(lcm->lcm_entry_count);
4889                 for (i = 0; i < count; i++) {
4890                         lcme = &lcm->lcm_entries[i];
4891                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
4892                                 lmm = buf->lb_buf +
4893                                         le32_to_cpu(lcme->lcme_offset);
4894                                 magic = le32_to_cpu(lmm->lmm_magic);
4895                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4896                                       LCME_FL_INIT))
4897                                         GOTO(unlock, rc = -ENODATA);
4898
4899                                 goto further;
4900                         }
4901                 }
4902
4903                 GOTO(unlock, rc = -ENODATA);
4904         }
4905
4906 further:
4907         if (magic == LOV_MAGIC_V1) {
4908                 objs = &lmm->lmm_objects[0];
4909         } else {
4910                 LASSERT(magic == LOV_MAGIC_V3);
4911                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4912         }
4913
4914         fid_to_ostid(cfid, oi);
4915         count = le16_to_cpu(lmm->lmm_stripe_count);
4916         for (i = 0; i < count; i++, objs++) {
4917                 struct ost_id oi2;
4918
4919                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
4920                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
4921                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
4922         }
4923
4924         GOTO(unlock, rc = -ENODATA);
4925
4926 unlock:
4927         dt_read_unlock(env, obj);
4928         lfsck_object_put(env, obj);
4929
4930         return rc;
4931 }
4932
4933 /*
4934  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
4935  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
4936  * layout EA from MDT to OST. On one hand, the OST no need to understand
4937  * the layout EA structure; on the other hand, it may cause trouble when
4938  * transfer large layout EA from MDT to OST via normal OUT RPC.
4939  *
4940  * \ret > 0: unrecognized stripe
4941  * \ret = 0: recognized stripe
4942  * \ret < 0: other failures
4943  */
4944 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
4945                                           struct lfsck_component *com,
4946                                           struct lu_fid *cfid,
4947                                           struct lu_fid *pfid, __u32 comp_id)
4948 {
4949         struct lfsck_instance    *lfsck  = com->lc_lfsck;
4950         struct obd_device        *obd    = lfsck->li_obd;
4951         struct seq_server_site   *ss     = lfsck_dev_site(lfsck);
4952         struct obd_export        *exp    = NULL;
4953         struct ptlrpc_request    *req    = NULL;
4954         struct lfsck_request     *lr;
4955         struct lu_seq_range      *range  = &lfsck_env_info(env)->lti_range;
4956         int                       rc     = 0;
4957         ENTRY;
4958
4959         if (unlikely(fid_is_idif(pfid)))
4960                 RETURN(1);
4961
4962         fld_range_set_any(range);
4963         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
4964         if (rc != 0)
4965                 RETURN(rc == -ENOENT ? 1 : rc);
4966
4967         if (unlikely(!fld_range_is_mdt(range)))
4968                 RETURN(1);
4969
4970         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
4971         if (unlikely(exp == NULL))
4972                 RETURN(1);
4973
4974         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
4975                 GOTO(out, rc = -EOPNOTSUPP);
4976
4977         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4978         if (req == NULL)
4979                 GOTO(out, rc = -ENOMEM);
4980
4981         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4982         if (rc != 0) {
4983                 ptlrpc_request_free(req);
4984
4985                 GOTO(out, rc);
4986         }
4987
4988         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4989         memset(lr, 0, sizeof(*lr));
4990         lr->lr_event = LE_PAIRS_VERIFY;
4991         lr->lr_active = LFSCK_TYPE_LAYOUT;
4992         lr->lr_fid = *cfid; /* OST-object itself FID. */
4993         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
4994         lr->lr_comp_id = comp_id;
4995
4996         ptlrpc_request_set_replen(req);
4997         rc = ptlrpc_queue_wait(req);
4998         ptlrpc_req_finished(req);
4999
5000         if (rc == -ENOENT || rc == -ENODATA)
5001                 rc = 1;
5002
5003         GOTO(out, rc);
5004
5005 out:
5006         if (exp != NULL)
5007                 class_export_put(exp);
5008
5009         return rc;
5010 }
5011
5012 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
5013                                           struct lfsck_component *com,
5014                                           struct lfsck_req_local *lrl)
5015 {
5016         struct dt_object        *obj;
5017         int                      rc     = 0;
5018         ENTRY;
5019
5020         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
5021         if (IS_ERR(obj))
5022                 GOTO(log, rc = PTR_ERR(obj));
5023
5024         dt_write_lock(env, obj, 0);
5025         if (unlikely(dt_object_exists(obj) == 0 ||
5026                      lfsck_is_dead_obj(obj)))
5027                 GOTO(unlock, rc = 0);
5028
5029         rc = __lfsck_layout_update_pfid(env, obj, &lrl->lrl_ff_client.ff_parent,
5030                                         &lrl->lrl_ff_client.ff_layout,
5031                                         lrl->lrl_ff_client.ff_layout_version,
5032                                         lrl->lrl_ff_client.ff_range,
5033                                         lrl->lrl_ff_client.ff_parent.f_ver);
5034
5035         GOTO(unlock, rc);
5036
5037 unlock:
5038         dt_write_unlock(env, obj);
5039         lfsck_object_put(env, obj);
5040
5041 log:
5042         CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID
5043                ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
5044                PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc);
5045
5046         return rc;
5047 }
5048
5049 /* layout APIs */
5050
5051 static void lfsck_layout_slave_quit(const struct lu_env *env,
5052                                     struct lfsck_component *com);
5053
5054 static int lfsck_layout_reset(const struct lu_env *env,
5055                               struct lfsck_component *com, bool init)
5056 {
5057         struct lfsck_layout     *lo    = com->lc_file_ram;
5058         int                      rc;
5059
5060         down_write(&com->lc_sem);
5061         if (init) {
5062                 memset(lo, 0, com->lc_file_size);
5063         } else {
5064                 __u32 count = lo->ll_success_count;
5065                 time64_t last_time = lo->ll_time_last_complete;
5066
5067                 memset(lo, 0, com->lc_file_size);
5068                 lo->ll_success_count = count;
5069                 lo->ll_time_last_complete = last_time;
5070         }
5071
5072         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
5073         lo->ll_status = LS_INIT;
5074
5075         if (com->lc_lfsck->li_master) {
5076                 struct lfsck_assistant_data *lad = com->lc_data;
5077
5078                 lad->lad_incomplete = 0;
5079                 CFS_RESET_BITMAP(lad->lad_bitmap);
5080         }
5081
5082         rc = lfsck_layout_store(env, com);
5083         if (rc == 0 && com->lc_lfsck->li_master)
5084                 rc = lfsck_load_sub_trace_files(env, com,
5085                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
5086         up_write(&com->lc_sem);
5087
5088         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
5089                lfsck_lfsck2name(com->lc_lfsck), rc);
5090
5091         return rc;
5092 }
5093
5094 static void lfsck_layout_fail(const struct lu_env *env,
5095                               struct lfsck_component *com, bool new_checked)
5096 {
5097         struct lfsck_layout *lo = com->lc_file_ram;
5098
5099         down_write(&com->lc_sem);
5100         if (new_checked)
5101                 com->lc_new_checked++;
5102         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
5103         up_write(&com->lc_sem);
5104 }
5105
5106 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
5107                                           struct lfsck_component *com, bool init)
5108 {
5109         struct lfsck_instance   *lfsck   = com->lc_lfsck;
5110         struct lfsck_layout     *lo      = com->lc_file_ram;
5111         int                      rc;
5112
5113         if (!init) {
5114                 rc = lfsck_checkpoint_generic(env, com);
5115                 if (rc != 0)
5116                         return rc > 0 ? 0 : rc;
5117         }
5118
5119         down_write(&com->lc_sem);
5120         if (init) {
5121                 lo->ll_pos_latest_start =
5122                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5123         } else {
5124                 lo->ll_pos_last_checkpoint =
5125                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5126                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5127                                           lfsck->li_time_last_checkpoint;
5128                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5129                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5130                 com->lc_new_checked = 0;
5131         }
5132
5133         rc = lfsck_layout_store(env, com);
5134         up_write(&com->lc_sem);
5135
5136         CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos ["
5137                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5138                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5139
5140         return rc;
5141 }
5142
5143 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5144                                          struct lfsck_component *com, bool init)
5145 {
5146         struct lfsck_instance   *lfsck = com->lc_lfsck;
5147         struct lfsck_layout     *lo    = com->lc_file_ram;
5148         int                      rc;
5149
5150         if (com->lc_new_checked == 0 && !init)
5151                 return 0;
5152
5153         down_write(&com->lc_sem);
5154         if (init) {
5155                 lo->ll_pos_latest_start =
5156                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5157         } else {
5158                 lo->ll_pos_last_checkpoint =
5159                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5160                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5161                                           lfsck->li_time_last_checkpoint;
5162                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5163                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5164                 com->lc_new_checked = 0;
5165         }
5166
5167         rc = lfsck_layout_store(env, com);
5168         up_write(&com->lc_sem);
5169
5170         CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos ["
5171                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5172                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5173
5174         return rc;
5175 }
5176
5177 static int lfsck_layout_prep(const struct lu_env *env,
5178                              struct lfsck_component *com,
5179                              struct lfsck_start *start)
5180 {
5181         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5182         struct lfsck_layout     *lo     = com->lc_file_ram;
5183         struct lfsck_position   *pos    = &com->lc_pos_start;
5184
5185         fid_zero(&pos->lp_dir_parent);
5186         pos->lp_dir_cookie = 0;
5187         if (lo->ll_status == LS_COMPLETED ||
5188             lo->ll_status == LS_PARTIAL ||
5189             /* To handle orphan, must scan from the beginning. */
5190             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5191                 int rc;
5192
5193                 rc = lfsck_layout_reset(env, com, false);
5194                 if (rc == 0)
5195                         rc = lfsck_set_param(env, lfsck, start, true);
5196
5197                 if (rc != 0) {
5198                         CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: "
5199                                "rc = %d\n", lfsck_lfsck2name(lfsck), rc);
5200
5201                         return rc;
5202                 }
5203         }
5204
5205         down_write(&com->lc_sem);
5206         lo->ll_time_latest_start = ktime_get_real_seconds();
5207         spin_lock(&lfsck->li_lock);
5208         if (lo->ll_flags & LF_SCANNED_ONCE) {
5209                 if (!lfsck->li_drop_dryrun ||
5210                     lo->ll_pos_first_inconsistent == 0) {
5211                         lo->ll_status = LS_SCANNING_PHASE2;
5212                         list_move_tail(&com->lc_link,
5213                                        &lfsck->li_list_double_scan);
5214                         pos->lp_oit_cookie = 0;
5215                 } else {
5216                         int i;
5217
5218                         lo->ll_status = LS_SCANNING_PHASE1;
5219                         lo->ll_run_time_phase1 = 0;
5220                         lo->ll_run_time_phase2 = 0;
5221                         lo->ll_objs_checked_phase1 = 0;
5222                         lo->ll_objs_checked_phase2 = 0;
5223                         lo->ll_objs_failed_phase1 = 0;
5224                         lo->ll_objs_failed_phase2 = 0;
5225                         for (i = 0; i < LLIT_MAX; i++)
5226                                 lo->ll_objs_repaired[i] = 0;
5227
5228                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5229                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5230                 }
5231         } else {
5232                 lo->ll_status = LS_SCANNING_PHASE1;
5233                 if (!lfsck->li_drop_dryrun ||
5234                     lo->ll_pos_first_inconsistent == 0)
5235                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5236                 else
5237                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5238         }
5239         spin_unlock(&lfsck->li_lock);
5240         up_write(&com->lc_sem);
5241
5242         return 0;
5243 }
5244
5245 static int lfsck_layout_slave_prep(const struct lu_env *env,
5246                                    struct lfsck_component *com,
5247                                    struct lfsck_start_param *lsp)
5248 {
5249         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5250         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5251         struct lfsck_layout             *lo     = com->lc_file_ram;
5252         struct lfsck_start              *start  = lsp->lsp_start;
5253         int                              rc;
5254
5255         rc = lfsck_layout_prep(env, com, start);
5256         if (rc != 0)
5257                 return rc;
5258
5259         if (lo->ll_flags & LF_CRASHED_LASTID &&
5260             list_empty(&llsd->llsd_master_list)) {
5261                 LASSERT(lfsck->li_out_notify != NULL);
5262
5263                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5264                                      LE_LASTID_REBUILDING);
5265         }
5266
5267         if (!lsp->lsp_index_valid)
5268                 return 0;
5269
5270         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5271         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5272                 LASSERT(!llsd->llsd_rbtree_valid);
5273
5274                 write_lock(&llsd->llsd_rb_lock);
5275                 rc = lfsck_rbtree_setup(env, com);
5276                 write_unlock(&llsd->llsd_rb_lock);
5277         }
5278
5279         CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos ["
5280                "%llu]\n", lfsck_lfsck2name(lfsck),
5281                com->lc_pos_start.lp_oit_cookie);
5282
5283         return rc;
5284 }
5285
5286 static int lfsck_layout_master_prep(const struct lu_env *env,
5287                                     struct lfsck_component *com,
5288                                     struct lfsck_start_param *lsp)
5289 {
5290         int rc;
5291         ENTRY;
5292
5293         rc = lfsck_layout_load_bitmap(env, com);
5294         if (rc != 0) {
5295                 rc = lfsck_layout_reset(env, com, false);
5296                 if (rc == 0)
5297                         rc = lfsck_set_param(env, com->lc_lfsck,
5298                                              lsp->lsp_start, true);
5299
5300                 if (rc != 0)
5301                         GOTO(log, rc);
5302         }
5303
5304         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5305         if (rc != 0)
5306                 RETURN(rc);
5307
5308         rc = lfsck_start_assistant(env, com, lsp);
5309
5310         GOTO(log, rc);
5311
5312 log:
5313         CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos ["
5314                "%llu]\n", lfsck_lfsck2name(com->lc_lfsck),
5315                com->lc_pos_start.lp_oit_cookie);
5316
5317         return 0;
5318 }
5319
5320 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5321 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5322                                      struct lfsck_component *com,
5323                                      struct dt_object *parent,
5324                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5325 {
5326         struct lfsck_thread_info        *info    = lfsck_env_info(env);
5327         struct lfsck_instance           *lfsck   = com->lc_lfsck;
5328         struct lfsck_bookmark           *bk      = &lfsck->li_bookmark_ram;
5329         struct lfsck_layout             *lo      = com->lc_file_ram;
5330         struct lfsck_assistant_data     *lad     = com->lc_data;
5331         struct lfsck_assistant_object   *lso     = NULL;
5332         struct lov_ost_data_v1          *objs;
5333         struct lfsck_tgt_descs          *ltds    = &lfsck->li_ost_descs;
5334         struct ptlrpc_thread            *mthread = &lfsck->li_thread;
5335         struct ptlrpc_thread            *athread = &lad->lad_thread;
5336         struct l_wait_info               lwi     = { 0 };
5337         struct lu_buf                    buf;
5338         int                              rc      = 0;
5339         int                              i;
5340         __u32                            magic;
5341         __u16                            count;
5342         ENTRY;
5343
5344         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5345         magic = le32_to_cpu(lmm->lmm_magic);
5346         if (magic == LOV_MAGIC_V1) {
5347                 objs = &lmm->lmm_objects[0];
5348         } else {
5349                 LASSERT(magic == LOV_MAGIC_V3);
5350                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5351         }
5352
5353         count = le16_to_cpu(lmm->lmm_stripe_count);
5354         for (i = 0; i < count; i++, objs++) {
5355                 struct lu_fid           *fid    = &info->lti_fid;
5356                 struct ost_id           *oi     = &info->lti_oi;
5357                 struct lfsck_layout_req *llr;
5358                 struct lfsck_tgt_desc   *tgt    = NULL;
5359                 struct dt_object        *cobj   = NULL;
5360                 __u32                    index;
5361                 bool                     wakeup = false;
5362
5363                 if (unlikely(lovea_slot_is_dummy(objs)))
5364                         continue;
5365
5366                 l_wait_event(mthread->t_ctl_waitq,
5367                              lad->lad_prefetched < bk->lb_async_windows ||
5368                              !thread_is_running(mthread) ||
5369                              thread_is_stopped(athread),
5370                              &lwi);
5371
5372                 if (unlikely(!thread_is_running(mthread)) ||
5373                              thread_is_stopped(athread))
5374                         GOTO(out, rc = 0);
5375
5376                 if (unlikely(lfsck_is_dead_obj(parent)))
5377                         GOTO(out, rc = 0);
5378
5379                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5380                 index = le32_to_cpu(objs->l_ost_idx);
5381                 rc = ostid_to_fid(fid, oi, index);
5382                 if (rc != 0) {
5383                         CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID
5384                                ": "DOSTID", idx %u, comp_id %u\n",
5385                                lfsck_lfsck2name(lfsck),
5386                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5387                                index, comp_id);
5388                         goto next;
5389                 }
5390
5391                 tgt = lfsck_tgt_get(ltds, index);
5392                 if (unlikely(tgt == NULL)) {
5393                         CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which "
5394                                "did not join the layout LFSCK, comp_id %u\n",
5395                                lfsck_lfsck2name(lfsck), index, comp_id);
5396                         lfsck_lad_set_bitmap(env, com, index);
5397                         goto next;
5398                 }
5399
5400                 /* There is potential deadlock race condition between object
5401                  * destroy and layout LFSCK. Consider the following scenario:
5402                  *
5403                  * 1) The LFSCK thread obtained the parent object firstly, at
5404                  *    that time, the parent object has not been destroyed yet.
5405                  *
5406                  * 2) One RPC service thread destroyed the parent and all its
5407                  *    children objects. Because the LFSCK is referencing the
5408                  *    parent object, then the parent object will be marked as
5409                  *    dying in RAM. On the other hand, the parent object is
5410                  *    referencing all its children objects, then all children
5411                  *    objects will be marked as dying in RAM also.
5412                  *
5413                  * 3) The LFSCK thread tries to find some child object with
5414                  *    the parent object referenced. Then it will find that the
5415                  *    child object is dying. According to the object visibility
5416                  *    rules: the object with dying flag cannot be returned to
5417                  *    others. So the LFSCK thread has to wait until the dying
5418                  *    object has been purged from RAM, then it can allocate a
5419                  *    new object (with the same FID) in RAM. Unfortunately, the
5420                  *    LFSCK thread itself is referencing the parent object, and
5421                  *    cause the parent object cannot be purged, then cause the
5422                  *    child object cannot be purged also. So the LFSCK thread
5423                  *    will fall into deadlock.
5424                  */
5425                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5426                 if (IS_ERR(cobj)) {
5427                         if (lfsck_is_dead_obj(parent)) {
5428                                 lfsck_tgt_put(tgt);
5429
5430                                 GOTO(out, rc = 0);
5431                         }
5432
5433                         rc = PTR_ERR(cobj);
5434                         goto next;
5435                 }
5436
5437                 rc = dt_declare_attr_get(env, cobj);
5438                 if (rc)
5439                         goto next;
5440
5441                 rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
5442                 if (rc)
5443                         goto next;
5444
5445                 if (lso == NULL) {
5446                         struct lu_attr *attr = &info->lti_la;
5447
5448                         rc = dt_attr_get(env, parent, attr);
5449                         if (rc != 0)
5450                                 goto next;
5451
5452                         lso = lfsck_assistant_object_init(env,
5453                                 lfsck_dto2fid(parent), attr,
5454                                 lfsck->li_pos_current.lp_oit_cookie, false);
5455                         if (IS_ERR(lso)) {
5456                                 rc = PTR_ERR(lso);
5457                                 lso = NULL;
5458
5459                                 goto next;
5460                         }
5461                 }
5462
5463                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5464                                                       index, i);
5465                 if (IS_ERR(llr)) {
5466                         rc = PTR_ERR(llr);
5467                         goto next;
5468                 }
5469
5470                 cobj = NULL;
5471                 spin_lock(&lad->lad_lock);
5472                 if (lad->lad_assistant_status < 0) {
5473                         spin_unlock(&lad->lad_lock);
5474                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5475                         lfsck_tgt_put(tgt);
5476                         RETURN(lad->lad_assistant_status);
5477                 }
5478
5479                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5480                 if (lad->lad_prefetched == 0)
5481                         wakeup = true;
5482
5483                 lad->lad_prefetched++;
5484                 spin_unlock(&lad->lad_lock);
5485                 if (wakeup)
5486                         wake_up_all(&athread->t_ctl_waitq);
5487
5488 next:
5489                 down_write(&com->lc_sem);
5490                 com->lc_new_checked++;
5491                 if (rc < 0)
5492                         lfsck_layout_record_failure(env, lfsck, lo);
5493                 up_write(&com->lc_sem);
5494
5495                 if (cobj != NULL && !IS_ERR(cobj))
5496                         lfsck_object_put(env, cobj);
5497
5498                 if (likely(tgt != NULL))
5499                         lfsck_tgt_put(tgt);
5500
5501                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5502                         GOTO(out, rc);
5503         }
5504
5505         GOTO(out, rc = 0);
5506
5507 out:
5508         if (lso != NULL)
5509                 lfsck_assistant_object_put(env, lso);
5510
5511         return rc;
5512 }
5513
5514 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5515  * the OST-object's attribute and generate an structure lfsck_layout_req on the
5516  * list ::lad_req_list.
5517  *
5518  * For each request on above list, the lfsck_layout_assistant thread compares
5519  * the OST side attribute with local attribute, if inconsistent, then repair it.
5520  *
5521  * All above processing is async mode with pipeline. */
5522 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5523                                         struct lfsck_component *com,
5524                                         struct dt_object *obj)
5525 {
5526         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5527         struct ost_id                   *oi     = &info->lti_oi;
5528         struct lfsck_layout             *lo     = com->lc_file_ram;
5529         struct lfsck_assistant_data     *lad    = com->lc_data;
5530         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5531         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
5532         struct thandle                  *handle = NULL;
5533         struct lu_buf                   *buf    = &info->lti_big_buf;
5534         struct lov_mds_md_v1            *lmm    = NULL;
5535         struct dt_device                *dev    = lfsck_obj2dev(obj);
5536         struct lustre_handle             lh     = { 0 };
5537         struct lu_buf                    ea_buf = { NULL };
5538         struct lov_comp_md_v1           *lcm    = NULL;
5539         struct lov_comp_md_entry_v1     *lcme   = NULL;
5540         int                              rc     = 0;
5541         int                              size   = 0;
5542         __u32                            magic  = 0;
5543         __u16                            count  = 0;
5544         bool                             locked = false;
5545         bool                             stripe = false;
5546         bool                             bad_oi = false;
5547         ENTRY;
5548
5549         if (!S_ISREG(lfsck_object_type(obj)))
5550                 GOTO(out, rc = 0);
5551
5552         if (lad->lad_assistant_status < 0)
5553                 GOTO(out, rc = -ESRCH);
5554
5555         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5556         lmm_oi_cpu_to_le(oi, oi);
5557         dt_read_lock(env, obj, 0);
5558         locked = true;
5559
5560 again:
5561         bad_oi = false;
5562         if (dt_object_exists(obj) == 0 ||
5563             lfsck_is_dead_obj(obj))
5564                 GOTO(out, rc = 0);
5565
5566         rc = lfsck_layout_get_lovea(env, obj, buf);
5567         if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP)
5568                 /* Skip bad lov EA during the 1st cycle scanning, and
5569                  * try to recover it via orphan in the 2nd scanning. */
5570                 rc = 0;
5571         if (rc <= 0)
5572                 GOTO(out, rc);
5573
5574         size = rc;
5575         lmm = buf->lb_buf;
5576         magic = le32_to_cpu(lmm->lmm_magic);
5577         if (magic == LOV_MAGIC_COMP_V1) {
5578                 int i;
5579
5580                 lcm = buf->lb_buf;
5581                 count = le16_to_cpu(lcm->lcm_entry_count);
5582                 for (i = 0; i < count; i++) {
5583                         lcme = &lcm->lcm_entries[i];
5584                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5585                         if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) != 0)
5586                                 goto fix;
5587                 }
5588
5589                 GOTO(out, stripe = true);
5590         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5591                 GOTO(out, stripe = true);
5592         }
5593
5594 fix:
5595         /* Inconsistent lmm_oi, should be repaired. */
5596         bad_oi = true;
5597
5598         if (bk->lb_param & LPF_DRYRUN) {
5599                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5600
5601                 GOTO(out, stripe = true);
5602         }
5603
5604         if (!lustre_handle_is_used(&lh)) {
5605                 dt_read_unlock(env, obj);
5606                 locked = false;
5607                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5608                                       MDS_INODELOCK_LAYOUT |
5609                                       MDS_INODELOCK_XATTR, LCK_EX);
5610                 if (rc != 0)
5611                         GOTO(out, rc);
5612
5613                 handle = dt_trans_create(env, dev);
5614                 if (IS_ERR(handle))
5615                         GOTO(out, rc = PTR_ERR(handle));
5616
5617                 lfsck_buf_init(&ea_buf, lmm, size);
5618                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5619                                           LU_XATTR_REPLACE, handle);
5620                 if (rc != 0)
5621                         GOTO(out, rc);
5622
5623                 rc = dt_trans_start_local(env, dev, handle);
5624                 if (rc != 0)
5625                         GOTO(out, rc);
5626
5627                 dt_write_lock(env, obj, 0);
5628                 locked = true;
5629
5630                 goto again;
5631         }
5632
5633         if (magic == LOV_MAGIC_COMP_V1) {
5634                 int i;
5635
5636                 for (i = 0; i < count; i++) {
5637                         lcme = &lcm->lcm_entries[i];
5638                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5639                         lmm->lmm_oi = *oi;
5640                 }
5641         } else {
5642                 lmm->lmm_oi = *oi;
5643         }
5644
5645         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5646                           LU_XATTR_REPLACE, handle);
5647         if (rc != 0)
5648                 GOTO(out, rc);
5649
5650         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5651
5652         GOTO(out, stripe = true);
5653
5654 out:
5655         if (locked) {
5656                 if (lustre_handle_is_used(&lh))
5657                         dt_write_unlock(env, obj);
5658                 else
5659                         dt_read_unlock(env, obj);
5660         }
5661
5662         if (handle != NULL && !IS_ERR(handle))
5663                 dt_trans_stop(env, dev, handle);
5664
5665         lfsck_ibits_unlock(&lh, LCK_EX);
5666
5667         if (bad_oi)
5668                 CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for "
5669                        DFID": rc = %d\n", lfsck_lfsck2name(lfsck),
5670                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5671                        PFID(lfsck_dto2fid(obj)), rc);
5672
5673         if (stripe) {
5674                 if (magic == LOV_MAGIC_COMP_V1) {
5675                         int i;
5676
5677                         for (i = 0; i < count; i++) {
5678                                 lcme = &lcm->lcm_entries[i];
5679                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5680                                       LCME_FL_INIT))
5681                                         continue;
5682
5683                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5684                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5685                                         le32_to_cpu(lcme->lcme_offset)),
5686                                         le32_to_cpu(lcme->lcme_id));
5687                         }
5688                 } else {
5689                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5690                 }
5691         } else {
5692                 down_write(&com->lc_sem);
5693                 com->lc_new_checked++;
5694                 if (rc < 0)
5695                         lfsck_layout_record_failure(env, lfsck, lo);
5696                 up_write(&com->lc_sem);
5697         }
5698
5699         return rc;
5700 }
5701
5702 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5703                                        struct lfsck_component *com,
5704                                        struct dt_object *obj)
5705 {
5706         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5707         struct lfsck_layout             *lo     = com->lc_file_ram;
5708         const struct lu_fid             *fid    = lfsck_dto2fid(obj);
5709         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5710         struct lfsck_layout_seq         *lls;
5711         __u64                            seq;
5712         __u64                            oid;
5713         int                              rc;
5714         ENTRY;
5715
5716         LASSERT(llsd != NULL);
5717
5718         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5719             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5720                 struct l_wait_info       lwi = LWI_TIMEOUT(cfs_time_seconds(1),
5721                                                            NULL, NULL);
5722                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5723
5724                 l_wait_event(thread->t_ctl_waitq,
5725                              !thread_is_running(thread),
5726                              &lwi);
5727         }
5728
5729         lfsck_rbtree_update_bitmap(env, com, fid, false);
5730
5731         down_write(&com->lc_sem);
5732         if (fid_is_idif(fid))
5733                 seq = 0;
5734         else if (!fid_is_norm(fid) ||
5735                  !fid_is_for_ostobj(env, lfsck, obj, fid))
5736                 GOTO(unlock, rc = 0);
5737         else
5738                 seq = fid_seq(fid);
5739         com->lc_new_checked++;
5740
5741         lls = lfsck_layout_seq_lookup(llsd, seq);
5742         if (lls == NULL) {
5743                 OBD_ALLOC_PTR(lls);
5744                 if (unlikely(lls == NULL))
5745                         GOTO(unlock, rc = -ENOMEM);
5746
5747                 INIT_LIST_HEAD(&lls->lls_list);
5748                 lls->lls_seq = seq;
5749                 rc = lfsck_layout_lastid_load(env, com, lls);
5750                 if (rc != 0) {
5751                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5752                               "load LAST_ID for %#llx: rc = %d\n",
5753                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
5754                         lo->ll_objs_failed_phase1++;
5755                         OBD_FREE_PTR(lls);
5756                         GOTO(unlock, rc);
5757                 }
5758
5759                 lfsck_layout_seq_insert(llsd, lls);
5760         }
5761
5762         if (unlikely(fid_is_last_id(fid)))
5763                 GOTO(unlock, rc = 0);
5764
5765         if (fid_is_idif(fid))
5766                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
5767         else
5768                 oid = fid_oid(fid);
5769
5770         if (oid > lls->lls_lastid_known)
5771                 lls->lls_lastid_known = oid;
5772
5773         if (oid > lls->lls_lastid) {
5774                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
5775                         /* OFD may create new objects during LFSCK scanning. */
5776                         rc = lfsck_layout_lastid_reload(env, com, lls);
5777                         if (unlikely(rc != 0)) {
5778                                 CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5779                                       "reload LAST_ID for %#llx: rc = %d\n",
5780                                       lfsck_lfsck2name(com->lc_lfsck),
5781                                       lls->lls_seq, rc);
5782
5783                                 GOTO(unlock, rc);
5784                         }
5785
5786                         if (oid <= lls->lls_lastid ||
5787                             lo->ll_flags & LF_CRASHED_LASTID)
5788                                 GOTO(unlock, rc = 0);
5789
5790                         LASSERT(lfsck->li_out_notify != NULL);
5791
5792                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5793                                              LE_LASTID_REBUILDING);
5794                         lo->ll_flags |= LF_CRASHED_LASTID;
5795
5796                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
5797                                "LAST_ID file (2) for the sequence %#llx"
5798                                ", old value %llu, known value %llu\n",
5799                                lfsck_lfsck2name(lfsck), lls->lls_seq,
5800                                lls->lls_lastid, oid);
5801                 }
5802
5803                 lls->lls_lastid = oid;
5804                 lls->lls_dirty = 1;
5805         }
5806
5807         GOTO(unlock, rc = 0);
5808
5809 unlock:
5810         up_write(&com->lc_sem);
5811
5812         return rc;
5813 }
5814
5815 static int lfsck_layout_exec_dir(const struct lu_env *env,
5816                                  struct lfsck_component *com,
5817                                  struct lfsck_assistant_object *lso,
5818                                  struct lu_dirent *ent, __u16 type)
5819 {
5820         return 0;
5821 }
5822
5823 static int lfsck_layout_master_post(const struct lu_env *env,
5824                                     struct lfsck_component *com,
5825                                     int result, bool init)
5826 {
5827         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5828         struct lfsck_layout     *lo     = com->lc_file_ram;
5829         int                      rc;
5830         ENTRY;
5831
5832         lfsck_post_generic(env, com, &result);
5833
5834         down_write(&com->lc_sem);
5835         spin_lock(&lfsck->li_lock);
5836         if (!init)
5837                 lo->ll_pos_last_checkpoint =
5838                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5839
5840         if (result > 0) {
5841                 if (lo->ll_flags & LF_INCOMPLETE)
5842                         lo->ll_status = LS_PARTIAL;
5843                 else
5844                         lo->ll_status = LS_SCANNING_PHASE2;
5845                 lo->ll_flags |= LF_SCANNED_ONCE;
5846                 lo->ll_flags &= ~LF_UPGRADE;
5847                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
5848         } else if (result == 0) {
5849                 if (lfsck->li_status != 0)
5850                         lo->ll_status = lfsck->li_status;
5851                 else
5852                         lo->ll_status = LS_STOPPED;
5853                 if (lo->ll_status != LS_PAUSED)
5854                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5855         } else {
5856                 lo->ll_status = LS_FAILED;
5857                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5858         }
5859         spin_unlock(&lfsck->li_lock);
5860
5861         if (!init) {
5862                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5863                                           lfsck->li_time_last_checkpoint;
5864                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5865                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5866                 com->lc_new_checked = 0;
5867         }
5868
5869         rc = lfsck_layout_store(env, com);
5870         up_write(&com->lc_sem);
5871
5872         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
5873                lfsck_lfsck2name(lfsck), rc);
5874
5875         RETURN(rc);
5876 }
5877
5878 static int lfsck_layout_slave_post(const struct lu_env *env,
5879                                    struct lfsck_component *com,
5880                                    int result, bool init)
5881 {
5882         struct lfsck_instance   *lfsck = com->lc_lfsck;
5883         struct lfsck_layout     *lo    = com->lc_file_ram;
5884         int                      rc;
5885         bool                     done  = false;
5886
5887         down_write(&com->lc_sem);
5888         rc = lfsck_layout_lastid_store(env, com);
5889         if (rc != 0)
5890                 result = rc;
5891
5892         LASSERT(lfsck->li_out_notify != NULL);
5893
5894         spin_lock(&lfsck->li_lock);
5895         if (!init)
5896                 lo->ll_pos_last_checkpoint =
5897                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5898
5899         if (result > 0) {
5900                 lo->ll_status = LS_SCANNING_PHASE2;
5901                 lo->ll_flags |= LF_SCANNED_ONCE;
5902                 if (lo->ll_flags & LF_CRASHED_LASTID) {
5903                         done = true;
5904                         lo->ll_flags &= ~LF_CRASHED_LASTID;
5905
5906                         CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt "
5907                                "crashed LAST_ID files successfully\n",
5908                                lfsck_lfsck2name(lfsck));
5909                 }
5910                 lo->ll_flags &= ~LF_UPGRADE;
5911                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
5912         } else if (result == 0) {
5913                 if (lfsck->li_status != 0)
5914                         lo->ll_status = lfsck->li_status;
5915                 else
5916                         lo->ll_status = LS_STOPPED;
5917                 if (lo->ll_status != LS_PAUSED)
5918                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5919         } else {
5920                 lo->ll_status = LS_FAILED;
5921                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5922         }
5923         spin_unlock(&lfsck->li_lock);
5924
5925         if (done)
5926                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5927                                      LE_LASTID_REBUILT);
5928
5929         if (!init) {
5930                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5931                                           lfsck->li_time_last_checkpoint;
5932                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5933                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5934                 com->lc_new_checked = 0;
5935         }
5936
5937         rc = lfsck_layout_store(env, com);
5938         up_write(&com->lc_sem);
5939
5940         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
5941
5942         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
5943                lfsck_lfsck2name(lfsck), rc);
5944
5945         return rc;
5946 }
5947
5948 static void lfsck_layout_dump(const struct lu_env *env,
5949                               struct lfsck_component *com, struct seq_file *m)
5950 {
5951         struct lfsck_instance   *lfsck = com->lc_lfsck;
5952         struct lfsck_bookmark   *bk    = &lfsck->li_bookmark_ram;
5953         struct lfsck_layout     *lo    = com->lc_file_ram;
5954         const char *prefix;
5955
5956         down_read(&com->lc_sem);
5957         if (bk->lb_param & LPF_DRYRUN)
5958                 prefix = "inconsistent";
5959         else
5960                 prefix = "repaired";
5961
5962         seq_printf(m, "name: lfsck_layout\n"
5963                    "magic: %#x\n"
5964                    "version: %d\n"
5965                    "status: %s\n",
5966                    lo->ll_magic,
5967                    bk->lb_version,
5968                    lfsck_status2name(lo->ll_status));
5969
5970         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
5971
5972         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
5973
5974         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
5975
5976         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
5977
5978         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
5979
5980         seq_printf(m, "latest_start_position: %llu\n"
5981                    "last_checkpoint_position: %llu\n"
5982                    "first_failure_position: %llu\n",
5983                    lo->ll_pos_latest_start,
5984                    lo->ll_pos_last_checkpoint,
5985                    lo->ll_pos_first_inconsistent);
5986
5987         seq_printf(m, "success_count: %u\n"
5988                    "%s_dangling: %llu\n"
5989                    "%s_unmatched_pair: %llu\n"
5990                    "%s_multiple_referenced: %llu\n"
5991                    "%s_orphan: %llu\n"
5992                    "%s_inconsistent_owner: %llu\n"
5993                    "%s_others: %llu\n"
5994                    "skipped: %llu\n"
5995                    "failed_phase1: %llu\n"
5996                    "failed_phase2: %llu\n",
5997                    lo->ll_success_count,
5998                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
5999                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
6000                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
6001                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
6002                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
6003                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
6004                    lo->ll_objs_skipped,
6005                    lo->ll_objs_failed_phase1,
6006                    lo->ll_objs_failed_phase2);
6007
6008         if (lo->ll_status == LS_SCANNING_PHASE1) {
6009                 time64_t duration = ktime_get_seconds() -
6010                                     lfsck->li_time_last_checkpoint;
6011                 u64 checked = lo->ll_objs_checked_phase1 +
6012                               com->lc_new_checked;
6013                 u64 speed = checked;
6014                 u64 new_checked = com->lc_new_checked;
6015                 time64_t rtime = lo->ll_run_time_phase1 + duration;
6016                 u64 pos;
6017
6018                 if (duration != 0)
6019                         new_checked = div64_s64(new_checked, duration);
6020                 if (rtime != 0)
6021                         speed = div64_s64(speed, rtime);
6022                 seq_printf(m, "checked_phase1: %llu\n"
6023                            "checked_phase2: %llu\n"
6024                            "run_time_phase1: %lld seconds\n"
6025                            "run_time_phase2: %lld seconds\n"
6026                            "average_speed_phase1: %llu items/sec\n"
6027                            "average_speed_phase2: N/A\n"
6028                            "real-time_speed_phase1: %llu items/sec\n"
6029                            "real-time_speed_phase2: N/A\n",
6030                            checked,
6031                            lo->ll_objs_checked_phase2,
6032                            rtime,
6033                            lo->ll_run_time_phase2,
6034                            speed,
6035                            new_checked);
6036
6037                 if (likely(lfsck->li_di_oit)) {
6038                         const struct dt_it_ops *iops =
6039                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
6040
6041                         /* The low layer otable-based iteration position may NOT
6042                          * exactly match the layout-based directory traversal
6043                          * cookie. Generally, it is not a serious issue. But the
6044                          * caller should NOT make assumption on that. */
6045                         pos = iops->store(env, lfsck->li_di_oit);
6046                         if (!lfsck->li_current_oit_processed)
6047                                 pos--;
6048                 } else {
6049                         pos = lo->ll_pos_last_checkpoint;
6050                 }
6051
6052                 seq_printf(m, "current_position: %llu\n", pos);
6053         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
6054                 time64_t duration = ktime_get_seconds() -
6055                                     com->lc_time_last_checkpoint;
6056                 u64 checked = lo->ll_objs_checked_phase2 +
6057                               com->lc_new_checked;
6058                 u64 speed1 = lo->ll_objs_checked_phase1;
6059                 u64 speed2 = checked;
6060                 u64 new_checked = com->lc_new_checked;
6061                 time64_t rtime = lo->ll_run_time_phase2 + duration;
6062
6063                 if (duration != 0)
6064                         new_checked = div64_s64(new_checked, duration);
6065                 if (lo->ll_run_time_phase1 != 0)
6066                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6067                 if (rtime != 0)
6068                         speed2 = div64_s64(speed2, rtime);
6069                 seq_printf(m, "checked_phase1: %llu\n"
6070                            "checked_phase2: %llu\n"
6071                            "run_time_phase1: %lld seconds\n"
6072                            "run_time_phase2: %lld seconds\n"
6073                            "average_speed_phase1: %llu items/sec\n"
6074                            "average_speed_phase2: %llu items/sec\n"
6075                            "real-time_speed_phase1: N/A\n"
6076                            "real-time_speed_phase2: %llu items/sec\n"
6077                            "current_position: "DFID"\n",
6078                            lo->ll_objs_checked_phase1,
6079                            checked,
6080                            lo->ll_run_time_phase1,
6081                            rtime,
6082                            speed1,
6083                            speed2,
6084                            new_checked,
6085                            PFID(&com->lc_fid_latest_scanned_phase2));
6086         } else {
6087                 __u64 speed1 = lo->ll_objs_checked_phase1;
6088                 __u64 speed2 = lo->ll_objs_checked_phase2;
6089
6090                 if (lo->ll_run_time_phase1 != 0)
6091                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6092                 if (lo->ll_run_time_phase2 != 0)
6093                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
6094                 seq_printf(m, "checked_phase1: %llu\n"
6095                            "checked_phase2: %llu\n"
6096                            "run_time_phase1: %lld seconds\n"
6097                            "run_time_phase2: %lld seconds\n"
6098                            "average_speed_phase1: %llu items/sec\n"
6099                            "average_speed_phase2: %llu objs/sec\n"
6100                            "real-time_speed_phase1: N/A\n"
6101                            "real-time_speed_phase2: N/A\n"
6102                            "current_position: N/A\n",
6103                            lo->ll_objs_checked_phase1,
6104                            lo->ll_objs_checked_phase2,
6105                            lo->ll_run_time_phase1,
6106                            lo->ll_run_time_phase2,
6107                            speed1,
6108                            speed2);
6109         }
6110
6111         up_read(&com->lc_sem);
6112 }
6113
6114 static int lfsck_layout_master_double_scan(const struct lu_env *env,
6115                                            struct lfsck_component *com)
6116 {
6117         struct lfsck_layout             *lo     = com->lc_file_ram;
6118         struct lfsck_assistant_data     *lad    = com->lc_data;
6119         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6120         struct lfsck_tgt_descs          *ltds;
6121         struct lfsck_tgt_desc           *ltd;
6122         struct lfsck_tgt_desc           *next;
6123         int                              rc;
6124
6125         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6126
6127         if (thread_is_stopped(&lad->lad_thread)) {
6128                 LASSERT(list_empty(&lad->lad_req_list));
6129                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6130                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6131
6132                 ltds = &lfsck->li_ost_descs;
6133                 spin_lock(&ltds->ltd_lock);
6134                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6135                                          ltd_layout_phase_list) {
6136                         list_del_init(&ltd->ltd_layout_phase_list);
6137                 }
6138                 spin_unlock(&ltds->ltd_lock);
6139
6140                 ltds = &lfsck->li_mdt_descs;
6141                 spin_lock(&ltds->ltd_lock);
6142                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6143                                          ltd_layout_phase_list) {
6144                         list_del_init(&ltd->ltd_layout_phase_list);
6145                 }
6146                 spin_unlock(&ltds->ltd_lock);
6147         }
6148
6149         return rc;
6150 }
6151
6152 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6153                                           struct lfsck_component *com)
6154 {
6155         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6156         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
6157         struct lfsck_layout             *lo     = com->lc_file_ram;
6158         struct ptlrpc_thread            *thread = &lfsck->li_thread;
6159         int                              rc;
6160         ENTRY;
6161
6162         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6163                lfsck_lfsck2name(lfsck));
6164
6165         atomic_inc(&lfsck->li_double_scan_count);
6166
6167         if (lo->ll_flags & LF_INCOMPLETE)
6168                 GOTO(done, rc = 1);
6169
6170         com->lc_new_checked = 0;
6171         com->lc_new_scanned = 0;
6172         com->lc_time_last_checkpoint = ktime_get_seconds();
6173         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6174                                        LFSCK_CHECKPOINT_INTERVAL;
6175
6176         while (1) {
6177                 struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30),
6178                                                      NULL, NULL);
6179
6180                 rc = lfsck_layout_slave_query_master(env, com);
6181                 if (list_empty(&llsd->llsd_master_list)) {
6182                         if (unlikely(!thread_is_running(thread)))
6183                                 rc = 0;
6184                         else
6185                                 rc = 1;
6186
6187                         GOTO(done, rc);
6188                 }
6189
6190                 if (rc < 0)
6191                         GOTO(done, rc);
6192
6193                 rc = l_wait_event(thread->t_ctl_waitq,
6194                                   !thread_is_running(thread) ||
6195                                   lo->ll_flags & LF_INCOMPLETE ||
6196                                   list_empty(&llsd->llsd_master_list),
6197                                   &lwi);
6198                 if (unlikely(!thread_is_running(thread)))
6199                         GOTO(done, rc = 0);
6200
6201                 if (lo->ll_flags & LF_INCOMPLETE)
6202                         GOTO(done, rc = 1);
6203
6204                 if (rc == -ETIMEDOUT)
6205                         continue;
6206
6207                 GOTO(done, rc = (rc < 0 ? rc : 1));
6208         }
6209
6210 done:
6211         rc = lfsck_layout_double_scan_result(env, com, rc);
6212         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6213                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6214         lfsck_layout_slave_quit(env, com);
6215         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6216                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6217
6218         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, "
6219                "status %d: rc = %d\n",
6220                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6221
6222         return rc;
6223 }
6224
6225 static void lfsck_layout_master_data_release(const struct lu_env *env,
6226                                              struct lfsck_component *com)
6227 {
6228         struct lfsck_assistant_data     *lad    = com->lc_data;
6229         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6230         struct lfsck_tgt_descs          *ltds;
6231         struct lfsck_tgt_desc           *ltd;
6232         struct lfsck_tgt_desc           *next;
6233
6234         LASSERT(lad != NULL);
6235         LASSERT(thread_is_init(&lad->lad_thread) ||
6236                 thread_is_stopped(&lad->lad_thread));
6237         LASSERT(list_empty(&lad->lad_req_list));
6238
6239         com->lc_data = NULL;
6240
6241         ltds = &lfsck->li_ost_descs;
6242         spin_lock(&ltds->ltd_lock);
6243         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6244                                  ltd_layout_phase_list) {
6245                 list_del_init(&ltd->ltd_layout_phase_list);
6246         }
6247         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6248                                  ltd_layout_phase_list) {
6249                 list_del_init(&ltd->ltd_layout_phase_list);
6250         }
6251         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6252                                  ltd_layout_list) {
6253                 list_del_init(&ltd->ltd_layout_list);
6254         }
6255         spin_unlock(&ltds->ltd_lock);
6256
6257         ltds = &lfsck->li_mdt_descs;
6258         spin_lock(&ltds->ltd_lock);
6259         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6260                                  ltd_layout_phase_list) {
6261                 list_del_init(&ltd->ltd_layout_phase_list);
6262         }
6263         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6264                                  ltd_layout_phase_list) {
6265                 list_del_init(&ltd->ltd_layout_phase_list);
6266         }
6267         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6268                                  ltd_layout_list) {
6269                 list_del_init(&ltd->ltd_layout_list);
6270         }
6271         spin_unlock(&ltds->ltd_lock);
6272
6273         if (likely(lad->lad_bitmap != NULL))
6274                 CFS_FREE_BITMAP(lad->lad_bitmap);
6275
6276         OBD_FREE_PTR(lad);
6277 }
6278
6279 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6280                                             struct lfsck_component *com)
6281 {
6282         struct lfsck_layout_slave_data *llsd = com->lc_data;
6283
6284         lfsck_layout_slave_quit(env, com);
6285         com->lc_data = NULL;
6286         OBD_FREE_PTR(llsd);
6287 }
6288
6289 static void lfsck_layout_master_quit(const struct lu_env *env,
6290                                      struct lfsck_component *com)
6291 {
6292         struct lfsck_assistant_data     *lad    = com->lc_data;
6293         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6294         struct lfsck_tgt_descs          *ltds;
6295         struct lfsck_tgt_desc           *ltd;
6296         struct lfsck_tgt_desc           *next;
6297
6298         LASSERT(lad != NULL);
6299
6300         lfsck_quit_generic(env, com);
6301
6302         LASSERT(thread_is_init(&lad->lad_thread) ||
6303                 thread_is_stopped(&lad->lad_thread));
6304         LASSERT(list_empty(&lad->lad_req_list));
6305
6306         ltds = &lfsck->li_ost_descs;
6307         spin_lock(&ltds->ltd_lock);
6308         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6309                                  ltd_layout_phase_list) {
6310                 list_del_init(&ltd->ltd_layout_phase_list);
6311         }
6312         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6313                                  ltd_layout_phase_list) {
6314                 list_del_init(&ltd->ltd_layout_phase_list);
6315         }
6316         spin_unlock(&ltds->ltd_lock);
6317
6318         ltds = &lfsck->li_mdt_descs;
6319         spin_lock(&ltds->ltd_lock);
6320         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6321                                  ltd_layout_phase_list) {
6322                 list_del_init(&ltd->ltd_layout_phase_list);
6323         }
6324         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6325                                  ltd_layout_phase_list) {
6326                 list_del_init(&ltd->ltd_layout_phase_list);
6327         }
6328         spin_unlock(&ltds->ltd_lock);
6329 }
6330
6331 static void lfsck_layout_slave_quit(const struct lu_env *env,
6332                                     struct lfsck_component *com)
6333 {
6334         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6335         struct lfsck_layout_seq          *lls;
6336         struct lfsck_layout_seq          *next;
6337         struct lfsck_layout_slave_target *llst;
6338
6339         LASSERT(llsd != NULL);
6340
6341         down_write(&com->lc_sem);
6342         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6343                                  lls_list) {
6344                 list_del_init(&lls->lls_list);
6345                 lfsck_object_put(env, lls->lls_lastid_obj);
6346                 OBD_FREE_PTR(lls);
6347         }
6348         up_write(&com->lc_sem);
6349
6350         spin_lock(&llsd->llsd_lock);
6351         while (!list_empty(&llsd->llsd_master_list)) {
6352                 llst = list_entry(llsd->llsd_master_list.next,
6353                                   struct lfsck_layout_slave_target, llst_list);
6354                 list_del_init(&llst->llst_list);
6355                 spin_unlock(&llsd->llsd_lock);
6356                 lfsck_layout_llst_put(llst);
6357                 spin_lock(&llsd->llsd_lock);
6358         }
6359         spin_unlock(&llsd->llsd_lock);
6360
6361         lfsck_rbtree_cleanup(env, com);
6362 }
6363
6364 static int lfsck_layout_master_in_notify(const struct lu_env *env,
6365                                          struct lfsck_component *com,
6366                                          struct lfsck_request *lr)
6367 {
6368         struct lfsck_instance           *lfsck = com->lc_lfsck;
6369         struct lfsck_layout             *lo    = com->lc_file_ram;
6370         struct lfsck_assistant_data     *lad   = com->lc_data;
6371         struct lfsck_tgt_descs          *ltds;
6372         struct lfsck_tgt_desc           *ltd;
6373         bool                             fail  = false;
6374         ENTRY;
6375
6376         if (lr->lr_event == LE_PAIRS_VERIFY) {
6377                 int rc;
6378
6379                 rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
6380                                                      &lr->lr_fid2,
6381                                                      lr->lr_comp_id);
6382
6383                 RETURN(rc);
6384         }
6385
6386         CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
6387                "from %s %x, status %d, flags %x, flags2 %x\n",
6388                lfsck_lfsck2name(lfsck), lr->lr_event,
6389                (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
6390                lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
6391
6392         if (lr->lr_event != LE_PHASE1_DONE &&
6393             lr->lr_event != LE_PHASE2_DONE &&
6394             lr->lr_event != LE_PEER_EXIT)
6395                 RETURN(-EINVAL);
6396
6397         if (lr->lr_flags & LEF_FROM_OST)
6398                 ltds = &lfsck->li_ost_descs;
6399         else
6400                 ltds = &lfsck->li_mdt_descs;
6401         spin_lock(&ltds->ltd_lock);
6402         ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
6403         if (ltd == NULL) {
6404                 spin_unlock(&ltds->ltd_lock);
6405
6406                 RETURN(-ENXIO);
6407         }
6408
6409         list_del_init(&ltd->ltd_layout_phase_list);
6410         switch (lr->lr_event) {
6411         case LE_PHASE1_DONE:
6412                 if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
6413                         if (lr->lr_flags2 & LF_INCOMPLETE) {
6414                                 if (lr->lr_flags & LEF_FROM_OST)
6415                                         lfsck_lad_set_bitmap(env, com,
6416                                                              ltd->ltd_index);
6417                                 else
6418                                         lo->ll_flags |= LF_INCOMPLETE;
6419                         }
6420                         ltd->ltd_layout_done = 1;
6421                         list_del_init(&ltd->ltd_layout_list);
6422                         fail = true;
6423                         break;
6424                 }
6425
6426                 if (lr->lr_flags & LEF_FROM_OST) {
6427                         if (list_empty(&ltd->ltd_layout_list))
6428                                 list_add_tail(&ltd->ltd_layout_list,
6429                                               &lad->lad_ost_list);
6430                         list_add_tail(&ltd->ltd_layout_phase_list,
6431                                       &lad->lad_ost_phase2_list);
6432                 } else {
6433                         if (list_empty(&ltd->ltd_layout_list))
6434                                 list_add_tail(&ltd->ltd_layout_list,
6435                                               &lad->lad_mdt_list);
6436                         list_add_tail(&ltd->ltd_layout_phase_list,
6437                                       &lad->lad_mdt_phase2_list);
6438                 }
6439                 break;
6440         case LE_PHASE2_DONE:
6441                 ltd->ltd_layout_done = 1;
6442                 if (!list_empty(&ltd->ltd_layout_list))
6443                         list_del_init(&ltd->ltd_layout_list);
6444
6445                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6446                         lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
6447                         fail = true;
6448                 }
6449
6450                 break;
6451         case LE_PEER_EXIT:
6452                 fail = true;
6453                 ltd->ltd_layout_done = 1;
6454                 list_del_init(&ltd->ltd_layout_list);
6455                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
6456                     !(lr->lr_flags & LEF_FROM_OST))
6457                                 lo->ll_flags |= LF_INCOMPLETE;
6458                 break;
6459         default:
6460                 break;
6461         }
6462         spin_unlock(&ltds->ltd_lock);
6463
6464         if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
6465                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6466
6467                 memset(stop, 0, sizeof(*stop));
6468                 stop->ls_status = lr->lr_status;
6469                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6470                 lfsck_stop(env, lfsck->li_bottom, stop);
6471         } else if (lfsck_phase2_next_ready(lad)) {
6472                 wake_up_all(&lad->lad_thread.t_ctl_waitq);
6473         }
6474
6475         RETURN(0);
6476 }
6477
6478 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6479                                               struct lfsck_component *com,
6480                                               struct lfsck_req_local *lrl,
6481                                               struct thandle *th)
6482 {
6483         ENTRY;
6484
6485         switch (lrl->lrl_event) {
6486         case LEL_FID_ACCESSED:
6487                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6488                 RETURN(0);
6489         case LEL_PAIRS_VERIFY_LOCAL: {
6490                 int rc;
6491
6492                 lrl->lrl_status = LPVS_INIT;
6493                 /* Firstly, if the MDT-object which is claimed via OST-object
6494                  * local stored PFID xattr recognizes the OST-object, then it
6495                  * must be that the client given PFID is wrong. */
6496                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6497                                 &lrl->lrl_ff_local.ff_parent,
6498                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6499                 if (rc <= 0)
6500                         RETURN(0);
6501
6502                 lrl->lrl_status = LPVS_INCONSISTENT;
6503                 /* The OST-object local stored PFID xattr is stale. We need to
6504                  * check whether the MDT-object that is claimed via the client
6505                  * given PFID information recognizes the OST-object or not. If
6506                  * matches, then need to update the OST-object's PFID xattr. */
6507                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6508                                 &lrl->lrl_ff_client.ff_parent,
6509                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6510                 /* For rc < 0 case:
6511                  * We are not sure whether the client given PFID information
6512                  * is correct or not, do nothing to avoid improper fixing.
6513                  *
6514                  * For rc > 0 case:
6515                  * The client given PFID information is also invalid, we can
6516                  * NOT fix the OST-object inconsistency.
6517                  */
6518                 if (!rc) {
6519                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6520                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6521                 }
6522
6523                 RETURN(rc);
6524         }
6525         default:
6526                 break;
6527         }
6528
6529         RETURN(-EOPNOTSUPP);
6530 }
6531
6532 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6533                                         struct lfsck_component *com,
6534                                         struct lfsck_request *lr)
6535 {
6536         struct lfsck_instance *lfsck = com->lc_lfsck;
6537         struct lfsck_layout_slave_data *llsd = com->lc_data;
6538         struct lfsck_layout_slave_target *llst;
6539         int rc;
6540         ENTRY;
6541
6542         switch (lr->lr_event) {
6543         case LE_CONDITIONAL_DESTROY:
6544                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6545                 RETURN(rc);
6546         case LE_PHASE1_DONE: {
6547                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6548                         struct lfsck_layout *lo = com->lc_file_ram;
6549
6550                         lo->ll_flags |= LF_INCOMPLETE;
6551                         llst = lfsck_layout_llst_find_and_del(llsd,
6552                                                               lr->lr_index,
6553                                                               true);
6554                         if (llst != NULL) {
6555                                 lfsck_layout_llst_put(llst);
6556                                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6557                         }
6558                 }
6559
6560                 RETURN(0);
6561         }
6562         case LE_PHASE2_DONE:
6563         case LE_PEER_EXIT:
6564                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u "
6565                        "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck),
6566                        lr->lr_event, lr->lr_index, lr->lr_status);
6567                 break;
6568         default:
6569                 RETURN(-EINVAL);
6570         }
6571
6572         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6573         if (llst == NULL)
6574                 RETURN(0);
6575
6576         lfsck_layout_llst_put(llst);
6577         if (list_empty(&llsd->llsd_master_list))
6578                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6579
6580         if (lr->lr_event == LE_PEER_EXIT &&
6581             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6582              (list_empty(&llsd->llsd_master_list) &&
6583               (lr->lr_status == LS_STOPPED ||
6584                lr->lr_status == LS_CO_STOPPED)))) {
6585                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6586
6587                 memset(stop, 0, sizeof(*stop));
6588                 stop->ls_status = lr->lr_status;
6589                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6590                 lfsck_stop(env, lfsck->li_bottom, stop);
6591         }
6592
6593         RETURN(0);
6594 }
6595
6596 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6597 {
6598         int i;
6599
6600         for (i = 0; i < LLIT_MAX; i++)
6601                 *count += lo->ll_objs_repaired[i];
6602 }
6603
6604 static int lfsck_layout_query_all(const struct lu_env *env,
6605                                   struct lfsck_component *com,
6606                                   __u32 *mdts_count, __u32 *osts_count,
6607                                   __u64 *repaired)
6608 {
6609         struct lfsck_layout *lo = com->lc_file_ram;
6610         struct lfsck_tgt_descs *ltds;
6611         struct lfsck_tgt_desc *ltd;
6612         int idx;
6613         int rc;
6614         ENTRY;
6615
6616         rc = lfsck_query_all(env, com);
6617         if (rc != 0)
6618                 RETURN(rc);
6619
6620         ltds = &com->lc_lfsck->li_mdt_descs;
6621         down_read(&ltds->ltd_rw_sem);
6622         cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
6623                 ltd = lfsck_ltd2tgt(ltds, idx);
6624                 LASSERT(ltd != NULL);
6625
6626                 mdts_count[ltd->ltd_layout_status]++;
6627                 *repaired += ltd->ltd_layout_repaired;
6628         }
6629         up_read(&ltds->ltd_rw_sem);
6630
6631         ltds = &com->lc_lfsck->li_ost_descs;
6632         down_read(&ltds->ltd_rw_sem);
6633         cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
6634                 ltd = lfsck_ltd2tgt(ltds, idx);
6635                 LASSERT(ltd != NULL);
6636
6637                 osts_count[ltd->ltd_layout_status]++;
6638                 *repaired += ltd->ltd_layout_repaired;
6639         }
6640         up_read(&ltds->ltd_rw_sem);
6641
6642         down_read(&com->lc_sem);
6643         mdts_count[lo->ll_status]++;
6644         lfsck_layout_repaired(lo, repaired);
6645         up_read(&com->lc_sem);
6646
6647         RETURN(0);
6648 }
6649
6650 static int lfsck_layout_query(const struct lu_env *env,
6651                               struct lfsck_component *com,
6652                               struct lfsck_request *req,
6653                               struct lfsck_reply *rep,
6654                               struct lfsck_query *que, int idx)
6655 {
6656         struct lfsck_layout *lo = com->lc_file_ram;
6657         int rc = 0;
6658
6659         if (que != NULL) {
6660                 LASSERT(com->lc_lfsck->li_master);
6661
6662                 rc = lfsck_layout_query_all(env, com,
6663                                             que->lu_mdts_count[idx],
6664                                             que->lu_osts_count[idx],
6665                                             &que->lu_repaired[idx]);
6666         } else {
6667                 down_read(&com->lc_sem);
6668                 rep->lr_status = lo->ll_status;
6669                 if (req->lr_flags & LEF_QUERY_ALL)
6670                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6671                 up_read(&com->lc_sem);
6672         }
6673
6674         return rc;
6675 }
6676
6677 /* with lfsck::li_lock held */
6678 static int lfsck_layout_slave_join(const struct lu_env *env,
6679                                    struct lfsck_component *com,
6680                                    struct lfsck_start_param *lsp)
6681 {
6682         struct lfsck_instance            *lfsck = com->lc_lfsck;
6683         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6684         struct lfsck_layout_slave_target *llst;
6685         struct lfsck_start               *start = lsp->lsp_start;
6686         int                               rc    = 0;
6687         ENTRY;
6688
6689         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6690                 RETURN(0);
6691
6692         if (!lsp->lsp_index_valid)
6693                 RETURN(-EINVAL);
6694
6695         /* If someone is running the LFSCK without orphan handling,
6696          * it will not maintain the object accessing rbtree. So we
6697          * cannot join it for orphan handling. */
6698         if (!llsd->llsd_rbtree_valid)
6699                 RETURN(-EBUSY);
6700
6701         spin_unlock(&lfsck->li_lock);
6702         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6703         spin_lock(&lfsck->li_lock);
6704         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6705                 spin_unlock(&lfsck->li_lock);
6706                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6707                                                       true);
6708                 if (llst != NULL)
6709                         lfsck_layout_llst_put(llst);
6710                 spin_lock(&lfsck->li_lock);
6711                 rc = -EAGAIN;
6712         }
6713
6714         RETURN(rc);
6715 }
6716
6717 static struct lfsck_operations lfsck_layout_master_ops = {
6718         .lfsck_reset            = lfsck_layout_reset,
6719         .lfsck_fail             = lfsck_layout_fail,
6720         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
6721         .lfsck_prep             = lfsck_layout_master_prep,
6722         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
6723         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6724         .lfsck_post             = lfsck_layout_master_post,
6725         .lfsck_dump             = lfsck_layout_dump,
6726         .lfsck_double_scan      = lfsck_layout_master_double_scan,
6727         .lfsck_data_release     = lfsck_layout_master_data_release,
6728         .lfsck_quit             = lfsck_layout_master_quit,
6729         .lfsck_in_notify        = lfsck_layout_master_in_notify,
6730         .lfsck_query            = lfsck_layout_query,
6731 };
6732
6733 static struct lfsck_operations lfsck_layout_slave_ops = {
6734         .lfsck_reset            = lfsck_layout_reset,
6735         .lfsck_fail             = lfsck_layout_fail,
6736         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
6737         .lfsck_prep             = lfsck_layout_slave_prep,
6738         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
6739         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6740         .lfsck_post             = lfsck_layout_slave_post,
6741         .lfsck_dump             = lfsck_layout_dump,
6742         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
6743         .lfsck_data_release     = lfsck_layout_slave_data_release,
6744         .lfsck_quit             = lfsck_layout_slave_quit,
6745         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
6746         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
6747         .lfsck_query            = lfsck_layout_query,
6748         .lfsck_join             = lfsck_layout_slave_join,
6749 };
6750
6751 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
6752                                             struct lfsck_component *com,
6753                                             struct lfsck_position *pos)
6754 {
6755         struct lfsck_assistant_data     *lad = com->lc_data;
6756         struct lfsck_layout_req         *llr;
6757
6758         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
6759             LS_SCANNING_PHASE1)
6760                 return;
6761
6762         if (list_empty(&lad->lad_req_list))
6763                 return;
6764
6765         llr = list_entry(lad->lad_req_list.next,
6766                          struct lfsck_layout_req,
6767                          llr_lar.lar_list);
6768         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
6769 }
6770
6771 struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
6772         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
6773         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
6774         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
6775         .la_double_scan_result  = lfsck_layout_double_scan_result,
6776         .la_req_fini            = lfsck_layout_assistant_req_fini,
6777         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
6778 };
6779
6780 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
6781 {
6782         struct lfsck_component  *com;
6783         struct lfsck_layout     *lo;
6784         struct dt_object        *root = NULL;
6785         struct dt_object        *obj;
6786         int                      i;
6787         int                      rc;
6788         ENTRY;
6789
6790         OBD_ALLOC_PTR(com);
6791         if (com == NULL)
6792                 RETURN(-ENOMEM);
6793
6794         INIT_LIST_HEAD(&com->lc_link);
6795         INIT_LIST_HEAD(&com->lc_link_dir);
6796         init_rwsem(&com->lc_sem);
6797         atomic_set(&com->lc_ref, 1);
6798         com->lc_lfsck = lfsck;
6799         com->lc_type = LFSCK_TYPE_LAYOUT;
6800         if (lfsck->li_master) {
6801                 com->lc_ops = &lfsck_layout_master_ops;
6802                 com->lc_data = lfsck_assistant_data_init(
6803                                 &lfsck_layout_assistant_ops,
6804                                 LFSCK_LAYOUT);
6805                 if (com->lc_data == NULL)
6806                         GOTO(out, rc = -ENOMEM);
6807
6808                 for (i = 0; i < LFSCK_STF_COUNT; i++)
6809                         mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
6810         } else {
6811                 struct lfsck_layout_slave_data *llsd;
6812
6813                 com->lc_ops = &lfsck_layout_slave_ops;
6814                 OBD_ALLOC_PTR(llsd);
6815                 if (llsd == NULL)
6816                         GOTO(out, rc = -ENOMEM);
6817
6818                 INIT_LIST_HEAD(&llsd->llsd_seq_list);
6819                 INIT_LIST_HEAD(&llsd->llsd_master_list);
6820                 spin_lock_init(&llsd->llsd_lock);
6821                 llsd->llsd_rb_root = RB_ROOT;
6822                 rwlock_init(&llsd->llsd_rb_lock);
6823                 com->lc_data = llsd;
6824         }
6825         com->lc_file_size = sizeof(*lo);
6826         OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
6827         if (com->lc_file_ram == NULL)
6828                 GOTO(out, rc = -ENOMEM);
6829
6830         OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
6831         if (com->lc_file_disk == NULL)
6832                 GOTO(out, rc = -ENOMEM);
6833
6834         root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
6835         if (IS_ERR(root))
6836                 GOTO(out, rc = PTR_ERR(root));
6837
6838         if (unlikely(!dt_try_as_dir(env, root)))
6839                 GOTO(out, rc = -ENOTDIR);
6840
6841         obj = local_file_find_or_create(env, lfsck->li_los, root,
6842                                         LFSCK_LAYOUT,
6843                                         S_IFREG | S_IRUGO | S_IWUSR);
6844         if (IS_ERR(obj))
6845                 GOTO(out, rc = PTR_ERR(obj));
6846
6847         com->lc_obj = obj;
6848         rc = lfsck_layout_load(env, com);
6849         if (rc > 0) {
6850                 rc = lfsck_layout_reset(env, com, true);
6851         } else if (rc == -ENOENT) {
6852                 rc = lfsck_layout_init(env, com);
6853         } else if (lfsck->li_master) {
6854                 rc = lfsck_load_sub_trace_files(env, com,
6855                                 &dt_lfsck_layout_dangling_features,
6856                                 LFSCK_LAYOUT, false);
6857                 if (rc)
6858                         rc = lfsck_layout_reset(env, com, true);
6859         }
6860
6861         if (rc != 0)
6862                 GOTO(out, rc);
6863
6864         lo = com->lc_file_ram;
6865         switch (lo->ll_status) {
6866         case LS_INIT:
6867         case LS_COMPLETED:
6868         case LS_FAILED:
6869         case LS_STOPPED:
6870         case LS_PARTIAL:
6871                 spin_lock(&lfsck->li_lock);
6872                 list_add_tail(&com->lc_link, &lfsck->li_list_idle);
6873                 spin_unlock(&lfsck->li_lock);
6874                 break;
6875         default:
6876                 CERROR("%s: unknown lfsck_layout status %d\n",
6877                        lfsck_lfsck2name(lfsck), lo->ll_status);
6878                 /* fall through */
6879         case LS_SCANNING_PHASE1:
6880         case LS_SCANNING_PHASE2:
6881                 /* No need to store the status to disk right now.
6882                  * If the system crashed before the status stored,
6883                  * it will be loaded back when next time. */
6884                 lo->ll_status = LS_CRASHED;
6885                 if (!lfsck->li_master)
6886                         lo->ll_flags |= LF_INCOMPLETE;
6887                 /* fall through */
6888         case LS_PAUSED:
6889         case LS_CRASHED:
6890         case LS_CO_FAILED:
6891         case LS_CO_STOPPED:
6892         case LS_CO_PAUSED:
6893                 spin_lock(&lfsck->li_lock);
6894                 list_add_tail(&com->lc_link, &lfsck->li_list_scan);
6895                 spin_unlock(&lfsck->li_lock);
6896                 break;
6897         }
6898
6899         if (lo->ll_flags & LF_CRASHED_LASTID) {
6900                 LASSERT(lfsck->li_out_notify != NULL);
6901
6902                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6903                                      LE_LASTID_REBUILDING);
6904         }
6905
6906         GOTO(out, rc = 0);
6907
6908 out:
6909         if (root != NULL && !IS_ERR(root))
6910                 lfsck_object_put(env, root);
6911
6912         if (rc != 0) {
6913                 lfsck_component_cleanup(env, com);
6914                 CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
6915                        lfsck_lfsck2name(lfsck), rc);
6916         }
6917
6918         return rc;
6919 }
6920
6921 struct lfsck_orphan_it {
6922         struct lfsck_component           *loi_com;
6923         struct lfsck_rbtree_node         *loi_lrn;
6924         struct lfsck_layout_slave_target *loi_llst;
6925         struct lu_fid                     loi_key;
6926         struct lu_orphan_rec_v3           loi_rec;
6927         __u64                             loi_hash;
6928         unsigned int                      loi_over:1;
6929 };
6930
6931 static int lfsck_fid_match_idx(const struct lu_env *env,
6932                                struct lfsck_instance *lfsck,
6933                                const struct lu_fid *fid, int idx)
6934 {
6935         struct seq_server_site  *ss;
6936         struct lu_server_fld    *sf;
6937         struct lu_seq_range     *range = &lfsck_env_info(env)->lti_range;
6938         int                      rc;
6939
6940         /* All abnormal cases will be returned to MDT0. */
6941         if (!fid_is_norm(fid)) {
6942                 if (idx == 0)
6943                         return 1;
6944
6945                 return 0;
6946         }
6947
6948         ss = lfsck_dev_site(lfsck);
6949         if (unlikely(ss == NULL))
6950                 return -ENOTCONN;
6951
6952         sf = ss->ss_server_fld;
6953         LASSERT(sf != NULL);
6954
6955         fld_range_set_any(range);
6956         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
6957         if (rc != 0)
6958                 return rc;
6959
6960         if (!fld_range_is_mdt(range))
6961                 return -EINVAL;
6962
6963         if (range->lsr_index == idx)
6964                 return 1;
6965
6966         return 0;
6967 }
6968
6969 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
6970                                         struct dt_object *obj)
6971 {
6972         struct dt_device        *dev    = lfsck_obj2dev(obj);
6973         struct thandle          *handle;
6974         int                      rc;
6975         ENTRY;
6976
6977         handle = dt_trans_create(env, dev);
6978         if (IS_ERR(handle))
6979                 RETURN_EXIT;
6980
6981         rc = dt_declare_ref_del(env, obj, handle);
6982         if (rc != 0)
6983                 GOTO(stop, rc);
6984
6985         rc = dt_declare_destroy(env, obj, handle);
6986         if (rc != 0)
6987                 GOTO(stop, rc);
6988
6989         rc = dt_trans_start_local(env, dev, handle);
6990         if (rc != 0)
6991                 GOTO(stop, rc);
6992
6993         dt_write_lock(env, obj, 0);
6994         rc = dt_ref_del(env, obj, handle);
6995         if (rc == 0)
6996                 rc = dt_destroy(env, obj, handle);
6997         dt_write_unlock(env, obj);
6998
6999         GOTO(stop, rc);
7000
7001 stop:
7002         dt_trans_stop(env, dev, handle);
7003
7004         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
7005                PFID(lfsck_dto2fid(obj)), rc);
7006
7007         RETURN_EXIT;
7008 }
7009
/*
 * Lookup by key is not supported on the orphan index.
 *
 * \retval	-EOPNOTSUPP always
 */
static int lfsck_orphan_index_lookup(const struct lu_env *env,
				     struct dt_object *dt, struct dt_rec *rec,
				     const struct dt_key *key)
{
	return -EOPNOTSUPP;
}
7017
/*
 * Declaring an insertion is not supported on the orphan index.
 *
 * \retval	-EOPNOTSUPP always
 */
static int lfsck_orphan_index_declare_insert(const struct lu_env *env,
					     struct dt_object *dt,
					     const struct dt_rec *rec,
					     const struct dt_key *key,
					     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7026
/*
 * Insertion is not supported on the orphan index.
 *
 * \retval	-EOPNOTSUPP always
 */
static int lfsck_orphan_index_insert(const struct lu_env *env,
				     struct dt_object *dt,
				     const struct dt_rec *rec,
				     const struct dt_key *key,
				     struct thandle *handle,
				     int ignore_quota)
{
	return -EOPNOTSUPP;
}
7036
/*
 * Declaring a deletion is not supported on the orphan index.
 *
 * \retval	-EOPNOTSUPP always
 */
static int lfsck_orphan_index_declare_delete(const struct lu_env *env,
					     struct dt_object *dt,
					     const struct dt_key *key,
					     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7044
/*
 * Deletion is not supported on the orphan index.
 *
 * \retval	-EOPNOTSUPP always
 */
static int lfsck_orphan_index_delete(const struct lu_env *env,
				     struct dt_object *dt,
				     const struct dt_key *key,
				     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7052
7053 static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
7054                                           struct dt_object *dt,
7055                                           __u32 attr)
7056 {
7057         struct dt_device                *dev    = lu2dt_dev(dt->do_lu.lo_dev);
7058         struct lfsck_instance           *lfsck;
7059         struct lfsck_component          *com    = NULL;
7060         struct lfsck_layout_slave_data  *llsd;
7061         struct lfsck_orphan_it          *it     = NULL;
7062         struct lfsck_layout             *lo;
7063         int                              rc     = 0;
7064         ENTRY;
7065
7066         lfsck = lfsck_instance_find(dev, true, false);
7067         if (unlikely(lfsck == NULL))
7068                 RETURN(ERR_PTR(-ENXIO));
7069
7070         com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
7071         if (unlikely(com == NULL))
7072                 GOTO(out, rc = -ENOENT);
7073
7074         lo = com->lc_file_ram;
7075         if (lo->ll_flags & LF_INCOMPLETE)
7076                 GOTO(out, rc = -ESRCH);
7077
7078         llsd = com->lc_data;
7079         if (!llsd->llsd_rbtree_valid)
7080                 GOTO(out, rc = -ESRCH);
7081
7082         OBD_ALLOC_PTR(it);
7083         if (it == NULL)
7084                 GOTO(out, rc = -ENOMEM);
7085
7086         it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
7087         if (it->loi_llst == NULL)
7088                 GOTO(out, rc = -ENXIO);
7089
7090         if (dev->dd_record_fid_accessed) {
7091                 /* The first iteration against the rbtree, scan the whole rbtree
7092                  * to remove the nodes which do NOT need to be handled. */
7093                 write_lock(&llsd->llsd_rb_lock);
7094                 if (dev->dd_record_fid_accessed) {
7095                         struct rb_node                  *node;
7096                         struct rb_node                  *next;
7097                         struct lfsck_rbtree_node        *lrn;
7098
7099                         /* No need to record the fid accessing anymore. */
7100                         dev->dd_record_fid_accessed = 0;
7101
7102                         node = rb_first(&llsd->llsd_rb_root);
7103                         while (node != NULL) {
7104                                 next = rb_next(node);
7105                                 lrn = rb_entry(node, struct lfsck_rbtree_node,
7106                                                lrn_node);
7107                                 if (atomic_read(&lrn->lrn_known_count) <=
7108                                     atomic_read(&lrn->lrn_accessed_count)) {
7109                                         rb_erase(node, &llsd->llsd_rb_root);
7110                                         lfsck_rbtree_free(lrn);
7111                                 }
7112                                 node = next;
7113                         }
7114                 }
7115                 write_unlock(&llsd->llsd_rb_lock);
7116         }
7117
7118         /* read lock the rbtree when init, and unlock when fini */
7119         read_lock(&llsd->llsd_rb_lock);
7120         it->loi_com = com;
7121         com = NULL;
7122
7123         GOTO(out, rc = 0);
7124
7125 out:
7126         if (com != NULL)
7127                 lfsck_component_put(env, com);
7128
7129         CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
7130                lfsck_lfsck2name(lfsck), rc);
7131
7132         lfsck_instance_put(env, lfsck);
7133         if (rc != 0) {
7134                 if (it != NULL)
7135                         OBD_FREE_PTR(it);
7136
7137                 it = (struct lfsck_orphan_it *)ERR_PTR(rc);
7138         }
7139
7140         return (struct dt_it *)it;
7141 }
7142
7143 static void lfsck_orphan_it_fini(const struct lu_env *env,
7144                                  struct dt_it *di)
7145 {
7146         struct lfsck_orphan_it           *it    = (struct lfsck_orphan_it *)di;
7147         struct lfsck_component           *com   = it->loi_com;
7148         struct lfsck_layout_slave_data   *llsd;
7149         struct lfsck_layout_slave_target *llst;
7150
7151         if (com != NULL) {
7152                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7153                        lfsck_lfsck2name(com->lc_lfsck));
7154
7155                 llsd = com->lc_data;
7156                 read_unlock(&llsd->llsd_rb_lock);
7157                 llst = it->loi_llst;
7158                 LASSERT(llst != NULL);
7159
7160                 /* Save the key and hash for iterate next. */
7161                 llst->llst_fid = it->loi_key;
7162                 llst->llst_hash = it->loi_hash;
7163                 lfsck_layout_llst_put(llst);
7164                 lfsck_component_put(env, com);
7165         }
7166         OBD_FREE_PTR(it);
7167 }
7168
7169 /**
7170  * \retval       +1: the iteration finished
7171  * \retval        0: on success, not finished
7172  * \retval      -ve: on error
7173  */
7174 static int lfsck_orphan_it_next(const struct lu_env *env,
7175                                 struct dt_it *di)
7176 {
7177         struct lfsck_thread_info        *info   = lfsck_env_info(env);
7178         struct filter_fid               *ff     = &info->lti_ff;
7179         struct lu_attr                  *la     = &info->lti_la;
7180         struct lfsck_orphan_it          *it     = (struct lfsck_orphan_it *)di;
7181         struct lu_fid                   *key    = &it->loi_key;
7182         struct lu_orphan_rec_v3         *rec    = &it->loi_rec;
7183         struct ost_layout               *ol     = &rec->lor_layout;
7184         struct lfsck_component          *com    = it->loi_com;
7185         struct lfsck_instance           *lfsck  = com->lc_lfsck;
7186         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
7187         struct dt_object                *obj;
7188         struct lfsck_rbtree_node        *lrn;
7189         int                              pos;
7190         int                              rc;
7191         __u32                            save;
7192         __u32                            idx    = it->loi_llst->llst_index;
7193         bool                             exact  = false;
7194         ENTRY;
7195
7196         if (it->loi_over)
7197                 RETURN(1);
7198
7199 again0:
7200         lrn = it->loi_lrn;
7201         if (lrn == NULL) {
7202                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7203                 if (lrn == NULL) {
7204                         it->loi_over = 1;
7205                         RETURN(1);
7206                 }
7207
7208                 it->loi_lrn = lrn;
7209                 if (!exact) {
7210                         key->f_seq = lrn->lrn_seq;
7211                         key->f_oid = lrn->lrn_first_oid;
7212                         key->f_ver = 0;
7213                 }
7214         } else {
7215                 key->f_oid++;
7216                 if (unlikely(key->f_oid == 0)) {
7217                         key->f_seq++;
7218                         it->loi_lrn = NULL;
7219                         goto again0;
7220                 }
7221
7222                 if (key->f_oid >=
7223                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7224                         it->loi_lrn = NULL;
7225                         goto again0;
7226                 }
7227         }
7228
7229         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7230                      atomic_read(&lrn->lrn_accessed_count))) {
7231                 struct rb_node *next = rb_next(&lrn->lrn_node);
7232
7233                 while (next != NULL) {
7234                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7235                                        lrn_node);
7236                         if (atomic_read(&lrn->lrn_known_count) >
7237                             atomic_read(&lrn->lrn_accessed_count))
7238                                 break;
7239                         next = rb_next(next);
7240                 }
7241
7242                 if (next == NULL) {
7243                         it->loi_over = 1;
7244                         RETURN(1);
7245                 }
7246
7247                 it->loi_lrn = lrn;
7248                 key->f_seq = lrn->lrn_seq;
7249                 key->f_oid = lrn->lrn_first_oid;
7250                 key->f_ver = 0;
7251         }
7252
7253         pos = key->f_oid - lrn->lrn_first_oid;
7254
7255 again1:
7256         pos = find_next_bit(lrn->lrn_known_bitmap,
7257                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7258         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7259                 key->f_oid = lrn->lrn_first_oid + pos;
7260                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7261                         key->f_seq++;
7262                         key->f_oid = 0;
7263                 }
7264                 it->loi_lrn = NULL;
7265                 goto again0;
7266         }
7267
7268         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7269                 pos++;
7270                 goto again1;
7271         }
7272
7273         key->f_oid = lrn->lrn_first_oid + pos;
7274         obj = lfsck_object_find_bottom(env, lfsck, key);
7275         if (IS_ERR(obj)) {
7276                 rc = PTR_ERR(obj);
7277                 if (rc == -ENOENT) {
7278                         pos++;
7279                         goto again1;
7280                 }
7281                 RETURN(rc);
7282         }
7283
7284         dt_read_lock(env, obj, 0);
7285         if (dt_object_exists(obj) == 0 ||
7286             lfsck_is_dead_obj(obj)) {
7287                 dt_read_unlock(env, obj);
7288                 lfsck_object_put(env, obj);
7289                 pos++;
7290                 goto again1;
7291         }
7292
7293         rc = dt_attr_get(env, obj, la);
7294         if (rc != 0)
7295                 GOTO(out, rc);
7296
7297         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7298                           XATTR_NAME_FID);
7299         if (rc == -ENODATA) {
7300                 /* For the pre-created OST-object, update the bitmap to avoid
7301                  * others LFSCK (second phase) iteration to touch it again. */
7302                 if (la->la_ctime == 0) {
7303                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7304                                 atomic_inc(&lrn->lrn_accessed_count);
7305
7306                         /* For the race between repairing dangling referenced
7307                          * MDT-object and unlink the file, it may left orphan
7308                          * OST-object there. Destroy it now! */
7309                         if (unlikely(!(la->la_mode & S_ISUID))) {
7310                                 dt_read_unlock(env, obj);
7311                                 lfsck_layout_destroy_orphan(env, obj);
7312                                 lfsck_object_put(env, obj);
7313                                 pos++;
7314                                 goto again1;
7315                         }
7316                 } else if (idx == 0) {
7317                         /* If the orphan OST-object has no parent information,
7318                          * regard it as referenced by the MDT-object on MDT0. */
7319                         fid_zero(&rec->lor_rec.lor_fid);
7320                         rec->lor_rec.lor_uid = la->la_uid;
7321                         rec->lor_rec.lor_gid = la->la_gid;
7322                         memset(ol, 0, sizeof(*ol));
7323                         rec->lor_layout_version = 0;
7324                         rec->lor_range = 0;
7325
7326                         GOTO(out, rc = 0);
7327                 }
7328
7329                 dt_read_unlock(env, obj);
7330                 lfsck_object_put(env, obj);
7331                 pos++;
7332                 goto again1;
7333         }
7334
7335         if (rc < sizeof(struct lu_fid))
7336                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7337
7338         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7339         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7340          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7341          * parent MDT-object's layout EA. */
7342         save = rec->lor_rec.lor_fid.f_stripe_idx;
7343         rec->lor_rec.lor_fid.f_ver = 0;
7344         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7345         /* If the orphan OST-object does not claim the MDT, then next.
7346          *
7347          * If we do not know whether it matches or not, then return it
7348          * to the MDT for further check. */
7349         if (rc == 0) {
7350                 dt_read_unlock(env, obj);
7351                 lfsck_object_put(env, obj);
7352                 pos++;
7353                 goto again1;
7354         }
7355
7356         rec->lor_rec.lor_fid.f_stripe_idx = save;
7357         rec->lor_rec.lor_uid = la->la_uid;
7358         rec->lor_rec.lor_gid = la->la_gid;
7359         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7360         rec->lor_layout_version =
7361                 le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC);
7362         rec->lor_range = le32_to_cpu(ff->ff_range);
7363
7364         CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, "
7365                "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, "
7366                "COMP end %llu, layout version %u, range %u\n",
7367                lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7368                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7369                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7370                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
7371                rec->lor_layout_version, rec->lor_range);
7372
7373         GOTO(out, rc = 0);
7374
7375 out:
7376         dt_read_unlock(env, obj);
7377         lfsck_object_put(env, obj);
7378         if (rc == 0)
7379                 it->loi_hash++;
7380
7381         return rc;
7382 }
7383
7384 /**
7385  * \retval       +1: locate to the exactly position
7386  * \retval        0: cannot locate to the exactly position,
7387  *                   call next() to move to a valid position.
7388  * \retval      -ve: on error
7389  */
7390 static int lfsck_orphan_it_get(const struct lu_env *env,
7391                                struct dt_it *di,
7392                                const struct dt_key *key)
7393 {
7394         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7395         int                      rc;
7396
7397         it->loi_key = *(struct lu_fid *)key;
7398         rc = lfsck_orphan_it_next(env, di);
7399         if (rc == 1)
7400                 return 0;
7401
7402         if (rc == 0)
7403                 return 1;
7404
7405         return rc;
7406 }
7407
/**
 * Release the iterator position taken by lfsck_orphan_it_get().
 *
 * Intentionally a no-op: the body does nothing with \a env or \a di.
 *
 * \param[in] env	pointer to the thread context (unused)
 * \param[in] di	the orphan iterator (unused)
 */
static void lfsck_orphan_it_put(const struct lu_env *env,
				struct dt_it *di)
{
	/* Nothing to release. */
}
7412
7413 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7414                                           const struct dt_it *di)
7415 {
7416         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7417
7418         return (struct dt_key *)&it->loi_key;
7419 }
7420
7421 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7422                                     const struct dt_it *di)
7423 {
7424         return sizeof(struct lu_fid);
7425 }
7426
7427 static int lfsck_orphan_it_rec(const struct lu_env *env,
7428                                const struct dt_it *di,
7429                                struct dt_rec *rec,
7430                                __u32 attr)
7431 {
7432         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7433
7434         *(struct lu_orphan_rec_v3 *)rec = it->loi_rec;
7435
7436         return 0;
7437 }
7438
7439 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7440                                    const struct dt_it *di)
7441 {
7442         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7443
7444         return it->loi_hash;
7445 }
7446
7447 /**
7448  * \retval       +1: locate to the exactly position
7449  * \retval        0: cannot locate to the exactly position,
7450  *                   call next() to move to a valid position.
7451  * \retval      -ve: on error
7452  */
7453 static int lfsck_orphan_it_load(const struct lu_env *env,
7454                                 const struct dt_it *di,
7455                                 __u64 hash)
7456 {
7457         struct lfsck_orphan_it           *it   = (struct lfsck_orphan_it *)di;
7458         struct lfsck_layout_slave_target *llst = it->loi_llst;
7459         int                               rc;
7460
7461         LASSERT(llst != NULL);
7462
7463         if (hash != llst->llst_hash) {
7464                 CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan "
7465                        "iteration does not match the one when fini "
7466                        "%llu, to be reset.\n",
7467                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7468                        llst->llst_hash);
7469                 fid_zero(&llst->llst_fid);
7470                 llst->llst_hash = 0;
7471         }
7472
7473         it->loi_key = llst->llst_fid;
7474         it->loi_hash = llst->llst_hash;
7475         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7476         if (rc == 1)
7477                 return 0;
7478
7479         if (rc == 0)
7480                 return 1;
7481
7482         return rc;
7483 }
7484
/**
 * Combined key/record accessor for the orphan iterator.
 *
 * This implementation does not fill \a key_rec; it only reports success.
 *
 * \param[in] env	pointer to the thread context (unused)
 * \param[in] di	the orphan iterator (unused)
 * \param[in] key_rec	caller's buffer (left untouched)
 *
 * \retval		0 always
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
				   const struct dt_it *di,
				   void *key_rec)
{
	/* Nothing is written to key_rec. */
	return 0;
}
7491
7492 const struct dt_index_operations lfsck_orphan_index_ops = {
7493         .dio_lookup             = lfsck_orphan_index_lookup,
7494         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7495         .dio_insert             = lfsck_orphan_index_insert,
7496         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7497         .dio_delete             = lfsck_orphan_index_delete,
7498         .dio_it = {
7499                 .init           = lfsck_orphan_it_init,
7500                 .fini           = lfsck_orphan_it_fini,
7501                 .get            = lfsck_orphan_it_get,
7502                 .put            = lfsck_orphan_it_put,
7503                 .next           = lfsck_orphan_it_next,
7504                 .key            = lfsck_orphan_it_key,
7505                 .key_size       = lfsck_orphan_it_key_size,
7506                 .rec            = lfsck_orphan_it_rec,
7507                 .store          = lfsck_orphan_it_store,
7508                 .load           = lfsck_orphan_it_load,
7509                 .key_rec        = lfsck_orphan_it_key_rec,
7510         }
7511 };