Whamcloud - gitweb
LU-17000 lnet: fix use-after-free in lnet_startup_lndnet
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2017, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lu_object.h>
40 #include <dt_object.h>
41 #include <lustre_fid.h>
42 #include <lustre_lib.h>
43 #include <lustre_net.h>
44 #include <md_object.h>
45 #include <obd_class.h>
46
47 #include "lfsck_internal.h"
48
/*
 * Magic numbers, one per revision (V1..V4) of the layout LFSCK data.
 * LFSCK_LAYOUT_MAGIC always aliases the newest revision, currently V4.
 * NOTE(review): presumably these tag the on-disk layout LFSCK record;
 * confirm against the code that reads and writes the magic.
 */
#define LFSCK_LAYOUT_MAGIC_V1           0xB173AE14
#define LFSCK_LAYOUT_MAGIC_V2           0xB1734D76
#define LFSCK_LAYOUT_MAGIC_V3           0xB17371B9
#define LFSCK_LAYOUT_MAGIC_V4           0xB1732FED

#define LFSCK_LAYOUT_MAGIC              LFSCK_LAYOUT_MAGIC_V4
55
56 struct lfsck_layout_seq {
57         struct list_head         lls_list;
58         __u64                    lls_seq;
59         __u64                    lls_lastid;
60         __u64                    lls_lastid_known;
61         struct dt_object        *lls_lastid_obj;
62         unsigned int             lls_dirty:1;
63 };
64
65 struct lfsck_layout_slave_target {
66         /* link into lfsck_layout_slave_data::llsd_master_list. */
67         struct list_head        llst_list;
68         /* The position for next record in the rbtree for iteration. */
69         struct lu_fid           llst_fid;
70         /* Dummy hash for iteration against the rbtree. */
71         __u64                   llst_hash;
72         __u64                   llst_gen;
73         atomic_t                llst_ref;
74         __u32                   llst_index;
75         /* How many times we have failed to get the master status. */
76         int                     llst_failures;
77 };
78
79 struct lfsck_layout_slave_data {
80         /* list for lfsck_layout_seq */
81         struct list_head         llsd_seq_list;
82
83         /* list for the masters involve layout verification. */
84         struct list_head         llsd_master_list;
85         spinlock_t               llsd_lock;
86         __u64                    llsd_touch_gen;
87         struct dt_object        *llsd_rb_obj;
88         struct rb_root           llsd_rb_root;
89         struct rw_semaphore      llsd_rb_rwsem;
90         unsigned int             llsd_rbtree_valid:1;
91 };
92
/*
 * Bundle of pointers: an export, an LFSCK component and a slave target.
 * NOTE(review): by its name this is the argument block of an async
 * request issued by the layout slave; its users are not in view here.
 */
struct lfsck_layout_slave_async_args {
        struct obd_export                *llsaa_exp;
        struct lfsck_component           *llsaa_com;
        struct lfsck_layout_slave_target *llsaa_llst;
};
98
/**
 * Check whether \a border is a multiple of \a size.
 *
 * The check masks \a border with (\a size - 1), so it is a real alignment
 * test only when \a size is a power of two.
 *
 * \param[in] border    the offset to be checked
 * \param[in] size      the alignment unit
 *
 * \retval              true if no bit of the mask is set in \a border
 * \retval              false otherwise
 */
static inline bool lfsck_comp_extent_aligned(__u64 border, __u32 size)
{
        __u32 mask = size - 1;

        return !(border & mask);
}
103
104 static inline void
105 lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst)
106 {
107         if (atomic_dec_and_test(&llst->llst_ref)) {
108                 LASSERT(list_empty(&llst->llst_list));
109
110                 OBD_FREE_PTR(llst);
111         }
112 }
113
114 static inline int
115 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
116 {
117         struct lfsck_layout_slave_target *llst;
118         struct lfsck_layout_slave_target *tmp;
119         int                               rc   = 0;
120
121         OBD_ALLOC_PTR(llst);
122         if (llst == NULL)
123                 return -ENOMEM;
124
125         INIT_LIST_HEAD(&llst->llst_list);
126         llst->llst_gen = 0;
127         llst->llst_index = index;
128         atomic_set(&llst->llst_ref, 1);
129
130         spin_lock(&llsd->llsd_lock);
131         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
132                 if (tmp->llst_index == index) {
133                         rc = -EALREADY;
134                         break;
135                 }
136         }
137         if (rc == 0)
138                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
139         spin_unlock(&llsd->llsd_lock);
140
141         if (rc != 0)
142                 OBD_FREE_PTR(llst);
143
144         return rc;
145 }
146
147 static inline void
148 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
149                       struct lfsck_layout_slave_target *llst)
150 {
151         bool del = false;
152
153         spin_lock(&llsd->llsd_lock);
154         if (!list_empty(&llst->llst_list)) {
155                 list_del_init(&llst->llst_list);
156                 del = true;
157         }
158         spin_unlock(&llsd->llsd_lock);
159
160         if (del)
161                 lfsck_layout_llst_put(llst);
162 }
163
164 static inline struct lfsck_layout_slave_target *
165 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
166                                __u32 index, bool unlink)
167 {
168         struct lfsck_layout_slave_target *llst;
169
170         spin_lock(&llsd->llsd_lock);
171         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
172                 if (llst->llst_index == index) {
173                         if (unlink)
174                                 list_del_init(&llst->llst_list);
175                         else
176                                 atomic_inc(&llst->llst_ref);
177                         spin_unlock(&llsd->llsd_lock);
178
179                         return llst;
180                 }
181         }
182         spin_unlock(&llsd->llsd_lock);
183
184         return NULL;
185 }
186
187 static struct lfsck_layout_req *
188 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
189                                 struct dt_object *child, __u32 comp_id,
190                                 __u32 ost_idx, __u32 lov_idx)
191 {
192         struct lfsck_layout_req *llr;
193
194         OBD_ALLOC_PTR(llr);
195         if (llr == NULL)
196                 return ERR_PTR(-ENOMEM);
197
198         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
199         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
200         llr->llr_child = child;
201         llr->llr_comp_id = comp_id;
202         llr->llr_ost_idx = ost_idx;
203         llr->llr_lov_idx = lov_idx;
204
205         return llr;
206 }
207
208 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
209                                             struct lfsck_assistant_req *lar)
210 {
211         struct lfsck_layout_req *llr =
212                 container_of(lar, struct lfsck_layout_req, llr_lar);
213
214         lfsck_object_put(env, llr->llr_child);
215         lfsck_assistant_object_put(env, lar->lar_parent);
216         OBD_FREE_PTR(llr);
217 }
218
219 static int
220 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
221                                                struct ptlrpc_request *req,
222                                                void *args, int rc)
223 {
224         if (rc == 0) {
225                 struct lfsck_async_interpret_args *laia = args;
226                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
227
228                 ltd->ltd_synced_failures = 1;
229                 atomic_dec(laia->laia_count);
230         }
231
232         return 0;
233 }
234
235 /**
236  * Notify remote LFSCK instances about former failures.
237  *
238  * The local LFSCK instance has recorded which OSTs have ever failed to respond
239  * some LFSCK verification requests (maybe because of network issues or the OST
240  * itself trouble). During the respond gap, the OST may missed some OST-objects
241  * verification, then the OST cannot know whether related OST-objects have been
242  * referenced by related MDT-objects or not, then in the second-stage scanning,
243  * these OST-objects will be regarded as orphan, if the OST-object contains bad
244  * parent FID for back reference, then it will misguide the LFSCK to make wrong
245  * fixing for the fake orphan.
246  *
247  * To avoid above trouble, when layout LFSCK finishes the first-stage scanning,
248  * it will scan the bitmap for the ever failed OSTs, and notify them that they
249  * have ever missed some OST-object verification and should skip the handling
250  * for orphan OST-objects on all MDTs that are in the layout LFSCK.
251  *
252  * \param[in] env       pointer to the thread context
253  * \param[in] com       pointer to the lfsck component
254  * \param[in] lr        pointer to the lfsck request
255  */
256 static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
257                                                  struct lfsck_component *com,
258                                                  struct lfsck_request *lr)
259 {
260         struct lfsck_async_interpret_args *laia  =
261                                 &lfsck_env_info(env)->lti_laia2;
262         struct lfsck_assistant_data       *lad   = com->lc_data;
263         struct lfsck_layout               *lo    = com->lc_file_ram;
264         struct lfsck_instance             *lfsck = com->lc_lfsck;
265         struct lfsck_tgt_descs            *ltds  = &lfsck->li_ost_descs;
266         struct lfsck_tgt_desc             *ltd;
267         struct ptlrpc_request_set         *set;
268         atomic_t                           count;
269         __u32                              idx;
270         int                                rc    = 0;
271         ENTRY;
272
273         if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
274                 RETURN_EXIT;
275
276         /* If the MDT has ever failed to verfiy some OST-objects,
277          * then sync failures with them firstly. */
278         lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;
279
280         atomic_set(&count, 0);
281         memset(laia, 0, sizeof(*laia));
282         laia->laia_count = &count;
283         set = ptlrpc_prep_set();
284         if (set == NULL)
285                 GOTO(out, rc = -ENOMEM);
286
287         down_read(&ltds->ltd_rw_sem);
288         for_each_set_bit(idx, lad->lad_bitmap, lad->lad_bitmap_count) {
289                 ltd = lfsck_ltd2tgt(ltds, idx);
290                 if (unlikely(!ltd))
291                         continue;
292
293                 laia->laia_ltd = ltd;
294                 rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
295                                 lfsck_layout_assistant_sync_failures_interpret,
296                                 laia, LFSCK_NOTIFY);
297                 if (rc != 0) {
298                         CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to "
299                                "notify target %x for %s phase1 done: "
300                                "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
301                                ltd->ltd_index, lad->lad_name, rc);
302
303                         break;
304                 }
305
306                 atomic_inc(&count);
307         }
308         up_read(&ltds->ltd_rw_sem);
309
310         if (rc == 0 && atomic_read(&count) > 0)
311                 rc = ptlrpc_set_wait(env, set);
312
313         ptlrpc_set_destroy(set);
314
315         if (rc == 0 && atomic_read(&count) > 0)
316                 rc = -EINVAL;
317
318         GOTO(out, rc);
319
320 out:
321         if (rc != 0)
322                 /* If failed to sync failures with the OSTs, then have to
323                  * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
324                  * subsequent orphan OST-object handling. */
325                 lo->ll_flags |= LF_INCOMPLETE;
326
327         lr->lr_flags2 = lo->ll_flags;
328 }
329
330 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
331                                            struct lov_mds_md_v1 *lmm,
332                                            __u64 start, __u64 end,
333                                            __u32 comp_id,
334                                            bool ext, bool *dom)
335 {
336         __u32 magic;
337         __u32 pattern;
338         __u32 size;
339
340         magic = le32_to_cpu(lmm->lmm_magic);
341         /* If magic crashed, keep it there. Sometime later, during OST-object
342          * orphan handling, if some OST-object(s) back-point to it, it can be
343          * verified and repaired. */
344         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
345                 int rc;
346
347                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
348                         rc = -EOPNOTSUPP;
349                 else
350                         rc = -EINVAL;
351
352                 CDEBUG(D_LFSCK, "%s LOV EA magic 0x%X for the file "DFID"\n",
353                        rc == -EINVAL ? "Unknown" : "Unsupported",
354                        magic, PFID(lfsck_dto2fid(obj)));
355
356                 return rc;
357         }
358
359         pattern = le32_to_cpu(lmm->lmm_pattern);
360         *dom = !!(lov_pattern(pattern) == LOV_PATTERN_MDT);
361
362         /* XXX: DoM file verification will be supportted via LU-11081. */
363         if (lov_pattern(pattern) == LOV_PATTERN_MDT) {
364 #if 0
365                 if (start != 0) {
366                         CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not "
367                                "the first component in the mirror %x/%llu\n",
368                                PFID(lfsck_dto2fid(obj)), comp_id, start);
369
370                         return -EINVAL;
371                 }
372 #endif
373         } else if (!lov_pattern_supported_normal_comp(lov_pattern(pattern))) {
374                 CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file "
375                        DFID" in the component %x\n",
376                        pattern, PFID(lfsck_dto2fid(obj)), comp_id);
377
378                 return -EOPNOTSUPP;
379         }
380
381         size = le32_to_cpu(lmm->lmm_stripe_size);
382         if (!ext && end != LUSTRE_EOF && start != end &&
383             !lfsck_comp_extent_aligned(end, size)){
384                 CDEBUG(D_LFSCK, "not aligned border in PFL extent range "
385                        "[%llu - %llu) stripesize %u for the file "DFID
386                        " at idx %d\n", start, end, size,
387                        PFID(lfsck_dto2fid(obj)), comp_id);
388
389                 return -EINVAL;
390         }
391
392         return 0;
393 }
394
395 static int lfsck_layout_verify_header_foreign(struct dt_object *obj,
396                                               struct lov_foreign_md *lfm,
397                                               size_t len)
398 {
399         /* magic has been verified already */
400         __u32 value_len = le32_to_cpu(lfm->lfm_length);
401         /* type and flags are not checked for instance */
402
403         CDEBUG(D_INFO, "foreign LOV EA, magic %x, len %u, type %x, flags %x, for file "DFID"\n",
404                le32_to_cpu(lfm->lfm_magic), value_len,
405                le32_to_cpu(lfm->lfm_type), le32_to_cpu(lfm->lfm_flags),
406                PFID(lfsck_dto2fid(obj)));
407
408         if (len != value_len + offsetof(typeof(*lfm), lfm_value))
409                 CDEBUG(D_LFSCK, "foreign LOV EA internal size %u does not match EA full size %zu for file "DFID"\n",
410                        value_len, len, PFID(lfsck_dto2fid(obj)));
411
412         /* nothing to repair */
413         return -ENODATA;
414 }
415
416 static int lfsck_layout_verify_header(struct dt_object *obj,
417                                       struct lov_mds_md_v1 *lmm, size_t len)
418 {
419         bool p_dom = false;
420         int rc = 0;
421
422         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1 ||
423             le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_SEL) {
424                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
425                 bool p_zero = false;
426                 int i;
427                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
428
429                 if (unlikely(count == 0)) {
430                         CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid "
431                                "components count 0\n",
432                                PFID(lfsck_dto2fid(obj)));
433
434                         return -EINVAL;
435                 }
436
437                 for (i = 0; i < count && !rc; i++) {
438                         struct lov_comp_md_entry_v1 *lcme =
439                                                 &lcm->lcm_entries[i];
440                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
441                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
442                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
443                         bool ext, inited, zero;
444                         __u32 flags;
445
446                         if (unlikely(comp_id == LCME_ID_INVAL ||
447                                      comp_id > LCME_ID_MAX)) {
448                                 CDEBUG(D_LFSCK, "found invalid PFL ID %u "
449                                        "for the file "DFID" at idx %d\n",
450                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
451
452                                 return -EINVAL;
453                         }
454
455                         flags = le32_to_cpu(lcme->lcme_flags);
456                         ext = flags & LCME_FL_EXTENSION;
457                         inited = flags & LCME_FL_INIT;
458                         zero = !!(start == end);
459
460                         if ((i == 0) && zero) {
461                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu "
462                                        "- %llu) for "DFID"\n", i, start, end,
463                                        PFID(lfsck_dto2fid(obj)));
464                                 return -EINVAL;
465                         }
466
467                         if ((zero && (inited || (i + 1 == count))) ||
468                             (start > end)) {
469                                 CDEBUG(D_LFSCK, "invalid PFL comp %d/%d: "
470                                        "[%llu, %llu) for "DFID", %sinited\n",
471                                        i, count, start, end,
472                                        PFID(lfsck_dto2fid(obj)),
473                                        inited ? "" : "NOT ");
474                                 return -EINVAL;
475                         }
476
477                         if (!ext && p_zero) {
478                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu, "
479                                        "%llu) for "DFID": NOT extension "
480                                        "after 0-length component\n", i,
481                                        start, end, PFID(lfsck_dto2fid(obj)));
482                                 return -EINVAL;
483                         }
484
485                         if (ext && (inited || p_dom || zero)) {
486                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu, "
487                                        "%llu) for "DFID": %s\n", i,
488                                        start, end, PFID(lfsck_dto2fid(obj)),
489                                        inited ? "inited extension" :
490                                        p_dom ? "extension follows DOM" :
491                                        zero ? "zero length extension" : "");
492                                 return -EINVAL;
493                         }
494
495                         rc = lfsck_layout_verify_header_v1v3(obj,
496                                         (struct lov_mds_md_v1 *)((char *)lmm +
497                                         le32_to_cpu(lcme->lcme_offset)), start,
498                                         end, comp_id, ext, &p_dom);
499
500                         p_zero = zero;
501                 }
502         } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_FOREIGN) {
503                 rc = lfsck_layout_verify_header_foreign(obj,
504                                                 (struct lov_foreign_md *)lmm,
505                                                 len);
506         } else {
507                 rc = lfsck_layout_verify_header_v1v3(obj, lmm, 0, LUSTRE_EOF,
508                                                      0, false, &p_dom);
509         }
510
511         return rc;
512 }
513
514 static int lfsck_layout_get_lovea(const struct lu_env *env,
515                                   struct dt_object *obj, struct lu_buf *buf)
516 {
517         int rc;
518         int rc1;
519
520 again:
521         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
522         if (rc == -ERANGE) {
523                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
524                 if (rc <= 0)
525                         return !rc ? -ENODATA : rc;
526
527                 lu_buf_realloc(buf, rc);
528                 if (buf->lb_buf == NULL)
529                         return -ENOMEM;
530
531                 goto again;
532         }
533
534         if (rc <= 0)
535                 return !rc ? -ENODATA : rc;
536
537         if (unlikely(buf->lb_buf == NULL)) {
538                 lu_buf_alloc(buf, rc);
539                 if (buf->lb_buf == NULL)
540                         return -ENOMEM;
541
542                 goto again;
543         }
544
545         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf, rc);
546
547         return rc1 ? rc1 : rc;
548 }
549
/*
 * Geometry of the per-node bitmaps in the layout rbtree:
 *  SIZE  - bytes allocated for each bitmap (one page),
 *  WIDTH - number of bits, i.e. OIDs, covered by one bitmap,
 *  MASK  - extracts the bit index inside a bitmap from an OID.
 */
#define LFSCK_RBTREE_BITMAP_SIZE        PAGE_SIZE
#define LFSCK_RBTREE_BITMAP_WIDTH       (LFSCK_RBTREE_BITMAP_SIZE << 3)
#define LFSCK_RBTREE_BITMAP_MASK        (LFSCK_RBTREE_BITMAP_WIDTH - 1)
553
554 struct lfsck_rbtree_node {
555         struct rb_node   lrn_node;
556         __u64            lrn_seq;
557         __u32            lrn_first_oid;
558         atomic_t         lrn_known_count;
559         atomic_t         lrn_accessed_count;
560         void            *lrn_known_bitmap;
561         void            *lrn_accessed_bitmap;
562 };
563
564 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
565                                    __u64 seq, __u32 oid)
566 {
567         if (seq < lrn->lrn_seq)
568                 return -1;
569
570         if (seq > lrn->lrn_seq)
571                 return 1;
572
573         if (oid < lrn->lrn_first_oid)
574                 return -1;
575
576         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
577                 return 1;
578
579         return 0;
580 }
581
582 /* The caller should hold llsd->llsd_rb_lock. */
583 static struct lfsck_rbtree_node *
584 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
585                     const struct lu_fid *fid, bool *exact)
586 {
587         struct rb_node           *node  = llsd->llsd_rb_root.rb_node;
588         struct rb_node           *prev  = NULL;
589         struct lfsck_rbtree_node *lrn   = NULL;
590         int                       rc    = 0;
591
592         if (exact != NULL)
593                 *exact = true;
594
595         while (node != NULL) {
596                 prev = node;
597                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
598                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
599                 if (rc < 0)
600                         node = node->rb_left;
601                 else if (rc > 0)
602                         node = node->rb_right;
603                 else
604                         return lrn;
605         }
606
607         if (exact == NULL)
608                 return NULL;
609
610         /* If there is no exactly matched one, then to the next valid one. */
611         *exact = false;
612
613         /* The rbtree is empty. */
614         if (rc == 0)
615                 return NULL;
616
617         if (rc < 0)
618                 return lrn;
619
620         node = rb_next(prev);
621
622         /* The end of the rbtree. */
623         if (node == NULL)
624                 return NULL;
625
626         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
627
628         return lrn;
629 }
630
631 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
632                                                   const struct lu_fid *fid)
633 {
634         struct lfsck_rbtree_node *lrn;
635
636         OBD_ALLOC_PTR(lrn);
637         if (lrn == NULL)
638                 return ERR_PTR(-ENOMEM);
639
640         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
641         if (lrn->lrn_known_bitmap == NULL) {
642                 OBD_FREE_PTR(lrn);
643
644                 return ERR_PTR(-ENOMEM);
645         }
646
647         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
648         if (lrn->lrn_accessed_bitmap == NULL) {
649                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
650                 OBD_FREE_PTR(lrn);
651
652                 return ERR_PTR(-ENOMEM);
653         }
654
655         RB_CLEAR_NODE(&lrn->lrn_node);
656         lrn->lrn_seq = fid_seq(fid);
657         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
658         atomic_set(&lrn->lrn_known_count, 0);
659         atomic_set(&lrn->lrn_accessed_count, 0);
660
661         return lrn;
662 }
663
664 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
665 {
666         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
667         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
668         OBD_FREE_PTR(lrn);
669 }
670
671 /* The caller should hold lock. */
672 static struct lfsck_rbtree_node *
673 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
674                     struct lfsck_rbtree_node *lrn)
675 {
676         struct rb_node           **pos    = &llsd->llsd_rb_root.rb_node;
677         struct rb_node            *parent = NULL;
678         struct lfsck_rbtree_node  *tmp;
679         int                        rc;
680
681         while (*pos != NULL) {
682                 parent = *pos;
683                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
684                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
685                 if (rc < 0)
686                         pos = &(*pos)->rb_left;
687                 else if (rc > 0)
688                         pos = &(*pos)->rb_right;
689                 else
690                         return tmp;
691         }
692
693         rb_link_node(&lrn->lrn_node, parent, pos);
694         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
695
696         return lrn;
697 }
698
699 static const struct dt_index_operations lfsck_orphan_index_ops;
700
701 static int lfsck_rbtree_setup(const struct lu_env *env,
702                               struct lfsck_component *com)
703 {
704         struct lu_fid                   *fid    = &lfsck_env_info(env)->lti_fid;
705         struct lfsck_instance           *lfsck  = com->lc_lfsck;
706         struct dt_device                *dev    = lfsck->li_bottom;
707         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
708         struct dt_object                *obj;
709
710         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
711         fid->f_oid = lfsck_dev_idx(lfsck);
712         fid->f_ver = 0;
713         obj = dt_locate(env, dev, fid);
714         if (IS_ERR(obj))
715                 RETURN(PTR_ERR(obj));
716
717         /* Generate an in-RAM object to stand for the layout rbtree.
718          * Scanning the layout rbtree will be via the iteration over
719          * the object. In the future, the rbtree may be written onto
720          * disk with the object.
721          *
722          * Mark the object to be as exist. */
723         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
724         obj->do_index_ops = &lfsck_orphan_index_ops;
725         llsd->llsd_rb_obj = obj;
726         llsd->llsd_rbtree_valid = 1;
727         dev->dd_record_fid_accessed = 1;
728
729         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
730                lfsck_lfsck2name(lfsck));
731
732         return 0;
733 }
734
735 static void lfsck_rbtree_cleanup(const struct lu_env *env,
736                                  struct lfsck_component *com)
737 {
738         struct lfsck_instance           *lfsck = com->lc_lfsck;
739         struct lfsck_layout_slave_data  *llsd  = com->lc_data;
740         struct rb_node                  *node  = rb_first(&llsd->llsd_rb_root);
741         struct rb_node                  *next;
742         struct lfsck_rbtree_node        *lrn;
743
744         lfsck->li_bottom->dd_record_fid_accessed = 0;
745         /* Invalid the rbtree, then no others will use it. */
746         down_write(&llsd->llsd_rb_rwsem);
747         llsd->llsd_rbtree_valid = 0;
748         up_write(&llsd->llsd_rb_rwsem);
749
750         while (node != NULL) {
751                 next = rb_next(node);
752                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
753                 rb_erase(node, &llsd->llsd_rb_root);
754                 lfsck_rbtree_free(lrn);
755                 node = next;
756         }
757
758         if (llsd->llsd_rb_obj != NULL) {
759                 lfsck_object_put(env, llsd->llsd_rb_obj);
760                 llsd->llsd_rb_obj = NULL;
761         }
762
763         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
764                lfsck_lfsck2name(lfsck));
765 }
766
767 static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
768                                        struct lfsck_component *com,
769                                        const struct lu_fid *fid,
770                                        bool accessed)
771 {
772         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
773         struct lfsck_rbtree_node        *lrn;
774         bool                             insert = false;
775         int                              idx;
776         int                              rc     = 0;
777         ENTRY;
778
779         if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
780                 RETURN_EXIT;
781
782         if (!fid_is_idif(fid) && !fid_is_norm(fid))
783                 RETURN_EXIT;
784
785         down_read(&llsd->llsd_rb_rwsem);
786         if (!llsd->llsd_rbtree_valid)
787                 GOTO(unlock, rc = 0);
788
789         lrn = lfsck_rbtree_search(llsd, fid, NULL);
790         if (lrn == NULL) {
791                 struct lfsck_rbtree_node *tmp;
792
793                 LASSERT(!insert);
794
795                 up_read(&llsd->llsd_rb_rwsem);
796                 tmp = lfsck_rbtree_new(env, fid);
797                 if (IS_ERR(tmp))
798                         GOTO(out, rc = PTR_ERR(tmp));
799
800                 insert = true;
801                 down_write(&llsd->llsd_rb_rwsem);
802                 if (!llsd->llsd_rbtree_valid) {
803                         lfsck_rbtree_free(tmp);
804                         GOTO(unlock, rc = 0);
805                 }
806
807                 lrn = lfsck_rbtree_insert(llsd, tmp);
808                 if (lrn != tmp)
809                         lfsck_rbtree_free(tmp);
810         }
811
812         idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
813         /* Any accessed object must be a known object. */
814         if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
815                 atomic_inc(&lrn->lrn_known_count);
816         if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
817                 atomic_inc(&lrn->lrn_accessed_count);
818
819         GOTO(unlock, rc = 0);
820
821 unlock:
822         if (insert)
823                 up_write(&llsd->llsd_rb_rwsem);
824         else
825                 up_read(&llsd->llsd_rb_rwsem);
826 out:
827         if (rc != 0 && accessed) {
828                 struct lfsck_layout *lo = com->lc_file_ram;
829
830                 CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing "
831                        "bitmap, and will cause incorrect LFSCK OST-object "
832                        "handling, so disable it to cancel orphan handling "
833                        "for related device. rc = %d\n",
834                        lfsck_lfsck2name(com->lc_lfsck), rc);
835
836                 lo->ll_flags |= LF_INCOMPLETE;
837                 lfsck_rbtree_cleanup(env, com);
838         }
839 }
840
841 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
842                                   const struct lfsck_layout_dangling_key *src)
843 {
844         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
845         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
846         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
847 }
848
849 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
850                                   const struct lfsck_layout_dangling_key *src)
851 {
852         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
853         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
854         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
855 }
856
857 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
858                                   const struct lfsck_layout_dangling_key *src)
859 {
860         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
861         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
862         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
863 }
864
865 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
866                                   const struct lfsck_layout_dangling_key *src)
867 {
868         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
869         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
870         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
871 }
872
873 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
874                                    const struct lfsck_layout *src)
875 {
876         int i;
877
878         des->ll_magic = le32_to_cpu(src->ll_magic);
879         des->ll_status = le32_to_cpu(src->ll_status);
880         des->ll_flags = le32_to_cpu(src->ll_flags);
881         des->ll_success_count = le32_to_cpu(src->ll_success_count);
882         des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1);
883         des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2);
884         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
885         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
886         des->ll_time_last_checkpoint =
887                                 le64_to_cpu(src->ll_time_last_checkpoint);
888         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
889         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
890         des->ll_pos_first_inconsistent =
891                         le64_to_cpu(src->ll_pos_first_inconsistent);
892         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
893         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
894         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
895         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
896         for (i = 0; i < LLIT_MAX; i++)
897                 des->ll_objs_repaired[i] =
898                                 le64_to_cpu(src->ll_objs_repaired[i]);
899         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
900         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
901         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
902                        &src->ll_lldk_latest_scanned_phase2);
903 }
904
905 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
906                                    const struct lfsck_layout *src)
907 {
908         int i;
909
910         des->ll_magic = cpu_to_le32(src->ll_magic);
911         des->ll_status = cpu_to_le32(src->ll_status);
912         des->ll_flags = cpu_to_le32(src->ll_flags);
913         des->ll_success_count = cpu_to_le32(src->ll_success_count);
914         des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1);
915         des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2);
916         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
917         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
918         des->ll_time_last_checkpoint =
919                                 cpu_to_le64(src->ll_time_last_checkpoint);
920         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
921         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
922         des->ll_pos_first_inconsistent =
923                         cpu_to_le64(src->ll_pos_first_inconsistent);
924         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
925         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
926         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
927         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
928         for (i = 0; i < LLIT_MAX; i++)
929                 des->ll_objs_repaired[i] =
930                                 cpu_to_le64(src->ll_objs_repaired[i]);
931         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
932         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
933         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
934                        &src->ll_lldk_latest_scanned_phase2);
935 }
936
937 /**
938  * Load the OST bitmap from the lfsck_layout trace file.
939  *
940  * \param[in] env       pointer to the thread context
941  * \param[in] com       pointer to the lfsck component
942  *
943  * \retval              0 for success
944  * \retval              negative error number on failure or data corruption
945  */
946 static int lfsck_layout_load_bitmap(const struct lu_env *env,
947                                     struct lfsck_component *com)
948 {
949         struct dt_object *obj = com->lc_obj;
950         struct lfsck_assistant_data *lad = com->lc_data;
951         struct lfsck_layout *lo = com->lc_file_ram;
952         unsigned long *bitmap = lad->lad_bitmap;
953         loff_t pos = com->lc_file_size;
954         ssize_t size;
955         __u32 nbits;
956         int rc;
957
958         ENTRY;
959         if (com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len > lo->ll_bitmap_size)
960                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len;
961         else
962                 nbits = lo->ll_bitmap_size;
963
964         if (unlikely(nbits < BITS_PER_LONG))
965                 nbits = BITS_PER_LONG;
966
967         if (nbits > lad->lad_bitmap_count) {
968                 u32 new_bits = lad->lad_bitmap_count;
969                 unsigned long *new_bitmap;
970
971                 while (new_bits < nbits)
972                         new_bits <<= 1;
973
974                 new_bitmap = bitmap_zalloc(new_bits, GFP_KERNEL);
975                 if (new_bitmap == NULL)
976                         RETURN(-ENOMEM);
977
978                 lad->lad_bitmap = new_bitmap;
979                 lad->lad_bitmap_count = new_bits;
980                 bitmap_free(bitmap);
981                 bitmap = new_bitmap;
982         }
983
984         if (lo->ll_bitmap_size == 0) {
985                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
986                 bitmap_zero(bitmap, lad->lad_bitmap_count);
987                 RETURN(0);
988         }
989
990         size = (lo->ll_bitmap_size + 7) >> 3;
991         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap, size), &pos);
992         if (rc != size)
993                 RETURN(rc >= 0 ? -EINVAL : rc);
994
995         if (bitmap_empty(bitmap, lad->lad_bitmap_count))
996                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
997         else
998                 set_bit(LAD_INCOMPLETE, &lad->lad_flags);
999
1000         RETURN(0);
1001 }
1002
1003 /**
1004  * Load the layout LFSCK trace file from disk.
1005  *
1006  * The layout LFSCK trace file records the layout LFSCK status information
1007  * and other statistics, such as how many objects have been scanned, and how
1008  * many objects have been repaired, and etc. It also contains the bitmap for
1009  * failed OSTs during the layout LFSCK. All these information will be loaded
1010  * from disk to RAM when the layout LFSCK component setup.
1011  *
1012  * \param[in] env       pointer to the thread context
1013  * \param[in] com       pointer to the lfsck component
1014  *
1015  * \retval              positive number for file data corruption, the caller
1016  *                      should reset the layout LFSCK trace file
1017  * \retval              0 for success
1018  * \retval              negative error number on failure
1019  */
1020 static int lfsck_layout_load(const struct lu_env *env,
1021                              struct lfsck_component *com)
1022 {
1023         struct lfsck_layout             *lo     = com->lc_file_ram;
1024         ssize_t                          size   = com->lc_file_size;
1025         loff_t                           pos    = 0;
1026         int                              rc;
1027
1028         rc = dt_read(env, com->lc_obj,
1029                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
1030         if (rc == 0) {
1031                 return -ENOENT;
1032         } else if (rc < 0) {
1033                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
1034                        lfsck_lfsck2name(com->lc_lfsck), rc);
1035                 return rc;
1036         } else if (rc != size) {
1037                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
1038                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
1039                 return 1;
1040         }
1041
1042         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
1043         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
1044                 CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, "
1045                        "to be reset\n", lfsck_lfsck2name(com->lc_lfsck),
1046                        lo->ll_magic, LFSCK_LAYOUT_MAGIC);
1047                 return 1;
1048         }
1049
1050         return 0;
1051 }
1052
1053 /**
1054  * Store the layout LFSCK trace file on disk.
1055  *
1056  * The layout LFSCK trace file records the layout LFSCK status information
1057  * and other statistics, such as how many objects have been scanned, and how
1058  * many objects have been repaired, and etc. It also contains the bitmap for
1059  * failed OSTs during the layout LFSCK. All these information will be synced
1060  * from RAM to disk periodically.
1061  *
1062  * \param[in] env       pointer to the thread context
1063  * \param[in] com       pointer to the lfsck component
1064  *
1065  * \retval              0 for success
1066  * \retval              negative error number on failure
1067  */
1068 static int lfsck_layout_store(const struct lu_env *env,
1069                               struct lfsck_component *com)
1070 {
1071         struct dt_object *obj = com->lc_obj;
1072         struct lfsck_instance *lfsck = com->lc_lfsck;
1073         struct lfsck_layout *lo_ram = com->lc_file_ram;
1074         struct lfsck_layout *lo = com->lc_file_disk;
1075         struct thandle *th;
1076         struct dt_device *dev = lfsck_obj2dev(obj);
1077         unsigned long *bitmap = NULL;
1078         loff_t pos;
1079         ssize_t size = com->lc_file_size;
1080         __u32 nbits = 0;
1081         int rc;
1082
1083         ENTRY;
1084         if (lfsck->li_master) {
1085                 struct lfsck_assistant_data *lad = com->lc_data;
1086
1087                 bitmap = lad->lad_bitmap;
1088                 nbits = lad->lad_bitmap_count;
1089
1090                 LASSERT(nbits > 0);
1091                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1092         }
1093
1094         lo_ram->ll_bitmap_size = nbits;
1095         lfsck_layout_cpu_to_le(lo, lo_ram);
1096         th = dt_trans_create(env, dev);
1097         if (IS_ERR(th))
1098                 GOTO(log, rc = PTR_ERR(th));
1099
1100         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1101                                      (loff_t)0, th);
1102         if (rc != 0)
1103                 GOTO(out, rc);
1104
1105         if (bitmap != NULL) {
1106                 rc = dt_declare_record_write(env, obj,
1107                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1108                                 (loff_t)size, th);
1109                 if (rc != 0)
1110                         GOTO(out, rc);
1111         }
1112
1113         rc = dt_trans_start_local(env, dev, th);
1114         if (rc != 0)
1115                 GOTO(out, rc);
1116
1117         pos = 0;
1118         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1119         if (rc != 0)
1120                 GOTO(out, rc);
1121
1122         if (bitmap != NULL) {
1123                 pos = size;
1124                 rc = dt_record_write(env, obj,
1125                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1126                                 &pos, th);
1127         }
1128
1129         GOTO(out, rc);
1130
1131 out:
1132         dt_trans_stop(env, dev, th);
1133
1134 log:
1135         if (rc != 0)
1136                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1137                        lfsck_lfsck2name(lfsck), rc);
1138
1139         return rc;
1140 }
1141
1142 static int lfsck_layout_init(const struct lu_env *env,
1143                              struct lfsck_component *com)
1144 {
1145         struct lfsck_layout *lo = com->lc_file_ram;
1146         int rc;
1147
1148         memset(lo, 0, com->lc_file_size);
1149         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1150         lo->ll_status = LS_INIT;
1151         down_write(&com->lc_sem);
1152         rc = lfsck_layout_store(env, com);
1153         if (rc == 0 && com->lc_lfsck->li_master)
1154                 rc = lfsck_load_sub_trace_files(env, com,
1155                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1156         up_write(&com->lc_sem);
1157
1158         return rc;
1159 }
1160
1161 static int fid_is_for_ostobj(const struct lu_env *env,
1162                              struct lfsck_instance *lfsck,
1163                              struct dt_object *obj, const struct lu_fid *fid)
1164 {
1165         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1166         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1167         struct lustre_ost_attrs *loa;
1168         int                      rc;
1169
1170         fld_range_set_any(range);
1171         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1172         if (rc == 0) {
1173                 if (fld_range_is_ost(range))
1174                         return 1;
1175
1176                 return 0;
1177         }
1178
1179         loa = &lfsck_env_info(env)->lti_loa;
1180         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1181                           XATTR_NAME_LMA);
1182         if (rc >= (int)sizeof(struct lustre_mdt_attrs)) {
1183                 lustre_lma_swab(&loa->loa_lma);
1184
1185                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1186         }
1187
1188         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1189
1190         return rc > 0;
1191 }
1192
1193 static struct lfsck_layout_seq *
1194 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1195 {
1196         struct lfsck_layout_seq *lls;
1197
1198         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1199                 if (lls->lls_seq == seq)
1200                         return lls;
1201
1202                 if (lls->lls_seq > seq)
1203                         return NULL;
1204         }
1205
1206         return NULL;
1207 }
1208
1209 static void
1210 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1211                         struct lfsck_layout_seq *lls)
1212 {
1213         struct lfsck_layout_seq *tmp;
1214         struct list_head        *pos = &llsd->llsd_seq_list;
1215
1216         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1217                 if (lls->lls_seq < tmp->lls_seq) {
1218                         pos = &tmp->lls_list;
1219                         break;
1220                 }
1221         }
1222         list_add_tail(&lls->lls_list, pos);
1223 }
1224
1225 static int
1226 lfsck_layout_lastid_create(const struct lu_env *env,
1227                            struct lfsck_instance *lfsck,
1228                            struct dt_object *obj)
1229 {
1230         struct lfsck_thread_info *info   = lfsck_env_info(env);
1231         struct lu_attr           *la     = &info->lti_la;
1232         struct dt_object_format  *dof    = &info->lti_dof;
1233         struct lfsck_bookmark    *bk     = &lfsck->li_bookmark_ram;
1234         struct dt_device         *dt     = lfsck_obj2dev(obj);
1235         struct thandle           *th;
1236         __u64                     lastid = 0;
1237         loff_t                    pos    = 0;
1238         int                       rc;
1239         ENTRY;
1240
1241         if (bk->lb_param & LPF_DRYRUN)
1242                 return 0;
1243
1244         memset(la, 0, sizeof(*la));
1245         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1246         la->la_valid = LA_MODE | LA_UID | LA_GID;
1247         memset(dof, 0, sizeof(*dof));
1248         dof->dof_type = dt_mode_to_dft(S_IFREG);
1249
1250         th = lfsck_trans_create(env, dt, lfsck);
1251         if (IS_ERR(th))
1252                 GOTO(log, rc = PTR_ERR(th));
1253
1254         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1255         if (rc != 0)
1256                 GOTO(stop, rc);
1257
1258         rc = dt_declare_record_write(env, obj,
1259                                      lfsck_buf_get(env, &lastid,
1260                                                    sizeof(lastid)),
1261                                      pos, th);
1262         if (rc != 0)
1263                 GOTO(stop, rc);
1264
1265         rc = dt_trans_start_local(env, dt, th);
1266         if (rc != 0)
1267                 GOTO(stop, rc);
1268
1269         dt_write_lock(env, obj, 0);
1270         if (likely(dt_object_exists(obj) == 0)) {
1271                 rc = dt_create(env, obj, la, NULL, dof, th);
1272                 if (rc == 0)
1273                         rc = dt_record_write(env, obj,
1274                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1275                                 &pos, th);
1276         }
1277         dt_write_unlock(env, obj);
1278
1279         GOTO(stop, rc);
1280
1281 stop:
1282         dt_trans_stop(env, dt, th);
1283
1284 log:
1285         CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for <seq> "
1286                "%#llx: rc = %d\n",
1287                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1288
1289         return rc;
1290 }
1291
1292 static int
1293 lfsck_layout_lastid_reload(const struct lu_env *env,
1294                            struct lfsck_component *com,
1295                            struct lfsck_layout_seq *lls)
1296 {
1297         __u64   lastid;
1298         loff_t  pos     = 0;
1299         int     rc;
1300
1301         dt_read_lock(env, lls->lls_lastid_obj, 0);
1302         rc = dt_record_read(env, lls->lls_lastid_obj,
1303                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1304         dt_read_unlock(env, lls->lls_lastid_obj);
1305         if (unlikely(rc != 0))
1306                 return rc;
1307
1308         lastid = le64_to_cpu(lastid);
1309         if (lastid < lls->lls_lastid_known) {
1310                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1311                 struct lfsck_layout     *lo     = com->lc_file_ram;
1312
1313                 lls->lls_lastid = lls->lls_lastid_known;
1314                 lls->lls_dirty = 1;
1315                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1316                         LASSERT(lfsck->li_out_notify != NULL);
1317
1318                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1319                                              LE_LASTID_REBUILDING);
1320                         lo->ll_flags |= LF_CRASHED_LASTID;
1321
1322                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
1323                                "LAST_ID file (1) for the sequence %#llx"
1324                                ", old value %llu, known value %llu\n",
1325                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1326                                lastid, lls->lls_lastid);
1327                 }
1328         } else if (lastid >= lls->lls_lastid) {
1329                 lls->lls_lastid = lastid;
1330                 lls->lls_dirty = 0;
1331         }
1332
1333         return 0;
1334 }
1335
1336 static int
1337 lfsck_layout_lastid_store(const struct lu_env *env,
1338                           struct lfsck_component *com)
1339 {
1340         struct lfsck_instance           *lfsck  = com->lc_lfsck;
1341         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
1342         struct dt_device                *dt     = lfsck->li_bottom;
1343         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
1344         struct lfsck_layout_seq         *lls;
1345         struct thandle                  *th;
1346         __u64                            lastid;
1347         int                              rc     = 0;
1348         int                              rc1    = 0;
1349
1350         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1351                 loff_t pos = 0;
1352
1353                 if (!lls->lls_dirty)
1354                         continue;
1355
1356                 CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for "
1357                        "<seq> %#llx as <oid> %llu\n",
1358                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1359
1360                 if (bk->lb_param & LPF_DRYRUN) {
1361                         lls->lls_dirty = 0;
1362                         continue;
1363                 }
1364
1365                 th = lfsck_trans_create(env, dt, lfsck);
1366                 if (IS_ERR(th)) {
1367                         rc1 = PTR_ERR(th);
1368                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1369                                "the LAST_ID for <seq> %#llx(1): rc = %d\n",
1370                                lfsck_lfsck2name(com->lc_lfsck),
1371                                lls->lls_seq, rc1);
1372                         continue;
1373                 }
1374
1375                 lastid = cpu_to_le64(lls->lls_lastid);
1376                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1377                                              lfsck_buf_get(env, &lastid,
1378                                                            sizeof(lastid)),
1379                                              pos, th);
1380                 if (rc != 0)
1381                         goto stop;
1382
1383                 rc = dt_trans_start_local(env, dt, th);
1384                 if (rc != 0)
1385                         goto stop;
1386
1387                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1388                 rc = dt_record_write(env, lls->lls_lastid_obj,
1389                                      lfsck_buf_get(env, &lastid,
1390                                      sizeof(lastid)), &pos, th);
1391                 dt_write_unlock(env, lls->lls_lastid_obj);
1392                 if (rc == 0)
1393                         lls->lls_dirty = 0;
1394
1395 stop:
1396                 dt_trans_stop(env, dt, th);
1397                 if (rc != 0) {
1398                         rc1 = rc;
1399                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1400                                "the LAST_ID for <seq> %#llx(2): rc = %d\n",
1401                                lfsck_lfsck2name(com->lc_lfsck),
1402                                lls->lls_seq, rc1);
1403                 }
1404         }
1405
1406         return rc1;
1407 }
1408
1409 static int
1410 lfsck_layout_lastid_load(const struct lu_env *env,
1411                          struct lfsck_component *com,
1412                          struct lfsck_layout_seq *lls)
1413 {
1414         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1415         struct lfsck_layout     *lo     = com->lc_file_ram;
1416         struct lu_fid           *fid    = &lfsck_env_info(env)->lti_fid;
1417         struct dt_object        *obj;
1418         loff_t                   pos    = 0;
1419         int                      rc;
1420         ENTRY;
1421
1422         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1423         obj = dt_locate(env, lfsck->li_bottom, fid);
1424         if (IS_ERR(obj))
1425                 RETURN(PTR_ERR(obj));
1426
1427         /* LAST_ID crashed, to be rebuilt */
1428         if (dt_object_exists(obj) == 0) {
1429                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1430                         LASSERT(lfsck->li_out_notify != NULL);
1431
1432                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1433                                              LE_LASTID_REBUILDING);
1434                         lo->ll_flags |= LF_CRASHED_LASTID;
1435
1436                         CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the "
1437                                "LAST_ID file for sequence %#llx\n",
1438                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1439
1440                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1441                             cfs_fail_val > 0) {
1442                                 struct ptlrpc_thread *thread =
1443                                         &lfsck->li_thread;
1444
1445                                 up_write(&com->lc_sem);
1446                                 wait_event_idle_timeout(
1447                                         thread->t_ctl_waitq,
1448                                         !thread_is_running(thread),
1449                                         cfs_time_seconds(cfs_fail_val));
1450                                 down_write(&com->lc_sem);
1451                         }
1452                 }
1453
1454                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1455         } else {
1456                 dt_read_lock(env, obj, 0);
1457                 rc = dt_read(env, obj,
1458                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1459                         &pos);
1460                 dt_read_unlock(env, obj);
1461                 if (rc != 0 && rc != sizeof(__u64))
1462                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1463
1464                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1465                         LASSERT(lfsck->li_out_notify != NULL);
1466
1467                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1468                                              LE_LASTID_REBUILDING);
1469                         lo->ll_flags |= LF_CRASHED_LASTID;
1470
1471                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid "
1472                                "LAST_ID file for the sequence %#llx"
1473                                ": rc = %d\n",
1474                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1475                 }
1476
1477                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1478                 rc = 0;
1479         }
1480
1481         GOTO(out, rc);
1482
1483 out:
1484         if (rc != 0)
1485                 lfsck_object_put(env, obj);
1486         else
1487                 lls->lls_lastid_obj = obj;
1488
1489         return rc;
1490 }
1491
1492 static void lfsck_layout_record_failure(const struct lu_env *env,
1493                                         struct lfsck_instance *lfsck,
1494                                         struct lfsck_layout *lo)
1495 {
1496         __u64 cookie;
1497
1498         lo->ll_objs_failed_phase1++;
1499         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1500                                                         lfsck->li_di_oit);
1501         if (lo->ll_pos_first_inconsistent == 0 ||
1502             lo->ll_pos_first_inconsistent < cookie) {
1503                 lo->ll_pos_first_inconsistent = cookie;
1504
1505                 CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired "
1506                        "inconsistency at the pos [%llu]\n",
1507                        lfsck_lfsck2name(lfsck),
1508                        lo->ll_pos_first_inconsistent);
1509         }
1510 }
1511
1512 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1513                                            struct lfsck_component *com,
1514                                            int rc)
1515 {
1516         struct lfsck_instance   *lfsck = com->lc_lfsck;
1517         struct lfsck_layout     *lo    = com->lc_file_ram;
1518
1519         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1520                lfsck_lfsck2name(lfsck), rc);
1521
1522         down_write(&com->lc_sem);
1523         lo->ll_run_time_phase2 += ktime_get_seconds() -
1524                                   com->lc_time_last_checkpoint;
1525         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
1526         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1527
1528         if (rc > 0) {
1529                 if (lo->ll_flags & LF_INCOMPLETE) {
1530                         lo->ll_status = LS_PARTIAL;
1531                 } else {
1532                         if (lfsck->li_master) {
1533                                 struct lfsck_assistant_data *lad = com->lc_data;
1534
1535                                 if (test_bit(LAD_INCOMPLETE, &lad->lad_flags))
1536                                         lo->ll_status = LS_PARTIAL;
1537                                 else
1538                                         lo->ll_status = LS_COMPLETED;
1539                         } else {
1540                                 lo->ll_status = LS_COMPLETED;
1541                         }
1542                 }
1543                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1544                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1545                         lo->ll_flags &= ~LF_INCONSISTENT;
1546                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1547                 lo->ll_success_count++;
1548         } else if (rc == 0) {
1549                 if (lfsck->li_status != 0)
1550                         lo->ll_status = lfsck->li_status;
1551                 else
1552                         lo->ll_status = LS_STOPPED;
1553         } else {
1554                 lo->ll_status = LS_FAILED;
1555         }
1556
1557         rc = lfsck_layout_store(env, com);
1558         up_write(&com->lc_sem);
1559
1560         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1561                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1562
1563         return rc;
1564 }
1565
1566 static int lfsck_layout_trans_stop(const struct lu_env *env,
1567                                    struct dt_device *dev,
1568                                    struct thandle *handle, int result)
1569 {
1570         int rc;
1571
1572         /* XXX: If there is something worng or it needs to repair nothing,
1573          *      then notify the lower to stop the modification. Currently,
1574          *      we use th_result for such purpose, that may be replaced by
1575          *      some rollback mechanism in the future. */
1576         handle->th_result = result;
1577         rc = dt_trans_stop(env, dev, handle);
1578         if (result != 0)
1579                 return result > 0 ? 0 : result;
1580
1581         return rc == 0 ? 1 : rc;
1582 }
1583
1584 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1585                                          struct lfsck_component *com,
1586                                          const struct lu_fid *pfid,
1587                                          const struct lu_fid *cfid,
1588                                          __u32 comp_id, __u32 ea_off,
1589                                          __u32 ost_idx)
1590 {
1591         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1592         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1593         struct dt_device *dev;
1594         struct dt_object *obj;
1595         struct thandle *th = NULL;
1596         int idx;
1597         int rc = 0;
1598         ENTRY;
1599
1600         idx = lfsck_sub_trace_file_fid2idx(pfid);
1601         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1602         dev = lfsck_obj2dev(obj);
1603
1604         fid_cpu_to_be(&key->lldk_fid, pfid);
1605         key->lldk_comp_id = cpu_to_be32(comp_id);
1606         key->lldk_ea_off = cpu_to_be32(ea_off);
1607
1608         fid_cpu_to_be(rec, cfid);
1609         rec->f_ver = cpu_to_be32(ost_idx);
1610
1611         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1612
1613         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1614         if (IS_ERR(th))
1615                 GOTO(unlock, rc = PTR_ERR(th));
1616
1617         rc = dt_declare_insert(env, obj,
1618                                (const struct dt_rec *)rec,
1619                                (const struct dt_key *)key, th);
1620         if (rc)
1621                 GOTO(unlock, rc);
1622
1623         rc = dt_trans_start_local(env, dev, th);
1624         if (rc)
1625                 GOTO(unlock, rc);
1626
1627         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1628                        (const struct dt_key *)key, th);
1629
1630         GOTO(unlock, rc);
1631
1632 unlock:
1633         if (th && !IS_ERR(th))
1634                 dt_trans_stop(env, dev, th);
1635
1636         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1637
1638         CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, "
1639                "ea_off = %u, ost_idx = %u, into the trace file for further "
1640                "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
1641                PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc);
1642
1643         return rc;
1644 }
1645
1646 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1647                                          struct lfsck_component *com,
1648                                          const struct lu_fid *fid,
1649                                          __u32 comp_id, __u32 ea_off)
1650 {
1651         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1652         struct dt_device *dev;
1653         struct dt_object *obj;
1654         struct thandle *th = NULL;
1655         int idx;
1656         int rc = 0;
1657         ENTRY;
1658
1659         idx = lfsck_sub_trace_file_fid2idx(fid);
1660         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1661         dev = lfsck_obj2dev(obj);
1662
1663         fid_cpu_to_be(&key->lldk_fid, fid);
1664         key->lldk_comp_id = cpu_to_be32(comp_id);
1665         key->lldk_ea_off = cpu_to_be32(ea_off);
1666
1667         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1668
1669         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1670         if (IS_ERR(th))
1671                 GOTO(unlock, rc = PTR_ERR(th));
1672
1673         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1674         if (rc)
1675                 GOTO(unlock, rc);
1676
1677         rc = dt_trans_start_local(env, dev, th);
1678         if (rc)
1679                 GOTO(unlock, rc);
1680
1681         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1682
1683         GOTO(unlock, rc);
1684
1685 unlock:
1686         if (th && !IS_ERR(th))
1687                 dt_trans_stop(env, dev, th);
1688
1689         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1690
1691         CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID
1692                ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1693                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1694
1695         return rc;
1696 }
1697
1698 /**
1699  * Get the system default stripe size.
1700  *
1701  * \param[in] env       pointer to the thread context
1702  * \param[in] lfsck     pointer to the lfsck instance
1703  * \param[out] size     pointer to the default stripe size
1704  *
1705  * \retval              0 for success
1706  * \retval              negative error number on failure
1707  */
1708 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1709                                            struct lfsck_instance *lfsck,
1710                                            __u32 *size)
1711 {
1712         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1713         struct dt_object        *root;
1714         int                      rc;
1715
1716         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1717         if (IS_ERR(root))
1718                 return PTR_ERR(root);
1719
1720         /* Get the default stripe size via xattr_get on the backend root. */
1721         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1722                           XATTR_NAME_LOV);
1723         if (rc > 0) {
1724                 /* The lum->lmm_stripe_size is LE mode. The *size also
1725                  * should be LE mode. So it is unnecessary to convert. */
1726                 *size = lum->lmm_stripe_size;
1727                 rc = 0;
1728         } else if (unlikely(rc == 0)) {
1729                 rc = -EINVAL;
1730         }
1731
1732         lfsck_object_put(env, root);
1733
1734         return rc;
1735 }
1736
1737 /**
1738  * \retval       +1: repaired
1739  * \retval        0: did nothing
1740  * \retval      -ve: on error
1741  */
1742 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1743                                      struct lfsck_instance *lfsck,
1744                                      struct thandle *handle,
1745                                      struct dt_object *parent,
1746                                      const struct lu_fid *cfid,
1747                                      struct lu_buf *buf,
1748                                      struct lov_mds_md_v1 *lmm,
1749                                      struct lov_ost_data_v1 *slot,
1750                                      int fl, __u32 ost_idx, int size)
1751 {
1752         struct ost_id           *oi     = &lfsck_env_info(env)->lti_oi;
1753         struct lu_buf            ea_buf;
1754         int                      rc;
1755         __u32                    magic;
1756         __u32                    pattern;
1757         __u16                    count;
1758         ENTRY;
1759
1760         magic = le32_to_cpu(lmm->lmm_magic);
1761         pattern = le32_to_cpu(lmm->lmm_pattern);
1762         count = le16_to_cpu(lmm->lmm_stripe_count);
1763
1764         fid_to_ostid(cfid, oi);
1765         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1766         slot->l_ost_gen = cpu_to_le32(0);
1767         slot->l_ost_idx = cpu_to_le32(ost_idx);
1768
1769         if (pattern & LOV_PATTERN_F_HOLE) {
1770                 struct lov_ost_data_v1 *objs;
1771                 int                     i;
1772
1773                 if (magic == LOV_MAGIC_V1)
1774                         objs = &lmm->lmm_objects[0];
1775                 else
1776                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1777                 for (i = 0; i < count; i++, objs++) {
1778                         if (lovea_slot_is_dummy(objs))
1779                                 break;
1780                 }
1781
1782                 /* If the @slot is the last dummy slot to be refilled,
1783                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */
1784                 if (i == count) {
1785                         lmm->lmm_pattern =
1786                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1787
1788                         CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID
1789                                ": parent "DFID"\n", lfsck_lfsck2name(lfsck),
1790                                PFID(cfid), PFID(lfsck_dto2fid(parent)));
1791                 }
1792         }
1793
1794         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1795         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1796         if (rc == 0)
1797                 rc = 1;
1798
1799         RETURN(rc);
1800 }
1801
1802 static struct lov_ost_data_v1 *
1803 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1804                             const struct lu_fid *pfid,
1805                             __u32 stripe_size, __u32 ea_off,
1806                             __u32 pattern, __u16 count)
1807 {
1808         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1809         lmm->lmm_pattern = cpu_to_le32(pattern);
1810         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1811         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1812         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1813         lmm->lmm_stripe_count = cpu_to_le16(count);
1814         lmm->lmm_layout_gen = cpu_to_le16(1);
1815         memset(&lmm->lmm_objects[0], 0,
1816                sizeof(struct lov_ost_data_v1) * count);
1817
1818         return &lmm->lmm_objects[ea_off];
1819 }
1820
1821 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1822                                      struct lfsck_instance *lfsck,
1823                                      struct ost_layout *ol,
1824                                      struct dt_object *parent,
1825                                      struct lu_buf *buf, __u32 ea_off,
1826                                      struct lov_mds_md_v1 **lmm,
1827                                      struct lov_ost_data_v1 **objs)
1828 {
1829         int size;
1830         __u32 stripe_size = ol->ol_stripe_size;
1831         __u32 pattern = LOV_PATTERN_RAID0;
1832         __u16 count;
1833
1834         if (ol->ol_stripe_count != 0)
1835                 count = ol->ol_stripe_count;
1836         else
1837                 count = ea_off + 1;
1838
1839         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1840         LASSERTF(buf->lb_len >= size,
1841                  "buffer len %d is less than real size %d\n",
1842                  (int)buf->lb_len, size);
1843
1844         if (stripe_size == 0) {
1845                 int rc;
1846
1847                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1848                 if (rc)
1849                         return rc;
1850         }
1851
1852         *lmm = buf->lb_buf;
1853         if (ol->ol_stripe_count > 1 ||
1854             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1855                 pattern |= LOV_PATTERN_F_HOLE;
1856                 memset(&(*lmm)->lmm_objects[0], 0,
1857                        count * sizeof(struct lov_ost_data_v1));
1858         }
1859
1860         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1861                                 stripe_size, ea_off, pattern, count);
1862
1863         return size;
1864 }
1865
1866 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1867                                        struct lu_orphan_rec_v3 *rec,
1868                                        struct dt_object *parent,
1869                                        struct lu_buf *buf, __u32 ea_off,
1870                                        struct lov_mds_md_v1 **lmm,
1871                                        struct lov_ost_data_v1 **objs)
1872 {
1873         struct ost_layout *ol = &rec->lor_layout;
1874         struct lov_comp_md_v1 *lcm;
1875         struct lov_comp_md_entry_v1 *lcme;
1876         __u32 pattern = LOV_PATTERN_RAID0;
1877         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1878         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1879         int size = offset + lcme_size;
1880
1881         LASSERTF(buf->lb_len >= size,
1882                  "buffer len %d is less than real size %d\n",
1883                  (int)buf->lb_len, size);
1884
1885         lcm = buf->lb_buf;
1886         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1887         lcm->lcm_size = cpu_to_le32(size);
1888         if (rec->lor_range) {
1889                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1890                                                   rec->lor_range);
1891                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1892         } else if (rec->lor_layout_version) {
1893                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1894                                                   rec->lor_range);
1895                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1896         } else {
1897                 /*
1898                  * if OST doesn't provide layout version, then try
1899                  * to inherit one from MDS's layout, but increment
1900                  * it so the client notices and applies modified
1901                  * layout
1902                  */
1903                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1904                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1905         }
1906         lcm->lcm_entry_count = cpu_to_le16(1);
1907         /* Currently, we do not know how many mirrors will be, set it as zero
1908          * at the beginning. It will be updated when more mirrors are found. */
1909         lcm->lcm_mirror_count = 0;
1910
1911         lcme = &lcm->lcm_entries[0];
1912         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1913         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1914         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1915         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1916         lcme->lcme_offset = cpu_to_le32(offset);
1917         lcme->lcme_size = cpu_to_le32(lcme_size);
1918         lcme->lcme_layout_gen = lcm->lcm_layout_gen;
1919         if (ol->ol_stripe_count > 1)
1920                 pattern |= LOV_PATTERN_F_HOLE;
1921
1922         *lmm = buf->lb_buf + offset;
1923         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1924                                             ol->ol_stripe_size, ea_off,
1925                                             pattern, ol->ol_stripe_count);
1926
1927         return size;
1928 }
1929
1930 static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm,
1931                                     struct lov_comp_md_entry_v1 *lcme,
1932                                     __u32 version, __u32 range)
1933 {
1934         struct lov_comp_md_entry_v1 *tmp;
1935         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
1936         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
1937         __u32 gen = version + range;
1938         __u32 tmp_gen;
1939         int i;
1940         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1941         __u16 flags = le16_to_cpu(lcm->lcm_flags);
1942
1943         if (!gen)
1944                 gen = 1;
1945         lcme->lcme_layout_gen = cpu_to_le32(gen);
1946         if (le32_to_cpu(lcm->lcm_layout_gen) < gen)
1947                 lcm->lcm_layout_gen = cpu_to_le32(gen);
1948
1949         if (range)
1950                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1951         else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0)
1952                 lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
1953
1954         for (i = 0; i < count; i++) {
1955                 tmp = &lcm->lcm_entries[i];
1956                 if (le64_to_cpu(tmp->lcme_extent.e_end) <= start)
1957                         continue;
1958
1959                 if (le64_to_cpu(tmp->lcme_extent.e_start) >= end)
1960                         continue;
1961
1962                 if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE)
1963                         continue;
1964
1965                 tmp_gen = le32_to_cpu(tmp->lcme_layout_gen);
1966                 /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag,
1967                  * then it should be the latest version of all mirrors. */
1968                 if (tmp_gen == 0 || tmp_gen > gen) {
1969                         lcme->lcme_flags = cpu_to_le32(
1970                                 le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE);
1971                         break;
1972                 }
1973
1974                 if (tmp_gen < gen)
1975                         tmp->lcme_flags = cpu_to_le32(
1976                                 le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE);
1977         }
1978 }
1979
1980 static int lfsck_layout_add_comp(const struct lu_env *env,
1981                                  struct lfsck_instance *lfsck,
1982                                  struct thandle *handle,
1983                                  struct lu_orphan_rec_v3 *rec,
1984                                  struct dt_object *parent,
1985                                  const struct lu_fid *cfid,
1986                                  struct lu_buf *buf, __u32 ost_idx,
1987                                  __u32 ea_off, int pos, bool new_mirror)
1988 {
1989         struct ost_layout *ol = &rec->lor_layout;
1990         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1991         struct lov_comp_md_entry_v1 *lcme;
1992         struct lov_mds_md_v1 *lmm;
1993         struct lov_ost_data_v1 *objs;
1994         int added = sizeof(*lcme) +
1995                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1996         int size = le32_to_cpu(lcm->lcm_size) + added;
1997         int rc;
1998         int i;
1999         __u32 offset;
2000         __u32 pattern = LOV_PATTERN_RAID0;
2001         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
2002         ENTRY;
2003
2004         lu_buf_check_and_grow(buf, size);
2005         /* set the lcm again because lu_buf_check_and_grow() may
2006          * have reallocated the buf. */
2007         lcm = buf->lb_buf;
2008         lcm->lcm_size = cpu_to_le32(size);
2009         lcm->lcm_entry_count = cpu_to_le16(count + 1);
2010         if (new_mirror)
2011                 le16_add_cpu(&lcm->lcm_mirror_count, 1);
2012
2013         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
2014          *    with distance of 'added'. */
2015         if (pos < count) {
2016                 size = 0;
2017                 for (i = pos; i < count; i++) {
2018                         lcme = &lcm->lcm_entries[i];
2019                         size += le32_to_cpu(lcme->lcme_size);
2020                 }
2021
2022                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
2023                 memmove(buf->lb_buf + offset + added,
2024                         buf->lb_buf + offset, size);
2025         }
2026
2027         size = 0;
2028         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
2029          *    of 'sizeof(struct lov_comp_md_entry_v1)' */
2030         if (pos > 0) {
2031                 for (i = 0; i < pos; i++) {
2032                         lcme = &lcm->lcm_entries[i];
2033                         size += le32_to_cpu(lcme->lcme_size);
2034                 }
2035
2036                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
2037                 memmove(buf->lb_buf + offset + sizeof(*lcme),
2038                         buf->lb_buf + offset, size);
2039         }
2040
2041         /* 3. Recalculate the enter offset for the component [pos, count-1] */
2042         for (i = count - 1; i >= pos; i--) {
2043                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
2044                 lcm->lcm_entries[i + 1].lcme_offset =
2045                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
2046                                                 lcme_offset) + added);
2047         }
2048
2049         /* 4. Recalculate the enter offset for the component [0, pos) */
2050         for (i = 0; i < pos; i++) {
2051                 lcm->lcm_entries[i].lcme_offset =
2052                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
2053                                                 lcme_offset) + sizeof(*lcme));
2054         }
2055
2056         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
2057         /* 4. Insert the new component header (entry) at the slot 'pos'. */
2058         lcme = &lcm->lcm_entries[pos];
2059         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
2060         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
2061         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
2062         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
2063         lcme->lcme_offset = cpu_to_le32(offset);
2064         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
2065                                                       LOV_MAGIC_V1));
2066
2067         if (ol->ol_stripe_count > 1)
2068                 pattern |= LOV_PATTERN_F_HOLE;
2069
2070         lmm = buf->lb_buf + offset;
2071         /* 5. Insert teh new component body at the 'offset'. */
2072         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
2073                                            ol->ol_stripe_size, ea_off,
2074                                            pattern, ol->ol_stripe_count);
2075
2076         /* 6. Update mirror related flags and version. */
2077         lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version,
2078                                 rec->lor_range);
2079
2080         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2081                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
2082                                        le32_to_cpu(lcm->lcm_size));
2083
2084         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for "
2085                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2086                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2087                "comp_end %llu, layout version %u, range %u, "
2088                "%s LOV EA hole: rc = %d\n",
2089                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2090                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2091                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2092                rec->lor_layout_version, rec->lor_range,
2093                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2094                "with" : "without", rc);
2095
2096         RETURN(rc);
2097 }
2098
2099 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
2100                                           struct lfsck_instance *lfsck,
2101                                           struct thandle *handle,
2102                                           struct ost_layout *ol,
2103                                           struct dt_object *parent,
2104                                           const struct lu_fid *cfid,
2105                                           struct lu_buf *buf, __u32 ost_idx,
2106                                           __u32 ea_off)
2107 {
2108         struct lov_mds_md_v1 *lmm = buf->lb_buf;
2109         struct lov_ost_data_v1 *objs;
2110         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
2111         __u32 magic = le32_to_cpu(lmm->lmm_magic);
2112         int size;
2113         int gap;
2114         int rc;
2115         ENTRY;
2116
2117         /* The original LOVEA maybe re-generated via old filter_fid, at
2118          * that time, we do not know the stripe count and stripe size. */
2119         if (ol->ol_stripe_count > count)
2120                 count = ol->ol_stripe_count;
2121         if (ol->ol_stripe_size != 0 &&
2122             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
2123                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
2124
2125         if (magic == LOV_MAGIC_V1)
2126                 objs = &lmm->lmm_objects[count];
2127         else
2128                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
2129
2130         gap = ea_off - count;
2131         if (gap >= 0)
2132                 count = ea_off + 1;
2133
2134         size = lov_mds_md_size(count, magic);
2135         LASSERTF(buf->lb_len >= size,
2136                  "buffer len %d is less than real size %d\n",
2137                  (int)buf->lb_len, size);
2138
2139         if (gap > 0) {
2140                 memset(objs, 0, gap * sizeof(*objs));
2141                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
2142         }
2143
2144         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2145         lmm->lmm_stripe_count = cpu_to_le16(count);
2146         objs += gap;
2147
2148         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2149                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
2150
2151         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for "
2152                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2153                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2154                "comp_end %llu, %s LOV EA hole: rc = %d\n",
2155                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2156                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2157                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2158                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2159                "with" : "without", rc);
2160
2161         RETURN(rc);
2162 }
2163
2164 /**
2165  * \retval       +1: repaired
2166  * \retval        0: did nothing
2167  * \retval      -ve: on error
2168  */
2169 static int lfsck_layout_update_lovea(const struct lu_env *env,
2170                                      struct lfsck_instance *lfsck,
2171                                      struct thandle *handle,
2172                                      struct lu_orphan_rec_v3 *rec,
2173                                      struct dt_object *parent,
2174                                      const struct lu_fid *cfid,
2175                                      struct lu_buf *buf, int fl,
2176                                      __u32 ost_idx, __u32 ea_off)
2177 {
2178         struct ost_layout *ol = &rec->lor_layout;
2179         struct lov_mds_md_v1 *lmm = NULL;
2180         struct lov_ost_data_v1 *objs = NULL;
2181         int rc = 0;
2182         ENTRY;
2183
2184         if (ol->ol_comp_id != 0)
2185                 rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off,
2186                                                  &lmm, &objs);
2187         else
2188                 rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout,
2189                                                parent, buf, ea_off, &lmm,
2190                                                &objs);
2191         if (rc > 0)
2192                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2193                                                buf, lmm, objs, fl, ost_idx, rc);
2194
2195         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for "
2196                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2197                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2198                "comp_end %llu, layout version %u, range %u, fl %d, "
2199                "%s LOV EA hole: rc = %d\n",
2200                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2201                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2202                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2203                rec->lor_layout_version, rec->lor_range, fl,
2204                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2205                "with" : "without", rc);
2206
2207         RETURN(rc);
2208 }
2209
2210 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2211                                       struct lfsck_component *com,
2212                                       struct dt_object *child,
2213                                       const struct lu_fid *pfid,
2214                                       const struct ost_layout *ol, __u32 offset,
2215                                       __u32 version, __u32 range)
2216 {
2217         struct dt_device        *dev    = lfsck_obj2dev(child);
2218         struct filter_fid       *ff     = &lfsck_env_info(env)->lti_ff;
2219         struct thandle          *handle;
2220         struct lu_buf            buf    = { NULL };
2221         int                      rc;
2222
2223         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2224         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2225         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2226          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2227          * parent MDT-object's layout EA. */
2228         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2229         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2230         ff->ff_layout_version = cpu_to_le32(version);
2231         ff->ff_range = cpu_to_le32(range);
2232         lfsck_buf_init(&buf, ff, sizeof(*ff));
2233
2234         if (!dt_object_exists(child) || lfsck_is_dead_obj(child))
2235                 return 0;
2236
2237         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
2238         if (IS_ERR(handle))
2239                 RETURN(PTR_ERR(handle));
2240
2241         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2242         if (rc != 0)
2243                 GOTO(stop, rc);
2244
2245         rc = dt_trans_start_local(env, dev, handle);
2246         if (rc != 0)
2247                 GOTO(stop, rc);
2248
2249         dt_write_lock(env, child, 0);
2250         if (dt_object_exists(child) && !lfsck_is_dead_obj(child))
2251                 rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2252         dt_write_unlock(env, child);
2253
2254         GOTO(stop, rc);
2255
2256 stop:
2257         dt_trans_stop(env, dev, handle);
2258
2259         return rc;
2260 }
2261
2262 /**
2263  * \retval       +1: repaired
2264  * \retval        0: did nothing
2265  * \retval      -ve: on error
2266  */
2267 static int lfsck_layout_update_pfid(const struct lu_env *env,
2268                                     struct lfsck_component *com,
2269                                     struct dt_object *parent,
2270                                     struct lu_fid *cfid,
2271                                     struct dt_device *cdev,
2272                                     struct lu_orphan_rec_v3 *rec, __u32 ea_off)
2273 {
2274         struct dt_object        *child;
2275         int                      rc     = 0;
2276         ENTRY;
2277
2278         child = lfsck_object_find_by_dev(env, cdev, cfid);
2279         if (IS_ERR(child))
2280                 RETURN(PTR_ERR(child));
2281
2282         rc = __lfsck_layout_update_pfid(env, com, child,
2283                                         lu_object_fid(&parent->do_lu),
2284                                         &rec->lor_layout, ea_off,
2285                                         rec->lor_layout_version,
2286                                         rec->lor_range);
2287         lfsck_object_put(env, child);
2288
2289         RETURN(rc == 0 ? 1 : rc);
2290 }
2291
2292 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2293 {
2294         if (ol->ol_comp_id != 0)
2295                 return sizeof(struct lov_comp_md_v1) +
2296                        sizeof(struct lov_comp_md_entry_v1) +
2297                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2298
2299         if (ol->ol_stripe_count != 0)
2300                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2301
2302         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2303 }
2304
2305 /**
2306  * This function will create the MDT-object with the given (partial) LOV EA.
2307  *
2308  * Under some data corruption cases, the MDT-object of the file may be lost,
2309  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2310  * re-create the MDT-object with the orphan OST-object(s) information.
2311  *
2312  * On the other hand, the LFSCK may has created some OST-object for repairing
2313  * dangling LOV EA reference, but as the LFSCK processing, it may find that
2314  * the old OST-object is there and should replace the former new created OST
2315  * object. Unfortunately, some others have modified such newly created object.
2316  * To keep the data (both new and old), the LFSCK will create MDT-object with
2317  * new FID to reference the original OST-object.
2318  *
2319  * \param[in] env       pointer to the thread context
2320  * \param[in] com       pointer to the lfsck component
2321  * \param[in] ltd       pointer to target device descriptor
2322  * \param[in] rec       pointer to the record for the orphan OST-object
2323  * \param[in] cfid      pointer to FID for the orphan OST-object
2324  * \param[in] infix     additional information, such as the FID for original
2325  *                      MDT-object and the stripe offset in the LOV EA
2326  * \param[in] type      the type for describing why the orphan MDT-object is
2327  *                      created. The rules are as following:
2328  *
2329  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2330  *                      same slot in the layout EA. Then the LFSCK will create
2331  *                      new MDT-object(s) to hold the conflict OST-object(s).
2332  *
2333  *  type "N":           The orphan OST-object does not know which one was the
2334  *                      real parent MDT-object, so the LFSCK uses new FID for
2335  *                      its parent MDT-object.
2336  *
2337  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2338  *                      but does not know the position (the file name) in the
2339  *                      layout.
2340  *
2341  *  type "D":           The MDT-object is a directory, it may knows its parent
2342  *                      but because there is no valid linkEA, the LFSCK cannot
2343  *                      know where to put it back to the namespace.
2344  *  type "O":           The MDT-object has no linkEA, and there is no name
2345  *                      entry that references the MDT-object.
2346  *
2347  *  type "P":           The orphan object to be created was a parent directory
2348  *                      of some MDT-object which linkEA shows that the @orphan
2349  *                      object is missing.
2350  *
2351  * The orphan name will be like:
2352  * ${FID}-${infix}-${type}-${conflict_version}
2353  *
2354  * \param[in] ea_off    the stripe offset in the LOV EA
2355  *
2356  * \retval              positive on repaired something
2357  * \retval              0 if needs to repair nothing
2358  * \retval              negative error number on failure
2359  */
2360 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2361                                         struct lfsck_component *com,
2362                                         struct lfsck_tgt_desc *ltd,
2363                                         struct lu_orphan_rec_v3 *rec,
2364                                         struct lu_fid *cfid,
2365                                         const char *infix,
2366                                         const char *type,
2367                                         __u32 ea_off)
2368 {
2369         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2370         struct dt_insert_rec            *dtrec  = &info->lti_dt_rec;
2371         char                            *name   = info->lti_key;
2372         struct lu_attr                  *la     = &info->lti_la2;
2373         struct dt_object_format         *dof    = &info->lti_dof;
2374         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2375         struct lu_fid                   *pfid   = &rec->lor_rec.lor_fid;
2376         struct lu_fid                   *tfid   = &info->lti_fid3;
2377         struct dt_device                *dev    = lfsck->li_bottom;
2378         struct dt_object                *lpf    = lfsck->li_lpf_obj;
2379         struct dt_object                *pobj   = NULL;
2380         struct dt_object                *cobj   = NULL;
2381         struct thandle                  *th     = NULL;
2382         struct lu_buf                   *ea_buf = &info->lti_big_buf;
2383         struct lu_buf                    lov_buf;
2384         struct lfsck_lock_handle        *llh    = &info->lti_llh;
2385         struct linkea_data               ldata  = { NULL };
2386         struct lu_buf                    linkea_buf;
2387         const struct lu_name            *pname;
2388         int                              size   = 0;
2389         int                              idx    = 0;
2390         int                              rc     = 0;
2391         ENTRY;
2392
2393         if (lfsck_is_dryrun(lfsck))
2394                 GOTO(log, rc = 0);
2395
2396         if (unlikely(lpf == NULL))
2397                 GOTO(log, rc = -ENXIO);
2398
2399         /* We use two separated transactions to repair the inconsistency.
2400          *
2401          * 1) create the MDT-object locally.
2402          * 2) update the OST-object's PFID EA if necessary.
2403          *
2404          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2405          * updated when the layout LFSCK run next time.
2406          *
2407          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2408          * when the layout LFSCK run next time. */
2409
2410         if (fid_is_zero(pfid)) {
2411                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2412                 if (rc != 0)
2413                         GOTO(log, rc);
2414
2415                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2416                 if (IS_ERR(cobj))
2417                         GOTO(log, rc = PTR_ERR(cobj));
2418         }
2419
2420         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2421         if (IS_ERR(pobj))
2422                 GOTO(log, rc = PTR_ERR(pobj));
2423
2424         LASSERT(infix != NULL);
2425         LASSERT(type != NULL);
2426
2427         memset(la, 0, sizeof(*la));
2428         la->la_uid = rec->lor_rec.lor_uid;
2429         la->la_gid = rec->lor_rec.lor_gid;
2430         la->la_mode = S_IFREG | S_IRUSR;
2431         la->la_valid = LA_MODE | LA_UID | LA_GID;
2432
2433         memset(dof, 0, sizeof(*dof));
2434         dof->dof_type = dt_mode_to_dft(S_IFREG);
2435         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2436          * the stripe(s). The LFSCK will specify the LOV EA via
2437          * lfsck_layout_update_lovea(). */
2438
2439         size = lfsck_lovea_size(&rec->lor_layout, ea_off);
2440         if (ea_buf->lb_len < size) {
2441                 lu_buf_realloc(ea_buf, size);
2442                 if (ea_buf->lb_buf == NULL)
2443                         GOTO(log, rc = -ENOMEM);
2444         }
2445
2446 again:
2447         do {
2448                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2449                          type, idx++);
2450                 rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2451                 if (rc != 0 && rc != -ENOENT)
2452                         GOTO(log, rc);
2453         } while (rc == 0);
2454
2455         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2456                         MDS_INODELOCK_UPDATE, LCK_PW);
2457         if (rc != 0)
2458                 GOTO(log, rc);
2459
2460         /* Re-check whether the name conflict with othrs after taken
2461          * the ldlm lock. */
2462         rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2463         if (unlikely(rc == 0)) {
2464                 lfsck_unlock(llh);
2465                 goto again;
2466         }
2467
2468         if (rc != -ENOENT)
2469                 GOTO(unlock, rc);
2470
2471         pname = lfsck_name_get_const(env, name, strlen(name));
2472         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2473                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2474         if (rc != 0)
2475                 GOTO(unlock, rc);
2476
2477         /* The 1st transaction. */
2478         th = lfsck_trans_create(env, dev, lfsck);
2479         if (IS_ERR(th))
2480                 GOTO(unlock, rc = PTR_ERR(th));
2481
2482         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2483         if (rc != 0)
2484                 GOTO(stop, rc);
2485
2486         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2487         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2488                                   LU_XATTR_CREATE, th);
2489         if (rc != 0)
2490                 GOTO(stop, rc);
2491
2492         dtrec->rec_fid = pfid;
2493         dtrec->rec_type = S_IFREG;
2494         rc = dt_declare_insert(env, lpf,
2495                                (const struct dt_rec *)dtrec,
2496                                (const struct dt_key *)name, th);
2497         if (rc != 0)
2498                 GOTO(stop, rc);
2499
2500         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2501                        ldata.ld_leh->leh_len);
2502         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2503                                   XATTR_NAME_LINK, 0, th);
2504         if (rc != 0)
2505                 GOTO(stop, rc);
2506
2507         rc = dt_trans_start_local(env, dev, th);
2508         if (rc != 0)
2509                 GOTO(stop, rc);
2510
2511         dt_write_lock(env, pobj, 0);
2512         rc = dt_create(env, pobj, la, NULL, dof, th);
2513         if (rc == 0)
2514                 rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid,
2515                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2516         dt_write_unlock(env, pobj);
2517         if (rc < 0)
2518                 GOTO(stop, rc);
2519
2520         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2521                        (const struct dt_key *)name, th);
2522         if (rc != 0)
2523                 GOTO(stop, rc);
2524
2525         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2526         if (rc == 0 && cobj != NULL) {
2527                 dt_trans_stop(env, dev, th);
2528                 th = NULL;
2529
2530                 /* The 2nd transaction. */
2531                 rc = __lfsck_layout_update_pfid(env, com, cobj, pfid,
2532                                                 &rec->lor_layout, ea_off,
2533                                                 rec->lor_layout_version,
2534                                                 rec->lor_range);
2535         }
2536
2537         GOTO(stop, rc);
2538
2539 stop:
2540         if (th != NULL)
2541                 dt_trans_stop(env, dev, th);
2542
2543 unlock:
2544         lfsck_unlock(llh);
2545
2546 log:
2547         if (cobj != NULL && !IS_ERR(cobj))
2548                 lfsck_object_put(env, cobj);
2549         if (pobj != NULL && !IS_ERR(pobj))
2550                 lfsck_object_put(env, pobj);
2551
2552         if (rc < 0)
2553                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to "
2554                        "recreate the lost MDT-object: parent "DFID
2555                        ", child "DFID", OST-index %u, stripe-index %u, "
2556                        "infix %s, type %s: rc = %d\n",
2557                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2558                        ltd->ltd_index, ea_off, infix, type, rc);
2559
2560         return rc >= 0 ? 1 : rc;
2561 }
2562
2563 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2564                                                    struct lfsck_component *com,
2565                                                    const struct lu_fid *fid,
2566                                                    __u32 index)
2567 {
2568         struct lfsck_thread_info *info  = lfsck_env_info(env);
2569         struct lfsck_request     *lr    = &info->lti_lr;
2570         struct lfsck_instance    *lfsck = com->lc_lfsck;
2571         struct lfsck_tgt_desc    *ltd;
2572         struct ptlrpc_request    *req;
2573         struct lfsck_request     *tmp;
2574         struct obd_export        *exp;
2575         int                       rc    = 0;
2576         ENTRY;
2577
2578         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2579         if (unlikely(ltd == NULL))
2580                 RETURN(-ENXIO);
2581
2582         exp = ltd->ltd_exp;
2583         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2584                 GOTO(put, rc = -EOPNOTSUPP);
2585
2586         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2587         if (req == NULL)
2588                 GOTO(put, rc = -ENOMEM);
2589
2590         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2591         if (rc != 0) {
2592                 ptlrpc_request_free(req);
2593
2594                 GOTO(put, rc);
2595         }
2596
2597         memset(lr, 0, sizeof(*lr));
2598         lr->lr_event = LE_CONDITIONAL_DESTROY;
2599         lr->lr_active = LFSCK_TYPE_LAYOUT;
2600         lr->lr_fid = *fid;
2601
2602         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2603         *tmp = *lr;
2604         ptlrpc_request_set_replen(req);
2605
2606         rc = ptlrpc_queue_wait(req);
2607         ptlrpc_req_finished(req);
2608
2609         GOTO(put, rc);
2610
2611 put:
2612         lfsck_tgt_put(ltd);
2613
2614         return rc;
2615 }
2616
2617 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2618                                                   struct lfsck_component *com,
2619                                                   struct lfsck_request *lr)
2620 {
2621         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2622         struct lu_attr                  *la     = &info->lti_la;
2623         union ldlm_policy_data          *policy = &info->lti_policy;
2624         struct ldlm_res_id              *resid  = &info->lti_resid;
2625         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2626         struct dt_device                *dev    = lfsck->li_bottom;
2627         struct lu_fid                   *fid    = &lr->lr_fid;
2628         struct dt_object                *obj;
2629         struct thandle                  *th     = NULL;
2630         struct lustre_handle             lh     = { 0 };
2631         __u64                            flags  = 0;
2632         int                              rc     = 0;
2633         ENTRY;
2634
2635         obj = lfsck_object_find_by_dev(env, dev, fid);
2636         if (IS_ERR(obj))
2637                 RETURN(PTR_ERR(obj));
2638
2639         dt_read_lock(env, obj, 0);
2640         if (dt_object_exists(obj) == 0 ||
2641             lfsck_is_dead_obj(obj)) {
2642                 dt_read_unlock(env, obj);
2643
2644                 GOTO(put, rc = -ENOENT);
2645         }
2646
2647         /* Get obj's attr without lock firstly. */
2648         rc = dt_attr_get(env, obj, la);
2649         dt_read_unlock(env, obj);
2650         if (rc != 0)
2651                 GOTO(put, rc);
2652
2653         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2654                 GOTO(put, rc = -ETXTBSY);
2655
2656         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2657         LASSERT(lfsck->li_namespace != NULL);
2658
2659         memset(policy, 0, sizeof(*policy));
2660         policy->l_extent.end = OBD_OBJECT_EOF;
2661         ost_fid_build_resid(fid, resid);
2662         rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid,
2663                                     LDLM_EXTENT, policy, LCK_EX, &flags,
2664                                     ldlm_blocking_ast, ldlm_completion_ast,
2665                                     NULL, NULL, 0, LVB_T_NONE, NULL, &lh);
2666         if (rc != ELDLM_OK)
2667                 GOTO(put, rc = -EIO);
2668
2669         dt_write_lock(env, obj, 0);
2670         /* Get obj's attr within lock again. */
2671         rc = dt_attr_get(env, obj, la);
2672         if (rc != 0)
2673                 GOTO(unlock, rc);
2674
2675         if (la->la_ctime != 0)
2676                 GOTO(unlock, rc = -ETXTBSY);
2677
2678         th = lfsck_trans_create(env, dev, lfsck);
2679         if (IS_ERR(th))
2680                 GOTO(unlock, rc = PTR_ERR(th));
2681
2682         rc = dt_declare_ref_del(env, obj, th);
2683         if (rc != 0)
2684                 GOTO(stop, rc);
2685
2686         rc = dt_declare_destroy(env, obj, th);
2687         if (rc != 0)
2688                 GOTO(stop, rc);
2689
2690         rc = dt_trans_start_local(env, dev, th);
2691         if (rc != 0)
2692                 GOTO(stop, rc);
2693
2694         rc = dt_ref_del(env, obj, th);
2695         if (rc != 0)
2696                 GOTO(stop, rc);
2697
2698         rc = dt_destroy(env, obj, th);
2699         if (rc == 0)
2700                 CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty "
2701                        "OST-object "DFID" that was created for reparing "
2702                        "dangling referenced case. But the original missing "
2703                        "OST-object is found now.\n",
2704                        lfsck_lfsck2name(lfsck), PFID(fid));
2705
2706         GOTO(stop, rc);
2707
2708 stop:
2709         dt_trans_stop(env, dev, th);
2710
2711 unlock:
2712         dt_write_unlock(env, obj);
2713         ldlm_lock_decref(&lh, LCK_EX);
2714
2715 put:
2716         lfsck_object_put(env, obj);
2717
2718         return rc;
2719 }
2720
2721 /**
2722  * Some OST-object has occupied the specified layout EA slot.
2723  * Such OST-object may be generated by the LFSCK when repair
2724  * dangling referenced MDT-object, which can be indicated by
2725  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2726  * is true and such OST-object has not been modified yet, we
2727  * will replace it with the orphan OST-object; otherwise the
2728  * LFSCK will create new MDT-object to reference the orphan.
2729  *
2730  * \retval       +1: repaired
2731  * \retval        0: did nothing
2732  * \retval      -ve: on error
2733  */
2734 static int lfsck_layout_conflict_create(const struct lu_env *env,
2735                                         struct lfsck_component *com,
2736                                         struct lfsck_tgt_desc *ltd,
2737                                         struct lu_orphan_rec_v3 *rec,
2738                                         struct dt_object *parent,
2739                                         struct lu_fid *cfid,
2740                                         struct lu_buf *ea_buf,
2741                                         struct lov_mds_md_v1 *lmm,
2742                                         struct lov_ost_data_v1 *slot,
2743                                         __u32 ea_off, int lovea_size)
2744 {
2745         struct lfsck_thread_info *info          = lfsck_env_info(env);
2746         struct lu_fid            *cfid2         = &info->lti_fid2;
2747         struct ost_id            *oi            = &info->lti_oi;
2748         struct dt_device         *dev           = lfsck_obj2dev(parent);
2749         struct thandle           *th            = NULL;
2750         struct lustre_handle      lh            = { 0 };
2751         __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
2752         int                       rc            = 0;
2753         ENTRY;
2754
2755         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2756                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2757                         RETURN(0);
2758         }
2759
2760         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2761         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2762         if (rc != 0)
2763                 GOTO(out, rc);
2764
2765         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2766                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2767                               LCK_EX);
2768         if (rc != 0)
2769                 GOTO(out, rc);
2770
2771         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2772
2773         /* If the conflict OST-obejct is not created for fixing dangling
2774          * referenced MDT-object in former LFSCK check/repair, or it has
2775          * been modified by others, then we cannot destroy it. Re-create
2776          * a new MDT-object for the orphan OST-object. */
2777         if (rc == -ETXTBSY) {
2778                 /* No need the layout lock on the original parent. */
2779                 lfsck_ibits_unlock(&lh, LCK_EX);
2780
2781                 fid_zero(&rec->lor_rec.lor_fid);
2782                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2783                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2784                          ea_off);
2785                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2786                                                 info->lti_tmpbuf, "C", ea_off);
2787
2788                 RETURN(rc);
2789         }
2790
2791         if (rc != 0 && rc != -ENOENT)
2792                 GOTO(unlock, rc);
2793
2794         if (lfsck_is_dryrun(com->lc_lfsck))
2795                 GOTO(unlock, rc = 0);
2796
2797         th = lfsck_trans_create(env, dev, com->lc_lfsck);
2798         if (IS_ERR(th))
2799                 GOTO(unlock, rc = PTR_ERR(th));
2800
2801         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2802                                   LU_XATTR_REPLACE, th);
2803         if (rc != 0)
2804                 GOTO(stop, rc);
2805
2806         rc = dt_trans_start_local(env, dev, th);
2807         if (rc != 0)
2808                 GOTO(stop, rc);
2809
2810         dt_write_lock(env, parent, 0);
2811         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2812         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2813                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2814                                        ltd->ltd_index, lovea_size);
2815         dt_write_unlock(env, parent);
2816
2817         GOTO(stop, rc);
2818
2819 stop:
2820         dt_trans_stop(env, dev, th);
2821
2822 unlock:
2823         lfsck_ibits_unlock(&lh, LCK_EX);
2824
2825 out:
2826         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict "
2827                "OST-object "DFID" on the OST %x with the orphan "DFID" on "
2828                "the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2829                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2830                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2831                ea_off, rc);
2832
2833         return rc >= 0 ? 1 : rc;
2834 }
2835
2836 /**
2837  * \retval       +1: repaired
2838  * \retval        0: did nothing
2839  * \retval      -ve: on error
2840  */
2841 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2842                                        struct lfsck_component *com,
2843                                        struct lfsck_tgt_desc *ltd,
2844                                        struct lu_orphan_rec_v3 *rec,
2845                                        struct dt_object *parent,
2846                                        struct lu_fid *cfid,
2847                                        __u32 ost_idx, __u32 ea_off)
2848 {
2849         struct lfsck_thread_info *info          = lfsck_env_info(env);
2850         struct lu_buf            *buf           = &info->lti_big_buf;
2851         struct lu_fid            *fid           = &info->lti_fid2;
2852         struct ost_id            *oi            = &info->lti_oi;
2853         struct lfsck_instance    *lfsck         = com->lc_lfsck;
2854         struct dt_device         *dt            = lfsck_obj2dev(parent);
2855         struct lfsck_bookmark    *bk            = &lfsck->li_bookmark_ram;
2856         struct ost_layout        *ol            = &rec->lor_layout;
2857         struct lov_comp_md_v1    *lcm           = NULL;
2858         struct lov_comp_md_entry_v1 *lcme       = NULL;
2859         struct thandle           *handle        = NULL;
2860         size_t                    lovea_size;
2861         struct lov_mds_md_v1     *lmm;
2862         struct lov_ost_data_v1   *objs;
2863         struct lustre_handle      lh            = { 0 };
2864         __u32                     magic;
2865         __u32 flags = 0;
2866         int                       fl            = 0;
2867         int                       rc            = 0;
2868         int                       rc1;
2869         int                       i;
2870         int pos = 0;
2871         __u16 count;
2872         bool locked = false;
2873         bool new_mirror = true;
2874         ENTRY;
2875
2876         if (lfsck_is_dryrun(lfsck))
2877                 RETURN(0);
2878
2879         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2880                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2881                               LCK_EX);
2882         if (rc != 0) {
2883                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate "
2884                        "LOV EA for "DFID": parent "DFID", OST-index %u, "
2885                        "stripe-index %u, comp_id %u, comp_start %llu, "
2886                        "comp_end %llu, layout version %u, range %u: rc = %d\n",
2887                        lfsck_lfsck2name(lfsck), PFID(cfid),
2888                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2889                        ol->ol_comp_id, ol->ol_comp_start,
2890                        ol->ol_comp_end, rec->lor_layout_version,
2891                        rec->lor_range, rc);
2892
2893                 RETURN(rc);
2894         }
2895
2896 again:
2897         if (locked) {
2898                 dt_write_unlock(env, parent);
2899                 locked = false;
2900         }
2901
2902         if (handle != NULL) {
2903                 dt_trans_stop(env, dt, handle);
2904                 handle = NULL;
2905         }
2906
2907         if (rc < 0)
2908                 GOTO(unlock_layout, rc);
2909
2910         lovea_size = rc;
2911         if (buf->lb_len < lovea_size) {
2912                 lu_buf_realloc(buf, lovea_size);
2913                 if (buf->lb_buf == NULL)
2914                         GOTO(unlock_layout, rc = -ENOMEM);
2915         }
2916
2917         if (!(bk->lb_param & LPF_DRYRUN)) {
2918                 handle = lfsck_trans_create(env, dt, lfsck);
2919                 if (IS_ERR(handle))
2920                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2921
2922                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2923                                           fl, handle);
2924                 if (rc != 0)
2925                         GOTO(stop, rc);
2926
2927                 rc = dt_trans_start_local(env, dt, handle);
2928                 if (rc != 0)
2929                         GOTO(stop, rc);
2930         }
2931
2932         dt_write_lock(env, parent, 0);
2933         locked = true;
2934         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2935         if (rc == -ERANGE) {
2936                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2937                 LASSERT(rc != 0);
2938                 goto again;
2939         } else if (rc == -ENODATA || rc == 0) {
2940                 lovea_size = lfsck_lovea_size(ol, ea_off);
2941                 /* If the declared is not big enough, re-try. */
2942                 if (buf->lb_len < lovea_size) {
2943                         rc = lovea_size;
2944                         goto again;
2945                 }
2946                 fl = LU_XATTR_CREATE;
2947         } else if (rc < 0) {
2948                 GOTO(unlock_parent, rc);
2949         } else if (unlikely(buf->lb_len == 0)) {
2950                 goto again;
2951         } else {
2952                 fl = LU_XATTR_REPLACE;
2953                 lovea_size = rc;
2954         }
2955
2956         if (fl == LU_XATTR_CREATE) {
2957                 if (bk->lb_param & LPF_DRYRUN)
2958                         GOTO(unlock_parent, rc = 1);
2959
2960                 LASSERT(buf->lb_len >= lovea_size);
2961
2962                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2963                                                cfid, buf, fl, ost_idx, ea_off);
2964
2965                 GOTO(unlock_parent, rc);
2966         }
2967
2968         lmm = buf->lb_buf;
2969         rc1 = lfsck_layout_verify_header(parent, lmm, lovea_size);
2970
2971         /* If the LOV EA crashed, the rebuild it. */
2972         if (rc1 == -EINVAL) {
2973                 if (bk->lb_param & LPF_DRYRUN)
2974                         GOTO(unlock_parent, rc = 1);
2975
2976                 LASSERT(buf->lb_len >= lovea_size);
2977
2978                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2979                                                cfid, buf, fl, ost_idx, ea_off);
2980
2981                 GOTO(unlock_parent, rc);
2982         }
2983
2984         /* For other unknown magic/pattern, keep the current LOV EA. */
2985         if (rc1 == -EOPNOTSUPP)
2986                 GOTO(unlock_parent, rc1 = 0);
2987
2988         if (rc1)
2989                 GOTO(unlock_parent, rc = rc1);
2990
2991         magic = le32_to_cpu(lmm->lmm_magic);
2992         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
2993                 __u64 start;
2994                 __u64 end;
2995                 __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id);
2996                 __u16 mirror_id1;
2997
2998                 if (bk->lb_param & LPF_DRYRUN)
2999                         GOTO(unlock_parent, rc = 1);
3000
3001                 lcm = buf->lb_buf;
3002                 count = le16_to_cpu(lcm->lcm_entry_count);
3003                 for (i = 0; i < count; pos = ++i) {
3004                         lcme = &lcm->lcm_entries[i];
3005                         start = le64_to_cpu(lcme->lcme_extent.e_start);
3006                         end = le64_to_cpu(lcme->lcme_extent.e_end);
3007                         mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id));
3008
3009                         if (mirror_id0 > mirror_id1)
3010                                 continue;
3011
3012                         if (mirror_id0 < mirror_id1)
3013                                 break;
3014
3015                         new_mirror = false;
3016                         if (end <= ol->ol_comp_start)
3017                                 continue;
3018
3019                         if (start >= ol->ol_comp_end)
3020                                 break;
3021
3022                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
3023                         magic = le32_to_cpu(lmm->lmm_magic);
3024                         flags = le32_to_cpu(lcme->lcme_flags);
3025                         goto further;
3026                 }
3027
3028                 rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent,
3029                                 cfid, buf, ost_idx, ea_off, pos, new_mirror);
3030
3031                 GOTO(unlock_parent, rc);
3032         }
3033
3034 further:
3035         count = le16_to_cpu(lmm->lmm_stripe_count);
3036         if (count == 0)
3037                 GOTO(unlock_parent, rc = -EINVAL);
3038         LASSERT(count > 0);
3039
3040         /* Exceed the current end of MDT-object layout EA. Then extend it. */
3041         if (count <= ea_off) {
3042                 if (bk->lb_param & LPF_DRYRUN)
3043                         GOTO(unlock_parent, rc = 1);
3044
3045                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
3046                 /* If the declared is not big enough, re-try. */
3047                 if (buf->lb_len < lovea_size) {
3048                         rc = lovea_size;
3049                         goto again;
3050                 }
3051
3052                 if (lcm) {
3053                         LASSERT(lcme);
3054
3055                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
3056                         lfsck_layout_update_lcm(lcm, lcme,
3057                                                 rec->lor_layout_version,
3058                                                 rec->lor_range);
3059                 }
3060
3061                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
3062                                         parent, cfid, buf, ost_idx, ea_off);
3063
3064                 GOTO(unlock_parent, rc);
3065         }
3066
3067         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
3068
3069         if (magic == LOV_MAGIC_V1) {
3070                 objs = &lmm->lmm_objects[0];
3071         } else {
3072                 LASSERT(magic == LOV_MAGIC_V3);
3073                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
3074         }
3075
3076         for (i = 0; i < count; i++, objs++) {
3077                 /* The MDT-object was created via lfsck_layout_recover_create()
3078                  * by others before, and we fill the dummy layout EA. */
3079                 if ((lcme && !(flags & LCME_FL_INIT)) ||
3080                      lovea_slot_is_dummy(objs)) {
3081                         if (i != ea_off)
3082                                 continue;
3083
3084                         if (bk->lb_param & LPF_DRYRUN)
3085                                 GOTO(unlock_parent, rc = 1);
3086
3087                         lmm->lmm_layout_gen =
3088                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3089                         if (lcme) {
3090                                 LASSERT(lcm);
3091
3092                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
3093                                         ol->ol_stripe_size ||
3094                                     le16_to_cpu(lmm->lmm_stripe_count) !=
3095                                         ol->ol_stripe_count ||
3096                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
3097                                         ol->ol_comp_start ||
3098                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
3099                                         ol->ol_comp_end) {
3100                                         CDEBUG(D_LFSCK, "%s: found invalid "
3101                                         "component for "DFID ": parent "DFID
3102                                         ", stripe-index %u, stripe_size %u, "
3103                                         "stripe_count %u, comp_id %u, "
3104                                         "comp_start %llu, comp_end %llu, "
3105                                         "cur_stripe_size %u, "
3106                                         "cur_stripe_count %u, "
3107                                         "cur_comp_start %llu, "
3108                                         "cur_comp_end %llu\n",
3109                                         lfsck_lfsck2name(lfsck), PFID(cfid),
3110                                         PFID(lfsck_dto2fid(parent)), ea_off,
3111                                         ol->ol_stripe_size,
3112                                         ol->ol_stripe_count, ol->ol_comp_id,
3113                                         ol->ol_comp_start, ol->ol_comp_end,
3114                                         le32_to_cpu(lmm->lmm_stripe_size),
3115                                         le16_to_cpu(lmm->lmm_stripe_count),
3116                                         le64_to_cpu(lcme->lcme_extent.e_start),
3117                                         le64_to_cpu(lcme->lcme_extent.e_end));
3118
3119                                         GOTO(unlock_parent, rc = -EINVAL);
3120                                 }
3121
3122                                 lovea_size = le32_to_cpu(lcm->lcm_size);
3123                                 lcme->lcme_flags = cpu_to_le32(flags |
3124                                                                LCME_FL_INIT);
3125                                 lfsck_layout_update_lcm(lcm, lcme,
3126                                                         rec->lor_layout_version,
3127                                                         rec->lor_range);
3128                         }
3129
3130                         LASSERTF(buf->lb_len >= lovea_size,
3131                                  "buffer len %d is less than real size %d\n",
3132                                  (int)buf->lb_len, (int)lovea_size);
3133
3134                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
3135                                                 parent, cfid, buf, lmm, objs,
3136                                                 fl, ost_idx, lovea_size);
3137
3138                         CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill "
3139                                "dummy layout slot for "DFID": parent "DFID
3140                                ", OST-index %u, stripe-index %u: rc = %d\n",
3141                                lfsck_lfsck2name(lfsck), PFID(cfid),
3142                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
3143
3144                         GOTO(unlock_parent, rc);
3145                 }
3146
3147                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3148                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
3149                 if (rc != 0) {
3150                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
3151                                "invalid layout EA at the slot %d, index %u\n",
3152                                lfsck_lfsck2name(lfsck),
3153                                PFID(lfsck_dto2fid(parent)), i,
3154                                le32_to_cpu(objs->l_ost_idx));
3155
3156                         GOTO(unlock_parent, rc);
3157                 }
3158
3159                 /* It should be rare case, the slot is there, but the LFSCK
3160                  * does not handle it during the first-phase cycle scanning. */
3161                 if (unlikely(lu_fid_eq(fid, cfid))) {
3162                         if (i == ea_off) {
3163                                 GOTO(unlock_parent, rc = 0);
3164                         } else {
3165                                 /* Rare case that the OST-object index
3166                                  * does not match the parent MDT-object
3167                                  * layout EA. We trust the later one. */
3168                                 if (bk->lb_param & LPF_DRYRUN)
3169                                         GOTO(unlock_parent, rc = 1);
3170
3171                                 dt_write_unlock(env, parent);
3172                                 if (handle != NULL)
3173                                         dt_trans_stop(env, dt, handle);
3174                                 lfsck_ibits_unlock(&lh, LCK_EX);
3175                                 rc = lfsck_layout_update_pfid(env, com, parent,
3176                                                         cfid, ltd->ltd_tgt,
3177                                                         rec, i);
3178
3179                                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant "
3180                                        "updated OST-object's pfid for "DFID
3181                                        ": parent "DFID", OST-index %u, "
3182                                        "stripe-index %u: rc = %d\n",
3183                                        lfsck_lfsck2name(lfsck), PFID(cfid),
3184                                        PFID(lfsck_dto2fid(parent)),
3185                                        ltd->ltd_index, i, rc);
3186
3187                                 RETURN(rc);
3188                         }
3189                 }
3190         }
3191
3192         /* The MDT-object exists, but related layout EA slot is occupied
3193          * by others. */
3194         if (bk->lb_param & LPF_DRYRUN)
3195                 GOTO(unlock_parent, rc = 1);
3196
3197         dt_write_unlock(env, parent);
3198         if (handle != NULL)
3199                 dt_trans_stop(env, dt, handle);
3200         lfsck_ibits_unlock(&lh, LCK_EX);
3201         if (magic == LOV_MAGIC_V1)
3202                 objs = &lmm->lmm_objects[ea_off];
3203         else
3204                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3205         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3206                                           buf, lmm, objs, ea_off, lovea_size);
3207
3208         RETURN(rc);
3209
3210 unlock_parent:
3211         if (locked)
3212                 dt_write_unlock(env, parent);
3213
3214 stop:
3215         if (handle != NULL)
3216                 dt_trans_stop(env, dt, handle);
3217
3218 unlock_layout:
3219         lfsck_ibits_unlock(&lh, LCK_EX);
3220
3221         return rc;
3222 }
3223
3224 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3225                                         struct lfsck_component *com,
3226                                         struct lfsck_tgt_desc *ltd,
3227                                         struct lu_orphan_rec_v3 *rec,
3228                                         struct lu_fid *cfid)
3229 {
3230         struct lfsck_layout     *lo     = com->lc_file_ram;
3231         struct lu_fid           *pfid   = &rec->lor_rec.lor_fid;
3232         struct dt_object        *parent = NULL;
3233         __u32                    ea_off = pfid->f_stripe_idx;
3234         int                      rc     = 0;
3235         ENTRY;
3236
3237         if (!fid_is_sane(cfid))
3238                 GOTO(out, rc = -EINVAL);
3239
3240         pfid->f_ver = 0;
3241         if (fid_is_zero(pfid)) {
3242                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3243                                                   "", "N", ea_off);
3244                 GOTO(out, rc);
3245         }
3246
3247         if (!fid_is_sane(pfid))
3248                 GOTO(out, rc = -EINVAL);
3249
3250         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3251         if (IS_ERR(parent))
3252                 GOTO(out, rc = PTR_ERR(parent));
3253
3254         if (unlikely(dt_object_remote(parent) != 0))
3255                 GOTO(put, rc = -EXDEV);
3256
3257         if (dt_object_exists(parent) == 0) {
3258                 lfsck_object_put(env, parent);
3259                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3260                                                   "", "R", ea_off);
3261                 GOTO(out, rc);
3262         }
3263
3264         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3265                 GOTO(put, rc = -EISDIR);
3266
3267         /* The orphan OST-object claims to be the parent's stripe, then
3268          * related dangling record in the trace file is meaningless. */
3269         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3270                                            rec->lor_layout.ol_comp_id, ea_off);
3271         if (rc && rc != -ENOENT)
3272                 GOTO(put, rc);
3273
3274         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3275                                          ltd->ltd_index, ea_off);
3276
3277         GOTO(put, rc);
3278
3279 put:
3280         if (rc <= 0)
3281                 lfsck_object_put(env, parent);
3282         else
3283                 /* The layout EA is changed, need to be reloaded next time. */
3284                 dt_object_put_nocache(env, parent);
3285
3286 out:
3287         down_write(&com->lc_sem);
3288         com->lc_new_scanned++;
3289         com->lc_new_checked++;
3290         if (rc > 0) {
3291                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3292                 rc = 0;
3293         } else if (rc < 0) {
3294                 lo->ll_objs_failed_phase2++;
3295         }
3296         up_write(&com->lc_sem);
3297
3298         return rc;
3299 }
3300
3301 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3302                                     struct lfsck_component *com,
3303                                     struct lfsck_tgt_desc *ltd)
3304 {
3305         struct lfsck_assistant_data     *lad    = com->lc_data;
3306         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3307         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
3308         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3309         struct lu_fid                   *fid    = &info->lti_fid;
3310         struct dt_object                *obj;
3311         const struct dt_it_ops          *iops;
3312         struct dt_it                    *di;
3313         int                              rc     = 0;
3314         ENTRY;
3315
3316         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan "
3317                "scanning for OST%04x\n",
3318                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3319
3320         if (test_bit(ltd->ltd_index, lad->lad_bitmap)) {
3321                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan "
3322                        "scanning for OST%04x\n",
3323                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3324
3325                 RETURN(0);
3326         }
3327
3328         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3329         fid->f_oid = fid->f_ver = 0;
3330
3331         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3332         if (unlikely(IS_ERR(obj)))
3333                 GOTO(log, rc = PTR_ERR(obj));
3334
3335         rc = obj->do_ops->do_index_try(env, obj,
3336                                        &dt_lfsck_layout_orphan_features);
3337         if (rc != 0)
3338                 GOTO(put, rc);
3339
3340         iops = &obj->do_index_ops->dio_it;
3341         di = iops->init(env, obj, 0);
3342         if (IS_ERR(di))
3343                 GOTO(put, rc = PTR_ERR(di));
3344
3345         rc = iops->load(env, di, 0);
3346         if (rc == -ESRCH) {
3347                 /* -ESRCH means that the orphan OST-objects rbtree has been
3348                  * cleanup because of the OSS server restart or other errors. */
3349                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3350                 GOTO(fini, rc);
3351         }
3352
3353         if (rc == 0)
3354                 rc = iops->next(env, di);
3355         else if (rc > 0)
3356                 rc = 0;
3357
3358         if (rc < 0)
3359                 GOTO(fini, rc);
3360
3361         if (rc > 0)
3362                 GOTO(fini, rc = 0);
3363
3364         do {
3365                 struct dt_key           *key;
3366                 struct lu_orphan_rec_v3 *rec = &info->lti_rec;
3367
3368                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3369                     unlikely(!thread_is_running(&lfsck->li_thread)))
3370                         break;
3371
3372                 key = iops->key(env, di);
3373                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3374                 /* Remote target OST may be runnning old LFSCK */
3375                 memset(rec, 0, sizeof(*rec));
3376                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3377                 if (rc == 0)
3378                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3379                                         &com->lc_fid_latest_scanned_phase2);
3380                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3381                         GOTO(fini, rc);
3382
3383                 lfsck_control_speed_by_self(com);
3384                 do {
3385                         rc = iops->next(env, di);
3386                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3387         } while (rc == 0);
3388
3389         GOTO(fini, rc);
3390
3391 fini:
3392         iops->put(env, di);
3393         iops->fini(env, di);
3394 put:
3395         lfsck_object_put(env, obj);
3396
3397 log:
3398         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan "
3399                "scanning for OST%04x: rc = %d\n",
3400                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3401
3402         return rc > 0 ? 0 : rc;
3403 }
3404
3405 static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff,
3406                             __u32 comp_id)
3407 {
3408         struct ost_layout *ol = &ff->ff_layout;
3409         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3410         int rc = 0;
3411         ENTRY;
3412
3413         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3414                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3415                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3416                 ol->ol_comp_start = 0;
3417                 ol->ol_comp_end = 0;
3418                 ol->ol_comp_id = 0;
3419                 ff->ff_layout_version = 0;
3420                 ff->ff_range = 0;
3421         } else if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3422                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3423                 struct lov_comp_md_entry_v1 *lcme = NULL;
3424                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3425                 int i;
3426
3427                 for (i = 0; i < count; i++) {
3428                         lcme = &lcm->lcm_entries[i];
3429                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3430                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3431                                         LCME_FL_INIT);
3432
3433                                 break;
3434                         }
3435                 }
3436
3437                 /* The comp has been removed, do nothing. */
3438                 if (i == count)
3439                         GOTO(out, rc = 1);
3440
3441                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3442                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3443                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3444                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3445                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3446                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3447                 ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen);
3448                 ff->ff_range = 0;
3449         } else {
3450                 GOTO(out, rc = -EINVAL);
3451         }
3452
3453         EXIT;
3454
3455 out:
3456         return rc;
3457 }
3458
3459 /**
3460  * Repair the MDT-object with dangling LOV EA reference.
3461  *
3462  * we need to repair the inconsistency according to the users' requirement:
3463  *
3464  * 1) Keep the inconsistency there and report the inconsistency case,
3465  *    then give the chance to the application to find related issues,
3466  *    and the users can make the decision about how to handle it with
3467  *    more human knownledge. (by default)
3468  *
3469  * 2) Re-create the missing OST-object with the FID/owner information.
3470  *
3471  * \param[in] env       pointer to the thread context
3472  * \param[in] com       the layout LFSCK component
3473  * \param[in] parent    the MDT-object with dangling LOV EA reference
3474  * \param[in] child     the OST-object to be created
3475  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3476  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3477  * \param[in] ost_idx   the index of OST on which the OST-object resides
3478  *
3479  * \retval              +1 for repair successfully
3480  * \retval              0 for did nothing
3481  * \retval              negative error number on failure
3482  */
3483 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3484                                           struct lfsck_component *com,
3485                                           struct dt_object *parent,
3486                                           struct dt_object *child,
3487                                           __u32 comp_id, __u32 ea_off,
3488                                           __u32 ost_idx, bool log)
3489 {
3490         struct lfsck_thread_info *info = lfsck_env_info(env);
3491         struct filter_fid *ff = &info->lti_ff;
3492         struct dt_object_format *dof = &info->lti_dof;
3493         struct lu_attr *la = &info->lti_la;
3494         struct lfsck_instance *lfsck = com->lc_lfsck;
3495         struct dt_device *dev = lfsck_obj2dev(child);
3496         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3497         const struct lu_fid *cfid = lfsck_dto2fid(child);
3498         struct lu_buf *tbuf = &info->lti_big_buf;
3499         struct thandle *handle;
3500         struct lu_buf *buf;
3501         struct lustre_handle lh = { 0 };
3502         int rc;
3503         ENTRY;
3504
3505         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3506                 GOTO(log, rc = 1);
3507
3508         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3509                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3510                               LCK_EX);
3511         if (rc != 0)
3512                 GOTO(log, rc);
3513
3514         rc = dt_attr_get(env, parent, la);
3515         if (rc != 0)
3516                 GOTO(unlock1, rc);
3517
3518         la->la_mode = S_IFREG | 0666;
3519         la->la_atime = la->la_mtime = la->la_ctime = 0;
3520         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3521                        LA_ATIME | LA_MTIME | LA_CTIME;
3522         memset(dof, 0, sizeof(*dof));
3523         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3524         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3525         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3526          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3527          * parent MDT-object's layout EA. */
3528         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3529
3530         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3531         if (unlikely(rc == -ENODATA))
3532                 rc = 0;
3533         if (rc <= 0)
3534                 GOTO(unlock1, rc);
3535
3536         rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id);
3537         if (rc)
3538                 GOTO(unlock1, rc);
3539
3540         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3541         handle = lfsck_trans_create(env, dev, lfsck);
3542         if (IS_ERR(handle))
3543                 GOTO(unlock1, rc = PTR_ERR(handle));
3544
3545         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3546         if (rc != 0)
3547                 GOTO(stop, rc);
3548
3549         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3550                                   LU_XATTR_CREATE, handle);
3551         if (rc != 0)
3552                 GOTO(stop, rc);
3553
3554         rc = dt_trans_start_local(env, dev, handle);
3555         if (rc != 0)
3556                 GOTO(stop, rc);
3557
3558         dt_read_lock(env, parent, 0);
3559         if (unlikely(lfsck_is_dead_obj(parent)))
3560                 GOTO(unlock2, rc = 0);
3561
3562         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3563                 struct ost_id *oi = &info->lti_oi;
3564                 struct lu_fid *tfid = &info->lti_fid2;
3565                 struct lu_buf *lovea = &info->lti_big_buf;
3566                 struct lov_mds_md_v1 *lmm;
3567                 struct lov_ost_data_v1 *objs;
3568                 __u32 magic;
3569                 int count;
3570                 int idx2;
3571
3572                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3573                 if (unlikely(rc == -ENODATA))
3574                         rc = 0;
3575                 if (rc <= 0)
3576                         GOTO(unlock2, rc);
3577
3578                 lmm = lovea->lb_buf;
3579                 magic = le32_to_cpu(lmm->lmm_magic);
3580                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3581                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3582                         struct lov_comp_md_entry_v1 *lcme;
3583                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3584                         int i;
3585
3586                         for (i = 0; i < count; i++) {
3587                                 lcme = &lcm->lcm_entries[i];
3588                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3589                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3590                                                 LCME_FL_INIT);
3591
3592                                         lmm = lovea->lb_buf +
3593                                                 le32_to_cpu(lcme->lcme_offset);
3594                                         magic = le32_to_cpu(lmm->lmm_magic);
3595                                         goto check;
3596                                 }
3597                         }
3598
3599                         /* Someone removed the component, do nothing. */
3600                         GOTO(unlock2, rc = 0);
3601                 }
3602
3603 check:
3604                 count = le16_to_cpu(lmm->lmm_stripe_count);
3605                 /* Someone changed the LOV EA, do nothing. */
3606                 if (count <= ea_off)
3607                         GOTO(unlock2, rc = 0);
3608
3609                 if (magic == LOV_MAGIC_V1) {
3610                         objs = &lmm->lmm_objects[ea_off];
3611                 } else {
3612                         LASSERT(magic == LOV_MAGIC_V3);
3613
3614                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3615                                                         lmm_objects[ea_off];
3616                 }
3617
3618                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3619                 idx2 = le32_to_cpu(objs->l_ost_idx);
3620                 rc = ostid_to_fid(tfid, oi, idx2);
3621                 /* Someone changed the LOV EA, do nothing. */
3622                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3623                         GOTO(unlock2, rc);
3624         }
3625
3626         rc = dt_create(env, child, la, NULL, dof, handle);
3627         if (rc != 0)
3628                 GOTO(unlock2, rc);
3629
3630         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3631                           handle);
3632
3633         GOTO(unlock2, rc);
3634
3635 unlock2:
3636         dt_read_unlock(env, parent);
3637
3638 stop:
3639         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3640
3641 unlock1:
3642         lfsck_ibits_unlock(&lh, LCK_EX);
3643
3644 log:
3645         if (rc && log)
3646                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3647                        "dangling reference for: parent "DFID", child "
3648                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: "
3649                        "rc = %d\n",
3650                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3651                        comp_id, ea_off, ost_idx,
3652                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3653                                 "Create the lost OST-object as required" :
3654                                 "Keep the MDT-object there by default", rc);
3655
3656         return rc;
3657 }
3658
3659 /**
3660  * Repair the MDT-object with dangling LOV EA reference.
3661  *
3662  * Prepare parameters and call __lfsck_layout_repair_dangling()
3663  * to repair the dangling LOV EA reference.
3664  *
3665  * \param[in] env       pointer to the thread context
3666  * \param[in] com       the layout LFSCK component
3667  * \param[in] pfid      the MDT-object's FID
3668  * \param[in] cfid      the FID for the OST-object to be created
3669  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3670  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3671  * \param[in] ost_idx   the index of OST on which the OST-object resides
3672  *
3673  * \retval              +1 for repair successfully
3674  * \retval              0 for did nothing
3675  * \retval              negative error number on failure
3676  */
3677 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3678                                         struct lfsck_component *com,
3679                                         const struct lu_fid *pfid,
3680                                         const struct lu_fid *cfid,
3681                                         __u32 comp_id, __u32 ea_off,
3682                                         __u32 ost_idx)
3683 {
3684         struct lfsck_instance *lfsck = com->lc_lfsck;
3685         struct dt_object *parent = NULL;
3686         struct dt_object *child = NULL;
3687         struct lfsck_tgt_desc *ltd;
3688         int rc;
3689         ENTRY;
3690
3691         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3692         if (IS_ERR(parent))
3693                 GOTO(log, rc = PTR_ERR(parent));
3694
3695         /* The MDT-object has been removed. */
3696         if (dt_object_exists(parent) == 0)
3697                 GOTO(log, rc = 0);
3698
3699         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3700         if (unlikely(ltd == NULL))
3701                 GOTO(log, rc = -ENODEV);
3702
3703         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3704         if (IS_ERR(child))
3705                 GOTO(log, rc = PTR_ERR(child));
3706
3707         /* The OST-object has been created. */
3708         if (unlikely(dt_object_exists(child) != 0))
3709                 GOTO(log, rc = 0);
3710
3711         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3712                                             comp_id, ea_off, ost_idx, false);
3713
3714         GOTO(log, rc);
3715
3716 log:
3717         if (child != NULL && !IS_ERR(child))
3718                 lfsck_object_put(env, child);
3719
3720         if (parent != NULL && !IS_ERR(parent))
3721                 lfsck_object_put(env, parent);
3722
3723         if (rc)
3724                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3725                        "dangling reference for: parent "DFID", child "
3726                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3727                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3728                        comp_id, ea_off, ost_idx,
3729                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3730                                 "Create the lost OST-object as required" :
3731                                 "Keep the MDT-object there by default", rc);
3732
3733         return rc;
3734 }
3735
3736 /* If the OST-object does not recognize the MDT-object as its parent, and
3737  * there is no other MDT-object claims as its parent, then just trust the
3738  * given MDT-object as its parent. So update the OST-object filter_fid. */
3739 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3740                                               struct lfsck_component *com,
3741                                               struct dt_object *parent,
3742                                               struct lfsck_layout_req *llr,
3743                                               struct lu_attr *la)
3744 {
3745         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3746         struct filter_fid               *ff     = &info->lti_ff;
3747         struct dt_object                *child  = llr->llr_child;
3748         struct dt_device                *dev    = lfsck_obj2dev(child);
3749         const struct lu_fid             *tfid   = lu_object_fid(&parent->do_lu);
3750         struct lu_buf                   *tbuf   = &info->lti_big_buf;
3751         struct thandle                  *handle;
3752         struct lu_buf                   *buf;
3753         struct lustre_handle             lh     = { 0 };
3754         int                              rc;
3755         ENTRY;
3756
3757         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3758                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3759                               LCK_EX);
3760         if (rc != 0)
3761                 GOTO(log, rc);
3762
3763         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3764         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3765         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3766          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3767          * parent MDT-object's layout EA. */
3768         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3769
3770         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3771         if (unlikely(rc == -ENODATA))
3772                 rc = 0;
3773         if (rc <= 0)
3774                 GOTO(unlock1, rc);
3775
3776         rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id);
3777         if (rc)
3778                 GOTO(unlock1, rc);
3779
3780         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3781
3782         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
3783         if (IS_ERR(handle))
3784                 GOTO(unlock1, rc = PTR_ERR(handle));
3785
3786         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3787         if (rc != 0)
3788                 GOTO(stop, rc);
3789
3790         rc = dt_attr_get(env, parent, la);
3791         if (rc != 0)
3792                 GOTO(stop, rc);
3793
3794         la->la_valid = LA_UID | LA_GID;
3795         rc = dt_declare_attr_set(env, child, la, handle);
3796         if (rc != 0)
3797                 GOTO(stop, rc);
3798
3799         rc = dt_trans_start_local(env, dev, handle);
3800         if (rc != 0)
3801                 GOTO(stop, rc);
3802
3803         dt_write_lock(env, parent, 0);
3804         if (unlikely(lfsck_is_dead_obj(parent)))
3805                 GOTO(unlock2, rc = 1);
3806
3807         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3808         if (rc != 0)
3809                 GOTO(unlock2, rc);
3810
3811         /* Get the latest parent's owner. */
3812         rc = dt_attr_get(env, parent, la);
3813         if (rc != 0)
3814                 GOTO(unlock2, rc);
3815
3816         la->la_valid = LA_UID | LA_GID;
3817         rc = dt_attr_set(env, child, la, handle);
3818
3819         GOTO(unlock2, rc);
3820
3821 unlock2:
3822         dt_write_unlock(env, parent);
3823
3824 stop:
3825         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3826
3827 unlock1:
3828         lfsck_ibits_unlock(&lh, LCK_EX);
3829
3830 log:
3831         if (rc)
3832                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3833                        "unmatched MDT-OST pair for: parent "DFID
3834                        ", child "DFID", comp_id %u, OST-index %u, "
3835                        "stripe-index %u, owner %u/%u: rc = %d\n",
3836                        lfsck_lfsck2name(com->lc_lfsck),
3837                        PFID(lfsck_dto2fid(parent)),
3838                        PFID(lfsck_dto2fid(child)),
3839                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3840                        la->la_uid, la->la_gid, rc);
3841
3842         return rc;
3843 }
3844
3845 /* If there are more than one MDT-objects claim as the OST-object's parent,
3846  * and the OST-object only recognizes one of them, then we need to generate
3847  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). */
3848 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3849                                                    struct lfsck_component *com,
3850                                                    struct dt_object *parent,
3851                                                    struct lfsck_layout_req *llr,
3852                                                    struct lu_attr *la)
3853 {
3854         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3855         struct dt_allocation_hint       *hint   = &info->lti_hint;
3856         struct dt_object_format         *dof    = &info->lti_dof;
3857         struct ost_id                   *oi     = &info->lti_oi;
3858         struct lu_buf                   *buf    = &info->lti_big_buf;
3859         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3860         struct dt_device                *dev;
3861         struct lu_device                *d      =
3862                                 &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3863         struct lu_object                *o;
3864         struct lu_object                *n;
3865         struct dt_object                *child  = NULL;
3866         struct thandle                  *handle = NULL;
3867         struct lov_mds_md_v1            *lmm;
3868         struct lov_ost_data_v1          *objs;
3869         const struct lu_fid             *pfid   = lfsck_dto2fid(parent);
3870         struct lu_fid                    tfid;
3871         struct lustre_handle             lh     = { 0 };
3872         __u32                            magic;
3873         __u32                            index;
3874         int                              rc;
3875         ENTRY;
3876
3877         /* We use two separated transactions to repair the inconsistency.
3878          *
3879          * 1) create the child (OST-object).
3880          * 2) update the parent LOV EA according to the child's FID.
3881          *
3882          * If 1) succeed, but 2) failed or aborted, then such OST-object will be
3883          * handled as orphan when the layout LFSCK run next time.
3884          *
3885          * If 1) failed, but 2) succeed, then such OST-object will be re-created
3886          * as dangling referened case when the layout LFSCK run next time. */
3887
3888         /* The 1st transaction. */
3889         o = lu_object_anon(env, d, NULL);
3890         if (IS_ERR(o))
3891                 GOTO(log, rc = PTR_ERR(o));
3892
3893         n = lu_object_locate(o->lo_header, d->ld_type);
3894         if (unlikely(n == NULL)) {
3895                 lu_object_put_nocache(env, o);
3896
3897                 GOTO(log, rc = -EINVAL);
3898         }
3899
3900         child = container_of(n, struct dt_object, do_lu);
3901         memset(hint, 0, sizeof(*hint));
3902         rc = dt_attr_get(env, parent, la);
3903         if (rc != 0)
3904                 GOTO(log, rc);
3905
3906         la->la_valid = LA_UID | LA_GID;
3907         memset(dof, 0, sizeof(*dof));
3908
3909         dev = lfsck_obj2dev(child);
3910         handle = lfsck_trans_create(env, dev, lfsck);
3911         if (IS_ERR(handle))
3912                 GOTO(log, rc = PTR_ERR(handle));
3913
3914         rc = dt_declare_create(env, child, la, hint, dof, handle);
3915         if (rc != 0)
3916                 GOTO(stop, rc);
3917
3918         rc = dt_trans_start_local(env, dev, handle);
3919         if (rc != 0)
3920                 GOTO(stop, rc);
3921
3922         rc = dt_create(env, child, la, hint, dof, handle);
3923         dt_trans_stop(env, dev, handle);
3924         handle = NULL;
3925         if (rc != 0)
3926                 GOTO(log, rc);
3927
3928         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3929                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3930                               LCK_EX);
3931         if (rc != 0)
3932                 GOTO(log, rc);
3933
3934         /* The 2nd transaction. */
3935
3936         /* XXX: Generally, we should use bottom device (OSD) to update parent
3937          *      LOV EA. But because the LOD-object still references the wrong
3938          *      OSP-object that should be detached after the parent's LOV EA
3939          *      refreshed. Unfortunately, there is no suitable API for that.
3940          *      So we have to make the LOD to re-load the OSP-object(s) via
3941          *      replacing the LOV EA against the LOD-object.
3942          *
3943          *      Once the DNE2 patches have been landed, we can replace the
3944          *      LOD device with the OSD device. LU-6230. */
3945
3946         dev = lfsck->li_next;
3947         parent = lfsck_object_locate(dev, parent);
3948         if (IS_ERR(parent))
3949                 GOTO(log, rc = PTR_ERR(parent));
3950
3951         handle = lfsck_trans_create(env, dev, lfsck);
3952         if (IS_ERR(handle))
3953                 GOTO(log, rc = PTR_ERR(handle));
3954
3955         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3956                                   LU_XATTR_REPLACE, handle);
3957         if (rc != 0)
3958                 GOTO(stop, rc);
3959
3960         rc = dt_trans_start_local(env, dev, handle);
3961         if (rc != 0)
3962                 GOTO(stop, rc);
3963
3964         dt_write_lock(env, parent, 0);
3965         if (unlikely(lfsck_is_dead_obj(parent)))
3966                 GOTO(unlock, rc = 0);
3967
3968         rc = lfsck_layout_get_lovea(env, parent, buf);
3969         if (unlikely(rc == -ENODATA))
3970                 rc = 0;
3971         if (rc <= 0)
3972                 GOTO(unlock, rc);
3973
3974         lmm = buf->lb_buf;
3975         magic = le32_to_cpu(lmm->lmm_magic);
3976         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3977                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
3978                 struct lov_comp_md_entry_v1 *lcme;
3979                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3980                 int i;
3981
3982                 LASSERT(llr->llr_comp_id != 0);
3983
3984                 for (i = 0; i < count; i++) {
3985                         lcme = &lcm->lcm_entries[i];
3986                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
3987                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3988                                         LCME_FL_INIT);
3989
3990                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
3991                                 lmm = buf->lb_buf +
3992                                         le32_to_cpu(lcme->lcme_offset);
3993                                 magic = le32_to_cpu(lmm->lmm_magic);
3994                                 goto set;
3995                         }
3996                 }
3997
3998                 GOTO(unlock, rc = 0);
3999         }
4000
4001 set:
4002         if (magic == LOV_MAGIC_V1) {
4003                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
4004         } else {
4005                 LASSERT(magic == LOV_MAGIC_V3);
4006                 objs =
4007                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
4008         }
4009
4010         ostid_le_to_cpu(&objs->l_ost_oi, oi);
4011         index = le32_to_cpu(objs->l_ost_idx);
4012         rc = ostid_to_fid(&tfid, oi, index);
4013         /* Someone changed layout during the LFSCK, no need to repair then. */
4014         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
4015                 GOTO(unlock, rc = 0);
4016
4017         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
4018         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
4019         ostid_cpu_to_le(oi, &objs->l_ost_oi);
4020         objs->l_ost_gen = cpu_to_le32(0);
4021         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
4022         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
4023                           LU_XATTR_REPLACE, handle);
4024
4025         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
4026
4027 unlock:
4028         dt_write_unlock(env, parent);
4029
4030 stop:
4031         if (handle != NULL)
4032                 dt_trans_stop(env, dev, handle);
4033
4034 log:
4035         lfsck_ibits_unlock(&lh, LCK_EX);
4036         if (child != NULL)
4037                 lfsck_object_put(env, child);
4038
4039         if (rc)
4040                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4041                        "multiple references for: parent "DFID", comp_id %u, "
4042                        "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
4043                        lfsck_lfsck2name(lfsck), PFID(pfid),
4044                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
4045                        la->la_uid, la->la_gid, rc);
4046
4047         return rc;
4048 }
4049
4050 /* If the MDT-object and the OST-object have different owner information,
4051  * then trust the MDT-object, because the normal chown/chgrp handle order
4052  * is from MDT to OST, and it is possible that some chown/chgrp operation
4053  * is partly done. */
4054 static int lfsck_layout_repair_owner(const struct lu_env *env,
4055                                      struct lfsck_component *com,
4056                                      struct dt_object *parent,
4057                                      struct lfsck_layout_req *llr,
4058                                      struct lu_attr *pla,
4059                                      const struct lu_attr *cla)
4060 {
4061         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4062         struct lu_attr                  *tla    = &info->lti_la2;
4063         struct dt_object                *child  = llr->llr_child;
4064         struct dt_device                *dev    = lfsck_obj2dev(child);
4065         struct thandle                  *handle;
4066         int                              rc;
4067         dt_obj_version_t                 version;
4068         ENTRY;
4069
4070         tla->la_uid = pla->la_uid;
4071         tla->la_gid = pla->la_gid;
4072         tla->la_valid = LA_UID | LA_GID;
4073         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4074         if (IS_ERR(handle))
4075                 GOTO(log, rc = PTR_ERR(handle));
4076
4077         rc = dt_declare_attr_set(env, child, tla, handle);
4078         if (rc != 0)
4079                 GOTO(stop, rc);
4080
4081         rc = dt_trans_start_local(env, dev, handle);
4082         if (rc != 0)
4083                 GOTO(stop, rc);
4084
4085         /* Use the dt_object lock to serialize with destroy and attr_set. */
4086         dt_read_lock(env, parent, 0);
4087         if (unlikely(lfsck_is_dead_obj(parent)))
4088                 GOTO(unlock, rc = 1);
4089
4090         version = dt_version_get(env, child);
4091         if (version == -EOPNOTSUPP)
4092                 version = 0;
4093
4094         /* Get the latest parent's owner. */
4095         rc = dt_attr_get(env, parent, pla);
4096         if (rc != 0)
4097                 GOTO(unlock, rc);
4098
4099         /* Some others chown/chgrp during the LFSCK, needs to do nothing. */
4100         if (unlikely((!version && tla->la_ctime == 0) ||
4101                      tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid))
4102                 rc = 1;
4103         else
4104                 rc = dt_attr_set(env, child, tla, handle);
4105
4106         GOTO(unlock, rc);
4107
4108 unlock:
4109         dt_read_unlock(env, parent);
4110
4111 stop:
4112         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4113
4114 log:
4115         if (rc != 0)
4116                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4117                        "inconsistent file owner for: parent "DFID", child "DFID
4118                        ", OST-index %u, stripe-index %u, old owner %u/%u, "
4119                        "new owner %u/%u: rc = %d\n",
4120                        lfsck_lfsck2name(com->lc_lfsck),
4121                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4122                        llr->llr_ost_idx, llr->llr_lov_idx,
4123                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
4124
4125         return rc;
4126 }
4127
/*
 * Trace why an OST-object is treated as an unmatched pair.
 *
 * Prints the LFSCK instance name, the FID stored in @lso, the parent FID
 * @pfid claimed by the OST-object, the OST-object FID @cfid, the name of the
 * back pointer xattr and the reason @msg.
 *
 * @lso is parenthesized so that any pointer expression can be passed.
 * NOTE: the expansion deliberately keeps its trailing ';' so that every
 * existing call site expands as before; use it only as a full statement
 * inside a braced block.
 */
#define CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid, msg)               \
        CDEBUG(D_LFSCK, "%s:("DFID"|"DFID")/"DFID":XATTR %s: %s\n",      \
               lfsck_lfsck2name(lfsck), PFID(&(lso)->lso_fid),           \
               PFID(pfid), PFID(cfid), XATTR_NAME_FID, msg);
4132
4133 /* Check whether the OST-object correctly back points to the
4134  * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). */
4135 static int lfsck_layout_check_parent(const struct lu_env *env,
4136                                      struct lfsck_component *com,
4137                                      struct lfsck_assistant_object *lso,
4138                                      struct filter_fid *ff,
4139                                      const struct lu_fid *cfid,
4140                                      const struct lu_attr *cla,
4141                                      struct lfsck_layout_req *llr)
4142 {
4143         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4144         struct lu_buf                   *buf    = &info->lti_big_buf;
4145         struct lu_fid                   *pfid   = &info->lti_fid;
4146         struct dt_object                *tobj;
4147         struct lov_mds_md_v1            *lmm;
4148         struct lov_ost_data_v1          *objs;
4149         struct lustre_handle             lh     = { 0 };
4150         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4151         int                              rc;
4152         int                              i;
4153         __u32                            magic;
4154         __u32                            idx;
4155         __u16                            count;
4156         ENTRY;
4157
4158         *pfid = ff->ff_parent;
4159         idx = pfid->f_stripe_idx;
4160         pfid->f_ver = 0;
4161
4162         if (unlikely(!fid_is_sane(pfid))) {
4163                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4164                                       "the parent FID is invalid");
4165
4166                 RETURN(LLIT_UNMATCHED_PAIR);
4167         }
4168
4169         if (lu_fid_eq(pfid, &lso->lso_fid)) {
4170                 if (likely(llr->llr_lov_idx == idx))
4171                         RETURN(0);
4172
4173                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4174                                       "the stripe index is unmatched");
4175
4176                 RETURN(LLIT_UNMATCHED_PAIR);
4177         }
4178
4179         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4180         if (IS_ERR(tobj))
4181                 RETURN(PTR_ERR(tobj));
4182
4183         if (dt_object_exists(tobj) == 0) {
4184                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4185                                       "the parent is nonexistent");
4186
4187                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4188         }
4189
4190         if (lfsck_is_dead_obj(tobj)) {
4191                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4192                                       "the parent is dead object");
4193
4194                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4195         }
4196
4197         if (!S_ISREG(lfsck_object_type(tobj))) {
4198                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4199                                       "the parent is not a regular file");
4200
4201                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4202         }
4203
4204         /* Load the tobj's layout EA, in spite of it is a local MDT-object or
4205          * remote one on another MDT. Then check whether the given OST-object
4206          * is in such layout. If yes, it is multiple referenced, otherwise it
4207          * is unmatched referenced case. */
4208         rc = lfsck_layout_get_lovea(env, tobj, buf);
4209         if (rc == 0 || rc == -ENODATA || rc == -ENOENT) {
4210                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4211                                       "the parent has no stripe data");
4212
4213                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4214         }
4215
4216         if (unlikely(rc == -EOPNOTSUPP))
4217                 GOTO(out, rc = LLIT_NONE);
4218
4219         if (rc < 0)
4220                 GOTO(out, rc);
4221
4222         lmm = buf->lb_buf;
4223         magic = le32_to_cpu(lmm->lmm_magic);
4224         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
4225                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4226                 struct lov_comp_md_entry_v1 *lcme;
4227
4228                 if (ff->ff_layout.ol_comp_id == 0) {
4229                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4230                                               "the parent has incorrect comp_id");
4231
4232                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4233                 }
4234
4235                 count = le16_to_cpu(lcm->lcm_entry_count);
4236                 for (i = 0; i < count; i++) {
4237                         lcme = &lcm->lcm_entries[i];
4238                         if (le32_to_cpu(lcme->lcme_id) ==
4239                             ff->ff_layout.ol_comp_id) {
4240                                 lmm = buf->lb_buf +
4241                                         le32_to_cpu(lcme->lcme_offset);
4242                                 magic = le32_to_cpu(lmm->lmm_magic);
4243                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4244                                       LCME_FL_INIT)) {
4245                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4246                                                               cfid,
4247                                                               "the parent has uninitialized component");
4248
4249                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4250                                 }
4251
4252                                 goto further;
4253                         }
4254                 }
4255
4256                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4257                                       "the parent has no matched comp_id");
4258
4259                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4260         }
4261
4262 further:
4263         if (magic == LOV_MAGIC_V1) {
4264                 objs = &lmm->lmm_objects[0];
4265         } else {
4266                 LASSERT(magic == LOV_MAGIC_V3);
4267                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4268         }
4269
4270         count = le16_to_cpu(lmm->lmm_stripe_count);
4271         for (i = 0; i < count; i++, objs++) {
4272                 struct lu_fid           *tfid   = &info->lti_fid2;
4273                 struct ost_id           *oi     = &info->lti_oi;
4274                 __u32                    idx2;
4275
4276                 if (lovea_slot_is_dummy(objs))
4277                         continue;
4278
4279                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4280                 idx2 = le32_to_cpu(objs->l_ost_idx);
4281                 rc = ostid_to_fid(tfid, oi, idx2);
4282                 if (rc != 0) {
4283                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
4284                                "invalid layout EA at the slot %d, index %u\n",
4285                                lfsck_lfsck2name(com->lc_lfsck),
4286                                PFID(pfid), i, idx2);
4287
4288                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4289                 }
4290
4291                 if (lu_fid_eq(cfid, tfid)) {
4292                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4293                                               MDS_INODELOCK_UPDATE |
4294                                               MDS_INODELOCK_LAYOUT |
4295                                               MDS_INODELOCK_XATTR,
4296                                               LCK_EX);
4297                         if (rc != 0)
4298                                 GOTO(out, rc);
4299
4300                         dt_read_lock(env, tobj, 0);
4301
4302                         /* For local MDT-object, re-check existence
4303                          * after taken the lock. */
4304                         if (!dt_object_remote(tobj)) {
4305                                 if (dt_object_exists(tobj) == 0 ||
4306                                     lfsck_is_dead_obj(tobj)) {
4307                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4308                                                               cfid,
4309                                                               "the parent doesn't exist anymore after lock");
4310
4311                                         rc = LLIT_UNMATCHED_PAIR;
4312                                 } else {
4313                                         rc = LLIT_MULTIPLE_REFERENCED;
4314                                 }
4315
4316                                 GOTO(unlock, rc);
4317                         }
4318
4319                         /* For migration case, the new MDT-object and old
4320                          * MDT-object may reference the same OST-object at
4321                          * some migration internal time.
4322                          *
4323                          * For remote MDT-object, the local MDT may not know
4324                          * whether it has been removed or not.  Try checking
4325                          * for a non-existent xattr to check if this object
4326                          * has been been removed or not. */
4327                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4328                                           XATTR_NAME_DUMMY);
4329                         if (unlikely(rc == -ENOENT || rc >= 0)) {
4330                                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4331                                                       "the parent is remote object and nonexistent after lock");
4332
4333                                 rc = LLIT_UNMATCHED_PAIR;
4334                         } else if (rc == -ENODATA) {
4335                                 rc = LLIT_MULTIPLE_REFERENCED;
4336                         }
4337
4338                         GOTO(unlock, rc);
4339                 }
4340         }
4341
4342         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4343                               "the parent has no matched stripe");
4344
4345         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4346
4347 unlock:
4348         if (lustre_handle_is_used(&lh)) {
4349                 dt_read_unlock(env, tobj);
4350                 lfsck_ibits_unlock(&lh, LCK_EX);
4351         }
4352
4353 out:
4354         lfsck_object_put(env, tobj);
4355
4356         return rc;
4357 }
4358
4359 /*
4360  * If the MDT-object has the LUSTRE_ENCRYPT_FL flag, it needs to be set
4361  * on the OST-object as well.
4362  */
4363 static int lfsck_layout_repair_encflag(const struct lu_env *env,
4364                                        struct lfsck_component *com,
4365                                        struct dt_object *parent,
4366                                        struct lfsck_layout_req *llr)
4367 {
4368         struct lfsck_thread_info *info = lfsck_env_info(env);
4369         struct lu_attr *tla = &info->lti_la2;
4370         struct dt_object *child = llr->llr_child;
4371         struct dt_device *dev = lfsck_obj2dev(child);
4372         struct thandle *handle;
4373         int rc;
4374
4375         ENTRY;
4376
4377         tla->la_valid = LA_FLAGS;
4378         tla->la_flags = LUSTRE_ENCRYPT_FL;
4379         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4380         if (IS_ERR(handle))
4381                 GOTO(log, rc = PTR_ERR(handle));
4382
4383         rc = dt_declare_attr_set(env, child, tla, handle);
4384         if (rc != 0)
4385                 GOTO(stop, rc);
4386
4387         rc = dt_trans_start_local(env, dev, handle);
4388         if (rc != 0)
4389                 GOTO(stop, rc);
4390
4391         /* Use the dt_object lock to serialize with destroy and attr_set. */
4392         dt_read_lock(env, parent, 0);
4393         if (unlikely(lfsck_is_dead_obj(parent)))
4394                 GOTO(unlock, rc = 1);
4395
4396         rc = dt_attr_set(env, child, tla, handle);
4397         GOTO(unlock, rc);
4398
4399 unlock:
4400         dt_read_unlock(env, parent);
4401
4402 stop:
4403         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4404
4405 log:
4406         if (rc != 0)
4407                 CDEBUG(D_LFSCK,
4408                        "%s: layout LFSCK assistant repair of inconsistent file enc flag for: parent "
4409                        DFID", child "
4410                        DFID", OST-index %u, stripe-index %u: rc = %d\n",
4411                        lfsck_lfsck2name(com->lc_lfsck),
4412                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4413                        llr->llr_ost_idx, llr->llr_lov_idx, rc);
4414
4415         return rc;
4416 }
4417
4418 static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
4419                                              struct lfsck_component *com,
4420                                              struct lfsck_assistant_req *lar)
4421 {
4422         struct lfsck_layout_req              *llr    =
4423                 container_of(lar, struct lfsck_layout_req, llr_lar);
4424         struct lfsck_assistant_object        *lso    = lar->lar_parent;
4425         struct lfsck_layout                  *lo     = com->lc_file_ram;
4426         struct lfsck_thread_info             *info   = lfsck_env_info(env);
4427         struct filter_fid                    *ff     = &info->lti_ff;
4428         struct lu_buf buf = { .lb_buf = ff,
4429                               .lb_len = sizeof(*ff) };
4430         struct dt_object                     *parent = NULL;
4431         struct dt_object                     *child  = llr->llr_child;
4432         struct lu_attr                       *pla    = &lso->lso_attr;
4433         struct lu_attr                       *cla    = &info->lti_la;
4434         struct lfsck_instance                *lfsck  = com->lc_lfsck;
4435         struct lfsck_bookmark                *bk     = &lfsck->li_bookmark_ram;
4436         enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
4437         int                                   rc;
4438         ENTRY;
4439
4440         if (lso->lso_dead)
4441                 RETURN(0);
4442
4443         CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
4444
4445         rc = dt_attr_get(env, child, cla);
4446         if (rc == -ENOENT) {
4447                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4448                 if (IS_ERR(parent)) {
4449                         rc = PTR_ERR(parent);
4450
4451                         RETURN(rc == -ENOENT ? 0 : rc);
4452                 }
4453
4454                 type = LLIT_DANGLING;
4455                 goto repair;
4456         }
4457
4458         if (rc != 0)
4459                 GOTO(out, rc);
4460
4461         if (!(bk->lb_param & LPF_DRYRUN) &&
4462             pla->la_valid & LA_FLAGS && pla->la_flags & LUSTRE_ENCRYPT_FL) {
4463                 /* MDT-inode is encrypted */
4464                 struct lu_buf lb = { .lb_buf = NULL, .lb_len = 0 };
4465
4466                 /* if OST-inode is missing encryption.c xattr, fix it */
4467                 if (dt_xattr_get(env, child, &lb,
4468                                  LL_XATTR_NAME_ENCRYPTION_CONTEXT) >= 0)
4469                         goto check_fid;
4470
4471                 if (parent == NULL)
4472                         parent = lfsck_assistant_object_load(env, lfsck, lso);
4473                 if (!IS_ERR_OR_NULL(parent))
4474                         rc = lfsck_layout_repair_encflag(env, com, parent, llr);
4475                 down_write(&com->lc_sem);
4476                 if (rc < 0)
4477                         lfsck_layout_record_failure(env, lfsck, lo);
4478                 else if (rc > 0)
4479                         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
4480                 up_write(&com->lc_sem);
4481         }
4482
4483 check_fid:
4484         lfsck_buf_init(&buf, ff, sizeof(*ff));
4485         rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
4486         if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
4487                 CDEBUG(D_LFSCK, "%s:"DFID"/"DFID": "
4488                        "the child object's %s is corrupted\n",
4489                        lfsck_lfsck2name(lfsck), PFID(&lso->lso_fid),
4490                        PFID(lu_object_fid(&child->do_lu)),
4491                        XATTR_NAME_FID);
4492
4493                 type = LLIT_UNMATCHED_PAIR;
4494                 goto repair;
4495         }
4496
4497         if (rc < 0 && rc != -ENODATA)
4498                 GOTO(out, rc);
4499
4500         if (rc == 0 || rc == -ENODATA)
4501                 GOTO(check_owner, rc = 0);
4502
4503         filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
4504         rc = lfsck_layout_check_parent(env, com, lso, ff,
4505                                        lu_object_fid(&child->do_lu), cla, llr);
4506         if (rc > 0) {
4507                 type = rc;
4508                 goto repair;
4509         }
4510
4511         if (rc < 0)
4512                 GOTO(out, rc);
4513
4514 check_owner:
4515         /* Someone may has changed the owner after the parent attr pre-loaded.
4516          * It can be handled later inside the lfsck_layout_repair_owner(). */
4517         if (unlikely(cla->la_uid != pla->la_uid ||
4518                      cla->la_gid != pla->la_gid)) {
4519                 type = LLIT_INCONSISTENT_OWNER;
4520                 goto repair;
4521         }
4522
4523 repair:
4524         if (type == LLIT_NONE)
4525                 GOTO(out, rc = 0);
4526
4527         if (bk->lb_param & LPF_DRYRUN)
4528                 GOTO(out, rc = 1);
4529
4530         if (parent == NULL) {
4531                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4532                 if (IS_ERR(parent)) {
4533                         rc = PTR_ERR(parent);
4534
4535                         if (rc == -ENOENT)
4536                                 RETURN(0);
4537
4538                         GOTO(out, rc);
4539                 }
4540         }
4541
4542         switch (type) {
4543         case LLIT_DANGLING:
4544                 if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
4545                         rc = lfsck_layout_ins_dangling_rec(env, com,
4546                                 lfsck_dto2fid(parent), lfsck_dto2fid(child),
4547                                 llr->llr_comp_id, llr->llr_lov_idx,
4548                                 llr->llr_ost_idx);
4549                 else
4550                         rc = __lfsck_layout_repair_dangling(env, com, parent,
4551                                                             llr->llr_child,
4552                                                             llr->llr_comp_id,
4553                                                             llr->llr_lov_idx,
4554                                                             llr->llr_ost_idx,
4555                                                             true);
4556                 break;
4557         case LLIT_UNMATCHED_PAIR:
4558                 rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
4559                                                         llr, pla);
4560                 break;
4561         case LLIT_MULTIPLE_REFERENCED:
4562                 rc = lfsck_layout_repair_multiple_references(env, com, parent,
4563                                                              llr, pla);
4564                 break;
4565         case LLIT_INCONSISTENT_OWNER:
4566                 rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
4567                 break;
4568         default:
4569                 rc = 0;
4570                 break;
4571         }
4572
4573         GOTO(out, rc);
4574
4575 out:
4576         down_write(&com->lc_sem);
4577         if (rc < 0) {
4578                 struct lfsck_assistant_data *lad = com->lc_data;
4579
4580                 if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) {
4581                         rc = 0;
4582                 } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
4583                            rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
4584                            rc == -EHOSTUNREACH) {
4585                         /* If cannot touch the target server,
4586                          * mark the LFSCK as INCOMPLETE. */
4587                         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to "
4588                                "talk with OST %x: rc = %d\n",
4589                                lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
4590                         lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
4591                         lo->ll_objs_skipped++;
4592                         rc = 0;
4593                 } else {
4594                         lfsck_layout_record_failure(env, lfsck, lo);
4595                 }
4596         } else if (rc > 0 && (type != LLIT_DANGLING ||
4597                               !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
4598                 LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
4599                          "unknown type = %d\n", type);
4600
4601                 lo->ll_objs_repaired[type - 1]++;
4602                 if (bk->lb_param & LPF_DRYRUN &&
4603                     unlikely(lo->ll_pos_first_inconsistent == 0))
4604                         lo->ll_pos_first_inconsistent =
4605                         lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
4606                                                         lfsck->li_di_oit);
4607         }
4608         up_write(&com->lc_sem);
4609
4610         if (parent != NULL && !IS_ERR(parent))
4611                 lfsck_object_put(env, parent);
4612
4613         return rc;
4614 }
4615
/*
 * Walk one trace file (index object \a obj) and call
 * lfsck_layout_repair_dangling() for every record found in it.
 *
 * Each key is a struct lfsck_layout_dangling_key stored big-endian; each
 * record is a big-endian lu_fid of the OST-object whose f_ver field holds
 * the OST index.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the layout LFSCK component
 * \param[in] obj       the trace file to be scanned
 * \param[in] first     true: start after lo->ll_lldk_latest_scanned_phase2;
 *                      false: start from an all-zero key
 *
 * \retval              positive when the iteration ran off the end (the
 *                      key lookup returned -ENOENT, or iops->next()
 *                      returned a positive value); the caller in this
 *                      file moves on to the next trace file in that case
 * \retval              0 if the LFSCK thread is no longer running
 * \retval              negative error number on failure
 */
static int
lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
                                        struct lfsck_component *com,
                                        struct dt_object *obj, bool first)
{
        struct lfsck_instance *lfsck = com->lc_lfsck;
        struct ptlrpc_thread *thread = &lfsck->li_thread;
        struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
        struct lfsck_layout *lo = com->lc_file_ram;
        const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
        struct dt_it *di;
        struct dt_key *key;
        /* Per-thread scratch buffers: 'parent' holds the current key in CPU
         * byte order, 'cfid' the current record. */
        struct lfsck_layout_dangling_key *parent =
                                        &lfsck_env_info(env)->lti_lldk;
        struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
        __u32 ost_idx;
        int rc;
        ENTRY;

        di = iops->init(env, obj, 0);
        if (IS_ERR(di))
                RETURN(PTR_ERR(di));

        /* Build the start key: the last scanned position (converted to the
         * on-disk big-endian form) or the zero key. */
        if (first)
                lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
        else
                memset(parent, 0, sizeof(*parent));
        rc = iops->get(env, di, (const struct dt_key *)parent);
        /* On failure of get() there is nothing to put(), only fini(). */
        if (rc < 0)
                GOTO(fini, rc);

        if (first) {
                /* The start one either has been processed or does not exist,
                 * skip it. */
                rc = iops->next(env, di);
                if (rc != 0)
                        GOTO(put, rc);
        }

        do {
                if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
                    unlikely(!thread_is_running(thread)))
                        GOTO(put, rc = 0);

                key = iops->key(env, di);
                if (IS_ERR(key)) {
                        rc = PTR_ERR(key);
                        /* -ENOENT is reported as "end of this file". */
                        if (rc == -ENOENT)
                                GOTO(put, rc = 1);

                        /* Other errors are accounted as a failure below. */
                        goto checkpoint;
                }

                lldk_be_to_cpu(parent,
                                (const struct lfsck_layout_dangling_key *)key);
                /* Skip (but still count) a record with an insane parent FID. */
                if (!fid_is_sane(&parent->lldk_fid)) {
                        rc = 0;
                        goto checkpoint;
                }

                rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
                if (rc == 0) {
                        fid_be_to_cpu(cfid, cfid);
                        /* The OST index travels in f_ver; take it out and
                         * clear the field before the FID is used. */
                        ost_idx = cfid->f_ver;
                        cfid->f_ver = 0;
                        if (!fid_is_sane(cfid)) {
                                rc = 0;
                                goto checkpoint;
                        }

                        rc = lfsck_layout_repair_dangling(env, com,
                                        &parent->lldk_fid, cfid,
                                        parent->lldk_comp_id,
                                        parent->lldk_ea_off, ost_idx);
                }

checkpoint:
                /* Account the record; the scan position only advances when
                 * the record was handled without error. */
                down_write(&com->lc_sem);
                com->lc_new_checked++;
                com->lc_new_scanned++;
                if (rc >= 0)
                        lo->ll_lldk_latest_scanned_phase2 = *parent;

                if (rc > 0)
                        lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
                else if (rc < 0)
                        lo->ll_objs_failed_phase2++;
                up_write(&com->lc_sem);

                /* Stop at the first failure only if LPF_FAILOUT is set. */
                if (rc < 0 && bk->lb_param & LPF_FAILOUT)
                        GOTO(put, rc);

                /* Time for a checkpoint: fold the new counters into the
                 * in-RAM layout file, store it, and schedule the next one. */
                if (unlikely(com->lc_time_next_checkpoint <=
                             ktime_get_seconds()) &&
                    com->lc_new_checked != 0) {
                        down_write(&com->lc_sem);
                        lo->ll_run_time_phase2 += ktime_get_seconds() -
                                                  com->lc_time_last_checkpoint;
                        lo->ll_time_last_checkpoint = ktime_get_real_seconds();
                        lo->ll_objs_checked_phase2 += com->lc_new_checked;
                        com->lc_new_checked = 0;
                        lfsck_layout_store(env, com);
                        up_write(&com->lc_sem);

                        com->lc_time_last_checkpoint = ktime_get_seconds();
                        com->lc_time_next_checkpoint =
                                com->lc_time_last_checkpoint +
                                LFSCK_CHECKPOINT_INTERVAL;
                }

                lfsck_control_speed_by_self(com);
                if (unlikely(!thread_is_running(thread)))
                        GOTO(put, rc = 0);

                /* Any non-zero result ends the loop and is returned as is. */
                rc = iops->next(env, di);
        } while (rc == 0);

        GOTO(put, rc);

put:
        iops->put(env, di);

fini:
        iops->fini(env, di);

        return rc;
}
4743
4744 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4745                                              struct lfsck_component *com)
4746 {
4747         struct lfsck_assistant_data     *lad    = com->lc_data;
4748         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4749         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
4750         struct lfsck_tgt_descs          *ltds   = &lfsck->li_ost_descs;
4751         struct lfsck_tgt_desc           *ltd;
4752         int                              rc     = 0;
4753         ENTRY;
4754
4755         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4756                lfsck_lfsck2name(lfsck));
4757
4758         spin_lock(&ltds->ltd_lock);
4759         while (!list_empty(&lad->lad_ost_phase2_list)) {
4760                 ltd = list_first_entry(&lad->lad_ost_phase2_list,
4761                                        struct lfsck_tgt_desc,
4762                                        ltd_layout_phase_list);
4763                 list_del_init(&ltd->ltd_layout_phase_list);
4764                 if (bk->lb_param & LPF_OST_ORPHAN) {
4765                         spin_unlock(&ltds->ltd_lock);
4766                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4767                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4768                                 RETURN(rc);
4769
4770                         if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
4771                                      !thread_is_running(&lfsck->li_thread)))
4772                                 RETURN(0);
4773                         spin_lock(&ltds->ltd_lock);
4774                 }
4775         }
4776
4777         if (list_empty(&lad->lad_ost_phase1_list))
4778                 rc = 1;
4779         else
4780                 rc = 0;
4781         spin_unlock(&ltds->ltd_lock);
4782
4783         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4784                 struct lfsck_layout *lo = com->lc_file_ram;
4785                 int i;
4786
4787                 com->lc_new_checked = 0;
4788                 com->lc_new_scanned = 0;
4789                 com->lc_time_last_checkpoint = ktime_get_seconds();
4790                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4791                                                LFSCK_CHECKPOINT_INTERVAL;
4792
4793                 i = lfsck_sub_trace_file_fid2idx(
4794                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4795                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4796                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4797                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4798                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4799                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4800
4801                 CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop "
4802                        "at the No. %d trace file: rc = %d\n",
4803                        lfsck_lfsck2name(lfsck), i, rc);
4804         }
4805
4806         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4807                lfsck_lfsck2name(lfsck), rc);
4808
4809         RETURN(rc);
4810 }
4811
4812 static int
4813 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4814                                    struct ptlrpc_request *req,
4815                                    void *args, int rc)
4816 {
4817         struct lfsck_layout_slave_async_args *llsaa = args;
4818         struct obd_export *exp = llsaa->llsaa_exp;
4819         struct lfsck_component *com = llsaa->llsaa_com;
4820         struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst;
4821         struct lfsck_layout_slave_data *llsd = com->lc_data;
4822         struct lfsck_reply *lr = NULL;
4823         bool done = false;
4824
4825         if (rc != 0) {
4826                 /* It is probably caused by network trouble, or target crash,
4827                  * it will try several times (depends on the obd_timeout, and
4828                  * will not less than 3 times). But to make the LFSCK can go
4829                  * ahead, we should not try for ever. After some try but still
4830                  * hit failure, it will assume that the target exit the LFSCK
4831                  * prcoessing and stop try. */
4832                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4833                         int max_try = max_t(int, obd_timeout / 30, 3);
4834
4835                         if (++(llst->llst_failures) > max_try)
4836                                 done = true;
4837                 } else {
4838                         done = true;
4839                 }
4840         } else {
4841                 llst->llst_failures = 0;
4842                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4843                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4844                     lr->lr_status != LS_SCANNING_PHASE2)
4845                         done = true;
4846         }
4847
4848         if (done) {
4849                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
4850                        "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
4851                        llst->llst_index, lr != NULL ? lr->lr_status : rc,
4852                        llst->llst_failures);
4853
4854                 lfsck_layout_llst_del(llsd, llst);
4855         }
4856
4857         lfsck_layout_llst_put(llst);
4858         lfsck_component_put(env, com);
4859         class_export_put(exp);
4860
4861         return 0;
4862 }
4863
4864 static int lfsck_layout_async_query(const struct lu_env *env,
4865                                     struct lfsck_component *com,
4866                                     struct obd_export *exp,
4867                                     struct lfsck_layout_slave_target *llst,
4868                                     struct lfsck_request *lr,
4869                                     struct ptlrpc_request_set *set)
4870 {
4871         struct lfsck_layout_slave_async_args *llsaa;
4872         struct ptlrpc_request                *req;
4873         struct lfsck_request                 *tmp;
4874         int                                   rc;
4875         ENTRY;
4876
4877         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4878         if (req == NULL)
4879                 RETURN(-ENOMEM);
4880
4881         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4882         if (rc != 0) {
4883                 ptlrpc_request_free(req);
4884                 RETURN(rc);
4885         }
4886
4887         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4888         *tmp = *lr;
4889         ptlrpc_request_set_replen(req);
4890
4891         llsaa = ptlrpc_req_async_args(llsaa, req);
4892         llsaa->llsaa_exp = exp;
4893         llsaa->llsaa_com = lfsck_component_get(com);
4894         llsaa->llsaa_llst = llst;
4895         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4896         req->rq_allow_intr = 1;
4897         req->rq_no_delay = 1;
4898         ptlrpc_set_add_req(set, req);
4899
4900         RETURN(0);
4901 }
4902
4903 static int lfsck_layout_async_notify(const struct lu_env *env,
4904                                      struct obd_export *exp,
4905                                      struct lfsck_request *lr,
4906                                      struct ptlrpc_request_set *set)
4907 {
4908         struct ptlrpc_request   *req;
4909         struct lfsck_request    *tmp;
4910         int                      rc;
4911         ENTRY;
4912
4913         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4914         if (req == NULL)
4915                 RETURN(-ENOMEM);
4916
4917         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4918         if (rc != 0) {
4919                 ptlrpc_request_free(req);
4920                 RETURN(rc);
4921         }
4922
4923         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4924         *tmp = *lr;
4925         ptlrpc_request_set_replen(req);
4926         req->rq_allow_intr = 1;
4927         req->rq_no_delay = 1;
4928         ptlrpc_set_add_req(set, req);
4929
4930         RETURN(0);
4931 }
4932
4933 static int
4934 lfsck_layout_slave_query_master(const struct lu_env *env,
4935                                 struct lfsck_component *com)
4936 {
4937         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4938         struct lfsck_instance            *lfsck = com->lc_lfsck;
4939         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4940         struct lfsck_layout_slave_target *llst;
4941         struct obd_export                *exp;
4942         struct ptlrpc_request_set        *set;
4943         int                               rc    = 0;
4944         int                               rc1   = 0;
4945         ENTRY;
4946
4947         set = ptlrpc_prep_set();
4948         if (set == NULL)
4949                 GOTO(log, rc = -ENOMEM);
4950
4951         memset(lr, 0, sizeof(*lr));
4952         lr->lr_event = LE_QUERY;
4953         lr->lr_active = LFSCK_TYPE_LAYOUT;
4954
4955         llsd->llsd_touch_gen++;
4956         spin_lock(&llsd->llsd_lock);
4957         while (!list_empty(&llsd->llsd_master_list)) {
4958                 llst = list_first_entry(&llsd->llsd_master_list,
4959                                         struct lfsck_layout_slave_target,
4960                                         llst_list);
4961                 if (llst->llst_gen == llsd->llsd_touch_gen)
4962                         break;
4963
4964                 llst->llst_gen = llsd->llsd_touch_gen;
4965                 list_move_tail(&llst->llst_list,
4966                                &llsd->llsd_master_list);
4967                 atomic_inc(&llst->llst_ref);
4968                 spin_unlock(&llsd->llsd_lock);
4969
4970                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4971                                                llst->llst_index);
4972                 if (exp == NULL) {
4973                         lfsck_layout_llst_del(llsd, llst);
4974                         lfsck_layout_llst_put(llst);
4975                         spin_lock(&llsd->llsd_lock);
4976                         continue;
4977                 }
4978
4979                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
4980                 if (rc != 0) {
4981                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4982                                "query %s for layout: rc = %d\n",
4983                                lfsck_lfsck2name(lfsck),
4984                                exp->exp_obd->obd_name, rc);
4985
4986                         rc1 = rc;
4987                         lfsck_layout_llst_put(llst);
4988                         class_export_put(exp);
4989                 }
4990                 spin_lock(&llsd->llsd_lock);
4991         }
4992         spin_unlock(&llsd->llsd_lock);
4993
4994         rc = ptlrpc_set_wait(env, set);
4995         ptlrpc_set_destroy(set);
4996
4997         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
4998
4999 log:
5000         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
5001                lfsck_lfsck2name(com->lc_lfsck), rc);
5002
5003         return rc;
5004 }
5005
5006 static void
5007 lfsck_layout_slave_notify_master(const struct lu_env *env,
5008                                  struct lfsck_component *com,
5009                                  enum lfsck_events event, int result)
5010 {
5011         struct lfsck_layout              *lo    = com->lc_file_ram;
5012         struct lfsck_instance            *lfsck = com->lc_lfsck;
5013         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
5014         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
5015         struct lfsck_layout_slave_target *llst;
5016         struct obd_export                *exp;
5017         struct ptlrpc_request_set        *set;
5018         int                               rc;
5019         ENTRY;
5020
5021         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
5022                lfsck_lfsck2name(com->lc_lfsck));
5023
5024         set = ptlrpc_prep_set();
5025         if (set == NULL)
5026                 RETURN_EXIT;
5027
5028         memset(lr, 0, sizeof(*lr));
5029         lr->lr_event = event;
5030         lr->lr_flags = LEF_FROM_OST;
5031         lr->lr_status = result;
5032         lr->lr_index = lfsck_dev_idx(lfsck);
5033         lr->lr_active = LFSCK_TYPE_LAYOUT;
5034         lr->lr_flags2 = lo->ll_flags;
5035         llsd->llsd_touch_gen++;
5036         spin_lock(&llsd->llsd_lock);
5037         while (!list_empty(&llsd->llsd_master_list)) {
5038                 llst = list_first_entry(&llsd->llsd_master_list,
5039                                         struct lfsck_layout_slave_target,
5040                                         llst_list);
5041                 if (llst->llst_gen == llsd->llsd_touch_gen)
5042                         break;
5043
5044                 llst->llst_gen = llsd->llsd_touch_gen;
5045                 list_move_tail(&llst->llst_list,
5046                                &llsd->llsd_master_list);
5047                 atomic_inc(&llst->llst_ref);
5048                 spin_unlock(&llsd->llsd_lock);
5049
5050                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
5051                                                llst->llst_index);
5052                 if (exp == NULL) {
5053                         lfsck_layout_llst_del(llsd, llst);
5054                         lfsck_layout_llst_put(llst);
5055                         spin_lock(&llsd->llsd_lock);
5056                         continue;
5057                 }
5058
5059                 rc = lfsck_layout_async_notify(env, exp, lr, set);
5060                 if (rc != 0)
5061                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
5062                                "notify %s for layout: rc = %d\n",
5063                                lfsck_lfsck2name(lfsck),
5064                                exp->exp_obd->obd_name, rc);
5065
5066                 lfsck_layout_llst_put(llst);
5067                 class_export_put(exp);
5068                 spin_lock(&llsd->llsd_lock);
5069         }
5070         spin_unlock(&llsd->llsd_lock);
5071
5072         ptlrpc_set_wait(env, set);
5073         ptlrpc_set_destroy(set);
5074
5075         RETURN_EXIT;
5076 }
5077
5078 /*
5079  * \ret -ENODATA: unrecognized stripe
5080  * \ret = 0     : recognized stripe
5081  * \ret < 0     : other failures
5082  */
5083 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
5084                                            struct lfsck_component *com,
5085                                            struct lu_fid *cfid,
5086                                            struct lu_fid *pfid, __u32 comp_id)
5087 {
5088         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5089         struct lu_buf                   *buf    = &info->lti_big_buf;
5090         struct ost_id                   *oi     = &info->lti_oi;
5091         struct dt_object                *obj;
5092         struct lov_mds_md_v1            *lmm;
5093         struct lov_ost_data_v1          *objs;
5094         __u32                            idx    = pfid->f_stripe_idx;
5095         __u32                            magic;
5096         int                              rc     = 0;
5097         int                              i;
5098         __u16                            count;
5099         ENTRY;
5100
5101         pfid->f_ver = 0;
5102         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
5103         if (IS_ERR(obj))
5104                 RETURN(PTR_ERR(obj));
5105
5106         dt_read_lock(env, obj, 0);
5107         if (unlikely(dt_object_exists(obj) == 0 ||
5108                      lfsck_is_dead_obj(obj)))
5109                 GOTO(unlock, rc = -ENOENT);
5110
5111         if (!S_ISREG(lfsck_object_type(obj)))
5112                 GOTO(unlock, rc = -ENODATA);
5113
5114         rc = lfsck_layout_get_lovea(env, obj, buf);
5115         if (rc < 0)
5116                 GOTO(unlock, rc);
5117
5118         lmm = buf->lb_buf;
5119         magic = le32_to_cpu(lmm->lmm_magic);
5120         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5121                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
5122                 struct lov_comp_md_entry_v1 *lcme;
5123
5124                 if (comp_id == 0)
5125                         GOTO(unlock, rc = -ENODATA);
5126
5127                 count = le16_to_cpu(lcm->lcm_entry_count);
5128                 for (i = 0; i < count; i++) {
5129                         lcme = &lcm->lcm_entries[i];
5130                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
5131                                 lmm = buf->lb_buf +
5132                                         le32_to_cpu(lcme->lcme_offset);
5133                                 magic = le32_to_cpu(lmm->lmm_magic);
5134                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5135                                       LCME_FL_INIT))
5136                                         GOTO(unlock, rc = -ENODATA);
5137
5138                                 goto further;
5139                         }
5140                 }
5141
5142                 GOTO(unlock, rc = -ENODATA);
5143         }
5144
5145 further:
5146         if (magic == LOV_MAGIC_V1) {
5147                 objs = &lmm->lmm_objects[0];
5148         } else {
5149                 LASSERT(magic == LOV_MAGIC_V3);
5150                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5151         }
5152
5153         fid_to_ostid(cfid, oi);
5154         count = le16_to_cpu(lmm->lmm_stripe_count);
5155         for (i = 0; i < count; i++, objs++) {
5156                 struct ost_id oi2;
5157
5158                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
5159                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
5160                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
5161         }
5162
5163         GOTO(unlock, rc = -ENODATA);
5164
5165 unlock:
5166         dt_read_unlock(env, obj);
5167         lfsck_object_put(env, obj);
5168
5169         return rc;
5170 }
5171
5172 /*
5173  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
5174  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
5175  * layout EA from MDT to OST. On one hand, the OST no need to understand
5176  * the layout EA structure; on the other hand, it may cause trouble when
5177  * transfer large layout EA from MDT to OST via normal OUT RPC.
5178  *
5179  * \ret > 0: unrecognized stripe
5180  * \ret = 0: recognized stripe
5181  * \ret < 0: other failures
5182  */
5183 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
5184                                           struct lfsck_component *com,
5185                                           struct lu_fid *cfid,
5186                                           struct lu_fid *pfid, __u32 comp_id)
5187 {
5188         struct lfsck_instance    *lfsck  = com->lc_lfsck;
5189         struct obd_device        *obd    = lfsck->li_obd;
5190         struct seq_server_site   *ss     = lfsck_dev_site(lfsck);
5191         struct obd_export        *exp    = NULL;
5192         struct ptlrpc_request    *req    = NULL;
5193         struct lfsck_request     *lr;
5194         struct lu_seq_range      *range  = &lfsck_env_info(env)->lti_range;
5195         int                       rc     = 0;
5196         ENTRY;
5197
5198         if (unlikely(fid_is_idif(pfid)))
5199                 RETURN(1);
5200
5201         fld_range_set_any(range);
5202         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
5203         if (rc != 0)
5204                 RETURN(rc == -ENOENT ? 1 : rc);
5205
5206         if (unlikely(!fld_range_is_mdt(range)))
5207                 RETURN(1);
5208
5209         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
5210         if (unlikely(exp == NULL))
5211                 RETURN(1);
5212
5213         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
5214                 GOTO(out, rc = -EOPNOTSUPP);
5215
5216         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
5217         if (req == NULL)
5218                 GOTO(out, rc = -ENOMEM);
5219
5220         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
5221         if (rc != 0) {
5222                 ptlrpc_request_free(req);
5223
5224                 GOTO(out, rc);
5225         }
5226
5227         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
5228         memset(lr, 0, sizeof(*lr));
5229         lr->lr_event = LE_PAIRS_VERIFY;
5230         lr->lr_active = LFSCK_TYPE_LAYOUT;
5231         lr->lr_fid = *cfid; /* OST-object itself FID. */
5232         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
5233         lr->lr_comp_id = comp_id;
5234
5235         ptlrpc_request_set_replen(req);
5236         rc = ptlrpc_queue_wait(req);
5237         ptlrpc_req_finished(req);
5238
5239         if (rc == -ENOENT || rc == -ENODATA)
5240                 rc = 1;
5241
5242         GOTO(out, rc);
5243
5244 out:
5245         if (exp != NULL)
5246                 class_export_put(exp);
5247
5248         return rc;
5249 }
5250
5251 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
5252                                           struct lfsck_component *com,
5253                                           struct lfsck_req_local *lrl)
5254 {
5255         struct dt_object        *obj;
5256         int                      rc     = 0;
5257         ENTRY;
5258
5259         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
5260         if (IS_ERR(obj))
5261                 GOTO(log, rc = PTR_ERR(obj));
5262
5263         rc = __lfsck_layout_update_pfid(env, com, obj,
5264                                         &lrl->lrl_ff_client.ff_parent,
5265                                         &lrl->lrl_ff_client.ff_layout,
5266                                         lrl->lrl_ff_client.ff_layout_version,
5267                                         lrl->lrl_ff_client.ff_range,
5268                                         lrl->lrl_ff_client.ff_parent.f_ver);
5269
5270         lfsck_object_put(env, obj);
5271
5272 log:
5273         CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID
5274                ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
5275                PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc);
5276
5277         return rc;
5278 }
5279
5280 /* layout APIs */
5281
5282 static void lfsck_layout_slave_quit(const struct lu_env *env,
5283                                     struct lfsck_component *com);
5284
5285 static int lfsck_layout_reset(const struct lu_env *env,
5286                               struct lfsck_component *com, bool init)
5287 {
5288         struct lfsck_layout     *lo    = com->lc_file_ram;
5289         int                      rc;
5290
5291         down_write(&com->lc_sem);
5292         if (init) {
5293                 memset(lo, 0, com->lc_file_size);
5294         } else {
5295                 __u32 count = lo->ll_success_count;
5296                 time64_t last_time = lo->ll_time_last_complete;
5297
5298                 memset(lo, 0, com->lc_file_size);
5299                 lo->ll_success_count = count;
5300                 lo->ll_time_last_complete = last_time;
5301         }
5302
5303         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
5304         lo->ll_status = LS_INIT;
5305
5306         if (com->lc_lfsck->li_master) {
5307                 struct lfsck_assistant_data *lad = com->lc_data;
5308
5309                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
5310                 bitmap_zero(lad->lad_bitmap, lad->lad_bitmap_count);
5311         }
5312
5313         rc = lfsck_layout_store(env, com);
5314         if (rc == 0 && com->lc_lfsck->li_master)
5315                 rc = lfsck_load_sub_trace_files(env, com,
5316                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
5317         up_write(&com->lc_sem);
5318
5319         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
5320                lfsck_lfsck2name(com->lc_lfsck), rc);
5321
5322         return rc;
5323 }
5324
5325 static void lfsck_layout_fail(const struct lu_env *env,
5326                               struct lfsck_component *com, bool new_checked)
5327 {
5328         struct lfsck_layout *lo = com->lc_file_ram;
5329
5330         down_write(&com->lc_sem);
5331         if (new_checked)
5332                 com->lc_new_checked++;
5333         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
5334         up_write(&com->lc_sem);
5335 }
5336
5337 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
5338                                           struct lfsck_component *com, bool init)
5339 {
5340         struct lfsck_instance   *lfsck   = com->lc_lfsck;
5341         struct lfsck_layout     *lo      = com->lc_file_ram;
5342         int                      rc;
5343
5344         if (!init) {
5345                 rc = lfsck_checkpoint_generic(env, com);
5346                 if (rc != 0)
5347                         return rc > 0 ? 0 : rc;
5348         }
5349
5350         down_write(&com->lc_sem);
5351         if (init) {
5352                 lo->ll_pos_latest_start =
5353                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5354         } else {
5355                 lo->ll_pos_last_checkpoint =
5356                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5357                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5358                                           lfsck->li_time_last_checkpoint;
5359                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5360                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5361                 com->lc_new_checked = 0;
5362         }
5363
5364         rc = lfsck_layout_store(env, com);
5365         up_write(&com->lc_sem);
5366
5367         CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos ["
5368                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5369                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5370
5371         return rc;
5372 }
5373
5374 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5375                                          struct lfsck_component *com, bool init)
5376 {
5377         struct lfsck_instance   *lfsck = com->lc_lfsck;
5378         struct lfsck_layout     *lo    = com->lc_file_ram;
5379         int                      rc;
5380
5381         if (com->lc_new_checked == 0 && !init)
5382                 return 0;
5383
5384         down_write(&com->lc_sem);
5385         if (init) {
5386                 lo->ll_pos_latest_start =
5387                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5388         } else {
5389                 lo->ll_pos_last_checkpoint =
5390                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5391                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5392                                           lfsck->li_time_last_checkpoint;
5393                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5394                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5395                 com->lc_new_checked = 0;
5396         }
5397
5398         rc = lfsck_layout_store(env, com);
5399         up_write(&com->lc_sem);
5400
5401         CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos ["
5402                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5403                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5404
5405         return rc;
5406 }
5407
5408 static int lfsck_layout_prep(const struct lu_env *env,
5409                              struct lfsck_component *com,
5410                              struct lfsck_start *start)
5411 {
5412         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5413         struct lfsck_layout     *lo     = com->lc_file_ram;
5414         struct lfsck_position   *pos    = &com->lc_pos_start;
5415
5416         fid_zero(&pos->lp_dir_parent);
5417         pos->lp_dir_cookie = 0;
5418         if (lo->ll_status == LS_COMPLETED ||
5419             lo->ll_status == LS_PARTIAL ||
5420             /* To handle orphan, must scan from the beginning. */
5421             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5422                 int rc;
5423
5424                 rc = lfsck_layout_reset(env, com, false);
5425                 if (rc == 0)
5426                         rc = lfsck_set_param(env, lfsck, start, true);
5427
5428                 if (rc != 0) {
5429                         CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: "
5430                                "rc = %d\n", lfsck_lfsck2name(lfsck), rc);
5431
5432                         return rc;
5433                 }
5434         }
5435
5436         down_write(&com->lc_sem);
5437         lo->ll_time_latest_start = ktime_get_real_seconds();
5438         spin_lock(&lfsck->li_lock);
5439         if (lo->ll_flags & LF_SCANNED_ONCE) {
5440                 if (!lfsck->li_drop_dryrun ||
5441                     lo->ll_pos_first_inconsistent == 0) {
5442                         lo->ll_status = LS_SCANNING_PHASE2;
5443                         list_move_tail(&com->lc_link,
5444                                        &lfsck->li_list_double_scan);
5445                         pos->lp_oit_cookie = 0;
5446                 } else {
5447                         int i;
5448
5449                         lo->ll_status = LS_SCANNING_PHASE1;
5450                         lo->ll_run_time_phase1 = 0;
5451                         lo->ll_run_time_phase2 = 0;
5452                         lo->ll_objs_checked_phase1 = 0;
5453                         lo->ll_objs_checked_phase2 = 0;
5454                         lo->ll_objs_failed_phase1 = 0;
5455                         lo->ll_objs_failed_phase2 = 0;
5456                         for (i = 0; i < LLIT_MAX; i++)
5457                                 lo->ll_objs_repaired[i] = 0;
5458
5459                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5460                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5461                 }
5462         } else {
5463                 lo->ll_status = LS_SCANNING_PHASE1;
5464                 if (!lfsck->li_drop_dryrun ||
5465                     lo->ll_pos_first_inconsistent == 0)
5466                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5467                 else
5468                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5469         }
5470         spin_unlock(&lfsck->li_lock);
5471         up_write(&com->lc_sem);
5472
5473         return 0;
5474 }
5475
5476 static int lfsck_layout_slave_prep(const struct lu_env *env,
5477                                    struct lfsck_component *com,
5478                                    struct lfsck_start_param *lsp)
5479 {
5480         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5481         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5482         struct lfsck_layout             *lo     = com->lc_file_ram;
5483         struct lfsck_start              *start  = lsp->lsp_start;
5484         int                              rc;
5485
5486         rc = lfsck_layout_prep(env, com, start);
5487         if (rc != 0)
5488                 return rc;
5489
5490         if (lo->ll_flags & LF_CRASHED_LASTID &&
5491             list_empty(&llsd->llsd_master_list)) {
5492                 LASSERT(lfsck->li_out_notify != NULL);
5493
5494                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5495                                      LE_LASTID_REBUILDING);
5496         }
5497
5498         if (!lsp->lsp_index_valid)
5499                 return 0;
5500
5501         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5502         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5503                 LASSERT(!llsd->llsd_rbtree_valid);
5504
5505                 down_write(&llsd->llsd_rb_rwsem);
5506                 rc = lfsck_rbtree_setup(env, com);
5507                 up_write(&llsd->llsd_rb_rwsem);
5508         }
5509
5510         CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos ["
5511                "%llu]\n", lfsck_lfsck2name(lfsck),
5512                com->lc_pos_start.lp_oit_cookie);
5513
5514         return rc;
5515 }
5516
5517 static int lfsck_layout_master_prep(const struct lu_env *env,
5518                                     struct lfsck_component *com,
5519                                     struct lfsck_start_param *lsp)
5520 {
5521         int rc;
5522         ENTRY;
5523
5524         rc = lfsck_layout_load_bitmap(env, com);
5525         if (rc != 0) {
5526                 rc = lfsck_layout_reset(env, com, false);
5527                 if (rc == 0)
5528                         rc = lfsck_set_param(env, com->lc_lfsck,
5529                                              lsp->lsp_start, true);
5530
5531                 if (rc != 0)
5532                         GOTO(log, rc);
5533         }
5534
5535         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5536         if (rc != 0)
5537                 RETURN(rc);
5538
5539         rc = lfsck_start_assistant(env, com, lsp);
5540
5541         GOTO(log, rc);
5542
5543 log:
5544         CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos ["
5545                "%llu]\n", lfsck_lfsck2name(com->lc_lfsck),
5546                com->lc_pos_start.lp_oit_cookie);
5547
5548         return 0;
5549 }
5550
5551 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5552 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5553                                      struct lfsck_component *com,
5554                                      struct dt_object *parent,
5555                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5556 {
5557         struct lfsck_thread_info        *info    = lfsck_env_info(env);
5558         struct lfsck_instance           *lfsck   = com->lc_lfsck;
5559         struct lfsck_bookmark           *bk      = &lfsck->li_bookmark_ram;
5560         struct lfsck_layout             *lo      = com->lc_file_ram;
5561         struct lfsck_assistant_data     *lad     = com->lc_data;
5562         struct lfsck_assistant_object   *lso     = NULL;
5563         struct lov_ost_data_v1          *objs;
5564         struct lfsck_tgt_descs          *ltds    = &lfsck->li_ost_descs;
5565         struct ptlrpc_thread            *mthread = &lfsck->li_thread;
5566         struct ptlrpc_thread            *athread = &lad->lad_thread;
5567         struct lu_buf                    buf;
5568         int                              rc      = 0;
5569         int                              i;
5570         __u32                            magic;
5571         __u16                            count;
5572         ENTRY;
5573
5574         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5575         magic = le32_to_cpu(lmm->lmm_magic);
5576         if (magic == LOV_MAGIC_V1) {
5577                 objs = &lmm->lmm_objects[0];
5578         } else {
5579                 LASSERT(magic == LOV_MAGIC_V3);
5580                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5581         }
5582
5583         count = le16_to_cpu(lmm->lmm_stripe_count);
5584         for (i = 0; i < count; i++, objs++) {
5585                 struct lu_fid           *fid    = &info->lti_fid;
5586                 struct ost_id           *oi     = &info->lti_oi;
5587                 struct lfsck_layout_req *llr;
5588                 struct lfsck_tgt_desc   *tgt    = NULL;
5589                 struct dt_object        *cobj   = NULL;
5590                 __u32                    index;
5591                 bool                     wakeup = false;
5592
5593                 if (unlikely(lovea_slot_is_dummy(objs)))
5594                         continue;
5595
5596                 wait_event_idle(mthread->t_ctl_waitq,
5597                                 lad->lad_prefetched < bk->lb_async_windows ||
5598                                 !thread_is_running(mthread) ||
5599                                 thread_is_stopped(athread));
5600
5601                 if (unlikely(!thread_is_running(mthread)) ||
5602                              thread_is_stopped(athread))
5603                         GOTO(out, rc = 0);
5604
5605                 if (unlikely(lfsck_is_dead_obj(parent)))
5606                         GOTO(out, rc = 0);
5607
5608                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5609                 index = le32_to_cpu(objs->l_ost_idx);
5610                 rc = ostid_to_fid(fid, oi, index);
5611                 if (rc != 0) {
5612                         CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID
5613                                ": "DOSTID", idx %u, comp_id %u\n",
5614                                lfsck_lfsck2name(lfsck),
5615                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5616                                index, comp_id);
5617                         goto next;
5618                 }
5619
5620                 tgt = lfsck_tgt_get(ltds, index);
5621                 if (unlikely(tgt == NULL)) {
5622                         CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which "
5623                                "did not join the layout LFSCK, comp_id %u\n",
5624                                lfsck_lfsck2name(lfsck), index, comp_id);
5625                         lfsck_lad_set_bitmap(env, com, index);
5626                         goto next;
5627                 }
5628
5629                 /* There is potential deadlock race condition between object
5630                  * destroy and layout LFSCK. Consider the following scenario:
5631                  *
5632                  * 1) The LFSCK thread obtained the parent object firstly, at
5633                  *    that time, the parent object has not been destroyed yet.
5634                  *
5635                  * 2) One RPC service thread destroyed the parent and all its
5636                  *    children objects. Because the LFSCK is referencing the
5637                  *    parent object, then the parent object will be marked as
5638                  *    dying in RAM. On the other hand, the parent object is
5639                  *    referencing all its children objects, then all children
5640                  *    objects will be marked as dying in RAM also.
5641                  *
5642                  * 3) The LFSCK thread tries to find some child object with
5643                  *    the parent object referenced. Then it will find that the
5644                  *    child object is dying. According to the object visibility
5645                  *    rules: the object with dying flag cannot be returned to
5646                  *    others. So the LFSCK thread has to wait until the dying
5647                  *    object has been purged from RAM, then it can allocate a
5648                  *    new object (with the same FID) in RAM. Unfortunately, the
5649                  *    LFSCK thread itself is referencing the parent object, and
5650                  *    cause the parent object cannot be purged, then cause the
5651                  *    child object cannot be purged also. So the LFSCK thread
5652                  *    will fall into deadlock.
5653                  */
5654                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5655                 if (IS_ERR(cobj)) {
5656                         if (lfsck_is_dead_obj(parent)) {
5657                                 lfsck_tgt_put(tgt);
5658
5659                                 GOTO(out, rc = 0);
5660                         }
5661
5662                         rc = PTR_ERR(cobj);
5663                         goto next;
5664                 }
5665
5666                 rc = dt_declare_attr_get(env, cobj);
5667                 if (rc)
5668                         goto next;
5669
5670                 rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
5671                 if (rc)
5672                         goto next;
5673
5674                 if (lso == NULL) {
5675                         struct lu_attr *attr = &info->lti_la;
5676
5677                         rc = dt_attr_get(env, parent, attr);
5678                         if (rc != 0)
5679                                 goto next;
5680
5681                         lso = lfsck_assistant_object_init(env,
5682                                 lfsck_dto2fid(parent), attr,
5683                                 lfsck->li_pos_current.lp_oit_cookie, false);
5684                         if (IS_ERR(lso)) {
5685                                 rc = PTR_ERR(lso);
5686                                 lso = NULL;
5687
5688                                 goto next;
5689                         }
5690                 }
5691
5692                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5693                                                       index, i);
5694                 if (IS_ERR(llr)) {
5695                         rc = PTR_ERR(llr);
5696                         goto next;
5697                 }
5698
5699                 cobj = NULL;
5700                 spin_lock(&lad->lad_lock);
5701                 if (lad->lad_assistant_status < 0) {
5702                         spin_unlock(&lad->lad_lock);
5703                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5704                         lfsck_tgt_put(tgt);
5705                         RETURN(lad->lad_assistant_status);
5706                 }
5707
5708                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5709                 if (lad->lad_prefetched == 0)
5710                         wakeup = true;
5711
5712                 lad->lad_prefetched++;
5713                 spin_unlock(&lad->lad_lock);
5714                 if (wakeup)
5715                         wake_up(&athread->t_ctl_waitq);
5716
5717 next:
5718                 down_write(&com->lc_sem);
5719                 com->lc_new_checked++;
5720                 if (rc < 0)
5721                         lfsck_layout_record_failure(env, lfsck, lo);
5722                 up_write(&com->lc_sem);
5723
5724                 if (cobj != NULL && !IS_ERR(cobj))
5725                         lfsck_object_put(env, cobj);
5726
5727                 if (likely(tgt != NULL))
5728                         lfsck_tgt_put(tgt);
5729
5730                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5731                         GOTO(out, rc);
5732         }
5733
5734         GOTO(out, rc = 0);
5735
5736 out:
5737         if (lso != NULL)
5738                 lfsck_assistant_object_put(env, lso);
5739
5740         return rc;
5741 }
5742
5743 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5744  * the OST-object's attribute and generate an structure lfsck_layout_req on the
5745  * list ::lad_req_list.
5746  *
5747  * For each request on above list, the lfsck_layout_assistant thread compares
5748  * the OST side attribute with local attribute, if inconsistent, then repair it.
5749  *
5750  * All above processing is async mode with pipeline. */
5751 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5752                                         struct lfsck_component *com,
5753                                         struct dt_object *obj)
5754 {
5755         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5756         struct ost_id                   *oi     = &info->lti_oi;
5757         struct lfsck_layout             *lo     = com->lc_file_ram;
5758         struct lfsck_assistant_data     *lad    = com->lc_data;
5759         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5760         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
5761         struct thandle                  *handle = NULL;
5762         struct lu_buf                   *buf    = &info->lti_big_buf;
5763         struct lov_mds_md_v1            *lmm    = NULL;
5764         struct dt_device                *dev    = lfsck_obj2dev(obj);
5765         struct lustre_handle             lh     = { 0 };
5766         struct lu_buf                    ea_buf = { NULL };
5767         struct lov_comp_md_v1           *lcm    = NULL;
5768         struct lov_comp_md_entry_v1     *lcme   = NULL;
5769         int                              rc     = 0;
5770         int                              size   = 0;
5771         __u32                            magic  = 0;
5772         __u16                            count  = 0;
5773         bool                             locked = false;
5774         bool                             stripe = false;
5775         bool                             bad_oi = false;
5776         ENTRY;
5777
5778         if (!S_ISREG(lfsck_object_type(obj)))
5779                 GOTO(out, rc = 0);
5780
5781         if (lad->lad_assistant_status < 0)
5782                 GOTO(out, rc = -ESRCH);
5783
5784         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5785         lmm_oi_cpu_to_le(oi, oi);
5786         dt_read_lock(env, obj, 0);
5787         locked = true;
5788
5789 again:
5790         bad_oi = false;
5791         if (dt_object_exists(obj) == 0 ||
5792             lfsck_is_dead_obj(obj))
5793                 GOTO(out, rc = 0);
5794
5795         rc = lfsck_layout_get_lovea(env, obj, buf);
5796         if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP)
5797                 /* Skip bad lov EA during the 1st cycle scanning, and
5798                  * try to recover it via orphan in the 2nd scanning. */
5799                 rc = 0;
5800         if (rc <= 0)
5801                 GOTO(out, rc);
5802
5803         size = rc;
5804         lmm = buf->lb_buf;
5805         magic = le32_to_cpu(lmm->lmm_magic);
5806         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5807                 struct lov_mds_md_v1 *v1;
5808                 int i;
5809
5810                 lcm = buf->lb_buf;
5811                 count = le16_to_cpu(lcm->lcm_entry_count);
5812                 for (i = 0; i < count; i++) {
5813                         lcme = &lcm->lcm_entries[i];
5814                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5815                         if (memcmp(oi, &v1->lmm_oi, sizeof(*oi)) != 0)
5816                                 goto fix;
5817                 }
5818
5819                 GOTO(out, stripe = true);
5820         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5821                 GOTO(out, stripe = true);
5822         }
5823
5824 fix:
5825         /* Inconsistent lmm_oi, should be repaired. */
5826         bad_oi = true;
5827
5828         if (bk->lb_param & LPF_DRYRUN) {
5829                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5830
5831                 GOTO(out, stripe = true);
5832         }
5833
5834         if (!lustre_handle_is_used(&lh)) {
5835                 dt_read_unlock(env, obj);
5836                 locked = false;
5837                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5838                                       MDS_INODELOCK_LAYOUT |
5839                                       MDS_INODELOCK_XATTR, LCK_EX);
5840                 if (rc != 0)
5841                         GOTO(out, rc);
5842
5843                 handle = lfsck_trans_create(env, dev, lfsck);
5844                 if (IS_ERR(handle))
5845                         GOTO(out, rc = PTR_ERR(handle));
5846
5847                 lfsck_buf_init(&ea_buf, buf->lb_buf, size);
5848                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5849                                           LU_XATTR_REPLACE, handle);
5850                 if (rc != 0)
5851                         GOTO(out, rc);
5852
5853                 rc = dt_trans_start_local(env, dev, handle);
5854                 if (rc != 0)
5855                         GOTO(out, rc);
5856
5857                 dt_write_lock(env, obj, 0);
5858                 locked = true;
5859
5860                 goto again;
5861         }
5862
5863         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5864                 struct lov_mds_md_v1 *v1;
5865                 int i;
5866
5867                 for (i = 0; i < count; i++) {
5868                         lcme = &lcm->lcm_entries[i];
5869                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5870                         v1->lmm_oi = *oi;
5871                 }
5872         } else {
5873                 lmm->lmm_oi = *oi;
5874         }
5875
5876         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5877                           LU_XATTR_REPLACE, handle);
5878         if (rc != 0)
5879                 GOTO(out, rc);
5880
5881         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5882
5883         GOTO(out, stripe = true);
5884
5885 out:
5886         if (locked) {
5887                 if (lustre_handle_is_used(&lh))
5888                         dt_write_unlock(env, obj);
5889                 else
5890                         dt_read_unlock(env, obj);
5891         }
5892
5893         if (handle != NULL && !IS_ERR(handle))
5894                 dt_trans_stop(env, dev, handle);
5895
5896         lfsck_ibits_unlock(&lh, LCK_EX);
5897
5898         if (bad_oi)
5899                 CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for "
5900                        DFID": rc = %d\n", lfsck_lfsck2name(lfsck),
5901                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5902                        PFID(lfsck_dto2fid(obj)), rc);
5903
5904         if (stripe) {
5905                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5906                         int i;
5907
5908                         for (i = 0; i < count; i++) {
5909                                 lcme = &lcm->lcm_entries[i];
5910                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5911                                       LCME_FL_INIT))
5912                                         continue;
5913
5914                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5915                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5916                                         le32_to_cpu(lcme->lcme_offset)),
5917                                         le32_to_cpu(lcme->lcme_id));
5918                         }
5919                 } else {
5920                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5921                 }
5922         } else {
5923                 down_write(&com->lc_sem);
5924                 com->lc_new_checked++;
5925                 if (rc < 0)
5926                         lfsck_layout_record_failure(env, lfsck, lo);
5927                 up_write(&com->lc_sem);
5928         }
5929
5930         return rc;
5931 }
5932
5933 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5934                                        struct lfsck_component *com,
5935                                        struct dt_object *obj)
5936 {
5937         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5938         struct lfsck_layout             *lo     = com->lc_file_ram;
5939         const struct lu_fid             *fid    = lfsck_dto2fid(obj);
5940         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5941         struct lfsck_layout_seq         *lls;
5942         __u64                            seq;
5943         __u64                            oid;
5944         int                              rc;
5945         ENTRY;
5946
5947         LASSERT(llsd != NULL);
5948
5949         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5950             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5951                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5952
5953                 wait_event_idle_timeout(thread->t_ctl_waitq,
5954                                         !thread_is_running(thread),
5955                                         cfs_time_seconds(1));
5956         }
5957
5958         lfsck_rbtree_update_bitmap(env, com, fid, false);
5959
5960         down_write(&com->lc_sem);
5961         if (fid_is_idif(fid))
5962                 seq = 0;
5963         else if (!fid_is_norm(fid) ||
5964                  !fid_is_for_ostobj(env, lfsck, obj, fid))
5965                 GOTO(unlock, rc = 0);
5966         else
5967                 seq = fid_seq(fid);
5968         com->lc_new_checked++;
5969
5970         lls = lfsck_layout_seq_lookup(llsd, seq);
5971         if (lls == NULL) {
5972                 OBD_ALLOC_PTR(lls);
5973                 if (unlikely(lls == NULL))
5974                         GOTO(unlock, rc = -ENOMEM);
5975
5976                 INIT_LIST_HEAD(&lls->lls_list);
5977                 lls->lls_seq = seq;
5978                 rc = lfsck_layout_lastid_load(env, com, lls);
5979                 if (rc != 0) {
5980                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5981                               "load LAST_ID for %#llx: rc = %d\n",
5982                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
5983                         lo->ll_objs_failed_phase1++;
5984                         OBD_FREE_PTR(lls);
5985                         GOTO(unlock, rc);
5986                 }
5987
5988                 lfsck_layout_seq_insert(llsd, lls);
5989         }
5990
5991         if (unlikely(fid_is_last_id(fid)))
5992                 GOTO(unlock, rc = 0);
5993
5994         if (fid_is_idif(fid))
5995                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
5996         else
5997                 oid = fid_oid(fid);
5998
5999         if (oid > lls->lls_lastid_known)
6000                 lls->lls_lastid_known = oid;
6001
6002         if (oid > lls->lls_lastid) {
6003                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
6004                         /* OFD may create new objects during LFSCK scanning. */
6005                         rc = lfsck_layout_lastid_reload(env, com, lls);
6006                         if (unlikely(rc != 0)) {
6007                                 CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
6008                                       "reload LAST_ID for %#llx: rc = %d\n",
6009                                       lfsck_lfsck2name(com->lc_lfsck),
6010                                       lls->lls_seq, rc);
6011
6012                                 GOTO(unlock, rc);
6013                         }
6014
6015                         if (oid <= lls->lls_lastid ||
6016                             lo->ll_flags & LF_CRASHED_LASTID)
6017                                 GOTO(unlock, rc = 0);
6018
6019                         LASSERT(lfsck->li_out_notify != NULL);
6020
6021                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6022                                              LE_LASTID_REBUILDING);
6023                         lo->ll_flags |= LF_CRASHED_LASTID;
6024
6025                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
6026                                "LAST_ID file (2) for the sequence %#llx"
6027                                ", old value %llu, known value %llu\n",
6028                                lfsck_lfsck2name(lfsck), lls->lls_seq,
6029                                lls->lls_lastid, oid);
6030                 }
6031
6032                 lls->lls_lastid = oid;
6033                 lls->lls_dirty = 1;
6034         }
6035
6036         GOTO(unlock, rc = 0);
6037
6038 unlock:
6039         up_write(&com->lc_sem);
6040
6041         return rc;
6042 }
6043
6044 static int lfsck_layout_exec_dir(const struct lu_env *env,
6045                                  struct lfsck_component *com,
6046                                  struct lfsck_assistant_object *lso,
6047                                  struct lu_dirent *ent, __u16 type)
6048 {
6049         return 0;
6050 }
6051
6052 static int lfsck_layout_master_post(const struct lu_env *env,
6053                                     struct lfsck_component *com,
6054                                     int result, bool init)
6055 {
6056         struct lfsck_instance   *lfsck  = com->lc_lfsck;
6057         struct lfsck_layout     *lo     = com->lc_file_ram;
6058         int                      rc;
6059         ENTRY;
6060
6061         lfsck_post_generic(env, com, &result);
6062
6063         down_write(&com->lc_sem);
6064         spin_lock(&lfsck->li_lock);
6065         if (!init)
6066                 lo->ll_pos_last_checkpoint =
6067                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6068
6069         if (result > 0) {
6070                 if (lo->ll_flags & LF_INCOMPLETE)
6071                         lo->ll_status = LS_PARTIAL;
6072                 else
6073                         lo->ll_status = LS_SCANNING_PHASE2;
6074                 lo->ll_flags |= LF_SCANNED_ONCE;
6075                 lo->ll_flags &= ~LF_UPGRADE;
6076                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6077         } else if (result == 0) {
6078                 if (lfsck->li_status != 0)
6079                         lo->ll_status = lfsck->li_status;
6080                 else
6081                         lo->ll_status = LS_STOPPED;
6082                 if (lo->ll_status != LS_PAUSED)
6083                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6084         } else {
6085                 lo->ll_status = LS_FAILED;
6086                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6087         }
6088         spin_unlock(&lfsck->li_lock);
6089
6090         if (!init) {
6091                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6092                                           lfsck->li_time_last_checkpoint;
6093                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6094                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6095                 com->lc_new_checked = 0;
6096         }
6097
6098         rc = lfsck_layout_store(env, com);
6099         up_write(&com->lc_sem);
6100
6101         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
6102                lfsck_lfsck2name(lfsck), rc);
6103
6104         RETURN(rc);
6105 }
6106
6107 static int lfsck_layout_slave_post(const struct lu_env *env,
6108                                    struct lfsck_component *com,
6109                                    int result, bool init)
6110 {
6111         struct lfsck_instance   *lfsck = com->lc_lfsck;
6112         struct lfsck_layout     *lo    = com->lc_file_ram;
6113         int                      rc;
6114         bool                     done  = false;
6115
6116         down_write(&com->lc_sem);
6117         rc = lfsck_layout_lastid_store(env, com);
6118         if (rc != 0)
6119                 result = rc;
6120
6121         LASSERT(lfsck->li_out_notify != NULL);
6122
6123         spin_lock(&lfsck->li_lock);
6124         if (!init)
6125                 lo->ll_pos_last_checkpoint =
6126                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6127
6128         if (result > 0) {
6129                 lo->ll_status = LS_SCANNING_PHASE2;
6130                 lo->ll_flags |= LF_SCANNED_ONCE;
6131                 if (lo->ll_flags & LF_CRASHED_LASTID) {
6132                         done = true;
6133                         lo->ll_flags &= ~LF_CRASHED_LASTID;
6134
6135                         CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt "
6136                                "crashed LAST_ID files successfully\n",
6137                                lfsck_lfsck2name(lfsck));
6138                 }
6139                 lo->ll_flags &= ~LF_UPGRADE;
6140                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6141         } else if (result == 0) {
6142                 if (lfsck->li_status != 0)
6143                         lo->ll_status = lfsck->li_status;
6144                 else
6145                         lo->ll_status = LS_STOPPED;
6146                 if (lo->ll_status != LS_PAUSED)
6147                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6148         } else {
6149                 lo->ll_status = LS_FAILED;
6150                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6151         }
6152         spin_unlock(&lfsck->li_lock);
6153
6154         if (done)
6155                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6156                                      LE_LASTID_REBUILT);
6157
6158         if (!init) {
6159                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6160                                           lfsck->li_time_last_checkpoint;
6161                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6162                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6163                 com->lc_new_checked = 0;
6164         }
6165
6166         rc = lfsck_layout_store(env, com);
6167         up_write(&com->lc_sem);
6168
6169         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
6170
6171         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
6172                lfsck_lfsck2name(lfsck), rc);
6173
6174         return rc;
6175 }
6176
6177 static void lfsck_layout_dump(const struct lu_env *env,
6178                               struct lfsck_component *com, struct seq_file *m)
6179 {
6180         struct lfsck_instance   *lfsck = com->lc_lfsck;
6181         struct lfsck_bookmark   *bk    = &lfsck->li_bookmark_ram;
6182         struct lfsck_layout     *lo    = com->lc_file_ram;
6183         const char *prefix;
6184
6185         down_read(&com->lc_sem);
6186         if (bk->lb_param & LPF_DRYRUN)
6187                 prefix = "inconsistent";
6188         else
6189                 prefix = "repaired";
6190
6191         seq_printf(m, "name: lfsck_layout\n"
6192                    "magic: %#x\n"
6193                    "version: %d\n"
6194                    "status: %s\n",
6195                    lo->ll_magic,
6196                    bk->lb_version,
6197                    lfsck_status2name(lo->ll_status));
6198
6199         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
6200
6201         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
6202
6203         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
6204
6205         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
6206
6207         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
6208
6209         seq_printf(m, "latest_start_position: %llu\n"
6210                    "last_checkpoint_position: %llu\n"
6211                    "first_failure_position: %llu\n",
6212                    lo->ll_pos_latest_start,
6213                    lo->ll_pos_last_checkpoint,
6214                    lo->ll_pos_first_inconsistent);
6215
6216         seq_printf(m, "success_count: %u\n"
6217                    "%s_dangling: %llu\n"
6218                    "%s_unmatched_pair: %llu\n"
6219                    "%s_multiple_referenced: %llu\n"
6220                    "%s_orphan: %llu\n"
6221                    "%s_inconsistent_owner: %llu\n"
6222                    "%s_others: %llu\n"
6223                    "skipped: %llu\n"
6224                    "failed_phase1: %llu\n"
6225                    "failed_phase2: %llu\n",
6226                    lo->ll_success_count,
6227                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
6228                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
6229                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
6230                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
6231                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
6232                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
6233                    lo->ll_objs_skipped,
6234                    lo->ll_objs_failed_phase1,
6235                    lo->ll_objs_failed_phase2);
6236
6237         if (lo->ll_status == LS_SCANNING_PHASE1) {
6238                 time64_t duration = ktime_get_seconds() -
6239                                     lfsck->li_time_last_checkpoint;
6240                 u64 checked = lo->ll_objs_checked_phase1 +
6241                               com->lc_new_checked;
6242                 u64 speed = checked;
6243                 u64 new_checked = com->lc_new_checked;
6244                 time64_t rtime = lo->ll_run_time_phase1 + duration;
6245                 u64 pos;
6246
6247                 if (duration != 0)
6248                         new_checked = div64_s64(new_checked, duration);
6249                 if (rtime != 0)
6250                         speed = div64_s64(speed, rtime);
6251                 seq_printf(m, "checked_phase1: %llu\n"
6252                            "checked_phase2: %llu\n"
6253                            "run_time_phase1: %lld seconds\n"
6254                            "run_time_phase2: %lld seconds\n"
6255                            "average_speed_phase1: %llu items/sec\n"
6256                            "average_speed_phase2: N/A\n"
6257                            "real_time_speed_phase1: %llu items/sec\n"
6258                            "real_time_speed_phase2: N/A\n",
6259                            checked,
6260                            lo->ll_objs_checked_phase2,
6261                            rtime,
6262                            lo->ll_run_time_phase2,
6263                            speed,
6264                            new_checked);
6265
6266                 if (likely(lfsck->li_di_oit)) {
6267                         const struct dt_it_ops *iops =
6268                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
6269
6270                         /* The low layer otable-based iteration position may NOT
6271                          * exactly match the layout-based directory traversal
6272                          * cookie. Generally, it is not a serious issue. But the
6273                          * caller should NOT make assumption on that. */
6274                         pos = iops->store(env, lfsck->li_di_oit);
6275                         if (!lfsck->li_current_oit_processed)
6276                                 pos--;
6277                 } else {
6278                         pos = lo->ll_pos_last_checkpoint;
6279                 }
6280
6281                 seq_printf(m, "current_position: %llu\n", pos);
6282         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
6283                 time64_t duration = ktime_get_seconds() -
6284                                     com->lc_time_last_checkpoint;
6285                 u64 checked = lo->ll_objs_checked_phase2 +
6286                               com->lc_new_checked;
6287                 u64 speed1 = lo->ll_objs_checked_phase1;
6288                 u64 speed2 = checked;
6289                 u64 new_checked = com->lc_new_checked;
6290                 time64_t rtime = lo->ll_run_time_phase2 + duration;
6291
6292                 if (duration != 0)
6293                         new_checked = div64_s64(new_checked, duration);
6294                 if (lo->ll_run_time_phase1 != 0)
6295                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6296                 if (rtime != 0)
6297                         speed2 = div64_s64(speed2, rtime);
6298                 seq_printf(m, "checked_phase1: %llu\n"
6299                            "checked_phase2: %llu\n"
6300                            "run_time_phase1: %lld seconds\n"
6301                            "run_time_phase2: %lld seconds\n"
6302                            "average_speed_phase1: %llu items/sec\n"
6303                            "average_speed_phase2: %llu items/sec\n"
6304                            "real_time_speed_phase1: N/A\n"
6305                            "real_time_speed_phase2: %llu items/sec\n"
6306                            "current_position: "DFID"\n",
6307                            lo->ll_objs_checked_phase1,
6308                            checked,
6309                            lo->ll_run_time_phase1,
6310                            rtime,
6311                            speed1,
6312                            speed2,
6313                            new_checked,
6314                            PFID(&com->lc_fid_latest_scanned_phase2));
6315         } else {
6316                 __u64 speed1 = lo->ll_objs_checked_phase1;
6317                 __u64 speed2 = lo->ll_objs_checked_phase2;
6318
6319                 if (lo->ll_run_time_phase1 != 0)
6320                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6321                 if (lo->ll_run_time_phase2 != 0)
6322                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
6323                 seq_printf(m, "checked_phase1: %llu\n"
6324                            "checked_phase2: %llu\n"
6325                            "run_time_phase1: %lld seconds\n"
6326                            "run_time_phase2: %lld seconds\n"
6327                            "average_speed_phase1: %llu items/sec\n"
6328                            "average_speed_phase2: %llu objs/sec\n"
6329                            "real_time_speed_phase1: N/A\n"
6330                            "real_time_speed_phase2: N/A\n"
6331                            "current_position: N/A\n",
6332                            lo->ll_objs_checked_phase1,
6333                            lo->ll_objs_checked_phase2,
6334                            lo->ll_run_time_phase1,
6335                            lo->ll_run_time_phase2,
6336                            speed1,
6337                            speed2);
6338         }
6339
6340         up_read(&com->lc_sem);
6341 }
6342
6343 static int lfsck_layout_master_double_scan(const struct lu_env *env,
6344                                            struct lfsck_component *com)
6345 {
6346         struct lfsck_layout             *lo     = com->lc_file_ram;
6347         struct lfsck_assistant_data     *lad    = com->lc_data;
6348         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6349         struct lfsck_tgt_descs          *ltds;
6350         struct lfsck_tgt_desc           *ltd;
6351         struct lfsck_tgt_desc           *next;
6352         int                              rc;
6353
6354         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6355
6356         if (thread_is_stopped(&lad->lad_thread)) {
6357                 LASSERT(list_empty(&lad->lad_req_list));
6358                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6359                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6360
6361                 ltds = &lfsck->li_ost_descs;
6362                 spin_lock(&ltds->ltd_lock);
6363                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6364                                          ltd_layout_phase_list) {
6365                         list_del_init(&ltd->ltd_layout_phase_list);
6366                 }
6367                 spin_unlock(&ltds->ltd_lock);
6368
6369                 ltds = &lfsck->li_mdt_descs;
6370                 spin_lock(&ltds->ltd_lock);
6371                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6372                                          ltd_layout_phase_list) {
6373                         list_del_init(&ltd->ltd_layout_phase_list);
6374                 }
6375                 spin_unlock(&ltds->ltd_lock);
6376         }
6377
6378         return rc;
6379 }
6380
6381 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6382                                           struct lfsck_component *com)
6383 {
6384         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6385         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
6386         struct lfsck_layout             *lo     = com->lc_file_ram;
6387         struct ptlrpc_thread            *thread = &lfsck->li_thread;
6388         int                              rc;
6389         ENTRY;
6390
6391         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6392                lfsck_lfsck2name(lfsck));
6393
6394         atomic_inc(&lfsck->li_double_scan_count);
6395
6396         if (lo->ll_flags & LF_INCOMPLETE)
6397                 GOTO(done, rc = 1);
6398
6399         com->lc_new_checked = 0;
6400         com->lc_new_scanned = 0;
6401         com->lc_time_last_checkpoint = ktime_get_seconds();
6402         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6403                                        LFSCK_CHECKPOINT_INTERVAL;
6404
6405         while (1) {
6406                 rc = lfsck_layout_slave_query_master(env, com);
6407                 if (list_empty(&llsd->llsd_master_list)) {
6408                         if (unlikely(!thread_is_running(thread)))
6409                                 rc = 0;
6410                         else
6411                                 rc = 1;
6412
6413                         GOTO(done, rc);
6414                 }
6415
6416                 if (rc < 0)
6417                         GOTO(done, rc);
6418
6419                 rc = wait_event_idle_timeout(
6420                         thread->t_ctl_waitq,
6421                         !thread_is_running(thread) ||
6422                         lo->ll_flags & LF_INCOMPLETE ||
6423                         list_empty(&llsd->llsd_master_list),
6424                         cfs_time_seconds(30));
6425                 if (unlikely(!thread_is_running(thread)))
6426                         GOTO(done, rc = 0);
6427
6428                 if (lo->ll_flags & LF_INCOMPLETE)
6429                         GOTO(done, rc = 1);
6430
6431                 if (rc == 0)
6432                         continue;
6433
6434                 GOTO(done, rc = 1);
6435         }
6436
6437 done:
6438         rc = lfsck_layout_double_scan_result(env, com, rc);
6439         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6440                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6441         lfsck_layout_slave_quit(env, com);
6442         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6443                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6444
6445         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, "
6446                "status %d: rc = %d\n",
6447                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6448
6449         return rc;
6450 }
6451
6452 static void lfsck_layout_master_data_release(const struct lu_env *env,
6453                                              struct lfsck_component *com)
6454 {
6455         struct lfsck_assistant_data     *lad    = com->lc_data;
6456         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6457         struct lfsck_tgt_descs          *ltds;
6458         struct lfsck_tgt_desc           *ltd;
6459         struct lfsck_tgt_desc           *next;
6460
6461         LASSERT(lad != NULL);
6462         LASSERT(thread_is_init(&lad->lad_thread) ||
6463                 thread_is_stopped(&lad->lad_thread));
6464         LASSERT(list_empty(&lad->lad_req_list));
6465
6466         com->lc_data = NULL;
6467
6468         ltds = &lfsck->li_ost_descs;
6469         spin_lock(&ltds->ltd_lock);
6470         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6471                                  ltd_layout_phase_list) {
6472                 list_del_init(&ltd->ltd_layout_phase_list);
6473         }
6474         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6475                                  ltd_layout_phase_list) {
6476                 list_del_init(&ltd->ltd_layout_phase_list);
6477         }
6478         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6479                                  ltd_layout_list) {
6480                 list_del_init(&ltd->ltd_layout_list);
6481         }
6482         spin_unlock(&ltds->ltd_lock);
6483
6484         ltds = &lfsck->li_mdt_descs;
6485         spin_lock(&ltds->ltd_lock);
6486         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6487                                  ltd_layout_phase_list) {
6488                 list_del_init(&ltd->ltd_layout_phase_list);
6489         }
6490         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6491                                  ltd_layout_phase_list) {
6492                 list_del_init(&ltd->ltd_layout_phase_list);
6493         }
6494         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6495                                  ltd_layout_list) {
6496                 list_del_init(&ltd->ltd_layout_list);
6497         }
6498         spin_unlock(&ltds->ltd_lock);
6499
6500         bitmap_free(lad->lad_bitmap);
6501
6502         OBD_FREE_PTR(lad);
6503 }
6504
6505 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6506                                             struct lfsck_component *com)
6507 {
6508         struct lfsck_layout_slave_data *llsd = com->lc_data;
6509
6510         lfsck_layout_slave_quit(env, com);
6511         com->lc_data = NULL;
6512         OBD_FREE_PTR(llsd);
6513 }
6514
6515 static void lfsck_layout_master_quit(const struct lu_env *env,
6516                                      struct lfsck_component *com)
6517 {
6518         struct lfsck_assistant_data     *lad    = com->lc_data;
6519         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6520         struct lfsck_tgt_descs          *ltds;
6521         struct lfsck_tgt_desc           *ltd;
6522         struct lfsck_tgt_desc           *next;
6523
6524         LASSERT(lad != NULL);
6525
6526         lfsck_quit_generic(env, com);
6527
6528         LASSERT(thread_is_init(&lad->lad_thread) ||
6529                 thread_is_stopped(&lad->lad_thread));
6530         LASSERT(list_empty(&lad->lad_req_list));
6531
6532         ltds = &lfsck->li_ost_descs;
6533         spin_lock(&ltds->ltd_lock);
6534         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6535                                  ltd_layout_phase_list) {
6536                 list_del_init(&ltd->ltd_layout_phase_list);
6537         }
6538         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6539                                  ltd_layout_phase_list) {
6540                 list_del_init(&ltd->ltd_layout_phase_list);
6541         }
6542         spin_unlock(&ltds->ltd_lock);
6543
6544         ltds = &lfsck->li_mdt_descs;
6545         spin_lock(&ltds->ltd_lock);
6546         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6547                                  ltd_layout_phase_list) {
6548                 list_del_init(&ltd->ltd_layout_phase_list);
6549         }
6550         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6551                                  ltd_layout_phase_list) {
6552                 list_del_init(&ltd->ltd_layout_phase_list);
6553         }
6554         spin_unlock(&ltds->ltd_lock);
6555 }
6556
6557 static void lfsck_layout_slave_quit(const struct lu_env *env,
6558                                     struct lfsck_component *com)
6559 {
6560         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6561         struct lfsck_layout_seq          *lls;
6562         struct lfsck_layout_seq          *next;
6563         struct lfsck_layout_slave_target *llst;
6564
6565         LASSERT(llsd != NULL);
6566
6567         down_write(&com->lc_sem);
6568         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6569                                  lls_list) {
6570                 list_del_init(&lls->lls_list);
6571                 lfsck_object_put(env, lls->lls_lastid_obj);
6572                 OBD_FREE_PTR(lls);
6573         }
6574         up_write(&com->lc_sem);
6575
6576         spin_lock(&llsd->llsd_lock);
6577         while (!list_empty(&llsd->llsd_master_list)) {
6578                 llst = list_first_entry(&llsd->llsd_master_list,
6579                                         struct lfsck_layout_slave_target,
6580                                         llst_list);
6581                 list_del_init(&llst->llst_list);
6582                 spin_unlock(&llsd->llsd_lock);
6583                 lfsck_layout_llst_put(llst);
6584                 spin_lock(&llsd->llsd_lock);
6585         }
6586         spin_unlock(&llsd->llsd_lock);
6587
6588         lfsck_rbtree_cleanup(env, com);
6589 }
6590
6591 static int lfsck_layout_master_in_notify(const struct lu_env *env,
6592                                          struct lfsck_component *com,
6593                                          struct lfsck_request *lr)
6594 {
6595         struct lfsck_instance           *lfsck = com->lc_lfsck;
6596         struct lfsck_layout             *lo    = com->lc_file_ram;
6597         struct lfsck_assistant_data     *lad   = com->lc_data;
6598         struct lfsck_tgt_descs          *ltds;
6599         struct lfsck_tgt_desc           *ltd;
6600         bool                             fail  = false;
6601         ENTRY;
6602
6603         if (lr->lr_event == LE_PAIRS_VERIFY) {
6604                 int rc;
6605
6606                 rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
6607                                                      &lr->lr_fid2,
6608                                                      lr->lr_comp_id);
6609
6610                 RETURN(rc);
6611         }
6612
6613         CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
6614                "from %s %x, status %d, flags %x, flags2 %x\n",
6615                lfsck_lfsck2name(lfsck), lr->lr_event,
6616                (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
6617                lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
6618
6619         if (lr->lr_event != LE_PHASE1_DONE &&
6620             lr->lr_event != LE_PHASE2_DONE &&
6621             lr->lr_event != LE_PEER_EXIT)
6622                 RETURN(-EINVAL);
6623
6624         if (lr->lr_flags & LEF_FROM_OST)
6625                 ltds = &lfsck->li_ost_descs;
6626         else
6627                 ltds = &lfsck->li_mdt_descs;
6628         spin_lock(&ltds->ltd_lock);
6629         ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
6630         if (ltd == NULL) {
6631                 spin_unlock(&ltds->ltd_lock);
6632
6633                 RETURN(-ENXIO);
6634         }
6635
6636         list_del_init(&ltd->ltd_layout_phase_list);
6637         switch (lr->lr_event) {
6638         case LE_PHASE1_DONE:
6639                 if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
6640                         if (lr->lr_flags2 & LF_INCOMPLETE) {
6641                                 if (lr->lr_flags & LEF_FROM_OST)
6642                                         lfsck_lad_set_bitmap(env, com,
6643                                                              ltd->ltd_index);
6644                                 else
6645                                         lo->ll_flags |= LF_INCOMPLETE;
6646                         }
6647                         ltd->ltd_layout_done = 1;
6648                         list_del_init(&ltd->ltd_layout_list);
6649                         fail = true;
6650                         break;
6651                 }
6652
6653                 if (lr->lr_flags & LEF_FROM_OST) {
6654                         if (list_empty(&ltd->ltd_layout_list))
6655                                 list_add_tail(&ltd->ltd_layout_list,
6656                                               &lad->lad_ost_list);
6657                         list_add_tail(&ltd->ltd_layout_phase_list,
6658                                       &lad->lad_ost_phase2_list);
6659                 } else {
6660                         if (list_empty(&ltd->ltd_layout_list))
6661                                 list_add_tail(&ltd->ltd_layout_list,
6662                                               &lad->lad_mdt_list);
6663                         list_add_tail(&ltd->ltd_layout_phase_list,
6664                                       &lad->lad_mdt_phase2_list);
6665                 }
6666                 break;
6667         case LE_PHASE2_DONE:
6668                 ltd->ltd_layout_done = 1;
6669                 if (!list_empty(&ltd->ltd_layout_list))
6670                         list_del_init(&ltd->ltd_layout_list);
6671
6672                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6673                         lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
6674                         fail = true;
6675                 }
6676
6677                 break;
6678         case LE_PEER_EXIT:
6679                 fail = true;
6680                 ltd->ltd_layout_done = 1;
6681                 list_del_init(&ltd->ltd_layout_list);
6682                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
6683                     !(lr->lr_flags & LEF_FROM_OST))
6684                                 lo->ll_flags |= LF_INCOMPLETE;
6685                 break;
6686         default:
6687                 break;
6688         }
6689         spin_unlock(&ltds->ltd_lock);
6690
6691         if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
6692                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6693
6694                 memset(stop, 0, sizeof(*stop));
6695                 stop->ls_status = lr->lr_status;
6696                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6697                 lfsck_stop(env, lfsck->li_bottom, stop);
6698         } else if (lfsck_phase2_next_ready(lad)) {
6699                 wake_up(&lad->lad_thread.t_ctl_waitq);
6700         }
6701
6702         RETURN(0);
6703 }
6704
6705 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6706                                               struct lfsck_component *com,
6707                                               struct lfsck_req_local *lrl,
6708                                               struct thandle *th)
6709 {
6710         ENTRY;
6711
6712         switch (lrl->lrl_event) {
6713         case LEL_FID_ACCESSED:
6714                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6715                 RETURN(0);
6716         case LEL_PAIRS_VERIFY_LOCAL: {
6717                 int rc;
6718
6719                 lrl->lrl_status = LPVS_INIT;
6720                 /* Firstly, if the MDT-object which is claimed via OST-object
6721                  * local stored PFID xattr recognizes the OST-object, then it
6722                  * must be that the client given PFID is wrong. */
6723                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6724                                 &lrl->lrl_ff_local.ff_parent,
6725                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6726                 if (rc <= 0)
6727                         RETURN(0);
6728
6729                 lrl->lrl_status = LPVS_INCONSISTENT;
6730                 /* The OST-object local stored PFID xattr is stale. We need to
6731                  * check whether the MDT-object that is claimed via the client
6732                  * given PFID information recognizes the OST-object or not. If
6733                  * matches, then need to update the OST-object's PFID xattr. */
6734                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6735                                 &lrl->lrl_ff_client.ff_parent,
6736                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6737                 /* For rc < 0 case:
6738                  * We are not sure whether the client given PFID information
6739                  * is correct or not, do nothing to avoid improper fixing.
6740                  *
6741                  * For rc > 0 case:
6742                  * The client given PFID information is also invalid, we can
6743                  * NOT fix the OST-object inconsistency.
6744                  */
6745                 if (!rc) {
6746                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6747                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6748                 }
6749
6750                 RETURN(rc);
6751         }
6752         default:
6753                 break;
6754         }
6755
6756         RETURN(-EOPNOTSUPP);
6757 }
6758
6759 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6760                                         struct lfsck_component *com,
6761                                         struct lfsck_request *lr)
6762 {
6763         struct lfsck_instance *lfsck = com->lc_lfsck;
6764         struct lfsck_layout_slave_data *llsd = com->lc_data;
6765         struct lfsck_layout_slave_target *llst;
6766         int rc;
6767         ENTRY;
6768
6769         switch (lr->lr_event) {
6770         case LE_CONDITIONAL_DESTROY:
6771                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6772                 RETURN(rc);
6773         case LE_PHASE1_DONE: {
6774                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6775                         struct lfsck_layout *lo = com->lc_file_ram;
6776
6777                         lo->ll_flags |= LF_INCOMPLETE;
6778                         llst = lfsck_layout_llst_find_and_del(llsd,
6779                                                               lr->lr_index,
6780                                                               true);
6781                         if (llst != NULL) {
6782                                 lfsck_layout_llst_put(llst);
6783                                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6784                         }
6785                 }
6786
6787                 RETURN(0);
6788         }
6789         case LE_PHASE2_DONE:
6790         case LE_PEER_EXIT:
6791                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u "
6792                        "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck),
6793                        lr->lr_event, lr->lr_index, lr->lr_status);
6794                 break;
6795         default:
6796                 RETURN(-EINVAL);
6797         }
6798
6799         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6800         if (llst == NULL)
6801                 RETURN(0);
6802
6803         lfsck_layout_llst_put(llst);
6804         if (list_empty(&llsd->llsd_master_list))
6805                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6806
6807         if (lr->lr_event == LE_PEER_EXIT &&
6808             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6809              (list_empty(&llsd->llsd_master_list) &&
6810               (lr->lr_status == LS_STOPPED ||
6811                lr->lr_status == LS_CO_STOPPED)))) {
6812                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6813
6814                 memset(stop, 0, sizeof(*stop));
6815                 stop->ls_status = lr->lr_status;
6816                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6817                 lfsck_stop(env, lfsck->li_bottom, stop);
6818         }
6819
6820         RETURN(0);
6821 }
6822
6823 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6824 {
6825         int i;
6826
6827         for (i = 0; i < LLIT_MAX; i++)
6828                 *count += lo->ll_objs_repaired[i];
6829 }
6830
6831 static int lfsck_layout_query_all(const struct lu_env *env,
6832                                   struct lfsck_component *com,
6833                                   __u32 *mdts_count, __u32 *osts_count,
6834                                   __u64 *repaired)
6835 {
6836         struct lfsck_layout *lo = com->lc_file_ram;
6837         struct lfsck_tgt_descs *ltds;
6838         struct lfsck_tgt_desc *ltd;
6839         int idx;
6840         int rc;
6841         ENTRY;
6842
6843         rc = lfsck_query_all(env, com);
6844         if (rc != 0)
6845                 RETURN(rc);
6846
6847         ltds = &com->lc_lfsck->li_mdt_descs;
6848         down_read(&ltds->ltd_rw_sem);
6849         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6850                 ltd = lfsck_ltd2tgt(ltds, idx);
6851                 LASSERT(ltd != NULL);
6852
6853                 mdts_count[ltd->ltd_layout_status]++;
6854                 *repaired += ltd->ltd_layout_repaired;
6855         }
6856         up_read(&ltds->ltd_rw_sem);
6857
6858         ltds = &com->lc_lfsck->li_ost_descs;
6859         down_read(&ltds->ltd_rw_sem);
6860         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6861                 ltd = lfsck_ltd2tgt(ltds, idx);
6862                 LASSERT(ltd != NULL);
6863
6864                 osts_count[ltd->ltd_layout_status]++;
6865                 *repaired += ltd->ltd_layout_repaired;
6866         }
6867         up_read(&ltds->ltd_rw_sem);
6868
6869         down_read(&com->lc_sem);
6870         mdts_count[lo->ll_status]++;
6871         lfsck_layout_repaired(lo, repaired);
6872         up_read(&com->lc_sem);
6873
6874         RETURN(0);
6875 }
6876
6877 static int lfsck_layout_query(const struct lu_env *env,
6878                               struct lfsck_component *com,
6879                               struct lfsck_request *req,
6880                               struct lfsck_reply *rep,
6881                               struct lfsck_query *que, int idx)
6882 {
6883         struct lfsck_layout *lo = com->lc_file_ram;
6884         int rc = 0;
6885
6886         if (que != NULL) {
6887                 LASSERT(com->lc_lfsck->li_master);
6888
6889                 rc = lfsck_layout_query_all(env, com,
6890                                             que->lu_mdts_count[idx],
6891                                             que->lu_osts_count[idx],
6892                                             &que->lu_repaired[idx]);
6893         } else {
6894                 down_read(&com->lc_sem);
6895                 rep->lr_status = lo->ll_status;
6896                 if (req->lr_flags & LEF_QUERY_ALL)
6897                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6898                 up_read(&com->lc_sem);
6899         }
6900
6901         return rc;
6902 }
6903
6904 /* with lfsck::li_lock held */
6905 static int lfsck_layout_slave_join(const struct lu_env *env,
6906                                    struct lfsck_component *com,
6907                                    struct lfsck_start_param *lsp)
6908 {
6909         struct lfsck_instance            *lfsck = com->lc_lfsck;
6910         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6911         struct lfsck_layout_slave_target *llst;
6912         struct lfsck_start               *start = lsp->lsp_start;
6913         int                               rc    = 0;
6914         ENTRY;
6915
6916         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6917                 RETURN(0);
6918
6919         if (!lsp->lsp_index_valid)
6920                 RETURN(-EINVAL);
6921
6922         /* If someone is running the LFSCK without orphan handling,
6923          * it will not maintain the object accessing rbtree. So we
6924          * cannot join it for orphan handling. */
6925         if (!llsd->llsd_rbtree_valid)
6926                 RETURN(-EBUSY);
6927
6928         spin_unlock(&lfsck->li_lock);
6929         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6930         spin_lock(&lfsck->li_lock);
6931         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6932                 spin_unlock(&lfsck->li_lock);
6933                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6934                                                       true);
6935                 if (llst != NULL)
6936                         lfsck_layout_llst_put(llst);
6937                 spin_lock(&lfsck->li_lock);
6938                 rc = -EAGAIN;
6939         }
6940
6941         RETURN(rc);
6942 }
6943
6944 static const struct lfsck_operations lfsck_layout_master_ops = {
6945         .lfsck_reset            = lfsck_layout_reset,
6946         .lfsck_fail             = lfsck_layout_fail,
6947         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
6948         .lfsck_prep             = lfsck_layout_master_prep,
6949         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
6950         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6951         .lfsck_post             = lfsck_layout_master_post,
6952         .lfsck_dump             = lfsck_layout_dump,
6953         .lfsck_double_scan      = lfsck_layout_master_double_scan,
6954         .lfsck_data_release     = lfsck_layout_master_data_release,
6955         .lfsck_quit             = lfsck_layout_master_quit,
6956         .lfsck_in_notify        = lfsck_layout_master_in_notify,
6957         .lfsck_query            = lfsck_layout_query,
6958 };
6959
6960 static const struct lfsck_operations lfsck_layout_slave_ops = {
6961         .lfsck_reset            = lfsck_layout_reset,
6962         .lfsck_fail             = lfsck_layout_fail,
6963         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
6964         .lfsck_prep             = lfsck_layout_slave_prep,
6965         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
6966         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6967         .lfsck_post             = lfsck_layout_slave_post,
6968         .lfsck_dump             = lfsck_layout_dump,
6969         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
6970         .lfsck_data_release     = lfsck_layout_slave_data_release,
6971         .lfsck_quit             = lfsck_layout_slave_quit,
6972         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
6973         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
6974         .lfsck_query            = lfsck_layout_query,
6975         .lfsck_join             = lfsck_layout_slave_join,
6976 };
6977
6978 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
6979                                             struct lfsck_component *com,
6980                                             struct lfsck_position *pos)
6981 {
6982         struct lfsck_assistant_data     *lad = com->lc_data;
6983         struct lfsck_layout_req         *llr;
6984
6985         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
6986             LS_SCANNING_PHASE1)
6987                 return;
6988
6989         if (list_empty(&lad->lad_req_list))
6990                 return;
6991
6992         llr = list_first_entry(&lad->lad_req_list,
6993                                struct lfsck_layout_req,
6994                                llr_lar.lar_list);
6995         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
6996 }
6997
6998 const struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
6999         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
7000         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
7001         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
7002         .la_double_scan_result  = lfsck_layout_double_scan_result,
7003         .la_req_fini            = lfsck_layout_assistant_req_fini,
7004         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
7005 };
7006
7007 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
7008 {
7009         struct lfsck_component  *com;
7010         struct lfsck_layout     *lo;
7011         struct dt_object        *root = NULL;
7012         struct dt_object        *obj;
7013         int                      i;
7014         int                      rc;
7015         ENTRY;
7016
7017         OBD_ALLOC_PTR(com);
7018         if (com == NULL)
7019                 RETURN(-ENOMEM);
7020
7021         INIT_LIST_HEAD(&com->lc_link);
7022         INIT_LIST_HEAD(&com->lc_link_dir);
7023         init_rwsem(&com->lc_sem);
7024         atomic_set(&com->lc_ref, 1);
7025         com->lc_lfsck = lfsck;
7026         com->lc_type = LFSCK_TYPE_LAYOUT;
7027         if (lfsck->li_master) {
7028                 com->lc_ops = &lfsck_layout_master_ops;
7029                 com->lc_data = lfsck_assistant_data_init(
7030                                 &lfsck_layout_assistant_ops,
7031                                 LFSCK_LAYOUT);
7032                 if (com->lc_data == NULL)
7033                         GOTO(out, rc = -ENOMEM);
7034
7035                 for (i = 0; i < LFSCK_STF_COUNT; i++)
7036                         mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
7037         } else {
7038                 struct lfsck_layout_slave_data *llsd;
7039
7040                 com->lc_ops = &lfsck_layout_slave_ops;
7041                 OBD_ALLOC_PTR(llsd);
7042                 if (llsd == NULL)
7043                         GOTO(out, rc = -ENOMEM);
7044
7045                 INIT_LIST_HEAD(&llsd->llsd_seq_list);
7046                 INIT_LIST_HEAD(&llsd->llsd_master_list);
7047                 spin_lock_init(&llsd->llsd_lock);
7048                 llsd->llsd_rb_root = RB_ROOT;
7049                 init_rwsem(&llsd->llsd_rb_rwsem);
7050                 com->lc_data = llsd;
7051         }
7052         com->lc_file_size = sizeof(*lo);
7053         OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
7054         if (com->lc_file_ram == NULL)
7055                 GOTO(out, rc = -ENOMEM);
7056
7057         OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
7058         if (com->lc_file_disk == NULL)
7059                 GOTO(out, rc = -ENOMEM);
7060
7061         root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
7062         if (IS_ERR(root))
7063                 GOTO(out, rc = PTR_ERR(root));
7064
7065         if (unlikely(!dt_try_as_dir(env, root, true)))
7066                 GOTO(out, rc = -ENOTDIR);
7067
7068         obj = local_file_find_or_create(env, lfsck->li_los, root,
7069                                         LFSCK_LAYOUT,
7070                                         S_IFREG | S_IRUGO | S_IWUSR);
7071         if (IS_ERR(obj))
7072                 GOTO(out, rc = PTR_ERR(obj));
7073
7074         com->lc_obj = obj;
7075         rc = lfsck_layout_load(env, com);
7076         if (rc > 0) {
7077                 rc = lfsck_layout_reset(env, com, true);
7078         } else if (rc == -ENOENT) {
7079                 rc = lfsck_layout_init(env, com);
7080         } else if (lfsck->li_master) {
7081                 rc = lfsck_load_sub_trace_files(env, com,
7082                                 &dt_lfsck_layout_dangling_features,
7083                                 LFSCK_LAYOUT, false);
7084                 if (rc)
7085                         rc = lfsck_layout_reset(env, com, true);
7086         }
7087
7088         if (rc != 0)
7089                 GOTO(out, rc);
7090
7091         lo = com->lc_file_ram;
7092         switch (lo->ll_status) {
7093         case LS_INIT:
7094         case LS_COMPLETED:
7095         case LS_FAILED:
7096         case LS_STOPPED:
7097         case LS_PARTIAL:
7098                 spin_lock(&lfsck->li_lock);
7099                 list_add_tail(&com->lc_link, &lfsck->li_list_idle);
7100                 spin_unlock(&lfsck->li_lock);
7101                 break;
7102         default:
7103                 CERROR("%s: unknown lfsck_layout status %d\n",
7104                        lfsck_lfsck2name(lfsck), lo->ll_status);
7105                 fallthrough;
7106         case LS_SCANNING_PHASE1:
7107         case LS_SCANNING_PHASE2:
7108                 /* No need to store the status to disk right now.
7109                  * If the system crashed before the status stored,
7110                  * it will be loaded back when next time. */
7111                 lo->ll_status = LS_CRASHED;
7112                 if (!lfsck->li_master)
7113                         lo->ll_flags |= LF_INCOMPLETE;
7114                 fallthrough;
7115         case LS_PAUSED:
7116         case LS_CRASHED:
7117         case LS_CO_FAILED:
7118         case LS_CO_STOPPED:
7119         case LS_CO_PAUSED:
7120                 spin_lock(&lfsck->li_lock);
7121                 list_add_tail(&com->lc_link, &lfsck->li_list_scan);
7122                 spin_unlock(&lfsck->li_lock);
7123                 break;
7124         }
7125
7126         if (lo->ll_flags & LF_CRASHED_LASTID) {
7127                 LASSERT(lfsck->li_out_notify != NULL);
7128
7129                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
7130                                      LE_LASTID_REBUILDING);
7131         }
7132
7133         GOTO(out, rc = 0);
7134
7135 out:
7136         if (root != NULL && !IS_ERR(root))
7137                 lfsck_object_put(env, root);
7138
7139         if (rc != 0) {
7140                 lfsck_component_cleanup(env, com);
7141                 CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
7142                        lfsck_lfsck2name(lfsck), rc);
7143         }
7144
7145         return rc;
7146 }
7147
7148 struct lfsck_orphan_it {
7149         struct lfsck_component           *loi_com;
7150         struct lfsck_rbtree_node         *loi_lrn;
7151         struct lfsck_layout_slave_target *loi_llst;
7152         struct lu_fid                     loi_key;
7153         struct lu_orphan_rec_v3           loi_rec;
7154         __u64                             loi_hash;
7155         unsigned int                      loi_over:1;
7156 };
7157
7158 static int lfsck_fid_match_idx(const struct lu_env *env,
7159                                struct lfsck_instance *lfsck,
7160                                const struct lu_fid *fid, int idx)
7161 {
7162         struct seq_server_site  *ss;
7163         struct lu_server_fld    *sf;
7164         struct lu_seq_range     *range = &lfsck_env_info(env)->lti_range;
7165         int                      rc;
7166
7167         /* All abnormal cases will be returned to MDT0. */
7168         if (!fid_is_norm(fid)) {
7169                 if (idx == 0)
7170                         return 1;
7171
7172                 return 0;
7173         }
7174
7175         ss = lfsck_dev_site(lfsck);
7176         if (unlikely(ss == NULL))
7177                 return -ENOTCONN;
7178
7179         sf = ss->ss_server_fld;
7180         LASSERT(sf != NULL);
7181
7182         fld_range_set_any(range);
7183         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
7184         if (rc != 0)
7185                 return rc;
7186
7187         if (!fld_range_is_mdt(range))
7188                 return -EINVAL;
7189
7190         if (range->lsr_index == idx)
7191                 return 1;
7192
7193         return 0;
7194 }
7195
7196 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
7197                                         struct lfsck_instance *lfsck,
7198                                         struct dt_object *obj)
7199 {
7200         struct dt_device        *dev    = lfsck_obj2dev(obj);
7201         struct thandle          *handle;
7202         int                      rc;
7203         ENTRY;
7204
7205         handle = lfsck_trans_create(env, dev, lfsck);
7206         if (IS_ERR(handle))
7207                 RETURN_EXIT;
7208
7209         rc = dt_declare_ref_del(env, obj, handle);
7210         if (rc != 0)
7211                 GOTO(stop, rc);
7212
7213         rc = dt_declare_destroy(env, obj, handle);
7214         if (rc != 0)
7215                 GOTO(stop, rc);
7216
7217         rc = dt_trans_start_local(env, dev, handle);
7218         if (rc != 0)
7219                 GOTO(stop, rc);
7220
7221         dt_write_lock(env, obj, 0);
7222         rc = dt_ref_del(env, obj, handle);
7223         if (rc == 0)
7224                 rc = dt_destroy(env, obj, handle);
7225         dt_write_unlock(env, obj);
7226
7227         GOTO(stop, rc);
7228
7229 stop:
7230         dt_trans_stop(env, dev, handle);
7231
7232         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
7233                PFID(lfsck_dto2fid(obj)), rc);
7234
7235         RETURN_EXIT;
7236 }
7237
/**
 * Index lookup method of the orphan index: not supported.
 *
 * \retval      -EOPNOTSUPP always
 */
static int
lfsck_orphan_index_lookup(const struct lu_env *env, struct dt_object *dt,
                          struct dt_rec *rec, const struct dt_key *key)
{
        return -EOPNOTSUPP;
}
7245
/**
 * Index declare-insert method of the orphan index: not supported.
 *
 * \retval      -EOPNOTSUPP always
 */
static int
lfsck_orphan_index_declare_insert(const struct lu_env *env,
                                  struct dt_object *dt,
                                  const struct dt_rec *rec,
                                  const struct dt_key *key,
                                  struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7254
/**
 * Index insert method of the orphan index: not supported.
 *
 * \retval      -EOPNOTSUPP always
 */
static int
lfsck_orphan_index_insert(const struct lu_env *env, struct dt_object *dt,
                          const struct dt_rec *rec, const struct dt_key *key,
                          struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7263
/**
 * Index declare-delete method of the orphan index: not supported.
 *
 * \retval      -EOPNOTSUPP always
 */
static int
lfsck_orphan_index_declare_delete(const struct lu_env *env,
                                  struct dt_object *dt,
                                  const struct dt_key *key,
                                  struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7271
/**
 * Index delete method of the orphan index: not supported.
 *
 * \retval      -EOPNOTSUPP always
 */
static int
lfsck_orphan_index_delete(const struct lu_env *env, struct dt_object *dt,
                          const struct dt_key *key, struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7279
7280 static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
7281                                           struct dt_object *dt,
7282                                           __u32 attr)
7283 {
7284         struct dt_device                *dev    = lu2dt_dev(dt->do_lu.lo_dev);
7285         struct lfsck_instance           *lfsck;
7286         struct lfsck_component          *com    = NULL;
7287         struct lfsck_layout_slave_data  *llsd;
7288         struct lfsck_orphan_it          *it     = NULL;
7289         struct lfsck_layout             *lo;
7290         int                              rc     = 0;
7291         ENTRY;
7292
7293         lfsck = lfsck_instance_find(dev, true, false);
7294         if (unlikely(lfsck == NULL))
7295                 RETURN(ERR_PTR(-ENXIO));
7296
7297         com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
7298         if (unlikely(com == NULL))
7299                 GOTO(out, rc = -ENOENT);
7300
7301         lo = com->lc_file_ram;
7302         if (lo->ll_flags & LF_INCOMPLETE)
7303                 GOTO(out, rc = -ESRCH);
7304
7305         llsd = com->lc_data;
7306         if (!llsd->llsd_rbtree_valid)
7307                 GOTO(out, rc = -ESRCH);
7308
7309         OBD_ALLOC_PTR(it);
7310         if (it == NULL)
7311                 GOTO(out, rc = -ENOMEM);
7312
7313         it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
7314         if (it->loi_llst == NULL)
7315                 GOTO(out, rc = -ENXIO);
7316
7317         if (dev->dd_record_fid_accessed) {
7318                 /* The first iteration against the rbtree, scan the whole rbtree
7319                  * to remove the nodes which do NOT need to be handled. */
7320                 down_write(&llsd->llsd_rb_rwsem);
7321                 if (dev->dd_record_fid_accessed) {
7322                         struct rb_node                  *node;
7323                         struct rb_node                  *next;
7324                         struct lfsck_rbtree_node        *lrn;
7325
7326                         /* No need to record the fid accessing anymore. */
7327                         dev->dd_record_fid_accessed = 0;
7328
7329                         node = rb_first(&llsd->llsd_rb_root);
7330                         while (node != NULL) {
7331                                 next = rb_next(node);
7332                                 lrn = rb_entry(node, struct lfsck_rbtree_node,
7333                                                lrn_node);
7334                                 if (atomic_read(&lrn->lrn_known_count) <=
7335                                     atomic_read(&lrn->lrn_accessed_count)) {
7336                                         rb_erase(node, &llsd->llsd_rb_root);
7337                                         lfsck_rbtree_free(lrn);
7338                                 }
7339                                 node = next;
7340                         }
7341                 }
7342                 up_write(&llsd->llsd_rb_rwsem);
7343         }
7344
7345         /* read lock the rbtree when init, and unlock when fini */
7346         down_read(&llsd->llsd_rb_rwsem);
7347         it->loi_com = com;
7348         com = NULL;
7349
7350         GOTO(out, rc = 0);
7351
7352 out:
7353         if (com != NULL)
7354                 lfsck_component_put(env, com);
7355
7356         CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
7357                lfsck_lfsck2name(lfsck), rc);
7358
7359         lfsck_instance_put(env, lfsck);
7360         if (rc != 0) {
7361                 if (it != NULL)
7362                         OBD_FREE_PTR(it);
7363
7364                 it = (struct lfsck_orphan_it *)ERR_PTR(rc);
7365         }
7366
7367         return (struct dt_it *)it;
7368 }
7369
7370 static void lfsck_orphan_it_fini(const struct lu_env *env,
7371                                  struct dt_it *di)
7372 {
7373         struct lfsck_orphan_it           *it    = (struct lfsck_orphan_it *)di;
7374         struct lfsck_component           *com   = it->loi_com;
7375         struct lfsck_layout_slave_data   *llsd;
7376         struct lfsck_layout_slave_target *llst;
7377
7378         if (com != NULL) {
7379                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7380                        lfsck_lfsck2name(com->lc_lfsck));
7381
7382                 llsd = com->lc_data;
7383                 up_read(&llsd->llsd_rb_rwsem);
7384                 llst = it->loi_llst;
7385                 LASSERT(llst != NULL);
7386
7387                 /* Save the key and hash for iterate next. */
7388                 llst->llst_fid = it->loi_key;
7389                 llst->llst_hash = it->loi_hash;
7390                 lfsck_layout_llst_put(llst);
7391                 lfsck_component_put(env, com);
7392         }
7393         OBD_FREE_PTR(it);
7394 }
7395
7396 /**
7397  * \retval       +1: the iteration finished
7398  * \retval        0: on success, not finished
7399  * \retval      -ve: on error
7400  */
7401 static int lfsck_orphan_it_next(const struct lu_env *env,
7402                                 struct dt_it *di)
7403 {
7404         struct lfsck_thread_info        *info   = lfsck_env_info(env);
7405         struct filter_fid               *ff     = &info->lti_ff;
7406         struct lu_attr                  *la     = &info->lti_la;
7407         struct lfsck_orphan_it          *it     = (struct lfsck_orphan_it *)di;
7408         struct lu_fid                   *key    = &it->loi_key;
7409         struct lu_orphan_rec_v3         *rec    = &it->loi_rec;
7410         struct ost_layout               *ol     = &rec->lor_layout;
7411         struct lfsck_component          *com    = it->loi_com;
7412         struct lfsck_instance           *lfsck  = com->lc_lfsck;
7413         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
7414         struct dt_object                *obj;
7415         struct lfsck_rbtree_node        *lrn;
7416         int                              pos;
7417         int                              rc;
7418         __u32                            save;
7419         __u32                            idx    = it->loi_llst->llst_index;
7420         bool                             exact  = false;
7421         ENTRY;
7422
7423         if (it->loi_over)
7424                 RETURN(1);
7425
7426 again0:
7427         lrn = it->loi_lrn;
7428         if (lrn == NULL) {
7429                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7430                 if (lrn == NULL) {
7431                         it->loi_over = 1;
7432                         RETURN(1);
7433                 }
7434
7435                 it->loi_lrn = lrn;
7436                 if (!exact) {
7437                         key->f_seq = lrn->lrn_seq;
7438                         key->f_oid = lrn->lrn_first_oid;
7439                         key->f_ver = 0;
7440                 }
7441         } else {
7442                 key->f_oid++;
7443                 if (unlikely(key->f_oid == 0)) {
7444                         key->f_seq++;
7445                         it->loi_lrn = NULL;
7446                         goto again0;
7447                 }
7448
7449                 if (key->f_oid >=
7450                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7451                         it->loi_lrn = NULL;
7452                         goto again0;
7453                 }
7454         }
7455
7456         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7457                      atomic_read(&lrn->lrn_accessed_count))) {
7458                 struct rb_node *next = rb_next(&lrn->lrn_node);
7459
7460                 while (next != NULL) {
7461                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7462                                        lrn_node);
7463                         if (atomic_read(&lrn->lrn_known_count) >
7464                             atomic_read(&lrn->lrn_accessed_count))
7465                                 break;
7466                         next = rb_next(next);
7467                 }
7468
7469                 if (next == NULL) {
7470                         it->loi_over = 1;
7471                         RETURN(1);
7472                 }
7473
7474                 it->loi_lrn = lrn;
7475                 key->f_seq = lrn->lrn_seq;
7476                 key->f_oid = lrn->lrn_first_oid;
7477                 key->f_ver = 0;
7478         }
7479
7480         pos = key->f_oid - lrn->lrn_first_oid;
7481
7482 again1:
7483         pos = find_next_bit(lrn->lrn_known_bitmap,
7484                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7485         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7486                 key->f_oid = lrn->lrn_first_oid + pos;
7487                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7488                         key->f_seq++;
7489                         key->f_oid = 0;
7490                 }
7491                 it->loi_lrn = NULL;
7492                 goto again0;
7493         }
7494
7495         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7496                 pos++;
7497                 goto again1;
7498         }
7499
7500         key->f_oid = lrn->lrn_first_oid + pos;
7501         obj = lfsck_object_find_bottom(env, lfsck, key);
7502         if (IS_ERR(obj)) {
7503                 rc = PTR_ERR(obj);
7504                 if (rc == -ENOENT) {
7505                         pos++;
7506                         goto again1;
7507                 }
7508                 RETURN(rc);
7509         }
7510
7511         dt_read_lock(env, obj, 0);
7512         if (dt_object_exists(obj) == 0 ||
7513             lfsck_is_dead_obj(obj)) {
7514                 dt_read_unlock(env, obj);
7515                 lfsck_object_put(env, obj);
7516                 pos++;
7517                 goto again1;
7518         }
7519
7520         rc = dt_attr_get(env, obj, la);
7521         if (rc != 0)
7522                 GOTO(out, rc);
7523
7524         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7525                           XATTR_NAME_FID);
7526         if (rc == -ENODATA) {
7527                 /* For the pre-created OST-object, update the bitmap to avoid
7528                  * others LFSCK (second phase) iteration to touch it again. */
7529                 if (la->la_ctime == 0) {
7530                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7531                                 atomic_inc(&lrn->lrn_accessed_count);
7532
7533                         /* For the race between repairing dangling referenced
7534                          * MDT-object and unlink the file, it may left orphan
7535                          * OST-object there. Destroy it now! */
7536                         if (unlikely(!(la->la_mode & S_ISUID))) {
7537                                 dt_read_unlock(env, obj);
7538                                 lfsck_layout_destroy_orphan(env, lfsck, obj);
7539                                 lfsck_object_put(env, obj);
7540                                 pos++;
7541                                 goto again1;
7542                         }
7543                 } else if (idx == 0) {
7544                         /* If the orphan OST-object has no parent information,
7545                          * regard it as referenced by the MDT-object on MDT0. */
7546                         fid_zero(&rec->lor_rec.lor_fid);
7547                         rec->lor_rec.lor_uid = la->la_uid;
7548                         rec->lor_rec.lor_gid = la->la_gid;
7549                         memset(ol, 0, sizeof(*ol));
7550                         rec->lor_layout_version = 0;
7551                         rec->lor_range = 0;
7552
7553                         GOTO(out, rc = 0);
7554                 }
7555
7556                 dt_read_unlock(env, obj);
7557                 lfsck_object_put(env, obj);
7558                 pos++;
7559                 goto again1;
7560         }
7561
7562         if (rc < sizeof(struct lu_fid))
7563                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7564
7565         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7566         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7567          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7568          * parent MDT-object's layout EA. */
7569         save = rec->lor_rec.lor_fid.f_stripe_idx;
7570         rec->lor_rec.lor_fid.f_ver = 0;
7571         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7572         /* If the orphan OST-object does not claim the MDT, then next.
7573          *
7574          * If we do not know whether it matches or not, then return it
7575          * to the MDT for further check. */
7576         if (rc == 0) {
7577                 dt_read_unlock(env, obj);
7578                 lfsck_object_put(env, obj);
7579                 pos++;
7580                 goto again1;
7581         }
7582
7583         rec->lor_rec.lor_fid.f_stripe_idx = save;
7584         rec->lor_rec.lor_uid = la->la_uid;
7585         rec->lor_rec.lor_gid = la->la_gid;
7586         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7587         rec->lor_layout_version =
7588                 le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC);
7589         rec->lor_range = le32_to_cpu(ff->ff_range);
7590
7591         CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, "
7592                "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, "
7593                "COMP end %llu, layout version %u, range %u\n",
7594                lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7595                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7596                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7597                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
7598                rec->lor_layout_version, rec->lor_range);
7599
7600         GOTO(out, rc = 0);
7601
7602 out:
7603         dt_read_unlock(env, obj);
7604         lfsck_object_put(env, obj);
7605         if (rc == 0)
7606                 it->loi_hash++;
7607
7608         return rc;
7609 }
7610
7611 /**
7612  * \retval       +1: locate to the exactly position
7613  * \retval        0: cannot locate to the exactly position,
7614  *                   call next() to move to a valid position.
7615  * \retval      -ve: on error
7616  */
7617 static int lfsck_orphan_it_get(const struct lu_env *env,
7618                                struct dt_it *di,
7619                                const struct dt_key *key)
7620 {
7621         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7622         int                      rc;
7623
7624         it->loi_key = *(struct lu_fid *)key;
7625         rc = lfsck_orphan_it_next(env, di);
7626         if (rc == 1)
7627                 return 0;
7628
7629         if (rc == 0)
7630                 return 1;
7631
7632         return rc;
7633 }
7634
/**
 * The put() method of the orphan iterator: it does nothing.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the orphan iterator (unused)
 */
static void lfsck_orphan_it_put(const struct lu_env *env, struct dt_it *di)
{
        /* Nothing to be done. */
}
7639
7640 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7641                                           const struct dt_it *di)
7642 {
7643         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7644
7645         return (struct dt_key *)&it->loi_key;
7646 }
7647
7648 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7649                                     const struct dt_it *di)
7650 {
7651         return sizeof(struct lu_fid);
7652 }
7653
7654 static int lfsck_orphan_it_rec(const struct lu_env *env,
7655                                const struct dt_it *di,
7656                                struct dt_rec *rec,
7657                                __u32 attr)
7658 {
7659         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7660
7661         *(struct lu_orphan_rec_v3 *)rec = it->loi_rec;
7662
7663         return 0;
7664 }
7665
7666 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7667                                    const struct dt_it *di)
7668 {
7669         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7670
7671         return it->loi_hash;
7672 }
7673
7674 /**
7675  * \retval       +1: locate to the exactly position
7676  * \retval        0: cannot locate to the exactly position,
7677  *                   call next() to move to a valid position.
7678  * \retval      -ve: on error
7679  */
7680 static int lfsck_orphan_it_load(const struct lu_env *env,
7681                                 const struct dt_it *di,
7682                                 __u64 hash)
7683 {
7684         struct lfsck_orphan_it           *it   = (struct lfsck_orphan_it *)di;
7685         struct lfsck_layout_slave_target *llst = it->loi_llst;
7686         int                               rc;
7687
7688         LASSERT(llst != NULL);
7689
7690         if (hash != llst->llst_hash) {
7691                 CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan "
7692                        "iteration does not match the one when fini "
7693                        "%llu, to be reset.\n",
7694                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7695                        llst->llst_hash);
7696                 fid_zero(&llst->llst_fid);
7697                 llst->llst_hash = 0;
7698         }
7699
7700         it->loi_key = llst->llst_fid;
7701         it->loi_hash = llst->llst_hash;
7702         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7703         if (rc == 1)
7704                 return 0;
7705
7706         if (rc == 0)
7707                 return 1;
7708
7709         return rc;
7710 }
7711
/**
 * The key_rec() method of the orphan iterator: nothing is written to
 * \a key_rec and success is reported.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the orphan iterator (unused)
 * \param[in] key_rec   unused
 *
 * \retval              0 always
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
                                   const struct dt_it *di, void *key_rec)
{
        /* Intentionally empty. */
        return 0;
}
7718
7719 static const struct dt_index_operations lfsck_orphan_index_ops = {
7720         .dio_lookup             = lfsck_orphan_index_lookup,
7721         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7722         .dio_insert             = lfsck_orphan_index_insert,
7723         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7724         .dio_delete             = lfsck_orphan_index_delete,
7725         .dio_it = {
7726                 .init           = lfsck_orphan_it_init,
7727                 .fini           = lfsck_orphan_it_fini,
7728                 .get            = lfsck_orphan_it_get,
7729                 .put            = lfsck_orphan_it_put,
7730                 .next           = lfsck_orphan_it_next,
7731                 .key            = lfsck_orphan_it_key,
7732                 .key_size       = lfsck_orphan_it_key_size,
7733                 .rec            = lfsck_orphan_it_rec,
7734                 .store          = lfsck_orphan_it_store,
7735                 .load           = lfsck_orphan_it_load,
7736                 .key_rec        = lfsck_orphan_it_key_rec,
7737         }
7738 };