Whamcloud - gitweb
LU-10499 pcc: use foreign layout for PCCRO on server side
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2017, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lu_object.h>
40 #include <dt_object.h>
41 #include <lustre_fid.h>
42 #include <lustre_lib.h>
43 #include <lustre_net.h>
44 #include <md_object.h>
45 #include <obd_class.h>
46
47 #include "lfsck_internal.h"
48
/* Magic numbers for the successive revisions (V1 .. V4) of the layout LFSCK
 * data. NOTE(review): presumably these tag the on-disk layout LFSCK trace
 * file format; the users are not visible in this part of the file. */
#define LFSCK_LAYOUT_MAGIC_V1           0xB173AE14
#define LFSCK_LAYOUT_MAGIC_V2           0xB1734D76
#define LFSCK_LAYOUT_MAGIC_V3           0xB17371B9
#define LFSCK_LAYOUT_MAGIC_V4           0xB1732FED

/* The magic in current use is the newest revision, V4. */
#define LFSCK_LAYOUT_MAGIC              LFSCK_LAYOUT_MAGIC_V4
55
/* Per-sequence record. NOTE(review): the users of this structure are not
 * visible in this part of the file; the field notes below are inferred from
 * the names and must be confirmed against the callers. */
struct lfsck_layout_seq {
        /* Linkage; presumably into lfsck_layout_slave_data::llsd_seq_list,
         * which is described as the "list for lfsck_layout_seq". */
        struct list_head         lls_list;
        /* The FID sequence this record stands for (presumably). */
        __u64                    lls_seq;
        /* presumably the LAST_ID value of the sequence - TODO confirm */
        __u64                    lls_lastid;
        /* presumably the largest object ID seen so far - TODO confirm */
        __u64                    lls_lastid_known;
        /* presumably the object holding the LAST_ID - TODO confirm */
        struct dt_object        *lls_lastid_obj;
        /* presumably set when the record needs to be written back */
        unsigned int             lls_dirty:1;
};
64
/* One entry of lfsck_layout_slave_data::llsd_master_list. Entries are
 * created by lfsck_layout_llst_add() with one reference and are freed by
 * lfsck_layout_llst_put() when the last reference is dropped. */
struct lfsck_layout_slave_target {
        /* link into lfsck_layout_slave_data::llsd_master_list. */
        struct list_head        llst_list;
        /* The position for next record in the rbtree for iteration. */
        struct lu_fid           llst_fid;
        /* Dummy hash for iteration against the rbtree. */
        __u64                   llst_hash;
        /* Set to 0 when the entry is created.
         * NOTE(review): presumably compared against llsd_touch_gen; the
         * users are not visible here - confirm. */
        __u64                   llst_gen;
        /* Reference count; the entry is freed when it drops to zero. */
        atomic_t                llst_ref;
        /* Index of the target; lfsck_layout_llst_add() refuses to add a
         * second entry with the same index to the list. */
        __u32                   llst_index;
        /* How many times we have failed to get the master status. */
        int                     llst_failures;
};
78
/* Private data of the layout LFSCK component on the slave side; it is
 * reached through lfsck_component::lc_data by the rbtree helpers. */
struct lfsck_layout_slave_data {
        /* list for lfsck_layout_seq */
        struct list_head         llsd_seq_list;

        /* list for the masters involve layout verification. */
        struct list_head         llsd_master_list;
        /* Taken around every walk or modification of llsd_master_list by
         * the lfsck_layout_llst_*() helpers. */
        spinlock_t               llsd_lock;
        /* NOTE(review): presumably a generation counter matched against
         * lfsck_layout_slave_target::llst_gen - users not visible here. */
        __u64                    llsd_touch_gen;
        /* The in-RAM object standing for the layout rbtree; set up by
         * lfsck_rbtree_setup() and released by lfsck_rbtree_cleanup(). */
        struct dt_object        *llsd_rb_obj;
        /* Root of the rbtree of struct lfsck_rbtree_node. */
        struct rb_root           llsd_rb_root;
        /* Taken for read to search the rbtree, for write to insert into
         * it or to clear llsd_rbtree_valid. */
        struct rw_semaphore      llsd_rb_rwsem;
        /* Set by lfsck_rbtree_setup(), cleared by lfsck_rbtree_cleanup();
         * the rbtree must not be used while it is clear. */
        unsigned int             llsd_rbtree_valid:1;
};
92
/* Bundle of pointers: an export, an LFSCK component and a slave target.
 * NOTE(review): judging by the name, this is the argument block handed to an
 * asynchronous RPC interpret callback on the slave; the users are not
 * visible in this part of the file - confirm. */
struct lfsck_layout_slave_async_args {
        struct obd_export                *llsaa_exp;
        struct lfsck_component           *llsaa_com;
        struct lfsck_layout_slave_target *llsaa_llst;
};
98
/**
 * Check whether a component extent border sits on a stripe size boundary.
 *
 * The border is masked with (\a size - 1), so the result is a true alignment
 * test only when \a size is a power of two.
 *
 * \param[in] border    the extent border (offset) to be checked
 * \param[in] size      the stripe size
 *
 * \retval              true if none of the masked bits is set in \a border
 * \retval              false otherwise
 */
static inline bool lfsck_comp_extent_aligned(__u64 border, __u32 size)
{
        __u32 low_bits = size - 1;

        return !(border & low_bits);
}
103
104 static inline void
105 lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst)
106 {
107         if (atomic_dec_and_test(&llst->llst_ref)) {
108                 LASSERT(list_empty(&llst->llst_list));
109
110                 OBD_FREE_PTR(llst);
111         }
112 }
113
114 static inline int
115 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
116 {
117         struct lfsck_layout_slave_target *llst;
118         struct lfsck_layout_slave_target *tmp;
119         int                               rc   = 0;
120
121         OBD_ALLOC_PTR(llst);
122         if (llst == NULL)
123                 return -ENOMEM;
124
125         INIT_LIST_HEAD(&llst->llst_list);
126         llst->llst_gen = 0;
127         llst->llst_index = index;
128         atomic_set(&llst->llst_ref, 1);
129
130         spin_lock(&llsd->llsd_lock);
131         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
132                 if (tmp->llst_index == index) {
133                         rc = -EALREADY;
134                         break;
135                 }
136         }
137         if (rc == 0)
138                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
139         spin_unlock(&llsd->llsd_lock);
140
141         if (rc != 0)
142                 OBD_FREE_PTR(llst);
143
144         return rc;
145 }
146
147 static inline void
148 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
149                       struct lfsck_layout_slave_target *llst)
150 {
151         bool del = false;
152
153         spin_lock(&llsd->llsd_lock);
154         if (!list_empty(&llst->llst_list)) {
155                 list_del_init(&llst->llst_list);
156                 del = true;
157         }
158         spin_unlock(&llsd->llsd_lock);
159
160         if (del)
161                 lfsck_layout_llst_put(llst);
162 }
163
164 static inline struct lfsck_layout_slave_target *
165 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
166                                __u32 index, bool unlink)
167 {
168         struct lfsck_layout_slave_target *llst;
169
170         spin_lock(&llsd->llsd_lock);
171         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
172                 if (llst->llst_index == index) {
173                         if (unlink)
174                                 list_del_init(&llst->llst_list);
175                         else
176                                 atomic_inc(&llst->llst_ref);
177                         spin_unlock(&llsd->llsd_lock);
178
179                         return llst;
180                 }
181         }
182         spin_unlock(&llsd->llsd_lock);
183
184         return NULL;
185 }
186
187 static struct lfsck_layout_req *
188 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
189                                 struct dt_object *child, __u32 comp_id,
190                                 __u32 ost_idx, __u32 lov_idx)
191 {
192         struct lfsck_layout_req *llr;
193
194         OBD_ALLOC_PTR(llr);
195         if (llr == NULL)
196                 return ERR_PTR(-ENOMEM);
197
198         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
199         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
200         llr->llr_child = child;
201         llr->llr_comp_id = comp_id;
202         llr->llr_ost_idx = ost_idx;
203         llr->llr_lov_idx = lov_idx;
204
205         return llr;
206 }
207
208 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
209                                             struct lfsck_assistant_req *lar)
210 {
211         struct lfsck_layout_req *llr =
212                 container_of(lar, struct lfsck_layout_req, llr_lar);
213
214         lfsck_object_put(env, llr->llr_child);
215         lfsck_assistant_object_put(env, lar->lar_parent);
216         OBD_FREE_PTR(llr);
217 }
218
219 static int
220 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
221                                                struct ptlrpc_request *req,
222                                                void *args, int rc)
223 {
224         if (rc == 0) {
225                 struct lfsck_async_interpret_args *laia = args;
226                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
227
228                 ltd->ltd_synced_failures = 1;
229                 atomic_dec(laia->laia_count);
230         }
231
232         return 0;
233 }
234
/**
 * Notify remote LFSCK instances about former failures.
 *
 * The local LFSCK instance has recorded which OSTs have ever failed to respond
 * to some LFSCK verification requests (maybe because of network issues or
 * trouble on the OST itself). During the gap, the OST may have missed the
 * verification of some OST-objects, so the OST cannot know whether the related
 * OST-objects are referenced by some MDT-objects or not. In the second-stage
 * scanning these OST-objects would then be regarded as orphans; if such an
 * OST-object contains a bad parent FID for back reference, it will misguide
 * the LFSCK into a wrong repair of the fake orphan.
 *
 * To avoid the above trouble, when the layout LFSCK finishes the first-stage
 * scanning, it scans the bitmap of the ever failed OSTs and notifies them that
 * they have missed some OST-object verification and should skip the handling
 * of orphan OST-objects on all MDTs that are in the layout LFSCK.
 *
 * If any notification cannot be sent or is not acknowledged successfully,
 * LF_INCOMPLETE is set in the in-RAM layout LFSCK flags. On return,
 * lr->lr_flags2 carries those flags.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the lfsck component
 * \param[in] lr        pointer to the lfsck request
 */
static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
                                                 struct lfsck_component *com,
                                                 struct lfsck_request *lr)
{
        struct lfsck_async_interpret_args *laia  =
                                &lfsck_env_info(env)->lti_laia2;
        struct lfsck_assistant_data       *lad   = com->lc_data;
        struct lfsck_layout               *lo    = com->lc_file_ram;
        struct lfsck_instance             *lfsck = com->lc_lfsck;
        struct lfsck_tgt_descs            *ltds  = &lfsck->li_ost_descs;
        struct lfsck_tgt_desc             *ltd;
        struct ptlrpc_request_set         *set;
        atomic_t                           count;
        __u32                              idx;
        int                                rc    = 0;
        ENTRY;

        /* Nothing to sync if no target has ever failed. */
        if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
                RETURN_EXIT;

        /* If the MDT has ever failed to verify some OST-objects,
         * then sync failures with them firstly. */
        lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;

        /* "count" tracks the notifications queued below that have not
         * yet been acknowledged: it is incremented for every request
         * queued and decremented by the interpret callback on success. */
        atomic_set(&count, 0);
        memset(laia, 0, sizeof(*laia));
        laia->laia_count = &count;
        set = ptlrpc_prep_set();
        if (set == NULL)
                GOTO(out, rc = -ENOMEM);

        down_read(&ltds->ltd_rw_sem);
        for_each_set_bit(idx, lad->lad_bitmap, lad->lad_bitmap_count) {
                ltd = lfsck_ltd2tgt(ltds, idx);
                /* Skip bits for which no target descriptor is found. */
                if (unlikely(!ltd))
                        continue;

                laia->laia_ltd = ltd;
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                lfsck_layout_assistant_sync_failures_interpret,
                                laia, LFSCK_NOTIFY);
                if (rc != 0) {
                        CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to "
                               "notify target %x for %s phase1 done: "
                               "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
                               ltd->ltd_index, lad->lad_name, rc);

                        break;
                }

                atomic_inc(&count);
        }
        up_read(&ltds->ltd_rw_sem);

        /* Only wait if every request was queued and there is at least
         * one of them. */
        if (rc == 0 && atomic_read(&count) > 0)
                rc = ptlrpc_set_wait(env, set);

        ptlrpc_set_destroy(set);

        /* A counter still above zero means that at least one target did
         * not acknowledge the notification successfully. */
        if (rc == 0 && atomic_read(&count) > 0)
                rc = -EINVAL;

        GOTO(out, rc);

out:
        if (rc != 0)
                /* If failed to sync failures with the OSTs, then have to
                 * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
                 * subsequent orphan OST-object handling. */
                lo->ll_flags |= LF_INCOMPLETE;

        lr->lr_flags2 = lo->ll_flags;
}
329
330 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
331                                            struct lov_mds_md_v1 *lmm,
332                                            __u64 start, __u64 end,
333                                            __u32 comp_id,
334                                            bool ext, bool *dom)
335 {
336         __u32 magic;
337         __u32 pattern;
338         __u32 size;
339
340         magic = le32_to_cpu(lmm->lmm_magic);
341         /* If magic crashed, keep it there. Sometime later, during OST-object
342          * orphan handling, if some OST-object(s) back-point to it, it can be
343          * verified and repaired. */
344         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
345                 int rc;
346
347                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
348                         rc = -EOPNOTSUPP;
349                 else
350                         rc = -EINVAL;
351
352                 CDEBUG(D_LFSCK, "%s LOV EA magic 0x%X for the file "DFID"\n",
353                        rc == -EINVAL ? "Unknown" : "Unsupported",
354                        magic, PFID(lfsck_dto2fid(obj)));
355
356                 return rc;
357         }
358
359         pattern = le32_to_cpu(lmm->lmm_pattern);
360         *dom = !!(lov_pattern(pattern) & LOV_PATTERN_MDT);
361
362         /* XXX: DoM file verification will be supportted via LU-11081. */
363         if (lov_pattern(pattern) & LOV_PATTERN_MDT) {
364 #if 0
365                 if (start != 0) {
366                         CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not "
367                                "the first component in the mirror %x/%llu\n",
368                                PFID(lfsck_dto2fid(obj)), comp_id, start);
369
370                         return -EINVAL;
371                 }
372 #endif
373         } else if (!lov_pattern_supported_normal_comp(lov_pattern(pattern))) {
374                 CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file "
375                        DFID" in the component %x\n",
376                        pattern, PFID(lfsck_dto2fid(obj)), comp_id);
377
378                 return -EOPNOTSUPP;
379         }
380
381         size = le32_to_cpu(lmm->lmm_stripe_size);
382         if (!ext && end != LUSTRE_EOF && start != end &&
383             !lfsck_comp_extent_aligned(end, size)){
384                 CDEBUG(D_LFSCK, "not aligned border in PFL extent range "
385                        "[%llu - %llu) stripesize %u for the file "DFID
386                        " at idx %d\n", start, end, size,
387                        PFID(lfsck_dto2fid(obj)), comp_id);
388
389                 return -EINVAL;
390         }
391
392         return 0;
393 }
394
395 static int lfsck_layout_verify_header_foreign(struct dt_object *obj,
396                                               struct lov_foreign_md *lfm,
397                                               size_t len)
398 {
399         /* magic has been verified already */
400         __u32 value_len = le32_to_cpu(lfm->lfm_length);
401         /* type and flags are not checked for instance */
402
403         CDEBUG(D_INFO, "foreign LOV EA, magic %x, len %u, type %x, flags %x, for file "DFID"\n",
404                le32_to_cpu(lfm->lfm_magic), value_len,
405                le32_to_cpu(lfm->lfm_type), le32_to_cpu(lfm->lfm_flags),
406                PFID(lfsck_dto2fid(obj)));
407
408         if (len != value_len + offsetof(typeof(*lfm), lfm_value))
409                 CDEBUG(D_LFSCK, "foreign LOV EA internal size %u does not match EA full size %zu for file "DFID"\n",
410                        value_len, len, PFID(lfsck_dto2fid(obj)));
411
412         /* nothing to repair */
413         return -ENODATA;
414 }
415
416 static int lfsck_layout_verify_header(struct dt_object *obj,
417                                       struct lov_mds_md_v1 *lmm, size_t len)
418 {
419         bool p_dom = false;
420         int rc = 0;
421
422         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1 ||
423             le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_SEL) {
424                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
425                 bool p_zero = false;
426                 int i;
427                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
428
429                 if (unlikely(count == 0)) {
430                         CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid "
431                                "components count 0\n",
432                                PFID(lfsck_dto2fid(obj)));
433
434                         return -EINVAL;
435                 }
436
437                 for (i = 0; i < count && !rc; i++) {
438                         struct lov_comp_md_entry_v1 *lcme =
439                                                 &lcm->lcm_entries[i];
440                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
441                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
442                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
443                         struct lov_mds_md_v1 *v1;
444                         bool ext, inited, zero;
445                         __u32 flags;
446
447                         if (unlikely(comp_id == LCME_ID_INVAL ||
448                                      comp_id > LCME_ID_MAX)) {
449                                 CDEBUG(D_LFSCK, "found invalid PFL ID %u "
450                                        "for the file "DFID" at idx %d\n",
451                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
452
453                                 return -EINVAL;
454                         }
455
456                         flags = le32_to_cpu(lcme->lcme_flags);
457                         ext = flags & LCME_FL_EXTENSION;
458                         inited = flags & LCME_FL_INIT;
459                         zero = !!(start == end);
460
461                         if ((i == 0) && zero) {
462                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu "
463                                        "- %llu) for "DFID"\n", i, start, end,
464                                        PFID(lfsck_dto2fid(obj)));
465                                 return -EINVAL;
466                         }
467
468                         if ((zero && (inited || (i + 1 == count))) ||
469                             (start > end)) {
470                                 CDEBUG(D_LFSCK, "invalid PFL comp %d/%d: "
471                                        "[%llu, %llu) for "DFID", %sinited\n",
472                                        i, count, start, end,
473                                        PFID(lfsck_dto2fid(obj)),
474                                        inited ? "" : "NOT ");
475                                 return -EINVAL;
476                         }
477
478                         if (!ext && p_zero) {
479                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu, "
480                                        "%llu) for "DFID": NOT extension "
481                                        "after 0-length component\n", i,
482                                        start, end, PFID(lfsck_dto2fid(obj)));
483                                 return -EINVAL;
484                         }
485
486                         if (ext && (inited || p_dom || zero)) {
487                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu, "
488                                        "%llu) for "DFID": %s\n", i,
489                                        start, end, PFID(lfsck_dto2fid(obj)),
490                                        inited ? "inited extension" :
491                                        p_dom ? "extension follows DOM" :
492                                        zero ? "zero length extension" : "");
493                                 return -EINVAL;
494                         }
495
496                         v1 = (struct lov_mds_md_v1 *)((char *)lmm +
497                                                 le32_to_cpu(lcme->lcme_offset));
498                         if (le32_to_cpu(v1->lmm_magic) == LOV_MAGIC_FOREIGN)
499                                 rc = lfsck_layout_verify_header_foreign(
500                                         obj, (struct lov_foreign_md *)v1,
501                                         le32_to_cpu(lcme->lcme_size));
502                         else
503                                 rc = lfsck_layout_verify_header_v1v3(obj, v1,
504                                         start, end, comp_id, ext, &p_dom);
505
506                         p_zero = zero;
507                 }
508         } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_FOREIGN) {
509                 rc = lfsck_layout_verify_header_foreign(obj,
510                                                 (struct lov_foreign_md *)lmm,
511                                                 len);
512         } else {
513                 rc = lfsck_layout_verify_header_v1v3(obj, lmm, 0, LUSTRE_EOF,
514                                                      0, false, &p_dom);
515         }
516
517         return rc;
518 }
519
520 static int lfsck_layout_get_lovea(const struct lu_env *env,
521                                   struct dt_object *obj, struct lu_buf *buf)
522 {
523         int rc;
524         int rc1;
525
526 again:
527         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
528         if (rc == -ERANGE) {
529                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
530                 if (rc <= 0)
531                         return !rc ? -ENODATA : rc;
532
533                 lu_buf_realloc(buf, rc);
534                 if (buf->lb_buf == NULL)
535                         return -ENOMEM;
536
537                 goto again;
538         }
539
540         if (rc <= 0)
541                 return !rc ? -ENODATA : rc;
542
543         if (unlikely(buf->lb_buf == NULL)) {
544                 lu_buf_alloc(buf, rc);
545                 if (buf->lb_buf == NULL)
546                         return -ENOMEM;
547
548                 goto again;
549         }
550
551         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf, rc);
552
553         return rc1 ? rc1 : rc;
554 }
555
/* Geometry of the bitmaps attached to each struct lfsck_rbtree_node:
 * SIZE is the bitmap size in bytes (one page), WIDTH the number of bits it
 * holds (SIZE * 8), and MASK extracts the bit index of an OID within the
 * bitmap (WIDTH - 1). */
#define LFSCK_RBTREE_BITMAP_SIZE        PAGE_SIZE
#define LFSCK_RBTREE_BITMAP_WIDTH       (LFSCK_RBTREE_BITMAP_SIZE << 3)
#define LFSCK_RBTREE_BITMAP_MASK        (LFSCK_RBTREE_BITMAP_WIDTH - 1)
559
/* One node of the layout rbtree. It covers the objects of one FID sequence
 * whose OIDs are in [lrn_first_oid, lrn_first_oid +
 * LFSCK_RBTREE_BITMAP_WIDTH), with one bit per object in each bitmap. */
struct lfsck_rbtree_node {
        /* Linkage into lfsck_layout_slave_data::llsd_rb_root. */
        struct rb_node   lrn_node;
        /* The FID sequence covered by this node. */
        __u64            lrn_seq;
        /* The first OID covered: an OID with the bits of
         * LFSCK_RBTREE_BITMAP_MASK cleared. */
        __u32            lrn_first_oid;
        /* Count of the bits set in lrn_known_bitmap. */
        atomic_t         lrn_known_count;
        /* Count of the bits set in lrn_accessed_bitmap. */
        atomic_t         lrn_accessed_count;
        /* LFSCK_RBTREE_BITMAP_SIZE bytes; a bit is set for every object
         * that lfsck_rbtree_update_bitmap() has been called for. */
        void            *lrn_known_bitmap;
        /* LFSCK_RBTREE_BITMAP_SIZE bytes; a bit is set for every object
         * reported to lfsck_rbtree_update_bitmap() as accessed. */
        void            *lrn_accessed_bitmap;
};
569
570 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
571                                    __u64 seq, __u32 oid)
572 {
573         if (seq < lrn->lrn_seq)
574                 return -1;
575
576         if (seq > lrn->lrn_seq)
577                 return 1;
578
579         if (oid < lrn->lrn_first_oid)
580                 return -1;
581
582         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
583                 return 1;
584
585         return 0;
586 }
587
588 /* The caller should hold llsd->llsd_rb_lock. */
589 static struct lfsck_rbtree_node *
590 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
591                     const struct lu_fid *fid, bool *exact)
592 {
593         struct rb_node           *node  = llsd->llsd_rb_root.rb_node;
594         struct rb_node           *prev  = NULL;
595         struct lfsck_rbtree_node *lrn   = NULL;
596         int                       rc    = 0;
597
598         if (exact != NULL)
599                 *exact = true;
600
601         while (node != NULL) {
602                 prev = node;
603                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
604                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
605                 if (rc < 0)
606                         node = node->rb_left;
607                 else if (rc > 0)
608                         node = node->rb_right;
609                 else
610                         return lrn;
611         }
612
613         if (exact == NULL)
614                 return NULL;
615
616         /* If there is no exactly matched one, then to the next valid one. */
617         *exact = false;
618
619         /* The rbtree is empty. */
620         if (rc == 0)
621                 return NULL;
622
623         if (rc < 0)
624                 return lrn;
625
626         node = rb_next(prev);
627
628         /* The end of the rbtree. */
629         if (node == NULL)
630                 return NULL;
631
632         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
633
634         return lrn;
635 }
636
637 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
638                                                   const struct lu_fid *fid)
639 {
640         struct lfsck_rbtree_node *lrn;
641
642         OBD_ALLOC_PTR(lrn);
643         if (lrn == NULL)
644                 return ERR_PTR(-ENOMEM);
645
646         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
647         if (lrn->lrn_known_bitmap == NULL) {
648                 OBD_FREE_PTR(lrn);
649
650                 return ERR_PTR(-ENOMEM);
651         }
652
653         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
654         if (lrn->lrn_accessed_bitmap == NULL) {
655                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
656                 OBD_FREE_PTR(lrn);
657
658                 return ERR_PTR(-ENOMEM);
659         }
660
661         RB_CLEAR_NODE(&lrn->lrn_node);
662         lrn->lrn_seq = fid_seq(fid);
663         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
664         atomic_set(&lrn->lrn_known_count, 0);
665         atomic_set(&lrn->lrn_accessed_count, 0);
666
667         return lrn;
668 }
669
670 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
671 {
672         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
673         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
674         OBD_FREE_PTR(lrn);
675 }
676
677 /* The caller should hold lock. */
678 static struct lfsck_rbtree_node *
679 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
680                     struct lfsck_rbtree_node *lrn)
681 {
682         struct rb_node           **pos    = &llsd->llsd_rb_root.rb_node;
683         struct rb_node            *parent = NULL;
684         struct lfsck_rbtree_node  *tmp;
685         int                        rc;
686
687         while (*pos != NULL) {
688                 parent = *pos;
689                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
690                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
691                 if (rc < 0)
692                         pos = &(*pos)->rb_left;
693                 else if (rc > 0)
694                         pos = &(*pos)->rb_right;
695                 else
696                         return tmp;
697         }
698
699         rb_link_node(&lrn->lrn_node, parent, pos);
700         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
701
702         return lrn;
703 }
704
705 static const struct dt_index_operations lfsck_orphan_index_ops;
706
707 static int lfsck_rbtree_setup(const struct lu_env *env,
708                               struct lfsck_component *com)
709 {
710         struct lu_fid                   *fid    = &lfsck_env_info(env)->lti_fid;
711         struct lfsck_instance           *lfsck  = com->lc_lfsck;
712         struct dt_device                *dev    = lfsck->li_bottom;
713         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
714         struct dt_object                *obj;
715
716         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
717         fid->f_oid = lfsck_dev_idx(lfsck);
718         fid->f_ver = 0;
719         obj = dt_locate(env, dev, fid);
720         if (IS_ERR(obj))
721                 RETURN(PTR_ERR(obj));
722
723         /* Generate an in-RAM object to stand for the layout rbtree.
724          * Scanning the layout rbtree will be via the iteration over
725          * the object. In the future, the rbtree may be written onto
726          * disk with the object.
727          *
728          * Mark the object to be as exist. */
729         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
730         obj->do_index_ops = &lfsck_orphan_index_ops;
731         llsd->llsd_rb_obj = obj;
732         llsd->llsd_rbtree_valid = 1;
733         dev->dd_record_fid_accessed = 1;
734
735         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
736                lfsck_lfsck2name(lfsck));
737
738         return 0;
739 }
740
741 static void lfsck_rbtree_cleanup(const struct lu_env *env,
742                                  struct lfsck_component *com)
743 {
744         struct lfsck_instance           *lfsck = com->lc_lfsck;
745         struct lfsck_layout_slave_data  *llsd  = com->lc_data;
746         struct rb_node                  *node  = rb_first(&llsd->llsd_rb_root);
747         struct rb_node                  *next;
748         struct lfsck_rbtree_node        *lrn;
749
750         lfsck->li_bottom->dd_record_fid_accessed = 0;
751         /* Invalid the rbtree, then no others will use it. */
752         down_write(&llsd->llsd_rb_rwsem);
753         llsd->llsd_rbtree_valid = 0;
754         up_write(&llsd->llsd_rb_rwsem);
755
756         while (node != NULL) {
757                 next = rb_next(node);
758                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
759                 rb_erase(node, &llsd->llsd_rb_root);
760                 lfsck_rbtree_free(lrn);
761                 node = next;
762         }
763
764         if (llsd->llsd_rb_obj != NULL) {
765                 lfsck_object_put(env, llsd->llsd_rb_obj);
766                 llsd->llsd_rb_obj = NULL;
767         }
768
769         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
770                lfsck_lfsck2name(lfsck));
771 }
772
/**
 * Record the object \a fid in the layout rbtree bitmaps.
 *
 * The object is always marked as known; it is additionally marked as
 * accessed if \a accessed is true. The rbtree node covering the FID is
 * created on demand. FIDs that are not sane, that are LAST_ID FIDs, or that
 * are neither IDIF nor normal FIDs are ignored.
 *
 * If the node cannot be allocated while recording an access, the layout
 * LFSCK is flagged as LF_INCOMPLETE and the whole rbtree is torn down.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the lfsck component
 * \param[in] fid       the FID of the object to be recorded
 * \param[in] accessed  true to also mark the object as accessed
 */
static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
                                       struct lfsck_component *com,
                                       const struct lu_fid *fid,
                                       bool accessed)
{
        struct lfsck_layout_slave_data  *llsd   = com->lc_data;
        struct lfsck_rbtree_node        *lrn;
        /* Tells the "unlock" label which lock mode is held: the write
         * lock once the insert path has been taken, else the read lock. */
        bool                             insert = false;
        int                              idx;
        int                              rc     = 0;
        ENTRY;

        if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
                RETURN_EXIT;

        if (!fid_is_idif(fid) && !fid_is_norm(fid))
                RETURN_EXIT;

        down_read(&llsd->llsd_rb_rwsem);
        if (!llsd->llsd_rbtree_valid)
                GOTO(unlock, rc = 0);

        lrn = lfsck_rbtree_search(llsd, fid, NULL);
        if (lrn == NULL) {
                struct lfsck_rbtree_node *tmp;

                LASSERT(!insert);

                /* Drop the read lock and allocate the new node without
                 * any lock held. On allocation failure jump straight to
                 * "out", since there is nothing to unlock. */
                up_read(&llsd->llsd_rb_rwsem);
                tmp = lfsck_rbtree_new(env, fid);
                if (IS_ERR(tmp))
                        GOTO(out, rc = PTR_ERR(tmp));

                insert = true;
                down_write(&llsd->llsd_rb_rwsem);
                /* The rbtree may have been invalidated while no lock
                 * was held. */
                if (!llsd->llsd_rbtree_valid) {
                        lfsck_rbtree_free(tmp);
                        GOTO(unlock, rc = 0);
                }

                /* Another thread may have inserted a node for the same
                 * range in the meantime; then use that node and drop
                 * ours. */
                lrn = lfsck_rbtree_insert(llsd, tmp);
                if (lrn != tmp)
                        lfsck_rbtree_free(tmp);
        }

        idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
        /* Any accessed object must be a known object. */
        if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
                atomic_inc(&lrn->lrn_known_count);
        if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
                atomic_inc(&lrn->lrn_accessed_count);

        GOTO(unlock, rc = 0);

unlock:
        if (insert)
                up_write(&llsd->llsd_rb_rwsem);
        else
                up_read(&llsd->llsd_rb_rwsem);
out:
        /* A failure is only acted upon when an access was to be
         * recorded; otherwise it is silently ignored. */
        if (rc != 0 && accessed) {
                struct lfsck_layout *lo = com->lc_file_ram;

                CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing "
                       "bitmap, and will cause incorrect LFSCK OST-object "
                       "handling, so disable it to cancel orphan handling "
                       "for related device. rc = %d\n",
                       lfsck_lfsck2name(com->lc_lfsck), rc);

                lo->ll_flags |= LF_INCOMPLETE;
                lfsck_rbtree_cleanup(env, com);
        }
}
846
847 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
848                                   const struct lfsck_layout_dangling_key *src)
849 {
850         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
851         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
852         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
853 }
854
855 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
856                                   const struct lfsck_layout_dangling_key *src)
857 {
858         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
859         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
860         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
861 }
862
863 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
864                                   const struct lfsck_layout_dangling_key *src)
865 {
866         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
867         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
868         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
869 }
870
871 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
872                                   const struct lfsck_layout_dangling_key *src)
873 {
874         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
875         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
876         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
877 }
878
879 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
880                                    const struct lfsck_layout *src)
881 {
882         int i;
883
884         des->ll_magic = le32_to_cpu(src->ll_magic);
885         des->ll_status = le32_to_cpu(src->ll_status);
886         des->ll_flags = le32_to_cpu(src->ll_flags);
887         des->ll_success_count = le32_to_cpu(src->ll_success_count);
888         des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1);
889         des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2);
890         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
891         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
892         des->ll_time_last_checkpoint =
893                                 le64_to_cpu(src->ll_time_last_checkpoint);
894         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
895         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
896         des->ll_pos_first_inconsistent =
897                         le64_to_cpu(src->ll_pos_first_inconsistent);
898         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
899         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
900         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
901         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
902         for (i = 0; i < LLIT_MAX; i++)
903                 des->ll_objs_repaired[i] =
904                                 le64_to_cpu(src->ll_objs_repaired[i]);
905         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
906         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
907         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
908                        &src->ll_lldk_latest_scanned_phase2);
909 }
910
911 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
912                                    const struct lfsck_layout *src)
913 {
914         int i;
915
916         des->ll_magic = cpu_to_le32(src->ll_magic);
917         des->ll_status = cpu_to_le32(src->ll_status);
918         des->ll_flags = cpu_to_le32(src->ll_flags);
919         des->ll_success_count = cpu_to_le32(src->ll_success_count);
920         des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1);
921         des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2);
922         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
923         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
924         des->ll_time_last_checkpoint =
925                                 cpu_to_le64(src->ll_time_last_checkpoint);
926         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
927         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
928         des->ll_pos_first_inconsistent =
929                         cpu_to_le64(src->ll_pos_first_inconsistent);
930         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
931         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
932         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
933         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
934         for (i = 0; i < LLIT_MAX; i++)
935                 des->ll_objs_repaired[i] =
936                                 cpu_to_le64(src->ll_objs_repaired[i]);
937         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
938         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
939         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
940                        &src->ll_lldk_latest_scanned_phase2);
941 }
942
943 /**
944  * Load the OST bitmap from the lfsck_layout trace file.
945  *
946  * \param[in] env       pointer to the thread context
947  * \param[in] com       pointer to the lfsck component
948  *
949  * \retval              0 for success
950  * \retval              negative error number on failure or data corruption
951  */
952 static int lfsck_layout_load_bitmap(const struct lu_env *env,
953                                     struct lfsck_component *com)
954 {
955         struct dt_object *obj = com->lc_obj;
956         struct lfsck_assistant_data *lad = com->lc_data;
957         struct lfsck_layout *lo = com->lc_file_ram;
958         unsigned long *bitmap = lad->lad_bitmap;
959         loff_t pos = com->lc_file_size;
960         ssize_t size;
961         __u32 nbits;
962         int rc;
963
964         ENTRY;
965         if (com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len > lo->ll_bitmap_size)
966                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len;
967         else
968                 nbits = lo->ll_bitmap_size;
969
970         if (unlikely(nbits < BITS_PER_LONG))
971                 nbits = BITS_PER_LONG;
972
973         if (nbits > lad->lad_bitmap_count) {
974                 u32 new_bits = lad->lad_bitmap_count;
975                 unsigned long *new_bitmap;
976
977                 while (new_bits < nbits)
978                         new_bits <<= 1;
979
980                 new_bitmap = bitmap_zalloc(new_bits, GFP_KERNEL);
981                 if (new_bitmap == NULL)
982                         RETURN(-ENOMEM);
983
984                 lad->lad_bitmap = new_bitmap;
985                 lad->lad_bitmap_count = new_bits;
986                 bitmap_free(bitmap);
987                 bitmap = new_bitmap;
988         }
989
990         if (lo->ll_bitmap_size == 0) {
991                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
992                 bitmap_zero(bitmap, lad->lad_bitmap_count);
993                 RETURN(0);
994         }
995
996         size = (lo->ll_bitmap_size + 7) >> 3;
997         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap, size), &pos);
998         if (rc != size)
999                 RETURN(rc >= 0 ? -EINVAL : rc);
1000
1001         if (bitmap_empty(bitmap, lad->lad_bitmap_count))
1002                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
1003         else
1004                 set_bit(LAD_INCOMPLETE, &lad->lad_flags);
1005
1006         RETURN(0);
1007 }
1008
1009 /**
1010  * Load the layout LFSCK trace file from disk.
1011  *
1012  * The layout LFSCK trace file records the layout LFSCK status information
1013  * and other statistics, such as how many objects have been scanned, and how
1014  * many objects have been repaired, and etc. It also contains the bitmap for
1015  * failed OSTs during the layout LFSCK. All these information will be loaded
1016  * from disk to RAM when the layout LFSCK component setup.
1017  *
1018  * \param[in] env       pointer to the thread context
1019  * \param[in] com       pointer to the lfsck component
1020  *
1021  * \retval              positive number for file data corruption, the caller
1022  *                      should reset the layout LFSCK trace file
1023  * \retval              0 for success
1024  * \retval              negative error number on failure
1025  */
1026 static int lfsck_layout_load(const struct lu_env *env,
1027                              struct lfsck_component *com)
1028 {
1029         struct lfsck_layout             *lo     = com->lc_file_ram;
1030         ssize_t                          size   = com->lc_file_size;
1031         loff_t                           pos    = 0;
1032         int                              rc;
1033
1034         rc = dt_read(env, com->lc_obj,
1035                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
1036         if (rc == 0) {
1037                 return -ENOENT;
1038         } else if (rc < 0) {
1039                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
1040                        lfsck_lfsck2name(com->lc_lfsck), rc);
1041                 return rc;
1042         } else if (rc != size) {
1043                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
1044                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
1045                 return 1;
1046         }
1047
1048         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
1049         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
1050                 CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, "
1051                        "to be reset\n", lfsck_lfsck2name(com->lc_lfsck),
1052                        lo->ll_magic, LFSCK_LAYOUT_MAGIC);
1053                 return 1;
1054         }
1055
1056         return 0;
1057 }
1058
1059 /**
1060  * Store the layout LFSCK trace file on disk.
1061  *
1062  * The layout LFSCK trace file records the layout LFSCK status information
1063  * and other statistics, such as how many objects have been scanned, and how
1064  * many objects have been repaired, and etc. It also contains the bitmap for
1065  * failed OSTs during the layout LFSCK. All these information will be synced
1066  * from RAM to disk periodically.
1067  *
1068  * \param[in] env       pointer to the thread context
1069  * \param[in] com       pointer to the lfsck component
1070  *
1071  * \retval              0 for success
1072  * \retval              negative error number on failure
1073  */
1074 static int lfsck_layout_store(const struct lu_env *env,
1075                               struct lfsck_component *com)
1076 {
1077         struct dt_object *obj = com->lc_obj;
1078         struct lfsck_instance *lfsck = com->lc_lfsck;
1079         struct lfsck_layout *lo_ram = com->lc_file_ram;
1080         struct lfsck_layout *lo = com->lc_file_disk;
1081         struct thandle *th;
1082         struct dt_device *dev = lfsck_obj2dev(obj);
1083         unsigned long *bitmap = NULL;
1084         loff_t pos;
1085         ssize_t size = com->lc_file_size;
1086         __u32 nbits = 0;
1087         int rc;
1088
1089         ENTRY;
1090         if (lfsck->li_master) {
1091                 struct lfsck_assistant_data *lad = com->lc_data;
1092
1093                 bitmap = lad->lad_bitmap;
1094                 nbits = lad->lad_bitmap_count;
1095
1096                 LASSERT(nbits > 0);
1097                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1098         }
1099
1100         lo_ram->ll_bitmap_size = nbits;
1101         lfsck_layout_cpu_to_le(lo, lo_ram);
1102         th = dt_trans_create(env, dev);
1103         if (IS_ERR(th))
1104                 GOTO(log, rc = PTR_ERR(th));
1105
1106         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1107                                      (loff_t)0, th);
1108         if (rc != 0)
1109                 GOTO(out, rc);
1110
1111         if (bitmap != NULL) {
1112                 rc = dt_declare_record_write(env, obj,
1113                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1114                                 (loff_t)size, th);
1115                 if (rc != 0)
1116                         GOTO(out, rc);
1117         }
1118
1119         rc = dt_trans_start_local(env, dev, th);
1120         if (rc != 0)
1121                 GOTO(out, rc);
1122
1123         pos = 0;
1124         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1125         if (rc != 0)
1126                 GOTO(out, rc);
1127
1128         if (bitmap != NULL) {
1129                 pos = size;
1130                 rc = dt_record_write(env, obj,
1131                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1132                                 &pos, th);
1133         }
1134
1135         GOTO(out, rc);
1136
1137 out:
1138         dt_trans_stop(env, dev, th);
1139
1140 log:
1141         if (rc != 0)
1142                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1143                        lfsck_lfsck2name(lfsck), rc);
1144
1145         return rc;
1146 }
1147
1148 static int lfsck_layout_init(const struct lu_env *env,
1149                              struct lfsck_component *com)
1150 {
1151         struct lfsck_layout *lo = com->lc_file_ram;
1152         int rc;
1153
1154         memset(lo, 0, com->lc_file_size);
1155         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1156         lo->ll_status = LS_INIT;
1157         down_write(&com->lc_sem);
1158         rc = lfsck_layout_store(env, com);
1159         if (rc == 0 && com->lc_lfsck->li_master)
1160                 rc = lfsck_load_sub_trace_files(env, com,
1161                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1162         up_write(&com->lc_sem);
1163
1164         return rc;
1165 }
1166
1167 static int fid_is_for_ostobj(const struct lu_env *env,
1168                              struct lfsck_instance *lfsck,
1169                              struct dt_object *obj, const struct lu_fid *fid)
1170 {
1171         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1172         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1173         struct lustre_ost_attrs *loa;
1174         int                      rc;
1175
1176         fld_range_set_any(range);
1177         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1178         if (rc == 0) {
1179                 if (fld_range_is_ost(range))
1180                         return 1;
1181
1182                 return 0;
1183         }
1184
1185         loa = &lfsck_env_info(env)->lti_loa;
1186         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1187                           XATTR_NAME_LMA);
1188         if (rc >= (int)sizeof(struct lustre_mdt_attrs)) {
1189                 lustre_lma_swab(&loa->loa_lma);
1190
1191                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1192         }
1193
1194         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1195
1196         return rc > 0;
1197 }
1198
1199 static struct lfsck_layout_seq *
1200 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1201 {
1202         struct lfsck_layout_seq *lls;
1203
1204         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1205                 if (lls->lls_seq == seq)
1206                         return lls;
1207
1208                 if (lls->lls_seq > seq)
1209                         return NULL;
1210         }
1211
1212         return NULL;
1213 }
1214
1215 static void
1216 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1217                         struct lfsck_layout_seq *lls)
1218 {
1219         struct lfsck_layout_seq *tmp;
1220         struct list_head        *pos = &llsd->llsd_seq_list;
1221
1222         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1223                 if (lls->lls_seq < tmp->lls_seq) {
1224                         pos = &tmp->lls_list;
1225                         break;
1226                 }
1227         }
1228         list_add_tail(&lls->lls_list, pos);
1229 }
1230
1231 static int
1232 lfsck_layout_lastid_create(const struct lu_env *env,
1233                            struct lfsck_instance *lfsck,
1234                            struct dt_object *obj)
1235 {
1236         struct lfsck_thread_info *info   = lfsck_env_info(env);
1237         struct lu_attr           *la     = &info->lti_la;
1238         struct dt_object_format  *dof    = &info->lti_dof;
1239         struct lfsck_bookmark    *bk     = &lfsck->li_bookmark_ram;
1240         struct dt_device         *dt     = lfsck_obj2dev(obj);
1241         struct thandle           *th;
1242         __u64                     lastid = 0;
1243         loff_t                    pos    = 0;
1244         int                       rc;
1245         ENTRY;
1246
1247         if (bk->lb_param & LPF_DRYRUN)
1248                 return 0;
1249
1250         memset(la, 0, sizeof(*la));
1251         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1252         la->la_valid = LA_MODE | LA_UID | LA_GID;
1253         memset(dof, 0, sizeof(*dof));
1254         dof->dof_type = dt_mode_to_dft(S_IFREG);
1255
1256         th = lfsck_trans_create(env, dt, lfsck);
1257         if (IS_ERR(th))
1258                 GOTO(log, rc = PTR_ERR(th));
1259
1260         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1261         if (rc != 0)
1262                 GOTO(stop, rc);
1263
1264         rc = dt_declare_record_write(env, obj,
1265                                      lfsck_buf_get(env, &lastid,
1266                                                    sizeof(lastid)),
1267                                      pos, th);
1268         if (rc != 0)
1269                 GOTO(stop, rc);
1270
1271         rc = dt_trans_start_local(env, dt, th);
1272         if (rc != 0)
1273                 GOTO(stop, rc);
1274
1275         dt_write_lock(env, obj, 0);
1276         if (likely(dt_object_exists(obj) == 0)) {
1277                 rc = dt_create(env, obj, la, NULL, dof, th);
1278                 if (rc == 0)
1279                         rc = dt_record_write(env, obj,
1280                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1281                                 &pos, th);
1282         }
1283         dt_write_unlock(env, obj);
1284
1285         GOTO(stop, rc);
1286
1287 stop:
1288         dt_trans_stop(env, dt, th);
1289
1290 log:
1291         CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for <seq> "
1292                "%#llx: rc = %d\n",
1293                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1294
1295         return rc;
1296 }
1297
1298 static int
1299 lfsck_layout_lastid_reload(const struct lu_env *env,
1300                            struct lfsck_component *com,
1301                            struct lfsck_layout_seq *lls)
1302 {
1303         __u64   lastid;
1304         loff_t  pos     = 0;
1305         int     rc;
1306
1307         dt_read_lock(env, lls->lls_lastid_obj, 0);
1308         rc = dt_record_read(env, lls->lls_lastid_obj,
1309                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1310         dt_read_unlock(env, lls->lls_lastid_obj);
1311         if (unlikely(rc != 0))
1312                 return rc;
1313
1314         lastid = le64_to_cpu(lastid);
1315         if (lastid < lls->lls_lastid_known) {
1316                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1317                 struct lfsck_layout     *lo     = com->lc_file_ram;
1318
1319                 lls->lls_lastid = lls->lls_lastid_known;
1320                 lls->lls_dirty = 1;
1321                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1322                         LASSERT(lfsck->li_out_notify != NULL);
1323
1324                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1325                                              LE_LASTID_REBUILDING);
1326                         lo->ll_flags |= LF_CRASHED_LASTID;
1327
1328                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
1329                                "LAST_ID file (1) for the sequence %#llx"
1330                                ", old value %llu, known value %llu\n",
1331                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1332                                lastid, lls->lls_lastid);
1333                 }
1334         } else if (lastid >= lls->lls_lastid) {
1335                 lls->lls_lastid = lastid;
1336                 lls->lls_dirty = 0;
1337         }
1338
1339         return 0;
1340 }
1341
1342 static int
1343 lfsck_layout_lastid_store(const struct lu_env *env,
1344                           struct lfsck_component *com)
1345 {
1346         struct lfsck_instance           *lfsck  = com->lc_lfsck;
1347         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
1348         struct dt_device                *dt     = lfsck->li_bottom;
1349         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
1350         struct lfsck_layout_seq         *lls;
1351         struct thandle                  *th;
1352         __u64                            lastid;
1353         int                              rc     = 0;
1354         int                              rc1    = 0;
1355
1356         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1357                 loff_t pos = 0;
1358
1359                 if (!lls->lls_dirty)
1360                         continue;
1361
1362                 CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for "
1363                        "<seq> %#llx as <oid> %llu\n",
1364                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1365
1366                 if (bk->lb_param & LPF_DRYRUN) {
1367                         lls->lls_dirty = 0;
1368                         continue;
1369                 }
1370
1371                 th = lfsck_trans_create(env, dt, lfsck);
1372                 if (IS_ERR(th)) {
1373                         rc1 = PTR_ERR(th);
1374                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1375                                "the LAST_ID for <seq> %#llx(1): rc = %d\n",
1376                                lfsck_lfsck2name(com->lc_lfsck),
1377                                lls->lls_seq, rc1);
1378                         continue;
1379                 }
1380
1381                 lastid = cpu_to_le64(lls->lls_lastid);
1382                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1383                                              lfsck_buf_get(env, &lastid,
1384                                                            sizeof(lastid)),
1385                                              pos, th);
1386                 if (rc != 0)
1387                         goto stop;
1388
1389                 rc = dt_trans_start_local(env, dt, th);
1390                 if (rc != 0)
1391                         goto stop;
1392
1393                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1394                 rc = dt_record_write(env, lls->lls_lastid_obj,
1395                                      lfsck_buf_get(env, &lastid,
1396                                      sizeof(lastid)), &pos, th);
1397                 dt_write_unlock(env, lls->lls_lastid_obj);
1398                 if (rc == 0)
1399                         lls->lls_dirty = 0;
1400
1401 stop:
1402                 dt_trans_stop(env, dt, th);
1403                 if (rc != 0) {
1404                         rc1 = rc;
1405                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1406                                "the LAST_ID for <seq> %#llx(2): rc = %d\n",
1407                                lfsck_lfsck2name(com->lc_lfsck),
1408                                lls->lls_seq, rc1);
1409                 }
1410         }
1411
1412         return rc1;
1413 }
1414
/**
 * Load the LAST_ID value of a sequence, creating the file if it is missing.
 *
 * The LAST_ID object for lls->lls_seq is located on the bottom device. If
 * it does not exist, the LAST_ID file is regarded as crashed and is created
 * via lfsck_layout_lastid_create(). If it exists, its first 8 bytes are
 * read into lls->lls_lastid and converted from little-endian; an empty file
 * (zero bytes read) is regarded as crashed too. A crash is reported via
 * li_out_notify and flagged with LF_CRASHED_LASTID only if the flag is not
 * set yet.
 *
 * On success the reference on the located object is kept in
 * lls->lls_lastid_obj; on failure it is dropped.
 *
 * NOTE(review): the fail-injection path below releases and re-takes
 * com->lc_sem for write, so the caller presumably holds it for write -
 * confirm against the callers.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the lfsck component
 * \param[in] lls       the sequence entry to be loaded
 *
 * \retval              0 for success
 * \retval              negative error number on failure
 */
1415 static int
1416 lfsck_layout_lastid_load(const struct lu_env *env,
1417                          struct lfsck_component *com,
1418                          struct lfsck_layout_seq *lls)
1419 {
1420         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1421         struct lfsck_layout     *lo     = com->lc_file_ram;
1422         struct lu_fid           *fid    = &lfsck_env_info(env)->lti_fid;
1423         struct dt_object        *obj;
1424         loff_t                   pos    = 0;
1425         int                      rc;
1426         ENTRY;
1427
1428         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1429         obj = dt_locate(env, lfsck->li_bottom, fid);
1430         if (IS_ERR(obj))
1431                 RETURN(PTR_ERR(obj));
1432
1433         /* LAST_ID crashed, to be rebuilt */
1434         if (dt_object_exists(obj) == 0) {
1435                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1436                         LASSERT(lfsck->li_out_notify != NULL);
1437
1438                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1439                                              LE_LASTID_REBUILDING);
1440                         lo->ll_flags |= LF_CRASHED_LASTID;
1441
1442                         CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the "
1443                                "LAST_ID file for sequence %#llx\n",
1444                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1445
                             /* Fail-injection only: with lc_sem released,
                              * wait for up to cfs_fail_val seconds or until
                              * the LFSCK thread is no longer running. */
1446                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1447                             cfs_fail_val > 0) {
1448                                 struct ptlrpc_thread *thread =
1449                                         &lfsck->li_thread;
1450
1451                                 up_write(&com->lc_sem);
1452                                 wait_event_idle_timeout(
1453                                         thread->t_ctl_waitq,
1454                                         !thread_is_running(thread),
1455                                         cfs_time_seconds(cfs_fail_val));
1456                                 down_write(&com->lc_sem);
1457                         }
1458                 }
1459
1460                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1461         } else {
1462                 dt_read_lock(env, obj, 0);
1463                 rc = dt_read(env, obj,
1464                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1465                         &pos);
1466                 dt_read_unlock(env, obj);
                     /* Only "nothing read" (0) and a full 8-byte read are
                      * accepted; a partial read becomes -EFAULT. */
1467                 if (rc != 0 && rc != sizeof(__u64))
1468                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1469
                     /* Nothing was read: treat the LAST_ID as crashed. */
1470                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1471                         LASSERT(lfsck->li_out_notify != NULL);
1472
1473                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1474                                              LE_LASTID_REBUILDING);
1475                         lo->ll_flags |= LF_CRASHED_LASTID;
1476
1477                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid "
1478                                "LAST_ID file for the sequence %#llx"
1479                                ": rc = %d\n",
1480                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1481                 }
1482
1483                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1484                 rc = 0;
1485         }
1486
1487         GOTO(out, rc);
1488
1489 out:
             /* Keep the object reference in the entry only on success. */
1490         if (rc != 0)
1491                 lfsck_object_put(env, obj);
1492         else
1493                 lls->lls_lastid_obj = obj;
1494
1495         return rc;
1496 }
1497
1498 static void lfsck_layout_record_failure(const struct lu_env *env,
1499                                         struct lfsck_instance *lfsck,
1500                                         struct lfsck_layout *lo)
1501 {
1502         __u64 cookie;
1503
1504         lo->ll_objs_failed_phase1++;
1505         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1506                                                         lfsck->li_di_oit);
1507         if (lo->ll_pos_first_inconsistent == 0 ||
1508             lo->ll_pos_first_inconsistent < cookie) {
1509                 lo->ll_pos_first_inconsistent = cookie;
1510
1511                 CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired "
1512                        "inconsistency at the pos [%llu]\n",
1513                        lfsck_lfsck2name(lfsck),
1514                        lo->ll_pos_first_inconsistent);
1515         }
1516 }
1517
1518 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1519                                            struct lfsck_component *com,
1520                                            int rc)
1521 {
1522         struct lfsck_instance   *lfsck = com->lc_lfsck;
1523         struct lfsck_layout     *lo    = com->lc_file_ram;
1524
1525         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1526                lfsck_lfsck2name(lfsck), rc);
1527
1528         down_write(&com->lc_sem);
1529         lo->ll_run_time_phase2 += ktime_get_seconds() -
1530                                   com->lc_time_last_checkpoint;
1531         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
1532         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1533
1534         if (rc > 0) {
1535                 if (lo->ll_flags & LF_INCOMPLETE) {
1536                         lo->ll_status = LS_PARTIAL;
1537                 } else {
1538                         if (lfsck->li_master) {
1539                                 struct lfsck_assistant_data *lad = com->lc_data;
1540
1541                                 if (test_bit(LAD_INCOMPLETE, &lad->lad_flags))
1542                                         lo->ll_status = LS_PARTIAL;
1543                                 else
1544                                         lo->ll_status = LS_COMPLETED;
1545                         } else {
1546                                 lo->ll_status = LS_COMPLETED;
1547                         }
1548                 }
1549                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1550                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1551                         lo->ll_flags &= ~LF_INCONSISTENT;
1552                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1553                 lo->ll_success_count++;
1554         } else if (rc == 0) {
1555                 if (lfsck->li_status != 0)
1556                         lo->ll_status = lfsck->li_status;
1557                 else
1558                         lo->ll_status = LS_STOPPED;
1559         } else {
1560                 lo->ll_status = LS_FAILED;
1561         }
1562
1563         rc = lfsck_layout_store(env, com);
1564         up_write(&com->lc_sem);
1565
1566         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1567                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1568
1569         return rc;
1570 }
1571
1572 static int lfsck_layout_trans_stop(const struct lu_env *env,
1573                                    struct dt_device *dev,
1574                                    struct thandle *handle, int result)
1575 {
1576         int rc;
1577
1578         /* XXX: If there is something worng or it needs to repair nothing,
1579          *      then notify the lower to stop the modification. Currently,
1580          *      we use th_result for such purpose, that may be replaced by
1581          *      some rollback mechanism in the future. */
1582         handle->th_result = result;
1583         rc = dt_trans_stop(env, dev, handle);
1584         if (result != 0)
1585                 return result > 0 ? 0 : result;
1586
1587         return rc == 0 ? 1 : rc;
1588 }
1589
1590 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1591                                          struct lfsck_component *com,
1592                                          const struct lu_fid *pfid,
1593                                          const struct lu_fid *cfid,
1594                                          __u32 comp_id, __u32 ea_off,
1595                                          __u32 ost_idx)
1596 {
1597         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1598         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1599         struct dt_device *dev;
1600         struct dt_object *obj;
1601         struct thandle *th = NULL;
1602         int idx;
1603         int rc = 0;
1604         ENTRY;
1605
1606         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
1607                 GOTO(log, rc = 0);
1608
1609         idx = lfsck_sub_trace_file_fid2idx(pfid);
1610         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1611         dev = lfsck_obj2dev(obj);
1612
1613         fid_cpu_to_be(&key->lldk_fid, pfid);
1614         key->lldk_comp_id = cpu_to_be32(comp_id);
1615         key->lldk_ea_off = cpu_to_be32(ea_off);
1616
1617         fid_cpu_to_be(rec, cfid);
1618         rec->f_ver = cpu_to_be32(ost_idx);
1619
1620         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1621
1622         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1623         if (IS_ERR(th))
1624                 GOTO(unlock, rc = PTR_ERR(th));
1625
1626         rc = dt_declare_insert(env, obj,
1627                                (const struct dt_rec *)rec,
1628                                (const struct dt_key *)key, th);
1629         if (rc)
1630                 GOTO(unlock, rc);
1631
1632         rc = dt_trans_start_local(env, dev, th);
1633         if (rc)
1634                 GOTO(unlock, rc);
1635
1636         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1637                        (const struct dt_key *)key, th);
1638
1639         GOTO(unlock, rc);
1640
1641 unlock:
1642         if (th && !IS_ERR(th))
1643                 dt_trans_stop(env, dev, th);
1644
1645         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1646
1647 log:
1648         CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, "
1649                "ea_off = %u, ost_idx = %u, into the trace file for further "
1650                "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
1651                PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc);
1652
1653         return rc;
1654 }
1655
1656 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1657                                          struct lfsck_component *com,
1658                                          const struct lu_fid *fid,
1659                                          __u32 comp_id, __u32 ea_off)
1660 {
1661         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1662         struct dt_device *dev;
1663         struct dt_object *obj;
1664         struct thandle *th = NULL;
1665         int idx;
1666         int rc = 0;
1667         ENTRY;
1668
1669         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
1670                 GOTO(log, rc = 0);
1671
1672         idx = lfsck_sub_trace_file_fid2idx(fid);
1673         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1674         dev = lfsck_obj2dev(obj);
1675
1676         fid_cpu_to_be(&key->lldk_fid, fid);
1677         key->lldk_comp_id = cpu_to_be32(comp_id);
1678         key->lldk_ea_off = cpu_to_be32(ea_off);
1679
1680         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1681
1682         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1683         if (IS_ERR(th))
1684                 GOTO(unlock, rc = PTR_ERR(th));
1685
1686         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1687         if (rc)
1688                 GOTO(unlock, rc);
1689
1690         rc = dt_trans_start_local(env, dev, th);
1691         if (rc)
1692                 GOTO(unlock, rc);
1693
1694         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1695
1696         GOTO(unlock, rc);
1697
1698 unlock:
1699         if (th && !IS_ERR(th))
1700                 dt_trans_stop(env, dev, th);
1701
1702         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1703
1704 log:
1705         CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID
1706                ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1707                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1708
1709         return rc;
1710 }
1711
1712 /**
1713  * Get the system default stripe size.
1714  *
1715  * \param[in] env       pointer to the thread context
1716  * \param[in] lfsck     pointer to the lfsck instance
1717  * \param[out] size     pointer to the default stripe size
1718  *
1719  * \retval              0 for success
1720  * \retval              negative error number on failure
1721  */
1722 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1723                                            struct lfsck_instance *lfsck,
1724                                            __u32 *size)
1725 {
1726         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1727         struct dt_object        *root;
1728         int                      rc;
1729
1730         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1731         if (IS_ERR(root))
1732                 return PTR_ERR(root);
1733
1734         /* Get the default stripe size via xattr_get on the backend root. */
1735         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1736                           XATTR_NAME_LOV);
1737         if (rc > 0) {
1738                 /* The lum->lmm_stripe_size is LE mode. The *size also
1739                  * should be LE mode. So it is unnecessary to convert. */
1740                 *size = lum->lmm_stripe_size;
1741                 rc = 0;
1742         } else if (unlikely(rc == 0)) {
1743                 rc = -EINVAL;
1744         }
1745
1746         lfsck_object_put(env, root);
1747
1748         return rc;
1749 }
1750
1751 /**
1752  * \retval       +1: repaired
1753  * \retval        0: did nothing
1754  * \retval      -ve: on error
1755  */
1756 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1757                                      struct lfsck_instance *lfsck,
1758                                      struct thandle *handle,
1759                                      struct dt_object *parent,
1760                                      const struct lu_fid *cfid,
1761                                      struct lu_buf *buf,
1762                                      struct lov_mds_md_v1 *lmm,
1763                                      struct lov_ost_data_v1 *slot,
1764                                      int fl, __u32 ost_idx, int size)
1765 {
1766         struct ost_id           *oi     = &lfsck_env_info(env)->lti_oi;
1767         struct lu_buf            ea_buf;
1768         int                      rc;
1769         __u32                    magic;
1770         __u32                    pattern;
1771         __u16                    count;
1772         ENTRY;
1773
1774         magic = le32_to_cpu(lmm->lmm_magic);
1775         pattern = le32_to_cpu(lmm->lmm_pattern);
1776         count = le16_to_cpu(lmm->lmm_stripe_count);
1777
1778         fid_to_ostid(cfid, oi);
1779         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1780         slot->l_ost_gen = cpu_to_le32(0);
1781         slot->l_ost_idx = cpu_to_le32(ost_idx);
1782
1783         if (pattern & LOV_PATTERN_F_HOLE) {
1784                 struct lov_ost_data_v1 *objs;
1785                 int                     i;
1786
1787                 if (magic == LOV_MAGIC_V1)
1788                         objs = &lmm->lmm_objects[0];
1789                 else
1790                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1791                 for (i = 0; i < count; i++, objs++) {
1792                         if (lovea_slot_is_dummy(objs))
1793                                 break;
1794                 }
1795
1796                 /* If the @slot is the last dummy slot to be refilled,
1797                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */
1798                 if (i == count) {
1799                         lmm->lmm_pattern =
1800                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1801
1802                         CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID
1803                                ": parent "DFID"\n", lfsck_lfsck2name(lfsck),
1804                                PFID(cfid), PFID(lfsck_dto2fid(parent)));
1805                 }
1806         }
1807
1808         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1809         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1810         if (rc == 0)
1811                 rc = 1;
1812
1813         RETURN(rc);
1814 }
1815
1816 static struct lov_ost_data_v1 *
1817 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1818                             const struct lu_fid *pfid,
1819                             __u32 stripe_size, __u32 ea_off,
1820                             __u32 pattern, __u16 count)
1821 {
1822         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1823         lmm->lmm_pattern = cpu_to_le32(pattern);
1824         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1825         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1826         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1827         lmm->lmm_stripe_count = cpu_to_le16(count);
1828         lmm->lmm_layout_gen = cpu_to_le16(1);
1829         memset(&lmm->lmm_objects[0], 0,
1830                sizeof(struct lov_ost_data_v1) * count);
1831
1832         return &lmm->lmm_objects[ea_off];
1833 }
1834
1835 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1836                                      struct lfsck_instance *lfsck,
1837                                      struct ost_layout *ol,
1838                                      struct dt_object *parent,
1839                                      struct lu_buf *buf, __u32 ea_off,
1840                                      struct lov_mds_md_v1 **lmm,
1841                                      struct lov_ost_data_v1 **objs)
1842 {
1843         int size;
1844         __u32 stripe_size = ol->ol_stripe_size;
1845         __u32 pattern = LOV_PATTERN_RAID0;
1846         __u16 count;
1847
1848         if (ol->ol_stripe_count != 0)
1849                 count = ol->ol_stripe_count;
1850         else
1851                 count = ea_off + 1;
1852
1853         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1854         LASSERTF(buf->lb_len >= size,
1855                  "buffer len %d is less than real size %d\n",
1856                  (int)buf->lb_len, size);
1857
1858         if (stripe_size == 0) {
1859                 int rc;
1860
1861                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1862                 if (rc)
1863                         return rc;
1864         }
1865
1866         *lmm = buf->lb_buf;
1867         if (ol->ol_stripe_count > 1 ||
1868             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1869                 pattern |= LOV_PATTERN_F_HOLE;
1870                 memset(&(*lmm)->lmm_objects[0], 0,
1871                        count * sizeof(struct lov_ost_data_v1));
1872         }
1873
1874         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1875                                 stripe_size, ea_off, pattern, count);
1876
1877         return size;
1878 }
1879
1880 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1881                                        struct lu_orphan_rec_v3 *rec,
1882                                        struct dt_object *parent,
1883                                        struct lu_buf *buf, __u32 ea_off,
1884                                        struct lov_mds_md_v1 **lmm,
1885                                        struct lov_ost_data_v1 **objs)
1886 {
1887         struct ost_layout *ol = &rec->lor_layout;
1888         struct lov_comp_md_v1 *lcm;
1889         struct lov_comp_md_entry_v1 *lcme;
1890         __u32 pattern = LOV_PATTERN_RAID0;
1891         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1892         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1893         int size = offset + lcme_size;
1894
1895         LASSERTF(buf->lb_len >= size,
1896                  "buffer len %d is less than real size %d\n",
1897                  (int)buf->lb_len, size);
1898
1899         lcm = buf->lb_buf;
1900         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1901         lcm->lcm_size = cpu_to_le32(size);
1902         if (rec->lor_range) {
1903                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1904                                                   rec->lor_range);
1905                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1906         } else if (rec->lor_layout_version) {
1907                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1908                                                   rec->lor_range);
1909                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1910         } else {
1911                 /*
1912                  * if OST doesn't provide layout version, then try
1913                  * to inherit one from MDS's layout, but increment
1914                  * it so the client notices and applies modified
1915                  * layout
1916                  */
1917                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1918                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1919         }
1920         lcm->lcm_entry_count = cpu_to_le16(1);
1921         /* Currently, we do not know how many mirrors will be, set it as zero
1922          * at the beginning. It will be updated when more mirrors are found. */
1923         lcm->lcm_mirror_count = 0;
1924
1925         lcme = &lcm->lcm_entries[0];
1926         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1927         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1928         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1929         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1930         lcme->lcme_offset = cpu_to_le32(offset);
1931         lcme->lcme_size = cpu_to_le32(lcme_size);
1932         lcme->lcme_layout_gen = lcm->lcm_layout_gen;
1933         if (ol->ol_stripe_count > 1)
1934                 pattern |= LOV_PATTERN_F_HOLE;
1935
1936         *lmm = buf->lb_buf + offset;
1937         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1938                                             ol->ol_stripe_size, ea_off,
1939                                             pattern, ol->ol_stripe_count);
1940
1941         return size;
1942 }
1943
1944 static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm,
1945                                     struct lov_comp_md_entry_v1 *lcme,
1946                                     __u32 version, __u32 range)
1947 {
1948         struct lov_comp_md_entry_v1 *tmp;
1949         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
1950         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
1951         __u32 gen = version + range;
1952         __u32 tmp_gen;
1953         int i;
1954         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1955         __u16 flags = le16_to_cpu(lcm->lcm_flags);
1956
1957         if (!gen)
1958                 gen = 1;
1959         lcme->lcme_layout_gen = cpu_to_le32(gen);
1960         if (le32_to_cpu(lcm->lcm_layout_gen) < gen)
1961                 lcm->lcm_layout_gen = cpu_to_le32(gen);
1962
1963         if (range)
1964                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1965         else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0)
1966                 lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
1967
1968         for (i = 0; i < count; i++) {
1969                 tmp = &lcm->lcm_entries[i];
1970                 if (le64_to_cpu(tmp->lcme_extent.e_end) <= start)
1971                         continue;
1972
1973                 if (le64_to_cpu(tmp->lcme_extent.e_start) >= end)
1974                         continue;
1975
1976                 if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE)
1977                         continue;
1978
1979                 tmp_gen = le32_to_cpu(tmp->lcme_layout_gen);
1980                 /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag,
1981                  * then it should be the latest version of all mirrors. */
1982                 if (tmp_gen == 0 || tmp_gen > gen) {
1983                         lcme->lcme_flags = cpu_to_le32(
1984                                 le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE);
1985                         break;
1986                 }
1987
1988                 if (tmp_gen < gen)
1989                         tmp->lcme_flags = cpu_to_le32(
1990                                 le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE);
1991         }
1992 }
1993
1994 static int lfsck_layout_add_comp(const struct lu_env *env,
1995                                  struct lfsck_instance *lfsck,
1996                                  struct thandle *handle,
1997                                  struct lu_orphan_rec_v3 *rec,
1998                                  struct dt_object *parent,
1999                                  const struct lu_fid *cfid,
2000                                  struct lu_buf *buf, __u32 ost_idx,
2001                                  __u32 ea_off, int pos, bool new_mirror)
2002 {
2003         struct ost_layout *ol = &rec->lor_layout;
2004         struct lov_comp_md_v1 *lcm = buf->lb_buf;
2005         struct lov_comp_md_entry_v1 *lcme;
2006         struct lov_mds_md_v1 *lmm;
2007         struct lov_ost_data_v1 *objs;
2008         int added = sizeof(*lcme) +
2009                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2010         int size = le32_to_cpu(lcm->lcm_size) + added;
2011         int rc;
2012         int i;
2013         __u32 offset;
2014         __u32 pattern = LOV_PATTERN_RAID0;
2015         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
2016         ENTRY;
2017
2018         lu_buf_check_and_grow(buf, size);
2019         /* set the lcm again because lu_buf_check_and_grow() may
2020          * have reallocated the buf. */
2021         lcm = buf->lb_buf;
2022         lcm->lcm_size = cpu_to_le32(size);
2023         lcm->lcm_entry_count = cpu_to_le16(count + 1);
2024         if (new_mirror)
2025                 le16_add_cpu(&lcm->lcm_mirror_count, 1);
2026
2027         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
2028          *    with distance of 'added'. */
2029         if (pos < count) {
2030                 size = 0;
2031                 for (i = pos; i < count; i++) {
2032                         lcme = &lcm->lcm_entries[i];
2033                         size += le32_to_cpu(lcme->lcme_size);
2034                 }
2035
2036                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
2037                 memmove(buf->lb_buf + offset + added,
2038                         buf->lb_buf + offset, size);
2039         }
2040
2041         size = 0;
2042         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
2043          *    of 'sizeof(struct lov_comp_md_entry_v1)' */
2044         if (pos > 0) {
2045                 for (i = 0; i < pos; i++) {
2046                         lcme = &lcm->lcm_entries[i];
2047                         size += le32_to_cpu(lcme->lcme_size);
2048                 }
2049
2050                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
2051                 memmove(buf->lb_buf + offset + sizeof(*lcme),
2052                         buf->lb_buf + offset, size);
2053         }
2054
2055         /* 3. Recalculate the enter offset for the component [pos, count-1] */
2056         for (i = count - 1; i >= pos; i--) {
2057                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
2058                 lcm->lcm_entries[i + 1].lcme_offset =
2059                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
2060                                                 lcme_offset) + added);
2061         }
2062
2063         /* 4. Recalculate the enter offset for the component [0, pos) */
2064         for (i = 0; i < pos; i++) {
2065                 lcm->lcm_entries[i].lcme_offset =
2066                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
2067                                                 lcme_offset) + sizeof(*lcme));
2068         }
2069
2070         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
2071         /* 4. Insert the new component header (entry) at the slot 'pos'. */
2072         lcme = &lcm->lcm_entries[pos];
2073         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
2074         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
2075         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
2076         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
2077         lcme->lcme_offset = cpu_to_le32(offset);
2078         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
2079                                                       LOV_MAGIC_V1));
2080
2081         if (ol->ol_stripe_count > 1)
2082                 pattern |= LOV_PATTERN_F_HOLE;
2083
2084         lmm = buf->lb_buf + offset;
2085         /* 5. Insert teh new component body at the 'offset'. */
2086         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
2087                                            ol->ol_stripe_size, ea_off,
2088                                            pattern, ol->ol_stripe_count);
2089
2090         /* 6. Update mirror related flags and version. */
2091         lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version,
2092                                 rec->lor_range);
2093
2094         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2095                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
2096                                        le32_to_cpu(lcm->lcm_size));
2097
2098         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for "
2099                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2100                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2101                "comp_end %llu, layout version %u, range %u, "
2102                "%s LOV EA hole: rc = %d\n",
2103                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2104                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2105                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2106                rec->lor_layout_version, rec->lor_range,
2107                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2108                "with" : "without", rc);
2109
2110         RETURN(rc);
2111 }
2112
2113 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
2114                                           struct lfsck_instance *lfsck,
2115                                           struct thandle *handle,
2116                                           struct ost_layout *ol,
2117                                           struct dt_object *parent,
2118                                           const struct lu_fid *cfid,
2119                                           struct lu_buf *buf, __u32 ost_idx,
2120                                           __u32 ea_off)
2121 {
2122         struct lov_mds_md_v1 *lmm = buf->lb_buf;
2123         struct lov_ost_data_v1 *objs;
2124         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
2125         __u32 magic = le32_to_cpu(lmm->lmm_magic);
2126         int size;
2127         int gap;
2128         int rc;
2129         ENTRY;
2130
2131         /* The original LOVEA maybe re-generated via old filter_fid, at
2132          * that time, we do not know the stripe count and stripe size. */
2133         if (ol->ol_stripe_count > count)
2134                 count = ol->ol_stripe_count;
2135         if (ol->ol_stripe_size != 0 &&
2136             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
2137                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
2138
2139         if (magic == LOV_MAGIC_V1)
2140                 objs = &lmm->lmm_objects[count];
2141         else
2142                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
2143
2144         gap = ea_off - count;
2145         if (gap >= 0)
2146                 count = ea_off + 1;
2147
2148         size = lov_mds_md_size(count, magic);
2149         LASSERTF(buf->lb_len >= size,
2150                  "buffer len %d is less than real size %d\n",
2151                  (int)buf->lb_len, size);
2152
2153         if (gap > 0) {
2154                 memset(objs, 0, gap * sizeof(*objs));
2155                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
2156         }
2157
2158         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2159         lmm->lmm_stripe_count = cpu_to_le16(count);
2160         objs += gap;
2161
2162         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2163                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
2164
2165         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for "
2166                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2167                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2168                "comp_end %llu, %s LOV EA hole: rc = %d\n",
2169                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2170                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2171                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2172                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2173                "with" : "without", rc);
2174
2175         RETURN(rc);
2176 }
2177
2178 /**
2179  * \retval       +1: repaired
2180  * \retval        0: did nothing
2181  * \retval      -ve: on error
2182  */
2183 static int lfsck_layout_update_lovea(const struct lu_env *env,
2184                                      struct lfsck_instance *lfsck,
2185                                      struct thandle *handle,
2186                                      struct lu_orphan_rec_v3 *rec,
2187                                      struct dt_object *parent,
2188                                      const struct lu_fid *cfid,
2189                                      struct lu_buf *buf, int fl,
2190                                      __u32 ost_idx, __u32 ea_off)
2191 {
2192         struct ost_layout *ol = &rec->lor_layout;
2193         struct lov_mds_md_v1 *lmm = NULL;
2194         struct lov_ost_data_v1 *objs = NULL;
2195         int rc = 0;
2196         ENTRY;
2197
2198         if (ol->ol_comp_id != 0)
2199                 rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off,
2200                                                  &lmm, &objs);
2201         else
2202                 rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout,
2203                                                parent, buf, ea_off, &lmm,
2204                                                &objs);
2205         if (rc > 0)
2206                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2207                                                buf, lmm, objs, fl, ost_idx, rc);
2208
2209         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for "
2210                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2211                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2212                "comp_end %llu, layout version %u, range %u, fl %d, "
2213                "%s LOV EA hole: rc = %d\n",
2214                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2215                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2216                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2217                rec->lor_layout_version, rec->lor_range, fl,
2218                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2219                "with" : "without", rc);
2220
2221         RETURN(rc);
2222 }
2223
2224 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2225                                       struct lfsck_component *com,
2226                                       struct dt_object *child,
2227                                       const struct lu_fid *pfid,
2228                                       const struct ost_layout *ol, __u32 offset,
2229                                       __u32 version, __u32 range)
2230 {
2231         struct dt_device        *dev    = lfsck_obj2dev(child);
2232         struct filter_fid       *ff     = &lfsck_env_info(env)->lti_ff;
2233         struct thandle          *handle;
2234         struct lu_buf            buf    = { NULL };
2235         int                      rc;
2236
2237         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
2238                 RETURN(0);
2239
2240         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2241         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2242         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2243          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2244          * parent MDT-object's layout EA. */
2245         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2246         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2247         ff->ff_layout_version = cpu_to_le32(version);
2248         ff->ff_range = cpu_to_le32(range);
2249         lfsck_buf_init(&buf, ff, sizeof(*ff));
2250
2251         if (!dt_object_exists(child) || lfsck_is_dead_obj(child))
2252                 return 0;
2253
2254         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
2255         if (IS_ERR(handle))
2256                 RETURN(PTR_ERR(handle));
2257
2258         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2259         if (rc != 0)
2260                 GOTO(stop, rc);
2261
2262         rc = dt_trans_start_local(env, dev, handle);
2263         if (rc != 0)
2264                 GOTO(stop, rc);
2265
2266         dt_write_lock(env, child, 0);
2267         if (dt_object_exists(child) && !lfsck_is_dead_obj(child))
2268                 rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2269         dt_write_unlock(env, child);
2270
2271         GOTO(stop, rc);
2272
2273 stop:
2274         dt_trans_stop(env, dev, handle);
2275
2276         return rc;
2277 }
2278
2279 /**
2280  * \retval       +1: repaired
2281  * \retval        0: did nothing
2282  * \retval      -ve: on error
2283  */
2284 static int lfsck_layout_update_pfid(const struct lu_env *env,
2285                                     struct lfsck_component *com,
2286                                     struct dt_object *parent,
2287                                     struct lu_fid *cfid,
2288                                     struct dt_device *cdev,
2289                                     struct lu_orphan_rec_v3 *rec, __u32 ea_off)
2290 {
2291         struct dt_object        *child;
2292         int                      rc     = 0;
2293         ENTRY;
2294
2295         child = lfsck_object_find_by_dev(env, cdev, cfid);
2296         if (IS_ERR(child))
2297                 RETURN(PTR_ERR(child));
2298
2299         rc = __lfsck_layout_update_pfid(env, com, child,
2300                                         lu_object_fid(&parent->do_lu),
2301                                         &rec->lor_layout, ea_off,
2302                                         rec->lor_layout_version,
2303                                         rec->lor_range);
2304         lfsck_object_put(env, child);
2305
2306         RETURN(rc == 0 ? 1 : rc);
2307 }
2308
2309 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2310 {
2311         if (ol->ol_comp_id != 0)
2312                 return sizeof(struct lov_comp_md_v1) +
2313                        sizeof(struct lov_comp_md_entry_v1) +
2314                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2315
2316         if (ol->ol_stripe_count != 0)
2317                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2318
2319         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2320 }
2321
2322 /**
2323  * This function will create the MDT-object with the given (partial) LOV EA.
2324  *
2325  * Under some data corruption cases, the MDT-object of the file may be lost,
2326  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2327  * re-create the MDT-object with the orphan OST-object(s) information.
2328  *
2329  * On the other hand, the LFSCK may has created some OST-object for repairing
2330  * dangling LOV EA reference, but as the LFSCK processing, it may find that
2331  * the old OST-object is there and should replace the former new created OST
2332  * object. Unfortunately, some others have modified such newly created object.
2333  * To keep the data (both new and old), the LFSCK will create MDT-object with
2334  * new FID to reference the original OST-object.
2335  *
2336  * \param[in] env       pointer to the thread context
2337  * \param[in] com       pointer to the lfsck component
2338  * \param[in] ltd       pointer to target device descriptor
2339  * \param[in] rec       pointer to the record for the orphan OST-object
2340  * \param[in] cfid      pointer to FID for the orphan OST-object
2341  * \param[in] infix     additional information, such as the FID for original
2342  *                      MDT-object and the stripe offset in the LOV EA
2343  * \param[in] type      the type for describing why the orphan MDT-object is
2344  *                      created. The rules are as following:
2345  *
2346  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2347  *                      same slot in the layout EA. Then the LFSCK will create
2348  *                      new MDT-object(s) to hold the conflict OST-object(s).
2349  *
2350  *  type "N":           The orphan OST-object does not know which one was the
2351  *                      real parent MDT-object, so the LFSCK uses new FID for
2352  *                      its parent MDT-object.
2353  *
2354  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2355  *                      but does not know the position (the file name) in the
2356  *                      layout.
2357  *
2358  *  type "D":           The MDT-object is a directory, it may knows its parent
2359  *                      but because there is no valid linkEA, the LFSCK cannot
2360  *                      know where to put it back to the namespace.
2361  *  type "O":           The MDT-object has no linkEA, and there is no name
2362  *                      entry that references the MDT-object.
2363  *
2364  *  type "P":           The orphan object to be created was a parent directory
2365  *                      of some MDT-object which linkEA shows that the @orphan
2366  *                      object is missing.
2367  *
2368  * The orphan name will be like:
2369  * ${FID}-${infix}-${type}-${conflict_version}
2370  *
2371  * \param[in] ea_off    the stripe offset in the LOV EA
2372  *
2373  * \retval              positive on repaired something
2374  * \retval              0 if needs to repair nothing
2375  * \retval              negative error number on failure
2376  */
2377 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2378                                         struct lfsck_component *com,
2379                                         struct lfsck_tgt_desc *ltd,
2380                                         struct lu_orphan_rec_v3 *rec,
2381                                         struct lu_fid *cfid,
2382                                         const char *infix,
2383                                         const char *type,
2384                                         __u32 ea_off)
2385 {
2386         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2387         struct dt_insert_rec            *dtrec  = &info->lti_dt_rec;
2388         char                            *name   = info->lti_key;
2389         struct lu_attr                  *la     = &info->lti_la2;
2390         struct dt_object_format         *dof    = &info->lti_dof;
2391         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2392         struct lu_fid                   *pfid   = &rec->lor_rec.lor_fid;
2393         struct lu_fid                   *tfid   = &info->lti_fid3;
2394         struct dt_device                *dev    = lfsck->li_bottom;
2395         struct dt_object                *lpf    = lfsck->li_lpf_obj;
2396         struct dt_object                *pobj   = NULL;
2397         struct dt_object                *cobj   = NULL;
2398         struct thandle                  *th     = NULL;
2399         struct lu_buf                   *ea_buf = &info->lti_big_buf;
2400         struct lu_buf                    lov_buf;
2401         struct lfsck_lock_handle        *llh    = &info->lti_llh;
2402         struct linkea_data               ldata  = { NULL };
2403         struct lu_buf                    linkea_buf;
2404         const struct lu_name            *pname;
2405         int                              size   = 0;
2406         int                              idx    = 0;
2407         int                              rc     = 0;
2408         ENTRY;
2409
2410         if (lfsck_is_dryrun(lfsck))
2411                 GOTO(log, rc = 0);
2412
2413         if (unlikely(lpf == NULL))
2414                 GOTO(log, rc = -ENXIO);
2415
2416         /* We use two separated transactions to repair the inconsistency.
2417          *
2418          * 1) create the MDT-object locally.
2419          * 2) update the OST-object's PFID EA if necessary.
2420          *
2421          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2422          * updated when the layout LFSCK run next time.
2423          *
2424          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2425          * when the layout LFSCK run next time. */
2426
2427         if (fid_is_zero(pfid)) {
2428                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2429                 if (rc != 0)
2430                         GOTO(log, rc);
2431
2432                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2433                 if (IS_ERR(cobj))
2434                         GOTO(log, rc = PTR_ERR(cobj));
2435         }
2436
2437         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2438         if (IS_ERR(pobj))
2439                 GOTO(log, rc = PTR_ERR(pobj));
2440
2441         LASSERT(infix != NULL);
2442         LASSERT(type != NULL);
2443
2444         memset(la, 0, sizeof(*la));
2445         la->la_uid = rec->lor_rec.lor_uid;
2446         la->la_gid = rec->lor_rec.lor_gid;
2447         la->la_mode = S_IFREG | S_IRUSR;
2448         la->la_valid = LA_MODE | LA_UID | LA_GID;
2449
2450         memset(dof, 0, sizeof(*dof));
2451         dof->dof_type = dt_mode_to_dft(S_IFREG);
2452         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2453          * the stripe(s). The LFSCK will specify the LOV EA via
2454          * lfsck_layout_update_lovea(). */
2455
2456         size = lfsck_lovea_size(&rec->lor_layout, ea_off);
2457         if (ea_buf->lb_len < size) {
2458                 lu_buf_realloc(ea_buf, size);
2459                 if (ea_buf->lb_buf == NULL)
2460                         GOTO(log, rc = -ENOMEM);
2461         }
2462
2463 again:
2464         do {
2465                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2466                          type, idx++);
2467                 rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2468                 if (rc != 0 && rc != -ENOENT)
2469                         GOTO(log, rc);
2470         } while (rc == 0);
2471
2472         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2473                         MDS_INODELOCK_UPDATE, LCK_PW);
2474         if (rc != 0)
2475                 GOTO(log, rc);
2476
2477         /* Re-check whether the name conflict with othrs after taken
2478          * the ldlm lock. */
2479         rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2480         if (unlikely(rc == 0)) {
2481                 lfsck_unlock(llh);
2482                 goto again;
2483         }
2484
2485         if (rc != -ENOENT)
2486                 GOTO(unlock, rc);
2487
2488         pname = lfsck_name_get_const(env, name, strlen(name));
2489         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2490                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2491         if (rc != 0)
2492                 GOTO(unlock, rc);
2493
2494         /* The 1st transaction. */
2495         th = lfsck_trans_create(env, dev, lfsck);
2496         if (IS_ERR(th))
2497                 GOTO(unlock, rc = PTR_ERR(th));
2498
2499         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2500         if (rc != 0)
2501                 GOTO(stop, rc);
2502
2503         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2504         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2505                                   LU_XATTR_CREATE, th);
2506         if (rc != 0)
2507                 GOTO(stop, rc);
2508
2509         dtrec->rec_fid = pfid;
2510         dtrec->rec_type = S_IFREG;
2511         rc = dt_declare_insert(env, lpf,
2512                                (const struct dt_rec *)dtrec,
2513                                (const struct dt_key *)name, th);
2514         if (rc != 0)
2515                 GOTO(stop, rc);
2516
2517         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2518                        ldata.ld_leh->leh_len);
2519         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2520                                   XATTR_NAME_LINK, 0, th);
2521         if (rc != 0)
2522                 GOTO(stop, rc);
2523
2524         rc = dt_trans_start_local(env, dev, th);
2525         if (rc != 0)
2526                 GOTO(stop, rc);
2527
2528         dt_write_lock(env, pobj, 0);
2529         rc = dt_create(env, pobj, la, NULL, dof, th);
2530         if (rc == 0)
2531                 rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid,
2532                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2533         dt_write_unlock(env, pobj);
2534         if (rc < 0)
2535                 GOTO(stop, rc);
2536
2537         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2538                        (const struct dt_key *)name, th);
2539         if (rc != 0)
2540                 GOTO(stop, rc);
2541
2542         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2543         if (rc == 0 && cobj != NULL) {
2544                 dt_trans_stop(env, dev, th);
2545                 th = NULL;
2546
2547                 /* The 2nd transaction. */
2548                 rc = __lfsck_layout_update_pfid(env, com, cobj, pfid,
2549                                                 &rec->lor_layout, ea_off,
2550                                                 rec->lor_layout_version,
2551                                                 rec->lor_range);
2552         }
2553
2554         GOTO(stop, rc);
2555
2556 stop:
2557         if (th != NULL)
2558                 dt_trans_stop(env, dev, th);
2559
2560 unlock:
2561         lfsck_unlock(llh);
2562
2563 log:
2564         if (cobj != NULL && !IS_ERR(cobj))
2565                 lfsck_object_put(env, cobj);
2566         if (pobj != NULL && !IS_ERR(pobj))
2567                 lfsck_object_put(env, pobj);
2568
2569         if (rc < 0)
2570                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to "
2571                        "recreate the lost MDT-object: parent "DFID
2572                        ", child "DFID", OST-index %u, stripe-index %u, "
2573                        "infix %s, type %s: rc = %d\n",
2574                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2575                        ltd->ltd_index, ea_off, infix, type, rc);
2576
2577         return rc >= 0 ? 1 : rc;
2578 }
2579
2580 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2581                                                    struct lfsck_component *com,
2582                                                    const struct lu_fid *fid,
2583                                                    __u32 index)
2584 {
2585         struct lfsck_thread_info *info  = lfsck_env_info(env);
2586         struct lfsck_request     *lr    = &info->lti_lr;
2587         struct lfsck_instance    *lfsck = com->lc_lfsck;
2588         struct lfsck_tgt_desc    *ltd;
2589         struct ptlrpc_request    *req;
2590         struct lfsck_request     *tmp;
2591         struct obd_export        *exp;
2592         int                       rc    = 0;
2593         ENTRY;
2594
2595         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2596         if (unlikely(ltd == NULL))
2597                 RETURN(-ENXIO);
2598
2599         exp = ltd->ltd_exp;
2600         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2601                 GOTO(put, rc = -EOPNOTSUPP);
2602
2603         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2604         if (req == NULL)
2605                 GOTO(put, rc = -ENOMEM);
2606
2607         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2608         if (rc != 0) {
2609                 ptlrpc_request_free(req);
2610
2611                 GOTO(put, rc);
2612         }
2613
2614         memset(lr, 0, sizeof(*lr));
2615         lr->lr_event = LE_CONDITIONAL_DESTROY;
2616         lr->lr_active = LFSCK_TYPE_LAYOUT;
2617         lr->lr_fid = *fid;
2618
2619         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2620         *tmp = *lr;
2621         ptlrpc_request_set_replen(req);
2622
2623         rc = ptlrpc_queue_wait(req);
2624         ptlrpc_req_finished(req);
2625
2626         GOTO(put, rc);
2627
2628 put:
2629         lfsck_tgt_put(ltd);
2630
2631         return rc;
2632 }
2633
2634 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2635                                                   struct lfsck_component *com,
2636                                                   struct lfsck_request *lr)
2637 {
2638         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2639         struct lu_attr                  *la     = &info->lti_la;
2640         union ldlm_policy_data          *policy = &info->lti_policy;
2641         struct ldlm_res_id              *resid  = &info->lti_resid;
2642         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2643         struct dt_device                *dev    = lfsck->li_bottom;
2644         struct lu_fid                   *fid    = &lr->lr_fid;
2645         struct dt_object                *obj;
2646         struct thandle                  *th     = NULL;
2647         struct lustre_handle             lh     = { 0 };
2648         __u64                            flags  = 0;
2649         int                              rc     = 0;
2650         ENTRY;
2651
2652         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
2653                 RETURN(0);
2654
2655         obj = lfsck_object_find_by_dev(env, dev, fid);
2656         if (IS_ERR(obj))
2657                 RETURN(PTR_ERR(obj));
2658
2659         dt_read_lock(env, obj, 0);
2660         if (dt_object_exists(obj) == 0 ||
2661             lfsck_is_dead_obj(obj)) {
2662                 dt_read_unlock(env, obj);
2663
2664                 GOTO(put, rc = -ENOENT);
2665         }
2666
2667         /* Get obj's attr without lock firstly. */
2668         rc = dt_attr_get(env, obj, la);
2669         dt_read_unlock(env, obj);
2670         if (rc != 0)
2671                 GOTO(put, rc);
2672
2673         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2674                 GOTO(put, rc = -ETXTBSY);
2675
2676         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2677         LASSERT(lfsck->li_namespace != NULL);
2678
2679         memset(policy, 0, sizeof(*policy));
2680         policy->l_extent.end = OBD_OBJECT_EOF;
2681         ost_fid_build_resid(fid, resid);
2682         rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid,
2683                                     LDLM_EXTENT, policy, LCK_EX, &flags,
2684                                     ldlm_blocking_ast, ldlm_completion_ast,
2685                                     NULL, NULL, 0, LVB_T_NONE, NULL, &lh);
2686         if (rc != ELDLM_OK)
2687                 GOTO(put, rc = -EIO);
2688
2689         dt_write_lock(env, obj, 0);
2690         /* Get obj's attr within lock again. */
2691         rc = dt_attr_get(env, obj, la);
2692         if (rc != 0)
2693                 GOTO(unlock, rc);
2694
2695         if (la->la_ctime != 0)
2696                 GOTO(unlock, rc = -ETXTBSY);
2697
2698         th = lfsck_trans_create(env, dev, lfsck);
2699         if (IS_ERR(th))
2700                 GOTO(unlock, rc = PTR_ERR(th));
2701
2702         rc = dt_declare_ref_del(env, obj, th);
2703         if (rc != 0)
2704                 GOTO(stop, rc);
2705
2706         rc = dt_declare_destroy(env, obj, th);
2707         if (rc != 0)
2708                 GOTO(stop, rc);
2709
2710         rc = dt_trans_start_local(env, dev, th);
2711         if (rc != 0)
2712                 GOTO(stop, rc);
2713
2714         rc = dt_ref_del(env, obj, th);
2715         if (rc != 0)
2716                 GOTO(stop, rc);
2717
2718         rc = dt_destroy(env, obj, th);
2719         if (rc == 0)
2720                 CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty "
2721                        "OST-object "DFID" that was created for reparing "
2722                        "dangling referenced case. But the original missing "
2723                        "OST-object is found now.\n",
2724                        lfsck_lfsck2name(lfsck), PFID(fid));
2725
2726         GOTO(stop, rc);
2727
2728 stop:
2729         dt_trans_stop(env, dev, th);
2730
2731 unlock:
2732         dt_write_unlock(env, obj);
2733         ldlm_lock_decref(&lh, LCK_EX);
2734
2735 put:
2736         lfsck_object_put(env, obj);
2737
2738         return rc;
2739 }
2740
2741 /**
2742  * Some OST-object has occupied the specified layout EA slot.
2743  * Such OST-object may be generated by the LFSCK when repair
2744  * dangling referenced MDT-object, which can be indicated by
2745  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2746  * is true and such OST-object has not been modified yet, we
2747  * will replace it with the orphan OST-object; otherwise the
2748  * LFSCK will create new MDT-object to reference the orphan.
2749  *
2750  * \retval       +1: repaired
2751  * \retval        0: did nothing
2752  * \retval      -ve: on error
2753  */
2754 static int lfsck_layout_conflict_create(const struct lu_env *env,
2755                                         struct lfsck_component *com,
2756                                         struct lfsck_tgt_desc *ltd,
2757                                         struct lu_orphan_rec_v3 *rec,
2758                                         struct dt_object *parent,
2759                                         struct lu_fid *cfid,
2760                                         struct lu_buf *ea_buf,
2761                                         struct lov_mds_md_v1 *lmm,
2762                                         struct lov_ost_data_v1 *slot,
2763                                         __u32 ea_off, int lovea_size)
2764 {
2765         struct lfsck_thread_info *info          = lfsck_env_info(env);
2766         struct lu_fid            *cfid2         = &info->lti_fid2;
2767         struct ost_id            *oi            = &info->lti_oi;
2768         struct dt_device         *dev           = lfsck_obj2dev(parent);
2769         struct thandle           *th            = NULL;
2770         struct lustre_handle      lh            = { 0 };
2771         __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
2772         int                       rc            = 0;
2773         ENTRY;
2774
2775         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2776                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2777                         RETURN(0);
2778         }
2779
2780         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2781         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2782         if (rc != 0)
2783                 GOTO(out, rc);
2784
2785         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2786                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2787                               LCK_EX);
2788         if (rc != 0)
2789                 GOTO(out, rc);
2790
2791         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2792
2793         /* If the conflict OST-obejct is not created for fixing dangling
2794          * referenced MDT-object in former LFSCK check/repair, or it has
2795          * been modified by others, then we cannot destroy it. Re-create
2796          * a new MDT-object for the orphan OST-object. */
2797         if (rc == -ETXTBSY) {
2798                 /* No need the layout lock on the original parent. */
2799                 lfsck_ibits_unlock(&lh, LCK_EX);
2800
2801                 fid_zero(&rec->lor_rec.lor_fid);
2802                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2803                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2804                          ea_off);
2805                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2806                                                 info->lti_tmpbuf, "C", ea_off);
2807
2808                 RETURN(rc);
2809         }
2810
2811         if (rc != 0 && rc != -ENOENT)
2812                 GOTO(unlock, rc);
2813
2814         if (lfsck_is_dryrun(com->lc_lfsck))
2815                 GOTO(unlock, rc = 0);
2816
2817         th = lfsck_trans_create(env, dev, com->lc_lfsck);
2818         if (IS_ERR(th))
2819                 GOTO(unlock, rc = PTR_ERR(th));
2820
2821         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2822                                   LU_XATTR_REPLACE, th);
2823         if (rc != 0)
2824                 GOTO(stop, rc);
2825
2826         rc = dt_trans_start_local(env, dev, th);
2827         if (rc != 0)
2828                 GOTO(stop, rc);
2829
2830         dt_write_lock(env, parent, 0);
2831         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2832         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2833                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2834                                        ltd->ltd_index, lovea_size);
2835         dt_write_unlock(env, parent);
2836
2837         GOTO(stop, rc);
2838
2839 stop:
2840         dt_trans_stop(env, dev, th);
2841
2842 unlock:
2843         lfsck_ibits_unlock(&lh, LCK_EX);
2844
2845 out:
2846         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict "
2847                "OST-object "DFID" on the OST %x with the orphan "DFID" on "
2848                "the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2849                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2850                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2851                ea_off, rc);
2852
2853         return rc >= 0 ? 1 : rc;
2854 }
2855
2856 /**
2857  * \retval       +1: repaired
2858  * \retval        0: did nothing
2859  * \retval      -ve: on error
2860  */
2861 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2862                                        struct lfsck_component *com,
2863                                        struct lfsck_tgt_desc *ltd,
2864                                        struct lu_orphan_rec_v3 *rec,
2865                                        struct dt_object *parent,
2866                                        struct lu_fid *cfid,
2867                                        __u32 ost_idx, __u32 ea_off)
2868 {
2869         struct lfsck_thread_info *info          = lfsck_env_info(env);
2870         struct lu_buf            *buf           = &info->lti_big_buf;
2871         struct lu_fid            *fid           = &info->lti_fid2;
2872         struct ost_id            *oi            = &info->lti_oi;
2873         struct lfsck_instance    *lfsck         = com->lc_lfsck;
2874         struct dt_device         *dt            = lfsck_obj2dev(parent);
2875         struct lfsck_bookmark    *bk            = &lfsck->li_bookmark_ram;
2876         struct ost_layout        *ol            = &rec->lor_layout;
2877         struct lov_comp_md_v1    *lcm           = NULL;
2878         struct lov_comp_md_entry_v1 *lcme       = NULL;
2879         struct thandle           *handle        = NULL;
2880         size_t                    lovea_size;
2881         struct lov_mds_md_v1     *lmm;
2882         struct lov_ost_data_v1   *objs;
2883         struct lustre_handle      lh            = { 0 };
2884         __u32                     magic;
2885         __u32 flags = 0;
2886         int                       fl            = 0;
2887         int                       rc            = 0;
2888         int                       rc1;
2889         int                       i;
2890         int pos = 0;
2891         __u16 count;
2892         bool locked = false;
2893         bool new_mirror = true;
2894         ENTRY;
2895
2896         if (lfsck_is_dryrun(lfsck))
2897                 RETURN(0);
2898
2899         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2900                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2901                               LCK_EX);
2902         if (rc != 0) {
2903                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate "
2904                        "LOV EA for "DFID": parent "DFID", OST-index %u, "
2905                        "stripe-index %u, comp_id %u, comp_start %llu, "
2906                        "comp_end %llu, layout version %u, range %u: rc = %d\n",
2907                        lfsck_lfsck2name(lfsck), PFID(cfid),
2908                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2909                        ol->ol_comp_id, ol->ol_comp_start,
2910                        ol->ol_comp_end, rec->lor_layout_version,
2911                        rec->lor_range, rc);
2912
2913                 RETURN(rc);
2914         }
2915
2916 again:
2917         if (locked) {
2918                 dt_write_unlock(env, parent);
2919                 locked = false;
2920         }
2921
2922         if (handle != NULL) {
2923                 dt_trans_stop(env, dt, handle);
2924                 handle = NULL;
2925         }
2926
2927         if (rc < 0)
2928                 GOTO(unlock_layout, rc);
2929
2930         lovea_size = rc;
2931         if (buf->lb_len < lovea_size) {
2932                 lu_buf_realloc(buf, lovea_size);
2933                 if (buf->lb_buf == NULL)
2934                         GOTO(unlock_layout, rc = -ENOMEM);
2935         }
2936
2937         if (!(bk->lb_param & LPF_DRYRUN)) {
2938                 handle = lfsck_trans_create(env, dt, lfsck);
2939                 if (IS_ERR(handle))
2940                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2941
2942                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2943                                           fl, handle);
2944                 if (rc != 0)
2945                         GOTO(stop, rc);
2946
2947                 rc = dt_trans_start_local(env, dt, handle);
2948                 if (rc != 0)
2949                         GOTO(stop, rc);
2950         }
2951
2952         dt_write_lock(env, parent, 0);
2953         locked = true;
2954         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2955         if (rc == -ERANGE) {
2956                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2957                 LASSERT(rc != 0);
2958                 goto again;
2959         } else if (rc == -ENODATA || rc == 0) {
2960                 lovea_size = lfsck_lovea_size(ol, ea_off);
2961                 /* If the declared is not big enough, re-try. */
2962                 if (buf->lb_len < lovea_size) {
2963                         rc = lovea_size;
2964                         goto again;
2965                 }
2966                 fl = LU_XATTR_CREATE;
2967         } else if (rc < 0) {
2968                 GOTO(unlock_parent, rc);
2969         } else if (unlikely(buf->lb_len == 0)) {
2970                 goto again;
2971         } else {
2972                 fl = LU_XATTR_REPLACE;
2973                 lovea_size = rc;
2974         }
2975
2976         if (fl == LU_XATTR_CREATE) {
2977                 if (bk->lb_param & LPF_DRYRUN)
2978                         GOTO(unlock_parent, rc = 1);
2979
2980                 LASSERT(buf->lb_len >= lovea_size);
2981
2982                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2983                                                cfid, buf, fl, ost_idx, ea_off);
2984
2985                 GOTO(unlock_parent, rc);
2986         }
2987
2988         lmm = buf->lb_buf;
2989         rc1 = lfsck_layout_verify_header(parent, lmm, lovea_size);
2990
2991         /* If the LOV EA crashed, the rebuild it. */
2992         if (rc1 == -EINVAL) {
2993                 if (bk->lb_param & LPF_DRYRUN)
2994                         GOTO(unlock_parent, rc = 1);
2995
2996                 LASSERT(buf->lb_len >= lovea_size);
2997
2998                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2999                                                cfid, buf, fl, ost_idx, ea_off);
3000
3001                 GOTO(unlock_parent, rc);
3002         }
3003
3004         /* For other unknown magic/pattern, keep the current LOV EA. */
3005         if (rc1 == -EOPNOTSUPP)
3006                 GOTO(unlock_parent, rc1 = 0);
3007
3008         if (rc1)
3009                 GOTO(unlock_parent, rc = rc1);
3010
3011         magic = le32_to_cpu(lmm->lmm_magic);
3012         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3013                 __u64 start;
3014                 __u64 end;
3015                 __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id);
3016                 __u16 mirror_id1;
3017
3018                 if (bk->lb_param & LPF_DRYRUN)
3019                         GOTO(unlock_parent, rc = 1);
3020
3021                 lcm = buf->lb_buf;
3022                 count = le16_to_cpu(lcm->lcm_entry_count);
3023                 for (i = 0; i < count; pos = ++i) {
3024                         lcme = &lcm->lcm_entries[i];
3025                         start = le64_to_cpu(lcme->lcme_extent.e_start);
3026                         end = le64_to_cpu(lcme->lcme_extent.e_end);
3027                         mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id));
3028
3029                         if (mirror_id0 > mirror_id1)
3030                                 continue;
3031
3032                         if (mirror_id0 < mirror_id1)
3033                                 break;
3034
3035                         new_mirror = false;
3036                         if (end <= ol->ol_comp_start)
3037                                 continue;
3038
3039                         if (start >= ol->ol_comp_end)
3040                                 break;
3041
3042                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
3043                         magic = le32_to_cpu(lmm->lmm_magic);
3044                         flags = le32_to_cpu(lcme->lcme_flags);
3045                         goto further;
3046                 }
3047
3048                 rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent,
3049                                 cfid, buf, ost_idx, ea_off, pos, new_mirror);
3050
3051                 GOTO(unlock_parent, rc);
3052         }
3053
3054 further:
3055         count = le16_to_cpu(lmm->lmm_stripe_count);
3056         if (count == 0)
3057                 GOTO(unlock_parent, rc = -EINVAL);
3058         LASSERT(count > 0);
3059
3060         /* Exceed the current end of MDT-object layout EA. Then extend it. */
3061         if (count <= ea_off) {
3062                 if (bk->lb_param & LPF_DRYRUN)
3063                         GOTO(unlock_parent, rc = 1);
3064
3065                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
3066                 /* If the declared is not big enough, re-try. */
3067                 if (buf->lb_len < lovea_size) {
3068                         rc = lovea_size;
3069                         goto again;
3070                 }
3071
3072                 if (lcm) {
3073                         LASSERT(lcme);
3074
3075                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
3076                         lfsck_layout_update_lcm(lcm, lcme,
3077                                                 rec->lor_layout_version,
3078                                                 rec->lor_range);
3079                 }
3080
3081                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
3082                                         parent, cfid, buf, ost_idx, ea_off);
3083
3084                 GOTO(unlock_parent, rc);
3085         }
3086
3087         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
3088
3089         if (magic == LOV_MAGIC_V1) {
3090                 objs = &lmm->lmm_objects[0];
3091         } else {
3092                 LASSERT(magic == LOV_MAGIC_V3);
3093                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
3094         }
3095
3096         for (i = 0; i < count; i++, objs++) {
3097                 /* The MDT-object was created via lfsck_layout_recover_create()
3098                  * by others before, and we fill the dummy layout EA. */
3099                 if ((lcme && !(flags & LCME_FL_INIT)) ||
3100                      lovea_slot_is_dummy(objs)) {
3101                         if (i != ea_off)
3102                                 continue;
3103
3104                         if (bk->lb_param & LPF_DRYRUN)
3105                                 GOTO(unlock_parent, rc = 1);
3106
3107                         lmm->lmm_layout_gen =
3108                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3109                         if (lcme) {
3110                                 LASSERT(lcm);
3111
3112                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
3113                                         ol->ol_stripe_size ||
3114                                     le16_to_cpu(lmm->lmm_stripe_count) !=
3115                                         ol->ol_stripe_count ||
3116                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
3117                                         ol->ol_comp_start ||
3118                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
3119                                         ol->ol_comp_end) {
3120                                         CDEBUG(D_LFSCK, "%s: found invalid "
3121                                         "component for "DFID ": parent "DFID
3122                                         ", stripe-index %u, stripe_size %u, "
3123                                         "stripe_count %u, comp_id %u, "
3124                                         "comp_start %llu, comp_end %llu, "
3125                                         "cur_stripe_size %u, "
3126                                         "cur_stripe_count %u, "
3127                                         "cur_comp_start %llu, "
3128                                         "cur_comp_end %llu\n",
3129                                         lfsck_lfsck2name(lfsck), PFID(cfid),
3130                                         PFID(lfsck_dto2fid(parent)), ea_off,
3131                                         ol->ol_stripe_size,
3132                                         ol->ol_stripe_count, ol->ol_comp_id,
3133                                         ol->ol_comp_start, ol->ol_comp_end,
3134                                         le32_to_cpu(lmm->lmm_stripe_size),
3135                                         le16_to_cpu(lmm->lmm_stripe_count),
3136                                         le64_to_cpu(lcme->lcme_extent.e_start),
3137                                         le64_to_cpu(lcme->lcme_extent.e_end));
3138
3139                                         GOTO(unlock_parent, rc = -EINVAL);
3140                                 }
3141
3142                                 lovea_size = le32_to_cpu(lcm->lcm_size);
3143                                 lcme->lcme_flags = cpu_to_le32(flags |
3144                                                                LCME_FL_INIT);
3145                                 lfsck_layout_update_lcm(lcm, lcme,
3146                                                         rec->lor_layout_version,
3147                                                         rec->lor_range);
3148                         }
3149
3150                         LASSERTF(buf->lb_len >= lovea_size,
3151                                  "buffer len %d is less than real size %d\n",
3152                                  (int)buf->lb_len, (int)lovea_size);
3153
3154                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
3155                                                 parent, cfid, buf, lmm, objs,
3156                                                 fl, ost_idx, lovea_size);
3157
3158                         CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill "
3159                                "dummy layout slot for "DFID": parent "DFID
3160                                ", OST-index %u, stripe-index %u: rc = %d\n",
3161                                lfsck_lfsck2name(lfsck), PFID(cfid),
3162                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
3163
3164                         GOTO(unlock_parent, rc);
3165                 }
3166
3167                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3168                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
3169                 if (rc != 0) {
3170                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
3171                                "invalid layout EA at the slot %d, index %u\n",
3172                                lfsck_lfsck2name(lfsck),
3173                                PFID(lfsck_dto2fid(parent)), i,
3174                                le32_to_cpu(objs->l_ost_idx));
3175
3176                         GOTO(unlock_parent, rc);
3177                 }
3178
3179                 /* It should be rare case, the slot is there, but the LFSCK
3180                  * does not handle it during the first-phase cycle scanning. */
3181                 if (unlikely(lu_fid_eq(fid, cfid))) {
3182                         if (i == ea_off) {
3183                                 GOTO(unlock_parent, rc = 0);
3184                         } else {
3185                                 /* Rare case that the OST-object index
3186                                  * does not match the parent MDT-object
3187                                  * layout EA. We trust the later one. */
3188                                 if (bk->lb_param & LPF_DRYRUN)
3189                                         GOTO(unlock_parent, rc = 1);
3190
3191                                 dt_write_unlock(env, parent);
3192                                 if (handle != NULL)
3193                                         dt_trans_stop(env, dt, handle);
3194                                 lfsck_ibits_unlock(&lh, LCK_EX);
3195                                 rc = lfsck_layout_update_pfid(env, com, parent,
3196                                                         cfid, ltd->ltd_tgt,
3197                                                         rec, i);
3198
3199                                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant "
3200                                        "updated OST-object's pfid for "DFID
3201                                        ": parent "DFID", OST-index %u, "
3202                                        "stripe-index %u: rc = %d\n",
3203                                        lfsck_lfsck2name(lfsck), PFID(cfid),
3204                                        PFID(lfsck_dto2fid(parent)),
3205                                        ltd->ltd_index, i, rc);
3206
3207                                 RETURN(rc);
3208                         }
3209                 }
3210         }
3211
3212         /* The MDT-object exists, but related layout EA slot is occupied
3213          * by others. */
3214         if (bk->lb_param & LPF_DRYRUN)
3215                 GOTO(unlock_parent, rc = 1);
3216
3217         dt_write_unlock(env, parent);
3218         if (handle != NULL)
3219                 dt_trans_stop(env, dt, handle);
3220         lfsck_ibits_unlock(&lh, LCK_EX);
3221         if (magic == LOV_MAGIC_V1)
3222                 objs = &lmm->lmm_objects[ea_off];
3223         else
3224                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3225         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3226                                           buf, lmm, objs, ea_off, lovea_size);
3227
3228         RETURN(rc);
3229
3230 unlock_parent:
3231         if (locked)
3232                 dt_write_unlock(env, parent);
3233
3234 stop:
3235         if (handle != NULL)
3236                 dt_trans_stop(env, dt, handle);
3237
3238 unlock_layout:
3239         lfsck_ibits_unlock(&lh, LCK_EX);
3240
3241         return rc;
3242 }
3243
3244 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3245                                         struct lfsck_component *com,
3246                                         struct lfsck_tgt_desc *ltd,
3247                                         struct lu_orphan_rec_v3 *rec,
3248                                         struct lu_fid *cfid)
3249 {
3250         struct lfsck_layout     *lo     = com->lc_file_ram;
3251         struct lu_fid           *pfid   = &rec->lor_rec.lor_fid;
3252         struct dt_object        *parent = NULL;
3253         __u32                    ea_off = pfid->f_stripe_idx;
3254         int                      rc     = 0;
3255         ENTRY;
3256
3257         if (!fid_is_sane(cfid))
3258                 GOTO(out, rc = -EINVAL);
3259
3260         pfid->f_ver = 0;
3261         if (fid_is_zero(pfid)) {
3262                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3263                                                   "", "N", ea_off);
3264                 GOTO(out, rc);
3265         }
3266
3267         if (!fid_is_sane(pfid))
3268                 GOTO(out, rc = -EINVAL);
3269
3270         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3271         if (IS_ERR(parent))
3272                 GOTO(out, rc = PTR_ERR(parent));
3273
3274         if (unlikely(dt_object_remote(parent) != 0))
3275                 GOTO(put, rc = -EXDEV);
3276
3277         if (dt_object_exists(parent) == 0) {
3278                 lfsck_object_put(env, parent);
3279                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3280                                                   "", "R", ea_off);
3281                 GOTO(out, rc);
3282         }
3283
3284         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3285                 GOTO(put, rc = -EISDIR);
3286
3287         /* The orphan OST-object claims to be the parent's stripe, then
3288          * related dangling record in the trace file is meaningless. */
3289         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3290                                            rec->lor_layout.ol_comp_id, ea_off);
3291         if (rc && rc != -ENOENT)
3292                 GOTO(put, rc);
3293
3294         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3295                                          ltd->ltd_index, ea_off);
3296
3297         GOTO(put, rc);
3298
3299 put:
3300         if (rc <= 0)
3301                 lfsck_object_put(env, parent);
3302         else
3303                 /* The layout EA is changed, need to be reloaded next time. */
3304                 dt_object_put_nocache(env, parent);
3305
3306 out:
3307         down_write(&com->lc_sem);
3308         com->lc_new_scanned++;
3309         com->lc_new_checked++;
3310         if (rc > 0) {
3311                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3312                 rc = 0;
3313         } else if (rc < 0) {
3314                 lo->ll_objs_failed_phase2++;
3315         }
3316         up_write(&com->lc_sem);
3317
3318         return rc;
3319 }
3320
3321 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3322                                     struct lfsck_component *com,
3323                                     struct lfsck_tgt_desc *ltd)
3324 {
3325         struct lfsck_assistant_data     *lad    = com->lc_data;
3326         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3327         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
3328         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3329         struct lu_fid                   *fid    = &info->lti_fid;
3330         struct dt_object                *obj;
3331         const struct dt_it_ops          *iops;
3332         struct dt_it                    *di;
3333         int                              rc     = 0;
3334         ENTRY;
3335
3336         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan "
3337                "scanning for OST%04x\n",
3338                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3339
3340         if (test_bit(ltd->ltd_index, lad->lad_bitmap)) {
3341                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan "
3342                        "scanning for OST%04x\n",
3343                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3344
3345                 RETURN(0);
3346         }
3347
3348         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3349         fid->f_oid = fid->f_ver = 0;
3350
3351         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3352         if (unlikely(IS_ERR(obj)))
3353                 GOTO(log, rc = PTR_ERR(obj));
3354
3355         rc = obj->do_ops->do_index_try(env, obj,
3356                                        &dt_lfsck_layout_orphan_features);
3357         if (rc != 0)
3358                 GOTO(put, rc);
3359
3360         iops = &obj->do_index_ops->dio_it;
3361         di = iops->init(env, obj, 0);
3362         if (IS_ERR(di))
3363                 GOTO(put, rc = PTR_ERR(di));
3364
3365         rc = iops->load(env, di, 0);
3366         if (rc == -ESRCH) {
3367                 /* -ESRCH means that the orphan OST-objects rbtree has been
3368                  * cleanup because of the OSS server restart or other errors. */
3369                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3370                 GOTO(fini, rc);
3371         }
3372
3373         if (rc == 0)
3374                 rc = iops->next(env, di);
3375         else if (rc > 0)
3376                 rc = 0;
3377
3378         if (rc < 0)
3379                 GOTO(fini, rc);
3380
3381         if (rc > 0)
3382                 GOTO(fini, rc = 0);
3383
3384         do {
3385                 struct dt_key           *key;
3386                 struct lu_orphan_rec_v3 *rec = &info->lti_rec;
3387
3388                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3389                     unlikely(!thread_is_running(&lfsck->li_thread)))
3390                         break;
3391
3392                 key = iops->key(env, di);
3393                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3394                 /* Remote target OST may be runnning old LFSCK */
3395                 memset(rec, 0, sizeof(*rec));
3396                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3397                 if (rc == 0)
3398                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3399                                         &com->lc_fid_latest_scanned_phase2);
3400                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3401                         GOTO(fini, rc);
3402
3403                 lfsck_control_speed_by_self(com);
3404                 do {
3405                         rc = iops->next(env, di);
3406                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3407         } while (rc == 0);
3408
3409         GOTO(fini, rc);
3410
3411 fini:
3412         iops->put(env, di);
3413         iops->fini(env, di);
3414 put:
3415         lfsck_object_put(env, obj);
3416
3417 log:
3418         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan "
3419                "scanning for OST%04x: rc = %d\n",
3420                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3421
3422         return rc > 0 ? 0 : rc;
3423 }
3424
3425 static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff,
3426                             __u32 comp_id)
3427 {
3428         struct ost_layout *ol = &ff->ff_layout;
3429         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3430         int rc = 0;
3431         ENTRY;
3432
3433         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3434                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3435                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3436                 ol->ol_comp_start = 0;
3437                 ol->ol_comp_end = 0;
3438                 ol->ol_comp_id = 0;
3439                 ff->ff_layout_version = 0;
3440                 ff->ff_range = 0;
3441         } else if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3442                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3443                 struct lov_comp_md_entry_v1 *lcme = NULL;
3444                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3445                 int i;
3446
3447                 for (i = 0; i < count; i++) {
3448                         lcme = &lcm->lcm_entries[i];
3449                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3450                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3451                                         LCME_FL_INIT);
3452
3453                                 break;
3454                         }
3455                 }
3456
3457                 /* The comp has been removed, do nothing. */
3458                 if (i == count)
3459                         GOTO(out, rc = 1);
3460
3461                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3462                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3463                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3464                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3465                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3466                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3467                 ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen);
3468                 ff->ff_range = 0;
3469         } else {
3470                 GOTO(out, rc = -EINVAL);
3471         }
3472
3473         EXIT;
3474
3475 out:
3476         return rc;
3477 }
3478
3479 /**
3480  * Repair the MDT-object with dangling LOV EA reference.
3481  *
3482  * we need to repair the inconsistency according to the users' requirement:
3483  *
3484  * 1) Keep the inconsistency there and report the inconsistency case,
3485  *    then give the chance to the application to find related issues,
3486  *    and the users can make the decision about how to handle it with
3487  *    more human knownledge. (by default)
3488  *
3489  * 2) Re-create the missing OST-object with the FID/owner information.
3490  *
3491  * \param[in] env       pointer to the thread context
3492  * \param[in] com       the layout LFSCK component
3493  * \param[in] parent    the MDT-object with dangling LOV EA reference
3494  * \param[in] child     the OST-object to be created
3495  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3496  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3497  * \param[in] ost_idx   the index of OST on which the OST-object resides
3498  *
3499  * \retval              +1 for repair successfully
3500  * \retval              0 for did nothing
3501  * \retval              negative error number on failure
3502  */
3503 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3504                                           struct lfsck_component *com,
3505                                           struct dt_object *parent,
3506                                           struct dt_object *child,
3507                                           __u32 comp_id, __u32 ea_off,
3508                                           __u32 ost_idx, bool log)
3509 {
3510         struct lfsck_thread_info *info = lfsck_env_info(env);
3511         struct filter_fid *ff = &info->lti_ff;
3512         struct dt_object_format *dof = &info->lti_dof;
3513         struct lu_attr *la = &info->lti_la;
3514         struct lfsck_instance *lfsck = com->lc_lfsck;
3515         struct dt_device *dev = lfsck_obj2dev(child);
3516         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3517         const struct lu_fid *cfid = lfsck_dto2fid(child);
3518         struct lu_buf *tbuf = &info->lti_big_buf;
3519         struct thandle *handle;
3520         struct lu_buf *buf;
3521         struct lustre_handle lh = { 0 };
3522         int rc;
3523         ENTRY;
3524
3525         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3526                 GOTO(log, rc = 1);
3527
3528         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3529                 GOTO(log, rc = 1);
3530
3531         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3532                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3533                               LCK_EX);
3534         if (rc != 0)
3535                 GOTO(log, rc);
3536
3537         rc = dt_attr_get(env, parent, la);
3538         if (rc != 0)
3539                 GOTO(unlock1, rc);
3540
3541         la->la_mode = S_IFREG | 0666;
3542         la->la_atime = la->la_mtime = la->la_ctime = 0;
3543         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3544                        LA_ATIME | LA_MTIME | LA_CTIME;
3545         memset(dof, 0, sizeof(*dof));
3546         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3547         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3548         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3549          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3550          * parent MDT-object's layout EA. */
3551         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3552
3553         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3554         if (unlikely(rc == -ENODATA))
3555                 rc = 0;
3556         if (rc <= 0)
3557                 GOTO(unlock1, rc);
3558
3559         rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id);
3560         if (rc)
3561                 GOTO(unlock1, rc);
3562
3563         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3564         handle = lfsck_trans_create(env, dev, lfsck);
3565         if (IS_ERR(handle))
3566                 GOTO(unlock1, rc = PTR_ERR(handle));
3567
3568         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3569         if (rc != 0)
3570                 GOTO(stop, rc);
3571
3572         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3573                                   LU_XATTR_CREATE, handle);
3574         if (rc != 0)
3575                 GOTO(stop, rc);
3576
3577         rc = dt_trans_start_local(env, dev, handle);
3578         if (rc != 0)
3579                 GOTO(stop, rc);
3580
3581         dt_read_lock(env, parent, 0);
3582         if (unlikely(lfsck_is_dead_obj(parent)))
3583                 GOTO(unlock2, rc = 0);
3584
3585         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3586                 struct ost_id *oi = &info->lti_oi;
3587                 struct lu_fid *tfid = &info->lti_fid2;
3588                 struct lu_buf *lovea = &info->lti_big_buf;
3589                 struct lov_mds_md_v1 *lmm;
3590                 struct lov_ost_data_v1 *objs;
3591                 __u32 magic;
3592                 int count;
3593                 int idx2;
3594
3595                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3596                 if (unlikely(rc == -ENODATA))
3597                         rc = 0;
3598                 if (rc <= 0)
3599                         GOTO(unlock2, rc);
3600
3601                 lmm = lovea->lb_buf;
3602                 magic = le32_to_cpu(lmm->lmm_magic);
3603                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3604                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3605                         struct lov_comp_md_entry_v1 *lcme;
3606                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3607                         int i;
3608
3609                         for (i = 0; i < count; i++) {
3610                                 lcme = &lcm->lcm_entries[i];
3611                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3612                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3613                                                 LCME_FL_INIT);
3614
3615                                         lmm = lovea->lb_buf +
3616                                                 le32_to_cpu(lcme->lcme_offset);
3617                                         magic = le32_to_cpu(lmm->lmm_magic);
3618                                         goto check;
3619                                 }
3620                         }
3621
3622                         /* Someone removed the component, do nothing. */
3623                         GOTO(unlock2, rc = 0);
3624                 }
3625
3626 check:
3627                 count = le16_to_cpu(lmm->lmm_stripe_count);
3628                 /* Someone changed the LOV EA, do nothing. */
3629                 if (count <= ea_off)
3630                         GOTO(unlock2, rc = 0);
3631
3632                 if (magic == LOV_MAGIC_V1) {
3633                         objs = &lmm->lmm_objects[ea_off];
3634                 } else {
3635                         LASSERT(magic == LOV_MAGIC_V3);
3636
3637                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3638                                                         lmm_objects[ea_off];
3639                 }
3640
3641                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3642                 idx2 = le32_to_cpu(objs->l_ost_idx);
3643                 rc = ostid_to_fid(tfid, oi, idx2);
3644                 /* Someone changed the LOV EA, do nothing. */
3645                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3646                         GOTO(unlock2, rc);
3647         }
3648
3649         rc = dt_create(env, child, la, NULL, dof, handle);
3650         if (rc != 0)
3651                 GOTO(unlock2, rc);
3652
3653         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3654                           handle);
3655
3656         GOTO(unlock2, rc);
3657
3658 unlock2:
3659         dt_read_unlock(env, parent);
3660
3661 stop:
3662         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3663
3664 unlock1:
3665         lfsck_ibits_unlock(&lh, LCK_EX);
3666
3667 log:
3668         if (rc && log)
3669                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3670                        "dangling reference for: parent "DFID", child "
3671                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: "
3672                        "rc = %d\n",
3673                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3674                        comp_id, ea_off, ost_idx,
3675                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3676                                 "Create the lost OST-object as required" :
3677                                 "Keep the MDT-object there by default", rc);
3678
3679         return rc;
3680 }
3681
3682 /**
3683  * Repair the MDT-object with dangling LOV EA reference.
3684  *
3685  * Prepare parameters and call __lfsck_layout_repair_dangling()
3686  * to repair the dangling LOV EA reference.
3687  *
3688  * \param[in] env       pointer to the thread context
3689  * \param[in] com       the layout LFSCK component
3690  * \param[in] pfid      the MDT-object's FID
3691  * \param[in] cfid      the FID for the OST-object to be created
3692  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3693  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3694  * \param[in] ost_idx   the index of OST on which the OST-object resides
3695  *
3696  * \retval              +1 for repair successfully
3697  * \retval              0 for did nothing
3698  * \retval              negative error number on failure
3699  */
3700 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3701                                         struct lfsck_component *com,
3702                                         const struct lu_fid *pfid,
3703                                         const struct lu_fid *cfid,
3704                                         __u32 comp_id, __u32 ea_off,
3705                                         __u32 ost_idx)
3706 {
3707         struct lfsck_instance *lfsck = com->lc_lfsck;
3708         struct dt_object *parent = NULL;
3709         struct dt_object *child = NULL;
3710         struct lfsck_tgt_desc *ltd;
3711         int rc;
3712         ENTRY;
3713
3714         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3715         if (IS_ERR(parent))
3716                 GOTO(log, rc = PTR_ERR(parent));
3717
3718         /* The MDT-object has been removed. */
3719         if (dt_object_exists(parent) == 0)
3720                 GOTO(log, rc = 0);
3721
3722         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3723         if (unlikely(ltd == NULL))
3724                 GOTO(log, rc = -ENODEV);
3725
3726         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3727         if (IS_ERR(child))
3728                 GOTO(log, rc = PTR_ERR(child));
3729
3730         /* The OST-object has been created. */
3731         if (unlikely(dt_object_exists(child) != 0))
3732                 GOTO(log, rc = 0);
3733
3734         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3735                                             comp_id, ea_off, ost_idx, false);
3736
3737         GOTO(log, rc);
3738
3739 log:
3740         if (child != NULL && !IS_ERR(child))
3741                 lfsck_object_put(env, child);
3742
3743         if (parent != NULL && !IS_ERR(parent))
3744                 lfsck_object_put(env, parent);
3745
3746         if (rc)
3747                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3748                        "dangling reference for: parent "DFID", child "
3749                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3750                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3751                        comp_id, ea_off, ost_idx,
3752                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3753                                 "Create the lost OST-object as required" :
3754                                 "Keep the MDT-object there by default", rc);
3755
3756         return rc;
3757 }
3758
3759 /* If the OST-object does not recognize the MDT-object as its parent, and
3760  * there is no other MDT-object claims as its parent, then just trust the
3761  * given MDT-object as its parent. So update the OST-object filter_fid. */
3762 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3763                                               struct lfsck_component *com,
3764                                               struct dt_object *parent,
3765                                               struct lfsck_layout_req *llr,
3766                                               struct lu_attr *la)
3767 {
3768         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3769         struct filter_fid               *ff     = &info->lti_ff;
3770         struct dt_object                *child  = llr->llr_child;
3771         struct dt_device                *dev    = lfsck_obj2dev(child);
3772         const struct lu_fid             *tfid   = lu_object_fid(&parent->do_lu);
3773         struct lu_buf                   *tbuf   = &info->lti_big_buf;
3774         struct thandle                  *handle;
3775         struct lu_buf                   *buf;
3776         struct lustre_handle             lh     = { 0 };
3777         int                              rc;
3778         ENTRY;
3779
3780         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3781                 GOTO(log, rc = 0);
3782
3783         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3784                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3785                               LCK_EX);
3786         if (rc != 0)
3787                 GOTO(log, rc);
3788
3789         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3790         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3791         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3792          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3793          * parent MDT-object's layout EA. */
3794         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3795
3796         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3797         if (unlikely(rc == -ENODATA))
3798                 rc = 0;
3799         if (rc <= 0)
3800                 GOTO(unlock1, rc);
3801
3802         rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id);
3803         if (rc)
3804                 GOTO(unlock1, rc);
3805
3806         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3807
3808         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
3809         if (IS_ERR(handle))
3810                 GOTO(unlock1, rc = PTR_ERR(handle));
3811
3812         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3813         if (rc != 0)
3814                 GOTO(stop, rc);
3815
3816         rc = dt_attr_get(env, parent, la);
3817         if (rc != 0)
3818                 GOTO(stop, rc);
3819
3820         la->la_valid = LA_UID | LA_GID;
3821         rc = dt_declare_attr_set(env, child, la, handle);
3822         if (rc != 0)
3823                 GOTO(stop, rc);
3824
3825         rc = dt_trans_start_local(env, dev, handle);
3826         if (rc != 0)
3827                 GOTO(stop, rc);
3828
3829         dt_write_lock(env, parent, 0);
3830         if (unlikely(lfsck_is_dead_obj(parent)))
3831                 GOTO(unlock2, rc = 1);
3832
3833         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3834         if (rc != 0)
3835                 GOTO(unlock2, rc);
3836
3837         /* Get the latest parent's owner. */
3838         rc = dt_attr_get(env, parent, la);
3839         if (rc != 0)
3840                 GOTO(unlock2, rc);
3841
3842         la->la_valid = LA_UID | LA_GID;
3843         rc = dt_attr_set(env, child, la, handle);
3844
3845         GOTO(unlock2, rc);
3846
3847 unlock2:
3848         dt_write_unlock(env, parent);
3849
3850 stop:
3851         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3852
3853 unlock1:
3854         lfsck_ibits_unlock(&lh, LCK_EX);
3855
3856 log:
3857         if (rc)
3858                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3859                        "unmatched MDT-OST pair for: parent "DFID
3860                        ", child "DFID", comp_id %u, OST-index %u, "
3861                        "stripe-index %u, owner %u/%u: rc = %d\n",
3862                        lfsck_lfsck2name(com->lc_lfsck),
3863                        PFID(lfsck_dto2fid(parent)),
3864                        PFID(lfsck_dto2fid(child)),
3865                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3866                        la->la_uid, la->la_gid, rc);
3867
3868         return rc;
3869 }
3870
3871 /* If there are more than one MDT-objects claim as the OST-object's parent,
3872  * and the OST-object only recognizes one of them, then we need to generate
3873  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). */
3874 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3875                                                    struct lfsck_component *com,
3876                                                    struct dt_object *parent,
3877                                                    struct lfsck_layout_req *llr,
3878                                                    struct lu_attr *la)
3879 {
3880         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3881         struct dt_allocation_hint       *hint   = &info->lti_hint;
3882         struct dt_object_format         *dof    = &info->lti_dof;
3883         struct ost_id                   *oi     = &info->lti_oi;
3884         struct lu_buf                   *buf    = &info->lti_big_buf;
3885         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3886         struct dt_device                *dev;
3887         struct lu_device                *d      =
3888                                 &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3889         struct lu_object                *o;
3890         struct lu_object                *n;
3891         struct dt_object                *child  = NULL;
3892         struct thandle                  *handle = NULL;
3893         struct lov_mds_md_v1            *lmm;
3894         struct lov_ost_data_v1          *objs;
3895         const struct lu_fid             *pfid   = lfsck_dto2fid(parent);
3896         struct lu_fid                    tfid;
3897         struct lustre_handle             lh     = { 0 };
3898         __u32                            magic;
3899         __u32                            index;
3900         int                              rc;
3901         ENTRY;
3902
3903         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3904                 RETURN(0);
3905
3906         /* We use two separated transactions to repair the inconsistency.
3907          *
3908          * 1) create the child (OST-object).
3909          * 2) update the parent LOV EA according to the child's FID.
3910          *
3911          * If 1) succeed, but 2) failed or aborted, then such OST-object will be
3912          * handled as orphan when the layout LFSCK run next time.
3913          *
3914          * If 1) failed, but 2) succeed, then such OST-object will be re-created
3915          * as dangling referened case when the layout LFSCK run next time. */
3916
3917         /* The 1st transaction. */
3918         o = lu_object_anon(env, d, NULL);
3919         if (IS_ERR(o))
3920                 GOTO(log, rc = PTR_ERR(o));
3921
3922         n = lu_object_locate(o->lo_header, d->ld_type);
3923         if (unlikely(n == NULL)) {
3924                 lu_object_put_nocache(env, o);
3925
3926                 GOTO(log, rc = -EINVAL);
3927         }
3928
3929         child = container_of(n, struct dt_object, do_lu);
3930         memset(hint, 0, sizeof(*hint));
3931         rc = dt_attr_get(env, parent, la);
3932         if (rc != 0)
3933                 GOTO(log, rc);
3934
3935         la->la_valid = LA_UID | LA_GID;
3936         memset(dof, 0, sizeof(*dof));
3937
3938         dev = lfsck_obj2dev(child);
3939         handle = lfsck_trans_create(env, dev, lfsck);
3940         if (IS_ERR(handle))
3941                 GOTO(log, rc = PTR_ERR(handle));
3942
3943         rc = dt_declare_create(env, child, la, hint, dof, handle);
3944         if (rc != 0)
3945                 GOTO(stop, rc);
3946
3947         rc = dt_trans_start_local(env, dev, handle);
3948         if (rc != 0)
3949                 GOTO(stop, rc);
3950
3951         rc = dt_create(env, child, la, hint, dof, handle);
3952         dt_trans_stop(env, dev, handle);
3953         handle = NULL;
3954         if (rc != 0)
3955                 GOTO(log, rc);
3956
3957         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3958                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3959                               LCK_EX);
3960         if (rc != 0)
3961                 GOTO(log, rc);
3962
3963         /* The 2nd transaction. */
3964
3965         /* XXX: Generally, we should use bottom device (OSD) to update parent
3966          *      LOV EA. But because the LOD-object still references the wrong
3967          *      OSP-object that should be detached after the parent's LOV EA
3968          *      refreshed. Unfortunately, there is no suitable API for that.
3969          *      So we have to make the LOD to re-load the OSP-object(s) via
3970          *      replacing the LOV EA against the LOD-object.
3971          *
3972          *      Once the DNE2 patches have been landed, we can replace the
3973          *      LOD device with the OSD device. LU-6230. */
3974
3975         dev = lfsck->li_next;
3976         parent = lfsck_object_locate(dev, parent);
3977         if (IS_ERR(parent))
3978                 GOTO(log, rc = PTR_ERR(parent));
3979
3980         handle = lfsck_trans_create(env, dev, lfsck);
3981         if (IS_ERR(handle))
3982                 GOTO(log, rc = PTR_ERR(handle));
3983
3984         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3985                                   LU_XATTR_REPLACE, handle);
3986         if (rc != 0)
3987                 GOTO(stop, rc);
3988
3989         rc = dt_trans_start_local(env, dev, handle);
3990         if (rc != 0)
3991                 GOTO(stop, rc);
3992
3993         dt_write_lock(env, parent, 0);
3994         if (unlikely(lfsck_is_dead_obj(parent)))
3995                 GOTO(unlock, rc = 0);
3996
3997         rc = lfsck_layout_get_lovea(env, parent, buf);
3998         if (unlikely(rc == -ENODATA))
3999                 rc = 0;
4000         if (rc <= 0)
4001                 GOTO(unlock, rc);
4002
4003         lmm = buf->lb_buf;
4004         magic = le32_to_cpu(lmm->lmm_magic);
4005         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
4006                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4007                 struct lov_comp_md_entry_v1 *lcme;
4008                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
4009                 int i;
4010
4011                 LASSERT(llr->llr_comp_id != 0);
4012
4013                 for (i = 0; i < count; i++) {
4014                         lcme = &lcm->lcm_entries[i];
4015                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
4016                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
4017                                         LCME_FL_INIT);
4018
4019                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
4020                                 lmm = buf->lb_buf +
4021                                         le32_to_cpu(lcme->lcme_offset);
4022                                 magic = le32_to_cpu(lmm->lmm_magic);
4023                                 goto set;
4024                         }
4025                 }
4026
4027                 GOTO(unlock, rc = 0);
4028         }
4029
4030 set:
4031         if (magic == LOV_MAGIC_V1) {
4032                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
4033         } else {
4034                 LASSERT(magic == LOV_MAGIC_V3);
4035                 objs =
4036                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
4037         }
4038
4039         ostid_le_to_cpu(&objs->l_ost_oi, oi);
4040         index = le32_to_cpu(objs->l_ost_idx);
4041         rc = ostid_to_fid(&tfid, oi, index);
4042         /* Someone changed layout during the LFSCK, no need to repair then. */
4043         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
4044                 GOTO(unlock, rc = 0);
4045
4046         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
4047         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
4048         ostid_cpu_to_le(oi, &objs->l_ost_oi);
4049         objs->l_ost_gen = cpu_to_le32(0);
4050         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
4051         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
4052                           LU_XATTR_REPLACE, handle);
4053
4054         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
4055
4056 unlock:
4057         dt_write_unlock(env, parent);
4058
4059 stop:
4060         if (handle != NULL)
4061                 dt_trans_stop(env, dev, handle);
4062
4063 log:
4064         lfsck_ibits_unlock(&lh, LCK_EX);
4065         if (child != NULL)
4066                 lfsck_object_put(env, child);
4067
4068         if (rc)
4069                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4070                        "multiple references for: parent "DFID", comp_id %u, "
4071                        "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
4072                        lfsck_lfsck2name(lfsck), PFID(pfid),
4073                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
4074                        la->la_uid, la->la_gid, rc);
4075
4076         return rc;
4077 }
4078
4079 /* If the MDT-object and the OST-object have different owner information,
4080  * then trust the MDT-object, because the normal chown/chgrp handle order
4081  * is from MDT to OST, and it is possible that some chown/chgrp operation
4082  * is partly done. */
4083 static int lfsck_layout_repair_owner(const struct lu_env *env,
4084                                      struct lfsck_component *com,
4085                                      struct dt_object *parent,
4086                                      struct lfsck_layout_req *llr,
4087                                      struct lu_attr *pla,
4088                                      const struct lu_attr *cla)
4089 {
4090         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4091         struct lu_attr                  *tla    = &info->lti_la2;
4092         struct dt_object                *child  = llr->llr_child;
4093         struct dt_device                *dev    = lfsck_obj2dev(child);
4094         struct thandle                  *handle;
4095         int                              rc;
4096         dt_obj_version_t                 version;
4097         ENTRY;
4098
4099         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
4100                 RETURN(0);
4101
4102         tla->la_uid = pla->la_uid;
4103         tla->la_gid = pla->la_gid;
4104         tla->la_valid = LA_UID | LA_GID;
4105         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4106         if (IS_ERR(handle))
4107                 GOTO(log, rc = PTR_ERR(handle));
4108
4109         rc = dt_declare_attr_set(env, child, tla, handle);
4110         if (rc != 0)
4111                 GOTO(stop, rc);
4112
4113         rc = dt_trans_start_local(env, dev, handle);
4114         if (rc != 0)
4115                 GOTO(stop, rc);
4116
4117         /* Use the dt_object lock to serialize with destroy and attr_set. */
4118         dt_read_lock(env, parent, 0);
4119         if (unlikely(lfsck_is_dead_obj(parent)))
4120                 GOTO(unlock, rc = 1);
4121
4122         version = dt_version_get(env, child);
4123         if (version == -EOPNOTSUPP)
4124                 version = 0;
4125
4126         /* Get the latest parent's owner. */
4127         rc = dt_attr_get(env, parent, pla);
4128         if (rc != 0)
4129                 GOTO(unlock, rc);
4130
4131         /* Some others chown/chgrp during the LFSCK, needs to do nothing. */
4132         if (unlikely((!version && tla->la_ctime == 0) ||
4133                      tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid))
4134                 rc = 1;
4135         else
4136                 rc = dt_attr_set(env, child, tla, handle);
4137
4138         GOTO(unlock, rc);
4139
4140 unlock:
4141         dt_read_unlock(env, parent);
4142
4143 stop:
4144         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4145
4146 log:
4147         if (rc != 0)
4148                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4149                        "inconsistent file owner for: parent "DFID", child "DFID
4150                        ", OST-index %u, stripe-index %u, old owner %u/%u, "
4151                        "new owner %u/%u: rc = %d\n",
4152                        lfsck_lfsck2name(com->lc_lfsck),
4153                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4154                        llr->llr_ost_idx, llr->llr_lov_idx,
4155                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
4156
4157         return rc;
4158 }
4159
/* Log (D_LFSCK) why an OST-object is not accepted as matching the parent
 * recorded in its XATTR_NAME_FID: prints the FID of the MDT-object being
 * checked (lso->lso_fid), the claimed parent FID (pfid), the OST-object
 * FID (cfid) and a free-form reason (msg).
 *
 * The expansion is a single expression-statement without a trailing
 * semicolon, so the caller supplies the ';' and the macro can be used as
 * the body of an unbraced if/else. */
#define CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid, msg)                 \
        CDEBUG(D_LFSCK, "%s:("DFID"|"DFID")/"DFID":XATTR %s: %s\n",        \
               lfsck_lfsck2name(lfsck), PFID(&(lso)->lso_fid), PFID(pfid), \
               PFID(cfid), XATTR_NAME_FID, msg)
4164
4165 /* Check whether the OST-object correctly back points to the
4166  * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). */
4167 static int lfsck_layout_check_parent(const struct lu_env *env,
4168                                      struct lfsck_component *com,
4169                                      struct lfsck_assistant_object *lso,
4170                                      struct filter_fid *ff,
4171                                      const struct lu_fid *cfid,
4172                                      const struct lu_attr *cla,
4173                                      struct lfsck_layout_req *llr)
4174 {
4175         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4176         struct lu_buf                   *buf    = &info->lti_big_buf;
4177         struct lu_fid                   *pfid   = &info->lti_fid;
4178         struct dt_object                *tobj;
4179         struct lov_mds_md_v1            *lmm;
4180         struct lov_ost_data_v1          *objs;
4181         struct lustre_handle             lh     = { 0 };
4182         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4183         int                              rc;
4184         int                              i;
4185         __u32                            magic;
4186         __u32                            idx;
4187         __u16                            count;
4188         ENTRY;
4189
4190         *pfid = ff->ff_parent;
4191         idx = pfid->f_stripe_idx;
4192         pfid->f_ver = 0;
4193
4194         if (unlikely(!fid_is_sane(pfid))) {
4195                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4196                                       "the parent FID is invalid");
4197
4198                 RETURN(LLIT_UNMATCHED_PAIR);
4199         }
4200
4201         if (lu_fid_eq(pfid, &lso->lso_fid)) {
4202                 if (likely(llr->llr_lov_idx == idx))
4203                         RETURN(0);
4204
4205                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4206                                       "the stripe index is unmatched");
4207
4208                 RETURN(LLIT_UNMATCHED_PAIR);
4209         }
4210
4211         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4212         if (IS_ERR(tobj))
4213                 RETURN(PTR_ERR(tobj));
4214
4215         if (dt_object_exists(tobj) == 0) {
4216                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4217                                       "the parent is nonexistent");
4218
4219                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4220         }
4221
4222         if (lfsck_is_dead_obj(tobj)) {
4223                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4224                                       "the parent is dead object");
4225
4226                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4227         }
4228
4229         if (!S_ISREG(lfsck_object_type(tobj))) {
4230                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4231                                       "the parent is not a regular file");
4232
4233                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4234         }
4235
4236         /* Load the tobj's layout EA, in spite of it is a local MDT-object or
4237          * remote one on another MDT. Then check whether the given OST-object
4238          * is in such layout. If yes, it is multiple referenced, otherwise it
4239          * is unmatched referenced case. */
4240         rc = lfsck_layout_get_lovea(env, tobj, buf);
4241         if (rc == 0 || rc == -ENODATA || rc == -ENOENT) {
4242                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4243                                       "the parent has no stripe data");
4244
4245                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4246         }
4247
4248         if (unlikely(rc == -EOPNOTSUPP))
4249                 GOTO(out, rc = LLIT_NONE);
4250
4251         if (rc < 0)
4252                 GOTO(out, rc);
4253
4254         lmm = buf->lb_buf;
4255         magic = le32_to_cpu(lmm->lmm_magic);
4256         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
4257                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4258                 struct lov_comp_md_entry_v1 *lcme;
4259
4260                 if (ff->ff_layout.ol_comp_id == 0) {
4261                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4262                                               "the parent has incorrect comp_id");
4263
4264                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4265                 }
4266
4267                 count = le16_to_cpu(lcm->lcm_entry_count);
4268                 for (i = 0; i < count; i++) {
4269                         lcme = &lcm->lcm_entries[i];
4270                         if (le32_to_cpu(lcme->lcme_id) ==
4271                             ff->ff_layout.ol_comp_id) {
4272                                 lmm = buf->lb_buf +
4273                                         le32_to_cpu(lcme->lcme_offset);
4274                                 magic = le32_to_cpu(lmm->lmm_magic);
4275                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4276                                       LCME_FL_INIT)) {
4277                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4278                                                               cfid,
4279                                                               "the parent has uninitialized component");
4280
4281                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4282                                 }
4283
4284                                 goto further;
4285                         }
4286                 }
4287
4288                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4289                                       "the parent has no matched comp_id");
4290
4291                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4292         }
4293
4294 further:
4295         if (magic == LOV_MAGIC_V1) {
4296                 objs = &lmm->lmm_objects[0];
4297         } else {
4298                 LASSERT(magic == LOV_MAGIC_V3);
4299                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4300         }
4301
4302         count = le16_to_cpu(lmm->lmm_stripe_count);
4303         for (i = 0; i < count; i++, objs++) {
4304                 struct lu_fid           *tfid   = &info->lti_fid2;
4305                 struct ost_id           *oi     = &info->lti_oi;
4306                 __u32                    idx2;
4307
4308                 if (lovea_slot_is_dummy(objs))
4309                         continue;
4310
4311                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4312                 idx2 = le32_to_cpu(objs->l_ost_idx);
4313                 rc = ostid_to_fid(tfid, oi, idx2);
4314                 if (rc != 0) {
4315                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
4316                                "invalid layout EA at the slot %d, index %u\n",
4317                                lfsck_lfsck2name(com->lc_lfsck),
4318                                PFID(pfid), i, idx2);
4319
4320                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4321                 }
4322
4323                 if (lu_fid_eq(cfid, tfid)) {
4324                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4325                                               MDS_INODELOCK_UPDATE |
4326                                               MDS_INODELOCK_LAYOUT |
4327                                               MDS_INODELOCK_XATTR,
4328                                               LCK_EX);
4329                         if (rc != 0)
4330                                 GOTO(out, rc);
4331
4332                         dt_read_lock(env, tobj, 0);
4333
4334                         /* For local MDT-object, re-check existence
4335                          * after taken the lock. */
4336                         if (!dt_object_remote(tobj)) {
4337                                 if (dt_object_exists(tobj) == 0 ||
4338                                     lfsck_is_dead_obj(tobj)) {
4339                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4340                                                               cfid,
4341                                                               "the parent doesn't exist anymore after lock");
4342
4343                                         rc = LLIT_UNMATCHED_PAIR;
4344                                 } else {
4345                                         rc = LLIT_MULTIPLE_REFERENCED;
4346                                 }
4347
4348                                 GOTO(unlock, rc);
4349                         }
4350
4351                         /* For migration case, the new MDT-object and old
4352                          * MDT-object may reference the same OST-object at
4353                          * some migration internal time.
4354                          *
4355                          * For remote MDT-object, the local MDT may not know
4356                          * whether it has been removed or not.  Try checking
4357                          * for a non-existent xattr to check if this object
4358                          * has been been removed or not. */
4359                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4360                                           XATTR_NAME_DUMMY);
4361                         if (unlikely(rc == -ENOENT || rc >= 0)) {
4362                                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4363                                                       "the parent is remote object and nonexistent after lock");
4364
4365                                 rc = LLIT_UNMATCHED_PAIR;
4366                         } else if (rc == -ENODATA) {
4367                                 rc = LLIT_MULTIPLE_REFERENCED;
4368                         }
4369
4370                         GOTO(unlock, rc);
4371                 }
4372         }
4373
4374         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4375                               "the parent has no matched stripe");
4376
4377         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4378
4379 unlock:
4380         if (lustre_handle_is_used(&lh)) {
4381                 dt_read_unlock(env, tobj);
4382                 lfsck_ibits_unlock(&lh, LCK_EX);
4383         }
4384
4385 out:
4386         lfsck_object_put(env, tobj);
4387
4388         return rc;
4389 }
4390
4391 /*
4392  * If the MDT-object has the LUSTRE_ENCRYPT_FL flag, it needs to be set
4393  * on the OST-object as well.
4394  */
4395 static int lfsck_layout_repair_encflag(const struct lu_env *env,
4396                                        struct lfsck_component *com,
4397                                        struct dt_object *parent,
4398                                        struct lfsck_layout_req *llr)
4399 {
4400         struct lfsck_thread_info *info = lfsck_env_info(env);
4401         struct lu_attr *tla = &info->lti_la2;
4402         struct dt_object *child = llr->llr_child;
4403         struct dt_device *dev = lfsck_obj2dev(child);
4404         struct thandle *handle;
4405         int rc;
4406
4407         ENTRY;
4408
4409         tla->la_valid = LA_FLAGS;
4410         tla->la_flags = LUSTRE_ENCRYPT_FL;
4411         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4412         if (IS_ERR(handle))
4413                 GOTO(log, rc = PTR_ERR(handle));
4414
4415         rc = dt_declare_attr_set(env, child, tla, handle);
4416         if (rc != 0)
4417                 GOTO(stop, rc);
4418
4419         rc = dt_trans_start_local(env, dev, handle);
4420         if (rc != 0)
4421                 GOTO(stop, rc);
4422
4423         /* Use the dt_object lock to serialize with destroy and attr_set. */
4424         dt_read_lock(env, parent, 0);
4425         if (unlikely(lfsck_is_dead_obj(parent)))
4426                 GOTO(unlock, rc = 1);
4427
4428         rc = dt_attr_set(env, child, tla, handle);
4429         GOTO(unlock, rc);
4430
4431 unlock:
4432         dt_read_unlock(env, parent);
4433
4434 stop:
4435         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4436
4437 log:
4438         if (rc != 0)
4439                 CDEBUG(D_LFSCK,
4440                        "%s: layout LFSCK assistant repair of inconsistent file enc flag for: parent "
4441                        DFID", child "
4442                        DFID", OST-index %u, stripe-index %u: rc = %d\n",
4443                        lfsck_lfsck2name(com->lc_lfsck),
4444                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4445                        llr->llr_ost_idx, llr->llr_lov_idx, rc);
4446
4447         return rc;
4448 }
4449
4450 static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
4451                                              struct lfsck_component *com,
4452                                              struct lfsck_assistant_req *lar)
4453 {
4454         struct lfsck_layout_req              *llr    =
4455                 container_of(lar, struct lfsck_layout_req, llr_lar);
4456         struct lfsck_assistant_object        *lso    = lar->lar_parent;
4457         struct lfsck_layout                  *lo     = com->lc_file_ram;
4458         struct lfsck_thread_info             *info   = lfsck_env_info(env);
4459         struct filter_fid                    *ff     = &info->lti_ff;
4460         struct lu_buf buf = { .lb_buf = ff,
4461                               .lb_len = sizeof(*ff) };
4462         struct dt_object                     *parent = NULL;
4463         struct dt_object                     *child  = llr->llr_child;
4464         struct lu_attr                       *pla    = &lso->lso_attr;
4465         struct lu_attr                       *cla    = &info->lti_la;
4466         struct lfsck_instance                *lfsck  = com->lc_lfsck;
4467         struct lfsck_bookmark                *bk     = &lfsck->li_bookmark_ram;
4468         enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
4469         int                                   rc;
4470         ENTRY;
4471
4472         if (lso->lso_dead)
4473                 RETURN(0);
4474
4475         CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
4476
4477         rc = dt_attr_get(env, child, cla);
4478         if (rc == -ENOENT) {
4479                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4480                 if (IS_ERR(parent)) {
4481                         rc = PTR_ERR(parent);
4482
4483                         RETURN(rc == -ENOENT ? 0 : rc);
4484                 }
4485
4486                 type = LLIT_DANGLING;
4487                 goto repair;
4488         }
4489
4490         if (rc != 0)
4491                 GOTO(out, rc);
4492
4493         if (!(bk->lb_param & LPF_DRYRUN) &&
4494             pla->la_valid & LA_FLAGS && pla->la_flags & LUSTRE_ENCRYPT_FL) {
4495                 /* MDT-inode is encrypted */
4496                 struct lu_buf lb = { .lb_buf = NULL, .lb_len = 0 };
4497
4498                 /* if OST-inode is missing encryption.c xattr, fix it */
4499                 if (dt_xattr_get(env, child, &lb,
4500                                  LL_XATTR_NAME_ENCRYPTION_CONTEXT) >= 0)
4501                         goto check_fid;
4502
4503                 if (parent == NULL)
4504                         parent = lfsck_assistant_object_load(env, lfsck, lso);
4505                 if (!IS_ERR_OR_NULL(parent))
4506                         rc = lfsck_layout_repair_encflag(env, com, parent, llr);
4507                 down_write(&com->lc_sem);
4508                 if (rc < 0)
4509                         lfsck_layout_record_failure(env, lfsck, lo);
4510                 else if (rc > 0)
4511                         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
4512                 up_write(&com->lc_sem);
4513         }
4514
4515 check_fid:
4516         lfsck_buf_init(&buf, ff, sizeof(*ff));
4517         rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
4518         if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
4519                 CDEBUG(D_LFSCK, "%s:"DFID"/"DFID": "
4520                        "the child object's %s is corrupted\n",
4521                        lfsck_lfsck2name(lfsck), PFID(&lso->lso_fid),
4522                        PFID(lu_object_fid(&child->do_lu)),
4523                        XATTR_NAME_FID);
4524
4525                 type = LLIT_UNMATCHED_PAIR;
4526                 goto repair;
4527         }
4528
4529         if (rc < 0 && rc != -ENODATA)
4530                 GOTO(out, rc);
4531
4532         if (rc == 0 || rc == -ENODATA)
4533                 GOTO(check_owner, rc = 0);
4534
4535         filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
4536         rc = lfsck_layout_check_parent(env, com, lso, ff,
4537                                        lu_object_fid(&child->do_lu), cla, llr);
4538         if (rc > 0) {
4539                 type = rc;
4540                 goto repair;
4541         }
4542
4543         if (rc < 0)
4544                 GOTO(out, rc);
4545
4546 check_owner:
4547         /* Someone may has changed the owner after the parent attr pre-loaded.
4548          * It can be handled later inside the lfsck_layout_repair_owner(). */
4549         if (unlikely(cla->la_uid != pla->la_uid ||
4550                      cla->la_gid != pla->la_gid)) {
4551                 type = LLIT_INCONSISTENT_OWNER;
4552                 goto repair;
4553         }
4554
4555 repair:
4556         if (type == LLIT_NONE)
4557                 GOTO(out, rc = 0);
4558
4559         if (bk->lb_param & LPF_DRYRUN)
4560                 GOTO(out, rc = 1);
4561
4562         if (parent == NULL) {
4563                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4564                 if (IS_ERR(parent)) {
4565                         rc = PTR_ERR(parent);
4566
4567                         if (rc == -ENOENT)
4568                                 RETURN(0);
4569
4570                         GOTO(out, rc);
4571                 }
4572         }
4573
4574         switch (type) {
4575         case LLIT_DANGLING:
4576                 if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
4577                         rc = lfsck_layout_ins_dangling_rec(env, com,
4578                                 lfsck_dto2fid(parent), lfsck_dto2fid(child),
4579                                 llr->llr_comp_id, llr->llr_lov_idx,
4580                                 llr->llr_ost_idx);
4581                 else
4582                         rc = __lfsck_layout_repair_dangling(env, com, parent,
4583                                                             llr->llr_child,
4584                                                             llr->llr_comp_id,
4585                                                             llr->llr_lov_idx,
4586                                                             llr->llr_ost_idx,
4587                                                             true);
4588                 break;
4589         case LLIT_UNMATCHED_PAIR:
4590                 rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
4591                                                         llr, pla);
4592                 break;
4593         case LLIT_MULTIPLE_REFERENCED:
4594                 rc = lfsck_layout_repair_multiple_references(env, com, parent,
4595                                                              llr, pla);
4596                 break;
4597         case LLIT_INCONSISTENT_OWNER:
4598                 rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
4599                 break;
4600         default:
4601                 rc = 0;
4602                 break;
4603         }
4604
4605         GOTO(out, rc);
4606
4607 out:
4608         down_write(&com->lc_sem);
4609         if (rc < 0) {
4610                 struct lfsck_assistant_data *lad = com->lc_data;
4611
4612                 if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) {
4613                         rc = 0;
4614                 } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
4615                            rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
4616                            rc == -EHOSTUNREACH) {
4617                         /* If cannot touch the target server,
4618                          * mark the LFSCK as INCOMPLETE. */
4619                         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to "
4620                                "talk with OST %x: rc = %d\n",
4621                                lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
4622                         lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
4623                         lo->ll_objs_skipped++;
4624                         rc = 0;
4625                 } else {
4626                         lfsck_layout_record_failure(env, lfsck, lo);
4627                 }
4628         } else if (rc > 0 && (type != LLIT_DANGLING ||
4629                               !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
4630                 LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
4631                          "unknown type = %d\n", type);
4632
4633                 lo->ll_objs_repaired[type - 1]++;
4634                 if (bk->lb_param & LPF_DRYRUN &&
4635                     unlikely(lo->ll_pos_first_inconsistent == 0))
4636                         lo->ll_pos_first_inconsistent =
4637                         lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
4638                                                         lfsck->li_di_oit);
4639         }
4640         up_write(&com->lc_sem);
4641
4642         if (parent != NULL && !IS_ERR(parent))
4643                 lfsck_object_put(env, parent);
4644
4645         return rc;
4646 }
4647
4648 static int
4649 lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
4650                                         struct lfsck_component *com,
4651                                         struct dt_object *obj, bool first)
4652 {
4653         struct lfsck_instance *lfsck = com->lc_lfsck;
4654         struct ptlrpc_thread *thread = &lfsck->li_thread;
4655         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4656         struct lfsck_layout *lo = com->lc_file_ram;
4657         const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
4658         struct dt_it *di;
4659         struct dt_key *key;
4660         struct lfsck_layout_dangling_key *parent =
4661                                         &lfsck_env_info(env)->lti_lldk;
4662         struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
4663         __u32 ost_idx;
4664         int rc;
4665         ENTRY;
4666
4667         di = iops->init(env, obj, 0);
4668         if (IS_ERR(di))
4669                 RETURN(PTR_ERR(di));
4670
4671         if (first)
4672                 lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
4673         else
4674                 memset(parent, 0, sizeof(*parent));
4675         rc = iops->get(env, di, (const struct dt_key *)parent);
4676         if (rc < 0)
4677                 GOTO(fini, rc);
4678
4679         if (first) {
4680                 /* The start one either has been processed or does not exist,
4681                  * skip it. */
4682                 rc = iops->next(env, di);
4683                 if (rc != 0)
4684                         GOTO(put, rc);
4685         }
4686
4687         do {
4688                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
4689                     unlikely(!thread_is_running(thread)))
4690                         GOTO(put, rc = 0);
4691
4692                 key = iops->key(env, di);
4693                 if (IS_ERR(key)) {
4694                         rc = PTR_ERR(key);
4695                         if (rc == -ENOENT)
4696                                 GOTO(put, rc = 1);
4697
4698                         goto checkpoint;
4699                 }
4700
4701                 lldk_be_to_cpu(parent,
4702                                 (const struct lfsck_layout_dangling_key *)key);
4703                 if (!fid_is_sane(&parent->lldk_fid)) {
4704                         rc = 0;
4705                         goto checkpoint;
4706                 }
4707
4708                 rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
4709                 if (rc == 0) {
4710                         fid_be_to_cpu(cfid, cfid);
4711                         ost_idx = cfid->f_ver;
4712                         cfid->f_ver = 0;
4713                         if (!fid_is_sane(cfid)) {
4714                                 rc = 0;
4715                                 goto checkpoint;
4716                         }
4717
4718                         rc = lfsck_layout_repair_dangling(env, com,
4719                                         &parent->lldk_fid, cfid,
4720                                         parent->lldk_comp_id,
4721                                         parent->lldk_ea_off, ost_idx);
4722                 }
4723
4724 checkpoint:
4725                 down_write(&com->lc_sem);
4726                 com->lc_new_checked++;
4727                 com->lc_new_scanned++;
4728                 if (rc >= 0)
4729                         lo->ll_lldk_latest_scanned_phase2 = *parent;
4730
4731                 if (rc > 0)
4732                         lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
4733                 else if (rc < 0)
4734                         lo->ll_objs_failed_phase2++;
4735                 up_write(&com->lc_sem);
4736
4737                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
4738                         GOTO(put, rc);
4739
4740                 if (unlikely(com->lc_time_next_checkpoint <=
4741                              ktime_get_seconds()) &&
4742                     com->lc_new_checked != 0) {
4743                         down_write(&com->lc_sem);
4744                         lo->ll_run_time_phase2 += ktime_get_seconds() -
4745                                                   com->lc_time_last_checkpoint;
4746                         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
4747                         lo->ll_objs_checked_phase2 += com->lc_new_checked;
4748                         com->lc_new_checked = 0;
4749                         lfsck_layout_store(env, com);
4750                         up_write(&com->lc_sem);
4751
4752                         com->lc_time_last_checkpoint = ktime_get_seconds();
4753                         com->lc_time_next_checkpoint =
4754                                 com->lc_time_last_checkpoint +
4755                                 LFSCK_CHECKPOINT_INTERVAL;
4756                 }
4757
4758                 lfsck_control_speed_by_self(com);
4759                 if (unlikely(!thread_is_running(thread)))
4760                         GOTO(put, rc = 0);
4761
4762                 rc = iops->next(env, di);
4763         } while (rc == 0);
4764
4765         GOTO(put, rc);
4766
4767 put:
4768         iops->put(env, di);
4769
4770 fini:
4771         iops->fini(env, di);
4772
4773         return rc;
4774 }
4775
4776 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4777                                              struct lfsck_component *com)
4778 {
4779         struct lfsck_assistant_data     *lad    = com->lc_data;
4780         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4781         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
4782         struct lfsck_tgt_descs          *ltds   = &lfsck->li_ost_descs;
4783         struct lfsck_tgt_desc           *ltd;
4784         int                              rc     = 0;
4785         ENTRY;
4786
4787         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4788                lfsck_lfsck2name(lfsck));
4789
4790         spin_lock(&ltds->ltd_lock);
4791         while (!list_empty(&lad->lad_ost_phase2_list)) {
4792                 ltd = list_first_entry(&lad->lad_ost_phase2_list,
4793                                        struct lfsck_tgt_desc,
4794                                        ltd_layout_phase_list);
4795                 list_del_init(&ltd->ltd_layout_phase_list);
4796                 if (bk->lb_param & LPF_OST_ORPHAN) {
4797                         spin_unlock(&ltds->ltd_lock);
4798                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4799                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4800                                 RETURN(rc);
4801
4802                         if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
4803                                      !thread_is_running(&lfsck->li_thread)))
4804                                 RETURN(0);
4805                         spin_lock(&ltds->ltd_lock);
4806                 }
4807         }
4808
4809         if (list_empty(&lad->lad_ost_phase1_list))
4810                 rc = 1;
4811         else
4812                 rc = 0;
4813         spin_unlock(&ltds->ltd_lock);
4814
4815         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4816                 struct lfsck_layout *lo = com->lc_file_ram;
4817                 int i;
4818
4819                 com->lc_new_checked = 0;
4820                 com->lc_new_scanned = 0;
4821                 com->lc_time_last_checkpoint = ktime_get_seconds();
4822                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4823                                                LFSCK_CHECKPOINT_INTERVAL;
4824
4825                 i = lfsck_sub_trace_file_fid2idx(
4826                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4827                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4828                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4829                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4830                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4831                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4832
4833                 CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop "
4834                        "at the No. %d trace file: rc = %d\n",
4835                        lfsck_lfsck2name(lfsck), i, rc);
4836         }
4837
4838         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4839                lfsck_lfsck2name(lfsck), rc);
4840
4841         RETURN(rc);
4842 }
4843
4844 static int
4845 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4846                                    struct ptlrpc_request *req,
4847                                    void *args, int rc)
4848 {
4849         struct lfsck_layout_slave_async_args *llsaa = args;
4850         struct obd_export *exp = llsaa->llsaa_exp;
4851         struct lfsck_component *com = llsaa->llsaa_com;
4852         struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst;
4853         struct lfsck_layout_slave_data *llsd = com->lc_data;
4854         struct lfsck_reply *lr = NULL;
4855         bool done = false;
4856
4857         if (rc != 0) {
4858                 /* It is probably caused by network trouble, or target crash,
4859                  * it will try several times (depends on the obd_timeout, and
4860                  * will not less than 3 times). But to make the LFSCK can go
4861                  * ahead, we should not try for ever. After some try but still
4862                  * hit failure, it will assume that the target exit the LFSCK
4863                  * prcoessing and stop try. */
4864                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4865                         int max_try = max_t(int, obd_timeout / 30, 3);
4866
4867                         if (++(llst->llst_failures) > max_try)
4868                                 done = true;
4869                 } else {
4870                         done = true;
4871                 }
4872         } else {
4873                 llst->llst_failures = 0;
4874                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4875                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4876                     lr->lr_status != LS_SCANNING_PHASE2)
4877                         done = true;
4878         }
4879
4880         if (done) {
4881                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
4882                        "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
4883                        llst->llst_index, lr != NULL ? lr->lr_status : rc,
4884                        llst->llst_failures);
4885
4886                 lfsck_layout_llst_del(llsd, llst);
4887         }
4888
4889         lfsck_layout_llst_put(llst);
4890         lfsck_component_put(env, com);
4891         class_export_put(exp);
4892
4893         return 0;
4894 }
4895
4896 static int lfsck_layout_async_query(const struct lu_env *env,
4897                                     struct lfsck_component *com,
4898                                     struct obd_export *exp,
4899                                     struct lfsck_layout_slave_target *llst,
4900                                     struct lfsck_request *lr,
4901                                     struct ptlrpc_request_set *set)
4902 {
4903         struct lfsck_layout_slave_async_args *llsaa;
4904         struct ptlrpc_request                *req;
4905         struct lfsck_request                 *tmp;
4906         int                                   rc;
4907         ENTRY;
4908
4909         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4910         if (req == NULL)
4911                 RETURN(-ENOMEM);
4912
4913         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4914         if (rc != 0) {
4915                 ptlrpc_request_free(req);
4916                 RETURN(rc);
4917         }
4918
4919         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4920         *tmp = *lr;
4921         ptlrpc_request_set_replen(req);
4922
4923         llsaa = ptlrpc_req_async_args(llsaa, req);
4924         llsaa->llsaa_exp = exp;
4925         llsaa->llsaa_com = lfsck_component_get(com);
4926         llsaa->llsaa_llst = llst;
4927         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4928         req->rq_allow_intr = 1;
4929         req->rq_no_delay = 1;
4930         ptlrpc_set_add_req(set, req);
4931
4932         RETURN(0);
4933 }
4934
4935 static int lfsck_layout_async_notify(const struct lu_env *env,
4936                                      struct obd_export *exp,
4937                                      struct lfsck_request *lr,
4938                                      struct ptlrpc_request_set *set)
4939 {
4940         struct ptlrpc_request   *req;
4941         struct lfsck_request    *tmp;
4942         int                      rc;
4943         ENTRY;
4944
4945         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4946         if (req == NULL)
4947                 RETURN(-ENOMEM);
4948
4949         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4950         if (rc != 0) {
4951                 ptlrpc_request_free(req);
4952                 RETURN(rc);
4953         }
4954
4955         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4956         *tmp = *lr;
4957         ptlrpc_request_set_replen(req);
4958         req->rq_allow_intr = 1;
4959         req->rq_no_delay = 1;
4960         ptlrpc_set_add_req(set, req);
4961
4962         RETURN(0);
4963 }
4964
4965 static int
4966 lfsck_layout_slave_query_master(const struct lu_env *env,
4967                                 struct lfsck_component *com)
4968 {
4969         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4970         struct lfsck_instance            *lfsck = com->lc_lfsck;
4971         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4972         struct lfsck_layout_slave_target *llst;
4973         struct obd_export                *exp;
4974         struct ptlrpc_request_set        *set;
4975         int                               rc    = 0;
4976         int                               rc1   = 0;
4977         ENTRY;
4978
4979         set = ptlrpc_prep_set();
4980         if (set == NULL)
4981                 GOTO(log, rc = -ENOMEM);
4982
4983         memset(lr, 0, sizeof(*lr));
4984         lr->lr_event = LE_QUERY;
4985         lr->lr_active = LFSCK_TYPE_LAYOUT;
4986
4987         llsd->llsd_touch_gen++;
4988         spin_lock(&llsd->llsd_lock);
4989         while (!list_empty(&llsd->llsd_master_list)) {
4990                 llst = list_first_entry(&llsd->llsd_master_list,
4991                                         struct lfsck_layout_slave_target,
4992                                         llst_list);
4993                 if (llst->llst_gen == llsd->llsd_touch_gen)
4994                         break;
4995
4996                 llst->llst_gen = llsd->llsd_touch_gen;
4997                 list_move_tail(&llst->llst_list,
4998                                &llsd->llsd_master_list);
4999                 atomic_inc(&llst->llst_ref);
5000                 spin_unlock(&llsd->llsd_lock);
5001
5002                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
5003                                                llst->llst_index);
5004                 if (exp == NULL) {
5005                         lfsck_layout_llst_del(llsd, llst);
5006                         lfsck_layout_llst_put(llst);
5007                         spin_lock(&llsd->llsd_lock);
5008                         continue;
5009                 }
5010
5011                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
5012                 if (rc != 0) {
5013                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
5014                                "query %s for layout: rc = %d\n",
5015                                lfsck_lfsck2name(lfsck),
5016                                exp->exp_obd->obd_name, rc);
5017
5018                         rc1 = rc;
5019                         lfsck_layout_llst_put(llst);
5020                         class_export_put(exp);
5021                 }
5022                 spin_lock(&llsd->llsd_lock);
5023         }
5024         spin_unlock(&llsd->llsd_lock);
5025
5026         rc = ptlrpc_set_wait(env, set);
5027         ptlrpc_set_destroy(set);
5028
5029         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
5030
5031 log:
5032         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
5033                lfsck_lfsck2name(com->lc_lfsck), rc);
5034
5035         return rc;
5036 }
5037
5038 static void
5039 lfsck_layout_slave_notify_master(const struct lu_env *env,
5040                                  struct lfsck_component *com,
5041                                  enum lfsck_events event, int result)
5042 {
5043         struct lfsck_layout              *lo    = com->lc_file_ram;
5044         struct lfsck_instance            *lfsck = com->lc_lfsck;
5045         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
5046         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
5047         struct lfsck_layout_slave_target *llst;
5048         struct obd_export                *exp;
5049         struct ptlrpc_request_set        *set;
5050         int                               rc;
5051         ENTRY;
5052
5053         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
5054                lfsck_lfsck2name(com->lc_lfsck));
5055
5056         set = ptlrpc_prep_set();
5057         if (set == NULL)
5058                 RETURN_EXIT;
5059
5060         memset(lr, 0, sizeof(*lr));
5061         lr->lr_event = event;
5062         lr->lr_flags = LEF_FROM_OST;
5063         lr->lr_status = result;
5064         lr->lr_index = lfsck_dev_idx(lfsck);
5065         lr->lr_active = LFSCK_TYPE_LAYOUT;
5066         lr->lr_flags2 = lo->ll_flags;
5067         llsd->llsd_touch_gen++;
5068         spin_lock(&llsd->llsd_lock);
5069         while (!list_empty(&llsd->llsd_master_list)) {
5070                 llst = list_first_entry(&llsd->llsd_master_list,
5071                                         struct lfsck_layout_slave_target,
5072                                         llst_list);
5073                 if (llst->llst_gen == llsd->llsd_touch_gen)
5074                         break;
5075
5076                 llst->llst_gen = llsd->llsd_touch_gen;
5077                 list_move_tail(&llst->llst_list,
5078                                &llsd->llsd_master_list);
5079                 atomic_inc(&llst->llst_ref);
5080                 spin_unlock(&llsd->llsd_lock);
5081
5082                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
5083                                                llst->llst_index);
5084                 if (exp == NULL) {
5085                         lfsck_layout_llst_del(llsd, llst);
5086                         lfsck_layout_llst_put(llst);
5087                         spin_lock(&llsd->llsd_lock);
5088                         continue;
5089                 }
5090
5091                 rc = lfsck_layout_async_notify(env, exp, lr, set);
5092                 if (rc != 0)
5093                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
5094                                "notify %s for layout: rc = %d\n",
5095                                lfsck_lfsck2name(lfsck),
5096                                exp->exp_obd->obd_name, rc);
5097
5098                 lfsck_layout_llst_put(llst);
5099                 class_export_put(exp);
5100                 spin_lock(&llsd->llsd_lock);
5101         }
5102         spin_unlock(&llsd->llsd_lock);
5103
5104         ptlrpc_set_wait(env, set);
5105         ptlrpc_set_destroy(set);
5106
5107         RETURN_EXIT;
5108 }
5109
5110 /*
5111  * \ret -ENODATA: unrecognized stripe
5112  * \ret = 0     : recognized stripe
5113  * \ret < 0     : other failures
5114  */
5115 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
5116                                            struct lfsck_component *com,
5117                                            struct lu_fid *cfid,
5118                                            struct lu_fid *pfid, __u32 comp_id)
5119 {
5120         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5121         struct lu_buf                   *buf    = &info->lti_big_buf;
5122         struct ost_id                   *oi     = &info->lti_oi;
5123         struct dt_object                *obj;
5124         struct lov_mds_md_v1            *lmm;
5125         struct lov_ost_data_v1          *objs;
5126         __u32                            idx    = pfid->f_stripe_idx;
5127         __u32                            magic;
5128         int                              rc     = 0;
5129         int                              i;
5130         __u16                            count;
5131         ENTRY;
5132
5133         pfid->f_ver = 0;
5134         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
5135         if (IS_ERR(obj))
5136                 RETURN(PTR_ERR(obj));
5137
5138         dt_read_lock(env, obj, 0);
5139         if (unlikely(dt_object_exists(obj) == 0 ||
5140                      lfsck_is_dead_obj(obj)))
5141                 GOTO(unlock, rc = -ENOENT);
5142
5143         if (!S_ISREG(lfsck_object_type(obj)))
5144                 GOTO(unlock, rc = -ENODATA);
5145
5146         rc = lfsck_layout_get_lovea(env, obj, buf);
5147         if (rc < 0)
5148                 GOTO(unlock, rc);
5149
5150         lmm = buf->lb_buf;
5151         magic = le32_to_cpu(lmm->lmm_magic);
5152         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5153                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
5154                 struct lov_comp_md_entry_v1 *lcme;
5155
5156                 if (comp_id == 0)
5157                         GOTO(unlock, rc = -ENODATA);
5158
5159                 count = le16_to_cpu(lcm->lcm_entry_count);
5160                 for (i = 0; i < count; i++) {
5161                         lcme = &lcm->lcm_entries[i];
5162                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
5163                                 lmm = buf->lb_buf +
5164                                         le32_to_cpu(lcme->lcme_offset);
5165                                 magic = le32_to_cpu(lmm->lmm_magic);
5166                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5167                                       LCME_FL_INIT))
5168                                         GOTO(unlock, rc = -ENODATA);
5169
5170                                 goto further;
5171                         }
5172                 }
5173
5174                 GOTO(unlock, rc = -ENODATA);
5175         }
5176
5177 further:
5178         if (magic == LOV_MAGIC_V1) {
5179                 objs = &lmm->lmm_objects[0];
5180         } else {
5181                 LASSERT(magic == LOV_MAGIC_V3);
5182                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5183         }
5184
5185         fid_to_ostid(cfid, oi);
5186         count = le16_to_cpu(lmm->lmm_stripe_count);
5187         for (i = 0; i < count; i++, objs++) {
5188                 struct ost_id oi2;
5189
5190                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
5191                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
5192                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
5193         }
5194
5195         GOTO(unlock, rc = -ENODATA);
5196
5197 unlock:
5198         dt_read_unlock(env, obj);
5199         lfsck_object_put(env, obj);
5200
5201         return rc;
5202 }
5203
5204 /*
5205  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
5206  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
5207  * layout EA from MDT to OST. On one hand, the OST no need to understand
5208  * the layout EA structure; on the other hand, it may cause trouble when
5209  * transfer large layout EA from MDT to OST via normal OUT RPC.
5210  *
5211  * \ret > 0: unrecognized stripe
5212  * \ret = 0: recognized stripe
5213  * \ret < 0: other failures
5214  */
5215 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
5216                                           struct lfsck_component *com,
5217                                           struct lu_fid *cfid,
5218                                           struct lu_fid *pfid, __u32 comp_id)
5219 {
5220         struct lfsck_instance    *lfsck  = com->lc_lfsck;
5221         struct obd_device        *obd    = lfsck->li_obd;
5222         struct seq_server_site   *ss     = lfsck_dev_site(lfsck);
5223         struct obd_export        *exp    = NULL;
5224         struct ptlrpc_request    *req    = NULL;
5225         struct lfsck_request     *lr;
5226         struct lu_seq_range      *range  = &lfsck_env_info(env)->lti_range;
5227         int                       rc     = 0;
5228         ENTRY;
5229
5230         if (unlikely(fid_is_idif(pfid)))
5231                 RETURN(1);
5232
5233         fld_range_set_any(range);
5234         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
5235         if (rc != 0)
5236                 RETURN(rc == -ENOENT ? 1 : rc);
5237
5238         if (unlikely(!fld_range_is_mdt(range)))
5239                 RETURN(1);
5240
5241         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
5242         if (unlikely(exp == NULL))
5243                 RETURN(1);
5244
5245         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
5246                 GOTO(out, rc = -EOPNOTSUPP);
5247
5248         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
5249         if (req == NULL)
5250                 GOTO(out, rc = -ENOMEM);
5251
5252         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
5253         if (rc != 0) {
5254                 ptlrpc_request_free(req);
5255
5256                 GOTO(out, rc);
5257         }
5258
5259         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
5260         memset(lr, 0, sizeof(*lr));
5261         lr->lr_event = LE_PAIRS_VERIFY;
5262         lr->lr_active = LFSCK_TYPE_LAYOUT;
5263         lr->lr_fid = *cfid; /* OST-object itself FID. */
5264         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
5265         lr->lr_comp_id = comp_id;
5266
5267         ptlrpc_request_set_replen(req);
5268         rc = ptlrpc_queue_wait(req);
5269         ptlrpc_req_finished(req);
5270
5271         if (rc == -ENOENT || rc == -ENODATA)
5272                 rc = 1;
5273
5274         GOTO(out, rc);
5275
5276 out:
5277         if (exp != NULL)
5278                 class_export_put(exp);
5279
5280         return rc;
5281 }
5282
5283 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
5284                                           struct lfsck_component *com,
5285                                           struct lfsck_req_local *lrl)
5286 {
5287         struct dt_object        *obj;
5288         int                      rc     = 0;
5289         ENTRY;
5290
5291         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
5292         if (IS_ERR(obj))
5293                 GOTO(log, rc = PTR_ERR(obj));
5294
5295         rc = __lfsck_layout_update_pfid(env, com, obj,
5296                                         &lrl->lrl_ff_client.ff_parent,
5297                                         &lrl->lrl_ff_client.ff_layout,
5298                                         lrl->lrl_ff_client.ff_layout_version,
5299                                         lrl->lrl_ff_client.ff_range,
5300                                         lrl->lrl_ff_client.ff_parent.f_ver);
5301
5302         lfsck_object_put(env, obj);
5303
5304 log:
5305         CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID
5306                ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
5307                PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc);
5308
5309         return rc;
5310 }
5311
5312 /* layout APIs */
5313
/* Forward declaration; the definition is not in this part of the file
 * (presumably it follows the layout API implementations that use it). */
static void
lfsck_layout_slave_quit(const struct lu_env *env, struct lfsck_component *com);
5316
5317 static int lfsck_layout_reset(const struct lu_env *env,
5318                               struct lfsck_component *com, bool init)
5319 {
5320         struct lfsck_layout     *lo    = com->lc_file_ram;
5321         int                      rc;
5322
5323         down_write(&com->lc_sem);
5324         if (init) {
5325                 memset(lo, 0, com->lc_file_size);
5326         } else {
5327                 __u32 count = lo->ll_success_count;
5328                 time64_t last_time = lo->ll_time_last_complete;
5329
5330                 memset(lo, 0, com->lc_file_size);
5331                 lo->ll_success_count = count;
5332                 lo->ll_time_last_complete = last_time;
5333         }
5334
5335         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
5336         lo->ll_status = LS_INIT;
5337
5338         if (com->lc_lfsck->li_master) {
5339                 struct lfsck_assistant_data *lad = com->lc_data;
5340
5341                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
5342                 bitmap_zero(lad->lad_bitmap, lad->lad_bitmap_count);
5343         }
5344
5345         rc = lfsck_layout_store(env, com);
5346         if (rc == 0 && com->lc_lfsck->li_master)
5347                 rc = lfsck_load_sub_trace_files(env, com,
5348                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
5349         up_write(&com->lc_sem);
5350
5351         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
5352                lfsck_lfsck2name(com->lc_lfsck), rc);
5353
5354         return rc;
5355 }
5356
5357 static void lfsck_layout_fail(const struct lu_env *env,
5358                               struct lfsck_component *com, bool new_checked)
5359 {
5360         struct lfsck_layout *lo = com->lc_file_ram;
5361
5362         down_write(&com->lc_sem);
5363         if (new_checked)
5364                 com->lc_new_checked++;
5365         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
5366         up_write(&com->lc_sem);
5367 }
5368
5369 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
5370                                           struct lfsck_component *com, bool init)
5371 {
5372         struct lfsck_instance   *lfsck   = com->lc_lfsck;
5373         struct lfsck_layout     *lo      = com->lc_file_ram;
5374         int                      rc;
5375
5376         if (!init) {
5377                 rc = lfsck_checkpoint_generic(env, com);
5378                 if (rc != 0)
5379                         return rc > 0 ? 0 : rc;
5380         }
5381
5382         down_write(&com->lc_sem);
5383         if (init) {
5384                 lo->ll_pos_latest_start =
5385                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5386         } else {
5387                 lo->ll_pos_last_checkpoint =
5388                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5389                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5390                                           lfsck->li_time_last_checkpoint;
5391                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5392                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5393                 com->lc_new_checked = 0;
5394         }
5395
5396         rc = lfsck_layout_store(env, com);
5397         up_write(&com->lc_sem);
5398
5399         CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos ["
5400                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5401                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5402
5403         return rc;
5404 }
5405
5406 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5407                                          struct lfsck_component *com, bool init)
5408 {
5409         struct lfsck_instance   *lfsck = com->lc_lfsck;
5410         struct lfsck_layout     *lo    = com->lc_file_ram;
5411         int                      rc;
5412
5413         if (com->lc_new_checked == 0 && !init)
5414                 return 0;
5415
5416         down_write(&com->lc_sem);
5417         if (init) {
5418                 lo->ll_pos_latest_start =
5419                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5420         } else {
5421                 lo->ll_pos_last_checkpoint =
5422                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5423                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5424                                           lfsck->li_time_last_checkpoint;
5425                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5426                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5427                 com->lc_new_checked = 0;
5428         }
5429
5430         rc = lfsck_layout_store(env, com);
5431         up_write(&com->lc_sem);
5432
5433         CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos ["
5434                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5435                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5436
5437         return rc;
5438 }
5439
5440 static int lfsck_layout_prep(const struct lu_env *env,
5441                              struct lfsck_component *com,
5442                              struct lfsck_start *start)
5443 {
5444         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5445         struct lfsck_layout     *lo     = com->lc_file_ram;
5446         struct lfsck_position   *pos    = &com->lc_pos_start;
5447
5448         fid_zero(&pos->lp_dir_parent);
5449         pos->lp_dir_cookie = 0;
5450         if (lo->ll_status == LS_COMPLETED ||
5451             lo->ll_status == LS_PARTIAL ||
5452             /* To handle orphan, must scan from the beginning. */
5453             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5454                 int rc;
5455
5456                 rc = lfsck_layout_reset(env, com, false);
5457                 if (rc == 0)
5458                         rc = lfsck_set_param(env, lfsck, start, true);
5459
5460                 if (rc != 0) {
5461                         CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: "
5462                                "rc = %d\n", lfsck_lfsck2name(lfsck), rc);
5463
5464                         return rc;
5465                 }
5466         }
5467
5468         down_write(&com->lc_sem);
5469         lo->ll_time_latest_start = ktime_get_real_seconds();
5470         spin_lock(&lfsck->li_lock);
5471         if (lo->ll_flags & LF_SCANNED_ONCE) {
5472                 if (!lfsck->li_drop_dryrun ||
5473                     lo->ll_pos_first_inconsistent == 0) {
5474                         lo->ll_status = LS_SCANNING_PHASE2;
5475                         list_move_tail(&com->lc_link,
5476                                        &lfsck->li_list_double_scan);
5477                         pos->lp_oit_cookie = 0;
5478                 } else {
5479                         int i;
5480
5481                         lo->ll_status = LS_SCANNING_PHASE1;
5482                         lo->ll_run_time_phase1 = 0;
5483                         lo->ll_run_time_phase2 = 0;
5484                         lo->ll_objs_checked_phase1 = 0;
5485                         lo->ll_objs_checked_phase2 = 0;
5486                         lo->ll_objs_failed_phase1 = 0;
5487                         lo->ll_objs_failed_phase2 = 0;
5488                         for (i = 0; i < LLIT_MAX; i++)
5489                                 lo->ll_objs_repaired[i] = 0;
5490
5491                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5492                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5493                 }
5494         } else {
5495                 lo->ll_status = LS_SCANNING_PHASE1;
5496                 if (!lfsck->li_drop_dryrun ||
5497                     lo->ll_pos_first_inconsistent == 0)
5498                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5499                 else
5500                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5501         }
5502         spin_unlock(&lfsck->li_lock);
5503         up_write(&com->lc_sem);
5504
5505         return 0;
5506 }
5507
5508 static int lfsck_layout_slave_prep(const struct lu_env *env,
5509                                    struct lfsck_component *com,
5510                                    struct lfsck_start_param *lsp)
5511 {
5512         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5513         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5514         struct lfsck_layout             *lo     = com->lc_file_ram;
5515         struct lfsck_start              *start  = lsp->lsp_start;
5516         int                              rc;
5517
5518         rc = lfsck_layout_prep(env, com, start);
5519         if (rc != 0)
5520                 return rc;
5521
5522         if (lo->ll_flags & LF_CRASHED_LASTID &&
5523             list_empty(&llsd->llsd_master_list)) {
5524                 LASSERT(lfsck->li_out_notify != NULL);
5525
5526                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5527                                      LE_LASTID_REBUILDING);
5528         }
5529
5530         if (!lsp->lsp_index_valid)
5531                 return 0;
5532
5533         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5534         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5535                 LASSERT(!llsd->llsd_rbtree_valid);
5536
5537                 down_write(&llsd->llsd_rb_rwsem);
5538                 rc = lfsck_rbtree_setup(env, com);
5539                 up_write(&llsd->llsd_rb_rwsem);
5540         }
5541
5542         CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos ["
5543                "%llu]\n", lfsck_lfsck2name(lfsck),
5544                com->lc_pos_start.lp_oit_cookie);
5545
5546         return rc;
5547 }
5548
5549 static int lfsck_layout_master_prep(const struct lu_env *env,
5550                                     struct lfsck_component *com,
5551                                     struct lfsck_start_param *lsp)
5552 {
5553         int rc;
5554         ENTRY;
5555
5556         rc = lfsck_layout_load_bitmap(env, com);
5557         if (rc != 0) {
5558                 rc = lfsck_layout_reset(env, com, false);
5559                 if (rc == 0)
5560                         rc = lfsck_set_param(env, com->lc_lfsck,
5561                                              lsp->lsp_start, true);
5562
5563                 if (rc != 0)
5564                         GOTO(log, rc);
5565         }
5566
5567         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5568         if (rc != 0)
5569                 RETURN(rc);
5570
5571         rc = lfsck_start_assistant(env, com, lsp);
5572
5573         GOTO(log, rc);
5574
5575 log:
5576         CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos ["
5577                "%llu]\n", lfsck_lfsck2name(com->lc_lfsck),
5578                com->lc_pos_start.lp_oit_cookie);
5579
5580         return 0;
5581 }
5582
5583 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5584 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5585                                      struct lfsck_component *com,
5586                                      struct dt_object *parent,
5587                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5588 {
5589         struct lfsck_thread_info        *info    = lfsck_env_info(env);
5590         struct lfsck_instance           *lfsck   = com->lc_lfsck;
5591         struct lfsck_bookmark           *bk      = &lfsck->li_bookmark_ram;
5592         struct lfsck_layout             *lo      = com->lc_file_ram;
5593         struct lfsck_assistant_data     *lad     = com->lc_data;
5594         struct lfsck_assistant_object   *lso     = NULL;
5595         struct lov_ost_data_v1          *objs;
5596         struct lfsck_tgt_descs          *ltds    = &lfsck->li_ost_descs;
5597         struct ptlrpc_thread            *mthread = &lfsck->li_thread;
5598         struct ptlrpc_thread            *athread = &lad->lad_thread;
5599         struct lu_buf                    buf;
5600         int                              rc      = 0;
5601         int                              i;
5602         __u32                            magic;
5603         __u16                            count;
5604         ENTRY;
5605
5606         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5607         magic = le32_to_cpu(lmm->lmm_magic);
5608         if (magic == LOV_MAGIC_V1) {
5609                 objs = &lmm->lmm_objects[0];
5610         } else {
5611                 LASSERT(magic == LOV_MAGIC_V3);
5612                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5613         }
5614
5615         count = le16_to_cpu(lmm->lmm_stripe_count);
5616         for (i = 0; i < count; i++, objs++) {
5617                 struct lu_fid           *fid    = &info->lti_fid;
5618                 struct ost_id           *oi     = &info->lti_oi;
5619                 struct lfsck_layout_req *llr;
5620                 struct lfsck_tgt_desc   *tgt    = NULL;
5621                 struct dt_object        *cobj   = NULL;
5622                 __u32                    index;
5623                 bool                     wakeup = false;
5624
5625                 if (unlikely(lovea_slot_is_dummy(objs)))
5626                         continue;
5627
5628                 wait_event_idle(mthread->t_ctl_waitq,
5629                                 lad->lad_prefetched < bk->lb_async_windows ||
5630                                 !thread_is_running(mthread) ||
5631                                 thread_is_stopped(athread));
5632
5633                 if (unlikely(!thread_is_running(mthread)) ||
5634                              thread_is_stopped(athread))
5635                         GOTO(out, rc = 0);
5636
5637                 if (unlikely(lfsck_is_dead_obj(parent)))
5638                         GOTO(out, rc = 0);
5639
5640                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5641                 index = le32_to_cpu(objs->l_ost_idx);
5642                 rc = ostid_to_fid(fid, oi, index);
5643                 if (rc != 0) {
5644                         CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID
5645                                ": "DOSTID", idx %u, comp_id %u\n",
5646                                lfsck_lfsck2name(lfsck),
5647                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5648                                index, comp_id);
5649                         goto next;
5650                 }
5651
5652                 tgt = lfsck_tgt_get(ltds, index);
5653                 if (unlikely(tgt == NULL)) {
5654                         CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which "
5655                                "did not join the layout LFSCK, comp_id %u\n",
5656                                lfsck_lfsck2name(lfsck), index, comp_id);
5657                         lfsck_lad_set_bitmap(env, com, index);
5658                         goto next;
5659                 }
5660
5661                 /* There is potential deadlock race condition between object
5662                  * destroy and layout LFSCK. Consider the following scenario:
5663                  *
5664                  * 1) The LFSCK thread obtained the parent object firstly, at
5665                  *    that time, the parent object has not been destroyed yet.
5666                  *
5667                  * 2) One RPC service thread destroyed the parent and all its
5668                  *    children objects. Because the LFSCK is referencing the
5669                  *    parent object, then the parent object will be marked as
5670                  *    dying in RAM. On the other hand, the parent object is
5671                  *    referencing all its children objects, then all children
5672                  *    objects will be marked as dying in RAM also.
5673                  *
5674                  * 3) The LFSCK thread tries to find some child object with
5675                  *    the parent object referenced. Then it will find that the
5676                  *    child object is dying. According to the object visibility
5677                  *    rules: the object with dying flag cannot be returned to
5678                  *    others. So the LFSCK thread has to wait until the dying
5679                  *    object has been purged from RAM, then it can allocate a
5680                  *    new object (with the same FID) in RAM. Unfortunately, the
5681                  *    LFSCK thread itself is referencing the parent object, and
5682                  *    cause the parent object cannot be purged, then cause the
5683                  *    child object cannot be purged also. So the LFSCK thread
5684                  *    will fall into deadlock.
5685                  */
5686                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5687                 if (IS_ERR(cobj)) {
5688                         if (lfsck_is_dead_obj(parent)) {
5689                                 lfsck_tgt_put(tgt);
5690
5691                                 GOTO(out, rc = 0);
5692                         }
5693
5694                         rc = PTR_ERR(cobj);
5695                         goto next;
5696                 }
5697
5698                 rc = dt_declare_attr_get(env, cobj);
5699                 if (rc)
5700                         goto next;
5701
5702                 rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
5703                 if (rc)
5704                         goto next;
5705
5706                 if (lso == NULL) {
5707                         struct lu_attr *attr = &info->lti_la;
5708
5709                         rc = dt_attr_get(env, parent, attr);
5710                         if (rc != 0)
5711                                 goto next;
5712
5713                         lso = lfsck_assistant_object_init(env,
5714                                 lfsck_dto2fid(parent), attr,
5715                                 lfsck->li_pos_current.lp_oit_cookie, false);
5716                         if (IS_ERR(lso)) {
5717                                 rc = PTR_ERR(lso);
5718                                 lso = NULL;
5719
5720                                 goto next;
5721                         }
5722                 }
5723
5724                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5725                                                       index, i);
5726                 if (IS_ERR(llr)) {
5727                         rc = PTR_ERR(llr);
5728                         goto next;
5729                 }
5730
5731                 cobj = NULL;
5732                 spin_lock(&lad->lad_lock);
5733                 if (lad->lad_assistant_status < 0) {
5734                         spin_unlock(&lad->lad_lock);
5735                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5736                         lfsck_tgt_put(tgt);
5737                         RETURN(lad->lad_assistant_status);
5738                 }
5739
5740                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5741                 if (lad->lad_prefetched == 0)
5742                         wakeup = true;
5743
5744                 lad->lad_prefetched++;
5745                 spin_unlock(&lad->lad_lock);
5746                 if (wakeup)
5747                         wake_up(&athread->t_ctl_waitq);
5748
5749 next:
5750                 down_write(&com->lc_sem);
5751                 com->lc_new_checked++;
5752                 if (rc < 0)
5753                         lfsck_layout_record_failure(env, lfsck, lo);
5754                 up_write(&com->lc_sem);
5755
5756                 if (cobj != NULL && !IS_ERR(cobj))
5757                         lfsck_object_put(env, cobj);
5758
5759                 if (likely(tgt != NULL))
5760                         lfsck_tgt_put(tgt);
5761
5762                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5763                         GOTO(out, rc);
5764         }
5765
5766         GOTO(out, rc = 0);
5767
5768 out:
5769         if (lso != NULL)
5770                 lfsck_assistant_object_put(env, lso);
5771
5772         return rc;
5773 }
5774
5775 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5776  * the OST-object's attribute and generate a structure lfsck_layout_req on the
5777  * list ::lad_req_list.
5778  *
5779  * For each request on above list, the lfsck_layout_assistant thread compares
5780  * the OST side attribute with local attribute, if inconsistent, then repair it.
5781  *
5782  * All above processing is async mode with pipeline. */
5783 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5784                                         struct lfsck_component *com,
5785                                         struct dt_object *obj)
5786 {
5787         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5788         struct ost_id                   *oi     = &info->lti_oi;
5789         struct lfsck_layout             *lo     = com->lc_file_ram;
5790         struct lfsck_assistant_data     *lad    = com->lc_data;
5791         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5792         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
5793         struct thandle                  *handle = NULL;
5794         struct lu_buf                   *buf    = &info->lti_big_buf;
5795         struct lov_mds_md_v1            *lmm    = NULL;
5796         struct dt_device                *dev    = lfsck_obj2dev(obj);
5797         struct lustre_handle             lh     = { 0 };
5798         struct lu_buf                    ea_buf = { NULL };
5799         struct lov_comp_md_v1           *lcm    = NULL;
5800         struct lov_comp_md_entry_v1     *lcme   = NULL;
5801         int                              rc     = 0;
5802         int                              size   = 0;
5803         __u32                            magic  = 0;
5804         __u16                            count  = 0;
5805         bool                             locked = false;
5806         bool                             stripe = false;
5807         bool                             bad_oi = false;
5808         ENTRY;
5809
5810         if (!S_ISREG(lfsck_object_type(obj)))
5811                 GOTO(out, rc = 0);
5812
5813         if (lad->lad_assistant_status < 0)
5814                 GOTO(out, rc = -ESRCH);
5815
5816         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5817         lmm_oi_cpu_to_le(oi, oi);
5818         dt_read_lock(env, obj, 0);
5819         locked = true;
5820
5821 again:
5822         bad_oi = false;
5823         if (dt_object_exists(obj) == 0 ||
5824             lfsck_is_dead_obj(obj))
5825                 GOTO(out, rc = 0);
5826
5827         rc = lfsck_layout_get_lovea(env, obj, buf);
5828         if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP)
5829                 /* Skip bad lov EA during the 1st cycle scanning, and
5830                  * try to recover it via orphan in the 2nd scanning. */
5831                 rc = 0;
5832         if (rc <= 0)
5833                 GOTO(out, rc);
5834
5835         size = rc;
5836         lmm = buf->lb_buf;
5837         magic = le32_to_cpu(lmm->lmm_magic);
5838         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5839                 struct lov_mds_md_v1 *v1;
5840                 int i;
5841
5842                 lcm = buf->lb_buf;
5843                 count = le16_to_cpu(lcm->lcm_entry_count);
5844                 for (i = 0; i < count; i++) {
5845                         lcme = &lcm->lcm_entries[i];
5846                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5847                         if (memcmp(oi, &v1->lmm_oi, sizeof(*oi)) != 0)
5848                                 goto fix;
5849                 }
5850
5851                 GOTO(out, stripe = true);
5852         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5853                 GOTO(out, stripe = true);
5854         }
5855
5856 fix:
5857         /* Inconsistent lmm_oi, should be repaired. */
5858         bad_oi = true;
5859
5860         if (bk->lb_param & LPF_DRYRUN) {
5861                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5862
5863                 GOTO(out, stripe = true);
5864         }
5865
5866         if (!lustre_handle_is_used(&lh)) {
5867                 dt_read_unlock(env, obj);
5868                 locked = false;
5869                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5870                                       MDS_INODELOCK_LAYOUT |
5871                                       MDS_INODELOCK_XATTR, LCK_EX);
5872                 if (rc != 0)
5873                         GOTO(out, rc);
5874
5875                 handle = lfsck_trans_create(env, dev, lfsck);
5876                 if (IS_ERR(handle))
5877                         GOTO(out, rc = PTR_ERR(handle));
5878
5879                 lfsck_buf_init(&ea_buf, buf->lb_buf, size);
5880                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5881                                           LU_XATTR_REPLACE, handle);
5882                 if (rc != 0)
5883                         GOTO(out, rc);
5884
5885                 rc = dt_trans_start_local(env, dev, handle);
5886                 if (rc != 0)
5887                         GOTO(out, rc);
5888
5889                 dt_write_lock(env, obj, 0);
5890                 locked = true;
5891
5892                 goto again;
5893         }
5894
5895         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5896                 struct lov_mds_md_v1 *v1;
5897                 int i;
5898
5899                 for (i = 0; i < count; i++) {
5900                         lcme = &lcm->lcm_entries[i];
5901                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5902                         v1->lmm_oi = *oi;
5903                 }
5904         } else {
5905                 lmm->lmm_oi = *oi;
5906         }
5907
5908         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5909                           LU_XATTR_REPLACE, handle);
5910         if (rc != 0)
5911                 GOTO(out, rc);
5912
5913         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5914
5915         GOTO(out, stripe = true);
5916
5917 out:
5918         if (locked) {
5919                 if (lustre_handle_is_used(&lh))
5920                         dt_write_unlock(env, obj);
5921                 else
5922                         dt_read_unlock(env, obj);
5923         }
5924
5925         if (handle != NULL && !IS_ERR(handle))
5926                 dt_trans_stop(env, dev, handle);
5927
5928         lfsck_ibits_unlock(&lh, LCK_EX);
5929
5930         if (bad_oi)
5931                 CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for "
5932                        DFID": rc = %d\n", lfsck_lfsck2name(lfsck),
5933                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5934                        PFID(lfsck_dto2fid(obj)), rc);
5935
5936         if (stripe) {
5937                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5938                         int i;
5939
5940                         for (i = 0; i < count; i++) {
5941                                 lcme = &lcm->lcm_entries[i];
5942                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5943                                       LCME_FL_INIT))
5944                                         continue;
5945
5946                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5947                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5948                                         le32_to_cpu(lcme->lcme_offset)),
5949                                         le32_to_cpu(lcme->lcme_id));
5950                         }
5951                 } else {
5952                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5953                 }
5954         } else {
5955                 down_write(&com->lc_sem);
5956                 com->lc_new_checked++;
5957                 if (rc < 0)
5958                         lfsck_layout_record_failure(env, lfsck, lo);
5959                 up_write(&com->lc_sem);
5960         }
5961
5962         return rc;
5963 }
5964
5965 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5966                                        struct lfsck_component *com,
5967                                        struct dt_object *obj)
5968 {
5969         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5970         struct lfsck_layout             *lo     = com->lc_file_ram;
5971         const struct lu_fid             *fid    = lfsck_dto2fid(obj);
5972         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5973         struct lfsck_layout_seq         *lls;
5974         __u64                            seq;
5975         __u64                            oid;
5976         int                              rc;
5977         ENTRY;
5978
5979         LASSERT(llsd != NULL);
5980
5981         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5982             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5983                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5984
5985                 wait_event_idle_timeout(thread->t_ctl_waitq,
5986                                         !thread_is_running(thread),
5987                                         cfs_time_seconds(1));
5988         }
5989
5990         lfsck_rbtree_update_bitmap(env, com, fid, false);
5991
5992         down_write(&com->lc_sem);
5993         if (fid_is_idif(fid))
5994                 seq = 0;
5995         else if (!fid_is_norm(fid) ||
5996                  !fid_is_for_ostobj(env, lfsck, obj, fid))
5997                 GOTO(unlock, rc = 0);
5998         else
5999                 seq = fid_seq(fid);
6000         com->lc_new_checked++;
6001
6002         lls = lfsck_layout_seq_lookup(llsd, seq);
6003         if (lls == NULL) {
6004                 OBD_ALLOC_PTR(lls);
6005                 if (unlikely(lls == NULL))
6006                         GOTO(unlock, rc = -ENOMEM);
6007
6008                 INIT_LIST_HEAD(&lls->lls_list);
6009                 lls->lls_seq = seq;
6010                 rc = lfsck_layout_lastid_load(env, com, lls);
6011                 if (rc != 0) {
6012                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
6013                               "load LAST_ID for %#llx: rc = %d\n",
6014                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
6015                         lo->ll_objs_failed_phase1++;
6016                         OBD_FREE_PTR(lls);
6017                         GOTO(unlock, rc);
6018                 }
6019
6020                 lfsck_layout_seq_insert(llsd, lls);
6021         }
6022
6023         if (unlikely(fid_is_last_id(fid)))
6024                 GOTO(unlock, rc = 0);
6025
6026         if (fid_is_idif(fid))
6027                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
6028         else
6029                 oid = fid_oid(fid);
6030
6031         if (oid > lls->lls_lastid_known)
6032                 lls->lls_lastid_known = oid;
6033
6034         if (oid > lls->lls_lastid) {
6035                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
6036                         /* OFD may create new objects during LFSCK scanning. */
6037                         rc = lfsck_layout_lastid_reload(env, com, lls);
6038                         if (unlikely(rc != 0)) {
6039                                 CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
6040                                       "reload LAST_ID for %#llx: rc = %d\n",
6041                                       lfsck_lfsck2name(com->lc_lfsck),
6042                                       lls->lls_seq, rc);
6043
6044                                 GOTO(unlock, rc);
6045                         }
6046
6047                         if (oid <= lls->lls_lastid ||
6048                             lo->ll_flags & LF_CRASHED_LASTID)
6049                                 GOTO(unlock, rc = 0);
6050
6051                         LASSERT(lfsck->li_out_notify != NULL);
6052
6053                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6054                                              LE_LASTID_REBUILDING);
6055                         lo->ll_flags |= LF_CRASHED_LASTID;
6056
6057                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
6058                                "LAST_ID file (2) for the sequence %#llx"
6059                                ", old value %llu, known value %llu\n",
6060                                lfsck_lfsck2name(lfsck), lls->lls_seq,
6061                                lls->lls_lastid, oid);
6062                 }
6063
6064                 lls->lls_lastid = oid;
6065                 lls->lls_dirty = 1;
6066         }
6067
6068         GOTO(unlock, rc = 0);
6069
6070 unlock:
6071         up_write(&com->lc_sem);
6072
6073         return rc;
6074 }
6075
6076 static int lfsck_layout_exec_dir(const struct lu_env *env,
6077                                  struct lfsck_component *com,
6078                                  struct lfsck_assistant_object *lso,
6079                                  struct lu_dirent *ent, __u16 type)
6080 {
6081         return 0;
6082 }
6083
6084 static int lfsck_layout_master_post(const struct lu_env *env,
6085                                     struct lfsck_component *com,
6086                                     int result, bool init)
6087 {
6088         struct lfsck_instance   *lfsck  = com->lc_lfsck;
6089         struct lfsck_layout     *lo     = com->lc_file_ram;
6090         int                      rc;
6091         ENTRY;
6092
6093         lfsck_post_generic(env, com, &result);
6094
6095         down_write(&com->lc_sem);
6096         spin_lock(&lfsck->li_lock);
6097         if (!init)
6098                 lo->ll_pos_last_checkpoint =
6099                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6100
6101         if (result > 0) {
6102                 if (lo->ll_flags & LF_INCOMPLETE)
6103                         lo->ll_status = LS_PARTIAL;
6104                 else
6105                         lo->ll_status = LS_SCANNING_PHASE2;
6106                 lo->ll_flags |= LF_SCANNED_ONCE;
6107                 lo->ll_flags &= ~LF_UPGRADE;
6108                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6109         } else if (result == 0) {
6110                 if (lfsck->li_status != 0)
6111                         lo->ll_status = lfsck->li_status;
6112                 else
6113                         lo->ll_status = LS_STOPPED;
6114                 if (lo->ll_status != LS_PAUSED)
6115                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6116         } else {
6117                 lo->ll_status = LS_FAILED;
6118                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6119         }
6120         spin_unlock(&lfsck->li_lock);
6121
6122         if (!init) {
6123                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6124                                           lfsck->li_time_last_checkpoint;
6125                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6126                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6127                 com->lc_new_checked = 0;
6128         }
6129
6130         rc = lfsck_layout_store(env, com);
6131         up_write(&com->lc_sem);
6132
6133         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
6134                lfsck_lfsck2name(lfsck), rc);
6135
6136         RETURN(rc);
6137 }
6138
6139 static int lfsck_layout_slave_post(const struct lu_env *env,
6140                                    struct lfsck_component *com,
6141                                    int result, bool init)
6142 {
6143         struct lfsck_instance   *lfsck = com->lc_lfsck;
6144         struct lfsck_layout     *lo    = com->lc_file_ram;
6145         int                      rc;
6146         bool                     done  = false;
6147
6148         down_write(&com->lc_sem);
6149         rc = lfsck_layout_lastid_store(env, com);
6150         if (rc != 0)
6151                 result = rc;
6152
6153         LASSERT(lfsck->li_out_notify != NULL);
6154
6155         spin_lock(&lfsck->li_lock);
6156         if (!init)
6157                 lo->ll_pos_last_checkpoint =
6158                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6159
6160         if (result > 0) {
6161                 lo->ll_status = LS_SCANNING_PHASE2;
6162                 lo->ll_flags |= LF_SCANNED_ONCE;
6163                 if (lo->ll_flags & LF_CRASHED_LASTID) {
6164                         done = true;
6165                         lo->ll_flags &= ~LF_CRASHED_LASTID;
6166
6167                         CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt "
6168                                "crashed LAST_ID files successfully\n",
6169                                lfsck_lfsck2name(lfsck));
6170                 }
6171                 lo->ll_flags &= ~LF_UPGRADE;
6172                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6173         } else if (result == 0) {
6174                 if (lfsck->li_status != 0)
6175                         lo->ll_status = lfsck->li_status;
6176                 else
6177                         lo->ll_status = LS_STOPPED;
6178                 if (lo->ll_status != LS_PAUSED)
6179                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6180         } else {
6181                 lo->ll_status = LS_FAILED;
6182                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6183         }
6184         spin_unlock(&lfsck->li_lock);
6185
6186         if (done)
6187                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6188                                      LE_LASTID_REBUILT);
6189
6190         if (!init) {
6191                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6192                                           lfsck->li_time_last_checkpoint;
6193                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6194                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6195                 com->lc_new_checked = 0;
6196         }
6197
6198         rc = lfsck_layout_store(env, com);
6199         up_write(&com->lc_sem);
6200
6201         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
6202
6203         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
6204                lfsck_lfsck2name(lfsck), rc);
6205
6206         return rc;
6207 }
6208
6209 static void lfsck_layout_dump(const struct lu_env *env,
6210                               struct lfsck_component *com, struct seq_file *m)
6211 {
6212         struct lfsck_instance   *lfsck = com->lc_lfsck;
6213         struct lfsck_bookmark   *bk    = &lfsck->li_bookmark_ram;
6214         struct lfsck_layout     *lo    = com->lc_file_ram;
6215         const char *prefix;
6216
6217         down_read(&com->lc_sem);
6218         if (bk->lb_param & LPF_DRYRUN)
6219                 prefix = "inconsistent";
6220         else
6221                 prefix = "repaired";
6222
6223         seq_printf(m, "name: lfsck_layout\n"
6224                    "magic: %#x\n"
6225                    "version: %d\n"
6226                    "status: %s\n",
6227                    lo->ll_magic,
6228                    bk->lb_version,
6229                    lfsck_status2name(lo->ll_status));
6230
6231         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
6232
6233         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
6234
6235         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
6236
6237         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
6238
6239         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
6240
6241         seq_printf(m, "latest_start_position: %llu\n"
6242                    "last_checkpoint_position: %llu\n"
6243                    "first_failure_position: %llu\n",
6244                    lo->ll_pos_latest_start,
6245                    lo->ll_pos_last_checkpoint,
6246                    lo->ll_pos_first_inconsistent);
6247
6248         seq_printf(m, "success_count: %u\n"
6249                    "%s_dangling: %llu\n"
6250                    "%s_unmatched_pair: %llu\n"
6251                    "%s_multiple_referenced: %llu\n"
6252                    "%s_orphan: %llu\n"
6253                    "%s_inconsistent_owner: %llu\n"
6254                    "%s_others: %llu\n"
6255                    "skipped: %llu\n"
6256                    "failed_phase1: %llu\n"
6257                    "failed_phase2: %llu\n",
6258                    lo->ll_success_count,
6259                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
6260                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
6261                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
6262                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
6263                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
6264                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
6265                    lo->ll_objs_skipped,
6266                    lo->ll_objs_failed_phase1,
6267                    lo->ll_objs_failed_phase2);
6268
6269         if (lo->ll_status == LS_SCANNING_PHASE1) {
6270                 time64_t duration = ktime_get_seconds() -
6271                                     lfsck->li_time_last_checkpoint;
6272                 u64 checked = lo->ll_objs_checked_phase1 +
6273                               com->lc_new_checked;
6274                 u64 speed = checked;
6275                 u64 new_checked = com->lc_new_checked;
6276                 time64_t rtime = lo->ll_run_time_phase1 + duration;
6277                 u64 pos;
6278
6279                 if (duration != 0)
6280                         new_checked = div64_s64(new_checked, duration);
6281                 if (rtime != 0)
6282                         speed = div64_s64(speed, rtime);
6283                 seq_printf(m, "checked_phase1: %llu\n"
6284                            "checked_phase2: %llu\n"
6285                            "run_time_phase1: %lld seconds\n"
6286                            "run_time_phase2: %lld seconds\n"
6287                            "average_speed_phase1: %llu items/sec\n"
6288                            "average_speed_phase2: N/A\n"
6289                            "real_time_speed_phase1: %llu items/sec\n"
6290                            "real_time_speed_phase2: N/A\n",
6291                            checked,
6292                            lo->ll_objs_checked_phase2,
6293                            rtime,
6294                            lo->ll_run_time_phase2,
6295                            speed,
6296                            new_checked);
6297
6298                 if (likely(lfsck->li_di_oit)) {
6299                         const struct dt_it_ops *iops =
6300                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
6301
6302                         /* The low layer otable-based iteration position may NOT
6303                          * exactly match the layout-based directory traversal
6304                          * cookie. Generally, it is not a serious issue. But the
6305                          * caller should NOT make assumption on that. */
6306                         pos = iops->store(env, lfsck->li_di_oit);
6307                         if (!lfsck->li_current_oit_processed)
6308                                 pos--;
6309                 } else {
6310                         pos = lo->ll_pos_last_checkpoint;
6311                 }
6312
6313                 seq_printf(m, "current_position: %llu\n", pos);
6314         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
6315                 time64_t duration = ktime_get_seconds() -
6316                                     com->lc_time_last_checkpoint;
6317                 u64 checked = lo->ll_objs_checked_phase2 +
6318                               com->lc_new_checked;
6319                 u64 speed1 = lo->ll_objs_checked_phase1;
6320                 u64 speed2 = checked;
6321                 u64 new_checked = com->lc_new_checked;
6322                 time64_t rtime = lo->ll_run_time_phase2 + duration;
6323
6324                 if (duration != 0)
6325                         new_checked = div64_s64(new_checked, duration);
6326                 if (lo->ll_run_time_phase1 != 0)
6327                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6328                 if (rtime != 0)
6329                         speed2 = div64_s64(speed2, rtime);
6330                 seq_printf(m, "checked_phase1: %llu\n"
6331                            "checked_phase2: %llu\n"
6332                            "run_time_phase1: %lld seconds\n"
6333                            "run_time_phase2: %lld seconds\n"
6334                            "average_speed_phase1: %llu items/sec\n"
6335                            "average_speed_phase2: %llu items/sec\n"
6336                            "real_time_speed_phase1: N/A\n"
6337                            "real_time_speed_phase2: %llu items/sec\n"
6338                            "current_position: "DFID"\n",
6339                            lo->ll_objs_checked_phase1,
6340                            checked,
6341                            lo->ll_run_time_phase1,
6342                            rtime,
6343                            speed1,
6344                            speed2,
6345                            new_checked,
6346                            PFID(&com->lc_fid_latest_scanned_phase2));
6347         } else {
6348                 __u64 speed1 = lo->ll_objs_checked_phase1;
6349                 __u64 speed2 = lo->ll_objs_checked_phase2;
6350
6351                 if (lo->ll_run_time_phase1 != 0)
6352                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6353                 if (lo->ll_run_time_phase2 != 0)
6354                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
6355                 seq_printf(m, "checked_phase1: %llu\n"
6356                            "checked_phase2: %llu\n"
6357                            "run_time_phase1: %lld seconds\n"
6358                            "run_time_phase2: %lld seconds\n"
6359                            "average_speed_phase1: %llu items/sec\n"
6360                            "average_speed_phase2: %llu objs/sec\n"
6361                            "real_time_speed_phase1: N/A\n"
6362                            "real_time_speed_phase2: N/A\n"
6363                            "current_position: N/A\n",
6364                            lo->ll_objs_checked_phase1,
6365                            lo->ll_objs_checked_phase2,
6366                            lo->ll_run_time_phase1,
6367                            lo->ll_run_time_phase2,
6368                            speed1,
6369                            speed2);
6370         }
6371
6372         up_read(&com->lc_sem);
6373 }
6374
6375 static int lfsck_layout_master_double_scan(const struct lu_env *env,
6376                                            struct lfsck_component *com)
6377 {
6378         struct lfsck_layout             *lo     = com->lc_file_ram;
6379         struct lfsck_assistant_data     *lad    = com->lc_data;
6380         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6381         struct lfsck_tgt_descs          *ltds;
6382         struct lfsck_tgt_desc           *ltd;
6383         struct lfsck_tgt_desc           *next;
6384         int                              rc;
6385
6386         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6387
6388         if (thread_is_stopped(&lad->lad_thread)) {
6389                 LASSERT(list_empty(&lad->lad_req_list));
6390                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6391                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6392
6393                 ltds = &lfsck->li_ost_descs;
6394                 spin_lock(&ltds->ltd_lock);
6395                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6396                                          ltd_layout_phase_list) {
6397                         list_del_init(&ltd->ltd_layout_phase_list);
6398                 }
6399                 spin_unlock(&ltds->ltd_lock);
6400
6401                 ltds = &lfsck->li_mdt_descs;
6402                 spin_lock(&ltds->ltd_lock);
6403                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6404                                          ltd_layout_phase_list) {
6405                         list_del_init(&ltd->ltd_layout_phase_list);
6406                 }
6407                 spin_unlock(&ltds->ltd_lock);
6408         }
6409
6410         return rc;
6411 }
6412
6413 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6414                                           struct lfsck_component *com)
6415 {
6416         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6417         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
6418         struct lfsck_layout             *lo     = com->lc_file_ram;
6419         struct ptlrpc_thread            *thread = &lfsck->li_thread;
6420         int                              rc;
6421         ENTRY;
6422
6423         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6424                lfsck_lfsck2name(lfsck));
6425
6426         atomic_inc(&lfsck->li_double_scan_count);
6427
6428         if (lo->ll_flags & LF_INCOMPLETE)
6429                 GOTO(done, rc = 1);
6430
6431         com->lc_new_checked = 0;
6432         com->lc_new_scanned = 0;
6433         com->lc_time_last_checkpoint = ktime_get_seconds();
6434         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6435                                        LFSCK_CHECKPOINT_INTERVAL;
6436
6437         while (1) {
6438                 rc = lfsck_layout_slave_query_master(env, com);
6439                 if (list_empty(&llsd->llsd_master_list)) {
6440                         if (unlikely(!thread_is_running(thread)))
6441                                 rc = 0;
6442                         else
6443                                 rc = 1;
6444
6445                         GOTO(done, rc);
6446                 }
6447
6448                 if (rc < 0)
6449                         GOTO(done, rc);
6450
6451                 rc = wait_event_idle_timeout(
6452                         thread->t_ctl_waitq,
6453                         !thread_is_running(thread) ||
6454                         lo->ll_flags & LF_INCOMPLETE ||
6455                         list_empty(&llsd->llsd_master_list),
6456                         cfs_time_seconds(30));
6457                 if (unlikely(!thread_is_running(thread)))
6458                         GOTO(done, rc = 0);
6459
6460                 if (lo->ll_flags & LF_INCOMPLETE)
6461                         GOTO(done, rc = 1);
6462
6463                 if (rc == 0)
6464                         continue;
6465
6466                 GOTO(done, rc = 1);
6467         }
6468
6469 done:
6470         rc = lfsck_layout_double_scan_result(env, com, rc);
6471         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6472                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6473         lfsck_layout_slave_quit(env, com);
6474         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6475                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6476
6477         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, "
6478                "status %d: rc = %d\n",
6479                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6480
6481         return rc;
6482 }
6483
6484 static void lfsck_layout_master_data_release(const struct lu_env *env,
6485                                              struct lfsck_component *com)
6486 {
6487         struct lfsck_assistant_data     *lad    = com->lc_data;
6488         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6489         struct lfsck_tgt_descs          *ltds;
6490         struct lfsck_tgt_desc           *ltd;
6491         struct lfsck_tgt_desc           *next;
6492
6493         LASSERT(lad != NULL);
6494         LASSERT(thread_is_init(&lad->lad_thread) ||
6495                 thread_is_stopped(&lad->lad_thread));
6496         LASSERT(list_empty(&lad->lad_req_list));
6497
6498         com->lc_data = NULL;
6499
6500         ltds = &lfsck->li_ost_descs;
6501         spin_lock(&ltds->ltd_lock);
6502         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6503                                  ltd_layout_phase_list) {
6504                 list_del_init(&ltd->ltd_layout_phase_list);
6505         }
6506         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6507                                  ltd_layout_phase_list) {
6508                 list_del_init(&ltd->ltd_layout_phase_list);
6509         }
6510         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6511                                  ltd_layout_list) {
6512                 list_del_init(&ltd->ltd_layout_list);
6513         }
6514         spin_unlock(&ltds->ltd_lock);
6515
6516         ltds = &lfsck->li_mdt_descs;
6517         spin_lock(&ltds->ltd_lock);
6518         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6519                                  ltd_layout_phase_list) {
6520                 list_del_init(&ltd->ltd_layout_phase_list);
6521         }
6522         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6523                                  ltd_layout_phase_list) {
6524                 list_del_init(&ltd->ltd_layout_phase_list);
6525         }
6526         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6527                                  ltd_layout_list) {
6528                 list_del_init(&ltd->ltd_layout_list);
6529         }
6530         spin_unlock(&ltds->ltd_lock);
6531
6532         bitmap_free(lad->lad_bitmap);
6533
6534         OBD_FREE_PTR(lad);
6535 }
6536
6537 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6538                                             struct lfsck_component *com)
6539 {
6540         struct lfsck_layout_slave_data *llsd = com->lc_data;
6541
6542         lfsck_layout_slave_quit(env, com);
6543         com->lc_data = NULL;
6544         OBD_FREE_PTR(llsd);
6545 }
6546
6547 static void lfsck_layout_master_quit(const struct lu_env *env,
6548                                      struct lfsck_component *com)
6549 {
6550         struct lfsck_assistant_data     *lad    = com->lc_data;
6551         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6552         struct lfsck_tgt_descs          *ltds;
6553         struct lfsck_tgt_desc           *ltd;
6554         struct lfsck_tgt_desc           *next;
6555
6556         LASSERT(lad != NULL);
6557
6558         lfsck_quit_generic(env, com);
6559
6560         LASSERT(thread_is_init(&lad->lad_thread) ||
6561                 thread_is_stopped(&lad->lad_thread));
6562         LASSERT(list_empty(&lad->lad_req_list));
6563
6564         ltds = &lfsck->li_ost_descs;
6565         spin_lock(&ltds->ltd_lock);
6566         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6567                                  ltd_layout_phase_list) {
6568                 list_del_init(&ltd->ltd_layout_phase_list);
6569         }
6570         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6571                                  ltd_layout_phase_list) {
6572                 list_del_init(&ltd->ltd_layout_phase_list);
6573         }
6574         spin_unlock(&ltds->ltd_lock);
6575
6576         ltds = &lfsck->li_mdt_descs;
6577         spin_lock(&ltds->ltd_lock);
6578         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6579                                  ltd_layout_phase_list) {
6580                 list_del_init(&ltd->ltd_layout_phase_list);
6581         }
6582         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6583                                  ltd_layout_phase_list) {
6584                 list_del_init(&ltd->ltd_layout_phase_list);
6585         }
6586         spin_unlock(&ltds->ltd_lock);
6587 }
6588
6589 static void lfsck_layout_slave_quit(const struct lu_env *env,
6590                                     struct lfsck_component *com)
6591 {
6592         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6593         struct lfsck_layout_seq          *lls;
6594         struct lfsck_layout_seq          *next;
6595         struct lfsck_layout_slave_target *llst;
6596
6597         LASSERT(llsd != NULL);
6598
6599         down_write(&com->lc_sem);
6600         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6601                                  lls_list) {
6602                 list_del_init(&lls->lls_list);
6603                 lfsck_object_put(env, lls->lls_lastid_obj);
6604                 OBD_FREE_PTR(lls);
6605         }
6606         up_write(&com->lc_sem);
6607
6608         spin_lock(&llsd->llsd_lock);
6609         while (!list_empty(&llsd->llsd_master_list)) {
6610                 llst = list_first_entry(&llsd->llsd_master_list,
6611                                         struct lfsck_layout_slave_target,
6612                                         llst_list);
6613                 list_del_init(&llst->llst_list);
6614                 spin_unlock(&llsd->llsd_lock);
6615                 lfsck_layout_llst_put(llst);
6616                 spin_lock(&llsd->llsd_lock);
6617         }
6618         spin_unlock(&llsd->llsd_lock);
6619
6620         lfsck_rbtree_cleanup(env, com);
6621 }
6622
/*
 * Handle a notification delivered to the layout LFSCK master.
 *
 * LE_PAIRS_VERIFY is handed to lfsck_layout_master_check_pairs() and its
 * result is returned as is.  Apart from that only LE_PHASE1_DONE,
 * LE_PHASE2_DONE and LE_PEER_EXIT are accepted; they update the target
 * descriptor selected by lr->lr_index in either the OST or the MDT
 * descriptor table (chosen by LEF_FROM_OST in lr->lr_flags).
 *
 * \param[in] env       execution environment
 * \param[in] com       the layout LFSCK component
 * \param[in] lr        the received request
 *
 * \retval 0            the event has been processed
 * \retval -EINVAL      lr->lr_event is not one of the handled events
 * \retval -ENXIO       no target descriptor exists for lr->lr_index
 * \retval other        result of lfsck_layout_master_check_pairs() for
 *                      LE_PAIRS_VERIFY
 */
static int lfsck_layout_master_in_notify(const struct lu_env *env,
                                         struct lfsck_component *com,
                                         struct lfsck_request *lr)
{
        struct lfsck_instance           *lfsck = com->lc_lfsck;
        struct lfsck_layout             *lo    = com->lc_file_ram;
        struct lfsck_assistant_data     *lad   = com->lc_data;
        struct lfsck_tgt_descs          *ltds;
        struct lfsck_tgt_desc           *ltd;
        bool                             fail  = false;
        ENTRY;

        if (lr->lr_event == LE_PAIRS_VERIFY) {
                int rc;

                rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
                                                     &lr->lr_fid2,
                                                     lr->lr_comp_id);

                RETURN(rc);
        }

        CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
               "from %s %x, status %d, flags %x, flags2 %x\n",
               lfsck_lfsck2name(lfsck), lr->lr_event,
               (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
               lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);

        if (lr->lr_event != LE_PHASE1_DONE &&
            lr->lr_event != LE_PHASE2_DONE &&
            lr->lr_event != LE_PEER_EXIT)
                RETURN(-EINVAL);

        if (lr->lr_flags & LEF_FROM_OST)
                ltds = &lfsck->li_ost_descs;
        else
                ltds = &lfsck->li_mdt_descs;
        spin_lock(&ltds->ltd_lock);
        ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
        if (ltd == NULL) {
                spin_unlock(&ltds->ltd_lock);

                RETURN(-ENXIO);
        }

        /* Whatever the event is, the target leaves its current phase list;
         * only a successful LE_PHASE1_DONE re-links it (on a phase2 list). */
        list_del_init(&ltd->ltd_layout_phase_list);
        switch (lr->lr_event) {
        case LE_PHASE1_DONE:
                /* Non-positive status or an incomplete scan on the peer:
                 * mark the target as done and treat it as a failure. */
                if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
                        if (lr->lr_flags2 & LF_INCOMPLETE) {
                                /* An incomplete OST is recorded in the
                                 * assistant bitmap, an incomplete MDT marks
                                 * the local layout file as incomplete. */
                                if (lr->lr_flags & LEF_FROM_OST)
                                        lfsck_lad_set_bitmap(env, com,
                                                             ltd->ltd_index);
                                else
                                        lo->ll_flags |= LF_INCOMPLETE;
                        }
                        ltd->ltd_layout_done = 1;
                        list_del_init(&ltd->ltd_layout_list);
                        fail = true;
                        break;
                }

                /* Phase1 succeeded on the peer: make sure the target is on
                 * the matching target list and queue it for phase2. */
                if (lr->lr_flags & LEF_FROM_OST) {
                        if (list_empty(&ltd->ltd_layout_list))
                                list_add_tail(&ltd->ltd_layout_list,
                                              &lad->lad_ost_list);
                        list_add_tail(&ltd->ltd_layout_phase_list,
                                      &lad->lad_ost_phase2_list);
                } else {
                        if (list_empty(&ltd->ltd_layout_list))
                                list_add_tail(&ltd->ltd_layout_list,
                                              &lad->lad_mdt_list);
                        list_add_tail(&ltd->ltd_layout_phase_list,
                                      &lad->lad_mdt_phase2_list);
                }
                break;
        case LE_PHASE2_DONE:
                ltd->ltd_layout_done = 1;
                if (!list_empty(&ltd->ltd_layout_list))
                        list_del_init(&ltd->ltd_layout_list);

                if (lr->lr_flags2 & LF_INCOMPLETE) {
                        lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
                        fail = true;
                }

                break;
        case LE_PEER_EXIT:
                fail = true;
                ltd->ltd_layout_done = 1;
                list_del_init(&ltd->ltd_layout_list);
                /* Without LPF_FAILOUT the scan goes on, so an exited MDT
                 * peer is remembered by flagging the result as incomplete. */
                if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
                    !(lr->lr_flags & LEF_FROM_OST))
                                lo->ll_flags |= LF_INCOMPLETE;
                break;
        default:
                break;
        }
        spin_unlock(&ltds->ltd_lock);

        /* With LPF_FAILOUT set a peer failure stops the local LFSCK;
         * otherwise wake the assistant thread if lfsck_phase2_next_ready()
         * reports it can proceed. */
        if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
                struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;

                memset(stop, 0, sizeof(*stop));
                stop->ls_status = lr->lr_status;
                stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
                lfsck_stop(env, lfsck->li_bottom, stop);
        } else if (lfsck_phase2_next_ready(lad)) {
                wake_up(&lad->lad_thread.t_ctl_waitq);
        }

        RETURN(0);
}
6736
6737 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6738                                               struct lfsck_component *com,
6739                                               struct lfsck_req_local *lrl,
6740                                               struct thandle *th)
6741 {
6742         ENTRY;
6743
6744         switch (lrl->lrl_event) {
6745         case LEL_FID_ACCESSED:
6746                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6747                 RETURN(0);
6748         case LEL_PAIRS_VERIFY_LOCAL: {
6749                 int rc;
6750
6751                 lrl->lrl_status = LPVS_INIT;
6752                 /* Firstly, if the MDT-object which is claimed via OST-object
6753                  * local stored PFID xattr recognizes the OST-object, then it
6754                  * must be that the client given PFID is wrong. */
6755                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6756                                 &lrl->lrl_ff_local.ff_parent,
6757                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6758                 if (rc <= 0)
6759                         RETURN(0);
6760
6761                 lrl->lrl_status = LPVS_INCONSISTENT;
6762                 /* The OST-object local stored PFID xattr is stale. We need to
6763                  * check whether the MDT-object that is claimed via the client
6764                  * given PFID information recognizes the OST-object or not. If
6765                  * matches, then need to update the OST-object's PFID xattr. */
6766                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6767                                 &lrl->lrl_ff_client.ff_parent,
6768                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6769                 /* For rc < 0 case:
6770                  * We are not sure whether the client given PFID information
6771                  * is correct or not, do nothing to avoid improper fixing.
6772                  *
6773                  * For rc > 0 case:
6774                  * The client given PFID information is also invalid, we can
6775                  * NOT fix the OST-object inconsistency.
6776                  */
6777                 if (!rc) {
6778                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6779                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6780                 }
6781
6782                 RETURN(rc);
6783         }
6784         default:
6785                 break;
6786         }
6787
6788         RETURN(-EOPNOTSUPP);
6789 }
6790
6791 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6792                                         struct lfsck_component *com,
6793                                         struct lfsck_request *lr)
6794 {
6795         struct lfsck_instance *lfsck = com->lc_lfsck;
6796         struct lfsck_layout_slave_data *llsd = com->lc_data;
6797         struct lfsck_layout_slave_target *llst;
6798         int rc;
6799         ENTRY;
6800
6801         switch (lr->lr_event) {
6802         case LE_CONDITIONAL_DESTROY:
6803                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6804                 RETURN(rc);
6805         case LE_PHASE1_DONE: {
6806                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6807                         struct lfsck_layout *lo = com->lc_file_ram;
6808
6809                         lo->ll_flags |= LF_INCOMPLETE;
6810                         llst = lfsck_layout_llst_find_and_del(llsd,
6811                                                               lr->lr_index,
6812                                                               true);
6813                         if (llst != NULL) {
6814                                 lfsck_layout_llst_put(llst);
6815                                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6816                         }
6817                 }
6818
6819                 RETURN(0);
6820         }
6821         case LE_PHASE2_DONE:
6822         case LE_PEER_EXIT:
6823                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u "
6824                        "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck),
6825                        lr->lr_event, lr->lr_index, lr->lr_status);
6826                 break;
6827         default:
6828                 RETURN(-EINVAL);
6829         }
6830
6831         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6832         if (llst == NULL)
6833                 RETURN(0);
6834
6835         lfsck_layout_llst_put(llst);
6836         if (list_empty(&llsd->llsd_master_list))
6837                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6838
6839         if (lr->lr_event == LE_PEER_EXIT &&
6840             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6841              (list_empty(&llsd->llsd_master_list) &&
6842               (lr->lr_status == LS_STOPPED ||
6843                lr->lr_status == LS_CO_STOPPED)))) {
6844                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6845
6846                 memset(stop, 0, sizeof(*stop));
6847                 stop->ls_status = lr->lr_status;
6848                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6849                 lfsck_stop(env, lfsck->li_bottom, stop);
6850         }
6851
6852         RETURN(0);
6853 }
6854
6855 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6856 {
6857         int i;
6858
6859         for (i = 0; i < LLIT_MAX; i++)
6860                 *count += lo->ll_objs_repaired[i];
6861 }
6862
6863 static int lfsck_layout_query_all(const struct lu_env *env,
6864                                   struct lfsck_component *com,
6865                                   __u32 *mdts_count, __u32 *osts_count,
6866                                   __u64 *repaired)
6867 {
6868         struct lfsck_layout *lo = com->lc_file_ram;
6869         struct lfsck_tgt_descs *ltds;
6870         struct lfsck_tgt_desc *ltd;
6871         int idx;
6872         int rc;
6873         ENTRY;
6874
6875         rc = lfsck_query_all(env, com);
6876         if (rc != 0)
6877                 RETURN(rc);
6878
6879         ltds = &com->lc_lfsck->li_mdt_descs;
6880         down_read(&ltds->ltd_rw_sem);
6881         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6882                 ltd = lfsck_ltd2tgt(ltds, idx);
6883                 LASSERT(ltd != NULL);
6884
6885                 mdts_count[ltd->ltd_layout_status]++;
6886                 *repaired += ltd->ltd_layout_repaired;
6887         }
6888         up_read(&ltds->ltd_rw_sem);
6889
6890         ltds = &com->lc_lfsck->li_ost_descs;
6891         down_read(&ltds->ltd_rw_sem);
6892         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6893                 ltd = lfsck_ltd2tgt(ltds, idx);
6894                 LASSERT(ltd != NULL);
6895
6896                 osts_count[ltd->ltd_layout_status]++;
6897                 *repaired += ltd->ltd_layout_repaired;
6898         }
6899         up_read(&ltds->ltd_rw_sem);
6900
6901         down_read(&com->lc_sem);
6902         mdts_count[lo->ll_status]++;
6903         lfsck_layout_repaired(lo, repaired);
6904         up_read(&com->lc_sem);
6905
6906         RETURN(0);
6907 }
6908
6909 static int lfsck_layout_query(const struct lu_env *env,
6910                               struct lfsck_component *com,
6911                               struct lfsck_request *req,
6912                               struct lfsck_reply *rep,
6913                               struct lfsck_query *que, int idx)
6914 {
6915         struct lfsck_layout *lo = com->lc_file_ram;
6916         int rc = 0;
6917
6918         if (que != NULL) {
6919                 LASSERT(com->lc_lfsck->li_master);
6920
6921                 rc = lfsck_layout_query_all(env, com,
6922                                             que->lu_mdts_count[idx],
6923                                             que->lu_osts_count[idx],
6924                                             &que->lu_repaired[idx]);
6925         } else {
6926                 down_read(&com->lc_sem);
6927                 rep->lr_status = lo->ll_status;
6928                 if (req->lr_flags & LEF_QUERY_ALL)
6929                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6930                 up_read(&com->lc_sem);
6931         }
6932
6933         return rc;
6934 }
6935
6936 /* with lfsck::li_lock held */
6937 static int lfsck_layout_slave_join(const struct lu_env *env,
6938                                    struct lfsck_component *com,
6939                                    struct lfsck_start_param *lsp)
6940 {
6941         struct lfsck_instance            *lfsck = com->lc_lfsck;
6942         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6943         struct lfsck_layout_slave_target *llst;
6944         struct lfsck_start               *start = lsp->lsp_start;
6945         int                               rc    = 0;
6946         ENTRY;
6947
6948         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6949                 RETURN(0);
6950
6951         if (!lsp->lsp_index_valid)
6952                 RETURN(-EINVAL);
6953
6954         /* If someone is running the LFSCK without orphan handling,
6955          * it will not maintain the object accessing rbtree. So we
6956          * cannot join it for orphan handling. */
6957         if (!llsd->llsd_rbtree_valid)
6958                 RETURN(-EBUSY);
6959
6960         spin_unlock(&lfsck->li_lock);
6961         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6962         spin_lock(&lfsck->li_lock);
6963         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6964                 spin_unlock(&lfsck->li_lock);
6965                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6966                                                       true);
6967                 if (llst != NULL)
6968                         lfsck_layout_llst_put(llst);
6969                 spin_lock(&lfsck->li_lock);
6970                 rc = -EAGAIN;
6971         }
6972
6973         RETURN(rc);
6974 }
6975
6976 static const struct lfsck_operations lfsck_layout_master_ops = {
6977         .lfsck_reset            = lfsck_layout_reset,
6978         .lfsck_fail             = lfsck_layout_fail,
6979         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
6980         .lfsck_prep             = lfsck_layout_master_prep,
6981         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
6982         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6983         .lfsck_post             = lfsck_layout_master_post,
6984         .lfsck_dump             = lfsck_layout_dump,
6985         .lfsck_double_scan      = lfsck_layout_master_double_scan,
6986         .lfsck_data_release     = lfsck_layout_master_data_release,
6987         .lfsck_quit             = lfsck_layout_master_quit,
6988         .lfsck_in_notify        = lfsck_layout_master_in_notify,
6989         .lfsck_query            = lfsck_layout_query,
6990 };
6991
6992 static const struct lfsck_operations lfsck_layout_slave_ops = {
6993         .lfsck_reset            = lfsck_layout_reset,
6994         .lfsck_fail             = lfsck_layout_fail,
6995         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
6996         .lfsck_prep             = lfsck_layout_slave_prep,
6997         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
6998         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6999         .lfsck_post             = lfsck_layout_slave_post,
7000         .lfsck_dump             = lfsck_layout_dump,
7001         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
7002         .lfsck_data_release     = lfsck_layout_slave_data_release,
7003         .lfsck_quit             = lfsck_layout_slave_quit,
7004         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
7005         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
7006         .lfsck_query            = lfsck_layout_query,
7007         .lfsck_join             = lfsck_layout_slave_join,
7008 };
7009
7010 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
7011                                             struct lfsck_component *com,
7012                                             struct lfsck_position *pos)
7013 {
7014         struct lfsck_assistant_data     *lad = com->lc_data;
7015         struct lfsck_layout_req         *llr;
7016
7017         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
7018             LS_SCANNING_PHASE1)
7019                 return;
7020
7021         if (list_empty(&lad->lad_req_list))
7022                 return;
7023
7024         llr = list_first_entry(&lad->lad_req_list,
7025                                struct lfsck_layout_req,
7026                                llr_lar.lar_list);
7027         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
7028 }
7029
7030 const struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
7031         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
7032         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
7033         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
7034         .la_double_scan_result  = lfsck_layout_double_scan_result,
7035         .la_req_fini            = lfsck_layout_assistant_req_fini,
7036         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
7037 };
7038
/*
 * Create and register the layout LFSCK component for \a lfsck.
 *
 * Allocates the component, attaches the master or slave method table and
 * private data, allocates the in-RAM and on-disk images of the layout
 * LFSCK file, finds or creates that file under the local root, loads (or
 * resets / initializes) it, and finally links the component on the idle or
 * the scan list of \a lfsck according to the loaded status.
 *
 * \param[in] env       execution environment
 * \param[in] lfsck     the LFSCK instance to attach the component to
 *
 * \retval 0            success
 * \retval negative     failure; the component has been cleaned up
 */
int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
{
        struct lfsck_component  *com;
        struct lfsck_layout     *lo;
        struct dt_object        *root = NULL;
        struct dt_object        *obj;
        int                      i;
        int                      rc;
        ENTRY;

        OBD_ALLOC_PTR(com);
        if (com == NULL)
                RETURN(-ENOMEM);

        INIT_LIST_HEAD(&com->lc_link);
        INIT_LIST_HEAD(&com->lc_link_dir);
        init_rwsem(&com->lc_sem);
        atomic_set(&com->lc_ref, 1);
        com->lc_lfsck = lfsck;
        com->lc_type = LFSCK_TYPE_LAYOUT;
        if (lfsck->li_master) {
                /* Master: lc_data is the generic assistant data. */
                com->lc_ops = &lfsck_layout_master_ops;
                com->lc_data = lfsck_assistant_data_init(
                                &lfsck_layout_assistant_ops,
                                LFSCK_LAYOUT);
                if (com->lc_data == NULL)
                        GOTO(out, rc = -ENOMEM);

                for (i = 0; i < LFSCK_STF_COUNT; i++)
                        mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
        } else {
                struct lfsck_layout_slave_data *llsd;

                /* Slave: lc_data is the slave private data. */
                com->lc_ops = &lfsck_layout_slave_ops;
                OBD_ALLOC_PTR(llsd);
                if (llsd == NULL)
                        GOTO(out, rc = -ENOMEM);

                INIT_LIST_HEAD(&llsd->llsd_seq_list);
                INIT_LIST_HEAD(&llsd->llsd_master_list);
                spin_lock_init(&llsd->llsd_lock);
                llsd->llsd_rb_root = RB_ROOT;
                init_rwsem(&llsd->llsd_rb_rwsem);
                com->lc_data = llsd;
        }
        com->lc_file_size = sizeof(*lo);
        OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
        if (com->lc_file_ram == NULL)
                GOTO(out, rc = -ENOMEM);

        OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
        if (com->lc_file_disk == NULL)
                GOTO(out, rc = -ENOMEM);

        root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
        if (IS_ERR(root))
                GOTO(out, rc = PTR_ERR(root));

        if (unlikely(!dt_try_as_dir(env, root, true)))
                GOTO(out, rc = -ENOTDIR);

        obj = local_file_find_or_create(env, lfsck->li_los, root,
                                        LFSCK_LAYOUT,
                                        S_IFREG | S_IRUGO | S_IWUSR);
        if (IS_ERR(obj))
                GOTO(out, rc = PTR_ERR(obj));

        com->lc_obj = obj;
        /* A positive result from the load leads to a reset, -ENOENT to a
         * fresh init.  NOTE(review): the last branch is also taken on a
         * master when the load returned a negative error other than -ENOENT;
         * rc is then overwritten by the sub trace file load - confirm this
         * is intended. */
        rc = lfsck_layout_load(env, com);
        if (rc > 0) {
                rc = lfsck_layout_reset(env, com, true);
        } else if (rc == -ENOENT) {
                rc = lfsck_layout_init(env, com);
        } else if (lfsck->li_master) {
                rc = lfsck_load_sub_trace_files(env, com,
                                &dt_lfsck_layout_dangling_features,
                                LFSCK_LAYOUT, false);
                if (rc)
                        rc = lfsck_layout_reset(env, com, true);
        }

        if (rc != 0)
                GOTO(out, rc);

        lo = com->lc_file_ram;
        switch (lo->ll_status) {
        case LS_INIT:
        case LS_COMPLETED:
        case LS_FAILED:
        case LS_STOPPED:
        case LS_PARTIAL:
                /* Nothing pending from an earlier run: park on idle list. */
                spin_lock(&lfsck->li_lock);
                list_add_tail(&com->lc_link, &lfsck->li_list_idle);
                spin_unlock(&lfsck->li_lock);
                break;
        default:
                CERROR("%s: unknown lfsck_layout status %d\n",
                       lfsck_lfsck2name(lfsck), lo->ll_status);
                fallthrough;
        case LS_SCANNING_PHASE1:
        case LS_SCANNING_PHASE2:
                /* No need to store the status to disk right now.
                 * If the system crashed before the status stored,
                 * it will be loaded back when next time. */
                lo->ll_status = LS_CRASHED;
                if (!lfsck->li_master)
                        lo->ll_flags |= LF_INCOMPLETE;
                fallthrough;
        case LS_PAUSED:
        case LS_CRASHED:
        case LS_CO_FAILED:
        case LS_CO_STOPPED:
        case LS_CO_PAUSED:
                /* An earlier run was interrupted: queue on the scan list. */
                spin_lock(&lfsck->li_lock);
                list_add_tail(&com->lc_link, &lfsck->li_list_scan);
                spin_unlock(&lfsck->li_lock);
                break;
        }

        if (lo->ll_flags & LF_CRASHED_LASTID) {
                LASSERT(lfsck->li_out_notify != NULL);

                lfsck->li_out_notify(env, lfsck->li_out_notify_data,
                                     LE_LASTID_REBUILDING);
        }

        GOTO(out, rc = 0);

out:
        /* The root object is only needed during setup; drop it on both the
         * success and the failure path. */
        if (root != NULL && !IS_ERR(root))
                lfsck_object_put(env, root);

        if (rc != 0) {
                lfsck_component_cleanup(env, com);
                CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
                       lfsck_lfsck2name(lfsck), rc);
        }

        return rc;
}
7179
/*
 * State of one orphan iteration, allocated by lfsck_orphan_it_init() and
 * handed to the caller as an opaque struct dt_it.
 */
struct lfsck_orphan_it {
        /* layout component; its reference is owned by the iterator */
        struct lfsck_component           *loi_com;
        /* presumably the rbtree node the iteration currently stands on -
         * TODO confirm against the iterator methods */
        struct lfsck_rbtree_node         *loi_lrn;
        /* slave target found by the index passed to lfsck_orphan_it_init() */
        struct lfsck_layout_slave_target *loi_llst;
        /* presumably the current key (FID) - TODO confirm */
        struct lu_fid                     loi_key;
        /* presumably the current record - TODO confirm */
        struct lu_orphan_rec_v3           loi_rec;
        /* presumably the current hash/cookie - TODO confirm */
        __u64                             loi_hash;
        /* presumably set once the iteration is over - TODO confirm */
        unsigned int                      loi_over:1;
};
7189
7190 static int lfsck_fid_match_idx(const struct lu_env *env,
7191                                struct lfsck_instance *lfsck,
7192                                const struct lu_fid *fid, int idx)
7193 {
7194         struct seq_server_site  *ss;
7195         struct lu_server_fld    *sf;
7196         struct lu_seq_range     *range = &lfsck_env_info(env)->lti_range;
7197         int                      rc;
7198
7199         /* All abnormal cases will be returned to MDT0. */
7200         if (!fid_is_norm(fid)) {
7201                 if (idx == 0)
7202                         return 1;
7203
7204                 return 0;
7205         }
7206
7207         ss = lfsck_dev_site(lfsck);
7208         if (unlikely(ss == NULL))
7209                 return -ENOTCONN;
7210
7211         sf = ss->ss_server_fld;
7212         LASSERT(sf != NULL);
7213
7214         fld_range_set_any(range);
7215         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
7216         if (rc != 0)
7217                 return rc;
7218
7219         if (!fld_range_is_mdt(range))
7220                 return -EINVAL;
7221
7222         if (range->lsr_index == idx)
7223                 return 1;
7224
7225         return 0;
7226 }
7227
7228 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
7229                                         struct lfsck_instance *lfsck,
7230                                         struct dt_object *obj)
7231 {
7232         struct dt_device        *dev    = lfsck_obj2dev(obj);
7233         struct thandle          *handle;
7234         int                      rc;
7235         ENTRY;
7236
7237         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
7238                 GOTO(log, rc = 0);
7239
7240         handle = lfsck_trans_create(env, dev, lfsck);
7241         if (IS_ERR(handle))
7242                 RETURN_EXIT;
7243
7244         rc = dt_declare_ref_del(env, obj, handle);
7245         if (rc != 0)
7246                 GOTO(stop, rc);
7247
7248         rc = dt_declare_destroy(env, obj, handle);
7249         if (rc != 0)
7250                 GOTO(stop, rc);
7251
7252         rc = dt_trans_start_local(env, dev, handle);
7253         if (rc != 0)
7254                 GOTO(stop, rc);
7255
7256         dt_write_lock(env, obj, 0);
7257         rc = dt_ref_del(env, obj, handle);
7258         if (rc == 0)
7259                 rc = dt_destroy(env, obj, handle);
7260         dt_write_unlock(env, obj);
7261
7262         GOTO(stop, rc);
7263
7264 stop:
7265         dt_trans_stop(env, dev, handle);
7266
7267 log:
7268         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
7269                PFID(lfsck_dto2fid(obj)), rc);
7270
7271         RETURN_EXIT;
7272 }
7273
/* Lookup is not implemented for this index: always returns -EOPNOTSUPP. */
static int lfsck_orphan_index_lookup(const struct lu_env *env,
                                     struct dt_object *dt,
                                     struct dt_rec *rec,
                                     const struct dt_key *key)
{
        return -EOPNOTSUPP;
}
7281
/* Declaring an insert is not implemented: always returns -EOPNOTSUPP. */
static int lfsck_orphan_index_declare_insert(const struct lu_env *env,
                                             struct dt_object *dt,
                                             const struct dt_rec *rec,
                                             const struct dt_key *key,
                                             struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7290
/* Insert is not implemented for this index: always returns -EOPNOTSUPP. */
static int lfsck_orphan_index_insert(const struct lu_env *env,
                                     struct dt_object *dt,
                                     const struct dt_rec *rec,
                                     const struct dt_key *key,
                                     struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7299
/* Declaring a delete is not implemented: always returns -EOPNOTSUPP. */
static int lfsck_orphan_index_declare_delete(const struct lu_env *env,
                                             struct dt_object *dt,
                                             const struct dt_key *key,
                                             struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7307
/* Delete is not implemented for this index: always returns -EOPNOTSUPP. */
static int lfsck_orphan_index_delete(const struct lu_env *env,
                                     struct dt_object *dt,
                                     const struct dt_key *key,
                                     struct thandle *handle)
{
        return -EOPNOTSUPP;
}
7315
/*
 * Set up an orphan iteration.
 *
 * Finds the LFSCK instance of the device behind \a dt and its layout
 * component, refuses to iterate if the layout scan is flagged
 * LF_INCOMPLETE or the rbtree is not valid, and looks up the slave target
 * whose index is passed in \a attr.  The first call made while the device
 * still records accessed FIDs stops that recording and prunes the rbtree.
 * On success llsd_rb_rwsem is held for read and the component reference is
 * kept by the iterator.
 *
 * \param[in] env       execution environment
 * \param[in] dt        the object the iteration is requested on
 * \param[in] attr      used here as the index of the slave target to find
 *
 * \retval              the iterator on success
 * \retval ERR_PTR(-ENXIO)  no LFSCK instance, or no such slave target
 * \retval ERR_PTR(-ENOENT) no layout LFSCK component
 * \retval ERR_PTR(-ESRCH)  incomplete scan or invalid rbtree
 * \retval ERR_PTR(-ENOMEM) out of memory
 */
static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
                                          struct dt_object *dt,
                                          __u32 attr)
{
        struct dt_device                *dev    = lu2dt_dev(dt->do_lu.lo_dev);
        struct lfsck_instance           *lfsck;
        struct lfsck_component          *com    = NULL;
        struct lfsck_layout_slave_data  *llsd;
        struct lfsck_orphan_it          *it     = NULL;
        struct lfsck_layout             *lo;
        int                              rc     = 0;
        ENTRY;

        lfsck = lfsck_instance_find(dev, true, false);
        if (unlikely(lfsck == NULL))
                RETURN(ERR_PTR(-ENXIO));

        com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
        if (unlikely(com == NULL))
                GOTO(out, rc = -ENOENT);

        lo = com->lc_file_ram;
        if (lo->ll_flags & LF_INCOMPLETE)
                GOTO(out, rc = -ESRCH);

        llsd = com->lc_data;
        if (!llsd->llsd_rbtree_valid)
                GOTO(out, rc = -ESRCH);

        OBD_ALLOC_PTR(it);
        if (it == NULL)
                GOTO(out, rc = -ENOMEM);

        it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
        if (it->loi_llst == NULL)
                GOTO(out, rc = -ENXIO);

        if (dev->dd_record_fid_accessed) {
                /* The first iteration against the rbtree, scan the whole rbtree
                 * to remove the nodes which do NOT need to be handled. */
                down_write(&llsd->llsd_rb_rwsem);
                /* Re-check under the write lock: another iterator may have
                 * done the pruning between the test above and here. */
                if (dev->dd_record_fid_accessed) {
                        struct rb_node                  *node;
                        struct rb_node                  *next;
                        struct lfsck_rbtree_node        *lrn;

                        /* No need to record the fid accessing anymore. */
                        dev->dd_record_fid_accessed = 0;

                        node = rb_first(&llsd->llsd_rb_root);
                        while (node != NULL) {
                                /* fetch the successor before a possible
                                 * erase of the current node */
                                next = rb_next(node);
                                lrn = rb_entry(node, struct lfsck_rbtree_node,
                                               lrn_node);
                                if (atomic_read(&lrn->lrn_known_count) <=
                                    atomic_read(&lrn->lrn_accessed_count)) {
                                        rb_erase(node, &llsd->llsd_rb_root);
                                        lfsck_rbtree_free(lrn);
                                }
                                node = next;
                        }
                }
                up_write(&llsd->llsd_rb_rwsem);
        }

        /* read lock the rbtree when init, and unlock when fini */
        down_read(&llsd->llsd_rb_rwsem);
        it->loi_com = com;
        /* The component reference now belongs to the iterator, so it must
         * not be put at "out". */
        com = NULL;

        GOTO(out, rc = 0);

out:
        if (com != NULL)
                lfsck_component_put(env, com);

        CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
               lfsck_lfsck2name(lfsck), rc);

        lfsck_instance_put(env, lfsck);
        if (rc != 0) {
                if (it != NULL)
                        OBD_FREE_PTR(it);

                it = (struct lfsck_orphan_it *)ERR_PTR(rc);
        }

        return (struct dt_it *)it;
}
7405
7406 static void lfsck_orphan_it_fini(const struct lu_env *env,
7407                                  struct dt_it *di)
7408 {
7409         struct lfsck_orphan_it           *it    = (struct lfsck_orphan_it *)di;
7410         struct lfsck_component           *com   = it->loi_com;
7411         struct lfsck_layout_slave_data   *llsd;
7412         struct lfsck_layout_slave_target *llst;
7413
7414         if (com != NULL) {
7415                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7416                        lfsck_lfsck2name(com->lc_lfsck));
7417
7418                 llsd = com->lc_data;
7419                 up_read(&llsd->llsd_rb_rwsem);
7420                 llst = it->loi_llst;
7421                 LASSERT(llst != NULL);
7422
7423                 /* Save the key and hash for iterate next. */
7424                 llst->llst_fid = it->loi_key;
7425                 llst->llst_hash = it->loi_hash;
7426                 lfsck_layout_llst_put(llst);
7427                 lfsck_component_put(env, com);
7428         }
7429         OBD_FREE_PTR(it);
7430 }
7431
7432 /**
7433  * \retval       +1: the iteration finished
7434  * \retval        0: on success, not finished
7435  * \retval      -ve: on error
7436  */
7437 static int lfsck_orphan_it_next(const struct lu_env *env,
7438                                 struct dt_it *di)
7439 {
7440         struct lfsck_thread_info        *info   = lfsck_env_info(env);
7441         struct filter_fid               *ff     = &info->lti_ff;
7442         struct lu_attr                  *la     = &info->lti_la;
7443         struct lfsck_orphan_it          *it     = (struct lfsck_orphan_it *)di;
7444         struct lu_fid                   *key    = &it->loi_key;
7445         struct lu_orphan_rec_v3         *rec    = &it->loi_rec;
7446         struct ost_layout               *ol     = &rec->lor_layout;
7447         struct lfsck_component          *com    = it->loi_com;
7448         struct lfsck_instance           *lfsck  = com->lc_lfsck;
7449         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
7450         struct dt_object                *obj;
7451         struct lfsck_rbtree_node        *lrn;
7452         int                              pos;
7453         int                              rc;
7454         __u32                            save;
7455         __u32                            idx    = it->loi_llst->llst_index;
7456         bool                             exact  = false;
7457         ENTRY;
7458
7459         if (it->loi_over)
7460                 RETURN(1);
7461
7462 again0:
7463         lrn = it->loi_lrn;
7464         if (lrn == NULL) {
7465                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7466                 if (lrn == NULL) {
7467                         it->loi_over = 1;
7468                         RETURN(1);
7469                 }
7470
7471                 it->loi_lrn = lrn;
7472                 if (!exact) {
7473                         key->f_seq = lrn->lrn_seq;
7474                         key->f_oid = lrn->lrn_first_oid;
7475                         key->f_ver = 0;
7476                 }
7477         } else {
7478                 key->f_oid++;
7479                 if (unlikely(key->f_oid == 0)) {
7480                         key->f_seq++;
7481                         it->loi_lrn = NULL;
7482                         goto again0;
7483                 }
7484
7485                 if (key->f_oid >=
7486                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7487                         it->loi_lrn = NULL;
7488                         goto again0;
7489                 }
7490         }
7491
7492         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7493                      atomic_read(&lrn->lrn_accessed_count))) {
7494                 struct rb_node *next = rb_next(&lrn->lrn_node);
7495
7496                 while (next != NULL) {
7497                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7498                                        lrn_node);
7499                         if (atomic_read(&lrn->lrn_known_count) >
7500                             atomic_read(&lrn->lrn_accessed_count))
7501                                 break;
7502                         next = rb_next(next);
7503                 }
7504
7505                 if (next == NULL) {
7506                         it->loi_over = 1;
7507                         RETURN(1);
7508                 }
7509
7510                 it->loi_lrn = lrn;
7511                 key->f_seq = lrn->lrn_seq;
7512                 key->f_oid = lrn->lrn_first_oid;
7513                 key->f_ver = 0;
7514         }
7515
7516         pos = key->f_oid - lrn->lrn_first_oid;
7517
7518 again1:
7519         pos = find_next_bit(lrn->lrn_known_bitmap,
7520                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7521         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7522                 key->f_oid = lrn->lrn_first_oid + pos;
7523                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7524                         key->f_seq++;
7525                         key->f_oid = 0;
7526                 }
7527                 it->loi_lrn = NULL;
7528                 goto again0;
7529         }
7530
7531         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7532                 pos++;
7533                 goto again1;
7534         }
7535
7536         key->f_oid = lrn->lrn_first_oid + pos;
7537         obj = lfsck_object_find_bottom(env, lfsck, key);
7538         if (IS_ERR(obj)) {
7539                 rc = PTR_ERR(obj);
7540                 if (rc == -ENOENT) {
7541                         pos++;
7542                         goto again1;
7543                 }
7544                 RETURN(rc);
7545         }
7546
7547         dt_read_lock(env, obj, 0);
7548         if (dt_object_exists(obj) == 0 ||
7549             lfsck_is_dead_obj(obj)) {
7550                 dt_read_unlock(env, obj);
7551                 lfsck_object_put(env, obj);
7552                 pos++;
7553                 goto again1;
7554         }
7555
7556         rc = dt_attr_get(env, obj, la);
7557         if (rc != 0)
7558                 GOTO(out, rc);
7559
7560         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7561                           XATTR_NAME_FID);
7562         if (rc == -ENODATA) {
7563                 /* For the pre-created OST-object, update the bitmap to avoid
7564                  * others LFSCK (second phase) iteration to touch it again. */
7565                 if (la->la_ctime == 0) {
7566                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7567                                 atomic_inc(&lrn->lrn_accessed_count);
7568
7569                         /* For the race between repairing dangling referenced
7570                          * MDT-object and unlink the file, it may left orphan
7571                          * OST-object there. Destroy it now! */
7572                         if (unlikely(!(la->la_mode & S_ISUID))) {
7573                                 dt_read_unlock(env, obj);
7574                                 lfsck_layout_destroy_orphan(env, lfsck, obj);
7575                                 lfsck_object_put(env, obj);
7576                                 pos++;
7577                                 goto again1;
7578                         }
7579                 } else if (idx == 0) {
7580                         /* If the orphan OST-object has no parent information,
7581                          * regard it as referenced by the MDT-object on MDT0. */
7582                         fid_zero(&rec->lor_rec.lor_fid);
7583                         rec->lor_rec.lor_uid = la->la_uid;
7584                         rec->lor_rec.lor_gid = la->la_gid;
7585                         memset(ol, 0, sizeof(*ol));
7586                         rec->lor_layout_version = 0;
7587                         rec->lor_range = 0;
7588
7589                         GOTO(out, rc = 0);
7590                 }
7591
7592                 dt_read_unlock(env, obj);
7593                 lfsck_object_put(env, obj);
7594                 pos++;
7595                 goto again1;
7596         }
7597
7598         if (rc < sizeof(struct lu_fid))
7599                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7600
7601         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7602         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7603          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7604          * parent MDT-object's layout EA. */
7605         save = rec->lor_rec.lor_fid.f_stripe_idx;
7606         rec->lor_rec.lor_fid.f_ver = 0;
7607         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7608         /* If the orphan OST-object does not claim the MDT, then next.
7609          *
7610          * If we do not know whether it matches or not, then return it
7611          * to the MDT for further check. */
7612         if (rc == 0) {
7613                 dt_read_unlock(env, obj);
7614                 lfsck_object_put(env, obj);
7615                 pos++;
7616                 goto again1;
7617         }
7618
7619         rec->lor_rec.lor_fid.f_stripe_idx = save;
7620         rec->lor_rec.lor_uid = la->la_uid;
7621         rec->lor_rec.lor_gid = la->la_gid;
7622         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7623         rec->lor_layout_version =
7624                 le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC);
7625         rec->lor_range = le32_to_cpu(ff->ff_range);
7626
7627         CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, "
7628                "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, "
7629                "COMP end %llu, layout version %u, range %u\n",
7630                lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7631                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7632                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7633                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
7634                rec->lor_layout_version, rec->lor_range);
7635
7636         GOTO(out, rc = 0);
7637
7638 out:
7639         dt_read_unlock(env, obj);
7640         lfsck_object_put(env, obj);
7641         if (rc == 0)
7642                 it->loi_hash++;
7643
7644         return rc;
7645 }
7646
7647 /**
7648  * \retval       +1: locate to the exactly position
7649  * \retval        0: cannot locate to the exactly position,
7650  *                   call next() to move to a valid position.
7651  * \retval      -ve: on error
7652  */
7653 static int lfsck_orphan_it_get(const struct lu_env *env,
7654                                struct dt_it *di,
7655                                const struct dt_key *key)
7656 {
7657         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7658         int                      rc;
7659
7660         it->loi_key = *(struct lu_fid *)key;
7661         rc = lfsck_orphan_it_next(env, di);
7662         if (rc == 1)
7663                 return 0;
7664
7665         if (rc == 0)
7666                 return 1;
7667
7668         return rc;
7669 }
7670
/**
 * The "put" iterator callback: nothing to do for the orphan iterator.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the iterator (unused)
 */
static void lfsck_orphan_it_put(const struct lu_env *env, struct dt_it *di)
{
}
7675
7676 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7677                                           const struct dt_it *di)
7678 {
7679         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7680
7681         return (struct dt_key *)&it->loi_key;
7682 }
7683
7684 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7685                                     const struct dt_it *di)
7686 {
7687         return sizeof(struct lu_fid);
7688 }
7689
7690 static int lfsck_orphan_it_rec(const struct lu_env *env,
7691                                const struct dt_it *di,
7692                                struct dt_rec *rec,
7693                                __u32 attr)
7694 {
7695         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7696
7697         *(struct lu_orphan_rec_v3 *)rec = it->loi_rec;
7698
7699         return 0;
7700 }
7701
7702 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7703                                    const struct dt_it *di)
7704 {
7705         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7706
7707         return it->loi_hash;
7708 }
7709
7710 /**
7711  * \retval       +1: locate to the exactly position
7712  * \retval        0: cannot locate to the exactly position,
7713  *                   call next() to move to a valid position.
7714  * \retval      -ve: on error
7715  */
7716 static int lfsck_orphan_it_load(const struct lu_env *env,
7717                                 const struct dt_it *di,
7718                                 __u64 hash)
7719 {
7720         struct lfsck_orphan_it           *it   = (struct lfsck_orphan_it *)di;
7721         struct lfsck_layout_slave_target *llst = it->loi_llst;
7722         int                               rc;
7723
7724         LASSERT(llst != NULL);
7725
7726         if (hash != llst->llst_hash) {
7727                 CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan "
7728                        "iteration does not match the one when fini "
7729                        "%llu, to be reset.\n",
7730                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7731                        llst->llst_hash);
7732                 fid_zero(&llst->llst_fid);
7733                 llst->llst_hash = 0;
7734         }
7735
7736         it->loi_key = llst->llst_fid;
7737         it->loi_hash = llst->llst_hash;
7738         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7739         if (rc == 1)
7740                 return 0;
7741
7742         if (rc == 0)
7743                 return 1;
7744
7745         return rc;
7746 }
7747
/**
 * The "key_rec" iterator callback: does nothing for the orphan iterator.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the iterator (unused)
 * \param[in] key_rec   unused, left untouched
 *
 * \retval      0 always
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
                                   const struct dt_it *di,
                                   void *key_rec)
{
        return 0;
}
7754
7755 static const struct dt_index_operations lfsck_orphan_index_ops = {
7756         .dio_lookup             = lfsck_orphan_index_lookup,
7757         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7758         .dio_insert             = lfsck_orphan_index_insert,
7759         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7760         .dio_delete             = lfsck_orphan_index_delete,
7761         .dio_it = {
7762                 .init           = lfsck_orphan_it_init,
7763                 .fini           = lfsck_orphan_it_fini,
7764                 .get            = lfsck_orphan_it_get,
7765                 .put            = lfsck_orphan_it_put,
7766                 .next           = lfsck_orphan_it_next,
7767                 .key            = lfsck_orphan_it_key,
7768                 .key_size       = lfsck_orphan_it_key_size,
7769                 .rec            = lfsck_orphan_it_rec,
7770                 .store          = lfsck_orphan_it_store,
7771                 .load           = lfsck_orphan_it_load,
7772                 .key_rec        = lfsck_orphan_it_key_rec,
7773         }
7774 };