Whamcloud - gitweb
LU-10467 lustre: use wait_event_idle() where appropriate.
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2017, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lu_object.h>
40 #include <dt_object.h>
41 #include <lustre_fid.h>
42 #include <lustre_lib.h>
43 #include <lustre_net.h>
44 #include <md_object.h>
45 #include <obd_class.h>
46
47 #include "lfsck_internal.h"
48
/* Magic values of the layout LFSCK, one per format revision (V1..V4). */
#define LFSCK_LAYOUT_MAGIC_V1   0xB173AE14
#define LFSCK_LAYOUT_MAGIC_V2   0xB1734D76
#define LFSCK_LAYOUT_MAGIC_V3   0xB17371B9
#define LFSCK_LAYOUT_MAGIC_V4   0xB1732FED

/* The magic used by this file; presently an alias of the V4 value. */
#define LFSCK_LAYOUT_MAGIC      LFSCK_LAYOUT_MAGIC_V4
55
56 struct lfsck_layout_seq {
57         struct list_head         lls_list;
58         __u64                    lls_seq;
59         __u64                    lls_lastid;
60         __u64                    lls_lastid_known;
61         struct dt_object        *lls_lastid_obj;
62         unsigned int             lls_dirty:1;
63 };
64
65 struct lfsck_layout_slave_target {
66         /* link into lfsck_layout_slave_data::llsd_master_list. */
67         struct list_head        llst_list;
68         /* The position for next record in the rbtree for iteration. */
69         struct lu_fid           llst_fid;
70         /* Dummy hash for iteration against the rbtree. */
71         __u64                   llst_hash;
72         __u64                   llst_gen;
73         atomic_t                llst_ref;
74         __u32                   llst_index;
75         /* How many times we have failed to get the master status. */
76         int                     llst_failures;
77 };
78
79 struct lfsck_layout_slave_data {
80         /* list for lfsck_layout_seq */
81         struct list_head         llsd_seq_list;
82
83         /* list for the masters involve layout verification. */
84         struct list_head         llsd_master_list;
85         spinlock_t               llsd_lock;
86         __u64                    llsd_touch_gen;
87         struct dt_object        *llsd_rb_obj;
88         struct rb_root           llsd_rb_root;
89         struct rw_semaphore      llsd_rb_rwsem;
90         unsigned int             llsd_rbtree_valid:1;
91 };
92
/*
 * Bundle of pointers (export, component, slave target) handed around as the
 * argument of an asynchronous request issued by the layout LFSCK slave.
 * NOTE(review): the users of this structure are outside the visible code.
 */
struct lfsck_layout_slave_async_args {
        struct obd_export *llsaa_exp;
        struct lfsck_component *llsaa_com;
        struct lfsck_layout_slave_target *llsaa_llst;
};
98
99 static inline bool lfsck_comp_extent_aligned(__u64 size)
100 {
101          return (size & (LOV_MIN_STRIPE_SIZE - 1)) == 0;
102 }
103
104 static inline void
105 lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst)
106 {
107         if (atomic_dec_and_test(&llst->llst_ref)) {
108                 LASSERT(list_empty(&llst->llst_list));
109
110                 OBD_FREE_PTR(llst);
111         }
112 }
113
114 static inline int
115 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
116 {
117         struct lfsck_layout_slave_target *llst;
118         struct lfsck_layout_slave_target *tmp;
119         int                               rc   = 0;
120
121         OBD_ALLOC_PTR(llst);
122         if (llst == NULL)
123                 return -ENOMEM;
124
125         INIT_LIST_HEAD(&llst->llst_list);
126         llst->llst_gen = 0;
127         llst->llst_index = index;
128         atomic_set(&llst->llst_ref, 1);
129
130         spin_lock(&llsd->llsd_lock);
131         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
132                 if (tmp->llst_index == index) {
133                         rc = -EALREADY;
134                         break;
135                 }
136         }
137         if (rc == 0)
138                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
139         spin_unlock(&llsd->llsd_lock);
140
141         if (rc != 0)
142                 OBD_FREE_PTR(llst);
143
144         return rc;
145 }
146
147 static inline void
148 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
149                       struct lfsck_layout_slave_target *llst)
150 {
151         bool del = false;
152
153         spin_lock(&llsd->llsd_lock);
154         if (!list_empty(&llst->llst_list)) {
155                 list_del_init(&llst->llst_list);
156                 del = true;
157         }
158         spin_unlock(&llsd->llsd_lock);
159
160         if (del)
161                 lfsck_layout_llst_put(llst);
162 }
163
164 static inline struct lfsck_layout_slave_target *
165 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
166                                __u32 index, bool unlink)
167 {
168         struct lfsck_layout_slave_target *llst;
169
170         spin_lock(&llsd->llsd_lock);
171         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
172                 if (llst->llst_index == index) {
173                         if (unlink)
174                                 list_del_init(&llst->llst_list);
175                         else
176                                 atomic_inc(&llst->llst_ref);
177                         spin_unlock(&llsd->llsd_lock);
178
179                         return llst;
180                 }
181         }
182         spin_unlock(&llsd->llsd_lock);
183
184         return NULL;
185 }
186
187 static struct lfsck_layout_req *
188 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
189                                 struct dt_object *child, __u32 comp_id,
190                                 __u32 ost_idx, __u32 lov_idx)
191 {
192         struct lfsck_layout_req *llr;
193
194         OBD_ALLOC_PTR(llr);
195         if (llr == NULL)
196                 return ERR_PTR(-ENOMEM);
197
198         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
199         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
200         llr->llr_child = child;
201         llr->llr_comp_id = comp_id;
202         llr->llr_ost_idx = ost_idx;
203         llr->llr_lov_idx = lov_idx;
204
205         return llr;
206 }
207
208 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
209                                             struct lfsck_assistant_req *lar)
210 {
211         struct lfsck_layout_req *llr =
212                         container_of0(lar, struct lfsck_layout_req, llr_lar);
213
214         lfsck_object_put(env, llr->llr_child);
215         lfsck_assistant_object_put(env, lar->lar_parent);
216         OBD_FREE_PTR(llr);
217 }
218
219 static int
220 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
221                                                struct ptlrpc_request *req,
222                                                void *args, int rc)
223 {
224         if (rc == 0) {
225                 struct lfsck_async_interpret_args *laia = args;
226                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
227
228                 ltd->ltd_synced_failures = 1;
229                 atomic_dec(laia->laia_count);
230         }
231
232         return 0;
233 }
234
235 /**
236  * Notify remote LFSCK instances about former failures.
237  *
238  * The local LFSCK instance has recorded which OSTs have ever failed to respond
239  * some LFSCK verification requests (maybe because of network issues or the OST
240  * itself trouble). During the respond gap, the OST may missed some OST-objects
241  * verification, then the OST cannot know whether related OST-objects have been
242  * referenced by related MDT-objects or not, then in the second-stage scanning,
243  * these OST-objects will be regarded as orphan, if the OST-object contains bad
244  * parent FID for back reference, then it will misguide the LFSCK to make wrong
245  * fixing for the fake orphan.
246  *
247  * To avoid above trouble, when layout LFSCK finishes the first-stage scanning,
248  * it will scan the bitmap for the ever failed OSTs, and notify them that they
249  * have ever missed some OST-object verification and should skip the handling
250  * for orphan OST-objects on all MDTs that are in the layout LFSCK.
251  *
252  * \param[in] env       pointer to the thread context
253  * \param[in] com       pointer to the lfsck component
254  * \param[in] lr        pointer to the lfsck request
255  */
256 static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
257                                                  struct lfsck_component *com,
258                                                  struct lfsck_request *lr)
259 {
260         struct lfsck_async_interpret_args *laia  =
261                                 &lfsck_env_info(env)->lti_laia2;
262         struct lfsck_assistant_data       *lad   = com->lc_data;
263         struct lfsck_layout               *lo    = com->lc_file_ram;
264         struct lfsck_instance             *lfsck = com->lc_lfsck;
265         struct lfsck_tgt_descs            *ltds  = &lfsck->li_ost_descs;
266         struct lfsck_tgt_desc             *ltd;
267         struct ptlrpc_request_set         *set;
268         atomic_t                           count;
269         __u32                              idx;
270         int                                rc    = 0;
271         ENTRY;
272
273         if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
274                 RETURN_EXIT;
275
276         /* If the MDT has ever failed to verfiy some OST-objects,
277          * then sync failures with them firstly. */
278         lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;
279
280         atomic_set(&count, 0);
281         memset(laia, 0, sizeof(*laia));
282         laia->laia_count = &count;
283         set = ptlrpc_prep_set();
284         if (set == NULL)
285                 GOTO(out, rc = -ENOMEM);
286
287         down_read(&ltds->ltd_rw_sem);
288         cfs_foreach_bit(lad->lad_bitmap, idx) {
289                 ltd = lfsck_ltd2tgt(ltds, idx);
290                 if (unlikely(!ltd))
291                         continue;
292
293                 laia->laia_ltd = ltd;
294                 rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
295                                 lfsck_layout_assistant_sync_failures_interpret,
296                                 laia, LFSCK_NOTIFY);
297                 if (rc != 0) {
298                         CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to "
299                                "notify target %x for %s phase1 done: "
300                                "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
301                                ltd->ltd_index, lad->lad_name, rc);
302
303                         break;
304                 }
305
306                 atomic_inc(&count);
307         }
308         up_read(&ltds->ltd_rw_sem);
309
310         if (rc == 0 && atomic_read(&count) > 0)
311                 rc = ptlrpc_set_wait(env, set);
312
313         ptlrpc_set_destroy(set);
314
315         if (rc == 0 && atomic_read(&count) > 0)
316                 rc = -EINVAL;
317
318         GOTO(out, rc);
319
320 out:
321         if (rc != 0)
322                 /* If failed to sync failures with the OSTs, then have to
323                  * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
324                  * subsequent orphan OST-object handling. */
325                 lo->ll_flags |= LF_INCOMPLETE;
326
327         lr->lr_flags2 = lo->ll_flags;
328 }
329
330 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
331                                            struct lov_mds_md_v1 *lmm,
332                                            __u64 start, __u32 comp_id)
333 {
334         __u32 magic;
335         __u32 pattern;
336
337         magic = le32_to_cpu(lmm->lmm_magic);
338         /* If magic crashed, keep it there. Sometime later, during OST-object
339          * orphan handling, if some OST-object(s) back-point to it, it can be
340          * verified and repaired. */
341         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
342                 int rc;
343
344                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
345                         rc = -EOPNOTSUPP;
346                 else
347                         rc = -EINVAL;
348
349                 CDEBUG(D_LFSCK, "%s LOV EA magic %u for the file "DFID"\n",
350                        rc == -EINVAL ? "Unknown" : "Unsupported",
351                        magic, PFID(lfsck_dto2fid(obj)));
352
353                 return rc;
354         }
355
356         pattern = le32_to_cpu(lmm->lmm_pattern);
357
358 #if 0
359         /* XXX: DoM file verification will be supportted via LU-11081. */
360         if (lov_pattern(pattern) == LOV_PATTERN_MDT) {
361                 if (start != 0) {
362                         CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not "
363                                "the first component in the mirror %x/%llu\n",
364                                PFID(lfsck_dto2fid(obj)), comp_id, start);
365
366                         return -EINVAL;
367                 }
368         }
369 #endif
370
371         if (!lov_pattern_supported_normal_comp(lov_pattern(pattern))) {
372                 CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file "
373                        DFID" in the component %x\n",
374                        pattern, PFID(lfsck_dto2fid(obj)), comp_id);
375
376                 return -EOPNOTSUPP;
377         }
378
379         return 0;
380 }
381
382 static int lfsck_layout_verify_header_foreign(struct dt_object *obj,
383                                               struct lov_foreign_md *lfm,
384                                               size_t len)
385 {
386         /* magic has been verified already */
387         __u32 value_len = le32_to_cpu(lfm->lfm_length);
388         /* type and flags are not checked for instance */
389
390         CDEBUG(D_INFO, "foreign LOV EA, magic %x, len %u, type %x, flags %x, for file "DFID"\n",
391                le32_to_cpu(lfm->lfm_magic), value_len,
392                le32_to_cpu(lfm->lfm_type), le32_to_cpu(lfm->lfm_flags),
393                PFID(lfsck_dto2fid(obj)));
394
395         if (len != value_len + offsetof(typeof(*lfm), lfm_value))
396                 CDEBUG(D_LFSCK, "foreign LOV EA internal size %u does not match EA full size %zu for file "DFID"\n",
397                        value_len, len, PFID(lfsck_dto2fid(obj)));
398
399         /* nothing to repair */
400         return -ENODATA;
401 }
402
403 static int lfsck_layout_verify_header(struct dt_object *obj,
404                                       struct lov_mds_md_v1 *lmm, size_t len)
405 {
406         int rc = 0;
407
408         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1) {
409                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
410                 int i;
411                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
412
413                 if (unlikely(count == 0)) {
414                         CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid "
415                                "components count 0\n",
416                                PFID(lfsck_dto2fid(obj)));
417
418                         return -EINVAL;
419                 }
420
421                 for (i = 0; i < count && !rc; i++) {
422                         struct lov_comp_md_entry_v1 *lcme =
423                                                 &lcm->lcm_entries[i];
424                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
425                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
426                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
427
428                         if (unlikely(comp_id == LCME_ID_INVAL ||
429                                      comp_id > LCME_ID_MAX)) {
430                                 CDEBUG(D_LFSCK, "found invalid FPL ID %u "
431                                        "for the file "DFID" at idx %d\n",
432                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
433
434                                 return -EINVAL;
435                         }
436
437                         if (unlikely(start >= end ||
438                                      !lfsck_comp_extent_aligned(start) ||
439                                      (!lfsck_comp_extent_aligned(end) &&
440                                       end != LUSTRE_EOF))) {
441                                 CDEBUG(D_LFSCK, "found invalid FPL extent "
442                                        "range [%llu - %llu) for the file "
443                                        DFID" at idx %d\n",
444                                        start, end, PFID(lfsck_dto2fid(obj)), i);
445
446                                 return -EINVAL;
447                         }
448
449                         rc = lfsck_layout_verify_header_v1v3(obj,
450                                         (struct lov_mds_md_v1 *)((char *)lmm +
451                                         le32_to_cpu(lcme->lcme_offset)), start,
452                                         comp_id);
453                 }
454         } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_FOREIGN) {
455                 rc = lfsck_layout_verify_header_foreign(obj,
456                                                 (struct lov_foreign_md *)lmm,
457                                                 len);
458         } else {
459                 rc = lfsck_layout_verify_header_v1v3(obj, lmm, 1, 0);
460         }
461
462         return rc;
463 }
464
465 static int lfsck_layout_get_lovea(const struct lu_env *env,
466                                   struct dt_object *obj, struct lu_buf *buf)
467 {
468         int rc;
469         int rc1;
470
471 again:
472         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
473         if (rc == -ERANGE) {
474                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
475                 if (rc <= 0)
476                         return !rc ? -ENODATA : rc;
477
478                 lu_buf_realloc(buf, rc);
479                 if (buf->lb_buf == NULL)
480                         return -ENOMEM;
481
482                 goto again;
483         }
484
485         if (rc <= 0)
486                 return !rc ? -ENODATA : rc;
487
488         if (unlikely(buf->lb_buf == NULL)) {
489                 lu_buf_alloc(buf, rc);
490                 if (buf->lb_buf == NULL)
491                         return -ENOMEM;
492
493                 goto again;
494         }
495
496         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf, rc);
497
498         return rc1 ? rc1 : rc;
499 }
500
/* Size in bytes of each bitmap hanging off a struct lfsck_rbtree_node. */
#define LFSCK_RBTREE_BITMAP_SIZE        PAGE_SIZE
/* Number of bits in such a bitmap (eight per byte). */
#define LFSCK_RBTREE_BITMAP_WIDTH       (LFSCK_RBTREE_BITMAP_SIZE * 8)
/* Mask that extracts the bit index inside a bitmap from an OID. */
#define LFSCK_RBTREE_BITMAP_MASK        (LFSCK_RBTREE_BITMAP_WIDTH - 1)
504
505 struct lfsck_rbtree_node {
506         struct rb_node   lrn_node;
507         __u64            lrn_seq;
508         __u32            lrn_first_oid;
509         atomic_t         lrn_known_count;
510         atomic_t         lrn_accessed_count;
511         void            *lrn_known_bitmap;
512         void            *lrn_accessed_bitmap;
513 };
514
515 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
516                                    __u64 seq, __u32 oid)
517 {
518         if (seq < lrn->lrn_seq)
519                 return -1;
520
521         if (seq > lrn->lrn_seq)
522                 return 1;
523
524         if (oid < lrn->lrn_first_oid)
525                 return -1;
526
527         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
528                 return 1;
529
530         return 0;
531 }
532
533 /* The caller should hold llsd->llsd_rb_lock. */
534 static struct lfsck_rbtree_node *
535 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
536                     const struct lu_fid *fid, bool *exact)
537 {
538         struct rb_node           *node  = llsd->llsd_rb_root.rb_node;
539         struct rb_node           *prev  = NULL;
540         struct lfsck_rbtree_node *lrn   = NULL;
541         int                       rc    = 0;
542
543         if (exact != NULL)
544                 *exact = true;
545
546         while (node != NULL) {
547                 prev = node;
548                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
549                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
550                 if (rc < 0)
551                         node = node->rb_left;
552                 else if (rc > 0)
553                         node = node->rb_right;
554                 else
555                         return lrn;
556         }
557
558         if (exact == NULL)
559                 return NULL;
560
561         /* If there is no exactly matched one, then to the next valid one. */
562         *exact = false;
563
564         /* The rbtree is empty. */
565         if (rc == 0)
566                 return NULL;
567
568         if (rc < 0)
569                 return lrn;
570
571         node = rb_next(prev);
572
573         /* The end of the rbtree. */
574         if (node == NULL)
575                 return NULL;
576
577         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
578
579         return lrn;
580 }
581
582 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
583                                                   const struct lu_fid *fid)
584 {
585         struct lfsck_rbtree_node *lrn;
586
587         OBD_ALLOC_PTR(lrn);
588         if (lrn == NULL)
589                 return ERR_PTR(-ENOMEM);
590
591         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
592         if (lrn->lrn_known_bitmap == NULL) {
593                 OBD_FREE_PTR(lrn);
594
595                 return ERR_PTR(-ENOMEM);
596         }
597
598         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
599         if (lrn->lrn_accessed_bitmap == NULL) {
600                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
601                 OBD_FREE_PTR(lrn);
602
603                 return ERR_PTR(-ENOMEM);
604         }
605
606         RB_CLEAR_NODE(&lrn->lrn_node);
607         lrn->lrn_seq = fid_seq(fid);
608         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
609         atomic_set(&lrn->lrn_known_count, 0);
610         atomic_set(&lrn->lrn_accessed_count, 0);
611
612         return lrn;
613 }
614
615 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
616 {
617         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
618         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
619         OBD_FREE_PTR(lrn);
620 }
621
622 /* The caller should hold lock. */
623 static struct lfsck_rbtree_node *
624 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
625                     struct lfsck_rbtree_node *lrn)
626 {
627         struct rb_node           **pos    = &llsd->llsd_rb_root.rb_node;
628         struct rb_node            *parent = NULL;
629         struct lfsck_rbtree_node  *tmp;
630         int                        rc;
631
632         while (*pos != NULL) {
633                 parent = *pos;
634                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
635                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
636                 if (rc < 0)
637                         pos = &(*pos)->rb_left;
638                 else if (rc > 0)
639                         pos = &(*pos)->rb_right;
640                 else
641                         return tmp;
642         }
643
644         rb_link_node(&lrn->lrn_node, parent, pos);
645         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
646
647         return lrn;
648 }
649
650 extern const struct dt_index_operations lfsck_orphan_index_ops;
651
652 static int lfsck_rbtree_setup(const struct lu_env *env,
653                               struct lfsck_component *com)
654 {
655         struct lu_fid                   *fid    = &lfsck_env_info(env)->lti_fid;
656         struct lfsck_instance           *lfsck  = com->lc_lfsck;
657         struct dt_device                *dev    = lfsck->li_bottom;
658         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
659         struct dt_object                *obj;
660
661         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
662         fid->f_oid = lfsck_dev_idx(lfsck);
663         fid->f_ver = 0;
664         obj = dt_locate(env, dev, fid);
665         if (IS_ERR(obj))
666                 RETURN(PTR_ERR(obj));
667
668         /* Generate an in-RAM object to stand for the layout rbtree.
669          * Scanning the layout rbtree will be via the iteration over
670          * the object. In the future, the rbtree may be written onto
671          * disk with the object.
672          *
673          * Mark the object to be as exist. */
674         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
675         obj->do_index_ops = &lfsck_orphan_index_ops;
676         llsd->llsd_rb_obj = obj;
677         llsd->llsd_rbtree_valid = 1;
678         dev->dd_record_fid_accessed = 1;
679
680         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
681                lfsck_lfsck2name(lfsck));
682
683         return 0;
684 }
685
686 static void lfsck_rbtree_cleanup(const struct lu_env *env,
687                                  struct lfsck_component *com)
688 {
689         struct lfsck_instance           *lfsck = com->lc_lfsck;
690         struct lfsck_layout_slave_data  *llsd  = com->lc_data;
691         struct rb_node                  *node  = rb_first(&llsd->llsd_rb_root);
692         struct rb_node                  *next;
693         struct lfsck_rbtree_node        *lrn;
694
695         lfsck->li_bottom->dd_record_fid_accessed = 0;
696         /* Invalid the rbtree, then no others will use it. */
697         down_write(&llsd->llsd_rb_rwsem);
698         llsd->llsd_rbtree_valid = 0;
699         up_write(&llsd->llsd_rb_rwsem);
700
701         while (node != NULL) {
702                 next = rb_next(node);
703                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
704                 rb_erase(node, &llsd->llsd_rb_root);
705                 lfsck_rbtree_free(lrn);
706                 node = next;
707         }
708
709         if (llsd->llsd_rb_obj != NULL) {
710                 lfsck_object_put(env, llsd->llsd_rb_obj);
711                 llsd->llsd_rb_obj = NULL;
712         }
713
714         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
715                lfsck_lfsck2name(lfsck));
716 }
717
718 static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
719                                        struct lfsck_component *com,
720                                        const struct lu_fid *fid,
721                                        bool accessed)
722 {
723         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
724         struct lfsck_rbtree_node        *lrn;
725         bool                             insert = false;
726         int                              idx;
727         int                              rc     = 0;
728         ENTRY;
729
730         if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
731                 RETURN_EXIT;
732
733         if (!fid_is_idif(fid) && !fid_is_norm(fid))
734                 RETURN_EXIT;
735
736         down_read(&llsd->llsd_rb_rwsem);
737         if (!llsd->llsd_rbtree_valid)
738                 GOTO(unlock, rc = 0);
739
740         lrn = lfsck_rbtree_search(llsd, fid, NULL);
741         if (lrn == NULL) {
742                 struct lfsck_rbtree_node *tmp;
743
744                 LASSERT(!insert);
745
746                 up_read(&llsd->llsd_rb_rwsem);
747                 tmp = lfsck_rbtree_new(env, fid);
748                 if (IS_ERR(tmp))
749                         GOTO(out, rc = PTR_ERR(tmp));
750
751                 insert = true;
752                 down_write(&llsd->llsd_rb_rwsem);
753                 if (!llsd->llsd_rbtree_valid) {
754                         lfsck_rbtree_free(tmp);
755                         GOTO(unlock, rc = 0);
756                 }
757
758                 lrn = lfsck_rbtree_insert(llsd, tmp);
759                 if (lrn != tmp)
760                         lfsck_rbtree_free(tmp);
761         }
762
763         idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
764         /* Any accessed object must be a known object. */
765         if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
766                 atomic_inc(&lrn->lrn_known_count);
767         if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
768                 atomic_inc(&lrn->lrn_accessed_count);
769
770         GOTO(unlock, rc = 0);
771
772 unlock:
773         if (insert)
774                 up_write(&llsd->llsd_rb_rwsem);
775         else
776                 up_read(&llsd->llsd_rb_rwsem);
777 out:
778         if (rc != 0 && accessed) {
779                 struct lfsck_layout *lo = com->lc_file_ram;
780
781                 CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing "
782                        "bitmap, and will cause incorrect LFSCK OST-object "
783                        "handling, so disable it to cancel orphan handling "
784                        "for related device. rc = %d\n",
785                        lfsck_lfsck2name(com->lc_lfsck), rc);
786
787                 lo->ll_flags |= LF_INCOMPLETE;
788                 lfsck_rbtree_cleanup(env, com);
789         }
790 }
791
792 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
793                                   const struct lfsck_layout_dangling_key *src)
794 {
795         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
796         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
797         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
798 }
799
800 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
801                                   const struct lfsck_layout_dangling_key *src)
802 {
803         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
804         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
805         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
806 }
807
808 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
809                                   const struct lfsck_layout_dangling_key *src)
810 {
811         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
812         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
813         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
814 }
815
816 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
817                                   const struct lfsck_layout_dangling_key *src)
818 {
819         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
820         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
821         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
822 }
823
824 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
825                                    const struct lfsck_layout *src)
826 {
827         int i;
828
829         des->ll_magic = le32_to_cpu(src->ll_magic);
830         des->ll_status = le32_to_cpu(src->ll_status);
831         des->ll_flags = le32_to_cpu(src->ll_flags);
832         des->ll_success_count = le32_to_cpu(src->ll_success_count);
833         des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1);
834         des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2);
835         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
836         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
837         des->ll_time_last_checkpoint =
838                                 le64_to_cpu(src->ll_time_last_checkpoint);
839         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
840         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
841         des->ll_pos_first_inconsistent =
842                         le64_to_cpu(src->ll_pos_first_inconsistent);
843         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
844         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
845         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
846         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
847         for (i = 0; i < LLIT_MAX; i++)
848                 des->ll_objs_repaired[i] =
849                                 le64_to_cpu(src->ll_objs_repaired[i]);
850         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
851         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
852         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
853                        &src->ll_lldk_latest_scanned_phase2);
854 }
855
856 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
857                                    const struct lfsck_layout *src)
858 {
859         int i;
860
861         des->ll_magic = cpu_to_le32(src->ll_magic);
862         des->ll_status = cpu_to_le32(src->ll_status);
863         des->ll_flags = cpu_to_le32(src->ll_flags);
864         des->ll_success_count = cpu_to_le32(src->ll_success_count);
865         des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1);
866         des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2);
867         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
868         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
869         des->ll_time_last_checkpoint =
870                                 cpu_to_le64(src->ll_time_last_checkpoint);
871         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
872         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
873         des->ll_pos_first_inconsistent =
874                         cpu_to_le64(src->ll_pos_first_inconsistent);
875         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
876         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
877         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
878         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
879         for (i = 0; i < LLIT_MAX; i++)
880                 des->ll_objs_repaired[i] =
881                                 cpu_to_le64(src->ll_objs_repaired[i]);
882         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
883         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
884         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
885                        &src->ll_lldk_latest_scanned_phase2);
886 }
887
888 /**
889  * Load the OST bitmap from the lfsck_layout trace file.
890  *
891  * \param[in] env       pointer to the thread context
892  * \param[in] com       pointer to the lfsck component
893  *
894  * \retval              0 for success
895  * \retval              negative error number on failure or data corruption
896  */
897 static int lfsck_layout_load_bitmap(const struct lu_env *env,
898                                     struct lfsck_component *com)
899 {
900         struct dt_object                *obj    = com->lc_obj;
901         struct lfsck_assistant_data     *lad    = com->lc_data;
902         struct lfsck_layout             *lo     = com->lc_file_ram;
903         struct cfs_bitmap                       *bitmap = lad->lad_bitmap;
904         loff_t                           pos    = com->lc_file_size;
905         ssize_t                          size;
906         __u32                            nbits;
907         int                              rc;
908         ENTRY;
909
910         if (com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size >
911             lo->ll_bitmap_size)
912                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size;
913         else
914                 nbits = lo->ll_bitmap_size;
915
916         if (unlikely(nbits < BITS_PER_LONG))
917                 nbits = BITS_PER_LONG;
918
919         if (nbits > bitmap->size) {
920                 __u32 new_bits = bitmap->size;
921                 struct cfs_bitmap *new_bitmap;
922
923                 while (new_bits < nbits)
924                         new_bits <<= 1;
925
926                 new_bitmap = CFS_ALLOCATE_BITMAP(new_bits);
927                 if (new_bitmap == NULL)
928                         RETURN(-ENOMEM);
929
930                 lad->lad_bitmap = new_bitmap;
931                 CFS_FREE_BITMAP(bitmap);
932                 bitmap = new_bitmap;
933         }
934
935         if (lo->ll_bitmap_size == 0) {
936                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
937                 CFS_RESET_BITMAP(bitmap);
938
939                 RETURN(0);
940         }
941
942         size = (lo->ll_bitmap_size + 7) >> 3;
943         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap->data, size), &pos);
944         if (rc != size)
945                 RETURN(rc >= 0 ? -EINVAL : rc);
946
947         if (cfs_bitmap_check_empty(bitmap))
948                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
949         else
950                 set_bit(LAD_INCOMPLETE, &lad->lad_flags);
951
952         RETURN(0);
953 }
954
955 /**
956  * Load the layout LFSCK trace file from disk.
957  *
958  * The layout LFSCK trace file records the layout LFSCK status information
959  * and other statistics, such as how many objects have been scanned, and how
960  * many objects have been repaired, and etc. It also contains the bitmap for
961  * failed OSTs during the layout LFSCK. All these information will be loaded
962  * from disk to RAM when the layout LFSCK component setup.
963  *
964  * \param[in] env       pointer to the thread context
965  * \param[in] com       pointer to the lfsck component
966  *
967  * \retval              positive number for file data corruption, the caller
968  *                      should reset the layout LFSCK trace file
969  * \retval              0 for success
970  * \retval              negative error number on failure
971  */
972 static int lfsck_layout_load(const struct lu_env *env,
973                              struct lfsck_component *com)
974 {
975         struct lfsck_layout             *lo     = com->lc_file_ram;
976         ssize_t                          size   = com->lc_file_size;
977         loff_t                           pos    = 0;
978         int                              rc;
979
980         rc = dt_read(env, com->lc_obj,
981                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
982         if (rc == 0) {
983                 return -ENOENT;
984         } else if (rc < 0) {
985                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
986                        lfsck_lfsck2name(com->lc_lfsck), rc);
987                 return rc;
988         } else if (rc != size) {
989                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
990                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
991                 return 1;
992         }
993
994         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
995         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
996                 CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, "
997                        "to be reset\n", lfsck_lfsck2name(com->lc_lfsck),
998                        lo->ll_magic, LFSCK_LAYOUT_MAGIC);
999                 return 1;
1000         }
1001
1002         return 0;
1003 }
1004
1005 /**
1006  * Store the layout LFSCK trace file on disk.
1007  *
1008  * The layout LFSCK trace file records the layout LFSCK status information
1009  * and other statistics, such as how many objects have been scanned, and how
1010  * many objects have been repaired, and etc. It also contains the bitmap for
1011  * failed OSTs during the layout LFSCK. All these information will be synced
1012  * from RAM to disk periodically.
1013  *
1014  * \param[in] env       pointer to the thread context
1015  * \param[in] com       pointer to the lfsck component
1016  *
1017  * \retval              0 for success
1018  * \retval              negative error number on failure
1019  */
1020 static int lfsck_layout_store(const struct lu_env *env,
1021                               struct lfsck_component *com)
1022 {
1023         struct dt_object        *obj    = com->lc_obj;
1024         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1025         struct lfsck_layout     *lo_ram = com->lc_file_ram;
1026         struct lfsck_layout     *lo     = com->lc_file_disk;
1027         struct thandle          *th;
1028         struct dt_device        *dev    = lfsck_obj2dev(obj);
1029         struct cfs_bitmap       *bitmap = NULL;
1030         loff_t                   pos;
1031         ssize_t                  size   = com->lc_file_size;
1032         __u32                    nbits  = 0;
1033         int                      rc;
1034         ENTRY;
1035
1036         if (lfsck->li_master) {
1037                 struct lfsck_assistant_data *lad = com->lc_data;
1038
1039                 bitmap = lad->lad_bitmap;
1040                 nbits = bitmap->size;
1041
1042                 LASSERT(nbits > 0);
1043                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1044         }
1045
1046         lo_ram->ll_bitmap_size = nbits;
1047         lfsck_layout_cpu_to_le(lo, lo_ram);
1048         th = dt_trans_create(env, dev);
1049         if (IS_ERR(th))
1050                 GOTO(log, rc = PTR_ERR(th));
1051
1052         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1053                                      (loff_t)0, th);
1054         if (rc != 0)
1055                 GOTO(out, rc);
1056
1057         if (bitmap != NULL) {
1058                 rc = dt_declare_record_write(env, obj,
1059                                 lfsck_buf_get(env, bitmap->data, nbits >> 3),
1060                                 (loff_t)size, th);
1061                 if (rc != 0)
1062                         GOTO(out, rc);
1063         }
1064
1065         rc = dt_trans_start_local(env, dev, th);
1066         if (rc != 0)
1067                 GOTO(out, rc);
1068
1069         pos = 0;
1070         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1071         if (rc != 0)
1072                 GOTO(out, rc);
1073
1074         if (bitmap != NULL) {
1075                 pos = size;
1076                 rc = dt_record_write(env, obj,
1077                                 lfsck_buf_get(env, bitmap->data, nbits >> 3),
1078                                 &pos, th);
1079         }
1080
1081         GOTO(out, rc);
1082
1083 out:
1084         dt_trans_stop(env, dev, th);
1085
1086 log:
1087         if (rc != 0)
1088                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1089                        lfsck_lfsck2name(lfsck), rc);
1090
1091         return rc;
1092 }
1093
1094 static int lfsck_layout_init(const struct lu_env *env,
1095                              struct lfsck_component *com)
1096 {
1097         struct lfsck_layout *lo = com->lc_file_ram;
1098         int rc;
1099
1100         memset(lo, 0, com->lc_file_size);
1101         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1102         lo->ll_status = LS_INIT;
1103         down_write(&com->lc_sem);
1104         rc = lfsck_layout_store(env, com);
1105         if (rc == 0 && com->lc_lfsck->li_master)
1106                 rc = lfsck_load_sub_trace_files(env, com,
1107                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1108         up_write(&com->lc_sem);
1109
1110         return rc;
1111 }
1112
1113 static int fid_is_for_ostobj(const struct lu_env *env,
1114                              struct lfsck_instance *lfsck,
1115                              struct dt_object *obj, const struct lu_fid *fid)
1116 {
1117         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1118         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1119         struct lustre_ost_attrs *loa;
1120         int                      rc;
1121
1122         fld_range_set_any(range);
1123         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1124         if (rc == 0) {
1125                 if (fld_range_is_ost(range))
1126                         return 1;
1127
1128                 return 0;
1129         }
1130
1131         loa = &lfsck_env_info(env)->lti_loa;
1132         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1133                           XATTR_NAME_LMA);
1134         if (rc >= (int)sizeof(struct lustre_mdt_attrs)) {
1135                 lustre_lma_swab(&loa->loa_lma);
1136
1137                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1138         }
1139
1140         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1141
1142         return rc > 0;
1143 }
1144
1145 static struct lfsck_layout_seq *
1146 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1147 {
1148         struct lfsck_layout_seq *lls;
1149
1150         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1151                 if (lls->lls_seq == seq)
1152                         return lls;
1153
1154                 if (lls->lls_seq > seq)
1155                         return NULL;
1156         }
1157
1158         return NULL;
1159 }
1160
1161 static void
1162 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1163                         struct lfsck_layout_seq *lls)
1164 {
1165         struct lfsck_layout_seq *tmp;
1166         struct list_head        *pos = &llsd->llsd_seq_list;
1167
1168         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1169                 if (lls->lls_seq < tmp->lls_seq) {
1170                         pos = &tmp->lls_list;
1171                         break;
1172                 }
1173         }
1174         list_add_tail(&lls->lls_list, pos);
1175 }
1176
1177 static int
1178 lfsck_layout_lastid_create(const struct lu_env *env,
1179                            struct lfsck_instance *lfsck,
1180                            struct dt_object *obj)
1181 {
1182         struct lfsck_thread_info *info   = lfsck_env_info(env);
1183         struct lu_attr           *la     = &info->lti_la;
1184         struct dt_object_format  *dof    = &info->lti_dof;
1185         struct lfsck_bookmark    *bk     = &lfsck->li_bookmark_ram;
1186         struct dt_device         *dt     = lfsck_obj2dev(obj);
1187         struct thandle           *th;
1188         __u64                     lastid = 0;
1189         loff_t                    pos    = 0;
1190         int                       rc;
1191         ENTRY;
1192
1193         if (bk->lb_param & LPF_DRYRUN)
1194                 return 0;
1195
1196         memset(la, 0, sizeof(*la));
1197         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1198         la->la_valid = LA_MODE | LA_UID | LA_GID;
1199         memset(dof, 0, sizeof(*dof));
1200         dof->dof_type = dt_mode_to_dft(S_IFREG);
1201
1202         th = dt_trans_create(env, dt);
1203         if (IS_ERR(th))
1204                 GOTO(log, rc = PTR_ERR(th));
1205
1206         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1207         if (rc != 0)
1208                 GOTO(stop, rc);
1209
1210         rc = dt_declare_record_write(env, obj,
1211                                      lfsck_buf_get(env, &lastid,
1212                                                    sizeof(lastid)),
1213                                      pos, th);
1214         if (rc != 0)
1215                 GOTO(stop, rc);
1216
1217         rc = dt_trans_start_local(env, dt, th);
1218         if (rc != 0)
1219                 GOTO(stop, rc);
1220
1221         dt_write_lock(env, obj, 0);
1222         if (likely(dt_object_exists(obj) == 0)) {
1223                 rc = dt_create(env, obj, la, NULL, dof, th);
1224                 if (rc == 0)
1225                         rc = dt_record_write(env, obj,
1226                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1227                                 &pos, th);
1228         }
1229         dt_write_unlock(env, obj);
1230
1231         GOTO(stop, rc);
1232
1233 stop:
1234         dt_trans_stop(env, dt, th);
1235
1236 log:
1237         CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for <seq> "
1238                "%#llx: rc = %d\n",
1239                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1240
1241         return rc;
1242 }
1243
1244 static int
1245 lfsck_layout_lastid_reload(const struct lu_env *env,
1246                            struct lfsck_component *com,
1247                            struct lfsck_layout_seq *lls)
1248 {
1249         __u64   lastid;
1250         loff_t  pos     = 0;
1251         int     rc;
1252
1253         dt_read_lock(env, lls->lls_lastid_obj, 0);
1254         rc = dt_record_read(env, lls->lls_lastid_obj,
1255                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1256         dt_read_unlock(env, lls->lls_lastid_obj);
1257         if (unlikely(rc != 0))
1258                 return rc;
1259
1260         lastid = le64_to_cpu(lastid);
1261         if (lastid < lls->lls_lastid_known) {
1262                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1263                 struct lfsck_layout     *lo     = com->lc_file_ram;
1264
1265                 lls->lls_lastid = lls->lls_lastid_known;
1266                 lls->lls_dirty = 1;
1267                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1268                         LASSERT(lfsck->li_out_notify != NULL);
1269
1270                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1271                                              LE_LASTID_REBUILDING);
1272                         lo->ll_flags |= LF_CRASHED_LASTID;
1273
1274                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
1275                                "LAST_ID file (1) for the sequence %#llx"
1276                                ", old value %llu, known value %llu\n",
1277                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1278                                lastid, lls->lls_lastid);
1279                 }
1280         } else if (lastid >= lls->lls_lastid) {
1281                 lls->lls_lastid = lastid;
1282                 lls->lls_dirty = 0;
1283         }
1284
1285         return 0;
1286 }
1287
1288 static int
1289 lfsck_layout_lastid_store(const struct lu_env *env,
1290                           struct lfsck_component *com)
1291 {
1292         struct lfsck_instance           *lfsck  = com->lc_lfsck;
1293         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
1294         struct dt_device                *dt     = lfsck->li_bottom;
1295         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
1296         struct lfsck_layout_seq         *lls;
1297         struct thandle                  *th;
1298         __u64                            lastid;
1299         int                              rc     = 0;
1300         int                              rc1    = 0;
1301
1302         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1303                 loff_t pos = 0;
1304
1305                 if (!lls->lls_dirty)
1306                         continue;
1307
1308                 CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for "
1309                        "<seq> %#llx as <oid> %llu\n",
1310                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1311
1312                 if (bk->lb_param & LPF_DRYRUN) {
1313                         lls->lls_dirty = 0;
1314                         continue;
1315                 }
1316
1317                 th = dt_trans_create(env, dt);
1318                 if (IS_ERR(th)) {
1319                         rc1 = PTR_ERR(th);
1320                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1321                                "the LAST_ID for <seq> %#llx(1): rc = %d\n",
1322                                lfsck_lfsck2name(com->lc_lfsck),
1323                                lls->lls_seq, rc1);
1324                         continue;
1325                 }
1326
1327                 lastid = cpu_to_le64(lls->lls_lastid);
1328                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1329                                              lfsck_buf_get(env, &lastid,
1330                                                            sizeof(lastid)),
1331                                              pos, th);
1332                 if (rc != 0)
1333                         goto stop;
1334
1335                 rc = dt_trans_start_local(env, dt, th);
1336                 if (rc != 0)
1337                         goto stop;
1338
1339                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1340                 rc = dt_record_write(env, lls->lls_lastid_obj,
1341                                      lfsck_buf_get(env, &lastid,
1342                                      sizeof(lastid)), &pos, th);
1343                 dt_write_unlock(env, lls->lls_lastid_obj);
1344                 if (rc == 0)
1345                         lls->lls_dirty = 0;
1346
1347 stop:
1348                 dt_trans_stop(env, dt, th);
1349                 if (rc != 0) {
1350                         rc1 = rc;
1351                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1352                                "the LAST_ID for <seq> %#llx(2): rc = %d\n",
1353                                lfsck_lfsck2name(com->lc_lfsck),
1354                                lls->lls_seq, rc1);
1355                 }
1356         }
1357
1358         return rc1;
1359 }
1360
1361 static int
1362 lfsck_layout_lastid_load(const struct lu_env *env,
1363                          struct lfsck_component *com,
1364                          struct lfsck_layout_seq *lls)
1365 {
1366         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1367         struct lfsck_layout     *lo     = com->lc_file_ram;
1368         struct lu_fid           *fid    = &lfsck_env_info(env)->lti_fid;
1369         struct dt_object        *obj;
1370         loff_t                   pos    = 0;
1371         int                      rc;
1372         ENTRY;
1373
1374         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1375         obj = dt_locate(env, lfsck->li_bottom, fid);
1376         if (IS_ERR(obj))
1377                 RETURN(PTR_ERR(obj));
1378
1379         /* LAST_ID crashed, to be rebuilt */
1380         if (dt_object_exists(obj) == 0) {
1381                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1382                         LASSERT(lfsck->li_out_notify != NULL);
1383
1384                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1385                                              LE_LASTID_REBUILDING);
1386                         lo->ll_flags |= LF_CRASHED_LASTID;
1387
1388                         CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the "
1389                                "LAST_ID file for sequence %#llx\n",
1390                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1391
1392                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1393                             cfs_fail_val > 0) {
1394                                 struct l_wait_info lwi = LWI_TIMEOUT(
1395                                                 cfs_time_seconds(cfs_fail_val),
1396                                                 NULL, NULL);
1397
1398                                 /* Some others may changed the cfs_fail_val
1399                                  * as zero after above check, re-check it for
1400                                  * sure to avoid falling into wait for ever. */
1401                                 if (likely(lwi.lwi_timeout > 0)) {
1402                                         struct ptlrpc_thread *thread =
1403                                                 &lfsck->li_thread;
1404
1405                                         up_write(&com->lc_sem);
1406                                         l_wait_event(thread->t_ctl_waitq,
1407                                                      !thread_is_running(thread),
1408                                                      &lwi);
1409                                         down_write(&com->lc_sem);
1410                                 }
1411                         }
1412                 }
1413
1414                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1415         } else {
1416                 dt_read_lock(env, obj, 0);
1417                 rc = dt_read(env, obj,
1418                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1419                         &pos);
1420                 dt_read_unlock(env, obj);
1421                 if (rc != 0 && rc != sizeof(__u64))
1422                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1423
1424                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1425                         LASSERT(lfsck->li_out_notify != NULL);
1426
1427                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1428                                              LE_LASTID_REBUILDING);
1429                         lo->ll_flags |= LF_CRASHED_LASTID;
1430
1431                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid "
1432                                "LAST_ID file for the sequence %#llx"
1433                                ": rc = %d\n",
1434                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1435                 }
1436
1437                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1438                 rc = 0;
1439         }
1440
1441         GOTO(out, rc);
1442
1443 out:
1444         if (rc != 0)
1445                 lfsck_object_put(env, obj);
1446         else
1447                 lls->lls_lastid_obj = obj;
1448
1449         return rc;
1450 }
1451
1452 static void lfsck_layout_record_failure(const struct lu_env *env,
1453                                         struct lfsck_instance *lfsck,
1454                                         struct lfsck_layout *lo)
1455 {
1456         __u64 cookie;
1457
1458         lo->ll_objs_failed_phase1++;
1459         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1460                                                         lfsck->li_di_oit);
1461         if (lo->ll_pos_first_inconsistent == 0 ||
1462             lo->ll_pos_first_inconsistent < cookie) {
1463                 lo->ll_pos_first_inconsistent = cookie;
1464
1465                 CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired "
1466                        "inconsistency at the pos [%llu]\n",
1467                        lfsck_lfsck2name(lfsck),
1468                        lo->ll_pos_first_inconsistent);
1469         }
1470 }
1471
1472 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1473                                            struct lfsck_component *com,
1474                                            int rc)
1475 {
1476         struct lfsck_instance   *lfsck = com->lc_lfsck;
1477         struct lfsck_layout     *lo    = com->lc_file_ram;
1478
1479         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1480                lfsck_lfsck2name(lfsck), rc);
1481
1482         down_write(&com->lc_sem);
1483         lo->ll_run_time_phase2 += ktime_get_seconds() -
1484                                   com->lc_time_last_checkpoint;
1485         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
1486         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1487
1488         if (rc > 0) {
1489                 if (lo->ll_flags & LF_INCOMPLETE) {
1490                         lo->ll_status = LS_PARTIAL;
1491                 } else {
1492                         if (lfsck->li_master) {
1493                                 struct lfsck_assistant_data *lad = com->lc_data;
1494
1495                                 if (test_bit(LAD_INCOMPLETE, &lad->lad_flags))
1496                                         lo->ll_status = LS_PARTIAL;
1497                                 else
1498                                         lo->ll_status = LS_COMPLETED;
1499                         } else {
1500                                 lo->ll_status = LS_COMPLETED;
1501                         }
1502                 }
1503                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1504                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1505                         lo->ll_flags &= ~LF_INCONSISTENT;
1506                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1507                 lo->ll_success_count++;
1508         } else if (rc == 0) {
1509                 if (lfsck->li_status != 0)
1510                         lo->ll_status = lfsck->li_status;
1511                 else
1512                         lo->ll_status = LS_STOPPED;
1513         } else {
1514                 lo->ll_status = LS_FAILED;
1515         }
1516
1517         rc = lfsck_layout_store(env, com);
1518         up_write(&com->lc_sem);
1519
1520         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1521                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1522
1523         return rc;
1524 }
1525
1526 static int lfsck_layout_trans_stop(const struct lu_env *env,
1527                                    struct dt_device *dev,
1528                                    struct thandle *handle, int result)
1529 {
1530         int rc;
1531
1532         /* XXX: If there is something worng or it needs to repair nothing,
1533          *      then notify the lower to stop the modification. Currently,
1534          *      we use th_result for such purpose, that may be replaced by
1535          *      some rollback mechanism in the future. */
1536         handle->th_result = result;
1537         rc = dt_trans_stop(env, dev, handle);
1538         if (result != 0)
1539                 return result > 0 ? 0 : result;
1540
1541         return rc == 0 ? 1 : rc;
1542 }
1543
1544 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1545                                          struct lfsck_component *com,
1546                                          const struct lu_fid *pfid,
1547                                          const struct lu_fid *cfid,
1548                                          __u32 comp_id, __u32 ea_off,
1549                                          __u32 ost_idx)
1550 {
1551         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1552         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1553         struct dt_device *dev;
1554         struct dt_object *obj;
1555         struct thandle *th = NULL;
1556         int idx;
1557         int rc = 0;
1558         ENTRY;
1559
1560         idx = lfsck_sub_trace_file_fid2idx(pfid);
1561         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1562         dev = lfsck_obj2dev(obj);
1563
1564         fid_cpu_to_be(&key->lldk_fid, pfid);
1565         key->lldk_comp_id = cpu_to_be32(comp_id);
1566         key->lldk_ea_off = cpu_to_be32(ea_off);
1567
1568         fid_cpu_to_be(rec, cfid);
1569         rec->f_ver = cpu_to_be32(ost_idx);
1570
1571         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1572
1573         th = dt_trans_create(env, dev);
1574         if (IS_ERR(th))
1575                 GOTO(unlock, rc = PTR_ERR(th));
1576
1577         rc = dt_declare_insert(env, obj,
1578                                (const struct dt_rec *)rec,
1579                                (const struct dt_key *)key, th);
1580         if (rc)
1581                 GOTO(unlock, rc);
1582
1583         rc = dt_trans_start_local(env, dev, th);
1584         if (rc)
1585                 GOTO(unlock, rc);
1586
1587         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1588                        (const struct dt_key *)key, th);
1589
1590         GOTO(unlock, rc);
1591
1592 unlock:
1593         if (th && !IS_ERR(th))
1594                 dt_trans_stop(env, dev, th);
1595
1596         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1597
1598         CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, "
1599                "ea_off = %u, ost_idx = %u, into the trace file for further "
1600                "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
1601                PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc);
1602
1603         return rc;
1604 }
1605
1606 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1607                                          struct lfsck_component *com,
1608                                          const struct lu_fid *fid,
1609                                          __u32 comp_id, __u32 ea_off)
1610 {
1611         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1612         struct dt_device *dev;
1613         struct dt_object *obj;
1614         struct thandle *th = NULL;
1615         int idx;
1616         int rc = 0;
1617         ENTRY;
1618
1619         idx = lfsck_sub_trace_file_fid2idx(fid);
1620         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1621         dev = lfsck_obj2dev(obj);
1622
1623         fid_cpu_to_be(&key->lldk_fid, fid);
1624         key->lldk_comp_id = cpu_to_be32(comp_id);
1625         key->lldk_ea_off = cpu_to_be32(ea_off);
1626
1627         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1628
1629         th = dt_trans_create(env, dev);
1630         if (IS_ERR(th))
1631                 GOTO(unlock, rc = PTR_ERR(th));
1632
1633         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1634         if (rc)
1635                 GOTO(unlock, rc);
1636
1637         rc = dt_trans_start_local(env, dev, th);
1638         if (rc)
1639                 GOTO(unlock, rc);
1640
1641         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1642
1643         GOTO(unlock, rc);
1644
1645 unlock:
1646         if (th && !IS_ERR(th))
1647                 dt_trans_stop(env, dev, th);
1648
1649         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1650
1651         CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID
1652                ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1653                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1654
1655         return rc;
1656 }
1657
1658 /**
1659  * Get the system default stripe size.
1660  *
1661  * \param[in] env       pointer to the thread context
1662  * \param[in] lfsck     pointer to the lfsck instance
1663  * \param[out] size     pointer to the default stripe size
1664  *
1665  * \retval              0 for success
1666  * \retval              negative error number on failure
1667  */
1668 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1669                                            struct lfsck_instance *lfsck,
1670                                            __u32 *size)
1671 {
1672         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1673         struct dt_object        *root;
1674         int                      rc;
1675
1676         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1677         if (IS_ERR(root))
1678                 return PTR_ERR(root);
1679
1680         /* Get the default stripe size via xattr_get on the backend root. */
1681         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1682                           XATTR_NAME_LOV);
1683         if (rc > 0) {
1684                 /* The lum->lmm_stripe_size is LE mode. The *size also
1685                  * should be LE mode. So it is unnecessary to convert. */
1686                 *size = lum->lmm_stripe_size;
1687                 rc = 0;
1688         } else if (unlikely(rc == 0)) {
1689                 rc = -EINVAL;
1690         }
1691
1692         lfsck_object_put(env, root);
1693
1694         return rc;
1695 }
1696
1697 /**
1698  * \retval       +1: repaired
1699  * \retval        0: did nothing
1700  * \retval      -ve: on error
1701  */
1702 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1703                                      struct lfsck_instance *lfsck,
1704                                      struct thandle *handle,
1705                                      struct dt_object *parent,
1706                                      const struct lu_fid *cfid,
1707                                      struct lu_buf *buf,
1708                                      struct lov_mds_md_v1 *lmm,
1709                                      struct lov_ost_data_v1 *slot,
1710                                      int fl, __u32 ost_idx, int size)
1711 {
1712         struct ost_id           *oi     = &lfsck_env_info(env)->lti_oi;
1713         struct lu_buf            ea_buf;
1714         int                      rc;
1715         __u32                    magic;
1716         __u32                    pattern;
1717         __u16                    count;
1718         ENTRY;
1719
1720         magic = le32_to_cpu(lmm->lmm_magic);
1721         pattern = le32_to_cpu(lmm->lmm_pattern);
1722         count = le16_to_cpu(lmm->lmm_stripe_count);
1723
1724         fid_to_ostid(cfid, oi);
1725         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1726         slot->l_ost_gen = cpu_to_le32(0);
1727         slot->l_ost_idx = cpu_to_le32(ost_idx);
1728
1729         if (pattern & LOV_PATTERN_F_HOLE) {
1730                 struct lov_ost_data_v1 *objs;
1731                 int                     i;
1732
1733                 if (magic == LOV_MAGIC_V1)
1734                         objs = &lmm->lmm_objects[0];
1735                 else
1736                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1737                 for (i = 0; i < count; i++, objs++) {
1738                         if (lovea_slot_is_dummy(objs))
1739                                 break;
1740                 }
1741
1742                 /* If the @slot is the last dummy slot to be refilled,
1743                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */
1744                 if (i == count) {
1745                         lmm->lmm_pattern =
1746                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1747
1748                         CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID
1749                                ": parent "DFID"\n", lfsck_lfsck2name(lfsck),
1750                                PFID(cfid), PFID(lfsck_dto2fid(parent)));
1751                 }
1752         }
1753
1754         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1755         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1756         if (rc == 0)
1757                 rc = 1;
1758
1759         RETURN(rc);
1760 }
1761
1762 static struct lov_ost_data_v1 *
1763 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1764                             const struct lu_fid *pfid,
1765                             __u32 stripe_size, __u32 ea_off,
1766                             __u32 pattern, __u16 count)
1767 {
1768         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1769         lmm->lmm_pattern = cpu_to_le32(pattern);
1770         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1771         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1772         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1773         lmm->lmm_stripe_count = cpu_to_le16(count);
1774         lmm->lmm_layout_gen = cpu_to_le16(1);
1775         memset(&lmm->lmm_objects[0], 0,
1776                sizeof(struct lov_ost_data_v1) * count);
1777
1778         return &lmm->lmm_objects[ea_off];
1779 }
1780
1781 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1782                                      struct lfsck_instance *lfsck,
1783                                      struct ost_layout *ol,
1784                                      struct dt_object *parent,
1785                                      struct lu_buf *buf, __u32 ea_off,
1786                                      struct lov_mds_md_v1 **lmm,
1787                                      struct lov_ost_data_v1 **objs)
1788 {
1789         int size;
1790         __u32 stripe_size = ol->ol_stripe_size;
1791         __u32 pattern = LOV_PATTERN_RAID0;
1792         __u16 count;
1793
1794         if (ol->ol_stripe_count != 0)
1795                 count = ol->ol_stripe_count;
1796         else
1797                 count = ea_off + 1;
1798
1799         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1800         LASSERTF(buf->lb_len >= size,
1801                  "buffer len %d is less than real size %d\n",
1802                  (int)buf->lb_len, size);
1803
1804         if (stripe_size == 0) {
1805                 int rc;
1806
1807                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1808                 if (rc)
1809                         return rc;
1810         }
1811
1812         *lmm = buf->lb_buf;
1813         if (ol->ol_stripe_count > 1 ||
1814             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1815                 pattern |= LOV_PATTERN_F_HOLE;
1816                 memset(&(*lmm)->lmm_objects[0], 0,
1817                        count * sizeof(struct lov_ost_data_v1));
1818         }
1819
1820         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1821                                 stripe_size, ea_off, pattern, count);
1822
1823         return size;
1824 }
1825
1826 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1827                                        struct lu_orphan_rec_v3 *rec,
1828                                        struct dt_object *parent,
1829                                        struct lu_buf *buf, __u32 ea_off,
1830                                        struct lov_mds_md_v1 **lmm,
1831                                        struct lov_ost_data_v1 **objs)
1832 {
1833         struct ost_layout *ol = &rec->lor_layout;
1834         struct lov_comp_md_v1 *lcm;
1835         struct lov_comp_md_entry_v1 *lcme;
1836         __u32 pattern = LOV_PATTERN_RAID0;
1837         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1838         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1839         int size = offset + lcme_size;
1840
1841         LASSERTF(buf->lb_len >= size,
1842                  "buffer len %d is less than real size %d\n",
1843                  (int)buf->lb_len, size);
1844
1845         lcm = buf->lb_buf;
1846         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1847         lcm->lcm_size = cpu_to_le32(size);
1848         if (rec->lor_range) {
1849                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1850                                                   rec->lor_range);
1851                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1852         } else if (rec->lor_layout_version) {
1853                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1854                                                   rec->lor_range);
1855                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1856         } else {
1857                 lcm->lcm_layout_gen = cpu_to_le32(1);
1858                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1859         }
1860         lcm->lcm_entry_count = cpu_to_le16(1);
1861         /* Currently, we do not know how many mirrors will be, set it as zero
1862          * at the beginning. It will be updated when more mirrors are found. */
1863         lcm->lcm_mirror_count = 0;
1864
1865         lcme = &lcm->lcm_entries[0];
1866         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1867         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1868         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1869         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1870         lcme->lcme_offset = cpu_to_le32(offset);
1871         lcme->lcme_size = cpu_to_le32(lcme_size);
1872         lcme->lcme_layout_gen = lcm->lcm_layout_gen;
1873         if (ol->ol_stripe_count > 1)
1874                 pattern |= LOV_PATTERN_F_HOLE;
1875
1876         *lmm = buf->lb_buf + offset;
1877         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1878                                             ol->ol_stripe_size, ea_off,
1879                                             pattern, ol->ol_stripe_count);
1880
1881         return size;
1882 }
1883
1884 static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm,
1885                                     struct lov_comp_md_entry_v1 *lcme,
1886                                     __u32 version, __u32 range)
1887 {
1888         struct lov_comp_md_entry_v1 *tmp;
1889         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
1890         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
1891         __u32 gen = version + range;
1892         __u32 tmp_gen;
1893         int i;
1894         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1895         __u16 flags = le16_to_cpu(lcm->lcm_flags);
1896
1897         if (!gen)
1898                 gen = 1;
1899         lcme->lcme_layout_gen = cpu_to_le32(gen);
1900         if (le32_to_cpu(lcm->lcm_layout_gen) < gen)
1901                 lcm->lcm_layout_gen = cpu_to_le32(gen);
1902
1903         if (range)
1904                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1905         else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0)
1906                 lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
1907
1908         for (i = 0; i < count; i++) {
1909                 tmp = &lcm->lcm_entries[i];
1910                 if (le64_to_cpu(tmp->lcme_extent.e_end) <= start)
1911                         continue;
1912
1913                 if (le64_to_cpu(tmp->lcme_extent.e_start) >= end)
1914                         continue;
1915
1916                 if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE)
1917                         continue;
1918
1919                 tmp_gen = le32_to_cpu(tmp->lcme_layout_gen);
1920                 /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag,
1921                  * then it should be the latest version of all mirrors. */
1922                 if (tmp_gen == 0 || tmp_gen > gen) {
1923                         lcme->lcme_flags = cpu_to_le32(
1924                                 le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE);
1925                         break;
1926                 }
1927
1928                 if (tmp_gen < gen)
1929                         tmp->lcme_flags = cpu_to_le32(
1930                                 le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE);
1931         }
1932 }
1933
1934 static int lfsck_layout_add_comp(const struct lu_env *env,
1935                                  struct lfsck_instance *lfsck,
1936                                  struct thandle *handle,
1937                                  struct lu_orphan_rec_v3 *rec,
1938                                  struct dt_object *parent,
1939                                  const struct lu_fid *cfid,
1940                                  struct lu_buf *buf, __u32 ost_idx,
1941                                  __u32 ea_off, int pos, bool new_mirror)
1942 {
1943         struct ost_layout *ol = &rec->lor_layout;
1944         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1945         struct lov_comp_md_entry_v1 *lcme;
1946         struct lov_mds_md_v1 *lmm;
1947         struct lov_ost_data_v1 *objs;
1948         int added = sizeof(*lcme) +
1949                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1950         int size = le32_to_cpu(lcm->lcm_size) + added;
1951         int rc;
1952         int i;
1953         __u32 offset;
1954         __u32 pattern = LOV_PATTERN_RAID0;
1955         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1956         ENTRY;
1957
1958         lu_buf_check_and_grow(buf, size);
1959         /* set the lcm again because lu_buf_check_and_grow() may
1960          * have reallocated the buf. */
1961         lcm = buf->lb_buf;
1962         lcm->lcm_size = cpu_to_le32(size);
1963         lcm->lcm_entry_count = cpu_to_le16(count + 1);
1964         if (new_mirror)
1965                 le16_add_cpu(&lcm->lcm_mirror_count, 1);
1966
1967         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
1968          *    with distance of 'added'. */
1969         if (pos < count) {
1970                 size = 0;
1971                 for (i = pos; i < count; i++) {
1972                         lcme = &lcm->lcm_entries[i];
1973                         size += le32_to_cpu(lcme->lcme_size);
1974                 }
1975
1976                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
1977                 memmove(buf->lb_buf + offset + added,
1978                         buf->lb_buf + offset, size);
1979         }
1980
1981         size = 0;
1982         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
1983          *    of 'sizeof(struct lov_comp_md_entry_v1)' */
1984         if (pos > 0) {
1985                 for (i = 0; i < pos; i++) {
1986                         lcme = &lcm->lcm_entries[i];
1987                         size += le32_to_cpu(lcme->lcme_size);
1988                 }
1989
1990                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
1991                 memmove(buf->lb_buf + offset + sizeof(*lcme),
1992                         buf->lb_buf + offset, size);
1993         }
1994
1995         /* 3. Recalculate the enter offset for the component [pos, count-1] */
1996         for (i = count - 1; i >= pos; i--) {
1997                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
1998                 lcm->lcm_entries[i + 1].lcme_offset =
1999                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
2000                                                 lcme_offset) + added);
2001         }
2002
2003         /* 4. Recalculate the enter offset for the component [0, pos) */
2004         for (i = 0; i < pos; i++) {
2005                 lcm->lcm_entries[i].lcme_offset =
2006                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
2007                                                 lcme_offset) + sizeof(*lcme));
2008         }
2009
2010         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
2011         /* 4. Insert the new component header (entry) at the slot 'pos'. */
2012         lcme = &lcm->lcm_entries[pos];
2013         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
2014         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
2015         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
2016         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
2017         lcme->lcme_offset = cpu_to_le32(offset);
2018         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
2019                                                       LOV_MAGIC_V1));
2020
2021         if (ol->ol_stripe_count > 1)
2022                 pattern |= LOV_PATTERN_F_HOLE;
2023
2024         lmm = buf->lb_buf + offset;
2025         /* 5. Insert teh new component body at the 'offset'. */
2026         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
2027                                            ol->ol_stripe_size, ea_off,
2028                                            pattern, ol->ol_stripe_count);
2029
2030         /* 6. Update mirror related flags and version. */
2031         lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version,
2032                                 rec->lor_range);
2033
2034         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2035                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
2036                                        le32_to_cpu(lcm->lcm_size));
2037
2038         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for "
2039                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2040                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2041                "comp_end %llu, layout version %u, range %u, "
2042                "%s LOV EA hole: rc = %d\n",
2043                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2044                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2045                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2046                rec->lor_layout_version, rec->lor_range,
2047                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2048                "with" : "without", rc);
2049
2050         RETURN(rc);
2051 }
2052
2053 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
2054                                           struct lfsck_instance *lfsck,
2055                                           struct thandle *handle,
2056                                           struct ost_layout *ol,
2057                                           struct dt_object *parent,
2058                                           const struct lu_fid *cfid,
2059                                           struct lu_buf *buf, __u32 ost_idx,
2060                                           __u32 ea_off)
2061 {
2062         struct lov_mds_md_v1 *lmm = buf->lb_buf;
2063         struct lov_ost_data_v1 *objs;
2064         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
2065         __u32 magic = le32_to_cpu(lmm->lmm_magic);
2066         int size;
2067         int gap;
2068         int rc;
2069         ENTRY;
2070
2071         /* The original LOVEA maybe re-generated via old filter_fid, at
2072          * that time, we do not know the stripe count and stripe size. */
2073         if (ol->ol_stripe_count > count)
2074                 count = ol->ol_stripe_count;
2075         if (ol->ol_stripe_size != 0 &&
2076             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
2077                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
2078
2079         if (magic == LOV_MAGIC_V1)
2080                 objs = &lmm->lmm_objects[count];
2081         else
2082                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
2083
2084         gap = ea_off - count;
2085         if (gap >= 0)
2086                 count = ea_off + 1;
2087
2088         size = lov_mds_md_size(count, magic);
2089         LASSERTF(buf->lb_len >= size,
2090                  "buffer len %d is less than real size %d\n",
2091                  (int)buf->lb_len, size);
2092
2093         if (gap > 0) {
2094                 memset(objs, 0, gap * sizeof(*objs));
2095                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
2096         }
2097
2098         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2099         lmm->lmm_stripe_count = cpu_to_le16(count);
2100         objs += gap;
2101
2102         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2103                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
2104
2105         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for "
2106                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2107                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2108                "comp_end %llu, %s LOV EA hole: rc = %d\n",
2109                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2110                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2111                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2112                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2113                "with" : "without", rc);
2114
2115         RETURN(rc);
2116 }
2117
2118 /**
2119  * \retval       +1: repaired
2120  * \retval        0: did nothing
2121  * \retval      -ve: on error
2122  */
2123 static int lfsck_layout_update_lovea(const struct lu_env *env,
2124                                      struct lfsck_instance *lfsck,
2125                                      struct thandle *handle,
2126                                      struct lu_orphan_rec_v3 *rec,
2127                                      struct dt_object *parent,
2128                                      const struct lu_fid *cfid,
2129                                      struct lu_buf *buf, int fl,
2130                                      __u32 ost_idx, __u32 ea_off)
2131 {
2132         struct ost_layout *ol = &rec->lor_layout;
2133         struct lov_mds_md_v1 *lmm = NULL;
2134         struct lov_ost_data_v1 *objs = NULL;
2135         int rc = 0;
2136         ENTRY;
2137
2138         if (ol->ol_comp_id != 0)
2139                 rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off,
2140                                                  &lmm, &objs);
2141         else
2142                 rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout,
2143                                                parent, buf, ea_off, &lmm,
2144                                                &objs);
2145         if (rc > 0)
2146                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2147                                                buf, lmm, objs, fl, ost_idx, rc);
2148
2149         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for "
2150                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2151                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2152                "comp_end %llu, layout version %u, range %u, fl %d, "
2153                "%s LOV EA hole: rc = %d\n",
2154                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2155                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2156                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2157                rec->lor_layout_version, rec->lor_range, fl,
2158                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2159                "with" : "without", rc);
2160
2161         RETURN(rc);
2162 }
2163
2164 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2165                                       struct dt_object *child,
2166                                       const struct lu_fid *pfid,
2167                                       const struct ost_layout *ol, __u32 offset,
2168                                       __u32 version, __u32 range)
2169 {
2170         struct dt_device        *dev    = lfsck_obj2dev(child);
2171         struct filter_fid       *ff     = &lfsck_env_info(env)->lti_ff;
2172         struct thandle          *handle;
2173         struct lu_buf            buf    = { NULL };
2174         int                      rc;
2175
2176         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2177         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2178         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2179          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2180          * parent MDT-object's layout EA. */
2181         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2182         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2183         ff->ff_layout_version = cpu_to_le32(version);
2184         ff->ff_range = cpu_to_le32(range);
2185         lfsck_buf_init(&buf, ff, sizeof(*ff));
2186
2187         handle = dt_trans_create(env, dev);
2188         if (IS_ERR(handle))
2189                 RETURN(PTR_ERR(handle));
2190
2191         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2192         if (rc != 0)
2193                 GOTO(stop, rc);
2194
2195         rc = dt_trans_start_local(env, dev, handle);
2196         if (rc != 0)
2197                 GOTO(stop, rc);
2198
2199         rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2200
2201         GOTO(stop, rc);
2202
2203 stop:
2204         dt_trans_stop(env, dev, handle);
2205
2206         return rc;
2207 }
2208
2209 /**
2210  * \retval       +1: repaired
2211  * \retval        0: did nothing
2212  * \retval      -ve: on error
2213  */
2214 static int lfsck_layout_update_pfid(const struct lu_env *env,
2215                                     struct lfsck_component *com,
2216                                     struct dt_object *parent,
2217                                     struct lu_fid *cfid,
2218                                     struct dt_device *cdev,
2219                                     struct lu_orphan_rec_v3 *rec, __u32 ea_off)
2220 {
2221         struct dt_object        *child;
2222         int                      rc     = 0;
2223         ENTRY;
2224
2225         child = lfsck_object_find_by_dev(env, cdev, cfid);
2226         if (IS_ERR(child))
2227                 RETURN(PTR_ERR(child));
2228
2229         rc = __lfsck_layout_update_pfid(env, child,
2230                                         lu_object_fid(&parent->do_lu),
2231                                         &rec->lor_layout, ea_off,
2232                                         rec->lor_layout_version,
2233                                         rec->lor_range);
2234         lfsck_object_put(env, child);
2235
2236         RETURN(rc == 0 ? 1 : rc);
2237 }
2238
2239 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2240 {
2241         if (ol->ol_comp_id != 0)
2242                 return sizeof(struct lov_comp_md_v1) +
2243                        sizeof(struct lov_comp_md_entry_v1) +
2244                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2245
2246         if (ol->ol_stripe_count != 0)
2247                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2248
2249         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2250 }
2251
2252 /**
2253  * This function will create the MDT-object with the given (partial) LOV EA.
2254  *
2255  * Under some data corruption cases, the MDT-object of the file may be lost,
2256  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2257  * re-create the MDT-object with the orphan OST-object(s) information.
2258  *
 * On the other hand, the LFSCK may have created some OST-object to repair a
 * dangling LOV EA reference, but as the LFSCK proceeds, it may find that
 * the old OST-object still exists and should replace the newly created OST
 * object. Unfortunately, the newly created object may already have been
 * modified by others. To keep the data (both new and old), the LFSCK will
 * create an MDT-object with a new FID to reference the original OST-object.
2265  *
2266  * \param[in] env       pointer to the thread context
2267  * \param[in] com       pointer to the lfsck component
2268  * \param[in] ltd       pointer to target device descriptor
2269  * \param[in] rec       pointer to the record for the orphan OST-object
2270  * \param[in] cfid      pointer to FID for the orphan OST-object
2271  * \param[in] infix     additional information, such as the FID for original
2272  *                      MDT-object and the stripe offset in the LOV EA
2273  * \param[in] type      the type for describing why the orphan MDT-object is
2274  *                      created. The rules are as following:
2275  *
2276  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2277  *                      same slot in the layout EA. Then the LFSCK will create
2278  *                      new MDT-object(s) to hold the conflict OST-object(s).
2279  *
2280  *  type "N":           The orphan OST-object does not know which one was the
2281  *                      real parent MDT-object, so the LFSCK uses new FID for
2282  *                      its parent MDT-object.
2283  *
2284  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2285  *                      but does not know the position (the file name) in the
2286  *                      layout.
2287  *
2288  *  type "D":           The MDT-object is a directory, it may knows its parent
2289  *                      but because there is no valid linkEA, the LFSCK cannot
2290  *                      know where to put it back to the namespace.
2291  *  type "O":           The MDT-object has no linkEA, and there is no name
2292  *                      entry that references the MDT-object.
2293  *
2294  *  type "P":           The orphan object to be created was a parent directory
2295  *                      of some MDT-object which linkEA shows that the @orphan
2296  *                      object is missing.
2297  *
2298  * The orphan name will be like:
2299  * ${FID}-${infix}-${type}-${conflict_version}
2300  *
2301  * \param[in] ea_off    the stripe offset in the LOV EA
2302  *
2303  * \retval              positive on repaired something
2304  * \retval              0 if needs to repair nothing
2305  * \retval              negative error number on failure
2306  */
2307 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2308                                         struct lfsck_component *com,
2309                                         struct lfsck_tgt_desc *ltd,
2310                                         struct lu_orphan_rec_v3 *rec,
2311                                         struct lu_fid *cfid,
2312                                         const char *infix,
2313                                         const char *type,
2314                                         __u32 ea_off)
2315 {
2316         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2317         struct dt_insert_rec            *dtrec  = &info->lti_dt_rec;
2318         char                            *name   = info->lti_key;
2319         struct lu_attr                  *la     = &info->lti_la2;
2320         struct dt_object_format         *dof    = &info->lti_dof;
2321         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2322         struct lu_fid                   *pfid   = &rec->lor_rec.lor_fid;
2323         struct lu_fid                   *tfid   = &info->lti_fid3;
2324         struct dt_device                *dev    = lfsck->li_bottom;
2325         struct dt_object                *lpf    = lfsck->li_lpf_obj;
2326         struct dt_object                *pobj   = NULL;
2327         struct dt_object                *cobj   = NULL;
2328         struct thandle                  *th     = NULL;
2329         struct lu_buf                   *ea_buf = &info->lti_big_buf;
2330         struct lu_buf                    lov_buf;
2331         struct lfsck_lock_handle        *llh    = &info->lti_llh;
2332         struct linkea_data               ldata  = { NULL };
2333         struct lu_buf                    linkea_buf;
2334         const struct lu_name            *pname;
2335         int                              size   = 0;
2336         int                              idx    = 0;
2337         int                              rc     = 0;
2338         ENTRY;
2339
2340         if (unlikely(lpf == NULL))
2341                 GOTO(log, rc = -ENXIO);
2342
2343         /* We use two separated transactions to repair the inconsistency.
2344          *
2345          * 1) create the MDT-object locally.
2346          * 2) update the OST-object's PFID EA if necessary.
2347          *
2348          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2349          * updated when the layout LFSCK run next time.
2350          *
2351          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2352          * when the layout LFSCK run next time. */
2353
2354         if (fid_is_zero(pfid)) {
2355                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2356                 if (rc != 0)
2357                         GOTO(log, rc);
2358
2359                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2360                 if (IS_ERR(cobj))
2361                         GOTO(log, rc = PTR_ERR(cobj));
2362         }
2363
2364         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2365         if (IS_ERR(pobj))
2366                 GOTO(log, rc = PTR_ERR(pobj));
2367
2368         LASSERT(infix != NULL);
2369         LASSERT(type != NULL);
2370
2371         memset(la, 0, sizeof(*la));
2372         la->la_uid = rec->lor_rec.lor_uid;
2373         la->la_gid = rec->lor_rec.lor_gid;
2374         la->la_mode = S_IFREG | S_IRUSR;
2375         la->la_valid = LA_MODE | LA_UID | LA_GID;
2376
2377         memset(dof, 0, sizeof(*dof));
2378         dof->dof_type = dt_mode_to_dft(S_IFREG);
2379         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2380          * the stripe(s). The LFSCK will specify the LOV EA via
2381          * lfsck_layout_update_lovea(). */
2382
2383         size = lfsck_lovea_size(&rec->lor_layout, ea_off);
2384         if (ea_buf->lb_len < size) {
2385                 lu_buf_realloc(ea_buf, size);
2386                 if (ea_buf->lb_buf == NULL)
2387                         GOTO(log, rc = -ENOMEM);
2388         }
2389
2390 again:
2391         do {
2392                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2393                          type, idx++);
2394                 rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
2395                                (const struct dt_key *)name);
2396                 if (rc != 0 && rc != -ENOENT)
2397                         GOTO(log, rc);
2398         } while (rc == 0);
2399
2400         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2401                         MDS_INODELOCK_UPDATE, LCK_PW);
2402         if (rc != 0)
2403                 GOTO(log, rc);
2404
2405         /* Re-check whether the name conflict with othrs after taken
2406          * the ldlm lock. */
2407         rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
2408                        (const struct dt_key *)name);
2409         if (unlikely(rc == 0)) {
2410                 lfsck_unlock(llh);
2411                 goto again;
2412         }
2413
2414         if (rc != -ENOENT)
2415                 GOTO(unlock, rc);
2416
2417         pname = lfsck_name_get_const(env, name, strlen(name));
2418         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2419                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2420         if (rc != 0)
2421                 GOTO(unlock, rc);
2422
2423         /* The 1st transaction. */
2424         th = dt_trans_create(env, dev);
2425         if (IS_ERR(th))
2426                 GOTO(unlock, rc = PTR_ERR(th));
2427
2428         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2429         if (rc != 0)
2430                 GOTO(stop, rc);
2431
2432         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2433         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2434                                   LU_XATTR_CREATE, th);
2435         if (rc != 0)
2436                 GOTO(stop, rc);
2437
2438         dtrec->rec_fid = pfid;
2439         dtrec->rec_type = S_IFREG;
2440         rc = dt_declare_insert(env, lpf,
2441                                (const struct dt_rec *)dtrec,
2442                                (const struct dt_key *)name, th);
2443         if (rc != 0)
2444                 GOTO(stop, rc);
2445
2446         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2447                        ldata.ld_leh->leh_len);
2448         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2449                                   XATTR_NAME_LINK, 0, th);
2450         if (rc != 0)
2451                 GOTO(stop, rc);
2452
2453         rc = dt_trans_start_local(env, dev, th);
2454         if (rc != 0)
2455                 GOTO(stop, rc);
2456
2457         dt_write_lock(env, pobj, 0);
2458         rc = dt_create(env, pobj, la, NULL, dof, th);
2459         if (rc == 0)
2460                 rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid,
2461                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2462         dt_write_unlock(env, pobj);
2463         if (rc < 0)
2464                 GOTO(stop, rc);
2465
2466         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2467                        (const struct dt_key *)name, th);
2468         if (rc != 0)
2469                 GOTO(stop, rc);
2470
2471         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2472         if (rc == 0 && cobj != NULL) {
2473                 dt_trans_stop(env, dev, th);
2474                 th = NULL;
2475
2476                 /* The 2nd transaction. */
2477                 rc = __lfsck_layout_update_pfid(env, cobj, pfid,
2478                                                 &rec->lor_layout, ea_off,
2479                                                 rec->lor_layout_version,
2480                                                 rec->lor_range);
2481         }
2482
2483         GOTO(stop, rc);
2484
2485 stop:
2486         if (th != NULL)
2487                 dt_trans_stop(env, dev, th);
2488
2489 unlock:
2490         lfsck_unlock(llh);
2491
2492 log:
2493         if (cobj != NULL && !IS_ERR(cobj))
2494                 lfsck_object_put(env, cobj);
2495         if (pobj != NULL && !IS_ERR(pobj))
2496                 lfsck_object_put(env, pobj);
2497
2498         if (rc < 0)
2499                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to "
2500                        "recreate the lost MDT-object: parent "DFID
2501                        ", child "DFID", OST-index %u, stripe-index %u, "
2502                        "infix %s, type %s: rc = %d\n",
2503                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2504                        ltd->ltd_index, ea_off, infix, type, rc);
2505
2506         return rc >= 0 ? 1 : rc;
2507 }
2508
2509 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2510                                                    struct lfsck_component *com,
2511                                                    const struct lu_fid *fid,
2512                                                    __u32 index)
2513 {
2514         struct lfsck_thread_info *info  = lfsck_env_info(env);
2515         struct lfsck_request     *lr    = &info->lti_lr;
2516         struct lfsck_instance    *lfsck = com->lc_lfsck;
2517         struct lfsck_tgt_desc    *ltd;
2518         struct ptlrpc_request    *req;
2519         struct lfsck_request     *tmp;
2520         struct obd_export        *exp;
2521         int                       rc    = 0;
2522         ENTRY;
2523
2524         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2525         if (unlikely(ltd == NULL))
2526                 RETURN(-ENXIO);
2527
2528         exp = ltd->ltd_exp;
2529         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2530                 GOTO(put, rc = -EOPNOTSUPP);
2531
2532         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2533         if (req == NULL)
2534                 GOTO(put, rc = -ENOMEM);
2535
2536         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2537         if (rc != 0) {
2538                 ptlrpc_request_free(req);
2539
2540                 GOTO(put, rc);
2541         }
2542
2543         memset(lr, 0, sizeof(*lr));
2544         lr->lr_event = LE_CONDITIONAL_DESTROY;
2545         lr->lr_active = LFSCK_TYPE_LAYOUT;
2546         lr->lr_fid = *fid;
2547
2548         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2549         *tmp = *lr;
2550         ptlrpc_request_set_replen(req);
2551
2552         rc = ptlrpc_queue_wait(req);
2553         ptlrpc_req_finished(req);
2554
2555         GOTO(put, rc);
2556
2557 put:
2558         lfsck_tgt_put(ltd);
2559
2560         return rc;
2561 }
2562
2563 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2564                                                   struct lfsck_component *com,
2565                                                   struct lfsck_request *lr)
2566 {
2567         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2568         struct lu_attr                  *la     = &info->lti_la;
2569         union ldlm_policy_data          *policy = &info->lti_policy;
2570         struct ldlm_res_id              *resid  = &info->lti_resid;
2571         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2572         struct dt_device                *dev    = lfsck->li_bottom;
2573         struct lu_fid                   *fid    = &lr->lr_fid;
2574         struct dt_object                *obj;
2575         struct thandle                  *th     = NULL;
2576         struct lustre_handle             lh     = { 0 };
2577         __u64                            flags  = 0;
2578         int                              rc     = 0;
2579         ENTRY;
2580
2581         obj = lfsck_object_find_by_dev(env, dev, fid);
2582         if (IS_ERR(obj))
2583                 RETURN(PTR_ERR(obj));
2584
2585         dt_read_lock(env, obj, 0);
2586         if (dt_object_exists(obj) == 0 ||
2587             lfsck_is_dead_obj(obj)) {
2588                 dt_read_unlock(env, obj);
2589
2590                 GOTO(put, rc = -ENOENT);
2591         }
2592
2593         /* Get obj's attr without lock firstly. */
2594         rc = dt_attr_get(env, obj, la);
2595         dt_read_unlock(env, obj);
2596         if (rc != 0)
2597                 GOTO(put, rc);
2598
2599         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2600                 GOTO(put, rc = -ETXTBSY);
2601
2602         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2603         LASSERT(lfsck->li_namespace != NULL);
2604
2605         memset(policy, 0, sizeof(*policy));
2606         policy->l_extent.end = OBD_OBJECT_EOF;
2607         ost_fid_build_resid(fid, resid);
2608         rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid,
2609                                     LDLM_EXTENT, policy, LCK_EX, &flags,
2610                                     ldlm_blocking_ast, ldlm_completion_ast,
2611                                     NULL, NULL, 0, LVB_T_NONE, NULL, &lh);
2612         if (rc != ELDLM_OK)
2613                 GOTO(put, rc = -EIO);
2614
2615         dt_write_lock(env, obj, 0);
2616         /* Get obj's attr within lock again. */
2617         rc = dt_attr_get(env, obj, la);
2618         if (rc != 0)
2619                 GOTO(unlock, rc);
2620
2621         if (la->la_ctime != 0)
2622                 GOTO(unlock, rc = -ETXTBSY);
2623
2624         th = dt_trans_create(env, dev);
2625         if (IS_ERR(th))
2626                 GOTO(unlock, rc = PTR_ERR(th));
2627
2628         rc = dt_declare_ref_del(env, obj, th);
2629         if (rc != 0)
2630                 GOTO(stop, rc);
2631
2632         rc = dt_declare_destroy(env, obj, th);
2633         if (rc != 0)
2634                 GOTO(stop, rc);
2635
2636         rc = dt_trans_start_local(env, dev, th);
2637         if (rc != 0)
2638                 GOTO(stop, rc);
2639
2640         rc = dt_ref_del(env, obj, th);
2641         if (rc != 0)
2642                 GOTO(stop, rc);
2643
2644         rc = dt_destroy(env, obj, th);
2645         if (rc == 0)
2646                 CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty "
2647                        "OST-object "DFID" that was created for reparing "
2648                        "dangling referenced case. But the original missing "
2649                        "OST-object is found now.\n",
2650                        lfsck_lfsck2name(lfsck), PFID(fid));
2651
2652         GOTO(stop, rc);
2653
2654 stop:
2655         dt_trans_stop(env, dev, th);
2656
2657 unlock:
2658         dt_write_unlock(env, obj);
2659         ldlm_lock_decref(&lh, LCK_EX);
2660
2661 put:
2662         lfsck_object_put(env, obj);
2663
2664         return rc;
2665 }
2666
2667 /**
2668  * Some OST-object has occupied the specified layout EA slot.
2669  * Such OST-object may be generated by the LFSCK when repair
2670  * dangling referenced MDT-object, which can be indicated by
2671  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2672  * is true and such OST-object has not been modified yet, we
2673  * will replace it with the orphan OST-object; otherwise the
2674  * LFSCK will create new MDT-object to reference the orphan.
2675  *
2676  * \retval       +1: repaired
2677  * \retval        0: did nothing
2678  * \retval      -ve: on error
2679  */
2680 static int lfsck_layout_conflict_create(const struct lu_env *env,
2681                                         struct lfsck_component *com,
2682                                         struct lfsck_tgt_desc *ltd,
2683                                         struct lu_orphan_rec_v3 *rec,
2684                                         struct dt_object *parent,
2685                                         struct lu_fid *cfid,
2686                                         struct lu_buf *ea_buf,
2687                                         struct lov_mds_md_v1 *lmm,
2688                                         struct lov_ost_data_v1 *slot,
2689                                         __u32 ea_off, int lovea_size)
2690 {
2691         struct lfsck_thread_info *info          = lfsck_env_info(env);
2692         struct lu_fid            *cfid2         = &info->lti_fid2;
2693         struct ost_id            *oi            = &info->lti_oi;
2694         struct dt_device         *dev           = lfsck_obj2dev(parent);
2695         struct thandle           *th            = NULL;
2696         struct lustre_handle      lh            = { 0 };
2697         __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
2698         int                       rc            = 0;
2699         ENTRY;
2700
2701         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2702                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2703                         RETURN(0);
2704         }
2705
2706         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2707         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2708         if (rc != 0)
2709                 GOTO(out, rc);
2710
2711         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2712                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2713                               LCK_EX);
2714         if (rc != 0)
2715                 GOTO(out, rc);
2716
2717         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2718
2719         /* If the conflict OST-obejct is not created for fixing dangling
2720          * referenced MDT-object in former LFSCK check/repair, or it has
2721          * been modified by others, then we cannot destroy it. Re-create
2722          * a new MDT-object for the orphan OST-object. */
2723         if (rc == -ETXTBSY) {
2724                 /* No need the layout lock on the original parent. */
2725                 lfsck_ibits_unlock(&lh, LCK_EX);
2726
2727                 fid_zero(&rec->lor_rec.lor_fid);
2728                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2729                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2730                          ea_off);
2731                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2732                                                 info->lti_tmpbuf, "C", ea_off);
2733
2734                 RETURN(rc);
2735         }
2736
2737         if (rc != 0 && rc != -ENOENT)
2738                 GOTO(unlock, rc);
2739
2740         th = dt_trans_create(env, dev);
2741         if (IS_ERR(th))
2742                 GOTO(unlock, rc = PTR_ERR(th));
2743
2744         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2745                                   LU_XATTR_REPLACE, th);
2746         if (rc != 0)
2747                 GOTO(stop, rc);
2748
2749         rc = dt_trans_start_local(env, dev, th);
2750         if (rc != 0)
2751                 GOTO(stop, rc);
2752
2753         dt_write_lock(env, parent, 0);
2754         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2755         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2756                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2757                                        ltd->ltd_index, lovea_size);
2758         dt_write_unlock(env, parent);
2759
2760         GOTO(stop, rc);
2761
2762 stop:
2763         dt_trans_stop(env, dev, th);
2764
2765 unlock:
2766         lfsck_ibits_unlock(&lh, LCK_EX);
2767
2768 out:
2769         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict "
2770                "OST-object "DFID" on the OST %x with the orphan "DFID" on "
2771                "the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2772                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2773                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2774                ea_off, rc);
2775
2776         return rc >= 0 ? 1 : rc;
2777 }
2778
2779 /**
2780  * \retval       +1: repaired
2781  * \retval        0: did nothing
2782  * \retval      -ve: on error
2783  */
2784 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2785                                        struct lfsck_component *com,
2786                                        struct lfsck_tgt_desc *ltd,
2787                                        struct lu_orphan_rec_v3 *rec,
2788                                        struct dt_object *parent,
2789                                        struct lu_fid *cfid,
2790                                        __u32 ost_idx, __u32 ea_off)
2791 {
2792         struct lfsck_thread_info *info          = lfsck_env_info(env);
2793         struct lu_buf            *buf           = &info->lti_big_buf;
2794         struct lu_fid            *fid           = &info->lti_fid2;
2795         struct ost_id            *oi            = &info->lti_oi;
2796         struct lfsck_instance    *lfsck         = com->lc_lfsck;
2797         struct dt_device         *dt            = lfsck_obj2dev(parent);
2798         struct lfsck_bookmark    *bk            = &lfsck->li_bookmark_ram;
2799         struct ost_layout        *ol            = &rec->lor_layout;
2800         struct lov_comp_md_v1    *lcm           = NULL;
2801         struct lov_comp_md_entry_v1 *lcme       = NULL;
2802         struct thandle           *handle        = NULL;
2803         size_t                    lovea_size;
2804         struct lov_mds_md_v1     *lmm;
2805         struct lov_ost_data_v1   *objs;
2806         struct lustre_handle      lh            = { 0 };
2807         __u32                     magic;
2808         __u32 flags = 0;
2809         int                       fl            = 0;
2810         int                       rc            = 0;
2811         int                       rc1;
2812         int                       i;
2813         int pos = 0;
2814         __u16 count;
2815         bool locked = false;
2816         bool new_mirror = true;
2817         ENTRY;
2818
2819         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2820                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2821                               LCK_EX);
2822         if (rc != 0) {
2823                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate "
2824                        "LOV EA for "DFID": parent "DFID", OST-index %u, "
2825                        "stripe-index %u, comp_id %u, comp_start %llu, "
2826                        "comp_end %llu, layout version %u, range %u: rc = %d\n",
2827                        lfsck_lfsck2name(lfsck), PFID(cfid),
2828                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2829                        ol->ol_comp_id, ol->ol_comp_start,
2830                        ol->ol_comp_end, rec->lor_layout_version,
2831                        rec->lor_range, rc);
2832
2833                 RETURN(rc);
2834         }
2835
2836 again:
2837         if (locked) {
2838                 dt_write_unlock(env, parent);
2839                 locked = false;
2840         }
2841
2842         if (handle != NULL) {
2843                 dt_trans_stop(env, dt, handle);
2844                 handle = NULL;
2845         }
2846
2847         if (rc < 0)
2848                 GOTO(unlock_layout, rc);
2849
2850         lovea_size = rc;
2851         if (buf->lb_len < lovea_size) {
2852                 lu_buf_realloc(buf, lovea_size);
2853                 if (buf->lb_buf == NULL)
2854                         GOTO(unlock_layout, rc = -ENOMEM);
2855         }
2856
2857         if (!(bk->lb_param & LPF_DRYRUN)) {
2858                 handle = dt_trans_create(env, dt);
2859                 if (IS_ERR(handle))
2860                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2861
2862                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2863                                           fl, handle);
2864                 if (rc != 0)
2865                         GOTO(stop, rc);
2866
2867                 rc = dt_trans_start_local(env, dt, handle);
2868                 if (rc != 0)
2869                         GOTO(stop, rc);
2870         }
2871
2872         dt_write_lock(env, parent, 0);
2873         locked = true;
2874         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2875         if (rc == -ERANGE) {
2876                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2877                 LASSERT(rc != 0);
2878                 goto again;
2879         } else if (rc == -ENODATA || rc == 0) {
2880                 lovea_size = lfsck_lovea_size(ol, ea_off);
2881                 /* If the declared is not big enough, re-try. */
2882                 if (buf->lb_len < lovea_size) {
2883                         rc = lovea_size;
2884                         goto again;
2885                 }
2886                 fl = LU_XATTR_CREATE;
2887         } else if (rc < 0) {
2888                 GOTO(unlock_parent, rc);
2889         } else if (unlikely(buf->lb_len == 0)) {
2890                 goto again;
2891         } else {
2892                 fl = LU_XATTR_REPLACE;
2893                 lovea_size = rc;
2894         }
2895
2896         if (fl == LU_XATTR_CREATE) {
2897                 if (bk->lb_param & LPF_DRYRUN)
2898                         GOTO(unlock_parent, rc = 1);
2899
2900                 LASSERT(buf->lb_len >= lovea_size);
2901
2902                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2903                                                cfid, buf, fl, ost_idx, ea_off);
2904
2905                 GOTO(unlock_parent, rc);
2906         }
2907
2908         lmm = buf->lb_buf;
2909         rc1 = lfsck_layout_verify_header(parent, lmm, lovea_size);
2910
2911         /* If the LOV EA crashed, the rebuild it. */
2912         if (rc1 == -EINVAL) {
2913                 if (bk->lb_param & LPF_DRYRUN)
2914                         GOTO(unlock_parent, rc = 1);
2915
2916                 LASSERT(buf->lb_len >= lovea_size);
2917
2918                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2919                                                cfid, buf, fl, ost_idx, ea_off);
2920
2921                 GOTO(unlock_parent, rc);
2922         }
2923
2924         /* For other unknown magic/pattern, keep the current LOV EA. */
2925         if (rc1 == -EOPNOTSUPP)
2926                 GOTO(unlock_parent, rc1 = 0);
2927
2928         if (rc1)
2929                 GOTO(unlock_parent, rc = rc1);
2930
2931         magic = le32_to_cpu(lmm->lmm_magic);
2932         if (magic == LOV_MAGIC_COMP_V1) {
2933                 __u64 start;
2934                 __u64 end;
2935                 __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id);
2936                 __u16 mirror_id1;
2937
2938                 if (bk->lb_param & LPF_DRYRUN)
2939                         GOTO(unlock_parent, rc = 1);
2940
2941                 lcm = buf->lb_buf;
2942                 count = le16_to_cpu(lcm->lcm_entry_count);
2943                 for (i = 0; i < count; pos = ++i) {
2944                         lcme = &lcm->lcm_entries[i];
2945                         start = le64_to_cpu(lcme->lcme_extent.e_start);
2946                         end = le64_to_cpu(lcme->lcme_extent.e_end);
2947                         mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id));
2948
2949                         if (mirror_id0 > mirror_id1)
2950                                 continue;
2951
2952                         if (mirror_id0 < mirror_id1)
2953                                 break;
2954
2955                         new_mirror = false;
2956                         if (end <= ol->ol_comp_start)
2957                                 continue;
2958
2959                         if (start >= ol->ol_comp_end)
2960                                 break;
2961
2962                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
2963                         magic = le32_to_cpu(lmm->lmm_magic);
2964                         flags = le32_to_cpu(lcme->lcme_flags);
2965                         goto further;
2966                 }
2967
2968                 rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent,
2969                                 cfid, buf, ost_idx, ea_off, pos, new_mirror);
2970
2971                 GOTO(unlock_parent, rc);
2972         }
2973
2974 further:
2975         count = le16_to_cpu(lmm->lmm_stripe_count);
2976         if (count == 0)
2977                 GOTO(unlock_parent, rc = -EINVAL);
2978         LASSERT(count > 0);
2979
2980         /* Exceed the current end of MDT-object layout EA. Then extend it. */
2981         if (count <= ea_off) {
2982                 if (bk->lb_param & LPF_DRYRUN)
2983                         GOTO(unlock_parent, rc = 1);
2984
2985                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
2986                 /* If the declared is not big enough, re-try. */
2987                 if (buf->lb_len < lovea_size) {
2988                         rc = lovea_size;
2989                         goto again;
2990                 }
2991
2992                 if (lcm) {
2993                         LASSERT(lcme);
2994
2995                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
2996                         lfsck_layout_update_lcm(lcm, lcme,
2997                                                 rec->lor_layout_version,
2998                                                 rec->lor_range);
2999                 }
3000
3001                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
3002                                         parent, cfid, buf, ost_idx, ea_off);
3003
3004                 GOTO(unlock_parent, rc);
3005         }
3006
3007         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
3008
3009         if (magic == LOV_MAGIC_V1) {
3010                 objs = &lmm->lmm_objects[0];
3011         } else {
3012                 LASSERT(magic == LOV_MAGIC_V3);
3013                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
3014         }
3015
3016         for (i = 0; i < count; i++, objs++) {
3017                 /* The MDT-object was created via lfsck_layout_recover_create()
3018                  * by others before, and we fill the dummy layout EA. */
3019                 if ((lcme && !(flags & LCME_FL_INIT)) ||
3020                      lovea_slot_is_dummy(objs)) {
3021                         if (i != ea_off)
3022                                 continue;
3023
3024                         if (bk->lb_param & LPF_DRYRUN)
3025                                 GOTO(unlock_parent, rc = 1);
3026
3027                         lmm->lmm_layout_gen =
3028                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3029                         if (lcme) {
3030                                 LASSERT(lcm);
3031
3032                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
3033                                         ol->ol_stripe_size ||
3034                                     le16_to_cpu(lmm->lmm_stripe_count) !=
3035                                         ol->ol_stripe_count ||
3036                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
3037                                         ol->ol_comp_start ||
3038                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
3039                                         ol->ol_comp_end) {
3040                                         CDEBUG(D_LFSCK, "%s: found invalid "
3041                                         "component for "DFID ": parent "DFID
3042                                         ", stripe-index %u, stripe_size %u, "
3043                                         "stripe_count %u, comp_id %u, "
3044                                         "comp_start %llu, comp_end %llu, "
3045                                         "cur_stripe_size %u, "
3046                                         "cur_stripe_count %u, "
3047                                         "cur_comp_start %llu, "
3048                                         "cur_comp_end %llu\n",
3049                                         lfsck_lfsck2name(lfsck), PFID(cfid),
3050                                         PFID(lfsck_dto2fid(parent)), ea_off,
3051                                         ol->ol_stripe_size,
3052                                         ol->ol_stripe_count, ol->ol_comp_id,
3053                                         ol->ol_comp_start, ol->ol_comp_end,
3054                                         le32_to_cpu(lmm->lmm_stripe_size),
3055                                         le16_to_cpu(lmm->lmm_stripe_count),
3056                                         le64_to_cpu(lcme->lcme_extent.e_start),
3057                                         le64_to_cpu(lcme->lcme_extent.e_end));
3058
3059                                         GOTO(unlock_parent, rc = -EINVAL);
3060                                 }
3061
3062                                 lovea_size = le32_to_cpu(lcm->lcm_size);
3063                                 lcme->lcme_flags = cpu_to_le32(flags |
3064                                                                LCME_FL_INIT);
3065                                 lfsck_layout_update_lcm(lcm, lcme,
3066                                                         rec->lor_layout_version,
3067                                                         rec->lor_range);
3068                         }
3069
3070                         LASSERTF(buf->lb_len >= lovea_size,
3071                                  "buffer len %d is less than real size %d\n",
3072                                  (int)buf->lb_len, (int)lovea_size);
3073
3074                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
3075                                                 parent, cfid, buf, lmm, objs,
3076                                                 fl, ost_idx, lovea_size);
3077
3078                         CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill "
3079                                "dummy layout slot for "DFID": parent "DFID
3080                                ", OST-index %u, stripe-index %u: rc = %d\n",
3081                                lfsck_lfsck2name(lfsck), PFID(cfid),
3082                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
3083
3084                         GOTO(unlock_parent, rc);
3085                 }
3086
3087                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3088                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
3089                 if (rc != 0) {
3090                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
3091                                "invalid layout EA at the slot %d, index %u\n",
3092                                lfsck_lfsck2name(lfsck),
3093                                PFID(lfsck_dto2fid(parent)), i,
3094                                le32_to_cpu(objs->l_ost_idx));
3095
3096                         GOTO(unlock_parent, rc);
3097                 }
3098
3099                 /* It should be rare case, the slot is there, but the LFSCK
3100                  * does not handle it during the first-phase cycle scanning. */
3101                 if (unlikely(lu_fid_eq(fid, cfid))) {
3102                         if (i == ea_off) {
3103                                 GOTO(unlock_parent, rc = 0);
3104                         } else {
3105                                 /* Rare case that the OST-object index
3106                                  * does not match the parent MDT-object
3107                                  * layout EA. We trust the later one. */
3108                                 if (bk->lb_param & LPF_DRYRUN)
3109                                         GOTO(unlock_parent, rc = 1);
3110
3111                                 dt_write_unlock(env, parent);
3112                                 if (handle != NULL)
3113                                         dt_trans_stop(env, dt, handle);
3114                                 lfsck_ibits_unlock(&lh, LCK_EX);
3115                                 rc = lfsck_layout_update_pfid(env, com, parent,
3116                                                         cfid, ltd->ltd_tgt,
3117                                                         rec, i);
3118
3119                                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant "
3120                                        "updated OST-object's pfid for "DFID
3121                                        ": parent "DFID", OST-index %u, "
3122                                        "stripe-index %u: rc = %d\n",
3123                                        lfsck_lfsck2name(lfsck), PFID(cfid),
3124                                        PFID(lfsck_dto2fid(parent)),
3125                                        ltd->ltd_index, i, rc);
3126
3127                                 RETURN(rc);
3128                         }
3129                 }
3130         }
3131
3132         /* The MDT-object exists, but related layout EA slot is occupied
3133          * by others. */
3134         if (bk->lb_param & LPF_DRYRUN)
3135                 GOTO(unlock_parent, rc = 1);
3136
3137         dt_write_unlock(env, parent);
3138         if (handle != NULL)
3139                 dt_trans_stop(env, dt, handle);
3140         lfsck_ibits_unlock(&lh, LCK_EX);
3141         if (magic == LOV_MAGIC_V1)
3142                 objs = &lmm->lmm_objects[ea_off];
3143         else
3144                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3145         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3146                                           buf, lmm, objs, ea_off, lovea_size);
3147
3148         RETURN(rc);
3149
3150 unlock_parent:
3151         if (locked)
3152                 dt_write_unlock(env, parent);
3153
3154 stop:
3155         if (handle != NULL)
3156                 dt_trans_stop(env, dt, handle);
3157
3158 unlock_layout:
3159         lfsck_ibits_unlock(&lh, LCK_EX);
3160
3161         return rc;
3162 }
3163
3164 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3165                                         struct lfsck_component *com,
3166                                         struct lfsck_tgt_desc *ltd,
3167                                         struct lu_orphan_rec_v3 *rec,
3168                                         struct lu_fid *cfid)
3169 {
3170         struct lfsck_layout     *lo     = com->lc_file_ram;
3171         struct lu_fid           *pfid   = &rec->lor_rec.lor_fid;
3172         struct dt_object        *parent = NULL;
3173         __u32                    ea_off = pfid->f_stripe_idx;
3174         int                      rc     = 0;
3175         ENTRY;
3176
3177         if (!fid_is_sane(cfid))
3178                 GOTO(out, rc = -EINVAL);
3179
3180         pfid->f_ver = 0;
3181         if (fid_is_zero(pfid)) {
3182                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3183                                                   "", "N", ea_off);
3184                 GOTO(out, rc);
3185         }
3186
3187         if (!fid_is_sane(pfid))
3188                 GOTO(out, rc = -EINVAL);
3189
3190         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3191         if (IS_ERR(parent))
3192                 GOTO(out, rc = PTR_ERR(parent));
3193
3194         if (unlikely(dt_object_remote(parent) != 0))
3195                 GOTO(put, rc = -EXDEV);
3196
3197         if (dt_object_exists(parent) == 0) {
3198                 lfsck_object_put(env, parent);
3199                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3200                                                   "", "R", ea_off);
3201                 GOTO(out, rc);
3202         }
3203
3204         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3205                 GOTO(put, rc = -EISDIR);
3206
3207         /* The orphan OST-object claims to be the parent's stripe, then
3208          * related dangling record in the trace file is meaningless. */
3209         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3210                                            rec->lor_layout.ol_comp_id, ea_off);
3211         if (rc && rc != -ENOENT)
3212                 GOTO(put, rc);
3213
3214         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3215                                          ltd->ltd_index, ea_off);
3216
3217         GOTO(put, rc);
3218
3219 put:
3220         if (rc <= 0)
3221                 lfsck_object_put(env, parent);
3222         else
3223                 /* The layout EA is changed, need to be reloaded next time. */
3224                 dt_object_put_nocache(env, parent);
3225
3226 out:
3227         down_write(&com->lc_sem);
3228         com->lc_new_scanned++;
3229         com->lc_new_checked++;
3230         if (rc > 0) {
3231                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3232                 rc = 0;
3233         } else if (rc < 0) {
3234                 lo->ll_objs_failed_phase2++;
3235         }
3236         up_write(&com->lc_sem);
3237
3238         return rc;
3239 }
3240
3241 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3242                                     struct lfsck_component *com,
3243                                     struct lfsck_tgt_desc *ltd)
3244 {
3245         struct lfsck_assistant_data     *lad    = com->lc_data;
3246         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3247         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
3248         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3249         struct lu_fid                   *fid    = &info->lti_fid;
3250         struct dt_object                *obj;
3251         const struct dt_it_ops          *iops;
3252         struct dt_it                    *di;
3253         int                              rc     = 0;
3254         ENTRY;
3255
3256         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan "
3257                "scanning for OST%04x\n",
3258                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3259
3260         if (cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) {
3261                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan "
3262                        "scanning for OST%04x\n",
3263                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3264
3265                 RETURN(0);
3266         }
3267
3268         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3269         fid->f_oid = fid->f_ver = 0;
3270
3271         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3272         if (unlikely(IS_ERR(obj)))
3273                 GOTO(log, rc = PTR_ERR(obj));
3274
3275         rc = obj->do_ops->do_index_try(env, obj,
3276                                        &dt_lfsck_layout_orphan_features);
3277         if (rc != 0)
3278                 GOTO(put, rc);
3279
3280         iops = &obj->do_index_ops->dio_it;
3281         di = iops->init(env, obj, 0);
3282         if (IS_ERR(di))
3283                 GOTO(put, rc = PTR_ERR(di));
3284
3285         rc = iops->load(env, di, 0);
3286         if (rc == -ESRCH) {
3287                 /* -ESRCH means that the orphan OST-objects rbtree has been
3288                  * cleanup because of the OSS server restart or other errors. */
3289                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3290                 GOTO(fini, rc);
3291         }
3292
3293         if (rc == 0)
3294                 rc = iops->next(env, di);
3295         else if (rc > 0)
3296                 rc = 0;
3297
3298         if (rc < 0)
3299                 GOTO(fini, rc);
3300
3301         if (rc > 0)
3302                 GOTO(fini, rc = 0);
3303
3304         do {
3305                 struct dt_key           *key;
3306                 struct lu_orphan_rec_v3 *rec = &info->lti_rec;
3307
3308                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3309                     unlikely(!thread_is_running(&lfsck->li_thread)))
3310                         break;
3311
3312                 key = iops->key(env, di);
3313                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3314                 /* Remote target OST may be runnning old LFSCK */
3315                 memset(rec, 0, sizeof(*rec));
3316                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3317                 if (rc == 0)
3318                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3319                                         &com->lc_fid_latest_scanned_phase2);
3320                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3321                         GOTO(fini, rc);
3322
3323                 lfsck_control_speed_by_self(com);
3324                 do {
3325                         rc = iops->next(env, di);
3326                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3327         } while (rc == 0);
3328
3329         GOTO(fini, rc);
3330
3331 fini:
3332         iops->put(env, di);
3333         iops->fini(env, di);
3334 put:
3335         lfsck_object_put(env, obj);
3336
3337 log:
3338         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan "
3339                "scanning for OST%04x: rc = %d\n",
3340                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3341
3342         return rc > 0 ? 0 : rc;
3343 }
3344
3345 static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff,
3346                             __u32 comp_id)
3347 {
3348         struct ost_layout *ol = &ff->ff_layout;
3349         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3350         int rc = 0;
3351         ENTRY;
3352
3353         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3354                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3355                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3356                 ol->ol_comp_start = 0;
3357                 ol->ol_comp_end = 0;
3358                 ol->ol_comp_id = 0;
3359                 ff->ff_layout_version = 0;
3360                 ff->ff_range = 0;
3361         } else if (magic == LOV_MAGIC_COMP_V1) {
3362                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3363                 struct lov_comp_md_entry_v1 *lcme = NULL;
3364                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3365                 int i;
3366
3367                 for (i = 0; i < count; i++) {
3368                         lcme = &lcm->lcm_entries[i];
3369                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3370                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3371                                         LCME_FL_INIT);
3372
3373                                 break;
3374                         }
3375                 }
3376
3377                 /* The comp has been removed, do nothing. */
3378                 if (i == count)
3379                         GOTO(out, rc = 1);
3380
3381                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3382                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3383                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3384                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3385                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3386                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3387                 ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen);
3388                 ff->ff_range = 0;
3389         } else {
3390                 GOTO(out, rc = -EINVAL);
3391         }
3392
3393         EXIT;
3394
3395 out:
3396         return rc;
3397 }
3398
3399 /**
3400  * Repair the MDT-object with dangling LOV EA reference.
3401  *
3402  * we need to repair the inconsistency according to the users' requirement:
3403  *
3404  * 1) Keep the inconsistency there and report the inconsistency case,
3405  *    then give the chance to the application to find related issues,
3406  *    and the users can make the decision about how to handle it with
3407  *    more human knownledge. (by default)
3408  *
3409  * 2) Re-create the missing OST-object with the FID/owner information.
3410  *
3411  * \param[in] env       pointer to the thread context
3412  * \param[in] com       the layout LFSCK component
3413  * \param[in] parent    the MDT-object with dangling LOV EA reference
3414  * \param[in] child     the OST-object to be created
3415  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3416  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3417  * \param[in] ost_idx   the index of OST on which the OST-object resides
3418  *
3419  * \retval              +1 for repair successfully
3420  * \retval              0 for did nothing
3421  * \retval              negative error number on failure
3422  */
3423 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3424                                           struct lfsck_component *com,
3425                                           struct dt_object *parent,
3426                                           struct dt_object *child,
3427                                           __u32 comp_id, __u32 ea_off,
3428                                           __u32 ost_idx, bool log)
3429 {
3430         struct lfsck_thread_info *info = lfsck_env_info(env);
3431         struct filter_fid *ff = &info->lti_ff;
3432         struct dt_object_format *dof = &info->lti_dof;
3433         struct lu_attr *la = &info->lti_la;
3434         struct lfsck_instance *lfsck = com->lc_lfsck;
3435         struct dt_device *dev = lfsck_obj2dev(child);
3436         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3437         const struct lu_fid *cfid = lfsck_dto2fid(child);
3438         struct lu_buf *tbuf = &info->lti_big_buf;
3439         struct thandle *handle;
3440         struct lu_buf *buf;
3441         struct lustre_handle lh = { 0 };
3442         int rc;
3443         ENTRY;
3444
3445         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3446                 GOTO(log, rc = 1);
3447
3448         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3449                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3450                               LCK_EX);
3451         if (rc != 0)
3452                 GOTO(log, rc);
3453
3454         rc = dt_attr_get(env, parent, la);
3455         if (rc != 0)
3456                 GOTO(unlock1, rc);
3457
3458         la->la_mode = S_IFREG | 0666;
3459         la->la_atime = la->la_mtime = la->la_ctime = 0;
3460         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3461                        LA_ATIME | LA_MTIME | LA_CTIME;
3462         memset(dof, 0, sizeof(*dof));
3463         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3464         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3465         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3466          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3467          * parent MDT-object's layout EA. */
3468         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3469
3470         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3471         if (unlikely(rc == -ENODATA))
3472                 rc = 0;
3473         if (rc <= 0)
3474                 GOTO(unlock1, rc);
3475
3476         rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id);
3477         if (rc)
3478                 GOTO(unlock1, rc);
3479
3480         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3481         handle = dt_trans_create(env, dev);
3482         if (IS_ERR(handle))
3483                 GOTO(unlock1, rc = PTR_ERR(handle));
3484
3485         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3486         if (rc != 0)
3487                 GOTO(stop, rc);
3488
3489         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3490                                   LU_XATTR_CREATE, handle);
3491         if (rc != 0)
3492                 GOTO(stop, rc);
3493
3494         rc = dt_trans_start_local(env, dev, handle);
3495         if (rc != 0)
3496                 GOTO(stop, rc);
3497
3498         dt_read_lock(env, parent, 0);
3499         if (unlikely(lfsck_is_dead_obj(parent)))
3500                 GOTO(unlock2, rc = 0);
3501
3502         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3503                 struct ost_id *oi = &info->lti_oi;
3504                 struct lu_fid *tfid = &info->lti_fid2;
3505                 struct lu_buf *lovea = &info->lti_big_buf;
3506                 struct lov_mds_md_v1 *lmm;
3507                 struct lov_ost_data_v1 *objs;
3508                 __u32 magic;
3509                 int count;
3510                 int idx2;
3511
3512                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3513                 if (unlikely(rc == -ENODATA))
3514                         rc = 0;
3515                 if (rc <= 0)
3516                         GOTO(unlock2, rc);
3517
3518                 lmm = lovea->lb_buf;
3519                 magic = le32_to_cpu(lmm->lmm_magic);
3520                 if (magic == LOV_MAGIC_COMP_V1) {
3521                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3522                         struct lov_comp_md_entry_v1 *lcme;
3523                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3524                         int i;
3525
3526                         for (i = 0; i < count; i++) {
3527                                 lcme = &lcm->lcm_entries[i];
3528                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3529                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3530                                                 LCME_FL_INIT);
3531
3532                                         lmm = lovea->lb_buf +
3533                                                 le32_to_cpu(lcme->lcme_offset);
3534                                         magic = le32_to_cpu(lmm->lmm_magic);
3535                                         goto check;
3536                                 }
3537                         }
3538
3539                         /* Someone removed the component, do nothing. */
3540                         GOTO(unlock2, rc = 0);
3541                 }
3542
3543 check:
3544                 count = le16_to_cpu(lmm->lmm_stripe_count);
3545                 /* Someone changed the LOV EA, do nothing. */
3546                 if (count <= ea_off)
3547                         GOTO(unlock2, rc = 0);
3548
3549                 if (magic == LOV_MAGIC_V1) {
3550                         objs = &lmm->lmm_objects[ea_off];
3551                 } else {
3552                         LASSERT(magic == LOV_MAGIC_V3);
3553
3554                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3555                                                         lmm_objects[ea_off];
3556                 }
3557
3558                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3559                 idx2 = le32_to_cpu(objs->l_ost_idx);
3560                 rc = ostid_to_fid(tfid, oi, idx2);
3561                 /* Someone changed the LOV EA, do nothing. */
3562                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3563                         GOTO(unlock2, rc);
3564         }
3565
3566         rc = dt_create(env, child, la, NULL, dof, handle);
3567         if (rc != 0)
3568                 GOTO(unlock2, rc);
3569
3570         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3571                           handle);
3572
3573         GOTO(unlock2, rc);
3574
3575 unlock2:
3576         dt_read_unlock(env, parent);
3577
3578 stop:
3579         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3580
3581 unlock1:
3582         lfsck_ibits_unlock(&lh, LCK_EX);
3583
3584 log:
3585         if (rc && log)
3586                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3587                        "dangling reference for: parent "DFID", child "
3588                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: "
3589                        "rc = %d\n",
3590                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3591                        comp_id, ea_off, ost_idx,
3592                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3593                                 "Create the lost OST-object as required" :
3594                                 "Keep the MDT-object there by default", rc);
3595
3596         return rc;
3597 }
3598
3599 /**
3600  * Repair the MDT-object with dangling LOV EA reference.
3601  *
3602  * Prepare parameters and call __lfsck_layout_repair_dangling()
3603  * to repair the dangling LOV EA reference.
3604  *
3605  * \param[in] env       pointer to the thread context
3606  * \param[in] com       the layout LFSCK component
3607  * \param[in] pfid      the MDT-object's FID
3608  * \param[in] cfid      the FID for the OST-object to be created
3609  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3610  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3611  * \param[in] ost_idx   the index of OST on which the OST-object resides
3612  *
3613  * \retval              +1 for repair successfully
3614  * \retval              0 for did nothing
3615  * \retval              negative error number on failure
3616  */
3617 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3618                                         struct lfsck_component *com,
3619                                         const struct lu_fid *pfid,
3620                                         const struct lu_fid *cfid,
3621                                         __u32 comp_id, __u32 ea_off,
3622                                         __u32 ost_idx)
3623 {
3624         struct lfsck_instance *lfsck = com->lc_lfsck;
3625         struct dt_object *parent = NULL;
3626         struct dt_object *child = NULL;
3627         struct lfsck_tgt_desc *ltd;
3628         int rc;
3629         ENTRY;
3630
3631         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3632         if (IS_ERR(parent))
3633                 GOTO(log, rc = PTR_ERR(parent));
3634
3635         /* The MDT-object has been removed. */
3636         if (dt_object_exists(parent) == 0)
3637                 GOTO(log, rc = 0);
3638
3639         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3640         if (unlikely(ltd == NULL))
3641                 GOTO(log, rc = -ENODEV);
3642
3643         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3644         if (IS_ERR(child))
3645                 GOTO(log, rc = PTR_ERR(child));
3646
3647         /* The OST-object has been created. */
3648         if (unlikely(dt_object_exists(child) != 0))
3649                 GOTO(log, rc = 0);
3650
3651         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3652                                             comp_id, ea_off, ost_idx, false);
3653
3654         GOTO(log, rc);
3655
3656 log:
3657         if (child != NULL && !IS_ERR(child))
3658                 lfsck_object_put(env, child);
3659
3660         if (parent != NULL && !IS_ERR(parent))
3661                 lfsck_object_put(env, parent);
3662
3663         if (rc)
3664                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3665                        "dangling reference for: parent "DFID", child "
3666                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3667                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3668                        comp_id, ea_off, ost_idx,
3669                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3670                                 "Create the lost OST-object as required" :
3671                                 "Keep the MDT-object there by default", rc);
3672
3673         return rc;
3674 }
3675
3676 /* If the OST-object does not recognize the MDT-object as its parent, and
3677  * there is no other MDT-object that claims to be its parent, then just trust
3678  * the given MDT-object as its parent. So update the OST-object filter_fid.
      *
      * The new filter_fid is seeded with the parent's FID and llr->llr_lov_idx,
      * passed through lfsck_lov2layout() together with the parent's LOV EA and
      * llr->llr_comp_id, and then stored on the child as XATTR_NAME_FID. The
      * child's UID/GID are set from the parent's attributes in the same
      * transaction.
      *
      * env:    execution environment
      * com:    layout LFSCK component; only com->lc_lfsck is used here
      * parent: the MDT-object to be recorded as the child's parent
      * llr:    request supplying the child (llr_child), the stripe index
      *         (llr_lov_idx), the component ID and the OST index
      * la:     attribute buffer; once the transaction is reached it is
      *         overwritten with the parent's attributes and la_valid is
      *         reduced to LA_UID | LA_GID
      *
      * Returns the error of a step that fails before the transaction is
      * created, 0 when lfsck_layout_get_lovea() reports -ENODATA or 0, and
      * otherwise whatever lfsck_layout_trans_stop() makes of the result
      * accumulated inside the transaction. */
3679 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3680                                               struct lfsck_component *com,
3681                                               struct dt_object *parent,
3682                                               struct lfsck_layout_req *llr,
3683                                               struct lu_attr *la)
3684 {
3685         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3686         struct filter_fid               *ff     = &info->lti_ff;
3687         struct dt_object                *child  = llr->llr_child;
3688         struct dt_device                *dev    = lfsck_obj2dev(child);
3689         const struct lu_fid             *tfid   = lu_object_fid(&parent->do_lu);
3690         struct lu_buf                   *tbuf   = &info->lti_big_buf;
3691         struct thandle                  *handle;
3692         struct lu_buf                   *buf;
3693         struct lustre_handle             lh     = { 0 };
3694         int                              rc;
3695         ENTRY;
3696
             /* Hold the parent's LAYOUT and XATTR ibits locks in EX mode for
              * the whole repair; they are dropped again at unlock1. */
3697         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3698                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3699                               LCK_EX);
3700         if (rc != 0)
3701                 GOTO(log, rc);
3702
             /* Seed the filter_fid with the parent FID, in little-endian. */
3703         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3704         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3705         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3706          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3707          * parent MDT-object's layout EA. */
3708         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3709
             /* Load the parent's LOV EA into the per-thread big buffer.
              * -ENODATA is folded into 0; with rc <= 0 there is no LOV EA
              * content to hand to lfsck_lov2layout(), so stop here and return
              * that rc (0 or a negative error) without touching the child. */
3710         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3711         if (unlikely(rc == -ENODATA))
3712                 rc = 0;
3713         if (rc <= 0)
3714                 GOTO(unlock1, rc);
3715
             /* NOTE(review): presumably fills the layout part of ff from the
              * LOV EA for the given component - helper is not visible here. */
3716         rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id);
3717         if (rc)
3718                 GOTO(unlock1, rc);
3719
3720         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3721
             /* The transaction runs on the device that hosts the child. */
3722         handle = dt_trans_create(env, dev);
3723         if (IS_ERR(handle))
3724                 GOTO(unlock1, rc = PTR_ERR(handle));
3725
3726         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3727         if (rc != 0)
3728                 GOTO(stop, rc);
3729
             /* Fetch the parent's attributes; only the owner (UID/GID) is
              * declared for update on the child. */
3730         rc = dt_attr_get(env, parent, la);
3731         if (rc != 0)
3732                 GOTO(stop, rc);
3733
3734         la->la_valid = LA_UID | LA_GID;
3735         rc = dt_declare_attr_set(env, child, la, handle);
3736         if (rc != 0)
3737                 GOTO(stop, rc);
3738
3739         rc = dt_trans_start_local(env, dev, handle);
3740         if (rc != 0)
3741                 GOTO(stop, rc);
3742
             /* The parent's dt write lock is held across the dead-object check
              * and both child updates; it is released at unlock2. */
3743         dt_write_lock(env, parent, 0);
             /* NOTE(review): rc = 1 is passed on to lfsck_layout_trans_stop()
              * at the stop label; presumably a positive result tells it that
              * nothing needs to be done - confirm against that helper. */
3744         if (unlikely(lfsck_is_dead_obj(parent)))
3745                 GOTO(unlock2, rc = 1);
3746
3747         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3748         if (rc != 0)
3749                 GOTO(unlock2, rc);
3750
3751         /* Get the latest parent's owner. */
3752         rc = dt_attr_get(env, parent, la);
3753         if (rc != 0)
3754                 GOTO(unlock2, rc);
3755
3756         la->la_valid = LA_UID | LA_GID;
3757         rc = dt_attr_set(env, child, la, handle);
3758
3759         GOTO(unlock2, rc);
3760
3761 unlock2:
3762         dt_write_unlock(env, parent);
3763
3764 stop:
             /* From here on rc is whatever the helper returns for the result
              * accumulated so far. */
3765         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3766
3767 unlock1:
3768         lfsck_ibits_unlock(&lh, LCK_EX);
3769
3770 log:
             /* Printed for every non-zero rc, positive or negative. */
3771         if (rc)
3772                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3773                        "unmatched MDT-OST pair for: parent "DFID
3774                        ", child "DFID", comp_id %u, OST-index %u, "
3775                        "stripe-index %u, owner %u/%u: rc = %d\n",
3776                        lfsck_lfsck2name(com->lc_lfsck),
3777                        PFID(lfsck_dto2fid(parent)),
3778                        PFID(lfsck_dto2fid(child)),
3779                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3780                        la->la_uid, la->la_gid, rc);
3781
3782         return rc;
3783 }
3784
3785 /* If more than one MDT-object claims to be the OST-object's parent,
3786  * and the OST-object only recognizes one of them, then we need to generate
3787  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s).
      *
      * env:    execution environment
      * com:    layout LFSCK component; only com->lc_lfsck is used here
      * parent: the MDT-object whose LOV EA is to reference the new OST-object;
      *         inside the function it is re-pointed to the result of
      *         lfsck_object_locate() on lfsck->li_next
      * llr:    request supplying the currently referenced child (llr_child),
      *         the stripe index, the component ID and the OST index
      * la:     attribute buffer; overwritten with the parent's attributes,
      *         la_valid reduced to LA_UID | LA_GID, then used to create the
      *         new OST-object
      *
      * Returns 1 if the parent's LOV EA was replaced, 0 if the second
      * transaction left the LOV EA untouched (parent dead, no LOV EA, the
      * component is not found, or the slot no longer references
      * llr->llr_child), or the error code of the step that failed. */
3788 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3789                                                    struct lfsck_component *com,
3790                                                    struct dt_object *parent,
3791                                                    struct lfsck_layout_req *llr,
3792                                                    struct lu_attr *la)
3793 {
3794         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3795         struct dt_allocation_hint       *hint   = &info->lti_hint;
3796         struct dt_object_format         *dof    = &info->lti_dof;
3797         struct ost_id                   *oi     = &info->lti_oi;
3798         struct lu_buf                   *buf    = &info->lti_big_buf;
3799         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3800         struct dt_device                *dev;
3801         struct lu_device                *d      =
3802                                 &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3803         struct lu_object                *o;
3804         struct lu_object                *n;
3805         struct dt_object                *child  = NULL;
3806         struct thandle                  *handle = NULL;
3807         struct lov_mds_md_v1            *lmm;
3808         struct lov_ost_data_v1          *objs;
             /* Captured up front for the log message, because parent is
              * re-pointed before the second transaction. */
3809         const struct lu_fid             *pfid   = lfsck_dto2fid(parent);
3810         struct lu_fid                    tfid;
3811         struct lustre_handle             lh     = { 0 };
3812         __u32                            magic;
3813         __u32                            index;
3814         int                              rc;
3815         ENTRY;
3816
3817         /* We use two separated transactions to repair the inconsistency.
3818          *
3819          * 1) create the child (OST-object).
3820          * 2) update the parent LOV EA according to the child's FID.
3821          *
3822          * If 1) succeed, but 2) failed or aborted, then such OST-object will be
3823          * handled as orphan when the layout LFSCK run next time.
3824          *
3825          * If 1) failed, but 2) succeed, then such OST-object will be re-created
3826          * as dangling referenced case when the layout LFSCK run next time. */
3827
3828         /* The 1st transaction. */
             /* Get a new object from lu_object_anon() on the device that hosts
              * the currently referenced child. */
3829         o = lu_object_anon(env, d, NULL);
3830         if (IS_ERR(o))
3831                 GOTO(log, rc = PTR_ERR(o));
3832
             /* lu_object_locate() found nothing for d->ld_type: release the new
              * object and fail. child is still NULL, so the log label does not
              * put it a second time. */
3833         n = lu_object_locate(o->lo_header, d->ld_type);
3834         if (unlikely(n == NULL)) {
3835                 lu_object_put_nocache(env, o);
3836
3837                 GOTO(log, rc = -EINVAL);
3838         }
3839
             /* From here on the new object is released through child at the
              * log label. */
3840         child = container_of(n, struct dt_object, do_lu);
3841         memset(hint, 0, sizeof(*hint));
3842         rc = dt_attr_get(env, parent, la);
3843         if (rc != 0)
3844                 GOTO(log, rc);
3845
             /* Only the parent's owner is applied to the new object. */
3846         la->la_valid = LA_UID | LA_GID;
3847         memset(dof, 0, sizeof(*dof));
3848
3849         dev = lfsck_obj2dev(child);
3850         handle = dt_trans_create(env, dev);
3851         if (IS_ERR(handle))
3852                 GOTO(log, rc = PTR_ERR(handle));
3853
3854         rc = dt_declare_create(env, child, la, hint, dof, handle);
3855         if (rc != 0)
3856                 GOTO(stop, rc);
3857
3858         rc = dt_trans_start_local(env, dev, handle);
3859         if (rc != 0)
3860                 GOTO(stop, rc);
3861
             /* The first transaction is stopped regardless of rc and the
              * return value of dt_trans_stop() is not checked. handle is
              * cleared so that nothing later treats it as a live transaction. */
3862         rc = dt_create(env, child, la, hint, dof, handle);
3863         dt_trans_stop(env, dev, handle);
3864         handle = NULL;
3865         if (rc != 0)
3866                 GOTO(log, rc);
3867
             /* Hold the parent's LAYOUT and XATTR ibits locks in EX mode while
              * its LOV EA is rewritten; dropped at the log label. */
3868         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3869                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3870                               LCK_EX);
3871         if (rc != 0)
3872                 GOTO(log, rc);
3873
3874         /* The 2nd transaction. */
3875
3876         /* XXX: Generally, we should use bottom device (OSD) to update parent
3877          *      LOV EA. But because the LOD-object still references the wrong
3878          *      OSP-object that should be detached after the parent's LOV EA
3879          *      refreshed. Unfortunately, there is no suitable API for that.
3880          *      So we have to make the LOD to re-load the OSP-object(s) via
3881          *      replacing the LOV EA against the LOD-object.
3882          *
3883          *      Once the DNE2 patches have been landed, we can replace the
3884          *      LOD device with the OSD device. LU-6230. */
3885
             /* dev and parent now refer to lfsck->li_next, not to the device
              * used by the first transaction. */
3886         dev = lfsck->li_next;
3887         parent = lfsck_object_locate(dev, parent);
3888         if (IS_ERR(parent))
3889                 GOTO(log, rc = PTR_ERR(parent));
3890
3891         handle = dt_trans_create(env, dev);
3892         if (IS_ERR(handle))
3893                 GOTO(log, rc = PTR_ERR(handle));
3894
             /* NOTE(review): buf (the per-thread big buffer) is not loaded in
              * this function before the declaration; the LOV EA is only read
              * into it under the write lock below. Presumably it still holds
              * an earlier copy of the LOV EA - confirm. */
3895         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3896                                   LU_XATTR_REPLACE, handle);
3897         if (rc != 0)
3898                 GOTO(stop, rc);
3899
3900         rc = dt_trans_start_local(env, dev, handle);
3901         if (rc != 0)
3902                 GOTO(stop, rc);
3903
3904         dt_write_lock(env, parent, 0);
3905         if (unlikely(lfsck_is_dead_obj(parent)))
3906                 GOTO(unlock, rc = 0);
3907
             /* Read the current LOV EA under the lock. -ENODATA is folded into
              * 0; with rc <= 0 there is nothing to rewrite. */
3908         rc = lfsck_layout_get_lovea(env, parent, buf);
3909         if (unlikely(rc == -ENODATA))
3910                 rc = 0;
3911         if (rc <= 0)
3912                 GOTO(unlock, rc);
3913
3914         lmm = buf->lb_buf;
3915         magic = le32_to_cpu(lmm->lmm_magic);
             /* Composite layout: find the entry whose ID is llr->llr_comp_id,
              * bump the composite layout generation and continue with the
              * plain layout stored at that entry's offset. */
3916         if (magic == LOV_MAGIC_COMP_V1) {
3917                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
3918                 struct lov_comp_md_entry_v1 *lcme;
3919                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3920                 int i;
3921
3922                 LASSERT(llr->llr_comp_id != 0);
3923
3924                 for (i = 0; i < count; i++) {
3925                         lcme = &lcm->lcm_entries[i];
3926                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
3927                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3928                                         LCME_FL_INIT);
3929
3930                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
3931                                 lmm = buf->lb_buf +
3932                                         le32_to_cpu(lcme->lcme_offset);
3933                                 magic = le32_to_cpu(lmm->lmm_magic);
3934                                 goto set;
3935                         }
3936                 }
3937
                     /* No entry carries the requested component ID. */
3938                 GOTO(unlock, rc = 0);
3939         }
3940
3941 set:
             /* NOTE(review): llr->llr_lov_idx is not checked in this function
              * against the stripe count of the EA that was just loaded -
              * confirm that it cannot be out of range here. */
3942         if (magic == LOV_MAGIC_V1) {
3943                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
3944         } else {
3945                 LASSERT(magic == LOV_MAGIC_V3);
3946                 objs =
3947                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
3948         }
3949
3950         ostid_le_to_cpu(&objs->l_ost_oi, oi);
3951         index = le32_to_cpu(objs->l_ost_idx);
3952         rc = ostid_to_fid(&tfid, oi, index);
3953         /* Someone changed layout during the LFSCK, no need to repair then. */
             /* NOTE(review): when ostid_to_fid() fails (rc != 0) this check is
              * skipped and the slot is rewritten anyway - confirm that this is
              * intended. */
3954         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
3955                 GOTO(unlock, rc = 0);
3956
             /* Bump the layout generation and make the slot reference the
              * newly created object on OST llr->llr_ost_idx. */
3957         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3958         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
3959         ostid_cpu_to_le(oi, &objs->l_ost_oi);
3960         objs->l_ost_gen = cpu_to_le32(0);
3961         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
3962         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3963                           LU_XATTR_REPLACE, handle);
3964
             /* A successful replace is reported as 1. */
3965         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
3966
3967 unlock:
3968         dt_write_unlock(env, parent);
3969
3970 stop:
             /* The return value of dt_trans_stop() is not checked here. */
3971         if (handle != NULL)
3972                 dt_trans_stop(env, dev, handle);
3973
3974 log:
             /* NOTE(review): this label is also reached before
              * lfsck_ibits_lock() was called, with lh still zeroed; this
              * relies on lfsck_ibits_unlock() tolerating an unused handle -
              * confirm. */
3975         lfsck_ibits_unlock(&lh, LCK_EX);
3976         if (child != NULL)
3977                 lfsck_object_put(env, child);
3978
             /* Printed for every non-zero rc, positive or negative. */
3979         if (rc)
3980                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3981                        "multiple references for: parent "DFID", comp_id %u, "
3982                        "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
3983                        lfsck_lfsck2name(lfsck), PFID(pfid),
3984                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3985                        la->la_uid, la->la_gid, rc);
3986
3987         return rc;
3988 }
3989
3990 /* If the MDT-object and the OST-object have different owner information,
3991  * then trust the MDT-object, because the normal chown/chgrp handle order
3992  * is from MDT to OST, and it is possible that some chown/chgrp operation
3993  * is partly done.
      *
      * env:    execution environment
      * com:    layout LFSCK component; only com->lc_lfsck is used here
      * parent: the MDT-object whose owner is trusted
      * llr:    request supplying the child OST-object (llr_child) and the
      *         indices printed in the log message
      * pla:    the parent's attributes as seen by the caller; re-read from
      *         the parent under the dt read lock inside the function
      * cla:    the child's attributes; only used to print the old owner
      *
      * Returns PTR_ERR() of the handle if the transaction cannot be created,
      * otherwise whatever lfsck_layout_trans_stop() makes of the result
      * accumulated inside the transaction. */
3994 static int lfsck_layout_repair_owner(const struct lu_env *env,
3995                                      struct lfsck_component *com,
3996                                      struct dt_object *parent,
3997                                      struct lfsck_layout_req *llr,
3998                                      struct lu_attr *pla,
3999                                      const struct lu_attr *cla)
4000 {
4001         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4002         struct lu_attr                  *tla    = &info->lti_la2;
4003         struct dt_object                *child  = llr->llr_child;
4004         struct dt_device                *dev    = lfsck_obj2dev(child);
4005         struct thandle                  *handle;
4006         int                              rc;
4007         dt_obj_version_t                 version;
4008         ENTRY;
4009
             /* Snapshot the owner the caller saw on the parent; this is what
              * gets declared and, if still current, written to the child. */
4010         tla->la_uid = pla->la_uid;
4011         tla->la_gid = pla->la_gid;
4012         tla->la_valid = LA_UID | LA_GID;
4013         handle = dt_trans_create(env, dev);
4014         if (IS_ERR(handle))
4015                 GOTO(log, rc = PTR_ERR(handle));
4016
4017         rc = dt_declare_attr_set(env, child, tla, handle);
4018         if (rc != 0)
4019                 GOTO(stop, rc);
4020
4021         rc = dt_trans_start_local(env, dev, handle);
4022         if (rc != 0)
4023                 GOTO(stop, rc);
4024
4025         /* Use the dt_object lock to serialize with destroy and attr_set. */
4026         dt_read_lock(env, parent, 0);
             /* NOTE(review): rc = 1 is passed on to lfsck_layout_trans_stop()
              * at the stop label; presumably a positive result tells it that
              * nothing needs to be done - confirm against that helper. */
4027         if (unlikely(lfsck_is_dead_obj(parent)))
4028                 GOTO(unlock, rc = 1);
4029
             /* A backend reporting -EOPNOTSUPP is treated as version 0. */
4030         version = dt_version_get(env, child);
4031         if (version == -EOPNOTSUPP)
4032                 version = 0;
4033
4034         /* Get the latest parent's owner. */
4035         rc = dt_attr_get(env, parent, pla);
4036         if (rc != 0)
4037                 GOTO(unlock, rc);
4038
4039         /* Some others chown/chgrp during the LFSCK, needs to do nothing. */
             /* The owner comparison checks the snapshot in tla against the
              * freshly read pla.
              * NOTE(review): tla->la_ctime is not assigned in this function;
              * tla points to the per-thread lti_la2 scratch buffer, so the
              * ctime test reads whatever an earlier user left there -
              * confirm which ctime was intended. */
4040         if (unlikely((!version && tla->la_ctime == 0) ||
4041                      tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid))
4042                 rc = 1;
4043         else
4044                 rc = dt_attr_set(env, child, tla, handle);
4045
4046         GOTO(unlock, rc);
4047
4048 unlock:
4049         dt_read_unlock(env, parent);
4050
4051 stop:
             /* From here on rc is whatever the helper returns for the result
              * accumulated so far. */
4052         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4053
4054 log:
             /* Printed for every non-zero rc, positive or negative. */
4055         if (rc != 0)
4056                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4057                        "inconsistent file owner for: parent "DFID", child "DFID
4058                        ", OST-index %u, stripe-index %u, old owner %u/%u, "
4059                        "new owner %u/%u: rc = %d\n",
4060                        lfsck_lfsck2name(com->lc_lfsck),
4061                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4062                        llr->llr_ost_idx, llr->llr_lov_idx,
4063                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
4064
4065         return rc;
4066 }
4067
4068 /* Check whether the OST-object correctly back points to the MDT-object
4069  * described by lso via the content of its XATTR_NAME_FID xattr (ff).
      *
      * env:  execution environment
      * com:  layout LFSCK component; only com->lc_lfsck is used here
      * lso:  the MDT-object being scanned; only lso->lso_fid is used here
      * ff:   the OST-object's filter_fid, already in CPU byte order as far
      *       as this function reads it without conversion
      * cfid: FID of the OST-object
      * cla:  the OST-object's attributes; not used in this function
      * llr:  request supplying the expected stripe index (llr_lov_idx)
      *
      * Returns 0 if the parent FID in ff equals lso->lso_fid and the stripe
      * index matches; LLIT_UNMATCHED_PAIR or LLIT_MULTIPLE_REFERENCED as the
      * detected inconsistency; LLIT_NONE if the other object's LOV EA cannot
      * be loaded because of -EOPNOTSUPP; or another error code from
      * a failed step. */
4070 static int lfsck_layout_check_parent(const struct lu_env *env,
4071                                      struct lfsck_component *com,
4072                                      struct lfsck_assistant_object *lso,
4073                                      struct filter_fid *ff,
4074                                      const struct lu_fid *cfid,
4075                                      const struct lu_attr *cla,
4076                                      struct lfsck_layout_req *llr)
4077 {
4078         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4079         struct lu_buf                   *buf    = &info->lti_big_buf;
4080         struct lu_fid                   *pfid   = &info->lti_fid;
4081         struct dt_object                *tobj;
4082         struct lov_mds_md_v1            *lmm;
4083         struct lov_ost_data_v1          *objs;
4084         struct lustre_handle             lh     = { 0 };
4085         int                              rc;
4086         int                              i;
4087         __u32                            magic;
4088         __u32                            idx;
4089         __u16                            count;
4090         ENTRY;
4091
             /* Work on a copy of the claimed parent FID: remember the stripe
              * index it carries, then clear f_ver before the FID is checked,
              * compared and looked up.
              * NOTE(review): f_stripe_idx presumably aliases f_ver - confirm
              * against the lu_fid definition. */
4092         *pfid = ff->ff_parent;
4093         idx = pfid->f_stripe_idx;
4094         pfid->f_ver = 0;
4095
4096         if (unlikely(!fid_is_sane(pfid)))
4097                 RETURN(LLIT_UNMATCHED_PAIR);
4098
             /* The OST-object points back to the scanned MDT-object: it is
              * consistent only if the stripe index agrees as well. */
4099         if (lu_fid_eq(pfid, &lso->lso_fid)) {
4100                 if (likely(llr->llr_lov_idx == idx))
4101                         RETURN(0);
4102
4103                 RETURN(LLIT_UNMATCHED_PAIR);
4104         }
4105
             /* The OST-object claims another parent; look that object up. The
              * reference is dropped at the out label. */
4106         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4107         if (IS_ERR(tobj))
4108                 RETURN(PTR_ERR(tobj));
4109
4110         if (dt_object_exists(tobj) == 0 || lfsck_is_dead_obj(tobj) ||
4111             !S_ISREG(lfsck_object_type(tobj)))
4112                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4113
4114         /* Load the tobj's layout EA, in spite of it is a local MDT-object or
4115          * remote one on another MDT. Then check whether the given OST-object
4116          * is in such layout. If yes, it is multiple referenced, otherwise it
4117          * is unmatched referenced case. */
4118         rc = lfsck_layout_get_lovea(env, tobj, buf);
4119         if (rc == 0 || rc == -ENODATA || rc == -ENOENT)
4120                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4121
4122         if (unlikely(rc == -EOPNOTSUPP))
4123                 GOTO(out, rc = LLIT_NONE);
4124
4125         if (rc < 0)
4126                 GOTO(out, rc);
4127
4128         lmm = buf->lb_buf;
4129         magic = le32_to_cpu(lmm->lmm_magic);
             /* Composite layout: only the component named by the filter_fid
              * (ff->ff_layout.ol_comp_id) is searched, and only if that
              * component has LCME_FL_INIT set. */
4130         if (magic == LOV_MAGIC_COMP_V1) {
4131                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4132                 struct lov_comp_md_entry_v1 *lcme;
4133
4134                 if (ff->ff_layout.ol_comp_id == 0)
4135                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4136
4137                 count = le16_to_cpu(lcm->lcm_entry_count);
4138                 for (i = 0; i < count; i++) {
4139                         lcme = &lcm->lcm_entries[i];
4140                         if (le32_to_cpu(lcme->lcme_id) ==
4141                             ff->ff_layout.ol_comp_id) {
4142                                 lmm = buf->lb_buf +
4143                                         le32_to_cpu(lcme->lcme_offset);
4144                                 magic = le32_to_cpu(lmm->lmm_magic);
4145                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4146                                       LCME_FL_INIT))
4147                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4148
4149                                 goto further;
4150                         }
4151                 }
4152
                     /* No entry carries the component ID from the filter_fid. */
4153                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4154         }
4155
4156 further:
4157         if (magic == LOV_MAGIC_V1) {
4158                 objs = &lmm->lmm_objects[0];
4159         } else {
4160                 LASSERT(magic == LOV_MAGIC_V3);
4161                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4162         }
4163
             /* Walk every stripe slot and look for one that references the
              * OST-object (cfid). */
4164         count = le16_to_cpu(lmm->lmm_stripe_count);
4165         for (i = 0; i < count; i++, objs++) {
4166                 struct lu_fid           *tfid   = &info->lti_fid2;
4167                 struct ost_id           *oi     = &info->lti_oi;
4168                 __u32                    idx2;
4169
4170                 if (lovea_slot_is_dummy(objs))
4171                         continue;
4172
4173                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4174                 idx2 = le32_to_cpu(objs->l_ost_idx);
4175                 rc = ostid_to_fid(tfid, oi, idx2);
4176                 if (rc != 0) {
4177                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
4178                                "invalid layout EA at the slot %d, index %u\n",
4179                                lfsck_lfsck2name(com->lc_lfsck),
4180                                PFID(pfid), i, idx2);
4181
4182                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4183                 }
4184
4185                 if (lu_fid_eq(cfid, tfid)) {
                             /* The other object does reference the
                              * OST-object. Lock it and decide whether it
                              * still exists. */
4186                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4187                                               MDS_INODELOCK_UPDATE |
4188                                               MDS_INODELOCK_LAYOUT |
4189                                               MDS_INODELOCK_XATTR,
4190                                               LCK_EX);
4191                         if (rc != 0)
4192                                 GOTO(out, rc);
4193
4194                         dt_read_lock(env, tobj, 0);
4195
4196                         /* For local MDT-object, re-check existence
4197                          * after taken the lock. */
4198                         if (!dt_object_remote(tobj)) {
4199                                 if (dt_object_exists(tobj) == 0 ||
4200                                     lfsck_is_dead_obj(tobj))
4201                                         rc = LLIT_UNMATCHED_PAIR;
4202                                 else
4203                                         rc = LLIT_MULTIPLE_REFERENCED;
4204
4205                                 GOTO(unlock, rc);
4206                         }
4207
4208                         /* For migration case, the new MDT-object and old
4209                          * MDT-object may reference the same OST-object at
4210                          * some migration internal time.
4211                          *
4212                          * For remote MDT-object, the local MDT may not know
4213                          * whether it has been removed or not.  Try checking
4214                          * for a non-existent xattr to check if this object
4215                          * has been removed or not. */
4216                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4217                                           XATTR_NAME_DUMMY);
                             /* Any other negative rc is returned unchanged. */
4218                         if (unlikely(rc == -ENOENT || rc >= 0))
4219                                 rc = LLIT_UNMATCHED_PAIR;
4220                         else if (rc == -ENODATA)
4221                                 rc = LLIT_MULTIPLE_REFERENCED;
4222
4223                         GOTO(unlock, rc);
4224                 }
4225         }
4226
             /* No slot of the other object references the OST-object. */
4227         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4228
4229 unlock:
             /* NOTE(review): dt_read_lock() above is taken unconditionally
              * after a successful lfsck_ibits_lock(), while the matching
              * dt_read_unlock() depends on the handle being in use - confirm
              * that lfsck_ibits_lock() always leaves lh used on success. */
4230         if (lustre_handle_is_used(&lh)) {
4231                 dt_read_unlock(env, tobj);
4232                 lfsck_ibits_unlock(&lh, LCK_EX);
4233         }
4234
4235 out:
4236         lfsck_object_put(env, tobj);
4237
4238         return rc;
4239 }
4240
/* Handle one layout request: compare the OST-object (llr->llr_child) with
 * the MDT-object described by lar->lar_parent, classify any inconsistency
 * and repair it unless LPF_DRYRUN is set.
 *
 * The checks are made in this order:
 *  - dt_attr_get() on the child fails with -ENOENT: LLIT_DANGLING;
 *  - the child's XATTR_NAME_FID is present but shorter than a lu_fid:
 *    LLIT_UNMATCHED_PAIR;
 *  - lfsck_layout_check_parent() returns a positive type: that type;
 *  - the child's UID/GID differ from the pre-loaded parent attributes:
 *    LLIT_INCONSISTENT_OWNER.
 *
 * The result is accounted in the layout LFSCK statistics (com->lc_file_ram)
 * under com->lc_sem, except on the early returns noted below.
 *
 * env: execution environment
 * com: layout LFSCK component
 * lar: the assistant request, embedded in a struct lfsck_layout_req
 *
 * Returns 0 if nothing was inconsistent, if the MDT-object is already gone,
 * or if a failure is tolerated (assistant exiting, OST unreachable);
 * a positive value if an inconsistency was found and, unless in dry-run
 * mode, handled by the repair routine; a negative error code otherwise. */
4241 static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
4242                                              struct lfsck_component *com,
4243                                              struct lfsck_assistant_req *lar)
4244 {
4245         struct lfsck_layout_req              *llr    =
4246                         container_of0(lar, struct lfsck_layout_req, llr_lar);
4247         struct lfsck_assistant_object        *lso    = lar->lar_parent;
4248         struct lfsck_layout                  *lo     = com->lc_file_ram;
4249         struct lfsck_thread_info             *info   = lfsck_env_info(env);
4250         struct filter_fid                    *ff     = &info->lti_ff;
4251         struct lu_buf buf = { .lb_buf = ff,
4252                               .lb_len = sizeof(*ff) };
             /* parent is loaded lazily, only once a repair is needed. */
4253         struct dt_object                     *parent = NULL;
4254         struct dt_object                     *child  = llr->llr_child;
             /* pla is the parent attribute copy kept in lso; cla is filled
              * from the child below. */
4255         struct lu_attr                       *pla    = &lso->lso_attr;
4256         struct lu_attr                       *cla    = &info->lti_la;
4257         struct lfsck_instance                *lfsck  = com->lc_lfsck;
4258         struct lfsck_bookmark                *bk     = &lfsck->li_bookmark_ram;
4259         enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
4260         int                                   rc;
4261         ENTRY;
4262
             /* Early return, no statistics are touched. */
4263         if (lso->lso_dead)
4264                 RETURN(0);
4265
4266         CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
4267
             /* The child does not exist: dangling reference. The parent is
              * loaded right away; if the parent is gone as well (-ENOENT)
              * there is nothing to repair. This return bypasses the
              * accounting at the out label. */
4268         rc = dt_attr_get(env, child, cla);
4269         if (rc == -ENOENT) {
4270                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4271                 if (IS_ERR(parent)) {
4272                         rc = PTR_ERR(parent);
4273
4274                         RETURN(rc == -ENOENT ? 0 : rc);
4275                 }
4276
4277                 type = LLIT_DANGLING;
4278                 goto repair;
4279         }
4280
4281         if (rc != 0)
4282                 GOTO(out, rc);
4283
             /* Read the child's filter_fid. A positive length shorter than a
              * lu_fid cannot hold a parent FID, so it is treated as an
              * unmatched pair. */
4284         lfsck_buf_init(&buf, ff, sizeof(*ff));
4285         rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
4286         if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
4287                 type = LLIT_UNMATCHED_PAIR;
4288                 goto repair;
4289         }
4290
4291         if (rc < 0 && rc != -ENODATA)
4292                 GOTO(out, rc);
4293
             /* No filter_fid on the child: skip the parent check and only
              * compare the owner. */
4294         if (rc == 0 || rc == -ENODATA)
4295                 GOTO(check_owner, rc = 0);
4296
4297         filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
4298         rc = lfsck_layout_check_parent(env, com, lso, ff,
4299                                        lu_object_fid(&child->do_lu), cla, llr);
             /* A positive return is the inconsistency type itself. */
4300         if (rc > 0) {
4301                 type = rc;
4302                 goto repair;
4303         }
4304
4305         if (rc < 0)
4306                 GOTO(out, rc);
4307
4308 check_owner:
4309         /* Someone may have changed the owner after the parent attr pre-loaded.
4310          * It can be handled later inside the lfsck_layout_repair_owner(). */
4311         if (unlikely(cla->la_uid != pla->la_uid ||
4312                      cla->la_gid != pla->la_gid)) {
4313                 type = LLIT_INCONSISTENT_OWNER;
4314                 goto repair;
4315         }
4316
4317 repair:
4318         if (type == LLIT_NONE)
4319                 GOTO(out, rc = 0);
4320
             /* Dry-run: report the inconsistency as rc = 1 without calling any
              * repair routine. */
4321         if (bk->lb_param & LPF_DRYRUN)
4322                 GOTO(out, rc = 1);
4323
             /* Load the parent unless the dangling path already did. If it is
              * gone (-ENOENT) return 0 directly, bypassing the accounting at
              * the out label. */
4324         if (parent == NULL) {
4325                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4326                 if (IS_ERR(parent)) {
4327                         rc = PTR_ERR(parent);
4328
4329                         if (rc == -ENOENT)
4330                                 RETURN(0);
4331
4332                         GOTO(out, rc);
4333                 }
4334         }
4335
4336         switch (type) {
4337         case LLIT_DANGLING:
                     /* With LPF_DELAY_CREATE_OSTOBJ the dangling reference is
                      * only recorded; otherwise it is repaired right away. */
4338                 if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
4339                         rc = lfsck_layout_ins_dangling_rec(env, com,
4340                                 lfsck_dto2fid(parent), lfsck_dto2fid(child),
4341                                 llr->llr_comp_id, llr->llr_lov_idx,
4342                                 llr->llr_ost_idx);
4343                 else
4344                         rc = __lfsck_layout_repair_dangling(env, com, parent,
4345                                                             llr->llr_child,
4346                                                             llr->llr_comp_id,
4347                                                             llr->llr_lov_idx,
4348                                                             llr->llr_ost_idx,
4349                                                             true);
4350                 break;
4351         case LLIT_UNMATCHED_PAIR:
4352                 rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
4353                                                         llr, pla);
4354                 break;
4355         case LLIT_MULTIPLE_REFERENCED:
4356                 rc = lfsck_layout_repair_multiple_references(env, com, parent,
4357                                                              llr, pla);
4358                 break;
4359         case LLIT_INCONSISTENT_OWNER:
4360                 rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
4361                 break;
4362         default:
4363                 rc = 0;
4364                 break;
4365         }
4366
4367         GOTO(out, rc);
4368
4369 out:
             /* Account the result with com->lc_sem held for write. */
4370         down_write(&com->lc_sem);
4371         if (rc < 0) {
4372                 struct lfsck_assistant_data *lad = com->lc_data;
4373
                     /* Failures while the assistant is exiting are dropped. */
4374                 if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) {
4375                         rc = 0;
4376                 } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
4377                            rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
4378                            rc == -EHOSTUNREACH) {
4379                         /* If cannot touch the target server,
4380                          * mark the LFSCK as INCOMPLETE. */
4381                         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to "
4382                                "talk with OST %x: rc = %d\n",
4383                                lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
4384                         lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
4385                         lo->ll_objs_skipped++;
4386                         rc = 0;
4387                 } else {
4388                         lfsck_layout_record_failure(env, lfsck, lo);
4389                 }
             /* A positive rc is counted per inconsistency type, except for a
              * dangling reference handled under LPF_DELAY_CREATE_OSTOBJ. */
4390         } else if (rc > 0 && (type != LLIT_DANGLING ||
4391                               !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
4392                 LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
4393                          "unknown type = %d\n", type);
4394
4395                 lo->ll_objs_repaired[type - 1]++;
                     /* In dry-run mode, store the current OIT iterator
                      * position for the first inconsistency found. */
4396                 if (bk->lb_param & LPF_DRYRUN &&
4397                     unlikely(lo->ll_pos_first_inconsistent == 0))
4398                         lo->ll_pos_first_inconsistent =
4399                         lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
4400                                                         lfsck->li_di_oit);
4401         }
4402         up_write(&com->lc_sem);
4403
             /* parent may be NULL (never loaded) or an ERR_PTR (load failed). */
4404         if (parent != NULL && !IS_ERR(parent))
4405                 lfsck_object_put(env, parent);
4406
4407         return rc;
4408 }
4409
4410 static int
4411 lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
4412                                         struct lfsck_component *com,
4413                                         struct dt_object *obj, bool first)
4414 {
4415         struct lfsck_instance *lfsck = com->lc_lfsck;
4416         struct ptlrpc_thread *thread = &lfsck->li_thread;
4417         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4418         struct lfsck_layout *lo = com->lc_file_ram;
4419         const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
4420         struct dt_it *di;
4421         struct dt_key *key;
4422         struct lfsck_layout_dangling_key *parent =
4423                                         &lfsck_env_info(env)->lti_lldk;
4424         struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
4425         __u32 ost_idx;
4426         int rc;
4427         ENTRY;
4428
4429         di = iops->init(env, obj, 0);
4430         if (IS_ERR(di))
4431                 RETURN(PTR_ERR(di));
4432
4433         if (first)
4434                 lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
4435         else
4436                 memset(parent, 0, sizeof(*parent));
4437         rc = iops->get(env, di, (const struct dt_key *)parent);
4438         if (rc < 0)
4439                 GOTO(fini, rc);
4440
4441         if (first) {
4442                 /* The start one either has been processed or does not exist,
4443                  * skip it. */
4444                 rc = iops->next(env, di);
4445                 if (rc != 0)
4446                         GOTO(put, rc);
4447         }
4448
4449         do {
4450                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
4451                     unlikely(!thread_is_running(thread)))
4452                         GOTO(put, rc = 0);
4453
4454                 key = iops->key(env, di);
4455                 if (IS_ERR(key)) {
4456                         rc = PTR_ERR(key);
4457                         if (rc == -ENOENT)
4458                                 GOTO(put, rc = 1);
4459
4460                         goto checkpoint;
4461                 }
4462
4463                 lldk_be_to_cpu(parent,
4464                                 (const struct lfsck_layout_dangling_key *)key);
4465                 if (!fid_is_sane(&parent->lldk_fid)) {
4466                         rc = 0;
4467                         goto checkpoint;
4468                 }
4469
4470                 rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
4471                 if (rc == 0) {
4472                         fid_be_to_cpu(cfid, cfid);
4473                         ost_idx = cfid->f_ver;
4474                         cfid->f_ver = 0;
4475                         if (!fid_is_sane(cfid)) {
4476                                 rc = 0;
4477                                 goto checkpoint;
4478                         }
4479
4480                         rc = lfsck_layout_repair_dangling(env, com,
4481                                         &parent->lldk_fid, cfid,
4482                                         parent->lldk_comp_id,
4483                                         parent->lldk_ea_off, ost_idx);
4484                 }
4485
4486 checkpoint:
4487                 down_write(&com->lc_sem);
4488                 com->lc_new_checked++;
4489                 com->lc_new_scanned++;
4490                 if (rc >= 0)
4491                         lo->ll_lldk_latest_scanned_phase2 = *parent;
4492
4493                 if (rc > 0)
4494                         lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
4495                 else if (rc < 0)
4496                         lo->ll_objs_failed_phase2++;
4497                 up_write(&com->lc_sem);
4498
4499                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
4500                         GOTO(put, rc);
4501
4502                 if (unlikely(com->lc_time_next_checkpoint <=
4503                              ktime_get_seconds()) &&
4504                     com->lc_new_checked != 0) {
4505                         down_write(&com->lc_sem);
4506                         lo->ll_run_time_phase2 += ktime_get_seconds() -
4507                                                   com->lc_time_last_checkpoint;
4508                         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
4509                         lo->ll_objs_checked_phase2 += com->lc_new_checked;
4510                         com->lc_new_checked = 0;
4511                         lfsck_layout_store(env, com);
4512                         up_write(&com->lc_sem);
4513
4514                         com->lc_time_last_checkpoint = ktime_get_seconds();
4515                         com->lc_time_next_checkpoint =
4516                                 com->lc_time_last_checkpoint +
4517                                 LFSCK_CHECKPOINT_INTERVAL;
4518                 }
4519
4520                 lfsck_control_speed_by_self(com);
4521                 if (unlikely(!thread_is_running(thread)))
4522                         GOTO(put, rc = 0);
4523
4524                 rc = iops->next(env, di);
4525         } while (rc == 0);
4526
4527         GOTO(put, rc);
4528
4529 put:
4530         iops->put(env, di);
4531
4532 fini:
4533         iops->fini(env, di);
4534
4535         return rc;
4536 }
4537
4538 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4539                                              struct lfsck_component *com)
4540 {
4541         struct lfsck_assistant_data     *lad    = com->lc_data;
4542         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4543         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
4544         struct lfsck_tgt_descs          *ltds   = &lfsck->li_ost_descs;
4545         struct lfsck_tgt_desc           *ltd;
4546         int                              rc     = 0;
4547         ENTRY;
4548
4549         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4550                lfsck_lfsck2name(lfsck));
4551
4552         spin_lock(&ltds->ltd_lock);
4553         while (!list_empty(&lad->lad_ost_phase2_list)) {
4554                 ltd = list_entry(lad->lad_ost_phase2_list.next,
4555                                  struct lfsck_tgt_desc,
4556                                  ltd_layout_phase_list);
4557                 list_del_init(&ltd->ltd_layout_phase_list);
4558                 if (bk->lb_param & LPF_OST_ORPHAN) {
4559                         spin_unlock(&ltds->ltd_lock);
4560                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4561                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4562                                 RETURN(rc);
4563
4564                         if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
4565                                      !thread_is_running(&lfsck->li_thread)))
4566                                 RETURN(0);
4567                         spin_lock(&ltds->ltd_lock);
4568                 }
4569         }
4570
4571         if (list_empty(&lad->lad_ost_phase1_list))
4572                 rc = 1;
4573         else
4574                 rc = 0;
4575         spin_unlock(&ltds->ltd_lock);
4576
4577         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4578                 struct lfsck_layout *lo = com->lc_file_ram;
4579                 int i;
4580
4581                 com->lc_new_checked = 0;
4582                 com->lc_new_scanned = 0;
4583                 com->lc_time_last_checkpoint = ktime_get_seconds();
4584                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4585                                                LFSCK_CHECKPOINT_INTERVAL;
4586
4587                 i = lfsck_sub_trace_file_fid2idx(
4588                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4589                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4590                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4591                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4592                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4593                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4594
4595                 CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop "
4596                        "at the No. %d trace file: rc = %d\n",
4597                        lfsck_lfsck2name(lfsck), i, rc);
4598         }
4599
4600         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4601                lfsck_lfsck2name(lfsck), rc);
4602
4603         RETURN(rc);
4604 }
4605
4606 static int
4607 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4608                                    struct ptlrpc_request *req,
4609                                    void *args, int rc)
4610 {
4611         struct lfsck_layout_slave_async_args *llsaa = args;
4612         struct obd_export *exp = llsaa->llsaa_exp;
4613         struct lfsck_component *com = llsaa->llsaa_com;
4614         struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst;
4615         struct lfsck_layout_slave_data *llsd = com->lc_data;
4616         struct lfsck_reply *lr = NULL;
4617         bool done = false;
4618
4619         if (rc != 0) {
4620                 /* It is probably caused by network trouble, or target crash,
4621                  * it will try several times (depends on the obd_timeout, and
4622                  * will not less than 3 times). But to make the LFSCK can go
4623                  * ahead, we should not try for ever. After some try but still
4624                  * hit failure, it will assume that the target exit the LFSCK
4625                  * prcoessing and stop try. */
4626                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4627                         int max_try = max_t(int, obd_timeout / 30, 3);
4628
4629                         if (++(llst->llst_failures) > max_try)
4630                                 done = true;
4631                 } else {
4632                         done = true;
4633                 }
4634         } else {
4635                 llst->llst_failures = 0;
4636                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4637                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4638                     lr->lr_status != LS_SCANNING_PHASE2)
4639                         done = true;
4640         }
4641
4642         if (done) {
4643                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
4644                        "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
4645                        llst->llst_index, lr != NULL ? lr->lr_status : rc,
4646                        llst->llst_failures);
4647
4648                 lfsck_layout_llst_del(llsd, llst);
4649         }
4650
4651         lfsck_layout_llst_put(llst);
4652         lfsck_component_put(env, com);
4653         class_export_put(exp);
4654
4655         return 0;
4656 }
4657
4658 static int lfsck_layout_async_query(const struct lu_env *env,
4659                                     struct lfsck_component *com,
4660                                     struct obd_export *exp,
4661                                     struct lfsck_layout_slave_target *llst,
4662                                     struct lfsck_request *lr,
4663                                     struct ptlrpc_request_set *set)
4664 {
4665         struct lfsck_layout_slave_async_args *llsaa;
4666         struct ptlrpc_request                *req;
4667         struct lfsck_request                 *tmp;
4668         int                                   rc;
4669         ENTRY;
4670
4671         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4672         if (req == NULL)
4673                 RETURN(-ENOMEM);
4674
4675         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4676         if (rc != 0) {
4677                 ptlrpc_request_free(req);
4678                 RETURN(rc);
4679         }
4680
4681         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4682         *tmp = *lr;
4683         ptlrpc_request_set_replen(req);
4684
4685         llsaa = ptlrpc_req_async_args(llsaa, req);
4686         llsaa->llsaa_exp = exp;
4687         llsaa->llsaa_com = lfsck_component_get(com);
4688         llsaa->llsaa_llst = llst;
4689         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4690         req->rq_allow_intr = 1;
4691         req->rq_no_delay = 1;
4692         ptlrpc_set_add_req(set, req);
4693
4694         RETURN(0);
4695 }
4696
4697 static int lfsck_layout_async_notify(const struct lu_env *env,
4698                                      struct obd_export *exp,
4699                                      struct lfsck_request *lr,
4700                                      struct ptlrpc_request_set *set)
4701 {
4702         struct ptlrpc_request   *req;
4703         struct lfsck_request    *tmp;
4704         int                      rc;
4705         ENTRY;
4706
4707         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4708         if (req == NULL)
4709                 RETURN(-ENOMEM);
4710
4711         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4712         if (rc != 0) {
4713                 ptlrpc_request_free(req);
4714                 RETURN(rc);
4715         }
4716
4717         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4718         *tmp = *lr;
4719         ptlrpc_request_set_replen(req);
4720         req->rq_allow_intr = 1;
4721         req->rq_no_delay = 1;
4722         ptlrpc_set_add_req(set, req);
4723
4724         RETURN(0);
4725 }
4726
4727 static int
4728 lfsck_layout_slave_query_master(const struct lu_env *env,
4729                                 struct lfsck_component *com)
4730 {
4731         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4732         struct lfsck_instance            *lfsck = com->lc_lfsck;
4733         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4734         struct lfsck_layout_slave_target *llst;
4735         struct obd_export                *exp;
4736         struct ptlrpc_request_set        *set;
4737         int                               rc    = 0;
4738         int                               rc1   = 0;
4739         ENTRY;
4740
4741         set = ptlrpc_prep_set();
4742         if (set == NULL)
4743                 GOTO(log, rc = -ENOMEM);
4744
4745         memset(lr, 0, sizeof(*lr));
4746         lr->lr_event = LE_QUERY;
4747         lr->lr_active = LFSCK_TYPE_LAYOUT;
4748
4749         llsd->llsd_touch_gen++;
4750         spin_lock(&llsd->llsd_lock);
4751         while (!list_empty(&llsd->llsd_master_list)) {
4752                 llst = list_entry(llsd->llsd_master_list.next,
4753                                   struct lfsck_layout_slave_target,
4754                                   llst_list);
4755                 if (llst->llst_gen == llsd->llsd_touch_gen)
4756                         break;
4757
4758                 llst->llst_gen = llsd->llsd_touch_gen;
4759                 list_move_tail(&llst->llst_list,
4760                                &llsd->llsd_master_list);
4761                 atomic_inc(&llst->llst_ref);
4762                 spin_unlock(&llsd->llsd_lock);
4763
4764                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4765                                                llst->llst_index);
4766                 if (exp == NULL) {
4767                         lfsck_layout_llst_del(llsd, llst);
4768                         lfsck_layout_llst_put(llst);
4769                         spin_lock(&llsd->llsd_lock);
4770                         continue;
4771                 }
4772
4773                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
4774                 if (rc != 0) {
4775                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4776                                "query %s for layout: rc = %d\n",
4777                                lfsck_lfsck2name(lfsck),
4778                                exp->exp_obd->obd_name, rc);
4779
4780                         rc1 = rc;
4781                         lfsck_layout_llst_put(llst);
4782                         class_export_put(exp);
4783                 }
4784                 spin_lock(&llsd->llsd_lock);
4785         }
4786         spin_unlock(&llsd->llsd_lock);
4787
4788         rc = ptlrpc_set_wait(env, set);
4789         ptlrpc_set_destroy(set);
4790
4791         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
4792
4793 log:
4794         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
4795                lfsck_lfsck2name(com->lc_lfsck), rc);
4796
4797         return rc;
4798 }
4799
4800 static void
4801 lfsck_layout_slave_notify_master(const struct lu_env *env,
4802                                  struct lfsck_component *com,
4803                                  enum lfsck_events event, int result)
4804 {
4805         struct lfsck_layout              *lo    = com->lc_file_ram;
4806         struct lfsck_instance            *lfsck = com->lc_lfsck;
4807         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4808         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4809         struct lfsck_layout_slave_target *llst;
4810         struct obd_export                *exp;
4811         struct ptlrpc_request_set        *set;
4812         int                               rc;
4813         ENTRY;
4814
4815         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
4816                lfsck_lfsck2name(com->lc_lfsck));
4817
4818         set = ptlrpc_prep_set();
4819         if (set == NULL)
4820                 RETURN_EXIT;
4821
4822         memset(lr, 0, sizeof(*lr));
4823         lr->lr_event = event;
4824         lr->lr_flags = LEF_FROM_OST;
4825         lr->lr_status = result;
4826         lr->lr_index = lfsck_dev_idx(lfsck);
4827         lr->lr_active = LFSCK_TYPE_LAYOUT;
4828         lr->lr_flags2 = lo->ll_flags;
4829         llsd->llsd_touch_gen++;
4830         spin_lock(&llsd->llsd_lock);
4831         while (!list_empty(&llsd->llsd_master_list)) {
4832                 llst = list_entry(llsd->llsd_master_list.next,
4833                                   struct lfsck_layout_slave_target,
4834                                   llst_list);
4835                 if (llst->llst_gen == llsd->llsd_touch_gen)
4836                         break;
4837
4838                 llst->llst_gen = llsd->llsd_touch_gen;
4839                 list_move_tail(&llst->llst_list,
4840                                &llsd->llsd_master_list);
4841                 atomic_inc(&llst->llst_ref);
4842                 spin_unlock(&llsd->llsd_lock);
4843
4844                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4845                                                llst->llst_index);
4846                 if (exp == NULL) {
4847                         lfsck_layout_llst_del(llsd, llst);
4848                         lfsck_layout_llst_put(llst);
4849                         spin_lock(&llsd->llsd_lock);
4850                         continue;
4851                 }
4852
4853                 rc = lfsck_layout_async_notify(env, exp, lr, set);
4854                 if (rc != 0)
4855                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4856                                "notify %s for layout: rc = %d\n",
4857                                lfsck_lfsck2name(lfsck),
4858                                exp->exp_obd->obd_name, rc);
4859
4860                 lfsck_layout_llst_put(llst);
4861                 class_export_put(exp);
4862                 spin_lock(&llsd->llsd_lock);
4863         }
4864         spin_unlock(&llsd->llsd_lock);
4865
4866         ptlrpc_set_wait(env, set);
4867         ptlrpc_set_destroy(set);
4868
4869         RETURN_EXIT;
4870 }
4871
4872 /*
4873  * \ret -ENODATA: unrecognized stripe
4874  * \ret = 0     : recognized stripe
4875  * \ret < 0     : other failures
4876  */
4877 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
4878                                            struct lfsck_component *com,
4879                                            struct lu_fid *cfid,
4880                                            struct lu_fid *pfid, __u32 comp_id)
4881 {
4882         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4883         struct lu_buf                   *buf    = &info->lti_big_buf;
4884         struct ost_id                   *oi     = &info->lti_oi;
4885         struct dt_object                *obj;
4886         struct lov_mds_md_v1            *lmm;
4887         struct lov_ost_data_v1          *objs;
4888         __u32                            idx    = pfid->f_stripe_idx;
4889         __u32                            magic;
4890         int                              rc     = 0;
4891         int                              i;
4892         __u16                            count;
4893         ENTRY;
4894
4895         pfid->f_ver = 0;
4896         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4897         if (IS_ERR(obj))
4898                 RETURN(PTR_ERR(obj));
4899
4900         dt_read_lock(env, obj, 0);
4901         if (unlikely(dt_object_exists(obj) == 0 ||
4902                      lfsck_is_dead_obj(obj)))
4903                 GOTO(unlock, rc = -ENOENT);
4904
4905         if (!S_ISREG(lfsck_object_type(obj)))
4906                 GOTO(unlock, rc = -ENODATA);
4907
4908         rc = lfsck_layout_get_lovea(env, obj, buf);
4909         if (rc < 0)
4910                 GOTO(unlock, rc);
4911
4912         lmm = buf->lb_buf;
4913         magic = le32_to_cpu(lmm->lmm_magic);
4914         if (magic == LOV_MAGIC_COMP_V1) {
4915                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4916                 struct lov_comp_md_entry_v1 *lcme;
4917
4918                 if (comp_id == 0)
4919                         GOTO(unlock, rc = -ENODATA);
4920
4921                 count = le16_to_cpu(lcm->lcm_entry_count);
4922                 for (i = 0; i < count; i++) {
4923                         lcme = &lcm->lcm_entries[i];
4924                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
4925                                 lmm = buf->lb_buf +
4926                                         le32_to_cpu(lcme->lcme_offset);
4927                                 magic = le32_to_cpu(lmm->lmm_magic);
4928                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4929                                       LCME_FL_INIT))
4930                                         GOTO(unlock, rc = -ENODATA);
4931
4932                                 goto further;
4933                         }
4934                 }
4935
4936                 GOTO(unlock, rc = -ENODATA);
4937         }
4938
4939 further:
4940         if (magic == LOV_MAGIC_V1) {
4941                 objs = &lmm->lmm_objects[0];
4942         } else {
4943                 LASSERT(magic == LOV_MAGIC_V3);
4944                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4945         }
4946
4947         fid_to_ostid(cfid, oi);
4948         count = le16_to_cpu(lmm->lmm_stripe_count);
4949         for (i = 0; i < count; i++, objs++) {
4950                 struct ost_id oi2;
4951
4952                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
4953                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
4954                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
4955         }
4956
4957         GOTO(unlock, rc = -ENODATA);
4958
4959 unlock:
4960         dt_read_unlock(env, obj);
4961         lfsck_object_put(env, obj);
4962
4963         return rc;
4964 }
4965
4966 /*
4967  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
4968  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
4969  * layout EA from MDT to OST. On one hand, the OST no need to understand
4970  * the layout EA structure; on the other hand, it may cause trouble when
4971  * transfer large layout EA from MDT to OST via normal OUT RPC.
4972  *
4973  * \ret > 0: unrecognized stripe
4974  * \ret = 0: recognized stripe
4975  * \ret < 0: other failures
4976  */
4977 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
4978                                           struct lfsck_component *com,
4979                                           struct lu_fid *cfid,
4980                                           struct lu_fid *pfid, __u32 comp_id)
4981 {
4982         struct lfsck_instance    *lfsck  = com->lc_lfsck;
4983         struct obd_device        *obd    = lfsck->li_obd;
4984         struct seq_server_site   *ss     = lfsck_dev_site(lfsck);
4985         struct obd_export        *exp    = NULL;
4986         struct ptlrpc_request    *req    = NULL;
4987         struct lfsck_request     *lr;
4988         struct lu_seq_range      *range  = &lfsck_env_info(env)->lti_range;
4989         int                       rc     = 0;
4990         ENTRY;
4991
4992         if (unlikely(fid_is_idif(pfid)))
4993                 RETURN(1);
4994
4995         fld_range_set_any(range);
4996         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
4997         if (rc != 0)
4998                 RETURN(rc == -ENOENT ? 1 : rc);
4999
5000         if (unlikely(!fld_range_is_mdt(range)))
5001                 RETURN(1);
5002
5003         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
5004         if (unlikely(exp == NULL))
5005                 RETURN(1);
5006
5007         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
5008                 GOTO(out, rc = -EOPNOTSUPP);
5009
5010         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
5011         if (req == NULL)
5012                 GOTO(out, rc = -ENOMEM);
5013
5014         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
5015         if (rc != 0) {
5016                 ptlrpc_request_free(req);
5017
5018                 GOTO(out, rc);
5019         }
5020
5021         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
5022         memset(lr, 0, sizeof(*lr));
5023         lr->lr_event = LE_PAIRS_VERIFY;
5024         lr->lr_active = LFSCK_TYPE_LAYOUT;
5025         lr->lr_fid = *cfid; /* OST-object itself FID. */
5026         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
5027         lr->lr_comp_id = comp_id;
5028
5029         ptlrpc_request_set_replen(req);
5030         rc = ptlrpc_queue_wait(req);
5031         ptlrpc_req_finished(req);
5032
5033         if (rc == -ENOENT || rc == -ENODATA)
5034                 rc = 1;
5035
5036         GOTO(out, rc);
5037
5038 out:
5039         if (exp != NULL)
5040                 class_export_put(exp);
5041
5042         return rc;
5043 }
5044
5045 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
5046                                           struct lfsck_component *com,
5047                                           struct lfsck_req_local *lrl)
5048 {
5049         struct dt_object        *obj;
5050         int                      rc     = 0;
5051         ENTRY;
5052
5053         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
5054         if (IS_ERR(obj))
5055                 GOTO(log, rc = PTR_ERR(obj));
5056
5057         dt_write_lock(env, obj, 0);
5058         if (unlikely(dt_object_exists(obj) == 0 ||
5059                      lfsck_is_dead_obj(obj)))
5060                 GOTO(unlock, rc = 0);
5061
5062         rc = __lfsck_layout_update_pfid(env, obj, &lrl->lrl_ff_client.ff_parent,
5063                                         &lrl->lrl_ff_client.ff_layout,
5064                                         lrl->lrl_ff_client.ff_layout_version,
5065                                         lrl->lrl_ff_client.ff_range,
5066                                         lrl->lrl_ff_client.ff_parent.f_ver);
5067
5068         GOTO(unlock, rc);
5069
5070 unlock:
5071         dt_write_unlock(env, obj);
5072         lfsck_object_put(env, obj);
5073
5074 log:
5075         CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID
5076                ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
5077                PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc);
5078
5079         return rc;
5080 }
5081
5082 /* layout APIs */
5083
5084 static void lfsck_layout_slave_quit(const struct lu_env *env,
5085                                     struct lfsck_component *com);
5086
5087 static int lfsck_layout_reset(const struct lu_env *env,
5088                               struct lfsck_component *com, bool init)
5089 {
5090         struct lfsck_layout     *lo    = com->lc_file_ram;
5091         int                      rc;
5092
5093         down_write(&com->lc_sem);
5094         if (init) {
5095                 memset(lo, 0, com->lc_file_size);
5096         } else {
5097                 __u32 count = lo->ll_success_count;
5098                 time64_t last_time = lo->ll_time_last_complete;
5099
5100                 memset(lo, 0, com->lc_file_size);
5101                 lo->ll_success_count = count;
5102                 lo->ll_time_last_complete = last_time;
5103         }
5104
5105         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
5106         lo->ll_status = LS_INIT;
5107
5108         if (com->lc_lfsck->li_master) {
5109                 struct lfsck_assistant_data *lad = com->lc_data;
5110
5111                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
5112                 CFS_RESET_BITMAP(lad->lad_bitmap);
5113         }
5114
5115         rc = lfsck_layout_store(env, com);
5116         if (rc == 0 && com->lc_lfsck->li_master)
5117                 rc = lfsck_load_sub_trace_files(env, com,
5118                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
5119         up_write(&com->lc_sem);
5120
5121         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
5122                lfsck_lfsck2name(com->lc_lfsck), rc);
5123
5124         return rc;
5125 }
5126
5127 static void lfsck_layout_fail(const struct lu_env *env,
5128                               struct lfsck_component *com, bool new_checked)
5129 {
5130         struct lfsck_layout *lo = com->lc_file_ram;
5131
5132         down_write(&com->lc_sem);
5133         if (new_checked)
5134                 com->lc_new_checked++;
5135         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
5136         up_write(&com->lc_sem);
5137 }
5138
5139 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
5140                                           struct lfsck_component *com, bool init)
5141 {
5142         struct lfsck_instance   *lfsck   = com->lc_lfsck;
5143         struct lfsck_layout     *lo      = com->lc_file_ram;
5144         int                      rc;
5145
5146         if (!init) {
5147                 rc = lfsck_checkpoint_generic(env, com);
5148                 if (rc != 0)
5149                         return rc > 0 ? 0 : rc;
5150         }
5151
5152         down_write(&com->lc_sem);
5153         if (init) {
5154                 lo->ll_pos_latest_start =
5155                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5156         } else {
5157                 lo->ll_pos_last_checkpoint =
5158                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5159                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5160                                           lfsck->li_time_last_checkpoint;
5161                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5162                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5163                 com->lc_new_checked = 0;
5164         }
5165
5166         rc = lfsck_layout_store(env, com);
5167         up_write(&com->lc_sem);
5168
5169         CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos ["
5170                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5171                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5172
5173         return rc;
5174 }
5175
5176 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5177                                          struct lfsck_component *com, bool init)
5178 {
5179         struct lfsck_instance   *lfsck = com->lc_lfsck;
5180         struct lfsck_layout     *lo    = com->lc_file_ram;
5181         int                      rc;
5182
5183         if (com->lc_new_checked == 0 && !init)
5184                 return 0;
5185
5186         down_write(&com->lc_sem);
5187         if (init) {
5188                 lo->ll_pos_latest_start =
5189                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5190         } else {
5191                 lo->ll_pos_last_checkpoint =
5192                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5193                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5194                                           lfsck->li_time_last_checkpoint;
5195                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5196                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5197                 com->lc_new_checked = 0;
5198         }
5199
5200         rc = lfsck_layout_store(env, com);
5201         up_write(&com->lc_sem);
5202
5203         CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos ["
5204                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5205                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5206
5207         return rc;
5208 }
5209
5210 static int lfsck_layout_prep(const struct lu_env *env,
5211                              struct lfsck_component *com,
5212                              struct lfsck_start *start)
5213 {
5214         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5215         struct lfsck_layout     *lo     = com->lc_file_ram;
5216         struct lfsck_position   *pos    = &com->lc_pos_start;
5217
5218         fid_zero(&pos->lp_dir_parent);
5219         pos->lp_dir_cookie = 0;
5220         if (lo->ll_status == LS_COMPLETED ||
5221             lo->ll_status == LS_PARTIAL ||
5222             /* To handle orphan, must scan from the beginning. */
5223             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5224                 int rc;
5225
5226                 rc = lfsck_layout_reset(env, com, false);
5227                 if (rc == 0)
5228                         rc = lfsck_set_param(env, lfsck, start, true);
5229
5230                 if (rc != 0) {
5231                         CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: "
5232                                "rc = %d\n", lfsck_lfsck2name(lfsck), rc);
5233
5234                         return rc;
5235                 }
5236         }
5237
5238         down_write(&com->lc_sem);
5239         lo->ll_time_latest_start = ktime_get_real_seconds();
5240         spin_lock(&lfsck->li_lock);
5241         if (lo->ll_flags & LF_SCANNED_ONCE) {
5242                 if (!lfsck->li_drop_dryrun ||
5243                     lo->ll_pos_first_inconsistent == 0) {
5244                         lo->ll_status = LS_SCANNING_PHASE2;
5245                         list_move_tail(&com->lc_link,
5246                                        &lfsck->li_list_double_scan);
5247                         pos->lp_oit_cookie = 0;
5248                 } else {
5249                         int i;
5250
5251                         lo->ll_status = LS_SCANNING_PHASE1;
5252                         lo->ll_run_time_phase1 = 0;
5253                         lo->ll_run_time_phase2 = 0;
5254                         lo->ll_objs_checked_phase1 = 0;
5255                         lo->ll_objs_checked_phase2 = 0;
5256                         lo->ll_objs_failed_phase1 = 0;
5257                         lo->ll_objs_failed_phase2 = 0;
5258                         for (i = 0; i < LLIT_MAX; i++)
5259                                 lo->ll_objs_repaired[i] = 0;
5260
5261                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5262                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5263                 }
5264         } else {
5265                 lo->ll_status = LS_SCANNING_PHASE1;
5266                 if (!lfsck->li_drop_dryrun ||
5267                     lo->ll_pos_first_inconsistent == 0)
5268                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5269                 else
5270                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5271         }
5272         spin_unlock(&lfsck->li_lock);
5273         up_write(&com->lc_sem);
5274
5275         return 0;
5276 }
5277
5278 static int lfsck_layout_slave_prep(const struct lu_env *env,
5279                                    struct lfsck_component *com,
5280                                    struct lfsck_start_param *lsp)
5281 {
5282         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5283         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5284         struct lfsck_layout             *lo     = com->lc_file_ram;
5285         struct lfsck_start              *start  = lsp->lsp_start;
5286         int                              rc;
5287
5288         rc = lfsck_layout_prep(env, com, start);
5289         if (rc != 0)
5290                 return rc;
5291
5292         if (lo->ll_flags & LF_CRASHED_LASTID &&
5293             list_empty(&llsd->llsd_master_list)) {
5294                 LASSERT(lfsck->li_out_notify != NULL);
5295
5296                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5297                                      LE_LASTID_REBUILDING);
5298         }
5299
5300         if (!lsp->lsp_index_valid)
5301                 return 0;
5302
5303         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5304         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5305                 LASSERT(!llsd->llsd_rbtree_valid);
5306
5307                 down_write(&llsd->llsd_rb_rwsem);
5308                 rc = lfsck_rbtree_setup(env, com);
5309                 up_write(&llsd->llsd_rb_rwsem);
5310         }
5311
5312         CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos ["
5313                "%llu]\n", lfsck_lfsck2name(lfsck),
5314                com->lc_pos_start.lp_oit_cookie);
5315
5316         return rc;
5317 }
5318
5319 static int lfsck_layout_master_prep(const struct lu_env *env,
5320                                     struct lfsck_component *com,
5321                                     struct lfsck_start_param *lsp)
5322 {
5323         int rc;
5324         ENTRY;
5325
5326         rc = lfsck_layout_load_bitmap(env, com);
5327         if (rc != 0) {
5328                 rc = lfsck_layout_reset(env, com, false);
5329                 if (rc == 0)
5330                         rc = lfsck_set_param(env, com->lc_lfsck,
5331                                              lsp->lsp_start, true);
5332
5333                 if (rc != 0)
5334                         GOTO(log, rc);
5335         }
5336
5337         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5338         if (rc != 0)
5339                 RETURN(rc);
5340
5341         rc = lfsck_start_assistant(env, com, lsp);
5342
5343         GOTO(log, rc);
5344
5345 log:
5346         CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos ["
5347                "%llu]\n", lfsck_lfsck2name(com->lc_lfsck),
5348                com->lc_pos_start.lp_oit_cookie);
5349
5350         return 0;
5351 }
5352
5353 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5354 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5355                                      struct lfsck_component *com,
5356                                      struct dt_object *parent,
5357                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5358 {
5359         struct lfsck_thread_info        *info    = lfsck_env_info(env);
5360         struct lfsck_instance           *lfsck   = com->lc_lfsck;
5361         struct lfsck_bookmark           *bk      = &lfsck->li_bookmark_ram;
5362         struct lfsck_layout             *lo      = com->lc_file_ram;
5363         struct lfsck_assistant_data     *lad     = com->lc_data;
5364         struct lfsck_assistant_object   *lso     = NULL;
5365         struct lov_ost_data_v1          *objs;
5366         struct lfsck_tgt_descs          *ltds    = &lfsck->li_ost_descs;
5367         struct ptlrpc_thread            *mthread = &lfsck->li_thread;
5368         struct ptlrpc_thread            *athread = &lad->lad_thread;
5369         struct lu_buf                    buf;
5370         int                              rc      = 0;
5371         int                              i;
5372         __u32                            magic;
5373         __u16                            count;
5374         ENTRY;
5375
5376         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5377         magic = le32_to_cpu(lmm->lmm_magic);
5378         if (magic == LOV_MAGIC_V1) {
5379                 objs = &lmm->lmm_objects[0];
5380         } else {
5381                 LASSERT(magic == LOV_MAGIC_V3);
5382                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5383         }
5384
5385         count = le16_to_cpu(lmm->lmm_stripe_count);
5386         for (i = 0; i < count; i++, objs++) {
5387                 struct lu_fid           *fid    = &info->lti_fid;
5388                 struct ost_id           *oi     = &info->lti_oi;
5389                 struct lfsck_layout_req *llr;
5390                 struct lfsck_tgt_desc   *tgt    = NULL;
5391                 struct dt_object        *cobj   = NULL;
5392                 __u32                    index;
5393                 bool                     wakeup = false;
5394
5395                 if (unlikely(lovea_slot_is_dummy(objs)))
5396                         continue;
5397
5398                 wait_event_idle(mthread->t_ctl_waitq,
5399                                 lad->lad_prefetched < bk->lb_async_windows ||
5400                                 !thread_is_running(mthread) ||
5401                                 thread_is_stopped(athread));
5402
5403                 if (unlikely(!thread_is_running(mthread)) ||
5404                              thread_is_stopped(athread))
5405                         GOTO(out, rc = 0);
5406
5407                 if (unlikely(lfsck_is_dead_obj(parent)))
5408                         GOTO(out, rc = 0);
5409
5410                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5411                 index = le32_to_cpu(objs->l_ost_idx);
5412                 rc = ostid_to_fid(fid, oi, index);
5413                 if (rc != 0) {
5414                         CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID
5415                                ": "DOSTID", idx %u, comp_id %u\n",
5416                                lfsck_lfsck2name(lfsck),
5417                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5418                                index, comp_id);
5419                         goto next;
5420                 }
5421
5422                 tgt = lfsck_tgt_get(ltds, index);
5423                 if (unlikely(tgt == NULL)) {
5424                         CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which "
5425                                "did not join the layout LFSCK, comp_id %u\n",
5426                                lfsck_lfsck2name(lfsck), index, comp_id);
5427                         lfsck_lad_set_bitmap(env, com, index);
5428                         goto next;
5429                 }
5430
5431                 /* There is potential deadlock race condition between object
5432                  * destroy and layout LFSCK. Consider the following scenario:
5433                  *
5434                  * 1) The LFSCK thread obtained the parent object firstly, at
5435                  *    that time, the parent object has not been destroyed yet.
5436                  *
5437                  * 2) One RPC service thread destroyed the parent and all its
5438                  *    children objects. Because the LFSCK is referencing the
5439                  *    parent object, then the parent object will be marked as
5440                  *    dying in RAM. On the other hand, the parent object is
5441                  *    referencing all its children objects, then all children
5442                  *    objects will be marked as dying in RAM also.
5443                  *
5444                  * 3) The LFSCK thread tries to find some child object with
5445                  *    the parent object referenced. Then it will find that the
5446                  *    child object is dying. According to the object visibility
5447                  *    rules: the object with dying flag cannot be returned to
5448                  *    others. So the LFSCK thread has to wait until the dying
5449                  *    object has been purged from RAM, then it can allocate a
5450                  *    new object (with the same FID) in RAM. Unfortunately, the
5451                  *    LFSCK thread itself is referencing the parent object, and
5452                  *    cause the parent object cannot be purged, then cause the
5453                  *    child object cannot be purged also. So the LFSCK thread
5454                  *    will fall into deadlock.
5455                  */
5456                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5457                 if (IS_ERR(cobj)) {
5458                         if (lfsck_is_dead_obj(parent)) {
5459                                 lfsck_tgt_put(tgt);
5460
5461                                 GOTO(out, rc = 0);
5462                         }
5463
5464                         rc = PTR_ERR(cobj);
5465                         goto next;
5466                 }
5467
5468                 rc = dt_declare_attr_get(env, cobj);
5469                 if (rc)
5470                         goto next;
5471
5472                 rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
5473                 if (rc)
5474                         goto next;
5475
5476                 if (lso == NULL) {
5477                         struct lu_attr *attr = &info->lti_la;
5478
5479                         rc = dt_attr_get(env, parent, attr);
5480                         if (rc != 0)
5481                                 goto next;
5482
5483                         lso = lfsck_assistant_object_init(env,
5484                                 lfsck_dto2fid(parent), attr,
5485                                 lfsck->li_pos_current.lp_oit_cookie, false);
5486                         if (IS_ERR(lso)) {
5487                                 rc = PTR_ERR(lso);
5488                                 lso = NULL;
5489
5490                                 goto next;
5491                         }
5492                 }
5493
5494                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5495                                                       index, i);
5496                 if (IS_ERR(llr)) {
5497                         rc = PTR_ERR(llr);
5498                         goto next;
5499                 }
5500
5501                 cobj = NULL;
5502                 spin_lock(&lad->lad_lock);
5503                 if (lad->lad_assistant_status < 0) {
5504                         spin_unlock(&lad->lad_lock);
5505                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5506                         lfsck_tgt_put(tgt);
5507                         RETURN(lad->lad_assistant_status);
5508                 }
5509
5510                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5511                 if (lad->lad_prefetched == 0)
5512                         wakeup = true;
5513
5514                 lad->lad_prefetched++;
5515                 spin_unlock(&lad->lad_lock);
5516                 if (wakeup)
5517                         wake_up_all(&athread->t_ctl_waitq);
5518
5519 next:
5520                 down_write(&com->lc_sem);
5521                 com->lc_new_checked++;
5522                 if (rc < 0)
5523                         lfsck_layout_record_failure(env, lfsck, lo);
5524                 up_write(&com->lc_sem);
5525
5526                 if (cobj != NULL && !IS_ERR(cobj))
5527                         lfsck_object_put(env, cobj);
5528
5529                 if (likely(tgt != NULL))
5530                         lfsck_tgt_put(tgt);
5531
5532                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5533                         GOTO(out, rc);
5534         }
5535
5536         GOTO(out, rc = 0);
5537
5538 out:
5539         if (lso != NULL)
5540                 lfsck_assistant_object_put(env, lso);
5541
5542         return rc;
5543 }
5544
5545 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5546  * the OST-object's attribute and generate an structure lfsck_layout_req on the
5547  * list ::lad_req_list.
5548  *
5549  * For each request on above list, the lfsck_layout_assistant thread compares
5550  * the OST side attribute with local attribute, if inconsistent, then repair it.
5551  *
5552  * All above processing is async mode with pipeline. */
5553 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5554                                         struct lfsck_component *com,
5555                                         struct dt_object *obj)
5556 {
5557         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5558         struct ost_id                   *oi     = &info->lti_oi;
5559         struct lfsck_layout             *lo     = com->lc_file_ram;
5560         struct lfsck_assistant_data     *lad    = com->lc_data;
5561         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5562         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
5563         struct thandle                  *handle = NULL;
5564         struct lu_buf                   *buf    = &info->lti_big_buf;
5565         struct lov_mds_md_v1            *lmm    = NULL;
5566         struct dt_device                *dev    = lfsck_obj2dev(obj);
5567         struct lustre_handle             lh     = { 0 };
5568         struct lu_buf                    ea_buf = { NULL };
5569         struct lov_comp_md_v1           *lcm    = NULL;
5570         struct lov_comp_md_entry_v1     *lcme   = NULL;
5571         int                              rc     = 0;
5572         int                              size   = 0;
5573         __u32                            magic  = 0;
5574         __u16                            count  = 0;
5575         bool                             locked = false;
5576         bool                             stripe = false;
5577         bool                             bad_oi = false;
5578         ENTRY;
5579
5580         if (!S_ISREG(lfsck_object_type(obj)))
5581                 GOTO(out, rc = 0);
5582
5583         if (lad->lad_assistant_status < 0)
5584                 GOTO(out, rc = -ESRCH);
5585
5586         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5587         lmm_oi_cpu_to_le(oi, oi);
5588         dt_read_lock(env, obj, 0);
5589         locked = true;
5590
5591 again:
5592         bad_oi = false;
5593         if (dt_object_exists(obj) == 0 ||
5594             lfsck_is_dead_obj(obj))
5595                 GOTO(out, rc = 0);
5596
5597         rc = lfsck_layout_get_lovea(env, obj, buf);
5598         if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP)
5599                 /* Skip bad lov EA during the 1st cycle scanning, and
5600                  * try to recover it via orphan in the 2nd scanning. */
5601                 rc = 0;
5602         if (rc <= 0)
5603                 GOTO(out, rc);
5604
5605         size = rc;
5606         lmm = buf->lb_buf;
5607         magic = le32_to_cpu(lmm->lmm_magic);
5608         if (magic == LOV_MAGIC_COMP_V1) {
5609                 int i;
5610
5611                 lcm = buf->lb_buf;
5612                 count = le16_to_cpu(lcm->lcm_entry_count);
5613                 for (i = 0; i < count; i++) {
5614                         lcme = &lcm->lcm_entries[i];
5615                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5616                         if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) != 0)
5617                                 goto fix;
5618                 }
5619
5620                 GOTO(out, stripe = true);
5621         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5622                 GOTO(out, stripe = true);
5623         }
5624
5625 fix:
5626         /* Inconsistent lmm_oi, should be repaired. */
5627         bad_oi = true;
5628
5629         if (bk->lb_param & LPF_DRYRUN) {
5630                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5631
5632                 GOTO(out, stripe = true);
5633         }
5634
5635         if (!lustre_handle_is_used(&lh)) {
5636                 dt_read_unlock(env, obj);
5637                 locked = false;
5638                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5639                                       MDS_INODELOCK_LAYOUT |
5640                                       MDS_INODELOCK_XATTR, LCK_EX);
5641                 if (rc != 0)
5642                         GOTO(out, rc);
5643
5644                 handle = dt_trans_create(env, dev);
5645                 if (IS_ERR(handle))
5646                         GOTO(out, rc = PTR_ERR(handle));
5647
5648                 lfsck_buf_init(&ea_buf, buf->lb_buf, size);
5649                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5650                                           LU_XATTR_REPLACE, handle);
5651                 if (rc != 0)
5652                         GOTO(out, rc);
5653
5654                 rc = dt_trans_start_local(env, dev, handle);
5655                 if (rc != 0)
5656                         GOTO(out, rc);
5657
5658                 dt_write_lock(env, obj, 0);
5659                 locked = true;
5660
5661                 goto again;
5662         }
5663
5664         if (magic == LOV_MAGIC_COMP_V1) {
5665                 int i;
5666
5667                 for (i = 0; i < count; i++) {
5668                         lcme = &lcm->lcm_entries[i];
5669                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5670                         lmm->lmm_oi = *oi;
5671                 }
5672         } else {
5673                 lmm->lmm_oi = *oi;
5674         }
5675
5676         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5677                           LU_XATTR_REPLACE, handle);
5678         if (rc != 0)
5679                 GOTO(out, rc);
5680
5681         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5682
5683         GOTO(out, stripe = true);
5684
5685 out:
5686         if (locked) {
5687                 if (lustre_handle_is_used(&lh))
5688                         dt_write_unlock(env, obj);
5689                 else
5690                         dt_read_unlock(env, obj);
5691         }
5692
5693         if (handle != NULL && !IS_ERR(handle))
5694                 dt_trans_stop(env, dev, handle);
5695
5696         lfsck_ibits_unlock(&lh, LCK_EX);
5697
5698         if (bad_oi)
5699                 CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for "
5700                        DFID": rc = %d\n", lfsck_lfsck2name(lfsck),
5701                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5702                        PFID(lfsck_dto2fid(obj)), rc);
5703
5704         if (stripe) {
5705                 if (magic == LOV_MAGIC_COMP_V1) {
5706                         int i;
5707
5708                         for (i = 0; i < count; i++) {
5709                                 lcme = &lcm->lcm_entries[i];
5710                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5711                                       LCME_FL_INIT))
5712                                         continue;
5713
5714                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5715                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5716                                         le32_to_cpu(lcme->lcme_offset)),
5717                                         le32_to_cpu(lcme->lcme_id));
5718                         }
5719                 } else {
5720                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5721                 }
5722         } else {
5723                 down_write(&com->lc_sem);
5724                 com->lc_new_checked++;
5725                 if (rc < 0)
5726                         lfsck_layout_record_failure(env, lfsck, lo);
5727                 up_write(&com->lc_sem);
5728         }
5729
5730         return rc;
5731 }
5732
5733 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5734                                        struct lfsck_component *com,
5735                                        struct dt_object *obj)
5736 {
5737         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5738         struct lfsck_layout             *lo     = com->lc_file_ram;
5739         const struct lu_fid             *fid    = lfsck_dto2fid(obj);
5740         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5741         struct lfsck_layout_seq         *lls;
5742         __u64                            seq;
5743         __u64                            oid;
5744         int                              rc;
5745         ENTRY;
5746
5747         LASSERT(llsd != NULL);
5748
5749         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5750             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5751                 struct l_wait_info       lwi = LWI_TIMEOUT(cfs_time_seconds(1),
5752                                                            NULL, NULL);
5753                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5754
5755                 l_wait_event(thread->t_ctl_waitq,
5756                              !thread_is_running(thread),
5757                              &lwi);
5758         }
5759
5760         lfsck_rbtree_update_bitmap(env, com, fid, false);
5761
5762         down_write(&com->lc_sem);
5763         if (fid_is_idif(fid))
5764                 seq = 0;
5765         else if (!fid_is_norm(fid) ||
5766                  !fid_is_for_ostobj(env, lfsck, obj, fid))
5767                 GOTO(unlock, rc = 0);
5768         else
5769                 seq = fid_seq(fid);
5770         com->lc_new_checked++;
5771
5772         lls = lfsck_layout_seq_lookup(llsd, seq);
5773         if (lls == NULL) {
5774                 OBD_ALLOC_PTR(lls);
5775                 if (unlikely(lls == NULL))
5776                         GOTO(unlock, rc = -ENOMEM);
5777
5778                 INIT_LIST_HEAD(&lls->lls_list);
5779                 lls->lls_seq = seq;
5780                 rc = lfsck_layout_lastid_load(env, com, lls);
5781                 if (rc != 0) {
5782                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5783                               "load LAST_ID for %#llx: rc = %d\n",
5784                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
5785                         lo->ll_objs_failed_phase1++;
5786                         OBD_FREE_PTR(lls);
5787                         GOTO(unlock, rc);
5788                 }
5789
5790                 lfsck_layout_seq_insert(llsd, lls);
5791         }
5792
5793         if (unlikely(fid_is_last_id(fid)))
5794                 GOTO(unlock, rc = 0);
5795
5796         if (fid_is_idif(fid))
5797                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
5798         else
5799                 oid = fid_oid(fid);
5800
5801         if (oid > lls->lls_lastid_known)
5802                 lls->lls_lastid_known = oid;
5803
5804         if (oid > lls->lls_lastid) {
5805                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
5806                         /* OFD may create new objects during LFSCK scanning. */
5807                         rc = lfsck_layout_lastid_reload(env, com, lls);
5808                         if (unlikely(rc != 0)) {
5809                                 CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5810                                       "reload LAST_ID for %#llx: rc = %d\n",
5811                                       lfsck_lfsck2name(com->lc_lfsck),
5812                                       lls->lls_seq, rc);
5813
5814                                 GOTO(unlock, rc);
5815                         }
5816
5817                         if (oid <= lls->lls_lastid ||
5818                             lo->ll_flags & LF_CRASHED_LASTID)
5819                                 GOTO(unlock, rc = 0);
5820
5821                         LASSERT(lfsck->li_out_notify != NULL);
5822
5823                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5824                                              LE_LASTID_REBUILDING);
5825                         lo->ll_flags |= LF_CRASHED_LASTID;
5826
5827                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
5828                                "LAST_ID file (2) for the sequence %#llx"
5829                                ", old value %llu, known value %llu\n",
5830                                lfsck_lfsck2name(lfsck), lls->lls_seq,
5831                                lls->lls_lastid, oid);
5832                 }
5833
5834                 lls->lls_lastid = oid;
5835                 lls->lls_dirty = 1;
5836         }
5837
5838         GOTO(unlock, rc = 0);
5839
5840 unlock:
5841         up_write(&com->lc_sem);
5842
5843         return rc;
5844 }
5845
5846 static int lfsck_layout_exec_dir(const struct lu_env *env,
5847                                  struct lfsck_component *com,
5848                                  struct lfsck_assistant_object *lso,
5849                                  struct lu_dirent *ent, __u16 type)
5850 {
5851         return 0;
5852 }
5853
5854 static int lfsck_layout_master_post(const struct lu_env *env,
5855                                     struct lfsck_component *com,
5856                                     int result, bool init)
5857 {
5858         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5859         struct lfsck_layout     *lo     = com->lc_file_ram;
5860         int                      rc;
5861         ENTRY;
5862
5863         lfsck_post_generic(env, com, &result);
5864
5865         down_write(&com->lc_sem);
5866         spin_lock(&lfsck->li_lock);
5867         if (!init)
5868                 lo->ll_pos_last_checkpoint =
5869                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5870
5871         if (result > 0) {
5872                 if (lo->ll_flags & LF_INCOMPLETE)
5873                         lo->ll_status = LS_PARTIAL;
5874                 else
5875                         lo->ll_status = LS_SCANNING_PHASE2;
5876                 lo->ll_flags |= LF_SCANNED_ONCE;
5877                 lo->ll_flags &= ~LF_UPGRADE;
5878                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
5879         } else if (result == 0) {
5880                 if (lfsck->li_status != 0)
5881                         lo->ll_status = lfsck->li_status;
5882                 else
5883                         lo->ll_status = LS_STOPPED;
5884                 if (lo->ll_status != LS_PAUSED)
5885                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5886         } else {
5887                 lo->ll_status = LS_FAILED;
5888                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5889         }
5890         spin_unlock(&lfsck->li_lock);
5891
5892         if (!init) {
5893                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5894                                           lfsck->li_time_last_checkpoint;
5895                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5896                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5897                 com->lc_new_checked = 0;
5898         }
5899
5900         rc = lfsck_layout_store(env, com);
5901         up_write(&com->lc_sem);
5902
5903         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
5904                lfsck_lfsck2name(lfsck), rc);
5905
5906         RETURN(rc);
5907 }
5908
5909 static int lfsck_layout_slave_post(const struct lu_env *env,
5910                                    struct lfsck_component *com,
5911                                    int result, bool init)
5912 {
5913         struct lfsck_instance   *lfsck = com->lc_lfsck;
5914         struct lfsck_layout     *lo    = com->lc_file_ram;
5915         int                      rc;
5916         bool                     done  = false;
5917
5918         down_write(&com->lc_sem);
5919         rc = lfsck_layout_lastid_store(env, com);
5920         if (rc != 0)
5921                 result = rc;
5922
5923         LASSERT(lfsck->li_out_notify != NULL);
5924
5925         spin_lock(&lfsck->li_lock);
5926         if (!init)
5927                 lo->ll_pos_last_checkpoint =
5928                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5929
5930         if (result > 0) {
5931                 lo->ll_status = LS_SCANNING_PHASE2;
5932                 lo->ll_flags |= LF_SCANNED_ONCE;
5933                 if (lo->ll_flags & LF_CRASHED_LASTID) {
5934                         done = true;
5935                         lo->ll_flags &= ~LF_CRASHED_LASTID;
5936
5937                         CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt "
5938                                "crashed LAST_ID files successfully\n",
5939                                lfsck_lfsck2name(lfsck));
5940                 }
5941                 lo->ll_flags &= ~LF_UPGRADE;
5942                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
5943         } else if (result == 0) {
5944                 if (lfsck->li_status != 0)
5945                         lo->ll_status = lfsck->li_status;
5946                 else
5947                         lo->ll_status = LS_STOPPED;
5948                 if (lo->ll_status != LS_PAUSED)
5949                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5950         } else {
5951                 lo->ll_status = LS_FAILED;
5952                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5953         }
5954         spin_unlock(&lfsck->li_lock);
5955
5956         if (done)
5957                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5958                                      LE_LASTID_REBUILT);
5959
5960         if (!init) {
5961                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5962                                           lfsck->li_time_last_checkpoint;
5963                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5964                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5965                 com->lc_new_checked = 0;
5966         }
5967
5968         rc = lfsck_layout_store(env, com);
5969         up_write(&com->lc_sem);
5970
5971         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
5972
5973         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
5974                lfsck_lfsck2name(lfsck), rc);
5975
5976         return rc;
5977 }
5978
5979 static void lfsck_layout_dump(const struct lu_env *env,
5980                               struct lfsck_component *com, struct seq_file *m)
5981 {
5982         struct lfsck_instance   *lfsck = com->lc_lfsck;
5983         struct lfsck_bookmark   *bk    = &lfsck->li_bookmark_ram;
5984         struct lfsck_layout     *lo    = com->lc_file_ram;
5985         const char *prefix;
5986
5987         down_read(&com->lc_sem);
5988         if (bk->lb_param & LPF_DRYRUN)
5989                 prefix = "inconsistent";
5990         else
5991                 prefix = "repaired";
5992
5993         seq_printf(m, "name: lfsck_layout\n"
5994                    "magic: %#x\n"
5995                    "version: %d\n"
5996                    "status: %s\n",
5997                    lo->ll_magic,
5998                    bk->lb_version,
5999                    lfsck_status2name(lo->ll_status));
6000
6001         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
6002
6003         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
6004
6005         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
6006
6007         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
6008
6009         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
6010
6011         seq_printf(m, "latest_start_position: %llu\n"
6012                    "last_checkpoint_position: %llu\n"
6013                    "first_failure_position: %llu\n",
6014                    lo->ll_pos_latest_start,
6015                    lo->ll_pos_last_checkpoint,
6016                    lo->ll_pos_first_inconsistent);
6017
6018         seq_printf(m, "success_count: %u\n"
6019                    "%s_dangling: %llu\n"
6020                    "%s_unmatched_pair: %llu\n"
6021                    "%s_multiple_referenced: %llu\n"
6022                    "%s_orphan: %llu\n"
6023                    "%s_inconsistent_owner: %llu\n"
6024                    "%s_others: %llu\n"
6025                    "skipped: %llu\n"
6026                    "failed_phase1: %llu\n"
6027                    "failed_phase2: %llu\n",
6028                    lo->ll_success_count,
6029                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
6030                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
6031                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
6032                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
6033                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
6034                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
6035                    lo->ll_objs_skipped,
6036                    lo->ll_objs_failed_phase1,
6037                    lo->ll_objs_failed_phase2);
6038
6039         if (lo->ll_status == LS_SCANNING_PHASE1) {
6040                 time64_t duration = ktime_get_seconds() -
6041                                     lfsck->li_time_last_checkpoint;
6042                 u64 checked = lo->ll_objs_checked_phase1 +
6043                               com->lc_new_checked;
6044                 u64 speed = checked;
6045                 u64 new_checked = com->lc_new_checked;
6046                 time64_t rtime = lo->ll_run_time_phase1 + duration;
6047                 u64 pos;
6048
6049                 if (duration != 0)
6050                         new_checked = div64_s64(new_checked, duration);
6051                 if (rtime != 0)
6052                         speed = div64_s64(speed, rtime);
6053                 seq_printf(m, "checked_phase1: %llu\n"
6054                            "checked_phase2: %llu\n"
6055                            "run_time_phase1: %lld seconds\n"
6056                            "run_time_phase2: %lld seconds\n"
6057                            "average_speed_phase1: %llu items/sec\n"
6058                            "average_speed_phase2: N/A\n"
6059                            "real-time_speed_phase1: %llu items/sec\n"
6060                            "real-time_speed_phase2: N/A\n",
6061                            checked,
6062                            lo->ll_objs_checked_phase2,
6063                            rtime,
6064                            lo->ll_run_time_phase2,
6065                            speed,
6066                            new_checked);
6067
6068                 if (likely(lfsck->li_di_oit)) {
6069                         const struct dt_it_ops *iops =
6070                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
6071
6072                         /* The low layer otable-based iteration position may NOT
6073                          * exactly match the layout-based directory traversal
6074                          * cookie. Generally, it is not a serious issue. But the
6075                          * caller should NOT make assumption on that. */
6076                         pos = iops->store(env, lfsck->li_di_oit);
6077                         if (!lfsck->li_current_oit_processed)
6078                                 pos--;
6079                 } else {
6080                         pos = lo->ll_pos_last_checkpoint;
6081                 }
6082
6083                 seq_printf(m, "current_position: %llu\n", pos);
6084         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
6085                 time64_t duration = ktime_get_seconds() -
6086                                     com->lc_time_last_checkpoint;
6087                 u64 checked = lo->ll_objs_checked_phase2 +
6088                               com->lc_new_checked;
6089                 u64 speed1 = lo->ll_objs_checked_phase1;
6090                 u64 speed2 = checked;
6091                 u64 new_checked = com->lc_new_checked;
6092                 time64_t rtime = lo->ll_run_time_phase2 + duration;
6093
6094                 if (duration != 0)
6095                         new_checked = div64_s64(new_checked, duration);
6096                 if (lo->ll_run_time_phase1 != 0)
6097                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6098                 if (rtime != 0)
6099                         speed2 = div64_s64(speed2, rtime);
6100                 seq_printf(m, "checked_phase1: %llu\n"
6101                            "checked_phase2: %llu\n"
6102                            "run_time_phase1: %lld seconds\n"
6103                            "run_time_phase2: %lld seconds\n"
6104                            "average_speed_phase1: %llu items/sec\n"
6105                            "average_speed_phase2: %llu items/sec\n"
6106                            "real-time_speed_phase1: N/A\n"
6107                            "real-time_speed_phase2: %llu items/sec\n"
6108                            "current_position: "DFID"\n",
6109                            lo->ll_objs_checked_phase1,
6110                            checked,
6111                            lo->ll_run_time_phase1,
6112                            rtime,
6113                            speed1,
6114                            speed2,
6115                            new_checked,
6116                            PFID(&com->lc_fid_latest_scanned_phase2));
6117         } else {
6118                 __u64 speed1 = lo->ll_objs_checked_phase1;
6119                 __u64 speed2 = lo->ll_objs_checked_phase2;
6120
6121                 if (lo->ll_run_time_phase1 != 0)
6122                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6123                 if (lo->ll_run_time_phase2 != 0)
6124                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
6125                 seq_printf(m, "checked_phase1: %llu\n"
6126                            "checked_phase2: %llu\n"
6127                            "run_time_phase1: %lld seconds\n"
6128                            "run_time_phase2: %lld seconds\n"
6129                            "average_speed_phase1: %llu items/sec\n"
6130                            "average_speed_phase2: %llu objs/sec\n"
6131                            "real-time_speed_phase1: N/A\n"
6132                            "real-time_speed_phase2: N/A\n"
6133                            "current_position: N/A\n",
6134                            lo->ll_objs_checked_phase1,
6135                            lo->ll_objs_checked_phase2,
6136                            lo->ll_run_time_phase1,
6137                            lo->ll_run_time_phase2,
6138                            speed1,
6139                            speed2);
6140         }
6141
6142         up_read(&com->lc_sem);
6143 }
6144
6145 static int lfsck_layout_master_double_scan(const struct lu_env *env,
6146                                            struct lfsck_component *com)
6147 {
6148         struct lfsck_layout             *lo     = com->lc_file_ram;
6149         struct lfsck_assistant_data     *lad    = com->lc_data;
6150         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6151         struct lfsck_tgt_descs          *ltds;
6152         struct lfsck_tgt_desc           *ltd;
6153         struct lfsck_tgt_desc           *next;
6154         int                              rc;
6155
6156         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6157
6158         if (thread_is_stopped(&lad->lad_thread)) {
6159                 LASSERT(list_empty(&lad->lad_req_list));
6160                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6161                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6162
6163                 ltds = &lfsck->li_ost_descs;
6164                 spin_lock(&ltds->ltd_lock);
6165                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6166                                          ltd_layout_phase_list) {
6167                         list_del_init(&ltd->ltd_layout_phase_list);
6168                 }
6169                 spin_unlock(&ltds->ltd_lock);
6170
6171                 ltds = &lfsck->li_mdt_descs;
6172                 spin_lock(&ltds->ltd_lock);
6173                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6174                                          ltd_layout_phase_list) {
6175                         list_del_init(&ltd->ltd_layout_phase_list);
6176                 }
6177                 spin_unlock(&ltds->ltd_lock);
6178         }
6179
6180         return rc;
6181 }
6182
6183 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6184                                           struct lfsck_component *com)
6185 {
6186         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6187         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
6188         struct lfsck_layout             *lo     = com->lc_file_ram;
6189         struct ptlrpc_thread            *thread = &lfsck->li_thread;
6190         int                              rc;
6191         ENTRY;
6192
6193         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6194                lfsck_lfsck2name(lfsck));
6195
6196         atomic_inc(&lfsck->li_double_scan_count);
6197
6198         if (lo->ll_flags & LF_INCOMPLETE)
6199                 GOTO(done, rc = 1);
6200
6201         com->lc_new_checked = 0;
6202         com->lc_new_scanned = 0;
6203         com->lc_time_last_checkpoint = ktime_get_seconds();
6204         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6205                                        LFSCK_CHECKPOINT_INTERVAL;
6206
6207         while (1) {
6208                 struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30),
6209                                                      NULL, NULL);
6210
6211                 rc = lfsck_layout_slave_query_master(env, com);
6212                 if (list_empty(&llsd->llsd_master_list)) {
6213                         if (unlikely(!thread_is_running(thread)))
6214                                 rc = 0;
6215                         else
6216                                 rc = 1;
6217
6218                         GOTO(done, rc);
6219                 }
6220
6221                 if (rc < 0)
6222                         GOTO(done, rc);
6223
6224                 rc = l_wait_event(thread->t_ctl_waitq,
6225                                   !thread_is_running(thread) ||
6226                                   lo->ll_flags & LF_INCOMPLETE ||
6227                                   list_empty(&llsd->llsd_master_list),
6228                                   &lwi);
6229                 if (unlikely(!thread_is_running(thread)))
6230                         GOTO(done, rc = 0);
6231
6232                 if (lo->ll_flags & LF_INCOMPLETE)
6233                         GOTO(done, rc = 1);
6234
6235                 if (rc == -ETIMEDOUT)
6236                         continue;
6237
6238                 GOTO(done, rc = (rc < 0 ? rc : 1));
6239         }
6240
6241 done:
6242         rc = lfsck_layout_double_scan_result(env, com, rc);
6243         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6244                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6245         lfsck_layout_slave_quit(env, com);
6246         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6247                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6248
6249         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, "
6250                "status %d: rc = %d\n",
6251                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6252
6253         return rc;
6254 }
6255
6256 static void lfsck_layout_master_data_release(const struct lu_env *env,
6257                                              struct lfsck_component *com)
6258 {
6259         struct lfsck_assistant_data     *lad    = com->lc_data;
6260         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6261         struct lfsck_tgt_descs          *ltds;
6262         struct lfsck_tgt_desc           *ltd;
6263         struct lfsck_tgt_desc           *next;
6264
6265         LASSERT(lad != NULL);
6266         LASSERT(thread_is_init(&lad->lad_thread) ||
6267                 thread_is_stopped(&lad->lad_thread));
6268         LASSERT(list_empty(&lad->lad_req_list));
6269
6270         com->lc_data = NULL;
6271
6272         ltds = &lfsck->li_ost_descs;
6273         spin_lock(&ltds->ltd_lock);
6274         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6275                                  ltd_layout_phase_list) {
6276                 list_del_init(&ltd->ltd_layout_phase_list);
6277         }
6278         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6279                                  ltd_layout_phase_list) {
6280                 list_del_init(&ltd->ltd_layout_phase_list);
6281         }
6282         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6283                                  ltd_layout_list) {
6284                 list_del_init(&ltd->ltd_layout_list);
6285         }
6286         spin_unlock(&ltds->ltd_lock);
6287
6288         ltds = &lfsck->li_mdt_descs;
6289         spin_lock(&ltds->ltd_lock);
6290         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6291                                  ltd_layout_phase_list) {
6292                 list_del_init(&ltd->ltd_layout_phase_list);
6293         }
6294         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6295                                  ltd_layout_phase_list) {
6296                 list_del_init(&ltd->ltd_layout_phase_list);
6297         }
6298         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6299                                  ltd_layout_list) {
6300                 list_del_init(&ltd->ltd_layout_list);
6301         }
6302         spin_unlock(&ltds->ltd_lock);
6303
6304         if (likely(lad->lad_bitmap != NULL))
6305                 CFS_FREE_BITMAP(lad->lad_bitmap);
6306
6307         OBD_FREE_PTR(lad);
6308 }
6309
6310 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6311                                             struct lfsck_component *com)
6312 {
6313         struct lfsck_layout_slave_data *llsd = com->lc_data;
6314
6315         lfsck_layout_slave_quit(env, com);
6316         com->lc_data = NULL;
6317         OBD_FREE_PTR(llsd);
6318 }
6319
6320 static void lfsck_layout_master_quit(const struct lu_env *env,
6321                                      struct lfsck_component *com)
6322 {
6323         struct lfsck_assistant_data     *lad    = com->lc_data;
6324         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6325         struct lfsck_tgt_descs          *ltds;
6326         struct lfsck_tgt_desc           *ltd;
6327         struct lfsck_tgt_desc           *next;
6328
6329         LASSERT(lad != NULL);
6330
6331         lfsck_quit_generic(env, com);
6332
6333         LASSERT(thread_is_init(&lad->lad_thread) ||
6334                 thread_is_stopped(&lad->lad_thread));
6335         LASSERT(list_empty(&lad->lad_req_list));
6336
6337         ltds = &lfsck->li_ost_descs;
6338         spin_lock(&ltds->ltd_lock);
6339         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6340                                  ltd_layout_phase_list) {
6341                 list_del_init(&ltd->ltd_layout_phase_list);
6342         }
6343         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6344                                  ltd_layout_phase_list) {
6345                 list_del_init(&ltd->ltd_layout_phase_list);
6346         }
6347         spin_unlock(&ltds->ltd_lock);
6348
6349         ltds = &lfsck->li_mdt_descs;
6350         spin_lock(&ltds->ltd_lock);
6351         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6352                                  ltd_layout_phase_list) {
6353                 list_del_init(&ltd->ltd_layout_phase_list);
6354         }
6355         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6356                                  ltd_layout_phase_list) {
6357                 list_del_init(&ltd->ltd_layout_phase_list);
6358         }
6359         spin_unlock(&ltds->ltd_lock);
6360 }
6361
6362 static void lfsck_layout_slave_quit(const struct lu_env *env,
6363                                     struct lfsck_component *com)
6364 {
6365         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6366         struct lfsck_layout_seq          *lls;
6367         struct lfsck_layout_seq          *next;
6368         struct lfsck_layout_slave_target *llst;
6369
6370         LASSERT(llsd != NULL);
6371
6372         down_write(&com->lc_sem);
6373         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6374                                  lls_list) {
6375                 list_del_init(&lls->lls_list);
6376                 lfsck_object_put(env, lls->lls_lastid_obj);
6377                 OBD_FREE_PTR(lls);
6378         }
6379         up_write(&com->lc_sem);
6380
6381         spin_lock(&llsd->llsd_lock);
6382         while (!list_empty(&llsd->llsd_master_list)) {
6383                 llst = list_entry(llsd->llsd_master_list.next,
6384                                   struct lfsck_layout_slave_target, llst_list);
6385                 list_del_init(&llst->llst_list);
6386                 spin_unlock(&llsd->llsd_lock);
6387                 lfsck_layout_llst_put(llst);
6388                 spin_lock(&llsd->llsd_lock);
6389         }
6390         spin_unlock(&llsd->llsd_lock);
6391
6392         lfsck_rbtree_cleanup(env, com);
6393 }
6394
6395 static int lfsck_layout_master_in_notify(const struct lu_env *env,
6396                                          struct lfsck_component *com,
6397                                          struct lfsck_request *lr)
6398 {
6399         struct lfsck_instance           *lfsck = com->lc_lfsck;
6400         struct lfsck_layout             *lo    = com->lc_file_ram;
6401         struct lfsck_assistant_data     *lad   = com->lc_data;
6402         struct lfsck_tgt_descs          *ltds;
6403         struct lfsck_tgt_desc           *ltd;
6404         bool                             fail  = false;
6405         ENTRY;
6406
6407         if (lr->lr_event == LE_PAIRS_VERIFY) {
6408                 int rc;
6409
6410                 rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
6411                                                      &lr->lr_fid2,
6412                                                      lr->lr_comp_id);
6413
6414                 RETURN(rc);
6415         }
6416
6417         CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
6418                "from %s %x, status %d, flags %x, flags2 %x\n",
6419                lfsck_lfsck2name(lfsck), lr->lr_event,
6420                (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
6421                lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
6422
6423         if (lr->lr_event != LE_PHASE1_DONE &&
6424             lr->lr_event != LE_PHASE2_DONE &&
6425             lr->lr_event != LE_PEER_EXIT)
6426                 RETURN(-EINVAL);
6427
6428         if (lr->lr_flags & LEF_FROM_OST)
6429                 ltds = &lfsck->li_ost_descs;
6430         else
6431                 ltds = &lfsck->li_mdt_descs;
6432         spin_lock(&ltds->ltd_lock);
6433         ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
6434         if (ltd == NULL) {
6435                 spin_unlock(&ltds->ltd_lock);
6436
6437                 RETURN(-ENXIO);
6438         }
6439
6440         list_del_init(&ltd->ltd_layout_phase_list);
6441         switch (lr->lr_event) {
6442         case LE_PHASE1_DONE:
6443                 if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
6444                         if (lr->lr_flags2 & LF_INCOMPLETE) {
6445                                 if (lr->lr_flags & LEF_FROM_OST)
6446                                         lfsck_lad_set_bitmap(env, com,
6447                                                              ltd->ltd_index);
6448                                 else
6449                                         lo->ll_flags |= LF_INCOMPLETE;
6450                         }
6451                         ltd->ltd_layout_done = 1;
6452                         list_del_init(&ltd->ltd_layout_list);
6453                         fail = true;
6454                         break;
6455                 }
6456
6457                 if (lr->lr_flags & LEF_FROM_OST) {
6458                         if (list_empty(&ltd->ltd_layout_list))
6459                                 list_add_tail(&ltd->ltd_layout_list,
6460                                               &lad->lad_ost_list);
6461                         list_add_tail(&ltd->ltd_layout_phase_list,
6462                                       &lad->lad_ost_phase2_list);
6463                 } else {
6464                         if (list_empty(&ltd->ltd_layout_list))
6465                                 list_add_tail(&ltd->ltd_layout_list,
6466                                               &lad->lad_mdt_list);
6467                         list_add_tail(&ltd->ltd_layout_phase_list,
6468                                       &lad->lad_mdt_phase2_list);
6469                 }
6470                 break;
6471         case LE_PHASE2_DONE:
6472                 ltd->ltd_layout_done = 1;
6473                 if (!list_empty(&ltd->ltd_layout_list))
6474                         list_del_init(&ltd->ltd_layout_list);
6475
6476                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6477                         lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
6478                         fail = true;
6479                 }
6480
6481                 break;
6482         case LE_PEER_EXIT:
6483                 fail = true;
6484                 ltd->ltd_layout_done = 1;
6485                 list_del_init(&ltd->ltd_layout_list);
6486                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
6487                     !(lr->lr_flags & LEF_FROM_OST))
6488                                 lo->ll_flags |= LF_INCOMPLETE;
6489                 break;
6490         default:
6491                 break;
6492         }
6493         spin_unlock(&ltds->ltd_lock);
6494
6495         if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
6496                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6497
6498                 memset(stop, 0, sizeof(*stop));
6499                 stop->ls_status = lr->lr_status;
6500                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6501                 lfsck_stop(env, lfsck->li_bottom, stop);
6502         } else if (lfsck_phase2_next_ready(lad)) {
6503                 wake_up_all(&lad->lad_thread.t_ctl_waitq);
6504         }
6505
6506         RETURN(0);
6507 }
6508
6509 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6510                                               struct lfsck_component *com,
6511                                               struct lfsck_req_local *lrl,
6512                                               struct thandle *th)
6513 {
6514         ENTRY;
6515
6516         switch (lrl->lrl_event) {
6517         case LEL_FID_ACCESSED:
6518                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6519                 RETURN(0);
6520         case LEL_PAIRS_VERIFY_LOCAL: {
6521                 int rc;
6522
6523                 lrl->lrl_status = LPVS_INIT;
6524                 /* Firstly, if the MDT-object which is claimed via OST-object
6525                  * local stored PFID xattr recognizes the OST-object, then it
6526                  * must be that the client given PFID is wrong. */
6527                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6528                                 &lrl->lrl_ff_local.ff_parent,
6529                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6530                 if (rc <= 0)
6531                         RETURN(0);
6532
6533                 lrl->lrl_status = LPVS_INCONSISTENT;
6534                 /* The OST-object local stored PFID xattr is stale. We need to
6535                  * check whether the MDT-object that is claimed via the client
6536                  * given PFID information recognizes the OST-object or not. If
6537                  * matches, then need to update the OST-object's PFID xattr. */
6538                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6539                                 &lrl->lrl_ff_client.ff_parent,
6540                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6541                 /* For rc < 0 case:
6542                  * We are not sure whether the client given PFID information
6543                  * is correct or not, do nothing to avoid improper fixing.
6544                  *
6545                  * For rc > 0 case:
6546                  * The client given PFID information is also invalid, we can
6547                  * NOT fix the OST-object inconsistency.
6548                  */
6549                 if (!rc) {
6550                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6551                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6552                 }
6553
6554                 RETURN(rc);
6555         }
6556         default:
6557                 break;
6558         }
6559
6560         RETURN(-EOPNOTSUPP);
6561 }
6562
6563 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6564                                         struct lfsck_component *com,
6565                                         struct lfsck_request *lr)
6566 {
6567         struct lfsck_instance *lfsck = com->lc_lfsck;
6568         struct lfsck_layout_slave_data *llsd = com->lc_data;
6569         struct lfsck_layout_slave_target *llst;
6570         int rc;
6571         ENTRY;
6572
6573         switch (lr->lr_event) {
6574         case LE_CONDITIONAL_DESTROY:
6575                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6576                 RETURN(rc);
6577         case LE_PHASE1_DONE: {
6578                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6579                         struct lfsck_layout *lo = com->lc_file_ram;
6580
6581                         lo->ll_flags |= LF_INCOMPLETE;
6582                         llst = lfsck_layout_llst_find_and_del(llsd,
6583                                                               lr->lr_index,
6584                                                               true);
6585                         if (llst != NULL) {
6586                                 lfsck_layout_llst_put(llst);
6587                                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6588                         }
6589                 }
6590
6591                 RETURN(0);
6592         }
6593         case LE_PHASE2_DONE:
6594         case LE_PEER_EXIT:
6595                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u "
6596                        "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck),
6597                        lr->lr_event, lr->lr_index, lr->lr_status);
6598                 break;
6599         default:
6600                 RETURN(-EINVAL);
6601         }
6602
6603         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6604         if (llst == NULL)
6605                 RETURN(0);
6606
6607         lfsck_layout_llst_put(llst);
6608         if (list_empty(&llsd->llsd_master_list))
6609                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6610
6611         if (lr->lr_event == LE_PEER_EXIT &&
6612             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6613              (list_empty(&llsd->llsd_master_list) &&
6614               (lr->lr_status == LS_STOPPED ||
6615                lr->lr_status == LS_CO_STOPPED)))) {
6616                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6617
6618                 memset(stop, 0, sizeof(*stop));
6619                 stop->ls_status = lr->lr_status;
6620                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6621                 lfsck_stop(env, lfsck->li_bottom, stop);
6622         }
6623
6624         RETURN(0);
6625 }
6626
6627 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6628 {
6629         int i;
6630
6631         for (i = 0; i < LLIT_MAX; i++)
6632                 *count += lo->ll_objs_repaired[i];
6633 }
6634
6635 static int lfsck_layout_query_all(const struct lu_env *env,
6636                                   struct lfsck_component *com,
6637                                   __u32 *mdts_count, __u32 *osts_count,
6638                                   __u64 *repaired)
6639 {
6640         struct lfsck_layout *lo = com->lc_file_ram;
6641         struct lfsck_tgt_descs *ltds;
6642         struct lfsck_tgt_desc *ltd;
6643         int idx;
6644         int rc;
6645         ENTRY;
6646
6647         rc = lfsck_query_all(env, com);
6648         if (rc != 0)
6649                 RETURN(rc);
6650
6651         ltds = &com->lc_lfsck->li_mdt_descs;
6652         down_read(&ltds->ltd_rw_sem);
6653         cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
6654                 ltd = lfsck_ltd2tgt(ltds, idx);
6655                 LASSERT(ltd != NULL);
6656
6657                 mdts_count[ltd->ltd_layout_status]++;
6658                 *repaired += ltd->ltd_layout_repaired;
6659         }
6660         up_read(&ltds->ltd_rw_sem);
6661
6662         ltds = &com->lc_lfsck->li_ost_descs;
6663         down_read(&ltds->ltd_rw_sem);
6664         cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
6665                 ltd = lfsck_ltd2tgt(ltds, idx);
6666                 LASSERT(ltd != NULL);
6667
6668                 osts_count[ltd->ltd_layout_status]++;
6669                 *repaired += ltd->ltd_layout_repaired;
6670         }
6671         up_read(&ltds->ltd_rw_sem);
6672
6673         down_read(&com->lc_sem);
6674         mdts_count[lo->ll_status]++;
6675         lfsck_layout_repaired(lo, repaired);
6676         up_read(&com->lc_sem);
6677
6678         RETURN(0);
6679 }
6680
6681 static int lfsck_layout_query(const struct lu_env *env,
6682                               struct lfsck_component *com,
6683                               struct lfsck_request *req,
6684                               struct lfsck_reply *rep,
6685                               struct lfsck_query *que, int idx)
6686 {
6687         struct lfsck_layout *lo = com->lc_file_ram;
6688         int rc = 0;
6689
6690         if (que != NULL) {
6691                 LASSERT(com->lc_lfsck->li_master);
6692
6693                 rc = lfsck_layout_query_all(env, com,
6694                                             que->lu_mdts_count[idx],
6695                                             que->lu_osts_count[idx],
6696                                             &que->lu_repaired[idx]);
6697         } else {
6698                 down_read(&com->lc_sem);
6699                 rep->lr_status = lo->ll_status;
6700                 if (req->lr_flags & LEF_QUERY_ALL)
6701                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6702                 up_read(&com->lc_sem);
6703         }
6704
6705         return rc;
6706 }
6707
6708 /* with lfsck::li_lock held */
6709 static int lfsck_layout_slave_join(const struct lu_env *env,
6710                                    struct lfsck_component *com,
6711                                    struct lfsck_start_param *lsp)
6712 {
6713         struct lfsck_instance            *lfsck = com->lc_lfsck;
6714         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6715         struct lfsck_layout_slave_target *llst;
6716         struct lfsck_start               *start = lsp->lsp_start;
6717         int                               rc    = 0;
6718         ENTRY;
6719
6720         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6721                 RETURN(0);
6722
6723         if (!lsp->lsp_index_valid)
6724                 RETURN(-EINVAL);
6725
6726         /* If someone is running the LFSCK without orphan handling,
6727          * it will not maintain the object accessing rbtree. So we
6728          * cannot join it for orphan handling. */
6729         if (!llsd->llsd_rbtree_valid)
6730                 RETURN(-EBUSY);
6731
6732         spin_unlock(&lfsck->li_lock);
6733         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6734         spin_lock(&lfsck->li_lock);
6735         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6736                 spin_unlock(&lfsck->li_lock);
6737                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6738                                                       true);
6739                 if (llst != NULL)
6740                         lfsck_layout_llst_put(llst);
6741                 spin_lock(&lfsck->li_lock);
6742                 rc = -EAGAIN;
6743         }
6744
6745         RETURN(rc);
6746 }
6747
6748 static struct lfsck_operations lfsck_layout_master_ops = {
6749         .lfsck_reset            = lfsck_layout_reset,
6750         .lfsck_fail             = lfsck_layout_fail,
6751         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
6752         .lfsck_prep             = lfsck_layout_master_prep,
6753         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
6754         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6755         .lfsck_post             = lfsck_layout_master_post,
6756         .lfsck_dump             = lfsck_layout_dump,
6757         .lfsck_double_scan      = lfsck_layout_master_double_scan,
6758         .lfsck_data_release     = lfsck_layout_master_data_release,
6759         .lfsck_quit             = lfsck_layout_master_quit,
6760         .lfsck_in_notify        = lfsck_layout_master_in_notify,
6761         .lfsck_query            = lfsck_layout_query,
6762 };
6763
6764 static struct lfsck_operations lfsck_layout_slave_ops = {
6765         .lfsck_reset            = lfsck_layout_reset,
6766         .lfsck_fail             = lfsck_layout_fail,
6767         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
6768         .lfsck_prep             = lfsck_layout_slave_prep,
6769         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
6770         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6771         .lfsck_post             = lfsck_layout_slave_post,
6772         .lfsck_dump             = lfsck_layout_dump,
6773         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
6774         .lfsck_data_release     = lfsck_layout_slave_data_release,
6775         .lfsck_quit             = lfsck_layout_slave_quit,
6776         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
6777         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
6778         .lfsck_query            = lfsck_layout_query,
6779         .lfsck_join             = lfsck_layout_slave_join,
6780 };
6781
6782 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
6783                                             struct lfsck_component *com,
6784                                             struct lfsck_position *pos)
6785 {
6786         struct lfsck_assistant_data     *lad = com->lc_data;
6787         struct lfsck_layout_req         *llr;
6788
6789         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
6790             LS_SCANNING_PHASE1)
6791                 return;
6792
6793         if (list_empty(&lad->lad_req_list))
6794                 return;
6795
6796         llr = list_entry(lad->lad_req_list.next,
6797                          struct lfsck_layout_req,
6798                          llr_lar.lar_list);
6799         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
6800 }
6801
6802 struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
6803         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
6804         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
6805         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
6806         .la_double_scan_result  = lfsck_layout_double_scan_result,
6807         .la_req_fini            = lfsck_layout_assistant_req_fini,
6808         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
6809 };
6810
6811 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
6812 {
6813         struct lfsck_component  *com;
6814         struct lfsck_layout     *lo;
6815         struct dt_object        *root = NULL;
6816         struct dt_object        *obj;
6817         int                      i;
6818         int                      rc;
6819         ENTRY;
6820
6821         OBD_ALLOC_PTR(com);
6822         if (com == NULL)
6823                 RETURN(-ENOMEM);
6824
6825         INIT_LIST_HEAD(&com->lc_link);
6826         INIT_LIST_HEAD(&com->lc_link_dir);
6827         init_rwsem(&com->lc_sem);
6828         atomic_set(&com->lc_ref, 1);
6829         com->lc_lfsck = lfsck;
6830         com->lc_type = LFSCK_TYPE_LAYOUT;
6831         if (lfsck->li_master) {
6832                 com->lc_ops = &lfsck_layout_master_ops;
6833                 com->lc_data = lfsck_assistant_data_init(
6834                                 &lfsck_layout_assistant_ops,
6835                                 LFSCK_LAYOUT);
6836                 if (com->lc_data == NULL)
6837                         GOTO(out, rc = -ENOMEM);
6838
6839                 for (i = 0; i < LFSCK_STF_COUNT; i++)
6840                         mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
6841         } else {
6842                 struct lfsck_layout_slave_data *llsd;
6843
6844                 com->lc_ops = &lfsck_layout_slave_ops;
6845                 OBD_ALLOC_PTR(llsd);
6846                 if (llsd == NULL)
6847                         GOTO(out, rc = -ENOMEM);
6848
6849                 INIT_LIST_HEAD(&llsd->llsd_seq_list);
6850                 INIT_LIST_HEAD(&llsd->llsd_master_list);
6851                 spin_lock_init(&llsd->llsd_lock);
6852                 llsd->llsd_rb_root = RB_ROOT;
6853                 init_rwsem(&llsd->llsd_rb_rwsem);
6854                 com->lc_data = llsd;
6855         }
6856         com->lc_file_size = sizeof(*lo);
6857         OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
6858         if (com->lc_file_ram == NULL)
6859                 GOTO(out, rc = -ENOMEM);
6860
6861         OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
6862         if (com->lc_file_disk == NULL)
6863                 GOTO(out, rc = -ENOMEM);
6864
6865         root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
6866         if (IS_ERR(root))
6867                 GOTO(out, rc = PTR_ERR(root));
6868
6869         if (unlikely(!dt_try_as_dir(env, root)))
6870                 GOTO(out, rc = -ENOTDIR);
6871
6872         obj = local_file_find_or_create(env, lfsck->li_los, root,
6873                                         LFSCK_LAYOUT,
6874                                         S_IFREG | S_IRUGO | S_IWUSR);
6875         if (IS_ERR(obj))
6876                 GOTO(out, rc = PTR_ERR(obj));
6877
6878         com->lc_obj = obj;
6879         rc = lfsck_layout_load(env, com);
6880         if (rc > 0) {
6881                 rc = lfsck_layout_reset(env, com, true);
6882         } else if (rc == -ENOENT) {
6883                 rc = lfsck_layout_init(env, com);
6884         } else if (lfsck->li_master) {
6885                 rc = lfsck_load_sub_trace_files(env, com,
6886                                 &dt_lfsck_layout_dangling_features,
6887                                 LFSCK_LAYOUT, false);
6888                 if (rc)
6889                         rc = lfsck_layout_reset(env, com, true);
6890         }
6891
6892         if (rc != 0)
6893                 GOTO(out, rc);
6894
6895         lo = com->lc_file_ram;
6896         switch (lo->ll_status) {
6897         case LS_INIT:
6898         case LS_COMPLETED:
6899         case LS_FAILED:
6900         case LS_STOPPED:
6901         case LS_PARTIAL:
6902                 spin_lock(&lfsck->li_lock);
6903                 list_add_tail(&com->lc_link, &lfsck->li_list_idle);
6904                 spin_unlock(&lfsck->li_lock);
6905                 break;
6906         default:
6907                 CERROR("%s: unknown lfsck_layout status %d\n",
6908                        lfsck_lfsck2name(lfsck), lo->ll_status);
6909                 /* fall through */
6910         case LS_SCANNING_PHASE1:
6911         case LS_SCANNING_PHASE2:
6912                 /* No need to store the status to disk right now.
6913                  * If the system crashed before the status stored,
6914                  * it will be loaded back when next time. */
6915                 lo->ll_status = LS_CRASHED;
6916                 if (!lfsck->li_master)
6917                         lo->ll_flags |= LF_INCOMPLETE;
6918                 /* fall through */
6919         case LS_PAUSED:
6920         case LS_CRASHED:
6921         case LS_CO_FAILED:
6922         case LS_CO_STOPPED:
6923         case LS_CO_PAUSED:
6924                 spin_lock(&lfsck->li_lock);
6925                 list_add_tail(&com->lc_link, &lfsck->li_list_scan);
6926                 spin_unlock(&lfsck->li_lock);
6927                 break;
6928         }
6929
6930         if (lo->ll_flags & LF_CRASHED_LASTID) {
6931                 LASSERT(lfsck->li_out_notify != NULL);
6932
6933                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6934                                      LE_LASTID_REBUILDING);
6935         }
6936
6937         GOTO(out, rc = 0);
6938
6939 out:
6940         if (root != NULL && !IS_ERR(root))
6941                 lfsck_object_put(env, root);
6942
6943         if (rc != 0) {
6944                 lfsck_component_cleanup(env, com);
6945                 CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
6946                        lfsck_lfsck2name(lfsck), rc);
6947         }
6948
6949         return rc;
6950 }
6951
6952 struct lfsck_orphan_it {
6953         struct lfsck_component           *loi_com;
6954         struct lfsck_rbtree_node         *loi_lrn;
6955         struct lfsck_layout_slave_target *loi_llst;
6956         struct lu_fid                     loi_key;
6957         struct lu_orphan_rec_v3           loi_rec;
6958         __u64                             loi_hash;
6959         unsigned int                      loi_over:1;
6960 };
6961
6962 static int lfsck_fid_match_idx(const struct lu_env *env,
6963                                struct lfsck_instance *lfsck,
6964                                const struct lu_fid *fid, int idx)
6965 {
6966         struct seq_server_site  *ss;
6967         struct lu_server_fld    *sf;
6968         struct lu_seq_range     *range = &lfsck_env_info(env)->lti_range;
6969         int                      rc;
6970
6971         /* All abnormal cases will be returned to MDT0. */
6972         if (!fid_is_norm(fid)) {
6973                 if (idx == 0)
6974                         return 1;
6975
6976                 return 0;
6977         }
6978
6979         ss = lfsck_dev_site(lfsck);
6980         if (unlikely(ss == NULL))
6981                 return -ENOTCONN;
6982
6983         sf = ss->ss_server_fld;
6984         LASSERT(sf != NULL);
6985
6986         fld_range_set_any(range);
6987         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
6988         if (rc != 0)
6989                 return rc;
6990
6991         if (!fld_range_is_mdt(range))
6992                 return -EINVAL;
6993
6994         if (range->lsr_index == idx)
6995                 return 1;
6996
6997         return 0;
6998 }
6999
7000 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
7001                                         struct dt_object *obj)
7002 {
7003         struct dt_device        *dev    = lfsck_obj2dev(obj);
7004         struct thandle          *handle;
7005         int                      rc;
7006         ENTRY;
7007
7008         handle = dt_trans_create(env, dev);
7009         if (IS_ERR(handle))
7010                 RETURN_EXIT;
7011
7012         rc = dt_declare_ref_del(env, obj, handle);
7013         if (rc != 0)
7014                 GOTO(stop, rc);
7015
7016         rc = dt_declare_destroy(env, obj, handle);
7017         if (rc != 0)
7018                 GOTO(stop, rc);
7019
7020         rc = dt_trans_start_local(env, dev, handle);
7021         if (rc != 0)
7022                 GOTO(stop, rc);
7023
7024         dt_write_lock(env, obj, 0);
7025         rc = dt_ref_del(env, obj, handle);
7026         if (rc == 0)
7027                 rc = dt_destroy(env, obj, handle);
7028         dt_write_unlock(env, obj);
7029
7030         GOTO(stop, rc);
7031
7032 stop:
7033         dt_trans_stop(env, dev, handle);
7034
7035         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
7036                PFID(lfsck_dto2fid(obj)), rc);
7037
7038         RETURN_EXIT;
7039 }
7040
7041 static int lfsck_orphan_index_lookup(const struct lu_env *env,
7042                                      struct dt_object *dt,
7043                                      struct dt_rec *rec,
7044                                      const struct dt_key *key)
7045 {
7046         return -EOPNOTSUPP;
7047 }
7048
7049 static int lfsck_orphan_index_declare_insert(const struct lu_env *env,
7050                                              struct dt_object *dt,
7051                                              const struct dt_rec *rec,
7052                                              const struct dt_key *key,
7053                                              struct thandle *handle)
7054 {
7055         return -EOPNOTSUPP;
7056 }
7057
7058 static int lfsck_orphan_index_insert(const struct lu_env *env,
7059                                      struct dt_object *dt,
7060                                      const struct dt_rec *rec,
7061                                      const struct dt_key *key,
7062                                      struct thandle *handle)
7063 {
7064         return -EOPNOTSUPP;
7065 }
7066
7067 static int lfsck_orphan_index_declare_delete(const struct lu_env *env,
7068                                              struct dt_object *dt,
7069                                              const struct dt_key *key,
7070                                              struct thandle *handle)
7071 {
7072         return -EOPNOTSUPP;
7073 }
7074
7075 static int lfsck_orphan_index_delete(const struct lu_env *env,
7076                                      struct dt_object *dt,
7077                                      const struct dt_key *key,
7078                                      struct thandle *handle)
7079 {
7080         return -EOPNOTSUPP;
7081 }
7082
7083 static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
7084                                           struct dt_object *dt,
7085                                           __u32 attr)
7086 {
7087         struct dt_device                *dev    = lu2dt_dev(dt->do_lu.lo_dev);
7088         struct lfsck_instance           *lfsck;
7089         struct lfsck_component          *com    = NULL;
7090         struct lfsck_layout_slave_data  *llsd;
7091         struct lfsck_orphan_it          *it     = NULL;
7092         struct lfsck_layout             *lo;
7093         int                              rc     = 0;
7094         ENTRY;
7095
7096         lfsck = lfsck_instance_find(dev, true, false);
7097         if (unlikely(lfsck == NULL))
7098                 RETURN(ERR_PTR(-ENXIO));
7099
7100         com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
7101         if (unlikely(com == NULL))
7102                 GOTO(out, rc = -ENOENT);
7103
7104         lo = com->lc_file_ram;
7105         if (lo->ll_flags & LF_INCOMPLETE)
7106                 GOTO(out, rc = -ESRCH);
7107
7108         llsd = com->lc_data;
7109         if (!llsd->llsd_rbtree_valid)
7110                 GOTO(out, rc = -ESRCH);
7111
7112         OBD_ALLOC_PTR(it);
7113         if (it == NULL)
7114                 GOTO(out, rc = -ENOMEM);
7115
7116         it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
7117         if (it->loi_llst == NULL)
7118                 GOTO(out, rc = -ENXIO);
7119
7120         if (dev->dd_record_fid_accessed) {
7121                 /* The first iteration against the rbtree, scan the whole rbtree
7122                  * to remove the nodes which do NOT need to be handled. */
7123                 down_write(&llsd->llsd_rb_rwsem);
7124                 if (dev->dd_record_fid_accessed) {
7125                         struct rb_node                  *node;
7126                         struct rb_node                  *next;
7127                         struct lfsck_rbtree_node        *lrn;
7128
7129                         /* No need to record the fid accessing anymore. */
7130                         dev->dd_record_fid_accessed = 0;
7131
7132                         node = rb_first(&llsd->llsd_rb_root);
7133                         while (node != NULL) {
7134                                 next = rb_next(node);
7135                                 lrn = rb_entry(node, struct lfsck_rbtree_node,
7136                                                lrn_node);
7137                                 if (atomic_read(&lrn->lrn_known_count) <=
7138                                     atomic_read(&lrn->lrn_accessed_count)) {
7139                                         rb_erase(node, &llsd->llsd_rb_root);
7140                                         lfsck_rbtree_free(lrn);
7141                                 }
7142                                 node = next;
7143                         }
7144                 }
7145                 up_write(&llsd->llsd_rb_rwsem);
7146         }
7147
7148         /* read lock the rbtree when init, and unlock when fini */
7149         down_read(&llsd->llsd_rb_rwsem);
7150         it->loi_com = com;
7151         com = NULL;
7152
7153         GOTO(out, rc = 0);
7154
7155 out:
7156         if (com != NULL)
7157                 lfsck_component_put(env, com);
7158
7159         CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
7160                lfsck_lfsck2name(lfsck), rc);
7161
7162         lfsck_instance_put(env, lfsck);
7163         if (rc != 0) {
7164                 if (it != NULL)
7165                         OBD_FREE_PTR(it);
7166
7167                 it = (struct lfsck_orphan_it *)ERR_PTR(rc);
7168         }
7169
7170         return (struct dt_it *)it;
7171 }
7172
7173 static void lfsck_orphan_it_fini(const struct lu_env *env,
7174                                  struct dt_it *di)
7175 {
7176         struct lfsck_orphan_it           *it    = (struct lfsck_orphan_it *)di;
7177         struct lfsck_component           *com   = it->loi_com;
7178         struct lfsck_layout_slave_data   *llsd;
7179         struct lfsck_layout_slave_target *llst;
7180
7181         if (com != NULL) {
7182                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7183                        lfsck_lfsck2name(com->lc_lfsck));
7184
7185                 llsd = com->lc_data;
7186                 up_read(&llsd->llsd_rb_rwsem);
7187                 llst = it->loi_llst;
7188                 LASSERT(llst != NULL);
7189
7190                 /* Save the key and hash for iterate next. */
7191                 llst->llst_fid = it->loi_key;
7192                 llst->llst_hash = it->loi_hash;
7193                 lfsck_layout_llst_put(llst);
7194                 lfsck_component_put(env, com);
7195         }
7196         OBD_FREE_PTR(it);
7197 }
7198
7199 /**
7200  * \retval       +1: the iteration finished
7201  * \retval        0: on success, not finished
7202  * \retval      -ve: on error
7203  */
7204 static int lfsck_orphan_it_next(const struct lu_env *env,
7205                                 struct dt_it *di)
7206 {
7207         struct lfsck_thread_info        *info   = lfsck_env_info(env);
7208         struct filter_fid               *ff     = &info->lti_ff;
7209         struct lu_attr                  *la     = &info->lti_la;
7210         struct lfsck_orphan_it          *it     = (struct lfsck_orphan_it *)di;
7211         struct lu_fid                   *key    = &it->loi_key;
7212         struct lu_orphan_rec_v3         *rec    = &it->loi_rec;
7213         struct ost_layout               *ol     = &rec->lor_layout;
7214         struct lfsck_component          *com    = it->loi_com;
7215         struct lfsck_instance           *lfsck  = com->lc_lfsck;
7216         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
7217         struct dt_object                *obj;
7218         struct lfsck_rbtree_node        *lrn;
7219         int                              pos;
7220         int                              rc;
7221         __u32                            save;
7222         __u32                            idx    = it->loi_llst->llst_index;
7223         bool                             exact  = false;
7224         ENTRY;
7225
7226         if (it->loi_over)
7227                 RETURN(1);
7228
7229 again0:
7230         lrn = it->loi_lrn;
7231         if (lrn == NULL) {
7232                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7233                 if (lrn == NULL) {
7234                         it->loi_over = 1;
7235                         RETURN(1);
7236                 }
7237
7238                 it->loi_lrn = lrn;
7239                 if (!exact) {
7240                         key->f_seq = lrn->lrn_seq;
7241                         key->f_oid = lrn->lrn_first_oid;
7242                         key->f_ver = 0;
7243                 }
7244         } else {
7245                 key->f_oid++;
7246                 if (unlikely(key->f_oid == 0)) {
7247                         key->f_seq++;
7248                         it->loi_lrn = NULL;
7249                         goto again0;
7250                 }
7251
7252                 if (key->f_oid >=
7253                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7254                         it->loi_lrn = NULL;
7255                         goto again0;
7256                 }
7257         }
7258
7259         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7260                      atomic_read(&lrn->lrn_accessed_count))) {
7261                 struct rb_node *next = rb_next(&lrn->lrn_node);
7262
7263                 while (next != NULL) {
7264                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7265                                        lrn_node);
7266                         if (atomic_read(&lrn->lrn_known_count) >
7267                             atomic_read(&lrn->lrn_accessed_count))
7268                                 break;
7269                         next = rb_next(next);
7270                 }
7271
7272                 if (next == NULL) {
7273                         it->loi_over = 1;
7274                         RETURN(1);
7275                 }
7276
7277                 it->loi_lrn = lrn;
7278                 key->f_seq = lrn->lrn_seq;
7279                 key->f_oid = lrn->lrn_first_oid;
7280                 key->f_ver = 0;
7281         }
7282
7283         pos = key->f_oid - lrn->lrn_first_oid;
7284
7285 again1:
7286         pos = find_next_bit(lrn->lrn_known_bitmap,
7287                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7288         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7289                 key->f_oid = lrn->lrn_first_oid + pos;
7290                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7291                         key->f_seq++;
7292                         key->f_oid = 0;
7293                 }
7294                 it->loi_lrn = NULL;
7295                 goto again0;
7296         }
7297
7298         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7299                 pos++;
7300                 goto again1;
7301         }
7302
7303         key->f_oid = lrn->lrn_first_oid + pos;
7304         obj = lfsck_object_find_bottom(env, lfsck, key);
7305         if (IS_ERR(obj)) {
7306                 rc = PTR_ERR(obj);
7307                 if (rc == -ENOENT) {
7308                         pos++;
7309                         goto again1;
7310                 }
7311                 RETURN(rc);
7312         }
7313
7314         dt_read_lock(env, obj, 0);
7315         if (dt_object_exists(obj) == 0 ||
7316             lfsck_is_dead_obj(obj)) {
7317                 dt_read_unlock(env, obj);
7318                 lfsck_object_put(env, obj);
7319                 pos++;
7320                 goto again1;
7321         }
7322
7323         rc = dt_attr_get(env, obj, la);
7324         if (rc != 0)
7325                 GOTO(out, rc);
7326
7327         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7328                           XATTR_NAME_FID);
7329         if (rc == -ENODATA) {
7330                 /* For the pre-created OST-object, update the bitmap to avoid
7331                  * others LFSCK (second phase) iteration to touch it again. */
7332                 if (la->la_ctime == 0) {
7333                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7334                                 atomic_inc(&lrn->lrn_accessed_count);
7335
7336                         /* For the race between repairing dangling referenced
7337                          * MDT-object and unlink the file, it may left orphan
7338                          * OST-object there. Destroy it now! */
7339                         if (unlikely(!(la->la_mode & S_ISUID))) {
7340                                 dt_read_unlock(env, obj);
7341                                 lfsck_layout_destroy_orphan(env, obj);
7342                                 lfsck_object_put(env, obj);
7343                                 pos++;
7344                                 goto again1;
7345                         }
7346                 } else if (idx == 0) {
7347                         /* If the orphan OST-object has no parent information,
7348                          * regard it as referenced by the MDT-object on MDT0. */
7349                         fid_zero(&rec->lor_rec.lor_fid);
7350                         rec->lor_rec.lor_uid = la->la_uid;
7351                         rec->lor_rec.lor_gid = la->la_gid;
7352                         memset(ol, 0, sizeof(*ol));
7353                         rec->lor_layout_version = 0;
7354                         rec->lor_range = 0;
7355
7356                         GOTO(out, rc = 0);
7357                 }
7358
7359                 dt_read_unlock(env, obj);
7360                 lfsck_object_put(env, obj);
7361                 pos++;
7362                 goto again1;
7363         }
7364
7365         if (rc < sizeof(struct lu_fid))
7366                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7367
7368         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7369         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7370          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7371          * parent MDT-object's layout EA. */
7372         save = rec->lor_rec.lor_fid.f_stripe_idx;
7373         rec->lor_rec.lor_fid.f_ver = 0;
7374         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7375         /* If the orphan OST-object does not claim the MDT, then next.
7376          *
7377          * If we do not know whether it matches or not, then return it
7378          * to the MDT for further check. */
7379         if (rc == 0) {
7380                 dt_read_unlock(env, obj);
7381                 lfsck_object_put(env, obj);
7382                 pos++;
7383                 goto again1;
7384         }
7385
7386         rec->lor_rec.lor_fid.f_stripe_idx = save;
7387         rec->lor_rec.lor_uid = la->la_uid;
7388         rec->lor_rec.lor_gid = la->la_gid;
7389         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7390         rec->lor_layout_version =
7391                 le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC);
7392         rec->lor_range = le32_to_cpu(ff->ff_range);
7393
7394         CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, "
7395                "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, "
7396                "COMP end %llu, layout version %u, range %u\n",
7397                lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7398                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7399                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7400                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
7401                rec->lor_layout_version, rec->lor_range);
7402
7403         GOTO(out, rc = 0);
7404
7405 out:
7406         dt_read_unlock(env, obj);
7407         lfsck_object_put(env, obj);
7408         if (rc == 0)
7409                 it->loi_hash++;
7410
7411         return rc;
7412 }
7413
7414 /**
7415  * \retval       +1: locate to the exactly position
7416  * \retval        0: cannot locate to the exactly position,
7417  *                   call next() to move to a valid position.
7418  * \retval      -ve: on error
7419  */
7420 static int lfsck_orphan_it_get(const struct lu_env *env,
7421                                struct dt_it *di,
7422                                const struct dt_key *key)
7423 {
7424         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7425         int                      rc;
7426
7427         it->loi_key = *(struct lu_fid *)key;
7428         rc = lfsck_orphan_it_next(env, di);
7429         if (rc == 1)
7430                 return 0;
7431
7432         if (rc == 0)
7433                 return 1;
7434
7435         return rc;
7436 }
7437
/**
 * Counterpart of lfsck_orphan_it_get(); this implementation does nothing.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the orphan iterator (unused)
 */
static void lfsck_orphan_it_put(const struct lu_env *env, struct dt_it *di)
{
        /* Intentionally empty. */
}
7442
7443 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7444                                           const struct dt_it *di)
7445 {
7446         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7447
7448         return (struct dt_key *)&it->loi_key;
7449 }
7450
7451 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7452                                     const struct dt_it *di)
7453 {
7454         return sizeof(struct lu_fid);
7455 }
7456
7457 static int lfsck_orphan_it_rec(const struct lu_env *env,
7458                                const struct dt_it *di,
7459                                struct dt_rec *rec,
7460                                __u32 attr)
7461 {
7462         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7463
7464         *(struct lu_orphan_rec_v3 *)rec = it->loi_rec;
7465
7466         return 0;
7467 }
7468
7469 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7470                                    const struct dt_it *di)
7471 {
7472         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7473
7474         return it->loi_hash;
7475 }
7476
7477 /**
7478  * \retval       +1: locate to the exactly position
7479  * \retval        0: cannot locate to the exactly position,
7480  *                   call next() to move to a valid position.
7481  * \retval      -ve: on error
7482  */
7483 static int lfsck_orphan_it_load(const struct lu_env *env,
7484                                 const struct dt_it *di,
7485                                 __u64 hash)
7486 {
7487         struct lfsck_orphan_it           *it   = (struct lfsck_orphan_it *)di;
7488         struct lfsck_layout_slave_target *llst = it->loi_llst;
7489         int                               rc;
7490
7491         LASSERT(llst != NULL);
7492
7493         if (hash != llst->llst_hash) {
7494                 CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan "
7495                        "iteration does not match the one when fini "
7496                        "%llu, to be reset.\n",
7497                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7498                        llst->llst_hash);
7499                 fid_zero(&llst->llst_fid);
7500                 llst->llst_hash = 0;
7501         }
7502
7503         it->loi_key = llst->llst_fid;
7504         it->loi_hash = llst->llst_hash;
7505         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7506         if (rc == 1)
7507                 return 0;
7508
7509         if (rc == 0)
7510                 return 1;
7511
7512         return rc;
7513 }
7514
/**
 * Stub for the combined key/record accessor.
 *
 * Nothing is written to \a key_rec.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the orphan iterator (unused)
 * \param[in] key_rec   caller's buffer (left untouched)
 *
 * \retval              0 always
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
                                   const struct dt_it *di, void *key_rec)
{
        const int rc = 0;

        return rc;
}
7521
7522 const struct dt_index_operations lfsck_orphan_index_ops = {
7523         .dio_lookup             = lfsck_orphan_index_lookup,
7524         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7525         .dio_insert             = lfsck_orphan_index_insert,
7526         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7527         .dio_delete             = lfsck_orphan_index_delete,
7528         .dio_it = {
7529                 .init           = lfsck_orphan_it_init,
7530                 .fini           = lfsck_orphan_it_fini,
7531                 .get            = lfsck_orphan_it_get,
7532                 .put            = lfsck_orphan_it_put,
7533                 .next           = lfsck_orphan_it_next,
7534                 .key            = lfsck_orphan_it_key,
7535                 .key_size       = lfsck_orphan_it_key_size,
7536                 .rec            = lfsck_orphan_it_rec,
7537                 .store          = lfsck_orphan_it_store,
7538                 .load           = lfsck_orphan_it_load,
7539                 .key_rec        = lfsck_orphan_it_key_rec,
7540         }
7541 };